Merge pull request Azure#3 from nagkumar91/task/mandatory_attack_objective

nagkumar91 · web-flow · commit f92c4f063299 · 2025-03-11T09:32:16.000-07:00
Make attack objective generator mandatory
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_exceptions.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_exceptions.py
@@ -80,6 +80,7 @@ class ErrorTarget(Enum):
     MODELS = "Models"
     UNKNOWN = "Unknown"
     CONVERSATION = "Conversation"
+    RED_TEAM_AGENT = "RedTeamAgent"
 
 
 class EvaluationException(AzureError):
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_safety_evaluation/__init__.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_safety_evaluation/__init__.py
@@ -23,10 +23,8 @@ class AttackObjectiveGenerator:
     :param risk_categories: List of risk categories to generate attack objectives for
     :type risk_categories: List[RiskCategory]
     """
-
-    # TODO num objectives here to replace num_rows in red team agent
-    # Q: Should this be total or per category?
-    def __init__(self, risk_categories):
+    def __init__(self, risk_categories: list[RiskCategory], num_objectives: int = 10):
         self.risk_categories = risk_categories
+        self.num_objectives = num_objectives
 
 __all__ = ["RedTeamAgent", "AttackStrategy", "RiskCategory", "AttackObjectiveGenerator"]
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_safety_evaluation/_red_team_agent.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_safety_evaluation/_red_team_agent.py
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/autogen/raiclient/operations/_operations.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/autogen/raiclient/operations/_operations.py
@@ -621,7 +621,6 @@ def get_attack_objectives(
             "workspaceName": self._serialize.url("self._config.workspace_name", self._config.workspace_name, "str"),
         }
         _request.url = self._client.format_url(_request.url, **path_format_arguments)
-
         _stream = kwargs.pop("stream", False)
         pipeline_response: PipelineResponse = self._client._pipeline.run(  # pylint: disable=protected-access
             _request, stream=_stream, **kwargs
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py
@@ -72,24 +72,23 @@ def _get_service_discovery_url(self):
         base_url = urlparse(response.json()["properties"]["discoveryUrl"])
         return f"{base_url.scheme}://{base_url.netloc}"
     
-    async def get_attack_objectives(self, risk_categories: List[str], application_scenario: str = None) -> Dict:
+    async def get_attack_objectives(self, risk_categories: Optional[List[str]] = None, application_scenario: str = None, strategy: Optional[str] = None) -> Dict:
         """Get attack objectives using the auto-generated operations.
         
-        :param risk_categories: List of risk categories to generate attack objectives for
-        :type risk_categories: List[str]
+        :param risk_categories: Optional list of risk categories to generate attack objectives for
+        :type risk_categories: Optional[List[str]]
         :param application_scenario: Optional description of the application scenario for context
         :type application_scenario: str
         :return: The attack objectives
         :rtype: Dict
-        """
-        
+        """ 
+        risk_categories = risk_categories or []
         try:
             # Send the request using the autogenerated client
             response = self._client.rai_svc.get_attack_objectives(
-                risk_types=risk_categories,
+                risk_types=[], # TODO: risk_categories is currently ignored here (no filtering is requested); pass it through once filtering at this level is fixed
                 lang="en"
             )
-            # TODO figure out how to process this string output properly
             return response
             
         except Exception as e:
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_model_tools/_rai_client.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_model_tools/_rai_client.py
@@ -222,6 +222,9 @@ async def get_attack_objectives(self, risk_categories: List[str], application_sc
         :rtype: Any
         """
         # Create query parameters for the request
+        if application_scenario:
+            raise NotImplementedError("Application scenario is not supported yet")
+        
         params = {
             "api-version": "2022-11-01-preview",
             "riskTypes": ",".join(risk_categories),
@@ -235,6 +238,18 @@ async def get_attack_objectives(self, risk_categories: List[str], application_sc
         try:
             # Make the request using the existing get method
             result = await self.get(self.attack_objectives_endpoint)
+            # from collections import defaultdict
+            # counts_by_risk = defaultdict(int)
+            # for item in result:
+            #     target_harms = item.get("Metadata", {}).get("TargetHarms", [])
+            #     if not target_harms:
+            #         # No risk type specified
+            #         counts_by_risk["empty"] += 1
+            #     else:
+            #         for harm in target_harms:
+            #             # Use "empty" if the risk type field is missing
+            #             risk_type = harm.get("RiskType", "") or "empty"
+            #             counts_by_risk[risk_type] += 1
             return result
         except Exception:
             # If the API fails or isn't implemented yet, return a mock response
diff --git a/sdk/evaluation/azure-ai-evaluation/samples/pyrit_sim.py b/sdk/evaluation/azure-ai-evaluation/samples/pyrit_sim.py
@@ -5,15 +5,13 @@
 pip install -e ".[pyrit]"
 """
 
-
 from typing import Dict, List, Optional
-from azure.ai.evaluation._safety_evaluation._red_team_agent import RedTeamAgent, AttackStrategy
+from azure.ai.evaluation._safety_evaluation import RedTeamAgent, AttackStrategy, AttackObjectiveGenerator, RiskCategory
 import os
 from azure.identity import DefaultAzureCredential
 from azure.ai.evaluation.simulator import AdversarialScenario
 from pyrit.prompt_target import OpenAIChatTarget
 from pyrit.common import initialize_pyrit, DUCK_DB
-from azure.ai.evaluation._safety_evaluation import AttackObjectiveGenerator, RiskCategory
 
 
 async def main():
@@ -23,35 +21,55 @@ async def main():
         "project_name": os.environ.get("AZURE_PROJECT_NAME"),
     }
 
-
     # [START red_team_agent_targets]
     # Model config target 
     model_config = {
         "azure_endpoint": os.environ.get("AZURE_ENDPOINT"),
         "azure_deployment": os.environ.get("AZURE_DEPLOYMENT_NAME"),
     }
 
+    ## Minimal inputs
+    attack_objective_generator = AttackObjectiveGenerator(
+        risk_categories=[
+            RiskCategory.HateUnfairness,
+        ],
+        num_objectives=10,
+    )
+
     red_team_agent = RedTeamAgent(
         azure_ai_project=azure_ai_project,
         credential=DefaultAzureCredential(),
     )
 
     outputs = await red_team_agent.attack(
         target=model_config, # type: ignore
+        attack_objective_generator=attack_objective_generator,
     )
     print(outputs)
 
     # Mock function target to simulate an AI application
     def call_to_ai_application(query: str) -> str:
         return "mock response"
     
+    ## Maximal inputs
+    attack_objective_generator = AttackObjectiveGenerator(
+        risk_categories=[
+            RiskCategory.HateUnfairness,
+            RiskCategory.Violence,
+            RiskCategory.Sexual,
+            RiskCategory.SelfHarm,
+        ],
+        num_objectives=10,
+    )
+
     red_team_agent = RedTeamAgent(
         azure_ai_project=azure_ai_project,
         credential=DefaultAzureCredential(),
     )
 
     outputs = await red_team_agent.attack(
         target=call_to_ai_application, # type: ignore
+        attack_objective_generator=attack_objective_generator,
     )
     print(outputs)
     
@@ -62,7 +80,7 @@ def callback_target(
         session_state: Optional[str] = None,
         context: Optional[Dict] = None
     ) -> dict:
-        messages_list = [{"role": chat_message.role,"content": chat_message.content,} for chat_message in messages] #type: ignore
+        messages_list = [{"role": chat_message.role,"content": chat_message.content} for chat_message in messages] #type: ignore
         latest_message = messages_list[-1]
         application_input = latest_message["content"]
         try:
@@ -86,14 +104,15 @@ def callback_target(
 
     outputs = await red_team_agent.attack(
         target=callback_target, # type: ignore
+        attack_objective_generator=attack_objective_generator,
     )
     print(outputs)
 
     # Pyrit target
     initialize_pyrit(memory_db_type=DUCK_DB)
     pyrit_target = OpenAIChatTarget(
-        deployment_name = os.environ.get("AZURE_DEPLOYMENT_NAME"),
-        endpoint = os.environ.get("AZURE_ENDPOINT"),
+        deployment_name=os.environ.get("AZURE_DEPLOYMENT_NAME"),
+        endpoint=os.environ.get("AZURE_ENDPOINT"),
         use_aad_auth=True
     )
 
@@ -104,6 +123,7 @@ def callback_target(
 
     outputs = await red_team_agent.attack(
         target=pyrit_target, # type: ignore
+        attack_objective_generator=attack_objective_generator,
     )
     print(outputs)
     # [END red_team_agent_targets]
@@ -117,42 +137,46 @@ def callback_target(
 
     outputs = await red_team_agent.attack(
         target=call_to_ai_application, # type: ignore
+        attack_objective_generator=attack_objective_generator,
     )
     print(outputs)
 
-    # Low budget
+    # EASY budget
     red_team_agent = RedTeamAgent(
         azure_ai_project=azure_ai_project,
         credential=DefaultAzureCredential(),
     )
 
     outputs = await red_team_agent.attack(
         target=call_to_ai_application, # type: ignore
-        attack_strategy=[AttackStrategy.LOW]
+        attack_strategy=[AttackStrategy.EASY],
+        attack_objective_generator=attack_objective_generator,
     )
     print(outputs)
 
-    # Medium budget
+    # MODERATE budget
     red_team_agent = RedTeamAgent(
         azure_ai_project=azure_ai_project,
         credential=DefaultAzureCredential(),
     )
     
     outputs = await red_team_agent.attack(
         target=model_config, # type: ignore
-        attack_strategy=[AttackStrategy.MEDIUM]
+        attack_strategy=[AttackStrategy.MODERATE],
+        attack_objective_generator=attack_objective_generator,
     )
     print(outputs)
 
-    # High budget
+    # DIFFICULT budget
     red_team_agent = RedTeamAgent(
         azure_ai_project=azure_ai_project,
         credential=DefaultAzureCredential(),
     )
 
     outputs = await red_team_agent.attack(
         target=model_config, # type: ignore
-        attack_strategy=[AttackStrategy.HIGH]
+        attack_strategy=[AttackStrategy.DIFFICULT],
+        attack_objective_generator=attack_objective_generator,
     )
 
     # Compose attack strategies
@@ -164,8 +188,10 @@ def callback_target(
     outputs = await red_team_agent.attack(
         target=model_config, # type: ignore
         attack_strategy=[AttackStrategy.Compose([AttackStrategy.Flip, AttackStrategy.Base64]), 
-            AttackStrategy.LOW,
+            AttackStrategy.EASY,
             AttackStrategy.Morse],
+        output_path="RacoonRedTeamEvalResults.jsonl", 
+        attack_objective_generator=attack_objective_generator,
     )
     print(outputs)
 
@@ -177,19 +203,22 @@ def callback_target(
 
     outputs = await red_team_agent.attack(
         target=model_config, # type: ignore
-        attack_strategy=[AttackStrategy.HIGH, AttackStrategy.Compose([AttackStrategy.Math, AttackStrategy.Tense])],
+        attack_strategy=[AttackStrategy.DIFFICULT, AttackStrategy.Compose([AttackStrategy.Math, AttackStrategy.Tense])],
+        output_path="DIFFICULTBudget-Duplicates.jsonl",
+        attack_objective_generator=attack_objective_generator,
     )
     print(outputs)
 
-    # Low, medium, and high
+    # EASY, MODERATE, and DIFFICULT
     red_team_agent = RedTeamAgent(
         azure_ai_project=azure_ai_project,
         credential=DefaultAzureCredential(),
     )
 
     outputs = await red_team_agent.attack(
         target=call_to_ai_application, # type: ignore
-        attack_strategy=[AttackStrategy.LOW, AttackStrategy.MEDIUM, AttackStrategy.HIGH]
+        attack_strategy=[AttackStrategy.EASY, AttackStrategy.MODERATE, AttackStrategy.DIFFICULT],
+        attack_objective_generator=attack_objective_generator,
     )
     print(outputs)
 
@@ -200,7 +229,8 @@ def callback_target(
         risk_categories=[
             RiskCategory.HateUnfairness, 
             RiskCategory.Violence,
-        ]
+        ],
+        num_objectives=10,
     )
     application_scenario = "You are an AI application fine tuned on financial services data, a chatbot which answers questions about how best to use the company's product for a user's financial needs. Industry = Financial Services"
 

Original file line number	Diff line number	Diff line change
`@@ -621,7 +621,6 @@ def get_attack_objectives(`
`621`	`621`	`"workspaceName": self._serialize.url("self._config.workspace_name", self._config.workspace_name, "str"),`
`622`	`622`	`}`
`623`	`623`	`_request.url = self._client.format_url(_request.url, **path_format_arguments)`
`624`		`-`
`625`	`624`	`_stream = kwargs.pop("stream", False)`
`626`	`625`	`pipeline_response: PipelineResponse = self._client._pipeline.run( # pylint: disable=protected-access`
`627`	`626`	`_request, stream=_stream, **kwargs`