l0lawrence
diff --git a/‎sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
+1-1 b/‎sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
+1-1
diff --git a/‎sdk/evaluation/azure-ai-evaluation/assets.json
+1-1 b/‎sdk/evaluation/azure-ai-evaluation/assets.json
+1-1
diff --git a/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py
+14 b/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py
+14
diff --git a/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/rai_service.py
+116-2 b/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/rai_service.py
+116-2
diff --git a/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py
+99-2 b/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py
+99-2
diff --git a/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_utils.py
+38 b/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_utils.py
+38
diff --git a/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py
+4-4 b/‎sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py
+4-4
@@ -1,6 +1,5 @@
 # Release History
 
-
 ## 1.0.0b5 (Unreleased)
 
 ### Features Added
@@ -23,6 +22,7 @@ outputs = asyncio.run(custom_simulator(
     max_conversation_turns=1,
 ))
 ```
+- Added evaluators for multimodal use cases
 
 ### Breaking Changes
 - Renamed environment variable `PF_EVALS_BATCH_USE_ASYNC` to `AI_EVALS_BATCH_USE_ASYNC`.
 
@@ -2,5 +2,5 @@
   "AssetsRepo": "Azure/azure-sdk-assets",
   "AssetsRepoPrefixPath": "python",
   "TagPrefix": "python/evaluation/azure-ai-evaluation",
-  "Tag": "python/evaluation/azure-ai-evaluation_f0444ef220"
+  "Tag": "python/evaluation/azure-ai-evaluation_eb4989f81d"
 }
@@ -12,6 +12,14 @@
     SexualEvaluator,
     ViolenceEvaluator,
 )
+from ._evaluators._multimodal._content_safety_multimodal import (
+    ContentSafetyMultimodalEvaluator,
+    HateUnfairnessMultimodalEvaluator,
+    SelfHarmMultimodalEvaluator,
+    SexualMultimodalEvaluator,
+    ViolenceMultimodalEvaluator,
+)
+from ._evaluators._multimodal._protected_material import ProtectedMaterialMultimodalEvaluator
 from ._evaluators._f1_score import F1ScoreEvaluator
 from ._evaluators._fluency import FluencyEvaluator
 from ._evaluators._gleu import GleuScoreEvaluator
@@ -65,4 +73,10 @@
     "Conversation",
     "Message",
     "EvaluationResult",
+    "ContentSafetyMultimodalEvaluator",
+    "HateUnfairnessMultimodalEvaluator",
+    "SelfHarmMultimodalEvaluator",
+    "SexualMultimodalEvaluator",
+    "ViolenceMultimodalEvaluator",
+    "ProtectedMaterialMultimodalEvaluator",
 ]
@@ -13,10 +13,12 @@
 
 import jwt
 
+from promptflow.core._errors import MissingRequiredPackage
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._http_utils import AsyncHttpPipeline, get_async_http_client
 from azure.ai.evaluation._model_configurations import AzureAIProject
 from azure.core.credentials import TokenCredential
+from azure.core.exceptions import HttpResponseError
 from azure.core.pipeline.policies import AsyncRetryPolicy
 
 from .constants import (
@@ -26,7 +28,7 @@
     Tasks,
     _InternalEvaluationMetrics,
 )
-from .utils import get_harm_severity_level
+from .utils import get_harm_severity_level, retrieve_content_type
 
 try:
     version = importlib.metadata.version("azure-ai-evaluation")
@@ -281,7 +283,11 @@ def _parse_content_harm_response(
     if key == EvaluationMetrics.HATE_FAIRNESS:
         key = EvaluationMetrics.HATE_UNFAIRNESS
 
-    result: Dict[str, Union[str, float]] = {key: math.nan, key + "_score": math.nan, key + "_reason": ""}
+    result: Dict[str, Union[str, float]] = {
+        (key.value if hasattr(key, "value") else key): math.nan,
+        f"{(key.value if hasattr(key, 'value') else key)}_score": math.nan,
+        f"{(key.value if hasattr(key, 'value') else key)}_reason": math.nan,
+    }
 
     response = batch_response[0]
     if metric_name not in response:
@@ -330,6 +336,8 @@ def _parse_content_harm_response(
         reason = ""
 
     harm_score = metric_value
+    if metric_value == "n/a":
+        return result
     if not math.isnan(metric_value):
         # int(math.nan) causes a value error, and math.nan is already handled
         # by get_harm_severity_level
@@ -465,3 +473,109 @@ async def evaluate_with_rai_service(
     result = parse_response(annotation_response, metric_name, metric_display_name)
 
     return result
+
+
+def generate_payload_multimodal(content_type: str, messages, metric: str) -> Dict:
+    """Build the annotation request payload for a multimodal conversation.
+
+    :param content_type: The value sent as "ContentType" in the payload.
+    :type content_type: str
+    :param messages: The normalized list of messages, wrapped as the single entry of "Contents".
+    :type messages: list
+    :param metric: The evaluation metric to use. It selects the annotation task and decides whether a
+        "MetricList" entry is included in the payload.
+    :type metric: str
+    :return: The payload for the annotation request.
+    :rtype: Dict
+    """
+    is_protected_material = metric == EvaluationMetrics.PROTECTED_MATERIAL
+    payload: Dict = {
+        "ContentType": content_type,
+        "Contents": [{"messages": messages}],
+        "AnnotationTask": Tasks.PROTECTED_MATERIAL if is_protected_material else Tasks.CONTENT_HARM,
+    }
+    # The protected material task is sent without a metric list; every other metric is named explicitly.
+    if not is_protected_material:
+        payload["MetricList"] = [metric]
+    return payload
+
+
+async def submit_multimodal_request(messages, metric: str, rai_svc_url: str, token: str) -> str:
+    """Submit a multimodal annotation request to the Responsible AI service and return the operation ID.
+
+    :param messages: The list of chat messages, either JSON-style dicts or azure-ai-inference
+        ``ChatRequestMessage`` objects (the latter are converted with ``as_dict()``).
+    :type messages: list
+    :param metric: The evaluation metric to use.
+    :type metric: str
+    :param rai_svc_url: The Responsible AI service URL.
+    :type rai_svc_url: str
+    :param token: The Azure authentication token.
+    :type token: str
+    :return: The operation ID, taken from the last path segment of the response's "location" value.
+    :rtype: str
+    :raises MissingRequiredPackage: If typed messages are supplied but 'azure-ai-inference' is not installed.
+    :raises ~azure.core.exceptions.HttpResponseError: If the service does not answer with HTTP 202.
+    """
+    # Only the first element is inspected to decide whether typed message objects were supplied.
+    if len(messages) > 0 and not isinstance(messages[0], dict):
+        try:
+            from azure.ai.inference.models import ChatRequestMessage
+        except ImportError as ex:
+            install_hint = (
+                "Please install 'azure-ai-inference' package to use SystemMessage, UserMessage, AssistantMessage"
+            )
+            raise MissingRequiredPackage(message=install_hint) from ex
+        if isinstance(messages[0], ChatRequestMessage):
+            messages = [typed_message.as_dict() for typed_message in messages]
+
+    # System messages are left out of the payload; assistant messages alone decide the content type.
+    non_system_messages = []
+    assistant_messages = []
+    for entry in messages:
+        role = entry["role"]
+        if role != "system":
+            non_system_messages.append(entry)
+        if role == "assistant":
+            assistant_messages.append(entry)
+
+    payload = generate_payload_multimodal(
+        retrieve_content_type(assistant_messages, metric), non_system_messages, metric
+    )
+
+    # Post the annotation request to the RAI service.
+    annotation_url = f"{rai_svc_url}/submitannotation"
+    request_headers = get_common_headers(token)
+    async with get_async_http_client() as client:
+        response = await client.post(  # pylint: disable=too-many-function-args,unexpected-keyword-arg
+            annotation_url, json=payload, headers=request_headers
+        )
+    if response.status_code != 202:
+        raise HttpResponseError(
+            message=f"Received unexpected HTTP status: {response.status_code} {response.text()}", response=response
+        )
+    return response.json()["location"].split("/")[-1]
+
+
+async def evaluate_with_rai_service_multimodal(
+    messages, metric_name: str, project_scope: AzureAIProject, credential: TokenCredential
+):
+    """Evaluate a multimodal conversation with the Responsible AI service.
+
+    :param messages: The list of chat messages to annotate.
+    :type messages: list
+    :param metric_name: The evaluation metric to use.
+    :type metric_name: str
+    :param project_scope: The Azure AI project scope details.
+    :type project_scope: ~azure.ai.evaluation._model_configurations.AzureAIProject
+    :param credential: The Azure authentication credential.
+    :type credential: ~azure.core.credentials.TokenCredential
+    :return: The parsed annotation result, as produced by ``parse_response``.
+    """
+    # Resolve the service endpoint and confirm it is available before submitting anything.
+    auth_token = await fetch_or_reuse_token(credential)
+    service_url = await get_rai_svc_url(project_scope, auth_token)
+    await ensure_service_availability(service_url, auth_token, Tasks.CONTENT_HARM)
+
+    # Submit the annotation request, then fetch and parse its result.
+    submitted_operation = await submit_multimodal_request(messages, metric_name, service_url, auth_token)
+    raw_annotation = await fetch_result(submitted_operation, service_url, credential, auth_token)
+    return parse_response(cast(List[Dict], raw_annotation), metric_name)
@@ -9,9 +9,9 @@
 
 import nltk
 from typing_extensions import NotRequired, Required, TypeGuard
-
+from promptflow.core._errors import MissingRequiredPackage
 from azure.ai.evaluation._constants import AZURE_OPENAI_TYPE, OPENAI_TYPE
-from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, EvaluationException
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._model_configurations import (
     AzureAIProject,
     AzureOpenAIModelConfiguration,
@@ -312,3 +312,100 @@ def remove_optional_singletons(eval_class, singletons):
             if param in singletons:
                 del required_singletons[param]
     return required_singletons
+
+
+def retrieve_content_type(assistant_messages: List, metric: str) -> str:
+    """Get the content type for service payload.
+
+    :param assistant_messages: The list of messages to be annotated by evaluation service
+    :type assistant_messages: list
+    :param metric: A string representing the metric type
+    :type metric: str
+    :return: A text representing the content type. Example: 'text', or 'image'
+    :rtype: str
+    """
+    # The protected material metric is always reported as image content.
+    if metric == "protected_material":
+        return "image"
+
+    for item in assistant_messages:
+        content = item.get("content", [])
+        # A turn's content may be a plain string (or missing/None) rather than a list of parts.
+        # Iterating a string yields characters, which have no ``.get``, so only list content
+        # is inspected for image parts.
+        if not isinstance(content, list):
+            continue
+        for part in content:
+            if isinstance(part, dict) and part.get("type", "") == "image_url":
+                return "image"
+    # Default return if no image was found
+    return "text"
+
+
+def validate_conversation(conversation):
+    """Validate that ``conversation`` is a usable multimodal conversation.
+
+    The conversation must be a mapping with a "messages" list. Each message is either a dict with a
+    "role" of "user", "assistant" or "system" and a string or list "content", or a strongly typed
+    azure-ai-inference message object. At least one message must carry an image.
+
+    :param conversation: The conversation to validate.
+    :type conversation: dict
+    :raises EvaluationException: If "messages" is missing or not a list, a role or content is invalid,
+        or no image is present anywhere in the conversation.
+    :raises MissingRequiredPackage: If a non-dict message is supplied but 'azure-ai-inference' is not installed.
+    """
+
+    def raise_exception(msg, target):
+        raise EvaluationException(
+            message=msg,
+            internal_message=msg,
+            target=target,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+
+    def is_image_item(item):
+        # Content items that are not dicts (for example bare strings), or whose "image_url" is None,
+        # are treated as non-image parts instead of crashing the validator.
+        return (
+            isinstance(item, dict)
+            and item.get("type") == "image_url"
+            and "url" in (item.get("image_url") or {})
+        )
+
+    if not conversation or "messages" not in conversation:
+        # Reported against the multimodal evaluator, consistent with every other check in this function.
+        raise_exception(
+            "Attribute 'messages' is missing in the request",
+            ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
+        )
+    messages = conversation["messages"]
+    if not isinstance(messages, list):
+        raise_exception(
+            "'messages' parameter must be a JSON-compatible list of chat messages",
+            ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
+        )
+    expected_roles = {"user", "assistant", "system"}
+    image_found = False
+    for num, message in enumerate(messages, 1):
+        if not isinstance(message, dict):
+            # Non-dict messages are expected to be azure-ai-inference models; import lazily so the
+            # package is only required when typed messages are actually used.
+            try:
+                from azure.ai.inference.models import (
+                    ChatRequestMessage,
+                    UserMessage,
+                    AssistantMessage,
+                    SystemMessage,
+                    ImageContentItem,
+                )
+            except ImportError as ex:
+                raise MissingRequiredPackage(
+                    message="Please install 'azure-ai-inference' package to use SystemMessage, AssistantMessage"
+                ) from ex
+
+            # NOTE(review): the first message, not the current one, decides whether typed validation
+            # applies - confirm this is intended.
+            if isinstance(messages[0], ChatRequestMessage) and not isinstance(
+                message, (UserMessage, AssistantMessage, SystemMessage)
+            ):
+                raise_exception(
+                    f"Messages must be a strongly typed class of ChatRequestMessage. Message number: {num}",
+                    ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
+                )
+
+            if isinstance(message.content, list) and any(
+                isinstance(item, ImageContentItem) for item in message.content
+            ):
+                image_found = True
+            continue
+        if message.get("role") not in expected_roles:
+            raise_exception(
+                f"Invalid role provided: {message.get('role')}. Message number: {num}",
+                ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
+            )
+        content = message.get("content")
+        if not isinstance(content, (str, list)):
+            raise_exception(
+                f"Content in each turn must be a string or array. Message number: {num}",
+                ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
+            )
+        if isinstance(content, list) and any(is_image_item(item) for item in content):
+            image_found = True
+    if not image_found:
+        raise_exception(
+            "Message needs to have multi-modal input like images.",
+            ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR,
+        )
@@ -8,6 +8,8 @@
 import tempfile
 from pathlib import Path
 from typing import Any, Dict, NamedTuple, Optional, Tuple, Union
+import uuid
+import base64
 
 import pandas as pd
 from promptflow.client import PFClient
@@ -81,6 +83,33 @@ def _azure_pf_client_and_triad(trace_destination) -> Tuple[PFClient, AzureMLWork
     return azure_pf_client, ws_triad
 
 
+def _store_multimodal_content(messages, tmpdir: str):
+    """Write inline base64 JPEG images found in ``messages`` to files under ``<tmpdir>/images``.
+
+    Each ``image_url`` content item whose URL starts with ``data:image/jpg;base64,`` is decoded and
+    written to a uniquely named ``.jpg`` file. The item's URL is then replaced in place with the
+    relative path ``images/<file name>``. Other content is left untouched.
+
+    :param messages: The chat messages to scan; matching items are mutated in place.
+    :type messages: list
+    :param tmpdir: The directory in which the ``images`` folder is created.
+    :type tmpdir: str
+    """
+    # Make sure the images folder exists.
+    images_folder_path = os.path.join(tmpdir, "images")
+    os.makedirs(images_folder_path, exist_ok=True)
+
+    data_url_prefix = "data:image/jpg;base64,"
+
+    # Traverse all messages and replace base64 image data with the new file name.
+    for message in messages:
+        content_items = message.get("content", [])
+        # A turn's content may be a plain string (or None); iterating it would call ``.get`` on
+        # characters and crash, so only list content is scanned.
+        if not isinstance(content_items, list):
+            continue
+        for content in content_items:
+            if not isinstance(content, dict) or content.get("type") != "image_url":
+                continue
+            image_url = content.get("image_url")
+            if not (image_url and "url" in image_url and image_url["url"].startswith(data_url_prefix)):
+                continue
+
+            # Strip only the leading data-URL prefix, then decode before touching the message so a
+            # malformed payload cannot leave the URL pointing at a file that was never written.
+            base64image = image_url["url"][len(data_url_prefix) :]
+            image_data_binary = base64.b64decode(base64image)
+
+            # Generate a unique filename and write the binary image data to it.
+            image_file_name = f"{str(uuid.uuid4())}.jpg"
+            image_file_path = os.path.join(images_folder_path, image_file_name)
+            with open(image_file_path, "wb") as f:
+                f.write(image_data_binary)
+
+            # Replace the base64 URL with the relative file path.
+            image_url["url"] = f"images/{image_file_name}"
+
+
 def _log_metrics_and_instance_results(
     metrics: Dict[str, Any],
     instance_results: pd.DataFrame,
@@ -110,6 +139,15 @@ def _log_metrics_and_instance_results(
         artifact_name = EvalRun.EVALUATION_ARTIFACT if run else EvalRun.EVALUATION_ARTIFACT_DUMMY_RUN
 
         with tempfile.TemporaryDirectory() as tmpdir:
+            # storing multi_modal images if exists
+            col_name = "inputs.conversation"
+            if col_name in instance_results.columns:
+                for item in instance_results[col_name].items():
+                    value = item[1]
+                    if "messages" in value:
+                        _store_multimodal_content(value["messages"], tmpdir)
+
+            # storing artifact result
             tmp_path = os.path.join(tmpdir, artifact_name)
 
             with open(tmp_path, "w", encoding=DefaultOpenEncoding.WRITE) as f:
 
@@ -99,10 +99,10 @@ def __init__(
         self._eval_last_turn = eval_last_turn
         self._parallel = parallel
         self._evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [
-            ViolenceEvaluator(azure_ai_project, credential),
-            SexualEvaluator(azure_ai_project, credential),
-            SelfHarmEvaluator(azure_ai_project, credential),
-            HateUnfairnessEvaluator(azure_ai_project, credential),
+            ViolenceEvaluator(credential, azure_ai_project),
+            SexualEvaluator(credential, azure_ai_project),
+            SelfHarmEvaluator(credential, azure_ai_project),
+            HateUnfairnessEvaluator(credential, azure_ai_project),
         ]
 
     def __call__(self, *, conversation: list, **kwargs):
Original file line number	Diff line number	Diff line change
`@@ -2,5 +2,5 @@`
`2`	`2`	`"AssetsRepo": "Azure/azure-sdk-assets",`
`3`	`3`	`"AssetsRepoPrefixPath": "python",`
`4`	`4`	`"TagPrefix": "python/evaluation/azure-ai-evaluation",`
`5`		`- "Tag": "python/evaluation/azure-ai-evaluation_f0444ef220"`
	`5`	`+ "Tag": "python/evaluation/azure-ai-evaluation_eb4989f81d"`
`6`	`6`	`}`