Skip to content

Add overloads for __call__ methods that accept query/response and conversation #38097

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 19 commits into from
Oct 30, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import os
from typing import Optional

from typing_extensions import override
from typing_extensions import overload, override

from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase

Expand Down Expand Up @@ -49,6 +49,42 @@ def __init__(self, model_config):
prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)

@overload
def __call__(
    self,
    *,
    query: str,
    response: str,
):
    """Evaluate coherence for a given query/response pair.

    :keyword query: The query to be evaluated.
    :paramtype query: str
    :keyword response: The response to be evaluated.
    :paramtype response: str
    :return: The coherence score.
    :rtype: Dict[str, float]
    """
    ...

@overload
def __call__(
    self,
    *,
    conversation,
    **kwargs,
):
    """Evaluate coherence for a conversation.

    :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
        key "messages", and potentially a global context under the key "context". Conversation turns are expected
        to be dictionaries with keys "content", "role", and possibly "context".
    :paramtype conversation: ~azure.ai.evaluation.Conversation
    :return: The coherence score.
    :rtype: Dict[str, Union[float, Dict[str, List[float]]]]
    """
    ...

@override
def __call__(
self,
Expand All @@ -58,19 +94,4 @@ def __call__(
conversation=None,
**kwargs,
):
"""Evaluate coherence. Accepts either a query and response for a single evaluation,
or a conversation for a potentially multi-turn evaluation. If the conversation has more than one pair of
turns, the evaluator will aggregate the results of each turn.

:keyword response: The response to be evaluated.
:paramtype response: Optional[str]
:keyword context: The context to be evaluated.
:paramtype context: Optional[str]
:keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
key "messages". Conversation turns are expected
to be dictionaries with keys "content" and "role".
:paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
:return: The relevance score.
:rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]]
"""
return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from typing import Callable, Dict, List, Optional, Union

from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
from typing_extensions import override
from typing_extensions import overload, override

from azure.ai.evaluation._common._experimental import experimental
from azure.ai.evaluation._evaluators._common import EvaluatorBase
Expand Down Expand Up @@ -78,29 +78,51 @@ def __init__(self, credential, azure_ai_project, eval_last_turn: bool = False, *
HateUnfairnessEvaluator(credential, azure_ai_project),
]

@override
@overload
def __call__(
self,
*,
query: Optional[str] = None,
response: Optional[str] = None,
conversation=None,
**kwargs,
query: str,
response: str,
):
"""Evaluate a collection of content safety metrics for the given query/response pair or conversation.
These inputs must supply either a query AND response, or a conversation, but not both.
"""Evaluate a collection of content safety metrics for the given query/response pair

:keyword query: The query to be evaluated.
:paramtype query: str
:keyword response: The response to be evaluated.
:paramtype response: str
:return: The content safety scores.
:rtype: Dict[str, Union[str, float]]
"""
...

:keyword query: The query to evaluate.
:paramtype query: Optional[str]
:keyword response: The response to evaluate.
:paramtype response: Optional[str]
@overload
def __call__(
self,
*,
conversation,
**kwargs,
):
"""Evaluate a collection of content safety metrics for a conversation

:keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
key "messages", and potentially a global context under the key "context". Conversation turns are expected
to be dictionaries with keys "content", "role", and possibly "context".
:paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
:return: The evaluation result.
:rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]]
:return: The content safety scores.
:rtype: Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]
"""
...

@override
def __call__(
    self,
    *,
    query: Optional[str] = None,
    response: Optional[str] = None,
    conversation=None,
    **kwargs,
):
    """Evaluate a collection of content safety metrics for a query/response pair or a conversation.

    Accepts either ``query`` and ``response`` together, or ``conversation`` (see the
    overloads above for the two supported call shapes); delegates the actual
    evaluation to the base class implementation.

    :keyword query: The query to evaluate. Used together with ``response``.
    :paramtype query: Optional[str]
    :keyword response: The response to evaluate. Used together with ``query``.
    :paramtype response: Optional[str]
    :keyword conversation: The conversation to evaluate, as an alternative to a query/response pair.
    :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
    :return: The content safety scores.
    :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]]
    """
    return super().__call__(query=query, response=response, conversation=conversation, **kwargs)

@override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# ---------------------------------------------------------
from typing import Optional, Union

from typing_extensions import override
from typing_extensions import overload, override

from azure.ai.evaluation._common._experimental import experimental
from azure.ai.evaluation._common.constants import EvaluationMetrics
Expand Down Expand Up @@ -57,28 +57,50 @@ def __init__(
credential=credential,
eval_last_turn=eval_last_turn,
)

@override
@overload
def __call__(
self,
*,
query: Optional[str] = None,
response: Optional[str] = None,
conversation=None,
**kwargs,
query: str,
response: str,
):
"""
Evaluate whether hateful content is present in your AI system's response.

"""Evaluate the given query/response pair for hateful content

:keyword query: The query to be evaluated.
:paramtype query: str
:keyword response: The response to be evaluated.
:paramtype response: str
:return: The hate score
:rtype: Dict[str, Union[str, float]]
"""
...

@overload
def __call__(
self,
*,
conversation,
**kwargs,
):
"""Evaluate a conversation for hateful content

:keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
key "messages". Conversation turns are expected
to be dictionaries with keys "content" and "role".
key "messages", and potentially a global context under the key "context". Conversation turns are expected
to be dictionaries with keys "content", "role", and possibly "context".
:paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
:return: The fluency score.
:rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]]
:return: The hate score
:rtype: Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]
"""
...

@override
def __call__(
    self,
    *,
    query: Optional[str] = None,
    response: Optional[str] = None,
    conversation=None,
    **kwargs,
):
    """Evaluate whether hateful or unfair content is present in the given input.

    Accepts either ``query`` and ``response`` together, or ``conversation`` (see the
    overloads above for the two supported call shapes); delegates the actual
    evaluation to the base class implementation.

    :keyword query: The query to be evaluated. Used together with ``response``.
    :paramtype query: Optional[str]
    :keyword response: The response to be evaluated. Used together with ``query``.
    :paramtype response: Optional[str]
    :keyword conversation: The conversation to evaluate, as an alternative to a query/response pair.
    :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
    :return: The hate score.
    :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]]
    """
    return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# ---------------------------------------------------------
from typing import Optional, Union

from typing_extensions import override
from typing_extensions import overload, override

from azure.ai.evaluation._common._experimental import experimental
from azure.ai.evaluation._common.constants import EvaluationMetrics
Expand Down Expand Up @@ -58,27 +58,49 @@ def __init__(
eval_last_turn=eval_last_turn,
)

@override
@overload
def __call__(
self,
*,
query: Optional[str] = None,
response: Optional[str] = None,
conversation=None,
**kwargs,
query: str,
response: str,
):
"""
Evaluate whether self-harm content is present in your AI system's response.

"""Evaluate a given query/response pair for self-harm content

:keyword query: The query to be evaluated.
:paramtype query: str
:keyword response: The response to be evaluated.
:paramtype response: str
:return: The self-harm score
:rtype: Dict[str, Union[str, float]]
"""
...

@overload
def __call__(
self,
*,
conversation,
**kwargs,
):
"""Evaluate a conversation for self-harm content

:keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
key "messages". Conversation turns are expected
to be dictionaries with keys "content" and "role".
key "messages", and potentially a global context under the key "context". Conversation turns are expected
to be dictionaries with keys "content", "role", and possibly "context".
:paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
:return: The fluency score.
:rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]]
:return: The self-harm score
:rtype: Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]
"""
...

@override
def __call__(
    self,
    *,
    query: Optional[str] = None,
    response: Optional[str] = None,
    conversation=None,
    **kwargs,
):
    """Evaluate whether self-harm content is present in the given input.

    Accepts either ``query`` and ``response`` together, or ``conversation`` (see the
    overloads above for the two supported call shapes); delegates the actual
    evaluation to the base class implementation.

    :keyword query: The query to be evaluated. Used together with ``response``.
    :paramtype query: Optional[str]
    :keyword response: The response to be evaluated. Used together with ``query``.
    :paramtype response: Optional[str]
    :keyword conversation: The conversation to evaluate, as an alternative to a query/response pair.
    :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
    :return: The self-harm score.
    :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]]
    """
    return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# ---------------------------------------------------------
from typing import Optional, Union

from typing_extensions import override
from typing_extensions import overload, override

from azure.ai.evaluation._common._experimental import experimental
from azure.ai.evaluation._common.constants import EvaluationMetrics
Expand Down Expand Up @@ -58,27 +58,49 @@ def __init__(
eval_last_turn=eval_last_turn,
)

@override
@overload
def __call__(
self,
*,
query: Optional[str] = None,
response: Optional[str] = None,
conversation=None,
**kwargs,
query: str,
response: str,
):
"""
Evaluate whether sexual content is present in your AI system's response.

"""Evaluate a given query/response pair for sexual content

:keyword query: The query to be evaluated.
:paramtype query: str
:keyword response: The response to be evaluated.
:paramtype response: str
:return: The sexual score
:rtype: Dict[str, Union[str, float]]
"""
...

@overload
def __call__(
self,
*,
conversation,
**kwargs,
):
"""Evaluate a conversation for sexual content

:keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
key "messages". Conversation turns are expected
to be dictionaries with keys "content" and "role".
key "messages", and potentially a global context under the key "context". Conversation turns are expected
to be dictionaries with keys "content", "role", and possibly "context".
:paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
:return: The fluency score.
:rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]]
:return: The sexual score
:rtype: Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]
"""
...

@override
def __call__(
    self,
    *,
    query: Optional[str] = None,
    response: Optional[str] = None,
    conversation=None,
    **kwargs,
):
    """Evaluate whether sexual content is present in the given input.

    Accepts either ``query`` and ``response`` together, or ``conversation`` (see the
    overloads above for the two supported call shapes); delegates the actual
    evaluation to the base class implementation.

    :keyword query: The query to be evaluated. Used together with ``response``.
    :paramtype query: Optional[str]
    :keyword response: The response to be evaluated. Used together with ``query``.
    :paramtype response: Optional[str]
    :keyword conversation: The conversation to evaluate, as an alternative to a query/response pair.
    :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
    :return: The sexual score.
    :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]]
    """
    return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
Loading
Loading