Skip to content

Add overloads for __call__ methods that accept query/response and conversation #38097

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 19 commits into from
Oct 30, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import os
from typing import Optional

from typing_extensions import override
from typing_extensions import overload, override

from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase

Expand Down Expand Up @@ -49,6 +49,42 @@ def __init__(self, model_config):
prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)

@overload
def __call__(
    self,
    *,
    query: str,
    response: str,
):
    """Evaluate coherence for a given query/response pair.

    :keyword query: The query to be evaluated.
    :paramtype query: str
    :keyword response: The response to be evaluated.
    :paramtype response: str
    :return: The coherence score.
    :rtype: Dict[str, float]
    """
    ...

@overload
def __call__(
    self,
    *,
    conversation,
    **kwargs,
):
    """Evaluate coherence for a conversation.

    :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
        key "messages", and potentially a global context under the key "context". Conversation turns are expected
        to be dictionaries with keys "content", "role", and possibly "context".
    :paramtype conversation: ~azure.ai.evaluation.Conversation
    :return: The coherence score.
    :rtype: Dict[str, Union[float, Dict[str, List[float]]]]
    """
    ...

@override
def __call__(
self,
Expand All @@ -58,19 +94,4 @@ def __call__(
conversation=None,
**kwargs,
):
"""Evaluate coherence. Accepts either a query and response for a single evaluation,
or a conversation for a potentially multi-turn evaluation. If the conversation has more than one pair of
turns, the evaluator will aggregate the results of each turn.

:keyword response: The response to be evaluated.
:paramtype response: Optional[str]
:keyword context: The context to be evaluated.
:paramtype context: Optional[str]
:keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
key "messages". Conversation turns are expected
to be dictionaries with keys "content" and "role".
:paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
:return: The relevance score.
:rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]]
"""
return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from typing import Callable, Dict, List, Optional, Union

from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
from typing_extensions import override
from typing_extensions import overload, override

from azure.ai.evaluation._common._experimental import experimental
from azure.ai.evaluation._evaluators._common import EvaluatorBase
Expand Down Expand Up @@ -78,29 +78,51 @@ def __init__(self, credential, azure_ai_project, eval_last_turn: bool = False, *
HateUnfairnessEvaluator(credential, azure_ai_project),
]

@override
@overload
def __call__(
self,
*,
query: Optional[str] = None,
response: Optional[str] = None,
conversation=None,
**kwargs,
query: str,
response: str,
):
"""Evaluate a collection of content safety metrics for the given query/response pair or conversation.
These inputs must supply either a query AND response, or a conversation, but not both.
"""Evaluate a collection of content safety metrics for the given query/response pair

:keyword query: The query to be evaluated.
:paramtype query: str
:keyword response: The response to be evaluated.
:paramtype response: str
:return: The content safety scores.
:rtype: Dict[str, Union[str, float]]
"""
...

:keyword query: The query to evaluate.
:paramtype query: Optional[str]
:keyword response: The response to evaluate.
:paramtype response: Optional[str]
@overload
def __call__(
self,
*,
conversation,
**kwargs,
):
"""Evaluate a collection of content safety metrics for a conversation

:keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
key "messages", and potentially a global context under the key "context". Conversation turns are expected
to be dictionaries with keys "content", "role", and possibly "context".
:paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
:return: The evaluation result.
:rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]]
:return: The content safety scores.
:rtype: Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]
"""
...

@override
def __call__(
    self,
    *,
    query: Optional[str] = None,
    response: Optional[str] = None,
    conversation=None,
    **kwargs,
):
    """Evaluate a collection of content safety metrics for a query/response pair or a conversation.

    Accepts either ``query`` and ``response`` together, or ``conversation`` (see the
    overloads above for the two supported call shapes); delegates the actual
    evaluation to the base class implementation.

    :keyword query: The query to evaluate. Used together with ``response``.
    :paramtype query: Optional[str]
    :keyword response: The response to evaluate. Used together with ``query``.
    :paramtype response: Optional[str]
    :keyword conversation: The conversation to evaluate, as an alternative to a query/response pair.
    :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
    :return: The content safety scores.
    :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]]
    """
    return super().__call__(query=query, response=response, conversation=conversation, **kwargs)

@override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# ---------------------------------------------------------
from typing import Optional, Union

from typing_extensions import override
from typing_extensions import overload, override

from azure.ai.evaluation._common._experimental import experimental
from azure.ai.evaluation._common.constants import EvaluationMetrics
Expand Down Expand Up @@ -57,28 +57,50 @@ def __init__(
credential=credential,
eval_last_turn=eval_last_turn,
)

@override
@overload
def __call__(
self,
*,
query: Optional[str] = None,
response: Optional[str] = None,
conversation=None,
**kwargs,
query: str,
response: str,
):
"""
Evaluate whether hateful content is present in your AI system's response.

"""Evaluate the given query/response pair for hateful content

:keyword query: The query to be evaluated.
:paramtype query: str
:keyword response: The response to be evaluated.
:paramtype response: str
:return: The hate score
:rtype: Dict[str, Union[str, float]]
"""
...

@overload
def __call__(
self,
*,
conversation,
**kwargs,
):
"""Evaluate a conversation for hateful content

:keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
key "messages". Conversation turns are expected
to be dictionaries with keys "content" and "role".
key "messages", and potentially a global context under the key "context". Conversation turns are expected
to be dictionaries with keys "content", "role", and possibly "context".
:paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
:return: The fluency score.
:rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]]
:return: The hate score
:rtype: Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]
"""
...

@override
def __call__(
    self,
    *,
    query: Optional[str] = None,
    response: Optional[str] = None,
    conversation=None,
    **kwargs,
):
    """Evaluate whether hateful or unfair content is present in the given input.

    Accepts either ``query`` and ``response`` together, or ``conversation`` (see the
    overloads above for the two supported call shapes); delegates the actual
    evaluation to the base class implementation.

    :keyword query: The query to be evaluated. Used together with ``response``.
    :paramtype query: Optional[str]
    :keyword response: The response to be evaluated. Used together with ``query``.
    :paramtype response: Optional[str]
    :keyword conversation: The conversation to evaluate, as an alternative to a query/response pair.
    :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
    :return: The hate score.
    :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]]
    """
    return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# ---------------------------------------------------------
from typing import Optional, Union

from typing_extensions import override
from typing_extensions import overload, override

from azure.ai.evaluation._common._experimental import experimental
from azure.ai.evaluation._common.constants import EvaluationMetrics
Expand Down Expand Up @@ -58,27 +58,49 @@ def __init__(
eval_last_turn=eval_last_turn,
)

@override
@overload
def __call__(
self,
*,
query: Optional[str] = None,
response: Optional[str] = None,
conversation=None,
**kwargs,
query: str,
response: str,
):
"""
Evaluate whether self-harm content is present in your AI system's response.

"""Evaluate a given query/response pair for self-harm content

:keyword query: The query to be evaluated.
:paramtype query: str
:keyword response: The response to be evaluated.
:paramtype response: str
:return: The self-harm score
:rtype: Dict[str, Union[str, float]]
"""
...

@overload
def __call__(
self,
*,
conversation,
**kwargs,
):
"""Evaluate a conversation for self-harm content

:keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
key "messages". Conversation turns are expected
to be dictionaries with keys "content" and "role".
key "messages", and potentially a global context under the key "context". Conversation turns are expected
to be dictionaries with keys "content", "role", and possibly "context".
:paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
:return: The fluency score.
:rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]]
:return: The self-harm score
:rtype: Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]
"""
...

@override
def __call__(
    self,
    *,
    query: Optional[str] = None,
    response: Optional[str] = None,
    conversation=None,
    **kwargs,
):
    """Evaluate whether self-harm content is present in the given input.

    Accepts either ``query`` and ``response`` together, or ``conversation`` (see the
    overloads above for the two supported call shapes); delegates the actual
    evaluation to the base class implementation.

    :keyword query: The query to be evaluated. Used together with ``response``.
    :paramtype query: Optional[str]
    :keyword response: The response to be evaluated. Used together with ``query``.
    :paramtype response: Optional[str]
    :keyword conversation: The conversation to evaluate, as an alternative to a query/response pair.
    :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
    :return: The self-harm score.
    :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]]
    """
    return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# ---------------------------------------------------------
from typing import Optional, Union

from typing_extensions import override
from typing_extensions import overload, override

from azure.ai.evaluation._common._experimental import experimental
from azure.ai.evaluation._common.constants import EvaluationMetrics
Expand Down Expand Up @@ -58,27 +58,49 @@ def __init__(
eval_last_turn=eval_last_turn,
)

@override
@overload
def __call__(
self,
*,
query: Optional[str] = None,
response: Optional[str] = None,
conversation=None,
**kwargs,
query: str,
response: str,
):
"""
Evaluate whether sexual content is present in your AI system's response.

"""Evaluate a given query/response pair for sexual content

:keyword query: The query to be evaluated.
:paramtype query: str
:keyword response: The response to be evaluated.
:paramtype response: str
:return: The sexual score
:rtype: Dict[str, Union[str, float]]
"""
...

@overload
def __call__(
self,
*,
conversation,
**kwargs,
):
"""Evaluate a conversation for sexual content

:keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
key "messages". Conversation turns are expected
to be dictionaries with keys "content" and "role".
key "messages", and potentially a global context under the key "context". Conversation turns are expected
to be dictionaries with keys "content", "role", and possibly "context".
:paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
:return: The fluency score.
:rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]]
:return: The sexual score
:rtype: Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]
"""
...

@override
def __call__(
    self,
    *,
    query: Optional[str] = None,
    response: Optional[str] = None,
    conversation=None,
    **kwargs,
):
    """Evaluate whether sexual content is present in the given input.

    Accepts either ``query`` and ``response`` together, or ``conversation`` (see the
    overloads above for the two supported call shapes); delegates the actual
    evaluation to the base class implementation.

    :keyword query: The query to be evaluated. Used together with ``response``.
    :paramtype query: Optional[str]
    :keyword response: The response to be evaluated. Used together with ``query``.
    :paramtype response: Optional[str]
    :keyword conversation: The conversation to evaluate, as an alternative to a query/response pair.
    :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
    :return: The sexual score.
    :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]]
    """
    return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
Loading
Loading