
Commit 88f13fd

[evaluation] Add support for using evaluate() with evaluators that have missing inputs (#38276) (#38299)
* Update evaluate to allow optional params + add tests
* Record tests
* Exclude optional params from 'missing inputs' warning
* Add tests
* Record tests
* Fix linting errors
* Merge from main and fix linting error
* Add unit test for missing output
* Update convo test to assert per turn results
* Run linting
* Add test data
* Re-record test
1 parent d351824 commit 88f13fd
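
The scenario this commit enables can be sketched with the public azure-ai-evaluation API: GroundednessEvaluator treats "query" as optional, so evaluate() can run over data that has only context/response/ground_truth columns without tripping the "missing inputs" warning. The endpoint, key, deployment, and data path below are placeholders, not part of the commit.

```python
# Minimal sketch of the enabled scenario (illustrative only).
# The model_config values and the data file path are placeholders.
from azure.ai.evaluation import GroundednessEvaluator, evaluate

model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",  # placeholder
    "api_key": "<your-api-key>",  # placeholder
    "azure_deployment": "<your-deployment>",  # placeholder
}

result = evaluate(
    data="evaluate_test_data_no_query.jsonl",  # rows have context/response/ground_truth but no query
    evaluators={"grounded": GroundednessEvaluator(model_config)},
)

# With this change, the optional "query" input is excluded from the missing-inputs
# check, so the run completes and per-row scores land in result["rows"].
print(result["metrics"].get("grounded.groundedness"))
```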

7 files changed (+126 -5)

sdk/evaluation/azure-ai-evaluation/assets.json (+1 -1)

@@ -2,5 +2,5 @@
   "AssetsRepo": "Azure/azure-sdk-assets",
   "AssetsRepoPrefixPath": "python",
   "TagPrefix": "python/evaluation/azure-ai-evaluation",
-  "Tag": "python/evaluation/azure-ai-evaluation_043418c052"
+  "Tag": "python/evaluation/azure-ai-evaluation_f6269c684c"
 }

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py (+7 -1)

@@ -287,7 +287,13 @@ def _validate_columns_for_evaluators(
             # Ignore the missing fields if "conversation" presents in the input data
             missing_inputs = []
         else:
-            missing_inputs = [col for col in evaluator_params if col not in new_df.columns]
+            optional_params = (
+                evaluator._OPTIONAL_PARAMS  # pylint: disable=protected-access
+                if hasattr(evaluator, "_OPTIONAL_PARAMS")
+                else []
+            )
+            excluded_params = set(new_df.columns).union(optional_params)
+            missing_inputs = [col for col in evaluator_params if col not in excluded_params]
 
         # If "conversation" is the only parameter and it is missing, keep it in the missing inputs
         # Otherwise, remove it from the missing inputs
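
Conceptually, the new check builds the set of columns that should not count as missing (the data columns plus any evaluator-declared optional params) before computing missing_inputs. A minimal, self-contained sketch of that logic, using a hypothetical toy evaluator and a small DataFrame in place of the real evaluator and new_df:

```python
# Standalone illustration of the new missing-inputs check; the real logic lives
# in _validate_columns_for_evaluators. ToyGroundednessEvaluator is hypothetical.
import pandas as pd


class ToyGroundednessEvaluator:
    # Hypothetical evaluator: "query" is optional, "response"/"context" are required.
    _OPTIONAL_PARAMS = ["query"]


evaluator = ToyGroundednessEvaluator()
evaluator_params = ["response", "context", "query"]

# Data without a "query" column, like evaluate_test_data_no_query.jsonl.
new_df = pd.DataFrame(
    [{"response": "Paris is the capital of France.", "context": "France is in Europe"}]
)

optional_params = evaluator._OPTIONAL_PARAMS if hasattr(evaluator, "_OPTIONAL_PARAMS") else []
excluded_params = set(new_df.columns).union(optional_params)
missing_inputs = [col for col in evaluator_params if col not in excluded_params]

print(missing_inputs)  # [] -- "query" is optional, so no "missing inputs" warning is raised
```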

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_direct_attack_simulator.py (+1 -3)

@@ -179,9 +179,7 @@ async def __call__(
         if not randomization_seed:
             randomization_seed = randint(0, 1000000)
 
-        regular_sim = AdversarialSimulator(
-            azure_ai_project=self.azure_ai_project, credential=self.credential
-        )
+        regular_sim = AdversarialSimulator(azure_ai_project=self.azure_ai_project, credential=self.credential)
         regular_sim_results = await regular_sim(
             scenario=scenario,
             target=target,

sdk/evaluation/azure-ai-evaluation/tests/conftest.py (+9)

@@ -142,11 +142,20 @@ def live_connection_file_values():
         add_general_regex_sanitizer(regex=project_scope["project_name"], value=SanitizedValues.WORKSPACE_NAME)
         add_general_regex_sanitizer(regex=model_config["azure_endpoint"], value=mock_model_config["azure_endpoint"])
 
+    def promptflow_root_run_id_sanitizer():
+        """Sanitize the promptflow service isolation values."""
+        add_general_regex_sanitizer(
+            value="root_run_id",
+            regex=r'"root_run_id": "azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_[^"]+"',
+            replacement='"root_run_id": "azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_SANITIZED"',
+        )
+
     azure_workspace_triad_sanitizer()
     azureopenai_connection_sanitizer()
     openai_stainless_default_headers()
     azure_ai_generative_sanitizer()
     live_connection_file_values()
+    promptflow_root_run_id_sanitizer()
 
 
 @pytest.fixture
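
For context, the added sanitizer rewrites the variable async-evaluator root_run_id in test recordings to a fixed value. A rough illustration with Python's re module; the recorded payload below is made up:

```python
# Sketch of what the new sanitizer pattern does to a recorded payload; the actual
# substitution is performed by the test proxy via add_general_regex_sanitizer.
import re

recorded = (
    '{"root_run_id": '
    '"azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_20241105_123456_789"}'
)

pattern = r'"root_run_id": "azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_[^"]+"'
replacement = '"root_run_id": "azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_SANITIZED"'

print(re.sub(pattern, replacement, recorded))
# {"root_run_id": "azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_SANITIZED"}
```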
sdk/evaluation/azure-ai-evaluation/tests/e2etests/data/evaluate_test_data_no_query.jsonl (+3)

@@ -0,0 +1,3 @@
+{"context":"AML API only","response":"To create a run using the Azure Machine Learning API, you first need to create an Experiment. Once you have an experiment, you can create a Run object that is associated with that experiment. Here is some Python code that demonstrates this process:\n\n```\nfrom azureml.core import Experiment, Run\nfrom azureml.core.workspace import Workspace\n\n# Define workspace and experiment\nws = Workspace.from_config()\nexp = Experiment(workspace=ws, name='my_experiment')\n\n# Create a new run\nrun = exp.start_logging()\n```\n\nIn this code, the `from_config()` method reads the configuration file that you created when you set up your Azure Machine Learning workspace. The `Experiment` constructor creates an Experiment object that is associated with your workspace, and the `start_logging()` method creates a new Run object that is associated with the Experiment. Now you can use the `run` object to log metrics, upload files, and track other information related to your machine learning experiment.", "ground_truth":"Paris is the capital of France."}
+{"context":"Logging can be done using any OSS Sdk","response":"There are a few ways to log models in Azure Machine Learning. \n\nOne way is to use the `register_model()` method of the `Run` object. The `register_model()` method logs a model file in the Azure Machine Learning service workspace and makes it available for deployment. Here's an example:\n\n```python\nfrom azureml.core import Model\n\nmodel_path = '.\/outputs\/my_model.pkl'\nmodel = Model.register(workspace=ws, model_path=model_path, model_name='my_model')\n```\n\nThis code registers the model file located at `model_path` to the Azure Machine Learning service workspace with the name `my_model`. \n\nAnother way to log a model is to save it as an output of a `Run`. If your model generation code is part of a script or Jupyter notebook that runs as an Azure Machine Learning experiment, you can save the model file as an output of the `Run` object. Here's an example:\n\n```python\nfrom sklearn.linear_model import LogisticRegression\nfrom azureml.core.run import Run\n\n# Initialize a run object\nrun = Run.get_context()\n\n# Train your model\nX_train, y_train = ...\nclf = LogisticRegression().fit(X_train, y_train)\n\n# Save the model to the Run object's outputs directory\nmodel_path = 'outputs\/model.pkl'\njoblib.dump(value=clf, filename=model_path)\n\n# Log the model as a run artifact\nrun.upload_file(name=model_path, path_or_stream=model_path)\n```\n\nIn this code, `Run.get_context()` retrieves the current run context object, which you can use to track metadata and metrics for the run. After training your model, you can use `joblib.dump()` to save the model to a file, and then log the file as an artifact of the run using `run.upload_file()`.","ground_truth":"Paris is the capital of France."}
+{"context":"France is in Europe","response":"Paris is the capital of France.", "ground_truth":"Paris is the capital of France."}

sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py (+95)

@@ -29,6 +29,12 @@ def data_file():
     return os.path.join(data_path, "evaluate_test_data.jsonl")
 
 
+@pytest.fixture
+def data_file_no_query():
+    data_path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data")
+    return os.path.join(data_path, "evaluate_test_data_no_query.jsonl")
+
+
 @pytest.fixture
 def data_convo_file():
     data_path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data")
@@ -725,3 +731,92 @@ def test_evaluate_aggregation(self, data_file, return_json, aggregate_return_jso
     @pytest.mark.skip(reason="TODO: Add test back")
     def test_prompty_with_threadpool_implementation(self):
         pass
+
+    def test_evaluate_with_groundedness_evaluator_with_query(self, model_config, data_file):
+        # data
+        input_data = pd.read_json(data_file, lines=True)
+
+        groundedness_eval = GroundednessEvaluator(model_config)
+
+        # run the evaluation
+        result = evaluate(
+            data=data_file,
+            evaluators={"grounded": groundedness_eval},
+        )
+
+        row_result_df = pd.DataFrame(result["rows"])
+        metrics = result["metrics"]
+
+        # validate the results
+        assert result is not None
+        assert result["rows"] is not None
+        assert row_result_df.shape[0] == len(input_data)
+        assert "outputs.grounded.groundedness" in row_result_df.columns.to_list()
+        assert "grounded.groundedness" in metrics.keys()
+        assert metrics.get("grounded.groundedness") == list_mean_nan_safe(
+            row_result_df["outputs.grounded.groundedness"]
+        )
+        assert row_result_df["outputs.grounded.groundedness"][2] in [3, 4, 5]
+        assert result["studio_url"] is None
+
+    def test_evaluate_with_groundedness_evaluator_without_query(self, model_config, data_file_no_query):
+        # data
+        input_data = pd.read_json(data_file_no_query, lines=True)
+
+        groundedness_eval = GroundednessEvaluator(model_config)
+
+        # run the evaluation
+        result = evaluate(
+            data=data_file_no_query,
+            evaluators={"grounded": groundedness_eval},
+        )
+
+        row_result_df = pd.DataFrame(result["rows"])
+        metrics = result["metrics"]
+
+        # validate the results
+        assert result is not None
+        assert result["rows"] is not None
+        assert row_result_df.shape[0] == len(input_data)
+
+        assert "outputs.grounded.groundedness" in row_result_df.columns.to_list()
+
+        assert "grounded.groundedness" in metrics.keys()
+
+        assert metrics.get("grounded.groundedness") == list_mean_nan_safe(
+            row_result_df["outputs.grounded.groundedness"]
+        )
+
+        assert row_result_df["outputs.grounded.groundedness"][2] in [3, 4, 5]
+        assert result["studio_url"] is None
+
+    def test_evaluate_with_groundedness_evaluator_with_convo(self, model_config, data_convo_file):
+        # data
+        input_data = pd.read_json(data_convo_file, lines=True)
+
+        groundedness_eval = GroundednessEvaluator(model_config)
+
+        # run the evaluation
+        result = evaluate(
+            data=data_convo_file,
+            evaluators={"grounded": groundedness_eval},
+        )
+
+        row_result_df = pd.DataFrame(result["rows"])
+        metrics = result["metrics"]
+
+        # validate the results
+        assert result is not None
+        assert result["rows"] is not None
+        assert row_result_df.shape[0] == len(input_data)
+
+        assert "outputs.grounded.groundedness" in row_result_df.columns.to_list()
+        assert "outputs.grounded.evaluation_per_turn" in row_result_df.columns.to_list()
+        assert "grounded.groundedness" in metrics.keys()
+        assert metrics.get("grounded.groundedness") == list_mean_nan_safe(
+            row_result_df["outputs.grounded.groundedness"]
+        )
+        assert row_result_df["outputs.grounded.groundedness"][1] in [3, 4, 5]
+        assert row_result_df["outputs.grounded.evaluation_per_turn"][0]["groundedness"][0] in [3.0, 4.0, 5.0]
+        assert row_result_df["outputs.grounded.evaluation_per_turn"][0]["groundedness_reason"][0] is not None
+        assert result["studio_url"] is None

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py (+10)

@@ -648,3 +648,13 @@ def test_optional_inputs_with_target(self, questions_file, questions_answers_bas
         )  # type: ignore
         assert double_override_results["rows"][0]["outputs.echo.echo_query"] == "new query"
         assert double_override_results["rows"][0]["outputs.echo.echo_response"] == "new response"
+
+    def test_missing_inputs(self, questions_file):
+        """Test we are raising exception if required input is missing in data."""
+        with pytest.raises(EvaluationException) as cm:
+            evaluate(
+                data=questions_file,
+                target=_target_fn,
+                evaluators={"f1": F1ScoreEvaluator()},
+            )
+        assert "Some evaluators are missing required inputs:\n- f1: ['ground_truth']\n\n" in cm.value.args[0]
