diff --git a/sdk/evaluation/azure-ai-evaluation/assets.json b/sdk/evaluation/azure-ai-evaluation/assets.json
index 356e246839ab..1e52dc68cc08 100644
--- a/sdk/evaluation/azure-ai-evaluation/assets.json
+++ b/sdk/evaluation/azure-ai-evaluation/assets.json
@@ -2,5 +2,5 @@
   "AssetsRepo": "Azure/azure-sdk-assets",
   "AssetsRepoPrefixPath": "python",
   "TagPrefix": "python/evaluation/azure-ai-evaluation",
-  "Tag": "python/evaluation/azure-ai-evaluation_043418c052"
+  "Tag": "python/evaluation/azure-ai-evaluation_f6269c684c"
 }
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py
index 5ae9ebca6548..f879def1501a 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py
@@ -287,7 +287,13 @@ def _validate_columns_for_evaluators(
             # Ignore the missing fields if "conversation" presents in the input data
             missing_inputs = []
         else:
-            missing_inputs = [col for col in evaluator_params if col not in new_df.columns]
+            optional_params = (
+                evaluator._OPTIONAL_PARAMS  # pylint: disable=protected-access
+                if hasattr(evaluator, "_OPTIONAL_PARAMS")
+                else []
+            )
+            excluded_params = set(new_df.columns).union(optional_params)
+            missing_inputs = [col for col in evaluator_params if col not in excluded_params]

             # If "conversation" is the only parameter and it is missing, keep it in the missing inputs
             # Otherwise, remove it from the missing inputs
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_direct_attack_simulator.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_direct_attack_simulator.py
index 7278ff3be2de..9da9d930acc4 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_direct_attack_simulator.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/simulator/_direct_attack_simulator.py
@@ -179,9 +179,7 @@ async def __call__(
         if not randomization_seed:
             randomization_seed = randint(0, 1000000)

-        regular_sim = AdversarialSimulator(
-            azure_ai_project=self.azure_ai_project, credential=self.credential
-        )
+        regular_sim = AdversarialSimulator(azure_ai_project=self.azure_ai_project, credential=self.credential)
         regular_sim_results = await regular_sim(
             scenario=scenario,
             target=target,
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/conftest.py b/sdk/evaluation/azure-ai-evaluation/tests/conftest.py
index 4c27d1cf0c90..9302f64c161c 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/conftest.py
+++ b/sdk/evaluation/azure-ai-evaluation/tests/conftest.py
@@ -142,11 +142,20 @@ def live_connection_file_values():
         add_general_regex_sanitizer(regex=project_scope["project_name"], value=SanitizedValues.WORKSPACE_NAME)
         add_general_regex_sanitizer(regex=model_config["azure_endpoint"], value=mock_model_config["azure_endpoint"])

+    def promptflow_root_run_id_sanitizer():
+        """Sanitize the promptflow service isolation values."""
+        add_general_regex_sanitizer(
+            value="root_run_id",
+            regex=r'"root_run_id": "azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_[^"]+"',
+            replacement='"root_run_id": "azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_SANITIZED"',
+        )
+
     azure_workspace_triad_sanitizer()
     azureopenai_connection_sanitizer()
     openai_stainless_default_headers()
     azure_ai_generative_sanitizer()
     live_connection_file_values()
+    promptflow_root_run_id_sanitizer()


 @pytest.fixture
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/data/evaluate_test_data_no_query.jsonl b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/data/evaluate_test_data_no_query.jsonl
new file mode 100644
index 000000000000..e807a941f640
--- /dev/null
+++ b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/data/evaluate_test_data_no_query.jsonl
@@ -0,0 +1,3 @@
+{"context":"AML API only","response":"To create a run using the Azure Machine Learning API, you first need to create an Experiment. Once you have an experiment, you can create a Run object that is associated with that experiment. Here is some Python code that demonstrates this process:\n\n```\nfrom azureml.core import Experiment, Run\nfrom azureml.core.workspace import Workspace\n\n# Define workspace and experiment\nws = Workspace.from_config()\nexp = Experiment(workspace=ws, name='my_experiment')\n\n# Create a new run\nrun = exp.start_logging()\n```\n\nIn this code, the `from_config()` method reads the configuration file that you created when you set up your Azure Machine Learning workspace. The `Experiment` constructor creates an Experiment object that is associated with your workspace, and the `start_logging()` method creates a new Run object that is associated with the Experiment. Now you can use the `run` object to log metrics, upload files, and track other information related to your machine learning experiment.", "ground_truth":"Paris is the capital of France."}
+{"context":"Logging can be done using any OSS Sdk","response":"There are a few ways to log models in Azure Machine Learning. \n\nOne way is to use the `register_model()` method of the `Run` object. The `register_model()` method logs a model file in the Azure Machine Learning service workspace and makes it available for deployment. Here's an example:\n\n```python\nfrom azureml.core import Model\n\nmodel_path = '.\/outputs\/my_model.pkl'\nmodel = Model.register(workspace=ws, model_path=model_path, model_name='my_model')\n```\n\nThis code registers the model file located at `model_path` to the Azure Machine Learning service workspace with the name `my_model`. \n\nAnother way to log a model is to save it as an output of a `Run`. If your model generation code is part of a script or Jupyter notebook that runs as an Azure Machine Learning experiment, you can save the model file as an output of the `Run` object. Here's an example:\n\n```python\nfrom sklearn.linear_model import LogisticRegression\nfrom azureml.core.run import Run\n\n# Initialize a run object\nrun = Run.get_context()\n\n# Train your model\nX_train, y_train = ...\nclf = LogisticRegression().fit(X_train, y_train)\n\n# Save the model to the Run object's outputs directory\nmodel_path = 'outputs\/model.pkl'\njoblib.dump(value=clf, filename=model_path)\n\n# Log the model as a run artifact\nrun.upload_file(name=model_path, path_or_stream=model_path)\n```\n\nIn this code, `Run.get_context()` retrieves the current run context object, which you can use to track metadata and metrics for the run. After training your model, you can use `joblib.dump()` to save the model to a file, and then log the file as an artifact of the run using `run.upload_file()`.","ground_truth":"Paris is the capital of France."}
+{"context":"France is in Europe","response":"Paris is the capital of France.", "ground_truth":"Paris is the capital of France."}
\ No newline at end of file
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py
index 7570d7b3845d..625cf021e9d3 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py
+++ b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py
@@ -29,6 +29,12 @@ def data_file():
     return os.path.join(data_path, "evaluate_test_data.jsonl")


+@pytest.fixture
+def data_file_no_query():
+    data_path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data")
+    return os.path.join(data_path, "evaluate_test_data_no_query.jsonl")
+
+
 @pytest.fixture
 def data_convo_file():
     data_path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data")
@@ -725,3 +731,92 @@ def test_evaluate_aggregation(self, data_file, return_json, aggregate_return_jso
     @pytest.mark.skip(reason="TODO: Add test back")
     def test_prompty_with_threadpool_implementation(self):
         pass
+
+    def test_evaluate_with_groundedness_evaluator_with_query(self, model_config, data_file):
+        # data
+        input_data = pd.read_json(data_file, lines=True)
+
+        groundedness_eval = GroundednessEvaluator(model_config)
+
+        # run the evaluation
+        result = evaluate(
+            data=data_file,
+            evaluators={"grounded": groundedness_eval},
+        )
+
+        row_result_df = pd.DataFrame(result["rows"])
+        metrics = result["metrics"]
+
+        # validate the results
+        assert result is not None
+        assert result["rows"] is not None
+        assert row_result_df.shape[0] == len(input_data)
+        assert "outputs.grounded.groundedness" in row_result_df.columns.to_list()
+        assert "grounded.groundedness" in metrics.keys()
+        assert metrics.get("grounded.groundedness") == list_mean_nan_safe(
+            row_result_df["outputs.grounded.groundedness"]
+        )
+        assert row_result_df["outputs.grounded.groundedness"][2] in [3, 4, 5]
+        assert result["studio_url"] is None
+
+    def test_evaluate_with_groundedness_evaluator_without_query(self, model_config, data_file_no_query):
+        # data
+        input_data = pd.read_json(data_file_no_query, lines=True)
+
+        groundedness_eval = GroundednessEvaluator(model_config)
+
+        # run the evaluation
+        result = evaluate(
+            data=data_file_no_query,
+            evaluators={"grounded": groundedness_eval},
+        )
+
+        row_result_df = pd.DataFrame(result["rows"])
+        metrics = result["metrics"]
+
+        # validate the results
+        assert result is not None
+        assert result["rows"] is not None
+        assert row_result_df.shape[0] == len(input_data)
+
+        assert "outputs.grounded.groundedness" in row_result_df.columns.to_list()
+
+        assert "grounded.groundedness" in metrics.keys()
+
+        assert metrics.get("grounded.groundedness") == list_mean_nan_safe(
+            row_result_df["outputs.grounded.groundedness"]
+        )
+
+        assert row_result_df["outputs.grounded.groundedness"][2] in [3, 4, 5]
+        assert result["studio_url"] is None
+
+    def test_evaluate_with_groundedness_evaluator_with_convo(self, model_config, data_convo_file):
+        # data
+        input_data = pd.read_json(data_convo_file, lines=True)
+
+        groundedness_eval = GroundednessEvaluator(model_config)
+
+        # run the evaluation
+        result = evaluate(
+            data=data_convo_file,
+            evaluators={"grounded": groundedness_eval},
+        )
+
+        row_result_df = pd.DataFrame(result["rows"])
+        metrics = result["metrics"]
+
+        # validate the results
+        assert result is not None
+        assert result["rows"] is not None
+        assert row_result_df.shape[0] == len(input_data)
+
+        assert "outputs.grounded.groundedness" in row_result_df.columns.to_list()
+        assert "outputs.grounded.evaluation_per_turn" in row_result_df.columns.to_list()
+        assert "grounded.groundedness" in metrics.keys()
+        assert metrics.get("grounded.groundedness") == list_mean_nan_safe(
+            row_result_df["outputs.grounded.groundedness"]
+        )
+        assert row_result_df["outputs.grounded.groundedness"][1] in [3, 4, 5]
+        assert row_result_df["outputs.grounded.evaluation_per_turn"][0]["groundedness"][0] in [3.0, 4.0, 5.0]
+        assert row_result_df["outputs.grounded.evaluation_per_turn"][0]["groundedness_reason"][0] is not None
+        assert result["studio_url"] is None
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
index 9e26bf9a992b..f8e65f92a3d7 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
@@ -648,3 +648,13 @@ def test_optional_inputs_with_target(self, questions_file, questions_answers_bas
         ) # type: ignore
         assert double_override_results["rows"][0]["outputs.echo.echo_query"] == "new query"
         assert double_override_results["rows"][0]["outputs.echo.echo_response"] == "new response"
+
+    def test_missing_inputs(self, questions_file):
+        """Test we are raising exception if required input is missing in data."""
+        with pytest.raises(EvaluationException) as cm:
+            evaluate(
+                data=questions_file,
+                target=_target_fn,
+                evaluators={"f1": F1ScoreEvaluator()},
+            )
+        assert "Some evaluators are missing required inputs:\n- f1: ['ground_truth']\n\n" in cm.value.args[0]
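
The `_OPTIONAL_PARAMS` handling in `_validate_columns_for_evaluators` is the core of this change: columns an evaluator declares as optional are no longer reported as missing inputs. The following is a minimal, self-contained sketch of that convention, not the SDK's real classes; `EchoEvaluator`, `missing_inputs_for`, and the sample column names are hypothetical and exist only to illustrate the exclusion logic taken from the hunk above.

import inspect


class EchoEvaluator:
    # Hypothetical evaluator; mirrors the _OPTIONAL_PARAMS convention the diff checks for.
    _OPTIONAL_PARAMS = ["query"]

    def __call__(self, *, response: str, query: str = "", **kwargs):
        return {"echo_response": response, "echo_query": query}


def missing_inputs_for(evaluator, data_columns):
    """Re-create the validation logic from the hunk, for illustration only."""
    evaluator_params = [
        param.name
        for param in inspect.signature(evaluator).parameters.values()
        if param.name not in ("self", "args", "kwargs")
    ]
    optional_params = getattr(evaluator, "_OPTIONAL_PARAMS", [])
    excluded_params = set(data_columns).union(optional_params)
    return [col for col in evaluator_params if col not in excluded_params]


if __name__ == "__main__":
    # "query" is optional, so data without a query column passes validation ...
    print(missing_inputs_for(EchoEvaluator(), ["response", "context"]))  # []
    # ... while a genuinely required column is still reported as missing.
    print(missing_inputs_for(EchoEvaluator(), ["context"]))  # ['response']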
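The new e2e tests exercise GroundednessEvaluator both with and without a "query" column, which is the scenario the optional-parameter handling enables. A usage sketch under stated assumptions: the model configuration values below are placeholders, and the data path simply assumes the repository layout shown in this diff.

from azure.ai.evaluation import GroundednessEvaluator, evaluate

# Placeholder Azure OpenAI settings; substitute real values or load them from the environment.
model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "azure_deployment": "<your-deployment>",
    "api_key": "<your-api-key>",
}

groundedness = GroundednessEvaluator(model_config)

# Per-row call without a query: context + response is enough, because "query"
# is listed among this evaluator's optional parameters.
single_row = groundedness(
    context="France is in Europe",
    response="Paris is the capital of France.",
)

# Batch run over the query-less JSONL file added in this change.
result = evaluate(
    data="sdk/evaluation/azure-ai-evaluation/tests/e2etests/data/evaluate_test_data_no_query.jsonl",
    evaluators={"grounded": groundedness},
)
print(single_row)
print(result["metrics"])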