[evaluation] Add support for using evaluate() with evaluators that have missing inputs #38276

Merged · 15 commits · Nov 2, 2024
2 changes: 1 addition & 1 deletion sdk/evaluation/azure-ai-evaluation/assets.json
@@ -2,5 +2,5 @@
"AssetsRepo": "Azure/azure-sdk-assets",
"AssetsRepoPrefixPath": "python",
"TagPrefix": "python/evaluation/azure-ai-evaluation",
"Tag": "python/evaluation/azure-ai-evaluation_043418c052"
"Tag": "python/evaluation/azure-ai-evaluation_f6269c684c"
}
@@ -287,7 +287,13 @@ def _validate_columns_for_evaluators(
# Ignore the missing fields if "conversation" is present in the input data
missing_inputs = []
else:
missing_inputs = [col for col in evaluator_params if col not in new_df.columns]
optional_params = (
evaluator._OPTIONAL_PARAMS # pylint: disable=protected-access
if hasattr(evaluator, "_OPTIONAL_PARAMS")
else []
)
excluded_params = set(new_df.columns).union(optional_params)
missing_inputs = [col for col in evaluator_params if col not in excluded_params]

# If "conversation" is the only parameter and it is missing, keep it in the missing inputs
# Otherwise, remove it from the missing inputs
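For context, a minimal sketch (not from this PR) of how the new check behaves when an evaluator declares the private `_OPTIONAL_PARAMS` attribute: any parameter listed there is excluded from the missing-input computation, so a data file without that column no longer fails validation. The evaluator class below is hypothetical.

```python
# Hypothetical evaluator that marks "query" as optional via _OPTIONAL_PARAMS.
class FakeGroundednessEvaluator:
    _OPTIONAL_PARAMS = ["query"]

    def __call__(self, *, response, context, query=None):
        return {"groundedness": 5}


evaluator = FakeGroundednessEvaluator()
evaluator_params = ["response", "context", "query"]  # inferred from the call signature
data_columns = {"response", "context"}               # the data file has no "query" column

# Same exclusion logic as the change above: data columns plus optional params.
optional_params = getattr(evaluator, "_OPTIONAL_PARAMS", [])
excluded_params = data_columns.union(optional_params)
missing_inputs = [col for col in evaluator_params if col not in excluded_params]

assert missing_inputs == []  # "query" is optional, so validation passes
```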
@@ -179,9 +179,7 @@ async def __call__(
if not randomization_seed:
randomization_seed = randint(0, 1000000)

regular_sim = AdversarialSimulator(
azure_ai_project=self.azure_ai_project, credential=self.credential
)
regular_sim = AdversarialSimulator(azure_ai_project=self.azure_ai_project, credential=self.credential)
regular_sim_results = await regular_sim(
scenario=scenario,
target=target,
9 changes: 9 additions & 0 deletions sdk/evaluation/azure-ai-evaluation/tests/conftest.py
@@ -142,11 +142,20 @@ def live_connection_file_values():
add_general_regex_sanitizer(regex=project_scope["project_name"], value=SanitizedValues.WORKSPACE_NAME)
add_general_regex_sanitizer(regex=model_config["azure_endpoint"], value=mock_model_config["azure_endpoint"])

def promptflow_root_run_id_sanitizer():
"""Sanitize the promptflow service isolation values."""
add_general_regex_sanitizer(
value="root_run_id",
regex=r'"root_run_id": "azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_[^"]+"',
replacement='"root_run_id": "azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_SANITIZED"',
)

azure_workspace_triad_sanitizer()
azureopenai_connection_sanitizer()
openai_stainless_default_headers()
azure_ai_generative_sanitizer()
live_connection_file_values()
promptflow_root_run_id_sanitizer()


@pytest.fixture
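As a quick sanity check (not part of the diff), the new sanitizer's regex can be exercised directly with `re.sub`; the run-id suffix below is made up for illustration.

```python
import re

# Illustration only: the timestamp suffix in this sample run ID is hypothetical.
sample = '"root_run_id": "azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_20241101_123456"'
pattern = r'"root_run_id": "azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_[^"]+"'
replacement = '"root_run_id": "azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_SANITIZED"'

print(re.sub(pattern, replacement, sample))
# -> "root_run_id": "azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_SANITIZED"
```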
@@ -0,0 +1,3 @@
{"context":"AML API only","response":"To create a run using the Azure Machine Learning API, you first need to create an Experiment. Once you have an experiment, you can create a Run object that is associated with that experiment. Here is some Python code that demonstrates this process:\n\n```\nfrom azureml.core import Experiment, Run\nfrom azureml.core.workspace import Workspace\n\n# Define workspace and experiment\nws = Workspace.from_config()\nexp = Experiment(workspace=ws, name='my_experiment')\n\n# Create a new run\nrun = exp.start_logging()\n```\n\nIn this code, the `from_config()` method reads the configuration file that you created when you set up your Azure Machine Learning workspace. The `Experiment` constructor creates an Experiment object that is associated with your workspace, and the `start_logging()` method creates a new Run object that is associated with the Experiment. Now you can use the `run` object to log metrics, upload files, and track other information related to your machine learning experiment.", "ground_truth":"Paris is the capital of France."}
{"context":"Logging can be done using any OSS Sdk","response":"There are a few ways to log models in Azure Machine Learning. \n\nOne way is to use the `register_model()` method of the `Run` object. The `register_model()` method logs a model file in the Azure Machine Learning service workspace and makes it available for deployment. Here's an example:\n\n```python\nfrom azureml.core import Model\n\nmodel_path = '.\/outputs\/my_model.pkl'\nmodel = Model.register(workspace=ws, model_path=model_path, model_name='my_model')\n```\n\nThis code registers the model file located at `model_path` to the Azure Machine Learning service workspace with the name `my_model`. \n\nAnother way to log a model is to save it as an output of a `Run`. If your model generation code is part of a script or Jupyter notebook that runs as an Azure Machine Learning experiment, you can save the model file as an output of the `Run` object. Here's an example:\n\n```python\nfrom sklearn.linear_model import LogisticRegression\nfrom azureml.core.run import Run\n\n# Initialize a run object\nrun = Run.get_context()\n\n# Train your model\nX_train, y_train = ...\nclf = LogisticRegression().fit(X_train, y_train)\n\n# Save the model to the Run object's outputs directory\nmodel_path = 'outputs\/model.pkl'\njoblib.dump(value=clf, filename=model_path)\n\n# Log the model as a run artifact\nrun.upload_file(name=model_path, path_or_stream=model_path)\n```\n\nIn this code, `Run.get_context()` retrieves the current run context object, which you can use to track metadata and metrics for the run. After training your model, you can use `joblib.dump()` to save the model to a file, and then log the file as an artifact of the run using `run.upload_file()`.","ground_truth":"Paris is the capital of France."}
{"context":"France is in Europe","response":"Paris is the capital of France.", "ground_truth":"Paris is the capital of France."}
@@ -29,6 +29,12 @@ def data_file():
return os.path.join(data_path, "evaluate_test_data.jsonl")


@pytest.fixture
def data_file_no_query():
data_path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data")
return os.path.join(data_path, "evaluate_test_data_no_query.jsonl")


@pytest.fixture
def data_convo_file():
data_path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data")
@@ -725,3 +731,92 @@ def test_evaluate_aggregation(self, data_file, return_json, aggregate_return_jso
@pytest.mark.skip(reason="TODO: Add test back")
def test_prompty_with_threadpool_implementation(self):
pass

def test_evaluate_with_groundedness_evaluator_with_query(self, model_config, data_file):
# data
input_data = pd.read_json(data_file, lines=True)

groundedness_eval = GroundednessEvaluator(model_config)

# run the evaluation
result = evaluate(
data=data_file,
evaluators={"grounded": groundedness_eval},
)

row_result_df = pd.DataFrame(result["rows"])
metrics = result["metrics"]

# validate the results
assert result is not None
assert result["rows"] is not None
assert row_result_df.shape[0] == len(input_data)
assert "outputs.grounded.groundedness" in row_result_df.columns.to_list()
assert "grounded.groundedness" in metrics.keys()
assert metrics.get("grounded.groundedness") == list_mean_nan_safe(
row_result_df["outputs.grounded.groundedness"]
)
assert row_result_df["outputs.grounded.groundedness"][2] in [3, 4, 5]
assert result["studio_url"] is None

def test_evaluate_with_groundedness_evaluator_without_query(self, model_config, data_file_no_query):
# data
input_data = pd.read_json(data_file_no_query, lines=True)

groundedness_eval = GroundednessEvaluator(model_config)

# run the evaluation
result = evaluate(
data=data_file_no_query,
evaluators={"grounded": groundedness_eval},
)

row_result_df = pd.DataFrame(result["rows"])
metrics = result["metrics"]

# validate the results
assert result is not None
assert result["rows"] is not None
assert row_result_df.shape[0] == len(input_data)

assert "outputs.grounded.groundedness" in row_result_df.columns.to_list()

assert "grounded.groundedness" in metrics.keys()

assert metrics.get("grounded.groundedness") == list_mean_nan_safe(
row_result_df["outputs.grounded.groundedness"]
)

assert row_result_df["outputs.grounded.groundedness"][2] in [3, 4, 5]
assert result["studio_url"] is None

def test_evaluate_with_groundedness_evaluator_with_convo(self, model_config, data_convo_file):
# data
input_data = pd.read_json(data_convo_file, lines=True)

groundedness_eval = GroundednessEvaluator(model_config)

# run the evaluation
result = evaluate(
data=data_convo_file,
evaluators={"grounded": groundedness_eval},
)

row_result_df = pd.DataFrame(result["rows"])
metrics = result["metrics"]

# validate the results
assert result is not None
assert result["rows"] is not None
assert row_result_df.shape[0] == len(input_data)

assert "outputs.grounded.groundedness" in row_result_df.columns.to_list()
assert "outputs.grounded.evaluation_per_turn" in row_result_df.columns.to_list()
assert "grounded.groundedness" in metrics.keys()
assert metrics.get("grounded.groundedness") == list_mean_nan_safe(
row_result_df["outputs.grounded.groundedness"]
)
assert row_result_df["outputs.grounded.groundedness"][1] in [3, 4, 5]
assert row_result_df["outputs.grounded.evaluation_per_turn"][0]["groundedness"][0] in [3.0, 4.0, 5.0]
assert row_result_df["outputs.grounded.evaluation_per_turn"][0]["groundedness_reason"][0] is not None
assert result["studio_url"] is None
@@ -648,3 +648,13 @@ def test_optional_inputs_with_target(self, questions_file, questions_answers_bas
) # type: ignore
assert double_override_results["rows"][0]["outputs.echo.echo_query"] == "new query"
assert double_override_results["rows"][0]["outputs.echo.echo_response"] == "new response"

def test_missing_inputs(self, questions_file):
"""Test we are raising exception if required input is missing in data."""
with pytest.raises(EvaluationException) as cm:
evaluate(
data=questions_file,
target=_target_fn,
evaluators={"f1": F1ScoreEvaluator()},
)
assert "Some evaluators are missing required inputs:\n- f1: ['ground_truth']\n\n" in cm.value.args[0]