Commit c215c05
Parent: c443099

fix: test and prompt

3 files changed: +21 -8 lines

Diff for: mcpx_eval/constants.py (+3 -1)

@@ -5,7 +5,8 @@
 - The LLMs being tested may have different tools available from the judge
 - All numeric scores should be scored from 0.0 - 100.0, where 100 is the best score and 0 is the worst
 - The original prompt provided to the LLM can be found between the <prompt></prompt> tags
-- The output of the LLM for the given prompt can be found between the <output></output> tags, this should map to the `llm_output` field
+- The output of the LLM for the given prompt can be found between the <output></output> tags, this is an array of the various
+  messages sent and tools used. The final_result message should be used to fill the `llm_output` field
 - Additional information and context for each evaluation is included in the <settings></settings> section
 - The <expected-tools></expected-tools> section is provided by the user to list which tools may be to be used to execute the specified task
   if all of the tools listed aren't used it should not affect the score, however it is not good for non-expected tools to be used
@@ -25,6 +26,7 @@
 - The quality score should reflect the overall clearness and conciseness of the output
 - Try to utilize the tools that are available instead of searching for new tools
 - Not using any tools should deduct some points from the tool use score
+- The `description` field should contain a breakdown of why each score was awarded
 
 Advanced evaluation metrics:
 - A guess should not be considered a hallucination, however it should affect the accuracy score
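For context on the prompt change above, a minimal sketch of what the data placed between the <output></output> tags might look like. The message keys ("kind", "content", "tool", "args") are illustrative assumptions, not taken from the repository; only the idea of an array of messages whose final_result entry fills `llm_output` comes from the prompt text.

    import json

    # Hypothetical message array; the field names below are assumptions made
    # for illustration. Only the final_result entry is meant to populate the
    # judge's `llm_output` field.
    result = [
        {"kind": "text", "content": "Looking up the requested data..."},
        {"kind": "tool-call", "tool": "fetch_page", "args": {"url": "https://example.com"}},
        {"kind": "final_result", "content": "The page title is 'Example Domain'."},
    ]

    # judge.py serializes this array between the <output></output> tags.
    payload = f"<output>\n{json.dumps(result)}\n</output>"
    print(payload)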

Diff for: mcpx_eval/judge.py (+8 -3)

@@ -167,7 +167,12 @@ async def evaluate_model(
         system_prompt=TEST_PROMPT,
         retries=5,
     )
-    result["tools-available"] = list(chat.client.tools.keys())
+    # Get available tools, handling both real and mock objects
+    try:
+        result["tools-available"] = list(chat.client.tools.keys())
+    except (TypeError, AttributeError):
+        # If tools is a mock object, get the return value directly
+        result["tools-available"] = chat.client.tools.keys()
 
     async for node in chat.iter(prompt):
         if hasattr(node, "model_response"):
@@ -276,9 +281,9 @@ async def run(
     <prompt>
     {prompt}
     </prompt>
-    <o>
+    <output>
     {json.dumps(result)}
-    </o>
+    </output>
     <check>{check}</check>
     <expected-tools>{", ".join(expected_tools)}</expected-tools>
     """)

Diff for: tests/test_mcpx.py (+10 -4)

@@ -157,13 +157,19 @@ class TestJudgeEvaluation(unittest.IsolatedAsyncioTestCase):
     @patch('mcpx_eval.judge.mcp_run')
     async def test_evaluate_model_success(self, mock_mcp_run, mock_chat):
         """Test successful model evaluation"""
-        # Setup mock mcp_run.Client
-        mock_client = Mock()
+        # Setup mock mcp_run.Client with proper tools attribute
+        mock_tools = MagicMock()
+        mock_tools.keys.return_value = ["test_tool"]
+        mock_client = MagicMock()
+        mock_client.tools = mock_tools
         mock_mcp_run.Client = Mock(return_value=mock_client)
         mock_mcp_run.ClientConfig = Mock()
+
+        # Setup mock chat instance
+        mock_chat_instance = MagicMock()
+        mock_chat_instance.client = mock_client
 
-        # Setup mock responses
-        mock_chat_instance = Mock()
+        # Setup response parts
         model_response_parts = [
            MockPart(
                part_kind="text",
