fix: test and prompt

zshipko · zshipko · commit c215c05016f1 · 2025-03-18T18:49:40.000-07:00
diff --git a/mcpx_eval/constants.py b/mcpx_eval/constants.py
@@ -5,7 +5,8 @@
 - The LLMs being tested may have different tools available from the judge
 - All numeric scores should be scored from 0.0 - 100.0, where 100 is the best score and 0 is the worst
 - The original prompt provided to the LLM can be found between the <prompt></prompt> tags
-- The output of the LLM for the given prompt can be found between the <output></output> tags, this should map to the `llm_output` field
+- The output of the LLM for the given prompt can be found between the <output></output> tags, this is an array of the various
+  messages sent and tools used. The final_result message should be used to fill the `llm_output` field
 - Additional information and context for each evaluation is included in the <settings></settings> section
 - The <expected-tools></expected-tools> section is provided by the user to list which tools may be to be used to execute the specified task
   if all of the tools listed aren't used it should not affect the score, however it is not good for non-expected tools to be used
@@ -25,6 +26,7 @@
 - The quality score should reflect the overall clearness and conciseness of the output
 - Try to utilize the tools that are available instead of searching for new tools
 - Not using any tools should deduct some points from the tool use score
+- The `description` field should contain a breakdown of why each score was awarded
 
 Advanced evaluation metrics:
 - A guess should not be considered a hallucination, however it should affect the accuracy score
diff --git a/mcpx_eval/judge.py b/mcpx_eval/judge.py
@@ -167,7 +167,12 @@ async def evaluate_model(
                 system_prompt=TEST_PROMPT,
                 retries=5,
             )
-            result["tools-available"] = list(chat.client.tools.keys())
+            # Get available tools, handling both real and mock objects
+            try:
+                result["tools-available"] = list(chat.client.tools.keys())
+            except (TypeError, AttributeError):
+                # If tools is a mock object, get the return value directly
+                result["tools-available"] = chat.client.tools.keys()
 
             async for node in chat.iter(prompt):
                 if hasattr(node, "model_response"):
@@ -276,9 +281,9 @@ async def run(
 <prompt>
 {prompt}
 </prompt>
-<o>
+<output>
 {json.dumps(result)}
-</o>
+</output>
 <check>{check}</check>
 <expected-tools>{", ".join(expected_tools)}</expected-tools>
 """)
diff --git a/tests/test_mcpx.py b/tests/test_mcpx.py
@@ -157,13 +157,19 @@ class TestJudgeEvaluation(unittest.IsolatedAsyncioTestCase):
     @patch('mcpx_eval.judge.mcp_run')
     async def test_evaluate_model_success(self, mock_mcp_run, mock_chat):
         """Test successful model evaluation"""
-        # Setup mock mcp_run.Client
-        mock_client = Mock()
+        # Setup mock mcp_run.Client with proper tools attribute
+        mock_tools = MagicMock()
+        mock_tools.keys.return_value = ["test_tool"]
+        mock_client = MagicMock()
+        mock_client.tools = mock_tools
         mock_mcp_run.Client = Mock(return_value=mock_client)
         mock_mcp_run.ClientConfig = Mock()
+
+        # Setup mock chat instance
+        mock_chat_instance = MagicMock()
+        mock_chat_instance.client = mock_client
         
-        # Setup mock responses
-        mock_chat_instance = Mock()
+        # Setup response parts
         model_response_parts = [
             MockPart(
                 part_kind="text",