File tree 3 files changed +21
-8
lines changed
3 files changed +21
-8
lines changed Original file line number Diff line number Diff line change 5
5
- The LLMs being tested may have different tools available from the judge
6
6
- All numeric scores should be scored from 0.0 - 100.0, where 100 is the best score and 0 is the worst
7
7
- The original prompt provided to the LLM can be found between the <prompt></prompt> tags
8
- - The output of the LLM for the given prompt can be found between the <output></output> tags, this should map to the `llm_output` field
8
+ - The output of the LLM for the given prompt can be found between the <output></output> tags, this is an array of the various
9
+ messages sent and tools used. The final_result message should be used to fill the `llm_output` field
9
10
- Additional information and context for each evaluation is included in the <settings></settings> section
10
11
- The <expected-tools></expected-tools> section is provided by the user to list which tools may be to be used to execute the specified task
11
12
if all of the tools listed aren't used it should not affect the score, however it is not good for non-expected tools to be used
25
26
- The quality score should reflect the overall clearness and conciseness of the output
26
27
- Try to utilize the tools that are available instead of searching for new tools
27
28
- Not using any tools should deduct some points from the tool use score
29
+ - The `description` field should contain a breakdown of why each score was awarded
28
30
29
31
Advanced evaluation metrics:
30
32
- A guess should not be considered a hallucination, however it should affect the accuracy score
Original file line number Diff line number Diff line change @@ -167,7 +167,12 @@ async def evaluate_model(
167
167
system_prompt = TEST_PROMPT ,
168
168
retries = 5 ,
169
169
)
170
- result ["tools-available" ] = list (chat .client .tools .keys ())
170
+ # Get available tools, handling both real and mock objects
171
+ try :
172
+ result ["tools-available" ] = list (chat .client .tools .keys ())
173
+ except (TypeError , AttributeError ):
174
+ # If tools is a mock object, get the return value directly
175
+ result ["tools-available" ] = chat .client .tools .keys ()
171
176
172
177
async for node in chat .iter (prompt ):
173
178
if hasattr (node , "model_response" ):
@@ -276,9 +281,9 @@ async def run(
276
281
<prompt>
277
282
{ prompt }
278
283
</prompt>
279
- <o >
284
+ <output >
280
285
{ json .dumps (result )}
281
- </o >
286
+ </output >
282
287
<check>{ check } </check>
283
288
<expected-tools>{ ", " .join (expected_tools )} </expected-tools>
284
289
""" )
Original file line number Diff line number Diff line change @@ -157,13 +157,19 @@ class TestJudgeEvaluation(unittest.IsolatedAsyncioTestCase):
157
157
@patch ('mcpx_eval.judge.mcp_run' )
158
158
async def test_evaluate_model_success (self , mock_mcp_run , mock_chat ):
159
159
"""Test successful model evaluation"""
160
- # Setup mock mcp_run.Client
161
- mock_client = Mock ()
160
+ # Setup mock mcp_run.Client with proper tools attribute
161
+ mock_tools = MagicMock ()
162
+ mock_tools .keys .return_value = ["test_tool" ]
163
+ mock_client = MagicMock ()
164
+ mock_client .tools = mock_tools
162
165
mock_mcp_run .Client = Mock (return_value = mock_client )
163
166
mock_mcp_run .ClientConfig = Mock ()
167
+
168
+ # Setup mock chat instance
169
+ mock_chat_instance = MagicMock ()
170
+ mock_chat_instance .client = mock_client
164
171
165
- # Setup mock responses
166
- mock_chat_instance = Mock ()
172
+ # Setup response parts
167
173
model_response_parts = [
168
174
MockPart (
169
175
part_kind = "text" ,
You can’t perform that action at this time.
0 commit comments