import json
from typing import List

from mcpx_pydantic_ai import BaseModel, Agent, Field
from mcpx_py import Ollama, ChatConfig


class Score(BaseModel):
    """
    Used to score the result of an LLM tool call
    """

    model: str = Field(description="Name of the model being scored")
    output: str = Field(description="The literal output of the model being tested")
    description: str = Field(description="Description of the results for this model")
    accuracy: float = Field(description="A score of how accurate the response is")
    tool_use: float = Field(description="A score of how appropriate the tool use is")
    overall: float = Field(description="An overall qualitative score of the response")


class Scores(BaseModel):
    scores: List[Score] = Field(description="A list of scores for each model")


SYSTEM_PROMPT = """
You are a large language model evaluator, an expert at comparing the output of various models based on
accuracy, tool use, and overall quality of the output.

- All numeric scores should range from 0.0 to 100.0, where 100 is the best score and 0 is the worst
- Additional direction for each evaluation may be marked in the input between <direction></direction> tags
- The tool use score should be based on whether the correct tool was used and whether the minimum number
  of tools was used to accomplish the task. Overuse or repeated use of tools should deduct points from
  this score.
- The accuracy score should reflect the accuracy of the result in general, taking the <direction> block into account
- The overall score should reflect the overall quality of the output
"""


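# Runs each model under test against a prompt, records the full transcript
# (including tool calls), then has a judge model score the results.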
class Judge:
    def __init__(self, models: List[str] | None = None):
        if models is None:
            models = []
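        # The judge itself runs on Claude and must return structured Scores output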
        self.agent = Agent(
            "claude-3-5-sonnet-latest", result_type=Scores, system_prompt=SYSTEM_PROMPT
        )
        self.models = models

    async def run(self, prompt: str, test: str) -> Scores:
        transcripts = []

        for model in self.models:
            chat = Ollama(
                ChatConfig(
                    model=model,
                    system="Utilize tools when unable to determine a result on your own",
                )
            )
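            # Ensure the mcp.run tool list is fetched before starting the chat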
            chat.get_tools()
            result = {"model": model, "messages": []}
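            # Record every response message, including any tool calls,
            # so the judge can review the full transcript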
            async for response in chat.chat(prompt):
                tool = None
                if response.tool is not None:
                    tool = {"name": response.tool.name, "input": response.tool.input}
                result["messages"].append(
                    {
                        "content": response.content,
                        "role": response.role,
                        "is_error": response._error or False,
                        "tool": tool,
                    }
                )
            transcripts.append(result)

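        # Hand the collected transcripts to the judging agent, along with the
        # test direction, and let it produce a structured Scores result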
        data = json.dumps(transcripts)

        res = await self.agent.run(
            user_prompt=f"<direction>Analyze the following results for the prompt {prompt}. {test}</direction>\n{data}"
        )
        return res.data


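# Example: compare two local Ollama models on a prompt that should require the fetch tool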
async def main():
    judge = Judge(models=["llama3.2", "qwen2.5"])
    res = await judge.run(
        "how many images are there on google.com?",
        "the fetch tool should be used to determine there is only one image on google.com",
    )
    print(res)


if __name__ == "__main__":
    import asyncio

    asyncio.run(main())