Commit fc1097d: init (0 parents)

6 files changed: +1177 -0 lines

Diff for: .gitignore (+10)

@@ -0,0 +1,10 @@
+# Python-generated files
+__pycache__/
+*.py[oc]
+build/
+dist/
+wheels/
+*.egg-info
+
+# Virtual environments
+.venv

Diff for: .python-version (+1)

@@ -0,0 +1 @@
+3.12

Diff for: README.md

Whitespace-only changes.

Diff for: mcpx_eval.py (+94)

@@ -0,0 +1,94 @@
+import json
+from typing import List
+
+from mcpx_pydantic_ai import BaseModel, Agent, Field
+from mcpx_py import Ollama, ChatConfig
+
+
+class Score(BaseModel):
+    """
+    Used to score the result of an LLM tool call
+    """
+
+    model: str = Field("Name of model being scored")
+    output: str = Field("The literal output of the model being tested")
+    description: str = Field("Description of results for this model")
+    accuracy: float = Field("A score of how accurate the response is")
+    tool_use: float = Field("A score of how appropriate the tool use is")
+    overall: float = Field("An overall qualitative score of the response")
+
+
+class Scores(BaseModel):
+    scores: List[Score] = Field("A list of scores for each model")
+
+
+SYSTEM_PROMPT = """
+You are a large language model evaluator; you are an expert at comparing the output of various models based on
+accuracy, tool use and overall quality of the output.
+
+- All numeric responses should be scored from 0.0 - 100.0, where 100 is the best score and 0 is the worst
+- Additional direction for each evaluation may be marked in the input between <direction></direction> tags
+- The tool use score should be based on whether or not the correct tool was used and whether the minimum amount
+of tools were used to accomplish a task. Overuse of tools or repeated use of tools should deduct points from
+this score.
+- The accuracy score should reflect the accuracy of the result generally, taking into account the <direction> block
+- The overall score should reflect the overall quality of the output
+"""
+
+
+class Judge:
+    def __init__(self, models: List[str] | None = None):
+        if models is None:
+            models = []
+        self.agent = Agent(
+            "claude-3-5-sonnet-latest", result_type=Scores, system_prompt=SYSTEM_PROMPT
+        )
+        self.models = models
+
+    async def run(self, prompt, test) -> Scores:
+        m = []
+
+        for model in self.models:
+            chat = Ollama(
+                ChatConfig(
+                    model=model,
+                    system="Utilize tools when unable to determine a result on your own",
+                )
+            )
+            chat.get_tools()
+            result = {"model": model, "messages": []}
+            async for response in chat.chat(prompt):
+                tool = None
+                if response.tool is not None:
+                    tool = {"name": response.tool.name, "input": response.tool.input}
+                result["messages"].append(
+                    {
+                        "content": response.content,
+                        "role": response.role,
+                        "is_error": response._error or False,
+                        "tool": tool,
+                    }
+                )
+            m.append(result)
+
+        data = json.dumps(m)
+
+        res = await self.agent.run(
+            user_prompt=f"<direction>Analyze the following results for the prompt {prompt}. {test}</direction>\n{data}"
+        )
+        return res.data
+
+
+async def main():
+    judge = Judge(models=["llama3.2", "qwen2.5"])
+    res = await judge.run(
+        "how many images are there on google.com?",
+        "the fetch tool should be used to determine there is only one image on google.com",
+    )
+    print(res)
+
+
+if __name__ == "__main__":
+    import asyncio
+
+    asyncio.run(main())
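
For orientation, here is a minimal usage sketch, not part of this commit. It assumes mcpx_eval.py from the diff above is importable from the repository root and relies only on the Judge class and the Score/Scores fields defined there; the prompt, the test description, and the compare() helper are illustrative assumptions that mirror the fetch-tool example in main().

# Hypothetical usage sketch, not part of commit fc1097d: score two local Ollama
# models on a fetch-tool task and print the fields the judge returns.
import asyncio

from mcpx_eval import Judge  # assumes mcpx_eval.py is on the import path


async def compare():
    # Models and the fetch-style prompt mirror main() above; both are placeholders
    # and can be swapped for whatever models and prompts you want to compare.
    judge = Judge(models=["llama3.2", "qwen2.5"])
    scores = await judge.run(
        "how many links are on the page example.com?",
        "the fetch tool should be used exactly once to retrieve the page",
    )
    # Scores.scores is a list of Score objects with model, output, description,
    # accuracy, tool_use and overall fields (see the class definitions above).
    for score in scores.scores:
        print(f"{score.model}: overall={score.overall} "
              f"accuracy={score.accuracy} tool_use={score.tool_use}")
        print(f"  {score.description}")


if __name__ == "__main__":
    asyncio.run(compare())

With the pyproject.toml below, a sketch like this would typically be run inside the project's uv-managed virtual environment so that mcpx-py and mcpx-pydantic-ai are available.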

Diff for: pyproject.toml (+20)

@@ -0,0 +1,20 @@
+[project]
+name = "mcpx-eval"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.12"
+dependencies = [
+    "mcpx-py>=0.1.1",
+    "mcpx-pydantic-ai",
+]
+
+[tool.uv.sources]
+mcpx-pydantic-ai = { git = "https://github.com/dylibso/mcpx-pydantic-ai" }
+
+[dependency-groups]
+dev = [
+    "python-lsp-ruff>=2.2.2",
+    "python-lsp-server>=1.12.2",
+    "ruff>=0.9.6",
+]

0 commit comments