Skip to content

Commit 564c494

Browse files
Introduce comprehensive logging and plotting utilities for model evaluation. These tools enable structured metric tracking to CSV and dynamic visualization of performance through generated plots.
Added newline to end of queries.json
1 parent 5600619 commit 564c494

File tree

2 files changed

+215
-1
lines changed

2 files changed

+215
-1
lines changed

tests/tool_calling/queries.json

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2522,4 +2522,4 @@
25222522
"tool_call": "url_decode"
25232523
}
25242524
]
2525-
}
2525+
}

tests/tool_calling/src/utils.py

Lines changed: 214 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,214 @@
1+
import os
2+
import math
3+
import matplotlib.pyplot as plt
4+
from datetime import datetime
5+
from pathlib import Path
6+
import numpy as np
7+
import logging
8+
import csv
9+
import pandas as pd
10+
11+
12+
def setup_logger(logger_name="tool_test"):
    """Create (or reconfigure) the test logger and return it.

    The logger is set to INFO and gets exactly two handlers:

    * a console ``StreamHandler`` that prints the bare message, and
    * a ``FileHandler`` writing ``logs/<logger_name>.log`` (relative to the
      current working directory) with timestamp, logger name and level.

    Calling this function again with the same name replaces the handlers
    instead of stacking new ones, so messages are never emitted twice by
    this logger's own handlers.

    Args:
        logger_name: Name passed to ``logging.getLogger``; also used as the
            stem of the log file name.

    Returns:
        The configured ``logging.Logger`` instance.
    """
    logger = logging.getLogger(logger_name)

    # Detach AND close handlers left over from a previous call. Merely
    # clearing the list would leave the old log file descriptor open.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    logger.setLevel(logging.INFO)

    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.INFO)
    console_handler.setFormatter(logging.Formatter('%(message)s'))
    logger.addHandler(console_handler)

    log_dir = Path("logs")
    log_dir.mkdir(parents=True, exist_ok=True)
    # Explicit UTF-8 so non-ASCII text in messages is written the same way
    # regardless of the platform's locale encoding.
    file_handler = logging.FileHandler(log_dir / f"{logger_name}.log", encoding="utf-8")
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(
        logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    )
    logger.addHandler(file_handler)

    return logger
35+
36+
37+
def add_metric(
    model: str,
    query_id: str,
    status: str,
    tool_call_match: bool,
    inference_not_empty: bool,
    expected_tool_call: str = "N/A",
    error: str = ""
):
    """Append one evaluation record to ``results/metrics.csv``.

    The ``results`` directory is created on demand. A header row is written
    first whenever the file is missing or has zero length, so a file left
    empty by an interrupted run still ends up with a header.

    Args:
        model: Name of the evaluated model.
        query_id: Identifier of the query that was run.
        status: Outcome label for the run.
        tool_call_match: Whether the produced tool call matched the expected one.
        inference_not_empty: Whether the model returned a non-empty inference.
        expected_tool_call: Expected tool name; defaults to ``"N/A"``.
        error: Error text, if any; defaults to an empty string.
    """
    header = (
        'timestamp',
        'model',
        'query_id',
        'status',
        'tool_call_match',
        'inference_not_empty',
        'expected_tool_call',
        'error',
    )

    results_dir = Path("results")
    results_dir.mkdir(parents=True, exist_ok=True)
    metrics_file = results_dir / "metrics.csv"

    # A zero-byte file counts as "no header yet", not only a missing file.
    needs_header = not metrics_file.exists() or metrics_file.stat().st_size == 0

    # UTF-8 matches the default decoding used by pandas.read_csv, so records
    # containing non-ASCII text can be read back on any platform.
    with open(metrics_file, 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        if needs_header:
            writer.writerow(header)
        writer.writerow([
            datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            model,
            query_id,
            status,
            tool_call_match,
            inference_not_empty,
            expected_tool_call,
            error,
        ])
78+
79+
80+
def get_subplot_grid(n):
    """Return a near-square ``(rows, cols)`` grid with room for ``n`` subplots.

    The column count is the ceiling of ``sqrt(n)`` and the row count is the
    smallest number of rows that fits ``n`` cells with that many columns.

    Args:
        n: Number of subplots to lay out.

    Returns:
        A ``(rows, cols)`` tuple. For ``n <= 0`` the result is ``(0, 0)``:
        there is nothing to lay out, and computing the grid would otherwise
        divide by zero (or take the square root of a negative number).
    """
    if n <= 0:
        return 0, 0
    cols = math.ceil(math.sqrt(n))
    rows = math.ceil(n / cols)
    return rows, cols
85+
86+
87+
def save_plot(fig, filename, dpi=300, bbox_inches='tight'):
    """Write ``fig`` to disk as a JPEG below the ``results/plots/`` folder.

    ``.jpg`` is appended unless ``filename`` already ends in ``.jpg`` or
    ``.jpeg`` (case-insensitive), and every space in the name is replaced by
    an underscore. If the resulting name contains a directory part, that
    directory is created as well. ``dpi`` and ``bbox_inches`` are forwarded
    to ``fig.savefig``. The final path is printed to stdout.
    """
    plots_root = 'results/plots/'
    os.makedirs(plots_root, exist_ok=True)

    has_jpeg_suffix = filename.lower().endswith(('.jpg', '.jpeg'))
    safe_name = (filename if has_jpeg_suffix else filename + '.jpg').replace(" ", "_")

    target = os.path.join(plots_root, safe_name)
    # The name itself may carry a directory component (e.g. it contains "/"),
    # so make sure the folder that will hold the file exists too.
    os.makedirs(os.path.dirname(target), exist_ok=True)

    fig.savefig(target, format='jpeg', dpi=dpi, bbox_inches=bbox_inches)
    print(f"Plot saved as JPEG: '{target}'")
104+
105+
106+
def add_plot(fig, ax, df, column_name, title, save_filename=True):
    """Draw a per-model stacked True/False bar chart on ``ax``.

    For every distinct value of ``df['model']`` (in order of first
    appearance) the rows where ``df[column_name]`` equals True and False are
    counted. True counts form the lower (green) bar segment and False counts
    are stacked on top (gray); each non-empty segment is annotated with its
    count. When ``save_filename`` is truthy the figure is handed to
    ``save_plot`` with ``title`` as the file name.
    """
    model_order = df['model'].unique().tolist()

    def _count_per_model(flag):
        # Rows whose value equals ``flag``, counted per model; models with
        # no such rows are filled in with 0.
        matching = df[df[column_name] == flag]
        return matching.groupby('model')[column_name].count().reindex(model_order, fill_value=0)

    hits = _count_per_model(True)
    misses = _count_per_model(False)

    labels = hits.index.tolist()
    hit_vals = hits.values
    miss_vals = misses.values
    positions = np.arange(len(labels))

    # Stacked bars: True segment at the bottom, False segment above it.
    hit_bars = ax.bar(positions, hit_vals, label='True', color='mediumseagreen')
    miss_bars = ax.bar(positions, miss_vals, bottom=hit_vals, label='False', color='lightgray')

    # Count label for the True segment, anchored one unit below its top edge.
    for segment in hit_bars:
        seg_height = segment.get_height()
        if seg_height > 0:
            centre_x = segment.get_x() + segment.get_width() / 2
            ax.text(centre_x, seg_height - 1, int(seg_height),
                    ha='center', va='bottom', fontsize=8, color='white')

    # Count label for the False segment, centred vertically inside it.
    for segment, base in zip(miss_bars, hit_vals):
        seg_height = segment.get_height()
        if seg_height > 0:
            centre_x = segment.get_x() + segment.get_width() / 2
            ax.text(centre_x, base + seg_height / 2, int(seg_height),
                    ha='center', va='center', fontsize=8, color='black')

    # Axis cosmetics.
    ax.set_xticks(positions)
    ax.set_xticklabels(labels, rotation=45, ha='right')
    ax.set_ylabel('True/False Count')
    ax.set_title(title)
    ax.legend(loc='upper right')

    if save_filename:
        save_plot(fig, title)
140+
141+
142+
def add_per_tool_plot(df, column_name, title):
    """Plot and save a stacked True/False bar chart grouped by expected tool.

    Rows without a real expected tool are ignored: both the literal ``"N/A"``
    sentinel and missing values (NaN). The NaN case matters because
    ``pandas.read_csv`` converts the text ``N/A`` to NaN by default, so a
    plain ``!= 'N/A'`` comparison would let those rows through and break the
    sort of tool names below.

    For each remaining tool (sorted by name) the rows where
    ``df[column_name]`` equals True and False are counted. True counts form
    the lower (green) segment and False counts are stacked on top (gray).
    The figure is written via ``save_plot`` using ``title`` as file name,
    then shown, then closed.

    Args:
        df: DataFrame with an ``expected_tool_call`` column and ``column_name``.
        column_name: Name of the boolean column to summarize.
        title: Chart title, also used as the output file name.

    Returns:
        None. Prints a notice and returns early when no row has a usable
        expected tool.
    """
    expected = df['expected_tool_call']
    df_filtered = df[expected.notna() & (expected != 'N/A')]
    if df_filtered.empty:
        print(f"No data available for plotting '{title}' per tool.")
        return

    # key=str keeps the sort well-defined even if tool names are not all strings.
    tool_names = sorted(df_filtered['expected_tool_call'].unique().tolist(), key=str)

    true_counts = (
        df_filtered[df_filtered[column_name] == True]  # noqa: E712 - element-wise comparison
        .groupby('expected_tool_call')[column_name].count()
        .reindex(tool_names, fill_value=0)
    )
    false_counts = (
        df_filtered[df_filtered[column_name] == False]  # noqa: E712 - element-wise comparison
        .groupby('expected_tool_call')[column_name].count()
        .reindex(tool_names, fill_value=0)
    )

    tools = true_counts.index.tolist()
    true_vals = true_counts.values
    false_vals = false_counts.values
    x = np.arange(len(tools))

    # Widen the figure with the number of tools so tick labels stay legible.
    fig, ax = plt.subplots(figsize=(max(10, len(tools) * 0.8), 6))

    bars_true = ax.bar(x, true_vals, label='True', color='mediumseagreen')
    bars_false = ax.bar(x, false_vals, bottom=true_vals, label='False', color='lightgray')

    # Count label for the True segment, anchored one unit below its top edge.
    for bar in bars_true:
        height = bar.get_height()
        if height > 0:
            ax.text(bar.get_x() + bar.get_width() / 2, height - 1, int(height),
                    ha='center', va='bottom', fontsize=8, color='white')

    # Count label for the False segment, centred vertically inside it.
    for bar, base in zip(bars_false, true_vals):
        height = bar.get_height()
        if height > 0:
            ax.text(bar.get_x() + bar.get_width() / 2, base + height / 2, int(height),
                    ha='center', va='center', fontsize=8, color='black')

    ax.set_xticks(x)
    ax.set_xticklabels(tools, rotation=90, ha='right', fontsize=9)
    ax.set_ylabel('Count')
    ax.set_title(title)
    ax.legend(loc='upper right')

    plt.tight_layout()
    # Save before showing so the file is written even if the interactive
    # window is interrupted, then close the figure so repeated calls do not
    # accumulate open figures.
    save_plot(fig, title)
    plt.show()
    plt.close(fig)
188+
189+
190+
def get_analysis_plots():
    """Load ``./results/metrics.csv`` and generate all analysis plots.

    Produces two per-tool charts (via ``add_per_tool_plot``) and two
    per-model charts (via ``add_plot``), one each for the
    ``tool_call_match`` and ``inference_not_empty`` columns.

    Returns:
        None. A notice is printed and nothing is plotted when the metrics
        file is missing, completely empty, or contains a header but no
        records.
    """
    file_path = './results/metrics.csv'
    try:
        # By default read_csv turns the text "N/A" into NaN, which would
        # defeat the 'N/A' sentinel stored in the expected_tool_call column.
        # Treat only truly empty fields as missing values.
        df = pd.read_csv(file_path, keep_default_na=False, na_values=[''])
    except FileNotFoundError:
        print(f"Metrics file not found at {file_path}. Cannot generate plots.")
        return
    except pd.errors.EmptyDataError:
        print(f"Metrics file at {file_path} is empty. Cannot generate plots.")
        return

    # A header-only file parses fine but has nothing to plot.
    if df.empty:
        print(f"Metrics file at {file_path} has no records. Cannot generate plots.")
        return

    print("\n=== Generating plots ===")

    add_per_tool_plot(df, column_name='tool_call_match', title='Tool Call Match Per Function/Tool')
    add_per_tool_plot(df, column_name='inference_not_empty', title='Inference Not Empty Per Function/Tool')

    overall_plots = (
        ('tool_call_match', 'Overall comparison check of correct tool call'),
        ('inference_not_empty', 'Overall comparison check of inference not empty'),
    )
    for column_name, title in overall_plots:
        fig, ax = plt.subplots(figsize=(8, 6))
        add_plot(fig, ax, df, column_name=column_name, title=title)
        plt.tight_layout()
        plt.show()
        # Release the figure so repeated runs do not accumulate open figures.
        plt.close(fig)

0 commit comments

Comments
 (0)