-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathutils.py
More file actions
83 lines (70 loc) · 3.16 KB
/
utils.py
File metadata and controls
83 lines (70 loc) · 3.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import json
import os
import asyncio
import run_agent
import logging
# NOTE(review): basicConfig(level=DEBUG) runs at import time and configures
# the root logger as a module side effect — confirm this module is an
# application entry point and not a shared library, where this would be
# surprising to importers.
logging.basicConfig(level=logging.DEBUG)
# Module-level logger named after this module, per stdlib convention.
logger = logging.getLogger(__name__)
def get_agent_response(prompt: str) -> dict:
    """Get the agent's response for a given prompt.

    Args:
        prompt: The user's prompt to the agent.

    Returns:
        A dictionary with a single ``"response"`` key holding either the
        agent's reply or a fixed error string when the agent call fails,
        so callers always receive the same shape.
    """
    # Lazy %-style args defer string formatting until the record is emitted.
    logger.info("Getting agent response for prompt: %s", prompt)
    try:
        # run_conversation is a coroutine; drive it to completion here.
        response = asyncio.run(run_agent.run_conversation(prompt))
    except Exception as e:
        # Boundary handler: log the traceback and return a sentinel payload
        # rather than propagating, so the caller never sees a raised error.
        logger.error("Error getting agent response: %s", e, exc_info=True)
        return {"response": "Error: Agent failed to produce a response."}
    logger.info("Successfully got agent response.")
    return {"response": response}
def save_evaluation_results(eval_result, experiment_run):
    """Process and persist the evaluation results for a single run as JSON.

    Args:
        eval_result: The evaluation result object; must expose
            ``summary_metrics`` (a JSON-serializable dict) and
            ``metrics_table`` (a pandas DataFrame supporting
            ``to_dict('records')``).
        experiment_run: The name of the experiment run; used to build the
            output file name.
    """
    logger.info("Saving evaluation results for run: %s", experiment_run)
    os.makedirs("eval_results", exist_ok=True)
    output_file_path = os.path.join(
        "eval_results", f"bq_agent_eval_results_{experiment_run}.json"
    )
    # Flatten the result object into plain dict/list structures for JSON.
    eval_result_dict = {
        'summary_metrics': eval_result.summary_metrics,
        'pointwise_metrics': eval_result.metrics_table.to_dict('records'),
    }
    # --- Save the results as a JSON file ---
    # Explicit utf-8 keeps output portable regardless of the platform default.
    with open(output_file_path, "w", encoding="utf-8") as f:
        json.dump(eval_result_dict, f, indent=4)
    logger.info("Results for run '%s' saved to %s", experiment_run, output_file_path)
def print_evaluation_summary(eval_result):
    """Log a human-readable summary of the evaluation results.

    Args:
        eval_result: The evaluation result object; must expose
            ``summary_metrics`` (dict) and ``metrics_table`` (a pandas
            DataFrame with the per-question metric score columns).
    """
    logger.info("Printing evaluation summary.")
    metrics_df = eval_result.metrics_table

    # Per-run summary metrics, if any were produced.
    summary = eval_result.summary_metrics
    if not summary:
        logger.info("No summary metrics found for this run.")
    else:
        for metric_key, metric_value in summary.items():
            pretty_name = metric_key.replace('/mean', '').replace('_', ' ').title()
            try:
                line = f"- {pretty_name}: {metric_key}: {metric_value:.2f}"
            except (ValueError, TypeError):
                # Non-numeric values cannot take the .2f format spec.
                line = f"- {pretty_name}: {metric_key}: {metric_value}"
            logger.info(line)
    logger.info("\n" + "=" * 50 + "\n")

    # Aggregated pointwise metrics across the whole dataset.
    if metrics_df.empty:
        logger.info("\nNo successful evaluation runs were completed.")
        return
    question_count = len(metrics_df)
    mean_opt_quality = metrics_df['optimization_quality_metric/score'].mean()
    mean_tech_correct = metrics_df['technical_correctness_metric/score'].mean()
    logger.info("--- Aggregated Evaluation Summary ---")
    logger.info(f"Total questions in evaluation dataset: {question_count}")
    logger.info(f"Average Optimization Quality Score: {mean_opt_quality:.2f}")
    logger.info(f"Average Technical Correctness Score: {mean_tech_correct:.2f}")
    logger.info("\n" + "=" * 50 + "\n")