-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathutils.py
More file actions
83 lines (70 loc) · 3.16 KB
/
utils.py
File metadata and controls
83 lines (70 loc) · 3.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import json
import os
import asyncio
import run_agent
import logging
# NOTE(review): basicConfig(level=DEBUG) runs at import time and configures
# the root logger as a module side effect — confirm this module is an
# application entry point and not a shared library, where this would be
# surprising to importers.
logging.basicConfig(level=logging.DEBUG)
# Module-level logger named after this module, per stdlib convention.
logger = logging.getLogger(__name__)
def get_agent_response(prompt: str) -> dict:
    """Get the agent's response for a given prompt.

    Args:
        prompt: The user's prompt to the agent.

    Returns:
        A dictionary with a single ``"response"`` key holding either the
        agent's reply or a fixed error string when the agent call fails,
        so callers always receive the same shape.
    """
    # Lazy %-style args defer string formatting until the record is emitted.
    logger.info("Getting agent response for prompt: %s", prompt)
    try:
        # run_conversation is a coroutine; drive it to completion here.
        response = asyncio.run(run_agent.run_conversation(prompt))
    except Exception as e:
        # Boundary handler: log the traceback and return a sentinel payload
        # rather than propagating, so the caller never sees a raised error.
        logger.error("Error getting agent response: %s", e, exc_info=True)
        return {"response": "Error: Agent failed to produce a response."}
    logger.info("Successfully got agent response.")
    return {"response": response}
def save_evaluation_results(eval_result, experiment_run):
    """Process and persist the evaluation results for a single run as JSON.

    Args:
        eval_result: The evaluation result object; must expose
            ``summary_metrics`` (a JSON-serializable dict) and
            ``metrics_table`` (a pandas DataFrame supporting
            ``to_dict('records')``).
        experiment_run: The name of the experiment run; used to build the
            output file name.
    """
    logger.info("Saving evaluation results for run: %s", experiment_run)
    os.makedirs("eval_results", exist_ok=True)
    output_file_path = os.path.join(
        "eval_results", f"bq_agent_eval_results_{experiment_run}.json"
    )
    # Flatten the result object into plain dict/list structures for JSON.
    eval_result_dict = {
        'summary_metrics': eval_result.summary_metrics,
        'pointwise_metrics': eval_result.metrics_table.to_dict('records'),
    }
    # --- Save the results as a JSON file ---
    # Explicit utf-8 keeps output portable regardless of the platform default.
    with open(output_file_path, "w", encoding="utf-8") as f:
        json.dump(eval_result_dict, f, indent=4)
    logger.info("Results for run '%s' saved to %s", experiment_run, output_file_path)
def print_evaluation_summary(eval_result):
    """Log a human-readable summary of the evaluation results.

    Args:
        eval_result: The evaluation result object; must expose
            ``summary_metrics`` (dict) and ``metrics_table`` (a pandas
            DataFrame with the per-question metric score columns).
    """
    logger.info("Printing evaluation summary.")
    metrics_df = eval_result.metrics_table

    # Per-run summary metrics, if any were produced.
    summary = eval_result.summary_metrics
    if not summary:
        logger.info("No summary metrics found for this run.")
    else:
        for metric_key, metric_value in summary.items():
            pretty_name = metric_key.replace('/mean', '').replace('_', ' ').title()
            try:
                line = f"- {pretty_name}: {metric_key}: {metric_value:.2f}"
            except (ValueError, TypeError):
                # Non-numeric values cannot take the .2f format spec.
                line = f"- {pretty_name}: {metric_key}: {metric_value}"
            logger.info(line)
    logger.info("\n" + "=" * 50 + "\n")

    # Aggregated pointwise metrics across the whole dataset.
    if metrics_df.empty:
        logger.info("\nNo successful evaluation runs were completed.")
        return
    question_count = len(metrics_df)
    mean_opt_quality = metrics_df['optimization_quality_metric/score'].mean()
    mean_tech_correct = metrics_df['technical_correctness_metric/score'].mean()
    logger.info("--- Aggregated Evaluation Summary ---")
    logger.info(f"Total questions in evaluation dataset: {question_count}")
    logger.info(f"Average Optimization Quality Score: {mean_opt_quality:.2f}")
    logger.info(f"Average Technical Correctness Score: {mean_tech_correct:.2f}")
    logger.info("\n" + "=" * 50 + "\n")