# Skip to content
#
# Prompt Evaluation
#
# Prompt Evaluation #80

# Workflow that evaluates the AI service prompts. It is triggered by relevant
# pushes and pull requests, by a daily schedule, and by manual dispatch.
name: Prompt Evaluation

on:
  # Pushes to main or develop that touch the prompt files or AI service sources.
  push:
    branches:
      - main
      - develop
    paths:
      - "services/ai/prompts/**"
      - "services/ai/src/**"

  # Pull requests targeting main that touch the same paths.
  pull_request:
    branches:
      - main
    paths:
      - "services/ai/prompts/**"
      - "services/ai/src/**"

  # Run evaluation daily at 2 AM UTC
  schedule:
    - cron: "0 2 * * *"

  # Manual runs. Both inputs are optional and declare a default value.
  workflow_dispatch:
    inputs:
      prompt_versions:
        description: "Comma-separated prompt versions to evaluate (e.g., v1,v2)"
        required: false
        default: "v1,v2"
      scenario_set:
        description: "Scenario set to use (basic, complex, edge-cases)"
        required: false
        default: "basic"
jobs:
  # Runs the prompt evaluation suite, generates a report, uploads the results
  # as an artifact and, on pull requests, posts a summary comment.
  evaluate-prompts:
    runs-on: ubuntu-latest
    # The PR comment step calls issues.createComment, which needs write access
    # that a read-only default GITHUB_TOKEN does not grant. Checkout only needs
    # read access to the repository contents.
    permissions:
      contents: read
      pull-requests: write
    strategy:
      matrix:
        node-version: ['18.x']
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: ${{ matrix.node-version }}
          # NOTE(review): dependencies are installed from services/ai. If the
          # lockfile lives there rather than at the repository root, this cache
          # setting presumably needs `cache-dependency-path` - confirm.
          cache: 'npm'

      - name: Install dependencies
        run: |
          cd services/ai
          npm ci

      - name: Run prompt evaluation
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          # Manual-dispatch inputs are used when present; every other trigger
          # falls back to the literal defaults on the right of `||`.
          THERAPIST_PROMPT_VERSION: ${{ github.event.inputs.prompt_versions || 'v1,v2' }}
          EVALUATION_SCENARIO_SET: ${{ github.event.inputs.scenario_set || 'basic' }}
        run: |
          cd services/ai
          npm run test:evaluation

      # always(): still produce the report when the evaluation step failed.
      - name: Generate evaluation report
        if: always()
        run: |
          cd services/ai
          npm run generate-eval-report

      # always(): keep whatever results exist even when an earlier step failed.
      - name: Upload evaluation results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: evaluation-results-${{ github.run_number }}
          path: |
            services/ai/evaluation-results/
            services/ai/eval-reports/
          retention-days: 30

      # Best-effort summary comment: a missing results file or a failed API
      # call is logged by the script and does not fail the job.
      - name: Comment PR with results
        if: github.event_name == 'pull_request'
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            const path = require('path');
            try {
              // Read evaluation results
              const resultsPath = path.join('services/ai/evaluation-results', 'latest-results.json');
              if (fs.existsSync(resultsPath)) {
                const results = JSON.parse(fs.readFileSync(resultsPath, 'utf8'));
                // Generate comment
                let comment = '## 🤖 Prompt Evaluation Results\n\n';
                results.reports.forEach(report => {
                  comment += `### ${report.promptVersion}\n`;
                  comment += `- **Schema Compliance**: ${(report.schemaCompliance * 100).toFixed(1)}%\n`;
                  comment += `- **Average Quality Score**: ${(report.averageQualityScore * 100).toFixed(1)}%\n`;
                  comment += `- **Average Latency**: ${report.averageLatency.toFixed(0)}ms\n`;
                  comment += `- **Success Rate**: ${((report.successfulResponses / report.totalScenarios) * 100).toFixed(1)}%\n\n`;
                });
                if (results.comparison) {
                  comment += `### 🏆 Winner: ${results.comparison.winner}\n`;
                  comment += `- **Score Difference**: ${(results.comparison.scoreDifference * 100).toFixed(1)}%\n`;
                  comment += `- **Significant Difference**: ${results.comparison.significantDifference ? 'Yes' : 'No'}\n\n`;
                }
                comment += '---\n*This comment was automatically generated by the prompt evaluation workflow.*';
                // Post comment. The call is awaited so that a rejected request
                // is caught by the surrounding try/catch instead of surfacing
                // as an unhandled promise rejection.
                await github.rest.issues.createComment({
                  issue_number: context.issue.number,
                  owner: context.repo.owner,
                  repo: context.repo.repo,
                  body: comment
                });
              }
            } catch (error) {
              console.error('Failed to post evaluation results:', error);
            }

  # Runs the schema compliance tests, then the compliance threshold check.
  # This job has no `if`, so it runs on every trigger of the workflow.
  schema-compliance-check:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '18.x'
          cache: 'npm'

      - name: Install dependencies
        run: |
          cd services/ai
          npm ci

      - name: Run schema compliance tests
        run: |
          cd services/ai
          npm run test:schema-compliance

      - name: Check schema compliance threshold
        run: |
          cd services/ai
          npm run check-compliance-threshold

  # Survey correlation analysis. It runs only for scheduled and manually
  # dispatched runs, and is the only job that receives DATABASE_URL.
  survey-rating-correlation:
    runs-on: ubuntu-latest
    if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '18.x'
          cache: 'npm'

      - name: Install dependencies
        run: |
          cd services/ai
          npm ci

      - name: Run survey correlation analysis
        env:
          DATABASE_URL: ${{ secrets.DATABASE_URL }}
        run: |
          cd services/ai
          npm run analyze-survey-correlation

      - name: Upload correlation results
        uses: actions/upload-artifact@v4
        with:
          name: survey-correlation-${{ github.run_number }}
          path: services/ai/correlation-results/
          retention-days: 30