# Prompt Evaluation #80
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Workflow identity and triggers.
#
# Runs on pushes and pull requests that touch the AI prompt or source
# directories, once a day on a schedule, and on demand via manual dispatch.
name: Prompt Evaluation

on:
  push:
    branches:
      - main
      - develop
    paths:
      - 'services/ai/prompts/**'
      - 'services/ai/src/**'

  pull_request:
    branches:
      - main
    paths:
      - 'services/ai/prompts/**'
      - 'services/ai/src/**'

  schedule:
    # Daily evaluation run at 02:00 UTC.
    - cron: '0 2 * * *'

  # Manual runs may override which prompt versions and scenario set are used;
  # both inputs are optional and fall back to the defaults below.
  workflow_dispatch:
    inputs:
      prompt_versions:
        description: 'Comma-separated prompt versions to evaluate (e.g., v1,v2)'
        required: false
        default: 'v1,v2'
      scenario_set:
        description: 'Scenario set to use (basic, complex, edge-cases)'
        required: false
        default: 'basic'
jobs:
  # Runs the prompt evaluation suite, generates a report, uploads results and
  # reports as an artifact, and posts a summary comment on pull requests.
  evaluate-prompts:
    runs-on: ubuntu-latest

    # The summary step creates a PR comment through the issue-comments API,
    # which needs write access to pull requests; checkout only needs to read
    # the repository. Declared explicitly so the job does not depend on the
    # repository's default GITHUB_TOKEN permissions.
    permissions:
      contents: read
      pull-requests: write

    strategy:
      matrix:
        node-version: [18.x]

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: ${{ matrix.node-version }}
          # NOTE(review): dependencies are installed from services/ai. If the
          # lockfile lives there rather than at the repository root, this
          # cache likely needs `cache-dependency-path` - confirm.
          cache: 'npm'

      - name: Install dependencies
        working-directory: services/ai
        run: npm ci

      - name: Run prompt evaluation
        working-directory: services/ai
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          # Dispatch inputs are only set on manual runs; every other trigger
          # falls back to the defaults.
          THERAPIST_PROMPT_VERSION: ${{ github.event.inputs.prompt_versions || 'v1,v2' }}
          EVALUATION_SCENARIO_SET: ${{ github.event.inputs.scenario_set || 'basic' }}
        run: npm run test:evaluation

      # Report generation and upload run even when the evaluation step fails.
      - name: Generate evaluation report
        if: always()
        working-directory: services/ai
        run: npm run generate-eval-report

      - name: Upload evaluation results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: evaluation-results-${{ github.run_number }}
          path: |
            services/ai/evaluation-results/
            services/ai/eval-reports/
          retention-days: 30

      # NOTE(review): unlike the two steps above, this step has no `always()`,
      # so it is skipped when an earlier step fails - confirm that is intended.
      - name: Comment PR with results
        if: github.event_name == 'pull_request'
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            const path = require('path');

            const resultsPath = path.join('services/ai/evaluation-results', 'latest-results.json');

            // Renders a fraction as a percentage with one decimal place.
            const asPercent = (fraction) => `${(fraction * 100).toFixed(1)}%`;

            // Share of scenarios that succeeded; 'n/a' when no scenarios ran,
            // instead of dividing by zero and printing "NaN%".
            const successRate = ({ successfulResponses, totalScenarios }) =>
              totalScenarios > 0 ? asPercent(successfulResponses / totalScenarios) : 'n/a';

            // Posting the comment is best-effort: any failure is logged and
            // does not fail the job.
            try {
              if (!fs.existsSync(resultsPath)) {
                console.log(`No evaluation results found at ${resultsPath}; skipping PR comment.`);
                return;
              }

              const results = JSON.parse(fs.readFileSync(resultsPath, 'utf8'));

              // Markdown lines of the comment, joined with '\n' at the end.
              const lines = ['## 🤖 Prompt Evaluation Results', ''];

              for (const report of results.reports) {
                lines.push(
                  `### ${report.promptVersion}`,
                  `- **Schema Compliance**: ${asPercent(report.schemaCompliance)}`,
                  `- **Average Quality Score**: ${asPercent(report.averageQualityScore)}`,
                  `- **Average Latency**: ${report.averageLatency.toFixed(0)}ms`,
                  `- **Success Rate**: ${successRate(report)}`,
                  '',
                );
              }

              if (results.comparison) {
                const { winner, scoreDifference, significantDifference } = results.comparison;
                lines.push(
                  `### 🏆 Winner: ${winner}`,
                  `- **Score Difference**: ${asPercent(scoreDifference)}`,
                  `- **Significant Difference**: ${significantDifference ? 'Yes' : 'No'}`,
                  '',
                );
              }

              lines.push(
                '---',
                '*This comment was automatically generated by the prompt evaluation workflow.*',
              );

              // Awaited so that a rejected API call is caught below rather
              // than escaping as an unhandled promise rejection.
              await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: context.issue.number,
                body: lines.join('\n'),
              });
            } catch (error) {
              console.error('Failed to post evaluation results:', error);
            }

  # Runs the schema-compliance tests, then the compliance-threshold check.
  schema-compliance-check:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: 18.x
          cache: 'npm'

      - name: Install dependencies
        working-directory: services/ai
        run: npm ci

      - name: Run schema compliance tests
        working-directory: services/ai
        run: npm run test:schema-compliance

      - name: Check schema compliance threshold
        working-directory: services/ai
        run: npm run check-compliance-threshold

  # Runs the survey correlation analysis and uploads its results. Only runs
  # for scheduled and manually dispatched workflow runs.
  survey-rating-correlation:
    runs-on: ubuntu-latest
    if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: 18.x
          cache: 'npm'

      - name: Install dependencies
        working-directory: services/ai
        run: npm ci

      - name: Run survey correlation analysis
        working-directory: services/ai
        env:
          DATABASE_URL: ${{ secrets.DATABASE_URL }}
        run: npm run analyze-survey-correlation

      - name: Upload correlation results
        uses: actions/upload-artifact@v4
        with:
          name: survey-correlation-${{ github.run_number }}
          path: services/ai/correlation-results/
          retention-days: 30