-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathconfig.example.yaml
More file actions
192 lines (167 loc) · 7.63 KB
/
config.example.yaml
File metadata and controls
192 lines (167 loc) · 7.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
# HiddenBench Configuration File
# ================================
# Copy this file to 'config.yaml' and fill in your API keys.
# IMPORTANT: Never commit config.yaml to version control - it contains secrets!
#
# To use this configuration:
# 1. Copy this file: cp config.example.yaml config.yaml
# 2. Edit config.yaml and add your API keys
# 3. Run: hiddenbench run --config config.yaml
# ============================================================================
# PROVIDER CONFIGURATION
# ============================================================================
# Configure the LLM providers you want to use. You only need to configure
# the providers you plan to use - leave others commented out or remove them.
providers:
  # --------------------------------------------------------------------------
  # Anthropic (Claude models)
  # --------------------------------------------------------------------------
  # Get your API key from: https://console.anthropic.com/
  anthropic:
    api_key: "your-anthropic-api-key-here"
    #
    # Current models (Claude 4.x family):
    #   - claude-opus-4-5-20251101    (Most capable, highest cost)
    #   - claude-sonnet-4-5-20250929  (Best balance of capability and cost)
    #   - claude-haiku-4-5-20251001   (Fastest, lowest cost)
    #
    # Legacy models (still available):
    #   - claude-3-haiku-20240307     (Training cutoff: Aug 2023 - predates HiddenBench paper)
    #
    # Note: Claude 3.5 Sonnet and Claude 3.7 Sonnet have been retired.
    # For reproducing results with a model that predates the HiddenBench paper
    # (published May 2025), use claude-3-haiku-20240307 which has a training
    # data cutoff of August 2023.
    #
    default_model: "claude-sonnet-4-5-20250929"
    # Optional: Override the base URL (for proxies or custom endpoints)
    # base_url: "https://api.anthropic.com"
    # Optional: Request timeout in seconds
    # timeout: 60
    # Token estimation parameters (used to estimate costs before running)
    # These are calibrated for Claude models based on actual usage (~662K in, ~4K out per task)
    est_base_input_tokens: 1500  # Base scenario + system prompt per call
    est_output_tokens: 60        # Average output tokens per response
    est_context_growth: 280      # Tokens added to context per discussion message

  # --------------------------------------------------------------------------
  # OpenAI (GPT models)
  # --------------------------------------------------------------------------
  # Get your API key from: https://platform.openai.com/api-keys
  openai:
    api_key: "your-openai-api-key-here"
    # Available models: gpt-4o, gpt-4o-mini, gpt-4-turbo, gpt-4, gpt-3.5-turbo
    default_model: "gpt-4o"
    # Optional: Organization ID
    # organization: "org-xxx"
    # Optional: Override the base URL (for Azure OpenAI or proxies)
    # base_url: "https://api.openai.com/v1"
    # Optional: Request timeout in seconds
    # timeout: 60
    # Token estimation parameters (adjust based on observed usage)
    est_base_input_tokens: 1500  # GPT models tend to have more compact prompts
    est_output_tokens: 80        # Average output tokens per response
    est_context_growth: 300      # Tokens added per discussion message

  # --------------------------------------------------------------------------
  # Grok (xAI models)
  # --------------------------------------------------------------------------
  # Get your API key from: https://console.x.ai/
  grok:
    api_key: "your-grok-api-key-here"
    # Available models: grok-beta, grok-2
    default_model: "grok-beta"
    # Optional: Override the base URL
    # base_url: "https://api.x.ai/v1"
    # Optional: Request timeout in seconds
    # timeout: 60
    # Token estimation parameters (adjust based on observed usage)
    est_base_input_tokens: 1500  # Base scenario + system prompt per call
    est_output_tokens: 80        # Average output tokens per response
    est_context_growth: 300      # Tokens added per discussion message

  # --------------------------------------------------------------------------
  # Local Llama (via llama-cpp-python)
  # --------------------------------------------------------------------------
  # For running open-weight models locally on your machine.
  # Recommended for Macs with Apple Silicon (M1/M2/M3/M4).
  #
  # Setup instructions:
  #   1. Install with local support: pip install hiddenbench[local]
  #   2. Download a GGUF model file (e.g., from HuggingFace)
  #   3. Set the model_path below to point to your .gguf file
  #
  # Recommended models for Mac:
  #   - Llama 3.2 3B: Good balance of speed and capability
  #   - Llama 3.1 8B: Better reasoning, requires more RAM (~8GB)
  #   - Mistral 7B: Alternative with good performance
  #
  # Download from: https://huggingface.co/models?search=gguf
  local:
    # Path to your GGUF model file
    model_path: "/path/to/your/model.gguf"
    # Number of GPU layers to offload (higher = faster, but uses more VRAM)
    # For Apple Silicon, set to -1 to use all layers on GPU
    n_gpu_layers: -1
    # Context window size (affects memory usage)
    n_ctx: 4096
    # Number of threads to use (0 = auto-detect)
    n_threads: 0
    # Optional: Verbose logging from llama.cpp
    # verbose: false
    # Token estimation (local models vary widely, these are conservative defaults)
    est_base_input_tokens: 1000
    est_output_tokens: 100
    est_context_growth: 250
# ============================================================================
# BENCHMARK CONFIGURATION
# ============================================================================
benchmark:
  # Which provider to use for running the benchmark
  # Options: anthropic, openai, grok, local
  provider: "anthropic"
  # Which model to use (overrides the provider's default_model)
  # Leave commented to use the provider's default
  # model: "claude-sonnet-4-5-20250929"
  # Number of agents in each Hidden Profile scenario (default: 4, as per paper)
  num_agents: 4
  # Number of discussion rounds (default: 15, as per paper)
  num_rounds: 15
  # Temperature for LLM responses (lower = more deterministic)
  temperature: 0.7
  # Maximum tokens per response
  max_tokens: 500

  # --------------------------------------------------------------------------
  # Data Sources
  # --------------------------------------------------------------------------
  # HiddenBench supports two data sources:
  #   1. Official HiddenBench data (65 tasks from the paper)
  #   2. Custom tasks you create

  # Path to official HiddenBench data (downloaded from HuggingFace)
  # Default: ./data/hiddenbench_official
  data_dir: "./data/hiddenbench_official"
  # Path to custom tasks directory
  # Default: ./tasks
  tasks_dir: "./tasks"
  # Which data sources to use
  use_official_data: true  # Use the 65 official HiddenBench tasks
  use_custom_tasks: true   # Also include any custom tasks from tasks_dir
  # Path to output reports directory (default: ./reports)
  reports_dir: "./reports"
  # Whether to run Full Profile baseline (recommended for comparison)
  run_full_profile: true
  # Random seed for reproducibility (optional)
  # seed: 42
# ============================================================================
# ADVANCED OPTIONS
# ============================================================================
advanced:
  # Retry configuration for API calls
  max_retries: 3
  retry_delay: 1.0  # seconds
  # Rate limiting (requests per minute, 0 = no limit)
  rate_limit: 0
  # Logging level: DEBUG, INFO, WARNING, ERROR
  log_level: "INFO"
  # Save intermediate results during benchmark run
  save_intermediate: true
  # Parallel task execution (number of concurrent tasks)
  # Set to 1 for sequential execution
  parallel_tasks: 1