-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathconfig.example.yaml
More file actions
192 lines (167 loc) · 7.63 KB
/
config.example.yaml
File metadata and controls
192 lines (167 loc) · 7.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
# HiddenBench Configuration File
# ================================
# Copy this file to 'config.yaml' and fill in your API keys.
# IMPORTANT: Never commit config.yaml to version control - it contains secrets!
#
# To use this configuration:
# 1. Copy this file: cp config.example.yaml config.yaml
# 2. Edit config.yaml and add your API keys
# 3. Run: hiddenbench run --config config.yaml
# ============================================================================
# PROVIDER CONFIGURATION
# ============================================================================
# Configure the LLM providers you want to use. You only need to configure
# the providers you plan to use - leave others commented out or remove them.
providers:
  # --------------------------------------------------------------------------
  # Anthropic (Claude models)
  # --------------------------------------------------------------------------
  # Get your API key from: https://console.anthropic.com/
  anthropic:
    api_key: "your-anthropic-api-key-here"
    #
    # Current models (Claude 4.x family):
    #   - claude-opus-4-5-20251101    (Most capable, highest cost)
    #   - claude-sonnet-4-5-20250929  (Best balance of capability and cost)
    #   - claude-haiku-4-5-20251001   (Fastest, lowest cost)
    #
    # Legacy models (still available):
    #   - claude-3-haiku-20240307     (Training cutoff: Aug 2023 - predates HiddenBench paper)
    #
    # Note: Claude 3.5 Sonnet and Claude 3.7 Sonnet have been retired.
    # For reproducing results with a model that predates the HiddenBench paper
    # (published May 2025), use claude-3-haiku-20240307 which has a training
    # data cutoff of August 2023.
    #
    default_model: "claude-sonnet-4-5-20250929"
    # Optional: Override the base URL (for proxies or custom endpoints)
    # base_url: "https://api.anthropic.com"
    # Optional: Request timeout in seconds
    # timeout: 60
    # Token estimation parameters (used to estimate costs before running)
    # These are calibrated for Claude models based on actual usage (~662K in, ~4K out per task)
    est_base_input_tokens: 1500  # Base scenario + system prompt per call
    est_output_tokens: 60        # Average output tokens per response
    est_context_growth: 280      # Tokens added to context per discussion message

  # --------------------------------------------------------------------------
  # OpenAI (GPT models)
  # --------------------------------------------------------------------------
  # Get your API key from: https://platform.openai.com/api-keys
  openai:
    api_key: "your-openai-api-key-here"
    # Available models: gpt-4o, gpt-4o-mini, gpt-4-turbo, gpt-4, gpt-3.5-turbo
    default_model: "gpt-4o"
    # Optional: Organization ID
    # organization: "org-xxx"
    # Optional: Override the base URL (for Azure OpenAI or proxies)
    # base_url: "https://api.openai.com/v1"
    # Optional: Request timeout in seconds
    # timeout: 60
    # Token estimation parameters (adjust based on observed usage)
    est_base_input_tokens: 1500  # GPT models tend to have more compact prompts
    est_output_tokens: 80        # Average output tokens per response
    est_context_growth: 300      # Tokens added per discussion message

  # --------------------------------------------------------------------------
  # Grok (xAI models)
  # --------------------------------------------------------------------------
  # Get your API key from: https://console.x.ai/
  grok:
    api_key: "your-grok-api-key-here"
    # Available models: grok-beta, grok-2
    default_model: "grok-beta"
    # Optional: Override the base URL
    # base_url: "https://api.x.ai/v1"
    # Optional: Request timeout in seconds
    # timeout: 60
    # Token estimation parameters (adjust based on observed usage)
    est_base_input_tokens: 1500  # Base scenario + system prompt per call
    est_output_tokens: 80        # Average output tokens per response
    est_context_growth: 300      # Tokens added per discussion message

  # --------------------------------------------------------------------------
  # Local Llama (via llama-cpp-python)
  # --------------------------------------------------------------------------
  # For running open-weight models locally on your machine.
  # Recommended for Macs with Apple Silicon (M1/M2/M3/M4).
  #
  # Setup instructions:
  #   1. Install with local support: pip install hiddenbench[local]
  #   2. Download a GGUF model file (e.g., from HuggingFace)
  #   3. Set the model_path below to point to your .gguf file
  #
  # Recommended models for Mac:
  #   - Llama 3.2 3B: Good balance of speed and capability
  #   - Llama 3.1 8B: Better reasoning, requires more RAM (~8GB)
  #   - Mistral 7B: Alternative with good performance
  #
  # Download from: https://huggingface.co/models?search=gguf
  local:
    # Path to your GGUF model file
    model_path: "/path/to/your/model.gguf"
    # Number of GPU layers to offload (higher = faster, but uses more VRAM)
    # For Apple Silicon, set to -1 to use all layers on GPU
    n_gpu_layers: -1
    # Context window size (affects memory usage)
    n_ctx: 4096
    # Number of threads to use (0 = auto-detect)
    n_threads: 0
    # Optional: Verbose logging from llama.cpp
    # verbose: false
    # Token estimation (local models vary widely, these are conservative defaults)
    est_base_input_tokens: 1000
    est_output_tokens: 100
    est_context_growth: 250
# ============================================================================
# BENCHMARK CONFIGURATION
# ============================================================================
benchmark:
  # Which provider to use for running the benchmark
  # Options: anthropic, openai, grok, local
  provider: "anthropic"
  # Which model to use (overrides the provider's default_model)
  # Leave commented to use the provider's default
  # model: "claude-sonnet-4-5-20250929"
  # Number of agents in each Hidden Profile scenario (default: 4, as per paper)
  num_agents: 4
  # Number of discussion rounds (default: 15, as per paper)
  num_rounds: 15
  # Temperature for LLM responses (lower = more deterministic)
  temperature: 0.7
  # Maximum tokens per response
  max_tokens: 500

  # --------------------------------------------------------------------------
  # Data Sources
  # --------------------------------------------------------------------------
  # HiddenBench supports two data sources:
  #   1. Official HiddenBench data (65 tasks from the paper)
  #   2. Custom tasks you create

  # Path to official HiddenBench data (downloaded from HuggingFace)
  # Default: ./data/hiddenbench_official
  data_dir: "./data/hiddenbench_official"
  # Path to custom tasks directory
  # Default: ./tasks
  tasks_dir: "./tasks"
  # Which data sources to use
  use_official_data: true  # Use the 65 official HiddenBench tasks
  use_custom_tasks: true   # Also include any custom tasks from tasks_dir
  # Path to output reports directory (default: ./reports)
  reports_dir: "./reports"
  # Whether to run Full Profile baseline (recommended for comparison)
  run_full_profile: true
  # Random seed for reproducibility (optional)
  # seed: 42
# ============================================================================
# ADVANCED OPTIONS
# ============================================================================
advanced:
  # Retry configuration for API calls
  max_retries: 3
  retry_delay: 1.0  # seconds
  # Rate limiting (requests per minute, 0 = no limit)
  rate_limit: 0
  # Logging level: DEBUG, INFO, WARNING, ERROR
  log_level: "INFO"
  # Save intermediate results during benchmark run
  save_intermediate: true
  # Parallel task execution (number of concurrent tasks)
  # Set to 1 for sequential execution
  parallel_tasks: 1