25 commits
45422a6
initial
sofiabod Mar 18, 2026
f13c234
add modal launcher for 8xh100 training
sofiabod Mar 18, 2026
7df4c4b
fix md + tests
sofiabod Mar 18, 2026
ae87a91
exp32: BigramHash(2048) + SmearGate + WD=0.04 + momentum=0.99
sofiabod Mar 22, 2026
5369f72
exp33: add sliding window eval (stride=64) for better BPB scoring
sofiabod Mar 22, 2026
d75e6c1
exp34: LN scale depth damping (1/sqrt(layer+1)) for attn/mlp scales
sofiabod Mar 22, 2026
61f6d51
exp35: partial RoPE (16/64 dims) — position-free for 75% of head dims
sofiabod Mar 22, 2026
b224b23
exp38: TTT with AdamW 5ep lr=0.0005, DDP-synced gradients
sofiabod Mar 22, 2026
c2efd2d
exp40: LEGAL score-first TTT + GPTQ-lite + Tight SWA (OOM fix)
sofiabod Mar 23, 2026
fb00173
exp41: adopt full PR #414 SOTA stack with SDPA fallback
sofiabod Mar 23, 2026
8341935
exp42: SDPA only (flash-attn build fails on Modal)
sofiabod Mar 23, 2026
be8b359
exp44: try flash-attn runtime install + SDPA fallback
sofiabod Mar 23, 2026
d127837
exp48: LeakyReLU(0.5)^2 activation — preserves negative gradient flow
sofiabod Mar 23, 2026
65e612a
exp49: cosine pre-eval TTT 30ep + per-layer LR (from PR #481/#486)
sofiabod Mar 23, 2026
1d31b5d
exp50: add legal score-first TTT mode (TTT_MODE=legal)
sofiabod Mar 23, 2026
987b26b
exp52: n-gram cache 4M buckets, single eval pass, fix zero-prob mixing
sofiabod Mar 26, 2026
dcc4f69
exp54: 5-gram fixed alpha=0.2 cache (PR #769 recipe)
sofiabod Mar 26, 2026
14d5771
exp55: truly sequential n-gram (fix chunking stale-count bug)
sofiabod Mar 26, 2026
1960721
exp56: dict-based n-gram cache (zero collisions), fixed alpha=0.05
sofiabod Mar 26, 2026
7928232
exp57: multi-order backoff 2-5 gram dict cache, alpha=0.2
sofiabod Mar 26, 2026
9cd7357
exp58: rewrite n-gram to match PR #753/#769/#779 (dual hash tables, p…
sofiabod Mar 26, 2026
759dfa7
exp59: 9-gram + per-order entropy thresholds + distributed prefill
sofiabod Mar 26, 2026
40eb1ed
exp60: adopt PR #825 full stack (MHA 8/8, MLP 3.5x, XSA-all, BigramHa…
sofiabod Mar 26, 2026
738ffaa
exp61: submission-ready (BigramHash 4096, skip diag evals, int5 QAT)
sofiabod Mar 26, 2026
1a2ac56
Record: Order-Adaptive 9-gram Backoff + Distributed Prefill (mean val…
sofiabod Mar 26, 2026
6 changes: 5 additions & 1 deletion .gitignore
Expand Up @@ -8,4 +8,8 @@ data/manifest.json
data/docs_selected.jsonl
.mypy_cache/
.venv
logs/
logs/
results.tsv
run.log
notes.md
autoresearch-ref/
1 change: 1 addition & 0 deletions autoresearch-ref
Submodule autoresearch-ref added at 32a146
85 changes: 85 additions & 0 deletions modal_train.py
@@ -0,0 +1,85 @@
# modal launcher for parameter-golf autoresearch.
#
# usage:
# modal run modal_train.py
#
# custom env vars:
# modal run modal_train.py --env "ITERATIONS=5000,VAL_LOSS_EVERY=200"

import modal

app = modal.App("parameter-golf")

# base image with deps + cached data + local train_gpt.py mounted
image = (
modal.Image.debian_slim(python_version="3.11")
.pip_install(
"numpy",
"tqdm",
"torch==2.10",
"huggingface-hub",
"setuptools",
"typing-extensions==4.15.0",
"datasets",
"tiktoken",
"sentencepiece",
"zstandard",
)
.apt_install("git")
.run_commands(
"git clone https://github.com/openai/parameter-golf.git /opt/parameter-golf",
"cd /opt/parameter-golf && python3 data/cached_challenge_fineweb.py --variant sp1024 --train-shards 80",
)
# mount local train_gpt.py so agent edits get picked up each run
.add_local_file("train_gpt.py", "/opt/parameter-golf/train_gpt.py")
)


@app.function(
image=image,
gpu="H100:8",
timeout=3600,
)
def train(env_overrides: dict[str, str] | None = None):
    """Run 8xH100 training with torchrun inside the Modal container."""
import os
import subprocess

    # try to install flash-attn at runtime; the build can exceed the timeout,
    # in which case TimeoutExpired is raised and we fall back to SDPA
    try:
        subprocess.run(
            ["pip", "install", "flash-attn", "--no-build-isolation", "-q"],
            capture_output=True, timeout=120,
        )
    except subprocess.TimeoutExpired:
        pass  # train_gpt.py falls back to SDPA when flash-attn is unavailable

os.chdir("/opt/parameter-golf")

env = os.environ.copy()
env.update({
"DATA_PATH": "./data/datasets/fineweb10B_sp1024",
"TOKENIZER_PATH": "./data/tokenizers/fineweb_1024_bpe.model",
"VOCAB_SIZE": "1024",
"RUN_ID": "modal_run",
})
if env_overrides:
env.update(env_overrides)

result = subprocess.run(
["torchrun", "--standalone", "--nproc_per_node=8", "train_gpt.py"],
env=env,
)
return result.returncode


@app.local_entrypoint()
def main(
env: str = "",
):
env_overrides = {}
if env:
for e in env.split(","):
k, v = e.split("=", 1)
env_overrides[k] = v

print("launching 8xh100 training...")
rc = train.remote(env_overrides or None)
print(f"training finished with exit code: {rc}")
150 changes: 150 additions & 0 deletions program.md
@@ -0,0 +1,150 @@
# Autoresearch for Parameter Golf

Autonomous AI research agent for the OpenAI Parameter Golf challenge.

## Setup

To set up a new experiment, work with the user to:

1. **Agree on a run tag**: Propose a tag based on today's date (e.g. `mar18`). The branch `autoresearch/<tag>` must not already exist.
2. **Create the branch**: `git checkout -b autoresearch/<tag>` from current main.
3. **Read the in-scope files**:
- `README.md` — Challenge rules
- `train_gpt.py` — The file you modify. Model, optimizer, training loop.
4. **Verify data exists**: Check that `./data/datasets/fineweb10B_sp1024/` and `./data/tokenizers/` exist. If not, tell the human to run `python3 data/cached_challenge_fineweb.py --variant sp1024 --train-shards 10`.
5. **Initialize results.tsv**: Create with just the header row.
6. **Confirm and go**.

Once you get confirmation, kick off the experimentation.

## Experimentation

Each experiment runs on 8xH100 via Modal. Launch it as:

```
modal run modal_train.py > run.log 2>&1
```

The Modal script mounts your local `train_gpt.py`, so your edits are picked up each run automatically.

**What you CAN do:**
- Modify `train_gpt.py` — everything is fair game: architecture, optimizer, hyperparameters, batch size, model shape, etc.

**What you CANNOT do:**
- **NEVER push to GitHub. NEVER run `git push`. All work stays local.**
- Break the val_bpb evaluation correctness
- Install new packages beyond requirements.txt
- Exceed the 16MB artifact limit (code + int8 zlib-compressed model < 16,000,000 bytes)

**The goal: get the lowest val_bpb.** Current SOTA is 1.2244. The artifact must stay under 16MB.

**The first run**: Always establish the baseline first — run train_gpt.py as-is.

## Output Format

Extract results with: `grep "val_bpb\|final_int8_zlib_roundtrip\|model_params" run.log`

If the grep output is empty, the run crashed or Modal failed. Run `tail -n 50 run.log` to read the error.
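When you want the numbers programmatically rather than eyeballing grep output, a small parser works; this is a sketch, and the exact log-line format (`metric: value`) is an assumption — adjust the pattern to whatever train_gpt.py actually prints:

```python
import re

def extract_metrics(log_text: str) -> dict[str, float]:
    """Pull the last reported value for each tracked metric out of a run log.

    Assumes lines like "val_bpb: 1.2244" (metric name, separator, number);
    the real format emitted by train_gpt.py may differ.
    """
    metrics: dict[str, float] = {}
    pattern = re.compile(r"(val_bpb|final_int8_zlib_roundtrip|model_params)\D+([\d.]+)")
    for line in log_text.splitlines():
        m = pattern.search(line)
        if m:
            # later occurrences overwrite earlier ones, keeping the final value
            metrics[m.group(1)] = float(m.group(2))
    return metrics
```

An empty dict means the same thing as empty grep output: the run crashed before reporting.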

## Reasoning

Before EVERY experiment, you must think and write a reasoning block. No blind changes.

```
=== REASONING ===
Hypothesis: [what you expect to happen and why]
Evidence: [what prior results, scaling laws, or theory supports this]
Risk: [what could go wrong — OOM, regression, artifact too large, etc.]
===
```

After EVERY experiment, you must write an analysis block:

```
=== ANALYSIS ===
Result: val_bpb=X.XXXX artifact=X.XMB (keep/discard/crash)
vs Expected: [better/worse/same than hypothesis predicted]
Why: [your best explanation for the result]
Lesson: [what this tells you about future experiments]
===
```

These blocks are your research log. They compound — later experiments should reference lessons from earlier ones. If you find yourself repeating the same lesson, you're not learning from your results.

## Logging

Log every run to `results.tsv` (tab-separated). Header and 6 columns:

```
commit val_bpb artifact_mb status reasoning description
```

1. Git commit hash (short, 7 chars)
2. val_bpb (use 0.000000 for crashes)
3. Artifact size in MB (use 0.0 for crashes)
4. Status: `keep`, `discard`, or `crash`
5. One-line reasoning (the hypothesis, condensed)
6. Short description of the change

Do not commit results.tsv — leave it untracked.

Additionally, maintain a `notes.md` file (also untracked). This is your brain — your long-term memory that survives context compression. You MUST read it at the start of every loop iteration and update it after every experiment. Structure it as:

```markdown
## Best Known Config
[current best val_bpb, commit hash, what config achieved it]

## Dead Ends (do not revisit)
- [direction] — [why it failed] — [experiments that proved it]

## What Works
- [direction] — [magnitude of improvement] — [experiments that proved it]

## Ideas Queue (ranked by expected value)
1. [next thing to try and why]
2. ...

## Experiment Log
### Experiment N: [description]
[paste your REASONING and ANALYSIS blocks here]
```

This file is what drives your decisions. If you're not reading it, you're flying blind.

## Backtracking

Not every path leads somewhere. Watch for these signals and respond:

- **3+ consecutive discards in the same direction**: That direction is a dead end. Abandon it, note it in notes.md, move on to something completely different.
- **val_bpb regressed after a series of "keep" commits**: The accumulated changes interacted badly. Backtrack:
1. Find the best commit hash from results.tsv
2. `git reset --hard <commit>`
3. Log a row with `status=backtrack` in results.tsv
4. Note in notes.md what went wrong and why
5. Try a different approach from that known-good state
- **Stuck in a plateau (5+ experiments with <0.001 improvement)**: Step back. Re-read train_gpt.py from scratch. Look for something structural you've been overlooking. Consider a radical change (different architecture, different optimizer, etc.).

## The Experiment Loop

LOOP FOREVER:

1. **Review (MANDATORY)**: You MUST read `results.tsv` and `notes.md` before every experiment. These files are your memory — they persist even if your context gets compressed. Run `cat results.tsv` and `cat notes.md` and use them to decide what to do next. Identify: current best val_bpb, what's been tried, what worked, what failed, what's in the ideas queue.
2. **Reason**: Write the REASONING block. No skipping this. Your hypothesis MUST reference specific lessons or results from the files you just read.
3. **Implement**: Modify `train_gpt.py`.
4. **Commit**: `git commit` the change.
5. **Run**: `modal run modal_train.py > run.log 2>&1` (redirect everything — do NOT flood context)
6. **Extract**: `grep "val_bpb\|final_int8_zlib_roundtrip\|model_params" run.log`
7. **Analyze**: Write the ANALYSIS block. No skipping this either.
8. **Log**: Record in results.tsv and append to notes.md.
9. **Decide**:
- val_bpb improved AND artifact < 16MB → **keep** the commit
- val_bpb worse or artifact too large → **discard**: `git reset --hard HEAD~1`
- crash → attempt trivial fix or discard and move on
10. **Check for backtracking signals** (see above).
11. **Loop**.

**Crashes**: If it's a trivial fix (typo, missing import), fix and retry. If fundamentally broken, discard and move on.

**Timeout**: If a run exceeds 15 minutes, kill it and treat as failure.

**NEVER STOP**: Do not pause to ask the human if you should continue. The human might be asleep. You are autonomous. If you run out of ideas, re-read the code, re-analyze results.tsv for patterns, try combining near-misses, try radical changes. Consult notes.md for your ideas queue. The loop runs until the human interrupts you.
@@ -0,0 +1,75 @@
# Record: Order-Adaptive 9-gram Backoff + Distributed Prefill — val_bpb 0.4405 (3-seed mean)

## Results

| Seed | val_bpb | Artifact | Eval time |
|------|---------|----------|-----------|
| 42 | 0.4429 | 14,899,126 bytes | ~586s |
| 1337 | 0.4381 | 14,740,261 bytes | ~588s |
| 2024 | 0.4405 | 15,101,371 bytes | ~502s |
| **Mean** | **0.4405** | | |
| **Std** | **0.0024** | | |

- Artifact: < 16,000,000 bytes (all seeds)
- Train: 600s on 8xH100 SXM
- Eval: < 600s (all seeds)

## Method

11-layer transformer (512d, 8/8 full MHA, XSA-all, LeakyReLU(0.5)², 3.5x MLP).
Order-adaptive entropy-gated 9-gram backoff cache with per-order entropy thresholds
and distributed cache prefill. Score-first, backward-looking, deterministic.

### Architecture
- 11L, 512d, full MHA 8/8, MLP 3.5x (1792), LeakyReLU(0.5)²
- XSA on all 11 layers, partial RoPE 16/64
- BigramHash(4096, 128d), SmearGate, VE128 on layers 9-10
- Tied embeddings, logit softcap 30
- EMA(0.997) + Tight SWA, Parallel Muon optimizer
- int5 per-row quantization + zstd-22 compression
- Early QAT (threshold 0.5)
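The int5 per-row quantization step can be illustrated as follows. This is a simplified sketch of symmetric per-row quantization to the 5-bit signed range; the actual scheme in train_gpt.py, including QAT and zstd-22 packing, is more involved:

```python
import numpy as np

def quantize_int5_per_row(w: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """Symmetric per-row quantization to 5-bit integer codes in [-15, 15].

    Each row gets its own scale, so rows with small weights keep precision.
    Returns (codes, per-row scales); dequantize with codes * scales[:, None].
    """
    scales = np.abs(w).max(axis=1) / 15.0
    scales = np.where(scales == 0, 1.0, scales)  # avoid divide-by-zero on all-zero rows
    codes = np.clip(np.round(w / scales[:, None]), -15, 15).astype(np.int8)
    return codes, scales
```

Per-row (rather than per-tensor) scaling is what keeps the quantization error bounded by half a step of each row's own range.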

### Eval-time N-gram Cache
- Multi-order backoff, orders 2-9, 4M hash buckets per order
- Dual hash tables per order: context counts + full (context+target) counts
- Per-order entropy thresholds: {9: 2.6, 8: 2.8, 7: 3.0, 6: 3.2, 5: 3.5, 4: 3.8, 3: 4.2, 2: 4.5}
- Entropy-adaptive alpha: 0.05 + 0.55 * sigmoid(2.0 * (H - threshold))
- Alpha range [0.05, 0.60]: low entropy = trust neural, high entropy = trust n-gram
- min_count=2, score-first (lookup then update per window)
- Distributed prefill: each rank pre-warms cache with all preceding token positions
- Sliding window eval with stride=32
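The entropy-adaptive alpha above can be written out directly; a minimal sketch of the stated formula:

```python
import math

def adaptive_alpha(entropy: float, threshold: float) -> float:
    """Entropy-adaptive mixing weight for the n-gram cache.

    alpha = 0.05 + 0.55 * sigmoid(2.0 * (H - threshold)), which keeps
    alpha in [0.05, 0.60]: low model entropy trusts the neural
    distribution, high entropy shifts weight to the n-gram cache.
    """
    sigmoid = 1.0 / (1.0 + math.exp(-2.0 * (entropy - threshold)))
    return 0.05 + 0.55 * sigmoid
```

With the per-order thresholds listed above, each order gets its own alpha curve centered on its own threshold.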

### Key Insight
Distributed cache prefill is critical — without it, ranks 1-7 start with cold caches,
losing ~60% of n-gram effectiveness. Prefill makes distributed eval equivalent to
single-GPU sequential eval. Combined with 9-gram orders (capturing longer repeated
phrases) and per-order entropy gating (trusting higher orders at lower uncertainty),
this produces a -0.69 BPB gain over neural-only sliding window eval.

## Legality

- **Score-first n-gram cache**: Each window batch: (1) lookup cache for predictions,
(2) compute blended loss, (3) update cache with window tokens. Cache only uses
backward-looking tokens that have already been scored. No future data access.
- **Alpha depends on model entropy only**: The mixing weight uses the neural model's
output entropy, not the target token. No oracle/hindsight selection.
- **No TTT**: Test-time training is disabled (TTT_EPOCHS=0).
- **No GPTQ at eval time**: Quantization completes within the training budget.
- **No reordering**: Evaluation set processed in original sequential order.
- **Deterministic**: Given the same seed, produces identical results.
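The score-first ordering is the crux of the legality argument, so a toy version may help make the invariant concrete. This sketch uses per-token unigram counts purely for illustration — the real cache uses multi-order hashed context/target counts updated per window:

```python
from collections import Counter

def score_first_unigram(tokens):
    """Toy 1-gram illustration of the score-first protocol.

    For each token: (1) read the cache, (2) record the cache's count for
    that token as the "score", (3) only then add the token to the cache.
    The cache therefore only ever reflects already-scored, backward-looking
    tokens — no future data access.
    """
    cache = Counter()
    scores = []
    for tok in tokens:
        scores.append(cache[tok])  # lookup strictly before update
        cache[tok] += 1
    return scores
```

Note that the first occurrence of every token scores 0: the cache cannot help with anything it has not already scored.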

## Acknowledgments

Huge thanks to the incredible community:

- @abaybektursun (PR #549) — base architecture + Legal TTT + Parallel Muon
- @deanbrr (PR #659, #779) — invented the n-gram eval cache, BackoffNgramMixer
- @Asukabot0 (PR #715, #727) — entropy-adaptive alpha formula
- @Robby955 (PR #796) — distributed cache prefill technique
- @hypery11 (PR #788, #795, #825) — order-adaptive entropy gating, 9-gram extension
- @newjordan (PR #753, #782) — multi-order backoff, per-order alpha scaling
- @travispchen (PR #798) — per-order entropy thresholds
- @gowtham0992 (PR #606) — int5 + QAT
- @signalrush (PR #414) — EMA training recipe
- @thwu1 (PR #180) — mixed quantization, BigramHash, SmearGate
- @raahilshah (PR #162) — int6 quantization foundation
@@ -0,0 +1,11 @@
{
"author": "sofiabod",
"github_id": "sofiabod",
"name": "Order-Adaptive 9-gram Backoff + Distributed Prefill",
"blurb": "9-gram backoff with per-order entropy thresholds and distributed cache prefill on 11L MHA transformer with int5 quantization",
"date": "2026-03-26",
"val_loss": 0.7437,
"val_bpb": 0.4405,
"bytes_total": 14899126,
"bytes_code": 86210
}