Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
133 changes: 133 additions & 0 deletions Shrink/shrink.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import ast
import subprocess
import lzma
import base64
import os
import sys

class DeadCodeRemover(ast.NodeTransformer):
    """AST pass that deletes `if` statements whose test is a known-dead flag.

    Only affirmative tests are listed below — negated guards such as
    "not args.load_snapshot" wrap the training loop (always True with the
    default config) and must never be touched.
    """

    # Conditions that are always False under the default competition config,
    # so their bodies never execute.
    _ALWAYS_FALSE = frozenset({
        "args.load_snapshot",          # snapshot restore (default="" -> falsy)
        "args.snapshot_post_hessian",  # snapshot save (default=False)
    })

    def visit_If(self, node):
        test_src = ast.unparse(node.test)
        if test_src in self._ALWAYS_FALSE and not node.orelse:
            # Dead branch with no else/elif attached: safe to drop entirely.
            return None
        # Either a live condition, or a dead one whose else/elif branch must
        # be preserved — keep the statement and recurse into its children.
        self.generic_visit(node)
        return node

def shrink_pipeline(input_file, output_file):
    """Shrink a Python source file into a self-extracting compressed script.

    Stages:
      1. AST dead-code elimination via DeadCodeRemover.
      2. Identifier/whitespace/comment minification with pyminify (run
         through `uvx` so python-minifier need not be pre-installed).
      3. Raw-LZMA2 compression, base85-encoded into a tiny exec() stub.

    Args:
        input_file: Path of the Python source to shrink.
        output_file: Path where the packed ASCII script is written.

    Returns None. On any failure a diagnostic is printed and the function
    returns early; temporary files are always cleaned up.
    """
    print(f"[*] Starting shrinking pipeline for: {input_file}")

    if not os.path.exists(input_file):
        print(f"[!] Error: Input file '{input_file}' not found.")
        return

    # 1. Read Original Source
    with open(input_file, 'r', encoding='utf-8') as f:
        source = f.read()
    print(f"[*] Original size: {len(source)} bytes")

    # BUGFIX: an empty input would divide by zero in the reduction report.
    if not source:
        print(f"[!] Error: Input file '{input_file}' is empty.")
        return

    # 2. Prune AST (Dead Code Elimination)
    print(f"[*] Pruning dead code and evaluation logic from AST...")
    tree = ast.parse(source)
    transformer = DeadCodeRemover()
    tree = transformer.visit(tree)
    ast.fix_missing_locations(tree)

    pruned_source = ast.unparse(tree)
    temp_pruned_file = input_file + ".pruned.tmp.py"
    temp_minified_file = input_file + ".min.tmp.py"

    def _cleanup_temps():
        # Best-effort removal of both temp files; safe if they never existed.
        for f in [temp_pruned_file, temp_minified_file]:
            if os.path.exists(f):
                os.remove(f)

    with open(temp_pruned_file, 'w', encoding='utf-8') as f:
        f.write(pruned_source)

    # 3. Minify using pyminify
    print(f"[*] Running pyminify to minimize identifiers and strip whitespace/comments/hints...")
    try:
        subprocess.run([
            "uvx", "--from", "python-minifier", "pyminify", temp_pruned_file,
            "--output", temp_minified_file,
            "--remove-literal-statements",
            "--remove-asserts",
            "--remove-debug",
            "--remove-class-attribute-annotations"
        ], check=True, capture_output=True, text=True)
    except FileNotFoundError:
        print(f"[!] 'uvx' not found on PATH. Install uv or run: pip install python-minifier")
        _cleanup_temps()
        return
    except subprocess.CalledProcessError as e:
        print(f"[!] PyMinify failed (exit {e.returncode}):")
        if e.stderr:
            print(e.stderr)
        _cleanup_temps()
        return

    try:
        with open(temp_minified_file, 'rb') as f:
            minified_bytes = f.read()
        print(f"[*] Minified size: {len(minified_bytes)} bytes")

        # 4. LZMA + Base85 Self-Extracting Compression
        print(f"[*] Compressing into LZMA Base85 executable wrap...")
        compressed = lzma.compress(minified_bytes, format=lzma.FORMAT_RAW, filters=[{"id": lzma.FILTER_LZMA2, "preset": 9 | lzma.PRESET_EXTREME}])
        b85_encoded = base64.b85encode(compressed).decode('ascii')

        # Chunk the b85 string to avoid overly long single lines; the
        # adjacent string literals are re-joined by the parser at load time.
        chunk_size = 100
        chunks = [b85_encoded[i:i+chunk_size] for i in range(0, len(b85_encoded), chunk_size)]
        formatted_b85 = '"\n"'.join(chunks)

        # BUGFIX: the decode filter must carry the same preset (hence the
        # same 64 MiB LZMA2 dictionary) used for compression. FORMAT_RAW has
        # no header to record it, so omitting the preset makes liblzma fall
        # back to its smaller default dictionary, which can fail to decode
        # payloads containing long-range matches.
        header = f"""import lzma as L,base64 as B\nexec(L.decompress(B.b85decode(("{formatted_b85}")),format=L.FORMAT_RAW,filters=[{{"id":L.FILTER_LZMA2,"preset":9}}]))\n"""

        with open(output_file, 'w', encoding='ascii') as f:
            f.write(header)

        final_size = os.path.getsize(output_file)
        print(f"[*] Packed output size: {final_size} bytes")
        print(f"[*] Total Reduction: {((len(source) - final_size) / len(source) * 100):.1f}%")
        print(f"[*] Success! Optimized submission saved to: {output_file}")
    finally:
        # 5. Clean up temporary files (always, even on error)
        _cleanup_temps()

if __name__ == '__main__':
    argc = len(sys.argv)
    if argc == 3:
        # Direct mode: shrink.py <input> <output>
        shrink_pipeline(sys.argv[1], sys.argv[2])
    elif argc != 1:
        # Any other argument count is a usage error.
        print(f"Usage: python shrink.py <input_file> <output_file>")
        print(f"       python shrink.py (legacy: rename train_gpt.py -> train_gpt_human.py, then shrink)")
        sys.exit(1)
    else:
        # Legacy rename-and-shrink mode (first-time setup only): move the
        # submission aside as the human-readable copy, then regenerate it.
        human_file = "train_gpt_human.py"
        output_file = "train_gpt.py"
        if not os.path.exists(output_file):
            print(f"[!] Error: '{output_file}' not found. Use: python shrink.py <input> <output>")
            sys.exit(1)
        if os.path.exists(human_file):
            print(f"[!] Error: '{human_file}' already exists. Use: python shrink.py {human_file} {output_file}")
            sys.exit(1)
        os.rename(output_file, human_file)
        print(f"[*] Renamed '{output_file}' to '{human_file}'")
        shrink_pipeline(human_file, output_file)
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
# Turbo-Muon + EngramLite + Parameter Banking + GPTQ Mixed-Precision

**val_bpb: 1.1091** (3-seed mean, std 0.0005) | **~15.3 MB** | 8xH100 SXM

## Results (8xH100 80GB SXM)

| Seed | step_avg | steps | val_bpb (SW) | val_bpb (full) | Artifact bytes |
|------|----------|-------|-------------|----------------|----------------|
| 42 | 93.26ms | 6284 | 1.1086 | 1.1324 | 15,992,528 |
| 1337 | 93.11ms | 6295 | 1.1090 | 1.1328 | 15,993,413 |
| 2025 | 93.11ms | 6294 | 1.1096 | 1.1335 | 15,993,904 |
| **Mean** | **93.16ms** | **6291** | **1.1091** | **1.1329** | |

## Summary

An 11-layer GPT language model combining seven key innovations over the PR #609 baseline, targeting the 16MB artifact budget at MLP 3.5x width. Development-run benchmark: **1.1119 val_bpb (sliding window)** on 1xH100.

## Key Innovations

### Turbo-Muon Optimizer

A variant of the Muon optimizer with three enhancements that reduce Newton-Schulz iterations from 5 to 4:

- **AOL Preconditioning** -- Gershgorin-based diagonal scaling contracts the singular value range before Newton-Schulz iteration, allowing the first NS step to be skipped.
- **Polar Express Coefficients** -- Optimal degree-5 polynomial coefficients from Amsel et al. (arXiv:2505.16932), applied per-iteration rather than fixed.
- **Post-NS row_col Normalization** -- After orthogonalization, rows then columns are normalized. This consistently outperforms row-only or no normalization.

### EngramLite Hash Embeddings

Multi-head prime-based hash embedding that captures bigram and trigram statistics without explicit tokenizer changes:
- 2 heads x 2 orders (bigram + trigram) with 8192 hash buckets
- Projects to model_dim through a learned sigmoid gate
- Adds character-level context at minimal parameter cost

### Parameter Banking

All per-layer linear weights stored in contiguous 3D tensors (`qo_bank`, `kv_bank`, `mlp_up_bank`, `mlp_down_bank`). This enables batched Newton-Schulz orthogonalization via `torch.bmm`, dramatically reducing Muon optimizer overhead compared to per-layer iteration.

### U-Net Skip Connections

Encoder/decoder structure with learned sigmoid-gated skip connections. Gates start at `sigmoid(0) = 0.5` and learn per-dimension blending, preventing gradient shortcutting at initialization.

### ValueEmbedding

Reinjects token identity into attention values at deep layers (9, 10). Projects vocabulary embeddings to kv_dim with per-layer learned scaling, helping the model maintain token-level information through deep attention stacks.

### SmearGate

`F.pad`-based causal shift blending each token with its predecessor, providing free unigram context at zero attention cost.

### XSA (Cross-Sequence Attention) — All Layers

Efficient XSA applied to all 11 layers (`XSA_LAST_N=11`). Subtracts the self-value projection from attention output via GQA-aware reshape (no `repeat_interleave`), encouraging the model to attend to context rather than the current token's own representation.

### ASQU v3 Per-Layer Activation Slopes

Fixed per-layer LeakyReLU negative slopes discovered through 3 rounds of Adaptive Slope parameter tuning:
`[-0.014, 0.131, 0.225, 0.265, 0.310, 0.354, 0.421, 0.429, 0.417, 0.358, 0.468]`

Layer 0 uses near-ReLU² (slope ≈ 0) for aggressive feature selection, while deeper layers use progressively larger negative slopes (up to 0.468) to allow smoother gradient flow. Hard-coded converged endpoints eliminate learnable slope parameters.

### Mimetic V-O Initialization

Output projections initialized as `O_h = -alpha * V_h` per head (alpha=0.05), creating a small residual-like identity at init for improved early training stability.

### Additional Architecture Details

- **Partial RoPE** — Rotary position embeddings applied to only 16 of 64 head dimensions (`ROPE_DIMS=16`). Remaining dimensions are position-free, giving the model both positional and position-invariant feature channels.
- **LN Scale** — Layer norm outputs scaled by `1/sqrt(layer_idx + 1)`, stabilizing deeper layers by reducing activation magnitudes proportional to depth.
- **Logit Softcap** — `softcap * tanh(logits / softcap)` with softcap=30.0 prevents extreme logit values during training.
- **GQA** — Grouped Query Attention with 8 query heads and 4 KV heads (2:1 grouping), reducing KV cache and parameter count.
- **Tied Embeddings** — Input and output embeddings share weights, saving parameters.
- **QK Gain** — Per-head learnable query scaling initialized to 1.5, allowing the model to tune attention sharpness per head.

### GPTQ Mixed-Precision Quantization

Compression pipeline with Hessian collection performed within the 600s training budget (`gptq_reserve_ms=9000` deducted from training wallclock before training begins):

1. **Hessian collection** — 64 calibration batches run through a non-banked model copy to collect per-layer `H = X^T X` approximations, all-reduced across ranks. This runs within the reserved 9s carved from the training budget.
2. **Dynamic mixed-precision bit allocation** — Base quantization is **int5** for all weight groups. Hessian trace sensitivity ranks tensor groups (by layer × attn/mlp), then a greedy allocator selectively **promotes the most sensitive groups to int6 or int7** until the estimated compressed artifact size approaches the 16MB target minus 2% pruning headroom.
3. **GPTQ quantization** — Hessian-aware Cholesky error compensation for 2D weight matrices. Columns permuted by descending Hessian diagonal for optimal error propagation. Falls back to percentile search on Cholesky failure.
4. **Late QAT (soft-round)** — Quantization-aware training activated when LR scale drops below 15%, with soft-round sigmoid alpha ramping 1→16 over the QAT phase. Provides real gradient signal through quantization grid points.
5. **Selective pruning** — Post-GPTQ, values with `|q| ≤ 2` ranked by reconstruction error impact. Binary search with fast (zlib-1) / real (brotli-11) calibration finds the minimal prune count to fit 16MB.
6. **Brotli + byte-shuffle compression** — Byte-shuffle preprocessing reorders tensor bytes by significance position before brotli compression (quality=11) for optimal entropy coding.

### Code Shrinking

The submission `train_gpt.py` is a compressed self-extracting wrapper generated by
`Shrink/shrink.py`. The pipeline: AST dead-code removal → pyminify (strip comments,
whitespace, type hints, rename identifiers) → LZMA + base85 self-extracting `exec()`.

- **Human-readable source**: `train_gpt_human.py` (123 KB)
- **Shrunk submission**: `train_gpt.py` (24 KB)
- **Code budget freed**: ~99 KB → more artifact space for model weights → less pruning

### Test-Time Training (TTT)

Optional legal score-first TTT on the validation set, activated via `TTT_ENABLED=1`. Every token is scored BEFORE any gradient update that could use it, ensuring no data leakage:

1. **Phase 1 (Score)** — Sliding window evaluation under `torch.no_grad()`, scoring all windows whose first scored token falls within the current chunk. Results are accumulated into global byte/token counters.
2. **Phase 2 (Train)** — SGD (or AdamW) training on the chunk just scored, with configurable epochs, gradient clipping, and learning rate.
3. **Last chunk** — Scored but never trained on (hard guarantee: no weight mutation after final scoring).

Key features:
- **Polyak EMA** — Maintains exponential moving average of weights (decay=0.998). Training weights used during gradient updates, Polyak weights swapped in for scoring. Provides stability without sacrificing adaptation speed.
- **Entropy-adaptive epochs** — Per-chunk epoch count adjusted by chunk entropy: high-entropy (hard) chunks get +1 epoch, low-entropy (easy) chunks get -1.
- **Budget guard** — Monitors elapsed wall-clock time and stops TTT if the 600s eval budget would be exceeded.
- **Multi-GPU support** — All-reduces loss/token/byte counts across ranks.
- **torch.compile compatible** — Optionally compiles the model for TTT forward passes (`TTT_COMPILE=1`).

## Architecture

| Component | Setting |
|-----------|---------|
| Layers | 11 (512d, 8H, 4KV GQA) |
| MLP | 3.5x with LeakyReLU(ASQU v3 per-layer)^2 |
| XSA | All 11 layers |
| EngramLite | 2 heads x 2 orders, 8192 buckets |
| Skip connections | U-Net sigmoid-gated |
| RoPE | Partial (16 of 64 dims) |
| LN Scale | 1/sqrt(layer+1) |
| Logit Softcap | 30.0 |
| ValueEmbedding | Layers 9-10 |
| SmearGate | Causal shift blending |
| Embeddings | Tied input/output |
| Vocab | 1024 BPE, seq 2048 |

### Optimizer

| Param group | LR | Notes |
|---|---|---|
| Bank weights (Muon) | 0.025 | momentum=0.99, WD=0.04 |
| Embeddings (Adam) | 0.6 | betas=(0.7, 0.95), WD=0.04 |
| Head/tied embed (Adam) | 0.035 | betas=(0.7, 0.95) |
| Scalars (Adam) | 0.025 | betas=(0.9, 0.95) |

### Weight Averaging

- **SWA** -- Float32 accumulation every 50 steps after 20% of training
- **EMA** -- Decay=0.997, lerp_ single-kernel updates

## Dependencies

Requires `brotli>=1.1` (included in `requirements.txt`). The code falls back gracefully (lzma → zlib) if brotli is missing, but brotli is needed to maximize model capacity within the 16MB budget.

## Run Command

```bash
# 8xH100 (competition)
SEED=42 torchrun --standalone --nproc_per_node=8 train_gpt.py

# Single GPU (development)
torchrun --standalone --nproc_per_node=1 train_gpt.py
```

All hyperparameters are configured via environment variables. Defaults match the competition-optimized configuration. Key overrides:

```bash
# Override examples
NUM_LAYERS=11 MODEL_DIM=512 MLP_MULT=3.5 \
MUON_MOMENTUM=0.99 MUON_WD=0.04 MUON_POST_NORM=row_col \
SWA_ENABLED=1 EMA_ENABLED=1 MIXED_PRECISION=1 LATE_QAT=1 \
torchrun --standalone --nproc_per_node=8 train_gpt.py

# With TTT enabled (eval phase applies test-time training)
TTT_ENABLED=1 TTT_LR=0.005 TTT_EPOCHS=5 TTT_OPTIMIZER=sgd \
torchrun --standalone --nproc_per_node=8 train_gpt.py
```

### Snapshot Workflow

Train once, iterate on compression without retraining:

```bash
# Step 1: Train and save snapshot
SNAPSHOT_POST_HESSIAN=1 torchrun --standalone --nproc_per_node=8 train_gpt.py

# Step 2: Load snapshot, re-run compression + eval only
LOAD_SNAPSHOT=snapshot_post_hessian.pt torchrun --standalone --nproc_per_node=1 train_gpt.py
```

## Credits

- **Base recipe**: [PR #609](https://github.com/openai/parameter-golf/pull/609) (1.1154 bpb baseline)
- **Muon optimizer**: Inspired by [PR #399](https://github.com/openai/parameter-golf/pull/399) parameter banking approach
- **LeakyReLU^2**: [PR #493](https://github.com/openai/parameter-golf/pull/493), [PR #518](https://github.com/openai/parameter-golf/pull/518)
- **XSA**: [PR #265](https://github.com/openai/parameter-golf/pull/265), [PR #287](https://github.com/openai/parameter-golf/pull/287)
- **SmearGate + BigramHash**: [PR #198](https://github.com/openai/parameter-golf/pull/198) and related submissions
- **Polar Express coefficients**: Amsel et al. (arXiv:2505.16932)
- **GPTQ approach**: [PR #634](https://github.com/openai/parameter-golf/pull/634) Hessian-aware quantization

Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Extra dependencies beyond the standard Parameter Golf competition environment.
# The competition VM already provides: torch, numpy, sentencepiece, zlib (stdlib).
#
# Minimum versions:
# Python >= 3.12
# torch >= 2.11 (torch.compile fullgraph improvements, CUDA 13.0 support)
#
# brotli: ~1-5% better compression than lzma/zstd on int6-quantized weights.
# The code falls back gracefully (lzma → zlib) if missing, but brotli is
# needed to hit the 16MB artifact cap with maximum model capacity.
torch>=2.11
brotli>=1.1
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"name": "Turbo-Muon + EngramLite + Parameter Banking + GPTQ Mixed-Precision",
"val_bpb": "1.1091",
"bytes_total": "15993904",
"blurb": "11L/512d GPT with Turbo-Muon (AOL+Polar Express+row_col norm), EngramLite bigram+trigram hash embeddings, U-Net skip connections, ValueEmbedding, SmearGate, SWA+EMA, GPTQ mixed-precision int6/int7 with Hessian sensitivity, brotli+byte-shuffle compression. Built on PR #609.",
"author": "mikeapedia",
"github_id": "mikeapedia",
"date": "2026-03-28"
}
Loading