Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
133 changes: 133 additions & 0 deletions Shrink/shrink.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import ast
import subprocess
import lzma
import base64
import os
import sys

class DeadCodeRemover(ast.NodeTransformer):
    """AST pass that deletes `if` statements whose test is a known-dead flag.

    Only affirmative tests are listed below — negated guards such as
    "not args.load_snapshot" wrap the training loop (always True with the
    default config) and must never be touched.
    """

    # Conditions that are always False under the default competition config,
    # so their bodies never execute.
    _ALWAYS_FALSE = frozenset({
        "args.load_snapshot",          # snapshot restore (default="" -> falsy)
        "args.snapshot_post_hessian",  # snapshot save (default=False)
    })

    def visit_If(self, node):
        test_src = ast.unparse(node.test)
        if test_src in self._ALWAYS_FALSE and not node.orelse:
            # Dead branch with no else/elif attached: safe to drop entirely.
            return None
        # Either a live condition, or a dead one whose else/elif branch must
        # be preserved — keep the statement and recurse into its children.
        self.generic_visit(node)
        return node

def shrink_pipeline(input_file, output_file):
    """Shrink a Python source file into a self-extracting compressed script.

    Stages:
      1. AST dead-code elimination via DeadCodeRemover.
      2. Identifier/whitespace/comment minification with pyminify (run
         through `uvx` so python-minifier need not be pre-installed).
      3. Raw-LZMA2 compression, base85-encoded into a tiny exec() stub.

    Args:
        input_file: Path of the Python source to shrink.
        output_file: Path where the packed ASCII script is written.

    Returns None. On any failure a diagnostic is printed and the function
    returns early; temporary files are always cleaned up.
    """
    print(f"[*] Starting shrinking pipeline for: {input_file}")

    if not os.path.exists(input_file):
        print(f"[!] Error: Input file '{input_file}' not found.")
        return

    # 1. Read Original Source
    with open(input_file, 'r', encoding='utf-8') as f:
        source = f.read()
    print(f"[*] Original size: {len(source)} bytes")

    # BUGFIX: an empty input would divide by zero in the reduction report.
    if not source:
        print(f"[!] Error: Input file '{input_file}' is empty.")
        return

    # 2. Prune AST (Dead Code Elimination)
    print(f"[*] Pruning dead code and evaluation logic from AST...")
    tree = ast.parse(source)
    transformer = DeadCodeRemover()
    tree = transformer.visit(tree)
    ast.fix_missing_locations(tree)

    pruned_source = ast.unparse(tree)
    temp_pruned_file = input_file + ".pruned.tmp.py"
    temp_minified_file = input_file + ".min.tmp.py"

    def _cleanup_temps():
        # Best-effort removal of both temp files; safe if they never existed.
        for f in [temp_pruned_file, temp_minified_file]:
            if os.path.exists(f):
                os.remove(f)

    with open(temp_pruned_file, 'w', encoding='utf-8') as f:
        f.write(pruned_source)

    # 3. Minify using pyminify
    print(f"[*] Running pyminify to minimize identifiers and strip whitespace/comments/hints...")
    try:
        subprocess.run([
            "uvx", "--from", "python-minifier", "pyminify", temp_pruned_file,
            "--output", temp_minified_file,
            "--remove-literal-statements",
            "--remove-asserts",
            "--remove-debug",
            "--remove-class-attribute-annotations"
        ], check=True, capture_output=True, text=True)
    except FileNotFoundError:
        print(f"[!] 'uvx' not found on PATH. Install uv or run: pip install python-minifier")
        _cleanup_temps()
        return
    except subprocess.CalledProcessError as e:
        print(f"[!] PyMinify failed (exit {e.returncode}):")
        if e.stderr:
            print(e.stderr)
        _cleanup_temps()
        return

    try:
        with open(temp_minified_file, 'rb') as f:
            minified_bytes = f.read()
        print(f"[*] Minified size: {len(minified_bytes)} bytes")

        # 4. LZMA + Base85 Self-Extracting Compression
        print(f"[*] Compressing into LZMA Base85 executable wrap...")
        compressed = lzma.compress(minified_bytes, format=lzma.FORMAT_RAW, filters=[{"id": lzma.FILTER_LZMA2, "preset": 9 | lzma.PRESET_EXTREME}])
        b85_encoded = base64.b85encode(compressed).decode('ascii')

        # Chunk the b85 string to avoid overly long single lines; the
        # adjacent string literals are re-joined by the parser at load time.
        chunk_size = 100
        chunks = [b85_encoded[i:i+chunk_size] for i in range(0, len(b85_encoded), chunk_size)]
        formatted_b85 = '"\n"'.join(chunks)

        # BUGFIX: the decode filter must carry the same preset (hence the
        # same 64 MiB LZMA2 dictionary) used for compression. FORMAT_RAW has
        # no header to record it, so omitting the preset makes liblzma fall
        # back to its smaller default dictionary, which can fail to decode
        # payloads containing long-range matches.
        header = f"""import lzma as L,base64 as B\nexec(L.decompress(B.b85decode(("{formatted_b85}")),format=L.FORMAT_RAW,filters=[{{"id":L.FILTER_LZMA2,"preset":9}}]))\n"""

        with open(output_file, 'w', encoding='ascii') as f:
            f.write(header)

        final_size = os.path.getsize(output_file)
        print(f"[*] Packed output size: {final_size} bytes")
        print(f"[*] Total Reduction: {((len(source) - final_size) / len(source) * 100):.1f}%")
        print(f"[*] Success! Optimized submission saved to: {output_file}")
    finally:
        # 5. Clean up temporary files (always, even on error)
        _cleanup_temps()

if __name__ == '__main__':
    argc = len(sys.argv)
    if argc == 3:
        # Direct mode: shrink.py <input> <output>
        shrink_pipeline(sys.argv[1], sys.argv[2])
    elif argc != 1:
        # Any other argument count is a usage error.
        print(f"Usage: python shrink.py <input_file> <output_file>")
        print(f"       python shrink.py (legacy: rename train_gpt.py -> train_gpt_human.py, then shrink)")
        sys.exit(1)
    else:
        # Legacy rename-and-shrink mode (first-time setup only): move the
        # submission aside as the human-readable copy, then regenerate it.
        human_file = "train_gpt_human.py"
        output_file = "train_gpt.py"
        if not os.path.exists(output_file):
            print(f"[!] Error: '{output_file}' not found. Use: python shrink.py <input> <output>")
            sys.exit(1)
        if os.path.exists(human_file):
            print(f"[!] Error: '{human_file}' already exists. Use: python shrink.py {human_file} {output_file}")
            sys.exit(1)
        os.rename(output_file, human_file)
        print(f"[*] Renamed '{output_file}' to '{human_file}'")
        shrink_pipeline(human_file, output_file)
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
# Turbo-Muon + EngramLite + Parameter Banking + GPTQ Mixed-Precision

**val_bpb: 1.1091** (3-seed mean, std 0.0005) | **~15.3 MB** | 8xH100 SXM

## Results (8xH100 80GB SXM)

| Seed | step_avg | steps | val_bpb (SW) | val_bpb (full) | Artifact bytes |
|------|----------|-------|-------------|----------------|----------------|
| 42 | 93.26ms | 6284 | 1.1086 | 1.1324 | 15,992,528 |
| 1337 | 93.11ms | 6295 | 1.1090 | 1.1328 | 15,993,413 |
| 2025 | 93.11ms | 6294 | 1.1096 | 1.1335 | 15,993,904 |
| **Mean** | **93.16ms** | **6291** | **1.1091** | **1.1329** | |

## Summary

An 11-layer GPT language model combining seven key innovations over the PR #609 baseline, targeting the 16MB artifact budget at MLP 3.5x width. Development-run benchmark: **1.1119 val_bpb (sliding window)** on 1xH100.

## Key Innovations

### Turbo-Muon Optimizer

A variant of the Muon optimizer with three enhancements that reduce Newton-Schulz iterations from 5 to 4:

- **AOL Preconditioning** -- Gershgorin-based diagonal scaling contracts the singular value range before Newton-Schulz iteration, allowing the first NS step to be skipped.
- **Polar Express Coefficients** -- Optimal degree-5 polynomial coefficients from Amsel et al. (arXiv:2505.16932), applied per-iteration rather than fixed.
- **Post-NS row_col Normalization** -- After orthogonalization, rows then columns are normalized. This consistently outperforms row-only or no normalization.

### EngramLite Hash Embeddings

Multi-head prime-based hash embedding that captures bigram and trigram statistics without explicit tokenizer changes:
- 2 heads x 2 orders (bigram + trigram) with 8192 hash buckets
- Projects to model_dim through a learned sigmoid gate
- Adds character-level context at minimal parameter cost

### Parameter Banking

All per-layer linear weights stored in contiguous 3D tensors (`qo_bank`, `kv_bank`, `mlp_up_bank`, `mlp_down_bank`). This enables batched Newton-Schulz orthogonalization via `torch.bmm`, dramatically reducing Muon optimizer overhead compared to per-layer iteration.

### U-Net Skip Connections

Encoder/decoder structure with learned sigmoid-gated skip connections. Gates start at `sigmoid(0) = 0.5` and learn per-dimension blending, preventing gradient shortcutting at initialization.

### ValueEmbedding

Reinjects token identity into attention values at deep layers (9, 10). Projects vocabulary embeddings to kv_dim with per-layer learned scaling, helping the model maintain token-level information through deep attention stacks.

### SmearGate

`F.pad`-based causal shift blending each token with its predecessor, providing free unigram context at zero attention cost.

### XSA (Cross-Sequence Attention) — All Layers

Efficient XSA applied to all 11 layers (`XSA_LAST_N=11`). Subtracts the self-value projection from attention output via GQA-aware reshape (no `repeat_interleave`), encouraging the model to attend to context rather than the current token's own representation.

### ASQU v3 Per-Layer Activation Slopes

Fixed per-layer LeakyReLU negative slopes discovered through 3 rounds of Adaptive Slope parameter tuning:
`[-0.014, 0.131, 0.225, 0.265, 0.310, 0.354, 0.421, 0.429, 0.417, 0.358, 0.468]`

Layer 0 uses near-ReLU² (slope ≈ 0) for aggressive feature selection, while deeper layers use progressively larger negative slopes (up to 0.468) to allow smoother gradient flow. Hard-coded converged endpoints eliminate learnable slope parameters.

### Mimetic V-O Initialization

Output projections initialized as `O_h = -alpha * V_h` per head (alpha=0.05), creating a small residual-like identity at init for improved early training stability.

### Additional Architecture Details

- **Partial RoPE** — Rotary position embeddings applied to only 16 of 64 head dimensions (`ROPE_DIMS=16`). Remaining dimensions are position-free, giving the model both positional and position-invariant feature channels.
- **LN Scale** — Layer norm outputs scaled by `1/sqrt(layer_idx + 1)`, stabilizing deeper layers by reducing activation magnitudes proportional to depth.
- **Logit Softcap** — `softcap * tanh(logits / softcap)` with softcap=30.0 prevents extreme logit values during training.
- **GQA** — Grouped Query Attention with 8 query heads and 4 KV heads (2:1 grouping), reducing KV cache and parameter count.
- **Tied Embeddings** — Input and output embeddings share weights, saving parameters.
- **QK Gain** — Per-head learnable query scaling initialized to 1.5, allowing the model to tune attention sharpness per head.

### GPTQ Mixed-Precision Quantization

Compression pipeline with Hessian collection performed within the 600s training budget (`gptq_reserve_ms=9000` deducted from training wallclock before training begins):

1. **Hessian collection** — 64 calibration batches run through a non-banked model copy to collect per-layer `H = X^T X` approximations, all-reduced across ranks. This runs within the reserved 9s carved from the training budget.
2. **Dynamic mixed-precision bit allocation** — Base quantization is **int5** for all weight groups. Hessian trace sensitivity ranks tensor groups (by layer × attn/mlp), then a greedy allocator selectively **promotes the most sensitive groups to int6 or int7** until the estimated compressed artifact size approaches the 16MB target minus 2% pruning headroom.
3. **GPTQ quantization** — Hessian-aware Cholesky error compensation for 2D weight matrices. Columns permuted by descending Hessian diagonal for optimal error propagation. Falls back to percentile search on Cholesky failure.
4. **Late QAT (soft-round)** — Quantization-aware training activated when LR scale drops below 15%, with soft-round sigmoid alpha ramping 1→16 over the QAT phase. Provides real gradient signal through quantization grid points.
5. **Selective pruning** — Post-GPTQ, values with `|q| ≤ 2` ranked by reconstruction error impact. Binary search with fast (zlib-1) / real (brotli-11) calibration finds the minimal prune count to fit 16MB.
6. **Brotli + byte-shuffle compression** — Byte-shuffle preprocessing reorders tensor bytes by significance position before brotli compression (quality=11) for optimal entropy coding.

### Code Shrinking

The submission `train_gpt.py` is a compressed self-extracting wrapper generated by
`Shrink/shrink.py`. The pipeline: AST dead-code removal → pyminify (strip comments,
whitespace, type hints, rename identifiers) → LZMA + base85 self-extracting `exec()`.

- **Human-readable source**: `train_gpt_human.py` (123 KB)
- **Shrunk submission**: `train_gpt.py` (24 KB)
- **Code budget freed**: ~99 KB → more artifact space for model weights → less pruning

### Test-Time Training (TTT)

Optional legal score-first TTT on the validation set, activated via `TTT_ENABLED=1`. Every token is scored BEFORE any gradient update that could use it, ensuring no data leakage:

1. **Phase 1 (Score)** — Sliding window evaluation under `torch.no_grad()`, scoring all windows whose first scored token falls within the current chunk. Results are accumulated into global byte/token counters.
2. **Phase 2 (Train)** — SGD (or AdamW) training on the chunk just scored, with configurable epochs, gradient clipping, and learning rate.
3. **Last chunk** — Scored but never trained on (hard guarantee: no weight mutation after final scoring).

Key features:
- **Polyak EMA** — Maintains exponential moving average of weights (decay=0.998). Training weights used during gradient updates, Polyak weights swapped in for scoring. Provides stability without sacrificing adaptation speed.
- **Entropy-adaptive epochs** — Per-chunk epoch count adjusted by chunk entropy: high-entropy (hard) chunks get +1 epoch, low-entropy (easy) chunks get -1.
- **Budget guard** — Monitors elapsed wall-clock time and stops TTT if the 600s eval budget would be exceeded.
- **Multi-GPU support** — All-reduces loss/token/byte counts across ranks.
- **torch.compile compatible** — Optionally compiles the model for TTT forward passes (`TTT_COMPILE=1`).

## Architecture

| Component | Setting |
|-----------|---------|
| Layers | 11 (512d, 8H, 4KV GQA) |
| MLP | 3.5x with LeakyReLU(ASQU v3 per-layer)^2 |
| XSA | All 11 layers |
| EngramLite | 2 heads x 2 orders, 8192 buckets |
| Skip connections | U-Net sigmoid-gated |
| RoPE | Partial (16 of 64 dims) |
| LN Scale | 1/sqrt(layer+1) |
| Logit Softcap | 30.0 |
| ValueEmbedding | Layers 9-10 |
| SmearGate | Causal shift blending |
| Embeddings | Tied input/output |
| Vocab | 1024 BPE, seq 2048 |

### Optimizer

| Param group | LR | Notes |
|---|---|---|
| Bank weights (Muon) | 0.025 | momentum=0.99, WD=0.04 |
| Embeddings (Adam) | 0.6 | betas=(0.7, 0.95), WD=0.04 |
| Head/tied embed (Adam) | 0.035 | betas=(0.7, 0.95) |
| Scalars (Adam) | 0.025 | betas=(0.9, 0.95) |

### Weight Averaging

- **SWA** -- Float32 accumulation every 50 steps after 20% of training
- **EMA** -- Decay=0.997, lerp_ single-kernel updates

## Dependencies

Requires `brotli>=1.1` (included in `requirements.txt`). The code falls back gracefully (lzma → zlib) if brotli is missing, but brotli is needed to maximize model capacity within the 16MB budget.

## Run Command

```bash
# 8xH100 (competition)
SEED=42 torchrun --standalone --nproc_per_node=8 train_gpt.py

# Single GPU (development)
torchrun --standalone --nproc_per_node=1 train_gpt.py
```

All hyperparameters are configured via environment variables. Defaults match the competition-optimized configuration. Key overrides:

```bash
# Override examples
NUM_LAYERS=11 MODEL_DIM=512 MLP_MULT=3.5 \
MUON_MOMENTUM=0.99 MUON_WD=0.04 MUON_POST_NORM=row_col \
SWA_ENABLED=1 EMA_ENABLED=1 MIXED_PRECISION=1 LATE_QAT=1 \
torchrun --standalone --nproc_per_node=8 train_gpt.py

# With TTT enabled (eval phase applies test-time training)
TTT_ENABLED=1 TTT_LR=0.005 TTT_EPOCHS=5 TTT_OPTIMIZER=sgd \
torchrun --standalone --nproc_per_node=8 train_gpt.py
```

### Snapshot Workflow

Train once, iterate on compression without retraining:

```bash
# Step 1: Train and save snapshot
SNAPSHOT_POST_HESSIAN=1 torchrun --standalone --nproc_per_node=8 train_gpt.py

# Step 2: Load snapshot, re-run compression + eval only
LOAD_SNAPSHOT=snapshot_post_hessian.pt torchrun --standalone --nproc_per_node=1 train_gpt.py
```

## Credits

- **Base recipe**: [PR #609](https://github.com/openai/parameter-golf/pull/609) (1.1154 bpb baseline)
- **Muon optimizer**: Inspired by [PR #399](https://github.com/openai/parameter-golf/pull/399) parameter banking approach
- **LeakyReLU^2**: [PR #493](https://github.com/openai/parameter-golf/pull/493), [PR #518](https://github.com/openai/parameter-golf/pull/518)
- **XSA**: [PR #265](https://github.com/openai/parameter-golf/pull/265), [PR #287](https://github.com/openai/parameter-golf/pull/287)
- **SmearGate + BigramHash**: [PR #198](https://github.com/openai/parameter-golf/pull/198) and related submissions
- **Polar Express coefficients**: Amsel et al. (arXiv:2505.16932)
- **GPTQ approach**: [PR #634](https://github.com/openai/parameter-golf/pull/634) Hessian-aware quantization

Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Extra dependencies beyond the standard Parameter Golf competition environment.
# The competition VM already provides: torch, numpy, sentencepiece, zlib (stdlib).
#
# Minimum versions:
# Python >= 3.12
# torch >= 2.11 (torch.compile fullgraph improvements, CUDA 13.0 support)
#
# brotli: ~1-5% better compression than lzma/zstd on int6-quantized weights.
# The code falls back gracefully (lzma → zlib) if missing, but brotli is
# needed to hit the 16MB artifact cap with maximum model capacity.
torch>=2.11
brotli>=1.1
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"name": "Turbo-Muon + EngramLite + Parameter Banking + GPTQ Mixed-Precision",
"val_bpb": "1.1091",
"bytes_total": "15993904",
"blurb": "11L/512d GPT with Turbo-Muon (AOL+Polar Express+row_col norm), EngramLite bigram+trigram hash embeddings, U-Net skip connections, ValueEmbedding, SmearGate, SWA+EMA, GPTQ mixed-precision int6/int7 with Hessian sensitivity, brotli+byte-shuffle compression. Built on PR #609.",
"author": "mikeapedia",
"github_id": "mikeapedia",
"date": "2026-03-28"
}
Loading