Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
torch>=2.4.0
numpy
zstandard
huggingface_hub
datasets
sentencepiece
tqdm
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#!/bin/bash
# GlassBridge / JoeProAI — Parameter Golf submission runner
# Reproduces val_bpb=1.13256182 on 8xH100
# NOTE(review): the accompanying submission JSON reports val_bpb=1.13557402 —
# confirm which figure is current before publishing.
#
# Usage:
#   bash run_training.sh
#
# Requirements:
#   - 8x NVIDIA H100 (80GB) GPUs
#   - Python 3.10+, CUDA 12.4+
#   - pip install -r requirements.txt
#   - Data: fineweb10B_sp1024 dataset at $DATA_PATH
#   - Tokenizer: fineweb_1024_bpe.model at $TOKENIZER_PATH

# Abort on any error, on use of an unset variable, and on failure of any
# stage of a pipeline — plain `set -e` misses the latter two.
set -euo pipefail

# ── Paths (edit these, or override via environment) ─────────────────────────
DATA_PATH="${DATA_PATH:-./data/datasets/fineweb10B_sp1024}"
TOKENIZER_PATH="${TOKENIZER_PATH:-./data/tokenizers/fineweb_1024_bpe.model}"

# Fail fast with a clear message rather than letting the distributed job
# crash mid-startup with an opaque Python traceback.
if [[ ! -e "$DATA_PATH" ]]; then
  echo "ERROR: dataset not found at: $DATA_PATH" >&2
  echo "Set DATA_PATH or place the fineweb10B_sp1024 dataset there." >&2
  exit 1
fi
if [[ ! -f "$TOKENIZER_PATH" ]]; then
  echo "ERROR: tokenizer model not found at: $TOKENIZER_PATH" >&2
  echo "Set TOKENIZER_PATH or place fineweb_1024_bpe.model there." >&2
  exit 1
fi

# ── Training hyperparameters ─────────────────────────────────────────────────
export MATRIX_LR="0.025"
export SCALAR_LR="0.025"
export MUON_WD="0.0"
export ADAM_WD="0.0"
export GRAD_CLIP_NORM="0.0"
export MUON_MOMENTUM="0.95"
export WARMDOWN_ITERS="6000"

# ── TTT (Test-Time Training) config ──────────────────────────────────────────
export TTT_ENABLED="1"
export TTT_USE_ADAMW="1"
export TTT_ADAMW_LR="0.0004"
export TTT_ADAMW_WD="0.0"
export TTT_MLP_ONLY="1"
export TTT_EPOCHS="1"
export TTT_FREEZE_BLOCKS="0"

# ── Architecture ─────────────────────────────────────────────────────────────
export MLP_HIDDEN="1536"
export BIGRAM_BUCKETS="4096"
export PRUNE_PCT="0.15"

# ── Reproducibility ───────────────────────────────────────────────────────────
export SEED="314"

echo "Starting training run..."
echo "DATA_PATH: $DATA_PATH"
echo "TOKENIZER_PATH: $TOKENIZER_PATH"

# Export so train_gpt.py (and all 8 torchrun worker processes) inherit them.
export DATA_PATH TOKENIZER_PATH
torchrun --nproc_per_node=8 train_gpt.py

echo "Training complete. Artifact: final_model.int5.ptz"
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
{
"name": "JoeProAI",
"github_id": "JoeProAI",
"val_bpb": 1.13557402,
"val_loss": 1.91736672,
"compressed_size_bytes": 16361752,
"training_time_seconds": 1999,
"techniques": [
"int5_quantization_per_row",
"zstd_22_compression",
"bigram_hash_embedding",
"swiglu_mlp",
"xsa_attention_all_layers",
"u_net_skip_connections",
"muon_optimizer",
"score_first_legal_ttt",
"adamw_ttt_mlp_only",
"weight_pruning_015",
"fp16_embedding_passthrough",
"warmdown_6000"
],
"architecture": {
"num_layers": 11,
"model_dim": 512,
"num_heads": 8,
"mlp_hidden": 1536,
"bigram_buckets": 4096,
"bigram_embed_dim": 128,
"vocab_size": 256,
"tie_embeddings": false
},
"hyperparameters": {
"matrix_lr": 0.025,
"muon_momentum": 0.95,
"warmdown_iters": 6000,
"prune_pct": 0.15,
"ttt_adamw_lr": 0.0004,
"ttt_epochs": 1,
"ttt_mlp_only": true,
"seed": 314
},
"notes": "11-layer U-Net GPT with SwiGLU MLP, XSA on all layers, int5 QAT with per-row scale, score-first legal TTT (AdamW, MLP-only). Trained 600s on 8xH100. Seed 314.",
"date": "2026-03-28"
}
Loading