Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
torch>=2.4.0
numpy
zstandard
huggingface_hub
datasets
sentencepiece
tqdm
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#!/bin/bash
# GlassBridge / JoeProAI — Parameter Golf submission runner
# Reproduces val_bpb=1.13256182 on 8xH100
# NOTE(review): the accompanying submission JSON reports val_bpb=1.13557402 —
# confirm which figure is current before publishing.
#
# Usage:
#   bash run_training.sh
#
# Requirements:
#   - 8x NVIDIA H100 (80GB) GPUs
#   - Python 3.10+, CUDA 12.4+
#   - pip install -r requirements.txt
#   - Data: fineweb10B_sp1024 dataset at $DATA_PATH
#   - Tokenizer: fineweb_1024_bpe.model at $TOKENIZER_PATH

# Abort on any error, on use of an unset variable, and on failure of any
# stage of a pipeline — plain `set -e` misses the latter two.
set -euo pipefail

# ── Paths (edit these, or override via environment) ─────────────────────────
DATA_PATH="${DATA_PATH:-./data/datasets/fineweb10B_sp1024}"
TOKENIZER_PATH="${TOKENIZER_PATH:-./data/tokenizers/fineweb_1024_bpe.model}"

# Fail fast with a clear message rather than letting the distributed job
# crash mid-startup with an opaque Python traceback.
if [[ ! -e "$DATA_PATH" ]]; then
  echo "ERROR: dataset not found at: $DATA_PATH" >&2
  echo "Set DATA_PATH or place the fineweb10B_sp1024 dataset there." >&2
  exit 1
fi
if [[ ! -f "$TOKENIZER_PATH" ]]; then
  echo "ERROR: tokenizer model not found at: $TOKENIZER_PATH" >&2
  echo "Set TOKENIZER_PATH or place fineweb_1024_bpe.model there." >&2
  exit 1
fi

# ── Training hyperparameters ─────────────────────────────────────────────────
export MATRIX_LR="0.025"
export SCALAR_LR="0.025"
export MUON_WD="0.0"
export ADAM_WD="0.0"
export GRAD_CLIP_NORM="0.0"
export MUON_MOMENTUM="0.95"
export WARMDOWN_ITERS="6000"

# ── TTT (Test-Time Training) config ──────────────────────────────────────────
export TTT_ENABLED="1"
export TTT_USE_ADAMW="1"
export TTT_ADAMW_LR="0.0004"
export TTT_ADAMW_WD="0.0"
export TTT_MLP_ONLY="1"
export TTT_EPOCHS="1"
export TTT_FREEZE_BLOCKS="0"

# ── Architecture ─────────────────────────────────────────────────────────────
export MLP_HIDDEN="1536"
export BIGRAM_BUCKETS="4096"
export PRUNE_PCT="0.15"

# ── Reproducibility ───────────────────────────────────────────────────────────
export SEED="314"

echo "Starting training run..."
echo "DATA_PATH: $DATA_PATH"
echo "TOKENIZER_PATH: $TOKENIZER_PATH"

# Export so train_gpt.py (and all 8 torchrun worker processes) inherit them.
export DATA_PATH TOKENIZER_PATH
torchrun --nproc_per_node=8 train_gpt.py

echo "Training complete. Artifact: final_model.int5.ptz"
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
{
"name": "JoeProAI",
"github_id": "JoeProAI",
"val_bpb": 1.13557402,
"val_loss": 1.91736672,
"compressed_size_bytes": 16361752,
"training_time_seconds": 1999,
"techniques": [
"int5_quantization_per_row",
"zstd_22_compression",
"bigram_hash_embedding",
"swiglu_mlp",
"xsa_attention_all_layers",
"u_net_skip_connections",
"muon_optimizer",
"score_first_legal_ttt",
"adamw_ttt_mlp_only",
"weight_pruning_015",
"fp16_embedding_passthrough",
"warmdown_6000"
],
"architecture": {
"num_layers": 11,
"model_dim": 512,
"num_heads": 8,
"mlp_hidden": 1536,
"bigram_buckets": 4096,
"bigram_embed_dim": 128,
"vocab_size": 256,
"tie_embeddings": false
},
"hyperparameters": {
"matrix_lr": 0.025,
"muon_momentum": 0.95,
"warmdown_iters": 6000,
"prune_pct": 0.15,
"ttt_adamw_lr": 0.0004,
"ttt_epochs": 1,
"ttt_mlp_only": true,
"seed": 314
},
"notes": "11-layer U-Net GPT with SwiGLU MLP, XSA on all layers, int5 QAT with per-row scale, score-first legal TTT (AdamW, MLP-only). Trained 600s on 8xH100. Seed 314.",
"date": "2026-03-28"
}
Loading