From da436e03f2fe637a8a0d07f4ec4464907d4e901c Mon Sep 17 00:00:00 2001
From: serkankarakulak
Date: Mon, 23 Mar 2026 01:45:54 -0400
Subject: [PATCH] =?UTF-8?q?Record:=201.1558=20BPB=20=E2=80=94=2011L=20U-Ne?=
 =?UTF-8?q?t=20+=20Catalytic=20+=20SwiGLU=20+=20SW64?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

11 layers with gated U-Net skip connections, catalytic residuals, SwiGLU MLP, value
residual, sliding window eval (stride=64). Int5/Int6 mixed quantization + zstd-22.
15.1MB artifact.

Co-Authored-By: Claude Opus 4.6
---
 .../README.md          |   49 +
 .../submission.json    |   13 +
 .../train_gpt.py       | 2142 +++++++++++++++++
 .../train_seed1337.log |  130 +
 4 files changed, 2334 insertions(+)
 create mode 100644 records/track_10min_16mb/2026-03-23_11L_UNet_Catalytic_SwiGLU_SW64/README.md
 create mode 100644 records/track_10min_16mb/2026-03-23_11L_UNet_Catalytic_SwiGLU_SW64/submission.json
 create mode 100644 records/track_10min_16mb/2026-03-23_11L_UNet_Catalytic_SwiGLU_SW64/train_gpt.py
 create mode 100644 records/track_10min_16mb/2026-03-23_11L_UNet_Catalytic_SwiGLU_SW64/train_seed1337.log

diff --git a/records/track_10min_16mb/2026-03-23_11L_UNet_Catalytic_SwiGLU_SW64/README.md b/records/track_10min_16mb/2026-03-23_11L_UNet_Catalytic_SwiGLU_SW64/README.md
new file mode 100644
index 000000000..d8501ca06
--- /dev/null
+++ b/records/track_10min_16mb/2026-03-23_11L_UNet_Catalytic_SwiGLU_SW64/README.md
@@ -0,0 +1,49 @@
+# 11L U-Net + Catalytic Residuals + SwiGLU + Sliding Window
+
+**val_bpb: 1.1558** (1 seed)
+
+## Key Techniques
+
+1. **11 transformer layers** with U-Net gated skip connections: Sigmoid-gated blending between encoder and decoder layers (`gate * x + (1-gate) * (weight * skip)`, sketched below). 5 encoder + 1 mid + 5 decoder layers.
+
+2. **Catalytic residuals** (PR #450): Learned per-dim gates on attention and MLP outputs, initialized to 1.0. Zero compute overhead, ~0.024 BPB improvement.
+
+3. **SwiGLU MLP**: Gated linear unit with SiLU activation, 3× expansion factor.
+
+4. **Value residual** (ResFormer): Blend first-layer V into all subsequent layers for better gradient flow.
+
+5. **Sliding window evaluation** (stride=64, seq_len=1024): Every token is scored with at least 960 tokens of context.
+
+6. **LN scale dampening**: `1/sqrt(layer_idx+1)` on RMSNorm inputs for deeper layers.
+
+7. **Decoder LR multiplier** (2×): Decoder layers get a higher learning rate for both Muon and Adam.
+
+8. **Int5/Int6 mixed quantization + zstd-22**: MLP and bigram weights at 5-bit, the rest at 6-bit.
+
+9. **BigramHash** (4096 buckets, 128-dim): Bigram-conditioned token embeddings via hash-based lookup.
+
+10. **EMA** (decay=0.9985): Exponential moving average of weights.
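+
+For reference, the decoder-side skip blend from item 1 looks like this (a minimal sketch; `skip_gates`/`skip_weights` follow the names in `train_gpt.py` below):
+
+```python
+gate = torch.sigmoid(skip_gates[i])                    # per-dim gate in (0, 1)
+x = gate * x + (1 - gate) * (skip_weights[i] * skip)   # skip = matching encoder output
+```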
+
+## Architecture
+
+- 11 blocks: dim=512, 8 attn heads, 4 KV heads (GQA), 3× MLP
+- Vocab: 1024 (SentencePiece BPE), tied embeddings
+- Seq len: 2048 (train), 1024 (eval sliding window)
+- Partial RoPE (25% of head dims)
+- XSA on last 4 layers, gated attention
+
+## Results
+
+| Seed | val_loss | val_bpb | Steps | ms/step |
+|------|----------|---------|-------|---------|
+| 1337 | 1.9516 | 1.1558 | 6898 | 87.0 |
+
+Pre-quant EMA: val_bpb=1.1606 | Post-quant: val_bpb=1.1723 | Sliding window: val_bpb=1.1558
+
+Artifact: 15.1 MB (15,192,709 bytes) | Eval time: ~172s (sliding window)
+
+## Training
+
+- Muon optimizer (momentum=0.99, WD=0.04) + Adam (scalar params)
+- 524,288 tokens/step, 6,898 steps in 600s on 8×H100 SXM
+- Warmdown: 3000 iters, grad clip: 0.3
diff --git a/records/track_10min_16mb/2026-03-23_11L_UNet_Catalytic_SwiGLU_SW64/submission.json b/records/track_10min_16mb/2026-03-23_11L_UNet_Catalytic_SwiGLU_SW64/submission.json
new file mode 100644
index 000000000..4c1aa7e31
--- /dev/null
+++ b/records/track_10min_16mb/2026-03-23_11L_UNet_Catalytic_SwiGLU_SW64/submission.json
@@ -0,0 +1,13 @@
+{
+  "track": "10min_16mb",
+  "date": "2026-03-23",
+  "name": "11L U-Net + Catalytic Residuals + SwiGLU + Sliding Window",
+  "author": "serkankarakulak",
+  "seed_results": {
+    "1337": {"val_loss": 1.95156679, "val_bpb": 1.15582778, "steps": 6898, "ms_per_step": 86.99}
+  },
+  "mean_val_loss": 1.95156679,
+  "mean_val_bpb": 1.15582778,
+  "artifact_bytes": 15096450,
+  "code_bytes": 96259
+}
diff --git a/records/track_10min_16mb/2026-03-23_11L_UNet_Catalytic_SwiGLU_SW64/train_gpt.py b/records/track_10min_16mb/2026-03-23_11L_UNet_Catalytic_SwiGLU_SW64/train_gpt.py
new file mode 100644
index 000000000..921e5d268
--- /dev/null
+++ b/records/track_10min_16mb/2026-03-23_11L_UNet_Catalytic_SwiGLU_SW64/train_gpt.py
@@ -0,0 +1,2142 @@
+"""
+The `train_gpt.py` and `train_gpt_mlx.py` scripts are intended as good launching-off points for new participants, not SOTA configs. We'll accept PRs that tune, improve, or simplify these scripts without significantly increasing complexity, but competitive submissions should stay in the `/records` folder.
+
+Hard stop: To keep things readable for newcomers, let's make sure `train_gpt.py` and `train_gpt_mlx.py` are never longer than 1500 lines.
+"""
+
+from __future__ import annotations
+
+import copy
+import glob
+import io
+import math
+import os
+import random
+import subprocess
+import sys
+import time
+import uuid
+import zlib
+import zstandard as zstd
+from pathlib import Path
+
+import numpy as np
+import sentencepiece as spm
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+from torch import Tensor, nn
+from torch.nn.parallel import DistributedDataParallel as DDP
+
+# -----------------------------
+# HYPERPARAMETERS
+# -----------------------------
+# Default run for this record (each value can be overridden via its environment variable):
+# - 11 transformer blocks at width 512
+# - 8 attention heads with 4 KV heads (GQA) and 3x MLP expansion
+# - vocab size 1024, train sequence length 2048, tied embeddings
+# - 524,288 train tokens per step for up to 20,000 iterations with a ~10 minute cap
+
+class Hyperparameters:
+    # Data paths are shard globs produced by the existing preprocessing pipeline.
+ data_path = os.environ.get("DATA_PATH", "./data/datasets/fineweb10B_sp1024") + train_files = os.path.join(data_path, "fineweb_train_*.bin") + val_files = os.path.join(data_path, "fineweb_val_*.bin") + tokenizer_path = os.environ.get("TOKENIZER_PATH", "./data/tokenizers/fineweb_1024_bpe.model") + run_id = os.environ.get("RUN_ID", str(uuid.uuid4())) + seed = int(os.environ.get("SEED", 1337)) + + # Validation cadence and batch size. Validation always uses the full fineweb_val split. + val_batch_size = int(os.environ.get("VAL_BATCH_SIZE", 524_288)) + val_loss_every = int(os.environ.get("VAL_LOSS_EVERY", 1000)) + train_log_every = int(os.environ.get("TRAIN_LOG_EVERY", 200)) + + # Training length. + iterations = int(os.environ.get("ITERATIONS", 20000)) + warmdown_iters = int(os.environ.get("WARMDOWN_ITERS", 3000)) + warmup_steps = int(os.environ.get("WARMUP_STEPS", 20)) + train_batch_tokens = int(os.environ.get("TRAIN_BATCH_TOKENS", 524_288)) + train_seq_len = int(os.environ.get("TRAIN_SEQ_LEN", 2048)) + max_wallclock_seconds = float(os.environ.get("MAX_WALLCLOCK_SECONDS", 600.0)) + qk_gain_init = float(os.environ.get("QK_GAIN_INIT", 1.5)) + + # Model shape. + vocab_size = int(os.environ.get("VOCAB_SIZE", 1024)) + num_layers = int(os.environ.get("NUM_LAYERS", 11)) + num_kv_heads = int(os.environ.get("NUM_KV_HEADS", 4)) + model_dim = int(os.environ.get("MODEL_DIM", 512)) + num_heads = int(os.environ.get("NUM_HEADS", 8)) + mlp_mult = int(os.environ.get("MLP_MULT", 3)) + tie_embeddings = bool(int(os.environ.get("TIE_EMBEDDINGS", "1"))) + rope_base = float(os.environ.get("ROPE_BASE", 10000.0)) + logit_softcap = float(os.environ.get("LOGIT_SOFTCAP", 30.0)) + + # Optimizer hyperparameters. + embed_lr = float(os.environ.get("EMBED_LR", 0.6)) + head_lr = float(os.environ.get("HEAD_LR", 0.008)) + tied_embed_lr = float(os.environ.get("TIED_EMBED_LR", 0.035)) + tied_embed_init_std = float(os.environ.get("TIED_EMBED_INIT_STD", 0.005)) + matrix_lr = float(os.environ.get("MATRIX_LR", 0.025)) + scalar_lr = float(os.environ.get("SCALAR_LR", 0.025)) + muon_momentum = float(os.environ.get("MUON_MOMENTUM", 0.99)) + muon_backend_steps = int(os.environ.get("MUON_BACKEND_STEPS", 5)) + muon_momentum_warmup_start = float(os.environ.get("MUON_MOMENTUM_WARMUP_START", 0.92)) + muon_momentum_warmup_steps = int(os.environ.get("MUON_MOMENTUM_WARMUP_STEPS", 1500)) + muon_weight_decay = float(os.environ.get("MUON_WEIGHT_DECAY", 0.04)) + beta1 = float(os.environ.get("BETA1", 0.9)) + beta2 = float(os.environ.get("BETA2", 0.95)) + adam_eps = float(os.environ.get("ADAM_EPS", 1e-8)) + grad_clip_norm = float(os.environ.get("GRAD_CLIP_NORM", 0.3)) + + # SmearGate + BigramHash + smeargate_dim = int(os.environ.get("SMEARGATE_DIM", 12)) + bigram_hash_buckets = int(os.environ.get("BIGRAM_HASH_BUCKETS", 4096)) + bigram_embed_dim = int(os.environ.get("BIGRAM_EMBED_DIM", 128)) + + # Sliding window evaluation + eval_stride = int(os.environ.get("EVAL_STRIDE", 64)) + eval_seq_len = int(os.environ.get("EVAL_SEQ_LEN", 1024)) + eval_batch_size = int(os.environ.get("EVAL_BATCH_SIZE", 16)) + + # TTT LoRA evaluation (disabled by default — set TTT_ENABLED=1 to enable) + ttt_enabled = bool(int(os.environ.get("TTT_ENABLED", "1"))) + + # Weight averaging + swa_enabled = bool(int(os.environ.get("SWA_ENABLED", "0"))) + swa_start_frac = float(os.environ.get("SWA_START_FRAC", 0.75)) + swa_interval = int(os.environ.get("SWA_INTERVAL", 200)) + ema_enabled = bool(int(os.environ.get("EMA_ENABLED", "1"))) + ema_decay = float(os.environ.get("EMA_DECAY", 
0.9985)) + + # XSA (Exclusive Self Attention): remove self-value bias from last N layers + xsa_last_n = int(os.environ.get("XSA_LAST_N", 4)) + + # Late QAT: STE fake-quantize weights during last N% of training (DISABLED: diverges with SwiGLU) + qat_enabled = bool(int(os.environ.get("QAT_ENABLED", "0"))) + qat_start_frac = float(os.environ.get("QAT_START_FRAC", 0.9)) # last 10% of training + + # Full AdamW TTT: fine-tune entire model on val tokens at eval time (PR #442: -0.019 BPB) + ttt_sgd_enabled = bool(int(os.environ.get("TTT_SGD_ENABLED", "1"))) + ttt_sgd_epochs = int(os.environ.get("TTT_SGD_EPOCHS", 10)) + ttt_sgd_batch_seqs = int(os.environ.get("TTT_SGD_BATCH_SEQS", 32)) + ttt_sgd_lr = float(os.environ.get("TTT_SGD_LR", 0.0005)) + ttt_sgd_freeze_layers = int(os.environ.get("TTT_SGD_FREEZE_LAYERS", 0)) # 0 = unfreeze all + + # Partial RoPE: fraction of head dims to apply rotary embeddings to (rest are position-free) + rope_frac = float(os.environ.get("ROPE_FRAC", 0.25)) # 25% = 16/64 dims + + # Backout Connection: subtract lambda * h_mid from final hidden state + backout_enabled = bool(int(os.environ.get("BACKOUT_ENABLED", "1"))) + + # SwiGLU MLP: gated linear unit with SiLU activation (PR #462: enables smoother TTT landscape) + use_swiglu = bool(int(os.environ.get("USE_SWIGLU", "1"))) + + # Value Residual (ResFormer): blend V from first layer into all subsequent layers (PR #487: -0.015 BPB) + value_residual = bool(int(os.environ.get("VALUE_RESIDUAL", "1"))) + + # Gated Attention: per-head sigmoid gate on attention output (PR #487: -0.003 BPB) + gated_attention = bool(int(os.environ.get("GATED_ATTENTION", "1"))) + + # LN Scale: damp RMSNorm inputs by 1/sqrt(layer_idx+1) for deeper layers (PR #315: -0.002 BPB) + ln_scale = bool(int(os.environ.get("LN_SCALE", "1"))) + + # Catalytic Residuals: learned per-dim gates on attn/MLP outputs (PR #450: -0.024 BPB, zero compute) + catalytic = bool(int(os.environ.get("CATALYTIC", "1"))) + + # Decoder LR multiplier: decoder layers get higher LR (PR #462: 2.0x) + decoder_lr_mult = float(os.environ.get("DECODER_LR_MULT", 2.0)) + + # Cosine TTT decay: cosine annealing for TTT learning rate (PR #481: -0.006 BPB) + ttt_cosine_decay = bool(int(os.environ.get("TTT_COSINE_DECAY", "1"))) + + # Int6 quantization + quant_bits = int(os.environ.get("QUANT_BITS", 6)) + + # Paired Head Attention: layers where adjacent heads are paired (zero extra params) + paired_head_layers = os.environ.get("PAIRED_HEAD_LAYERS", "") + + # Test-time training (LoRA) hyperparameters. + ttt_lora_rank = int(os.environ.get("TTT_LORA_RANK", 8)) + ttt_lora_lr = float(os.environ.get("TTT_LORA_LR", 0.003)) + ttt_chunk_size = int(os.environ.get("TTT_CHUNK_SIZE", 256)) + ttt_eval_seq_len = int(os.environ.get("TTT_EVAL_SEQ_LEN", 2048)) + ttt_batch_size = int(os.environ.get("TTT_BATCH_SIZE", 32)) + +# ----------------------------- +# MUON OPTIMIZER +# ----------------------------- +# +# As borrowed from modded-nanogpt +# Background on Muon: https://kellerjordan.github.io/posts/muon/ + +def zeropower_via_newtonschulz5(G: Tensor, steps: int = 10, eps: float = 1e-7) -> Tensor: + # Orthogonalize a 2D update matrix with a fast Newton-Schulz iteration. + # Muon uses this to normalize matrix-shaped gradients before applying them. 
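+    # Each iteration applies the quintic map X <- a*X + b*(X @ X.T) @ X + c*(X @ X.T)^2 @ X;
+    # the coefficients below are tuned so that repeated application drives the singular values
+    # of X toward ~1, i.e. the result approximates the orthogonal factor U V^T of G's SVD.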
+ a, b, c = (3.4445, -4.7750, 2.0315) + X = G.bfloat16() + X /= X.norm() + eps + transposed = G.size(0) > G.size(1) + if transposed: + X = X.T + for _ in range(steps): + A = X @ X.T + B = b * A + c * A @ A + X = a * X + B @ X + return X.T if transposed else X + + +class Muon(torch.optim.Optimizer): + def __init__(self, params, lr: float, momentum: float, backend_steps: int, nesterov: bool = True): + super().__init__( + params, + dict(lr=lr, momentum=momentum, backend_steps=backend_steps, nesterov=nesterov), + ) + + @torch.no_grad() + def step(self, closure=None): + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + distributed = dist.is_available() and dist.is_initialized() + world_size = dist.get_world_size() if distributed else 1 + rank = dist.get_rank() if distributed else 0 + + for group in self.param_groups: + params = group["params"] + if not params: + continue + lr = group["lr"] + momentum = group["momentum"] + backend_steps = group["backend_steps"] + nesterov = group["nesterov"] + + total_params = sum(int(p.numel()) for p in params) + updates_flat = torch.zeros(total_params, device=params[0].device, dtype=torch.bfloat16) + + curr = 0 + for i, p in enumerate(params): + if i % world_size == rank and p.grad is not None: + g = p.grad + state = self.state[p] + if "momentum_buffer" not in state: + state["momentum_buffer"] = torch.zeros_like(g) + buf = state["momentum_buffer"] + buf.mul_(momentum).add_(g) + if nesterov: + g = g.add(buf, alpha=momentum) + g = zeropower_via_newtonschulz5(g, steps=backend_steps) + # Scale correction from Muon reference implementations. + g *= max(1, g.size(0) / g.size(1)) ** 0.5 + updates_flat[curr : curr + p.numel()] = g.reshape(-1) + curr += p.numel() + + if distributed: + dist.all_reduce(updates_flat, op=dist.ReduceOp.SUM) + + curr = 0 + for p in params: + g = updates_flat[curr : curr + p.numel()].view_as(p).to(dtype=p.dtype) + p.add_(g, alpha=-lr) + curr += p.numel() + + return loss + + +# ----------------------------- +# TOKENIZER-AGNOSTIC EVALUATION SETUP +# ----------------------------- +# +# It's common for small models have a large fraction of their parameters be embeddings, since the 2 * d_model * d_vocab vectors can be gigantic. +# Instead of locking the tokenizer, we let you bring your own and calculate our validation metrics on the average compression of the validation set. +# We calculate BPB (bits-per-byte) instead of validation loss, so we need methods to count the number of bits per token in the tokenizer. +# Note: Submissions that edit the tokenizer will be examined more carefully, since screwing this up might unjustly improve your score. 
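+# Concretely, eval_val below converts the mean natural-log token loss into BPB as
+#     bits_per_token = val_loss / ln(2)
+#     val_bpb        = bits_per_token * (total_tokens / total_bytes)
+# so a custom tokenizer only lowers BPB to the extent that it actually compresses the raw bytes.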
+ +def build_sentencepiece_luts( + sp: spm.SentencePieceProcessor, vocab_size: int, device: torch.device +) -> tuple[Tensor, Tensor, Tensor]: + sp_vocab_size = int(sp.vocab_size()) + table_size = max(sp_vocab_size, vocab_size) + base_bytes_np = np.zeros((table_size,), dtype=np.int16) + has_leading_space_np = np.zeros((table_size,), dtype=np.bool_) + is_boundary_token_np = np.ones((table_size,), dtype=np.bool_) + for token_id in range(sp_vocab_size): + if sp.is_control(token_id) or sp.is_unknown(token_id) or sp.is_unused(token_id): + continue + is_boundary_token_np[token_id] = False + if sp.is_byte(token_id): + base_bytes_np[token_id] = 1 + continue + piece = sp.id_to_piece(token_id) + if piece.startswith("▁"): + has_leading_space_np[token_id] = True + piece = piece[1:] + base_bytes_np[token_id] = len(piece.encode("utf-8")) + return ( + torch.tensor(base_bytes_np, dtype=torch.int16, device=device), + torch.tensor(has_leading_space_np, dtype=torch.bool, device=device), + torch.tensor(is_boundary_token_np, dtype=torch.bool, device=device), + ) + + +def load_validation_tokens(pattern: str, seq_len: int) -> Tensor: + files = [Path(p) for p in sorted(glob.glob(pattern))] + if not files: + raise FileNotFoundError(f"No files found for pattern: {pattern}") + # The export pipeline writes the fixed first-50k-doc validation set to fineweb_val_*. + tokens = torch.cat([load_data_shard(file) for file in files]).contiguous() + usable = ((tokens.numel() - 1) // seq_len) * seq_len + if usable <= 0: + raise ValueError(f"Validation split is too short for TRAIN_SEQ_LEN={seq_len}") + return tokens[: usable + 1] + + +def eval_val( + args: Hyperparameters, + model: nn.Module, + rank: int, + world_size: int, + device: torch.device, + grad_accum_steps: int, + val_tokens: Tensor, + base_bytes_lut: Tensor, + has_leading_space_lut: Tensor, + is_boundary_token_lut: Tensor, +) -> tuple[float, float]: + # Validation computes two metrics: + # - val_loss: token cross-entropy (natural log) + # - val_bpb: tokenizer-agnostic compression metric used by the challenge + local_batch_tokens = args.val_batch_size // (world_size * grad_accum_steps) + if local_batch_tokens < args.train_seq_len: + raise ValueError( + "VAL_BATCH_SIZE must provide at least one sequence per rank; " + f"got VAL_BATCH_SIZE={args.val_batch_size}, WORLD_SIZE={world_size}, " + f"GRAD_ACCUM_STEPS={grad_accum_steps}, TRAIN_SEQ_LEN={args.train_seq_len}" + ) + local_batch_seqs = local_batch_tokens // args.train_seq_len + total_seqs = (val_tokens.numel() - 1) // args.train_seq_len + seq_start = (total_seqs * rank) // world_size + seq_end = (total_seqs * (rank + 1)) // world_size + val_loss_sum = torch.zeros((), device=device, dtype=torch.float64) + val_token_count = torch.zeros((), device=device, dtype=torch.float64) + val_byte_count = torch.zeros((), device=device, dtype=torch.float64) + + model.eval() + with torch.inference_mode(): + for batch_seq_start in range(seq_start, seq_end, local_batch_seqs): + batch_seq_end = min(batch_seq_start + local_batch_seqs, seq_end) + raw_start = batch_seq_start * args.train_seq_len + raw_end = batch_seq_end * args.train_seq_len + 1 + local = val_tokens[raw_start:raw_end].to(device=device, dtype=torch.int64, non_blocking=True) + x = local[:-1].reshape(-1, args.train_seq_len) + y = local[1:].reshape(-1, args.train_seq_len) + bigram_ids = None + if args.bigram_hash_buckets > 0: + bigram_ids = compute_bigram_hash(x, args.bigram_hash_buckets).to(device, non_blocking=True) + with torch.autocast(device_type="cuda", 
dtype=torch.bfloat16, enabled=True): + batch_loss = model(x, y, bigram_ids=bigram_ids).detach() + batch_token_count = float(y.numel()) + val_loss_sum += batch_loss.to(torch.float64) * batch_token_count + val_token_count += batch_token_count + prev_ids = x.reshape(-1) + tgt_ids = y.reshape(-1) + token_bytes = base_bytes_lut[tgt_ids].to(dtype=torch.int16) + token_bytes += (has_leading_space_lut[tgt_ids] & ~is_boundary_token_lut[prev_ids]).to(dtype=torch.int16) + val_byte_count += token_bytes.to(torch.float64).sum() + + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(val_loss_sum, op=dist.ReduceOp.SUM) + dist.all_reduce(val_token_count, op=dist.ReduceOp.SUM) + dist.all_reduce(val_byte_count, op=dist.ReduceOp.SUM) + + val_loss = val_loss_sum / val_token_count + bits_per_token = val_loss.item() / math.log(2.0) + tokens_per_byte = val_token_count.item() / val_byte_count.item() + model.train() + return float(val_loss.item()), float(bits_per_token * tokens_per_byte) + + +def eval_val_sliding_window( + args: Hyperparameters, + model: nn.Module, + rank: int, + world_size: int, + device: torch.device, + val_tokens: Tensor, + base_bytes_lut: Tensor, + has_leading_space_lut: Tensor, + is_boundary_token_lut: Tensor, +) -> tuple[float, float]: + """Batched sliding window eval: each token scored with near-full context.""" + stride = args.eval_stride + seq_len = args.eval_seq_len + batch_size = args.eval_batch_size + total_tokens = val_tokens.numel() - 1 + + # Each rank takes a slice + rank_start = (total_tokens * rank) // world_size + rank_end = (total_tokens * (rank + 1)) // world_size + + val_loss_sum = torch.zeros((), device=device, dtype=torch.float64) + val_token_count = torch.zeros((), device=device, dtype=torch.float64) + val_byte_count = torch.zeros((), device=device, dtype=torch.float64) + + # Pre-compute all window positions + windows: list[tuple[int, int, int, int]] = [] # (win_start, actual_len, score_start, score_end) + pos = rank_start + while pos < rank_end: + win_start = max(0, pos - (seq_len - stride)) + win_end = min(win_start + seq_len, total_tokens) + actual_len = win_end - win_start + score_start = max(0, pos - win_start) + score_end = min(actual_len, score_start + stride) + if score_end > score_start: + windows.append((win_start, actual_len, score_start, score_end)) + pos += stride + + num_batches = (len(windows) + batch_size - 1) // batch_size + if rank == 0: + print(f"sliding_window_eval: {len(windows)} windows, batch_size={batch_size}, {num_batches} batches") + + fwd_model = model.module if hasattr(model, 'module') else model + model.eval() + with torch.inference_mode(): + for bi in range(0, len(windows), batch_size): + batch_wins = windows[bi:bi + batch_size] + bsz = len(batch_wins) + + # Pad all windows to seq_len and stack into a batch + x_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + y_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + for j, (ws, al, _, _) in enumerate(batch_wins): + tokens = val_tokens[ws: ws + al + 1].to(device=device, dtype=torch.int64) + x_batch[j, :al] = tokens[:-1] + y_batch[j, :al] = tokens[1:] + + # Compute bigram hash for the batch + bigram_ids = None + if args.bigram_hash_buckets > 0: + bigram_ids = compute_bigram_hash(x_batch, args.bigram_hash_buckets) + + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + logits = fwd_model.forward_logits(x_batch, bigram_ids) + + # Score each window's designated region + for j, (_, _, ss, se) in enumerate(batch_wins): + 
logits_scored = logits[j, ss:se] + targets_scored = y_batch[j, ss:se] + loss_per_token = F.cross_entropy( + logits_scored.float(), targets_scored, reduction='none' + ) + val_loss_sum += loss_per_token.to(torch.float64).sum() + val_token_count += float(se - ss) + + prev_ids = x_batch[j, ss:se] + tgt_ids = targets_scored + token_bytes = base_bytes_lut[tgt_ids].to(torch.float64) + token_bytes += (has_leading_space_lut[tgt_ids] & ~is_boundary_token_lut[prev_ids]).to(torch.float64) + val_byte_count += token_bytes.sum() + + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(val_loss_sum, op=dist.ReduceOp.SUM) + dist.all_reduce(val_token_count, op=dist.ReduceOp.SUM) + dist.all_reduce(val_byte_count, op=dist.ReduceOp.SUM) + + val_loss = float(val_loss_sum.item() / val_token_count.item()) + val_bpb = float((val_loss_sum.item() / math.log(2.0)) / val_byte_count.item()) + model.train() + return val_loss, val_bpb + + +# ----------------------------- +# POST-TRAINING QUANTIZATION +# ----------------------------- +# +# It's silly to export our model, which is trained in bf16 and fp32, at that same precision. +# Instead, we get approximately the same model (with a small hit) by quantizing the model to int8 & zlib compressing. +# We can then decompress the model and run in higher precision for evaluation, after closing in under the size limit. + +CONTROL_TENSOR_NAME_PATTERNS = tuple( + pattern + for pattern in os.environ.get( + "CONTROL_TENSOR_NAME_PATTERNS", + "attn_scale,attn_scales,mlp_scale,mlp_scales,resid_mix,resid_mixes,q_gain,skip_weight,skip_weights,skip_gate,skip_gates,smear_gate,smear_lambda,bigram_lambda,attn_gate,mlp_gate,cat_attn,cat_mlp", + ).split(",") + if pattern +) +INT8_KEEP_FLOAT_FP32_NAME_PATTERNS = tuple( + pattern + for pattern in os.environ.get( + "INT8_KEEP_FLOAT_FP32_NAME_PATTERNS", + ",".join(CONTROL_TENSOR_NAME_PATTERNS), + ).split(",") + if pattern +) +# FP16 passthrough for tied embedding (quantization errors compound through both embed + lm_head) +FP16_PASSTHROUGH_NAME_PATTERNS = ("tok_emb.weight",) +INT8_KEEP_FLOAT_MAX_NUMEL = 65_536 +INT8_KEEP_FLOAT_STORE_DTYPE = torch.float16 +INT8_PER_ROW_SCALE_DTYPE = torch.float16 +INT8_CLIP_PERCENTILE = 99.99984 +INT8_CLIP_Q = INT8_CLIP_PERCENTILE / 100.0 +# Int6: round int8 values to nearest INT6_STEP (4 = 64 levels = 6 bits effective) +INT6_STEP = 4 +# Int5: round int8 values to nearest INT5_STEP (8 = 32 levels = 5 bits effective) +# Low 3 bits are always zero → excellent zstd compression (~1.88×) +INT5_STEP = 8 +# MLP + bigram weights use int5 (more compressible, less sensitive to quantization) +INT5_NAME_PATTERNS = (".mlp.", "bigram_embed") + +def tensor_nbytes(t: Tensor) -> int: + return int(t.numel()) * int(t.element_size()) + +def keep_float_tensor(name: str, t: Tensor, passthrough_orig_dtypes: dict[str, str]) -> Tensor: + if any(pattern in name for pattern in INT8_KEEP_FLOAT_FP32_NAME_PATTERNS): + return t.float().contiguous() + if t.dtype in {torch.float32, torch.bfloat16}: + passthrough_orig_dtypes[name] = str(t.dtype).removeprefix("torch.") + return t.to(dtype=INT8_KEEP_FLOAT_STORE_DTYPE).contiguous() + return t + +def quantize_float_tensor(t: Tensor, quant_step: int = INT6_STEP) -> tuple[Tensor, Tensor]: + """Quantize a float tensor to int8 with optional sub-byte rounding. 
+ + quant_step controls effective precision: + - 1 = full int8 (256 levels) + - 4 = int6 (64 levels, 6-bit effective) + - 8 = int5 (32 levels, 5-bit effective, best zstd compression) + """ + t32 = t.float() + if t32.ndim == 2: + clip_abs = ( + torch.quantile(t32.abs(), INT8_CLIP_Q, dim=1) + if t32.numel() + else torch.empty((t32.shape[0],), dtype=torch.float32) + ) + clipped = torch.maximum(torch.minimum(t32, clip_abs[:, None]), -clip_abs[:, None]) + scale = (clip_abs / 127.0).clamp_min(1.0 / 127.0) + q = torch.clamp(torch.round(clipped / scale[:, None]), -127, 127).to(torch.int8) + if quant_step > 1: + q = (torch.round(q.float() / quant_step) * quant_step).clamp(-127, 127).to(torch.int8) + return q.contiguous(), scale.to(dtype=INT8_PER_ROW_SCALE_DTYPE).contiguous() + + clip_abs = float(torch.quantile(t32.abs().flatten(), INT8_CLIP_Q).item()) if t32.numel() else 0.0 + scale = torch.tensor(clip_abs / 127.0 if clip_abs > 0 else 1.0, dtype=torch.float32) + q = torch.clamp(torch.round(torch.clamp(t32, -clip_abs, clip_abs) / scale), -127, 127).to(torch.int8) + if quant_step > 1: + q = (torch.round(q.float() / quant_step) * quant_step).clamp(-127, 127).to(torch.int8) + return q.contiguous(), scale + +class _FakeQuantSTE(torch.autograd.Function): + """Straight-through estimator for fake quantization during QAT.""" + @staticmethod + def forward(ctx, w: Tensor, quant_step: int) -> Tensor: + # Quantize then dequantize — simulates post-training quantization + t32 = w.float() + if t32.ndim == 2: + clip_abs = torch.quantile(t32.abs(), INT8_CLIP_Q, dim=1) + scale = (clip_abs / 127.0).clamp_min(1.0 / 127.0) + clipped = t32.clamp(-clip_abs[:, None], clip_abs[:, None]) + q = torch.clamp(torch.round(clipped / scale[:, None]), -127, 127) + if quant_step > 1: + q = torch.round(q / quant_step) * quant_step + return (q * scale[:, None]).to(w.dtype) + clip_abs = float(torch.quantile(t32.abs().flatten(), INT8_CLIP_Q).item()) if t32.numel() else 0.0 + scale = max(clip_abs / 127.0, 1.0 / 127.0) + q = torch.clamp(torch.round(t32.clamp(-clip_abs, clip_abs) / scale), -127, 127) + if quant_step > 1: + q = torch.round(q / quant_step) * quant_step + return (q * scale).to(w.dtype) + + @staticmethod + def backward(ctx, grad_output: Tensor): + # Straight-through: pass gradient unchanged + return grad_output, None + + +def fake_quantize_model(model: "GPT") -> None: + """Apply in-place fake quantization to all large weight matrices (STE for QAT). + + Call this before each forward pass during QAT phase. Weights are quantized + then dequantized so the model experiences quantization noise during training, + while gradients flow through via straight-through estimator. 
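+
+    A sketch of the intended call site (variable names here are assumptions, not copied
+    from the training loop):
+
+        if args.qat_enabled and step >= int(args.qat_start_frac * args.iterations):
+            fake_quantize_model(raw_model)  # raw (non-DDP) GPT, before this step's forward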
+ """ + for name, param in model.named_parameters(): + if not param.is_floating_point() or param.numel() <= INT8_KEEP_FLOAT_MAX_NUMEL: + continue + if any(pat in name for pat in FP16_PASSTHROUGH_NAME_PATTERNS): + continue + quant_step = INT5_STEP if any(pat in name for pat in INT5_NAME_PATTERNS) else INT6_STEP + param.data = _FakeQuantSTE.apply(param.data, quant_step) + + +def quantize_state_dict_int8(state_dict: dict[str, Tensor], use_int6: bool = True): + quantized: dict[str, Tensor] = {} + scales: dict[str, Tensor] = {} + dtypes: dict[str, str] = {} + passthrough: dict[str, Tensor] = {} + passthrough_orig_dtypes: dict[str, str] = {} + qmeta: dict[str, dict[str, object]] = {} + stats = dict.fromkeys( + ("param_count", "num_tensors", "num_float_tensors", "num_nonfloat_tensors", + "baseline_tensor_bytes", "int8_payload_bytes", "int5_tensors", "int6_tensors"), + 0, + ) + + for name, tensor in state_dict.items(): + t = tensor.detach().to("cpu").contiguous() + stats["param_count"] += int(t.numel()) + stats["num_tensors"] += 1 + stats["baseline_tensor_bytes"] += tensor_nbytes(t) + + if not t.is_floating_point(): + stats["num_nonfloat_tensors"] += 1 + passthrough[name] = t + stats["int8_payload_bytes"] += tensor_nbytes(t) + continue + + # FP16 passthrough for tied embedding — errors compound through both embed + lm_head + if any(pattern in name for pattern in FP16_PASSTHROUGH_NAME_PATTERNS): + passthrough_orig_dtypes[name] = str(t.dtype).removeprefix("torch.") + kept = t.to(dtype=torch.float16).contiguous() + passthrough[name] = kept + stats["int8_payload_bytes"] += tensor_nbytes(kept) + continue + + # Small float tensors kept directly as fp16 or fp32 + if t.numel() <= INT8_KEEP_FLOAT_MAX_NUMEL: + kept = keep_float_tensor(name, t, passthrough_orig_dtypes) + passthrough[name] = kept + stats["int8_payload_bytes"] += tensor_nbytes(kept) + continue + + stats["num_float_tensors"] += 1 + # Mixed int5/int6: MLP weights get int5 (better compression), attention gets int6 + if use_int6 and any(pat in name for pat in INT5_NAME_PATTERNS): + quant_step = INT5_STEP + stats["int5_tensors"] += 1 + elif use_int6: + quant_step = INT6_STEP + stats["int6_tensors"] += 1 + else: + quant_step = 1 + q, s = quantize_float_tensor(t, quant_step=quant_step) + if s.ndim > 0: + qmeta[name] = {"scheme": "per_row", "axis": 0} + quantized[name] = q + scales[name] = s + dtypes[name] = str(t.dtype).removeprefix("torch.") + stats["int8_payload_bytes"] += tensor_nbytes(q) + tensor_nbytes(s) + + obj: dict[str, object] = { + "__quant_format__": "int8_clean_per_row_v1", + "quantized": quantized, + "scales": scales, + "dtypes": dtypes, + "passthrough": passthrough, + } + if qmeta: + obj["qmeta"] = qmeta + if passthrough_orig_dtypes: + obj["passthrough_orig_dtypes"] = passthrough_orig_dtypes + return obj, stats + +def dequantize_state_dict_int8(obj: dict[str, object]) -> dict[str, Tensor]: + out: dict[str, Tensor] = {} + qmeta = obj.get("qmeta", {}) + passthrough_orig_dtypes = obj.get("passthrough_orig_dtypes", {}) + for name, q in obj["quantized"].items(): + dtype = getattr(torch, obj["dtypes"][name]) + s = obj["scales"][name] + if qmeta.get(name, {}).get("scheme") == "per_row" or s.ndim > 0: + s = s.to(dtype=torch.float32) + # Broadcast the saved row scale back across trailing dimensions. 
+ out[name] = (q.float() * s.view(q.shape[0], *([1] * (q.ndim - 1)))).to(dtype=dtype).contiguous() + else: + scale = float(s.item()) + out[name] = (q.float() * scale).to(dtype=dtype).contiguous() + for name, t in obj["passthrough"].items(): + # Restore small tensors, undoing the temporary fp16 storage cast if needed. + out_t = t.detach().to("cpu").contiguous() + orig_dtype = passthrough_orig_dtypes.get(name) + if isinstance(orig_dtype, str): + out_t = out_t.to(dtype=getattr(torch, orig_dtype)).contiguous() + out[name] = out_t + return out + + +# ----------------------------- +# DATA LOADING +# ----------------------------- + +def load_data_shard(file: Path) -> Tensor: + header_bytes = 256 * np.dtype(" None: + self.file_idx = (self.file_idx + 1) % len(self.files) + self.tokens = load_data_shard(self.files[self.file_idx]) + self.pos = 0 + + def take(self, n: int) -> Tensor: + chunks: list[Tensor] = [] + remaining = n + while remaining > 0: + avail = self.tokens.numel() - self.pos + if avail <= 0: + self._advance_file() + continue + k = min(remaining, avail) + chunks.append(self.tokens[self.pos : self.pos + k]) + self.pos += k + remaining -= k + return chunks[0] if len(chunks) == 1 else torch.cat(chunks) + + +def compute_bigram_hash(tokens: Tensor, num_buckets: int) -> Tensor: + """Compute bigram hash IDs from token sequence. Position 0 maps to bucket 0.""" + x = tokens.to(torch.int32) + out = torch.zeros_like(x) + out[..., 0] = 0 # first position has no bigram + if x.shape[-1] > 1: + out[..., 1:] = torch.bitwise_xor(36313 * x[..., 1:], 27191 * x[..., :-1]) % (num_buckets - 1) + return out + + +class DistributedTokenLoader: + # Each call consumes a contiguous chunk from the shared token stream, then slices out + # one disjoint span per rank. The extra "+1" token lets us build (x, y) by shifting. + def __init__(self, pattern: str, rank: int, world_size: int, device: torch.device, bigram_hash_buckets: int = 0): + self.rank = rank + self.world_size = world_size + self.device = device + self.stream = TokenStream(pattern) + self.bigram_hash_buckets = bigram_hash_buckets + + def next_batch(self, global_tokens: int, seq_len: int, grad_accum_steps: int) -> tuple[Tensor, Tensor, Tensor | None]: + local_tokens = global_tokens // (self.world_size * grad_accum_steps) + per_rank_span = local_tokens + 1 + chunk = self.stream.take(per_rank_span * self.world_size) + start = self.rank * per_rank_span + local = chunk[start : start + per_rank_span].to(dtype=torch.int64) + x = local[:-1].reshape(-1, seq_len) + y = local[1:].reshape(-1, seq_len) + bigram_ids = None + if self.bigram_hash_buckets > 0: + bigram_ids = compute_bigram_hash(x, self.bigram_hash_buckets).to(self.device, non_blocking=True) + return x.to(self.device, non_blocking=True), y.to(self.device, non_blocking=True), bigram_ids + +# ----------------------------- +# TRANSFORMER MODULES +# ----------------------------- + +class RMSNorm(nn.Module): + def __init__(self, eps: float | None = None): + super().__init__() + self.eps = eps + + def forward(self, x: Tensor) -> Tensor: + return F.rms_norm(x, (x.size(-1),), eps=self.eps) + + +class CastedLinear(nn.Linear): + # Keep weights in fp32 for optimizer/state quality, cast at matmul time for bf16 compute. 
+ def forward(self, x: Tensor) -> Tensor: + bias = self.bias.to(x.dtype) if self.bias is not None else None + return F.linear(x, self.weight.to(x.dtype), bias) + + +def restore_low_dim_params_to_fp32(module: nn.Module) -> None: + # Keep small/control parameters in fp32 even when the model body runs in bf16. + with torch.no_grad(): + for name, param in module.named_parameters(): + if (param.ndim < 2 or any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS)) and param.dtype != torch.float32: + param.data = param.data.float() + + +class Rotary(nn.Module): + # Caches cos/sin tables per sequence length on the current device. + def __init__(self, dim: int, base: float = 10000.0): + super().__init__() + inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self._seq_len_cached = 0 + self._cos_cached: Tensor | None = None + self._sin_cached: Tensor | None = None + + def forward(self, seq_len: int, device: torch.device, dtype: torch.dtype) -> tuple[Tensor, Tensor]: + if ( + self._cos_cached is None + or self._sin_cached is None + or self._seq_len_cached != seq_len + or self._cos_cached.device != device + ): + t = torch.arange(seq_len, device=device, dtype=self.inv_freq.dtype) + freqs = torch.outer(t, self.inv_freq.to(device)) + self._cos_cached = freqs.cos()[None, None, :, :] + self._sin_cached = freqs.sin()[None, None, :, :] + self._seq_len_cached = seq_len + return self._cos_cached.to(dtype=dtype), self._sin_cached.to(dtype=dtype) + + +def apply_rotary_emb(x: Tensor, cos: Tensor, sin: Tensor) -> Tensor: + half = x.size(-1) // 2 + x1, x2 = x[..., :half], x[..., half:] + return torch.cat((x1 * cos + x2 * sin, x1 * (-sin) + x2 * cos), dim=-1) + + +class CausalSelfAttention(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + num_kv_heads: int, + rope_base: float, + qk_gain_init: float, + paired: bool = False, + use_xsa: bool = False, + rope_frac: float = 1.0, + gated_attention: bool = False, + value_residual: bool = False, + ): + super().__init__() + if dim % num_heads != 0: + raise ValueError("model_dim must be divisible by num_heads") + if num_heads % num_kv_heads != 0: + raise ValueError("num_heads must be divisible by num_kv_heads") + self.num_heads = num_heads + self.num_kv_heads = num_kv_heads + self.head_dim = dim // num_heads + if self.head_dim % 2 != 0: + raise ValueError("head_dim must be even for RoPE") + self.paired = paired + self.use_xsa = use_xsa + if paired and (num_heads % 2 != 0 or num_kv_heads % 2 != 0): + raise ValueError("Paired heads requires even num_heads and num_kv_heads") + kv_dim = self.num_kv_heads * self.head_dim + self.c_q = CastedLinear(dim, dim, bias=False) + self.c_k = CastedLinear(dim, kv_dim, bias=False) + self.c_v = CastedLinear(dim, kv_dim, bias=False) + self.proj = CastedLinear(dim, dim, bias=False) + self.proj._zero_init = True + self.q_gain = nn.Parameter(torch.full((num_heads,), qk_gain_init, dtype=torch.float32)) + # Gated Attention: per-head sigmoid gate, initialized near-open (sigmoid(4) ≈ 0.98) + if gated_attention: + self.attn_gate = nn.Parameter(torch.full((num_heads,), 4.0, dtype=torch.float32)) + # Value Residual: blend V from first layer into this layer's V + if value_residual: + self.vr_alpha = nn.Parameter(torch.ones(1, dtype=torch.float32)) + self.vr_beta = nn.Parameter(torch.zeros(1, dtype=torch.float32)) + # Partial RoPE: only apply rotary embeddings to a fraction of head dims + self.rope_dim = max(2, (int(self.head_dim * 
rope_frac) // 2) * 2) # round down to even + self.rotary = Rotary(self.rope_dim, base=rope_base) + + def _xsa_efficient(self, y: Tensor, v: Tensor) -> Tensor: + """Subtract self-value projection via GQA-aware reshape (no repeat_interleave). + y: (B, T, H, D) attention output per head + v: (B, T, Hkv, D) value vectors (before GQA expansion) + Returns: (B, T, H, D) with self-value component removed + """ + B, T, H, D = y.shape + Hkv = v.size(-2) + group = H // Hkv + y_g = y.reshape(B, T, Hkv, group, D) + vn = F.normalize(v, dim=-1).unsqueeze(-2) # (B, T, Hkv, 1, D) + proj = (y_g * vn).sum(dim=-1, keepdim=True) * vn + return (y_g - proj).reshape(B, T, H, D) + + def forward(self, x: Tensor, q_delta=None, v_delta=None, v_first=None) -> Tensor: + bsz, seqlen, dim = x.shape + q = self.c_q(x) + (q_delta if q_delta is not None else 0) + k = self.c_k(x) + v = self.c_v(x) + (v_delta if v_delta is not None else 0) + + # Value Residual: cache raw V for first layer, blend with first layer's V for subsequent + self._v_for_residual = v # (B, T, kv_dim) — GPT grabs this from first layer + if v_first is not None and hasattr(self, 'vr_alpha'): + v = self.vr_alpha.to(dtype=v.dtype) * v + self.vr_beta.to(dtype=v.dtype) * v_first + + if self.paired: + # Paired Head Attention: merge adjacent heads, interleave into doubled sequence. + ph = self.num_heads // 2 + pkv = self.num_kv_heads // 2 + seq2 = seqlen * 2 + q = q.reshape(bsz, seqlen, ph, self.head_dim * 2) + q = q.reshape(bsz, seq2, ph, self.head_dim).transpose(1, 2) + k = k.reshape(bsz, seqlen, pkv, self.head_dim * 2) + k = k.reshape(bsz, seq2, pkv, self.head_dim).transpose(1, 2) + v = v.reshape(bsz, seq2, pkv, self.head_dim).transpose(1, 2) + q = F.rms_norm(q, (q.size(-1),)) + k = F.rms_norm(k, (k.size(-1),)) + cos, sin = self.rotary(seq2, x.device, q.dtype) + rd = self.rope_dim + if rd < self.head_dim: + q = torch.cat([apply_rotary_emb(q[..., :rd], cos, sin), q[..., rd:]], dim=-1) + k = torch.cat([apply_rotary_emb(k[..., :rd], cos, sin), k[..., rd:]], dim=-1) + else: + q = apply_rotary_emb(q, cos, sin) + k = apply_rotary_emb(k, cos, sin) + paired_gain = self.q_gain.reshape(-1, 2).mean(dim=1) + q = q * paired_gain.to(dtype=q.dtype)[None, :, None, None] + if pkv != ph: + rep = ph // pkv + k = k[:, :, None, :, :].expand(bsz, pkv, rep, seq2, self.head_dim).reshape(bsz, ph, seq2, self.head_dim) + v = v[:, :, None, :, :].expand(bsz, pkv, rep, seq2, self.head_dim).reshape(bsz, ph, seq2, self.head_dim) + y = F.scaled_dot_product_attention(q, k, v, attn_mask=None, is_causal=True) + # Gated Attention + if hasattr(self, 'attn_gate'): + y = y * torch.sigmoid(self.attn_gate).to(dtype=y.dtype)[None, :, None, None] + y = y.transpose(1, 2).contiguous().reshape(bsz, seqlen, dim) + else: + q = q.reshape(bsz, seqlen, self.num_heads, self.head_dim).transpose(1, 2) + k = k.reshape(bsz, seqlen, self.num_kv_heads, self.head_dim).transpose(1, 2) + v = v.reshape(bsz, seqlen, self.num_kv_heads, self.head_dim).transpose(1, 2) + q = F.rms_norm(q, (q.size(-1),)) + k = F.rms_norm(k, (k.size(-1),)) + cos, sin = self.rotary(seqlen, x.device, q.dtype) + rd = self.rope_dim + if rd < self.head_dim: + q = torch.cat([apply_rotary_emb(q[..., :rd], cos, sin), q[..., rd:]], dim=-1) + k = torch.cat([apply_rotary_emb(k[..., :rd], cos, sin), k[..., rd:]], dim=-1) + else: + q = apply_rotary_emb(q, cos, sin) + k = apply_rotary_emb(k, cos, sin) + q = q * self.q_gain.to(dtype=q.dtype)[None, :, None, None] + v_pre_expand = v # keep unexpanded values for XSA + if self.num_kv_heads != self.num_heads: + 
rep = self.num_heads // self.num_kv_heads + k = k[:, :, None, :, :].expand(bsz, self.num_kv_heads, rep, seqlen, self.head_dim).reshape(bsz, self.num_heads, seqlen, self.head_dim) + v = v[:, :, None, :, :].expand(bsz, self.num_kv_heads, rep, seqlen, self.head_dim).reshape(bsz, self.num_heads, seqlen, self.head_dim) + y = F.scaled_dot_product_attention(q, k, v, attn_mask=None, is_causal=True) + # Gated Attention + if hasattr(self, 'attn_gate'): + y = y * torch.sigmoid(self.attn_gate).to(dtype=y.dtype)[None, :, None, None] + if self.use_xsa: + y = self._xsa_efficient(y.transpose(1, 2), v_pre_expand.transpose(1, 2)) + y = y.contiguous().reshape(bsz, seqlen, dim) + else: + y = y.transpose(1, 2).contiguous().reshape(bsz, seqlen, dim) + return self.proj(y) + + +class MLP(nn.Module): + def __init__(self, dim: int, mlp_mult: int, use_swiglu: bool = False): + super().__init__() + self.use_swiglu = use_swiglu + if use_swiglu: + # SwiGLU: 3 matrices but reduce hidden to match 2-matrix param count + # 3 * dim * swiglu_hidden = 2 * dim * (mlp_mult * dim) + # swiglu_hidden = 2 * mlp_mult * dim / 3 + swiglu_hidden = (2 * mlp_mult * dim) // 3 + self.gate = CastedLinear(dim, swiglu_hidden, bias=False) + self.fc = CastedLinear(dim, swiglu_hidden, bias=False) + self.proj = CastedLinear(swiglu_hidden, dim, bias=False) + else: + hidden = mlp_mult * dim + self.fc = CastedLinear(dim, hidden, bias=False) + self.proj = CastedLinear(hidden, dim, bias=False) + self.proj._zero_init = True + + def forward(self, x: Tensor) -> Tensor: + if self.use_swiglu: + return self.proj(F.silu(self.gate(x)) * self.fc(x)) + x = torch.relu(self.fc(x)) + return self.proj(x.square()) + + +class Block(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + num_kv_heads: int, + mlp_mult: int, + rope_base: float, + qk_gain_init: float, + paired: bool = False, + use_xsa: bool = False, + rope_frac: float = 1.0, + gated_attention: bool = False, + value_residual: bool = False, + use_swiglu: bool = False, + layer_idx: int = 0, + ln_scale: bool = False, + catalytic: bool = False, + ): + super().__init__() + self.attn_norm = RMSNorm() + self.mlp_norm = RMSNorm() + self.attn = CausalSelfAttention(dim, num_heads, num_kv_heads, rope_base, qk_gain_init, paired=paired, use_xsa=use_xsa, rope_frac=rope_frac, gated_attention=gated_attention, value_residual=value_residual) + self.mlp = MLP(dim, mlp_mult, use_swiglu=use_swiglu) + self.attn_scale = nn.Parameter(torch.ones(dim, dtype=torch.float32)) + self.mlp_scale = nn.Parameter(torch.ones(dim, dtype=torch.float32)) + self.resid_mix = nn.Parameter(torch.stack((torch.ones(dim), torch.zeros(dim))).float()) + # LN Scale: damp norm inputs by 1/sqrt(layer_idx+1) for deeper layers + self.ln_scale_factor = 1.0 / math.sqrt(layer_idx + 1) if ln_scale else 1.0 + # Catalytic Residuals: learned per-dim gates on attn/MLP outputs (init=1.0, no-op start) + if catalytic: + self.cat_attn = nn.Parameter(torch.ones(dim, dtype=torch.float32)) + self.cat_mlp = nn.Parameter(torch.ones(dim, dtype=torch.float32)) + + def forward(self, x: Tensor, x0: Tensor, q_delta_fn=None, v_delta_fn=None, v_first=None) -> Tensor: + mix = self.resid_mix.to(dtype=x.dtype) + x = mix[0][None, None, :] * x + mix[1][None, None, :] * x0 + s = self.ln_scale_factor + n = self.attn_norm(x) * s + qd = q_delta_fn(n) if q_delta_fn is not None else None + vd = v_delta_fn(n) if v_delta_fn is not None else None + attn_out = self.attn(n, qd, vd, v_first=v_first) + cat_a = self.cat_attn.to(dtype=x.dtype)[None, None, :] if hasattr(self, 'cat_attn') 
else 1.0 + x = x + self.attn_scale.to(dtype=x.dtype)[None, None, :] * cat_a * attn_out + cat_m = self.cat_mlp.to(dtype=x.dtype)[None, None, :] if hasattr(self, 'cat_mlp') else 1.0 + x = x + self.mlp_scale.to(dtype=x.dtype)[None, None, :] * cat_m * self.mlp(self.mlp_norm(x) * s) + return x + + +class GPT(nn.Module): + def __init__( + self, + vocab_size: int, + num_layers: int, + model_dim: int, + num_heads: int, + num_kv_heads: int, + mlp_mult: int, + tie_embeddings: bool, + tied_embed_init_std: float, + logit_softcap: float, + rope_base: float, + qk_gain_init: float, + smeargate_dim: int = 12, + bigram_hash_buckets: int = 4096, + bigram_embed_dim: int = 128, + paired_head_layers: str = "", + xsa_last_n: int = 0, + rope_frac: float = 1.0, + backout_enabled: bool = False, + use_swiglu: bool = False, + value_residual: bool = False, + gated_attention: bool = False, + ln_scale: bool = False, + catalytic: bool = False, + ): + super().__init__() + if logit_softcap <= 0.0: + raise ValueError(f"logit_softcap must be positive, got {logit_softcap}") + self.tie_embeddings = tie_embeddings + self.tied_embed_init_std = tied_embed_init_std + self.logit_softcap = logit_softcap + self.tok_emb = nn.Embedding(vocab_size, model_dim) + + # SmearGate: blend each token with the previous token's embedding + self.smear_gate = nn.Linear(smeargate_dim, 1, bias=False) + self.smear_lambda = nn.Parameter(torch.zeros(1)) + + # BigramHash: token-pair context via hash table + self.bigram_embed = nn.Embedding(bigram_hash_buckets, bigram_embed_dim) + self.bigram_proj = CastedLinear(bigram_embed_dim, model_dim, bias=False) + self.bigram_lambdas = nn.Parameter(0.05 * torch.ones(num_layers, dtype=torch.float32)) + + # Parse paired head layers + paired_set = set() + if paired_head_layers: + paired_set = {int(x.strip()) for x in paired_head_layers.split(",") if x.strip()} + + self.num_encoder_layers = num_layers // 2 + self.num_decoder_layers = num_layers - self.num_encoder_layers + self.num_skip_weights = min(self.num_encoder_layers, self.num_decoder_layers) + self.skip_weights = nn.Parameter(torch.ones(self.num_skip_weights, model_dim, dtype=torch.float32)) + self.skip_gates = nn.Parameter(torch.zeros(self.num_skip_weights, model_dim, dtype=torch.float32)) + self.value_residual = value_residual + xsa_start = num_layers - xsa_last_n if xsa_last_n > 0 else num_layers + self.blocks = nn.ModuleList( + [ + Block( + model_dim, + num_heads, + num_kv_heads, + mlp_mult, + rope_base, + qk_gain_init, + paired=(i in paired_set), + use_xsa=(i >= xsa_start), + rope_frac=rope_frac, + gated_attention=gated_attention, + value_residual=(value_residual and i > 0), # first layer has no v_first + use_swiglu=use_swiglu, + layer_idx=i, + ln_scale=ln_scale, + catalytic=catalytic, + ) + for i in range(num_layers) + ] + ) + self.final_norm = RMSNorm() + # Backout Connection: subtract learned fraction of midpoint hidden state + self.backout_enabled = backout_enabled + if backout_enabled: + self.backout_lambda = nn.Parameter(torch.zeros(1)) + self.lm_head = None if tie_embeddings else CastedLinear(model_dim, vocab_size, bias=False) + if self.lm_head is not None: + self.lm_head._zero_init = True + self._init_weights() + + def _init_weights(self) -> None: + if self.tie_embeddings: + nn.init.normal_(self.tok_emb.weight, mean=0.0, std=self.tied_embed_init_std) + # Orthogonal init for linear layers, zero init for output projections + for module in self.modules(): + if isinstance(module, nn.Linear): + if getattr(module, "_zero_init", False): + 
nn.init.zeros_(module.weight) + elif module.weight.ndim == 2 and min(module.weight.shape) > 1: + nn.init.orthogonal_(module.weight) + # SmearGate starts at zero (no blending initially) + nn.init.zeros_(self.smear_gate.weight) + # BigramHash starts at zero + nn.init.zeros_(self.bigram_embed.weight) + # muP-style output scaling for attention and MLP projections + scale = 1.0 / math.sqrt(2 * len(self.blocks)) + for block in self.blocks: + if hasattr(block.attn, 'proj'): + block.attn.proj.weight.data *= scale + if hasattr(block.mlp, 'proj'): + block.mlp.proj.weight.data *= scale + + def _compute_logits(self, input_ids: Tensor, bigram_ids: Tensor | None = None, lora=None) -> Tensor: + """Shared forward body: embed → blocks → logits.""" + x = self.tok_emb(input_ids) + # SmearGate: blend with previous token embedding + smear_gate_out = self.smear_lambda * torch.sigmoid(self.smear_gate(x[:, 1:, :self.smear_gate.in_features])) + x = torch.cat([x[:, :1], x[:, 1:] + smear_gate_out * x[:, :-1]], dim=1) + x = F.rms_norm(x, (x.size(-1),)) + x0 = x + # BigramHash: per-layer injection of token-pair context + if bigram_ids is not None: + bigram_emb = self.bigram_proj(self.bigram_embed(bigram_ids)) + else: + bigram_emb = None + skips: list[Tensor] = [] + v_first = None # Value Residual: V from first attention layer + # First half stores skips; second half reuses them in reverse order. + for i in range(self.num_encoder_layers): + if bigram_emb is not None: + x = x + self.bigram_lambdas[i].to(dtype=x.dtype) * bigram_emb + qd = lora.q_loras[i] if lora else None + vd = lora.v_loras[i] if lora else None + x = self.blocks[i](x, x0, qd, vd, v_first=v_first) + if i == 0 and self.value_residual: + v_first = self.blocks[0].attn._v_for_residual + skips.append(x) + h_mid = x # capture midpoint for backout connection + for i in range(self.num_decoder_layers): + bi = self.num_encoder_layers + i + if skips: + skip = skips.pop() + gate = torch.sigmoid(self.skip_gates[i].to(dtype=x.dtype)) + scaled_skip = self.skip_weights[i].to(dtype=x.dtype)[None, None, :] * skip + x = gate[None, None, :] * x + (1.0 - gate[None, None, :]) * scaled_skip + if bigram_emb is not None: + x = x + self.bigram_lambdas[bi].to(dtype=x.dtype) * bigram_emb + qd = lora.q_loras[bi] if lora else None + vd = lora.v_loras[bi] if lora else None + x = self.blocks[bi](x, x0, qd, vd, v_first=v_first) + # Backout Connection: subtract learned fraction of midpoint hidden state + if self.backout_enabled: + x = x - self.backout_lambda.to(dtype=x.dtype) * h_mid + x = self.final_norm(x) + if self.tie_embeddings: + logits = F.linear(x, self.tok_emb.weight) + else: + logits = self.lm_head(x) + logits = logits + (lora.lm_head_lora(x) if lora else 0) + return self.logit_softcap * torch.tanh(logits / self.logit_softcap) + + def forward(self, input_ids: Tensor, target_ids: Tensor, lora=None, bigram_ids: Tensor | None = None) -> Tensor: + logits = self._compute_logits(input_ids, bigram_ids, lora) + if lora: + bsz, sl, V = logits.shape + return F.cross_entropy( + logits.float().reshape(-1, V), target_ids.reshape(-1), reduction="none").reshape(bsz, sl) + return F.cross_entropy(logits.float().reshape(-1, logits.size(-1)), target_ids.reshape(-1), reduction="mean") + + def forward_logits(self, input_ids: Tensor, bigram_ids: Tensor | None = None) -> Tensor: + """Return raw logits (for sliding window eval).""" + return self._compute_logits(input_ids, bigram_ids) + + +# ----------------------------- +# TEST-TIME TRAINING (LoRA) +# ----------------------------- +# +# At evaluation 
time, we adapt per-document low-rank adapters on the validation data. +# Each document gets its own adapter, so there is no inter-document dependency. + +BOS_ID = 1 + +class BatchedLinearLoRA(nn.Module): + """LoRA for a linear layer, with independent weights per batch element. + Computes x @ Aᵀ @ Bᵀ = x @ (BA)ᵀ, i.e. the LoRA delta is ΔW = BA.""" + def __init__(self, bsz: int, in_features: int, out_features: int, rank: int): + super().__init__() + self.in_features = in_features + self.A = nn.Parameter(torch.empty(bsz, rank, in_features)) # down-projection + self.B = nn.Parameter(torch.zeros(bsz, out_features, rank)) # up-projection + self.reset() + + def forward(self, x: Tensor) -> Tensor: + return (x @ self.A.transpose(1, 2)) @ self.B.transpose(1, 2) # (bsz, T, out) + + def reset(self) -> None: + bound = 1.0 / math.sqrt(self.in_features) + with torch.no_grad(): + self.A.uniform_(-bound, bound) # kaiming-uniform + self.B.zero_() + +class BatchedTTTLoRA(nn.Module): + """All LoRA adapters for one batch: LM head and Q/V per block.""" + def __init__(self, bsz: int, model: GPT, rank: int): + super().__init__() + dim = model.tok_emb.embedding_dim + vocab = model.tok_emb.num_embeddings + self.lm_head_lora = BatchedLinearLoRA(bsz, dim, vocab, rank) + self.q_loras = nn.ModuleList() + self.v_loras = nn.ModuleList() + for block in model.blocks: + self.q_loras.append(BatchedLinearLoRA(bsz, dim, block.attn.c_q.weight.shape[0], rank)) + self.v_loras.append(BatchedLinearLoRA(bsz, dim, block.attn.c_v.weight.shape[0], rank)) + + def reset(self) -> None: + for m in self.modules(): + if isinstance(m, BatchedLinearLoRA): + m.reset() + +def _reset_ttt_optimizer(opt): + for group in opt.param_groups: + for p in group['params']: + s = opt.state.get(p) + if not s: # Fresh state. + continue + s['exp_avg'].zero_() + s['exp_avg_sq'].zero_() + s['step'].fill_(0) + +def _build_ttt_optimizer(lora, args: Hyperparameters): + return torch.optim.Adam(lora.parameters(), lr=args.ttt_lora_lr, betas=(args.beta1, args.beta2), eps=1e-10) + +def _find_docs(all_tokens: Tensor, include_next_bos: bool = True) -> list[tuple[int, int]]: + """Return (start_offset, length) for each document, identified by BOS boundaries. + + If include_next_bos is True, include next document's BOS (to match continuous-stream + eval token count exactly). 
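+
+    Example return value (offsets and lengths are illustrative only): [(0, 517), (516, 842), ...];
+    with include_next_bos=True consecutive entries overlap by one token (the next BOS).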
+ """ + bos_positions = (all_tokens == BOS_ID).nonzero(as_tuple=True)[0].numpy() + docs = [] + for i in range(len(bos_positions)): + start = int(bos_positions[i]) + end = int(bos_positions[i + 1]) if i + 1 < len(bos_positions) else all_tokens.numel() + if include_next_bos and i + 1 < len(bos_positions): + end += 1 + assert end - start >= 2 + docs.append((start, end - start)) + return docs + +def _compute_chunk_window(ci: int, pred_len: int, num_chunks: int, chunk_size: int, eval_seq_len: int): + """Return (win_start, win_len, chunk_offset, chunk_len) for chunk `ci` of a doc.""" + chunk_start = ci * chunk_size + chunk_end = pred_len if ci == num_chunks - 1 else (ci + 1) * chunk_size + win_start = max(0, chunk_end - eval_seq_len) + win_len = chunk_end - win_start + chunk_offset = chunk_start - win_start + chunk_len = chunk_end - chunk_start + return win_start, win_len, chunk_offset, chunk_len + +def _accumulate_bpb( + ptl: Tensor, x: Tensor, y: Tensor, + batch_i: int, chunk_offset: int, chunk_len: int, + base_bytes_lut: Tensor, has_leading_space_lut: Tensor, is_boundary_token_lut: Tensor, + loss_sum: Tensor, byte_sum: Tensor, token_count: Tensor, +): + """Add one doc-chunk's contribution to the running BPB accumulators.""" + lbl = ptl[batch_i, chunk_offset:chunk_offset + chunk_len].to(torch.float64) + prev = x[batch_i, chunk_offset:chunk_offset + chunk_len] + tgt = y[batch_i, chunk_offset:chunk_offset + chunk_len] + tok_bytes = base_bytes_lut[tgt].to(torch.float64) + tok_bytes += has_leading_space_lut[tgt] & ~is_boundary_token_lut[prev] + loss_sum += lbl.sum() + byte_sum += tok_bytes.sum() + token_count += chunk_len + +def eval_val_ttt_sgd( + args: Hyperparameters, + model: GPT, + rank: int, + world_size: int, + device: torch.device, + val_tokens: Tensor, + base_bytes_lut: Tensor, + has_leading_space_lut: Tensor, + is_boundary_token_lut: Tensor, +) -> tuple[float, float]: + """Full AdamW test-time training: fine-tune entire model on val tokens, then eval. + + From PR #442: AdamW TTT gives -0.019 BPB over SGD, 40% faster convergence. + 10 epochs of AdamW on validation token stream, all blocks unfrozen. 
+ """ + model.train() + + # Freeze first N blocks + freeze_n = args.ttt_sgd_freeze_layers + for i, block in enumerate(model.blocks): + if i < freeze_n: + for p in block.parameters(): + p.requires_grad_(False) + else: + for p in block.parameters(): + p.requires_grad_(True) + # Freeze embedding (stable, not worth adapting) + model.tok_emb.weight.requires_grad_(False) + if hasattr(model, 'bigram_embed'): + model.bigram_embed.weight.requires_grad_(False) + if hasattr(model, 'bigram_proj'): + for p in model.bigram_proj.parameters(): + p.requires_grad_(False) + if hasattr(model, 'smear_gate'): + for p in model.smear_gate.parameters(): + p.requires_grad_(False) + model.smear_lambda.requires_grad_(False) + + trainable = [p for p in model.parameters() if p.requires_grad] + if rank == 0: + print(f"TTT AdamW: {len(trainable)} trainable param groups, " + f"{sum(p.numel() for p in trainable):,} params, " + f"freezing first {freeze_n} blocks") + + optimizer = torch.optim.AdamW(trainable, lr=args.ttt_sgd_lr, weight_decay=0.0) + # Cosine TTT decay: anneal LR over epochs (PR #481: -0.006 BPB) + ttt_scheduler = None + if args.ttt_cosine_decay: + ttt_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( + optimizer, T_max=args.ttt_sgd_epochs, eta_min=args.ttt_sgd_lr * 0.1 + ) + + seq_len = args.train_seq_len + batch_seqs = args.ttt_sgd_batch_seqs + # Use all val tokens visible to this rank + n_val = len(val_tokens) + rank_start = (n_val * rank) // world_size + rank_end = (n_val * (rank + 1)) // world_size + rank_tokens = val_tokens[rank_start:rank_end].to(device) + + # Pre-slice all chunks for batching + starts = list(range(0, len(rank_tokens) - seq_len, seq_len)) + n_total_chunks = len(starts) + + for epoch in range(args.ttt_sgd_epochs): + epoch_loss = 0.0 + n_steps = 0 + for batch_start in range(0, n_total_chunks, batch_seqs): + batch_starts = starts[batch_start:batch_start + batch_seqs] + bs = len(batch_starts) + + # Build batched tensors [bs, seq_len] + chunks = torch.stack([rank_tokens[s:s + seq_len].long() for s in batch_starts]) + targets = torch.stack([rank_tokens[s + 1:s + seq_len + 1].long() for s in batch_starts]) + bigram_ids = compute_bigram_hash(chunks, args.bigram_hash_buckets) if args.bigram_hash_buckets > 0 else None + + optimizer.zero_grad() + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + loss = model(chunks, targets, bigram_ids=bigram_ids) + loss.backward() + + # DDP gradient sync across ranks (PR #462: all-reduce before step) + if world_size > 1: + for p in trainable: + if p.grad is not None: + dist.all_reduce(p.grad, op=dist.ReduceOp.AVG) + + # Gradient clipping (PR #462: clip_grad_norm_ at 1.0) + torch.nn.utils.clip_grad_norm_(trainable, 1.0) + + optimizer.step() + epoch_loss += loss.item() + n_steps += 1 + + if n_steps > 0 and rank == 0: + print(f"TTT AdamW epoch {epoch + 1}/{args.ttt_sgd_epochs}: " + f"loss={epoch_loss / n_steps:.4f} steps={n_steps} (bs={batch_seqs})") + if ttt_scheduler is not None: + ttt_scheduler.step() + + # Unfreeze all params and switch to eval + for p in model.parameters(): + p.requires_grad_(False) + model.eval() + + # Now run sliding window eval with the adapted model + return eval_val_sliding_window( + args, model, rank, world_size, device, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + ) + + +def eval_val_ttt_lora( + args: Hyperparameters, + base_model: GPT, + rank: int, + world_size: int, + device: torch.device, + base_bytes_lut: Tensor, + has_leading_space_lut: Tensor, + 
is_boundary_token_lut: Tensor, +) -> tuple[float, float]: + """Evaluate with batched LoRA test-time training. Returns (val_loss, val_bpb).""" + # Load validation tokens and find document boundaries + files = sorted(glob.glob(args.val_files)) + all_tokens = torch.cat([load_data_shard(Path(f)) for f in files]) + docs = _find_docs(all_tokens) + + # Each rank takes a contiguous slice of documents + rank_docs = docs[(len(docs) * rank) // world_size : (len(docs) * (rank + 1)) // world_size] + chunk_size = args.ttt_chunk_size + eval_seq_len = args.ttt_eval_seq_len + batch_size = args.ttt_batch_size + lora_rank = args.ttt_lora_rank + + rank_docs.sort(key=lambda d: (d[1] - 2) // chunk_size) + + base_model.eval() + for p in base_model.parameters(): + p.requires_grad_(False) + + lora = BatchedTTTLoRA(batch_size, base_model, lora_rank).to(device) + opt = _build_ttt_optimizer(lora, args) + + loss_sum = torch.zeros((), device=device, dtype=torch.float64) + byte_sum = torch.zeros((), device=device, dtype=torch.float64) + token_count = torch.zeros((), device=device, dtype=torch.float64) + + for bi in range(0, len(rank_docs), batch_size): + batch = rank_docs[bi:bi + batch_size] + bsz = len(batch) + + if bsz == batch_size: + cur_lora, cur_opt = lora, opt + cur_lora.reset() + _reset_ttt_optimizer(cur_opt) + else: + cur_lora = BatchedTTTLoRA(bsz, base_model, lora_rank).to(device) + cur_opt = _build_ttt_optimizer(cur_lora, args) + + pred_lens = [doc_len - 1 for _, doc_len in batch] + num_chunks = [(pl + chunk_size - 1) // chunk_size for pl in pred_lens] + max_nc = max(num_chunks) + + for ci in range(max_nc): + chunk_stats = _compute_chunk_window(ci, (ci + 1) * chunk_size, ci + 1, chunk_size, eval_seq_len) + context_size, chunk_offset = chunk_stats[1], chunk_stats[2] + + active = [ci < nc for nc in num_chunks] + needs_train = any(ci < nc - 1 for nc in num_chunks) + + x = torch.zeros(bsz, context_size, dtype=torch.int64, device=device) + y = torch.zeros(bsz, context_size, dtype=torch.int64, device=device) + doc_info = [] # (chunk_offset, chunk_len) per doc + for b in range(bsz): + if not active[b]: + doc_info.append((0, 0)) + continue + ds, dl = batch[b] + ws, wl, co, cl = _compute_chunk_window(ci, pred_lens[b], num_chunks[b], chunk_size, eval_seq_len) + chunk = all_tokens[ds + ws: ds + ws + wl + 1] + toks = chunk.to(dtype=torch.int64, device=device) + x[b, :wl] = toks[:-1] + y[b, :wl] = toks[1:] + doc_info.append((co, cl)) + + # Compute bigram hash IDs for this batch + bigram_ids = None + if args.bigram_hash_buckets > 0: + bigram_ids = compute_bigram_hash(x, args.bigram_hash_buckets) + + # Forward pass (keep grad graph alive only when we need to train) + if needs_train: + with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + ptl = base_model(x, y, lora=cur_lora, bigram_ids=bigram_ids) + else: + with torch.no_grad(), torch.autocast(device_type="cuda", dtype=torch.bfloat16): + ptl = base_model(x, y, lora=cur_lora, bigram_ids=bigram_ids) + + # Score: accumulate loss and byte counts for BPB (before training on chunk) + with torch.no_grad(): + for b in range(bsz): + if not active[b]: + continue + co, cl = doc_info[b] + _accumulate_bpb( + ptl, x, y, b, co, cl, base_bytes_lut, has_leading_space_lut, + is_boundary_token_lut, loss_sum, byte_sum, token_count) + + # Train: one Adam step on the LoRA params using this chunk's loss + if needs_train: + mask = torch.tensor([float(ci < num_chunks[b] - 1) for b in range(bsz)], device=device) + per_doc = ptl[:, chunk_offset:chunk_offset + chunk_size].mean(dim=-1) + 
cur_opt.zero_grad() + (per_doc * mask).sum().backward() + cur_opt.step() + + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(loss_sum, op=dist.ReduceOp.SUM) + dist.all_reduce(byte_sum, op=dist.ReduceOp.SUM) + dist.all_reduce(token_count, op=dist.ReduceOp.SUM) + + val_loss = float(loss_sum.item() / token_count.item()) + val_bpb = float((loss_sum.item() / math.log(2.0)) / byte_sum.item()) + return val_loss, val_bpb + +# ----------------------------- +# TRAINING +# ----------------------------- + +def main() -> None: + global zeropower_via_newtonschulz5 + + code = Path(__file__).read_text(encoding="utf-8") + args = Hyperparameters() + zeropower_via_newtonschulz5 = torch.compile(zeropower_via_newtonschulz5) + + # ----------------------------- + # DISTRIBUTED + CUDA SETUP + # ----------------------------- + + distributed = "RANK" in os.environ and "WORLD_SIZE" in os.environ + rank = int(os.environ.get("RANK", "0")) + world_size = int(os.environ.get("WORLD_SIZE", "1")) + local_rank = int(os.environ.get("LOCAL_RANK", "0")) + if world_size <= 0: + raise ValueError(f"WORLD_SIZE must be positive, got {world_size}") + if 8 % world_size != 0: + raise ValueError(f"WORLD_SIZE={world_size} must divide 8 so grad_accum_steps stays integral") + grad_accum_steps = 8 // world_size + grad_scale = 1.0 / grad_accum_steps + if not torch.cuda.is_available(): + raise RuntimeError("CUDA is required") + device = torch.device("cuda", local_rank) + torch.cuda.set_device(device) + if distributed: + dist.init_process_group(backend="nccl", device_id=device) + dist.barrier() + master_process = rank == 0 + + # Fast math knobs + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True + from torch.backends.cuda import enable_cudnn_sdp, enable_flash_sdp, enable_math_sdp, enable_mem_efficient_sdp + + enable_cudnn_sdp(False) + enable_flash_sdp(True) + enable_mem_efficient_sdp(False) + enable_math_sdp(False) + + logfile = None + if master_process: + os.makedirs("logs", exist_ok=True) + logfile = f"logs/{args.run_id}.txt" + print(logfile) + + def log0(msg: str, console: bool = True) -> None: + if not master_process: + return + if console: + print(msg) + if logfile is not None: + with open(logfile, "a", encoding="utf-8") as f: + print(msg, file=f) + + log0(code, console=False) + log0("=" * 100, console=False) + log0(f"Running Python {sys.version}", console=False) + log0(f"Running PyTorch {torch.__version__}", console=False) + log0( + subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=False).stdout, + console=False, + ) + log0("=" * 100, console=False) + + # ----------------------------- + # TOKENIZER + VALIDATION METRIC SETUP + # ----------------------------- + + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.cuda.manual_seed_all(args.seed) + + if not args.tokenizer_path.endswith(".model"): + raise ValueError(f"Script only setup for SentencePiece .model file: {args.tokenizer_path}") + sp = spm.SentencePieceProcessor(model_file=args.tokenizer_path) + if int(sp.vocab_size()) != args.vocab_size: + raise ValueError( + f"VOCAB_SIZE={args.vocab_size} does not match tokenizer vocab_size={int(sp.vocab_size())}" + ) + dataset_dir = Path(args.data_path).resolve() + actual_train_files = len(list(dataset_dir.glob("fineweb_train_*.bin"))) + val_tokens = load_validation_tokens(args.val_files, args.train_seq_len) + base_bytes_lut, has_leading_space_lut, is_boundary_token_lut = build_sentencepiece_luts( + sp, 
args.vocab_size, device + ) + log0(f"val_bpb:enabled tokenizer_kind=sentencepiece tokenizer_path={args.tokenizer_path}") + log0(f"train_loader:dataset:{dataset_dir.name} train_shards:{actual_train_files}") + log0(f"val_loader:shards pattern={args.val_files} tokens:{val_tokens.numel() - 1}") + + # ----------------------------- + # MODEL + OPTIMIZER SETUP + # ----------------------------- + + base_model = GPT( + vocab_size=args.vocab_size, + num_layers=args.num_layers, + model_dim=args.model_dim, + num_heads=args.num_heads, + num_kv_heads=args.num_kv_heads, + mlp_mult=args.mlp_mult, + tie_embeddings=args.tie_embeddings, + tied_embed_init_std=args.tied_embed_init_std, + logit_softcap=args.logit_softcap, + rope_base=args.rope_base, + qk_gain_init=args.qk_gain_init, + smeargate_dim=args.smeargate_dim, + bigram_hash_buckets=args.bigram_hash_buckets, + bigram_embed_dim=args.bigram_embed_dim, + paired_head_layers=args.paired_head_layers, + xsa_last_n=args.xsa_last_n, + rope_frac=args.rope_frac, + backout_enabled=args.backout_enabled, + use_swiglu=args.use_swiglu, + value_residual=args.value_residual, + gated_attention=args.gated_attention, + ln_scale=args.ln_scale, + catalytic=args.catalytic, + ).to(device).bfloat16() + for module in base_model.modules(): + if isinstance(module, CastedLinear): + module.float() + if isinstance(module, Rotary): + module.inv_freq.data = module.inv_freq.data.float() + restore_low_dim_params_to_fp32(base_model) + compiled_model = torch.compile(base_model, dynamic=False, fullgraph=True) + model: nn.Module = DDP(compiled_model, device_ids=[local_rank], broadcast_buffers=False) if distributed else compiled_model + + # Optimizer split: + # - token embedding (Adam) uses EMBED_LR + # - untied lm_head (Adam) uses HEAD_LR + # - matrix params in transformer blocks use MATRIX_LR via Muon + # - vectors/scalars use SCALAR_LR via Adam + # - decoder layers get DECODER_LR_MULT applied (PR #462) + num_enc = base_model.num_encoder_layers + matrix_params_enc: list[nn.Parameter] = [] + matrix_params_dec: list[nn.Parameter] = [] + scalar_params_enc: list[nn.Parameter] = [] + scalar_params_dec: list[nn.Parameter] = [] + for block_idx, block in enumerate(base_model.blocks): + is_decoder = block_idx >= num_enc + for name, p in block.named_parameters(): + if p.ndim == 2 and not any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS): + (matrix_params_dec if is_decoder else matrix_params_enc).append(p) + else: + (scalar_params_dec if is_decoder else scalar_params_enc).append(p) + # Other scalar params (not in blocks) + other_scalar_params: list[nn.Parameter] = [] + if base_model.skip_weights.numel() > 0: + other_scalar_params.append(base_model.skip_weights) + if hasattr(base_model, 'skip_gates') and base_model.skip_gates.numel() > 0: + other_scalar_params.append(base_model.skip_gates) + # SmearGate + BigramHash params — must be included in optimizer groups + other_scalar_params.append(base_model.smear_lambda) + other_scalar_params.append(base_model.bigram_lambdas) + # SmearGate linear (12→1, tiny) — treat as scalar-group param + for p in base_model.smear_gate.parameters(): + other_scalar_params.append(p) + # BigramHash proj is a matrix (128→model_dim), add to Muon encoder group + matrix_params_enc.append(base_model.bigram_proj.weight) + token_lr = args.tied_embed_lr if args.tie_embeddings else args.embed_lr + optimizer_tok = torch.optim.Adam( + [{"params": [base_model.tok_emb.weight], "lr": token_lr, "base_lr": token_lr}], + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + 
fused=True, + ) + matrix_lr_enc = args.matrix_lr + matrix_lr_dec = args.matrix_lr * args.decoder_lr_mult + optimizer_muon = Muon( + [ + {"params": matrix_params_enc, "lr": matrix_lr_enc, "base_lr": matrix_lr_enc}, + {"params": matrix_params_dec, "lr": matrix_lr_dec, "base_lr": matrix_lr_dec}, + ] if matrix_params_dec else matrix_params_enc, + lr=args.matrix_lr, + momentum=args.muon_momentum, + backend_steps=args.muon_backend_steps, + ) + if not matrix_params_dec: + for group in optimizer_muon.param_groups: + group["base_lr"] = args.matrix_lr + scalar_lr_enc = args.scalar_lr + scalar_lr_dec = args.scalar_lr * args.decoder_lr_mult + optimizer_scalar = torch.optim.Adam( + [ + {"params": scalar_params_enc + other_scalar_params, "lr": scalar_lr_enc, "base_lr": scalar_lr_enc}, + {"params": scalar_params_dec, "lr": scalar_lr_dec, "base_lr": scalar_lr_dec}, + ] if scalar_params_dec else [{"params": scalar_params_enc + other_scalar_params, "lr": scalar_lr_enc, "base_lr": scalar_lr_enc}], + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + fused=True, + ) + # BigramHash embedding table — Adam with scalar LR (starts from zero, needs gentle LR) + optimizer_bigram = torch.optim.Adam( + [{"params": [base_model.bigram_embed.weight], "lr": args.scalar_lr, "base_lr": args.scalar_lr}], + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + fused=True, + ) + optimizers: list[torch.optim.Optimizer] = [optimizer_tok, optimizer_muon, optimizer_scalar, optimizer_bigram] + if base_model.lm_head is not None: + optimizer_head = torch.optim.Adam( + [{"params": [base_model.lm_head.weight], "lr": args.head_lr, "base_lr": args.head_lr}], + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + fused=True, + ) + optimizers.insert(1, optimizer_head) + + n_params = sum(p.numel() for p in base_model.parameters()) + log0(f"model_params:{n_params}") + log0(f"world_size:{world_size} grad_accum_steps:{grad_accum_steps}") + log0("sdp_backends:cudnn=False flash=True mem_efficient=False math=False") + log0(f"attention_mode:gqa num_heads:{args.num_heads} num_kv_heads:{args.num_kv_heads}") + log0( + f"tie_embeddings:{args.tie_embeddings} embed_lr:{token_lr} " + f"head_lr:{args.head_lr if base_model.lm_head is not None else 0.0} " + f"matrix_lr:{args.matrix_lr} scalar_lr:{args.scalar_lr}" + ) + log0( + f"train_batch_tokens:{args.train_batch_tokens} train_seq_len:{args.train_seq_len} " + f"iterations:{args.iterations} warmup_steps:{args.warmup_steps} " + f"max_wallclock_seconds:{args.max_wallclock_seconds:.3f}" + ) + log0(f"seed:{args.seed}") + + # ----------------------------- + # DATA LOADER & MODEL WARMUP + # ----------------------------- + + train_loader = DistributedTokenLoader(args.train_files, rank, world_size, device, bigram_hash_buckets=args.bigram_hash_buckets) + + def zero_grad_all() -> None: + for opt in optimizers: + opt.zero_grad(set_to_none=True) + + max_wallclock_ms = 1000.0 * args.max_wallclock_seconds if args.max_wallclock_seconds > 0 else None + + def lr_mul(step: int, elapsed_ms: float) -> float: + if args.warmdown_iters <= 0: + return 1.0 + if max_wallclock_ms is None: + warmdown_start = max(args.iterations - args.warmdown_iters, 0) + return max((args.iterations - step) / max(args.warmdown_iters, 1), 0.0) if warmdown_start <= step < args.iterations else 1.0 + step_ms = elapsed_ms / max(step, 1) + warmdown_ms = args.warmdown_iters * step_ms + remaining_ms = max(max_wallclock_ms - elapsed_ms, 0.0) + return remaining_ms / max(warmdown_ms, 1e-9) if remaining_ms <= warmdown_ms else 1.0 + + # Warmup primes the 
compiled forward/backward/optimizer paths, then we restore the + # initial weights/optimizer state so measured training starts from the true init. + if args.warmup_steps > 0: + initial_model_state = {name: tensor.detach().cpu().clone() for name, tensor in base_model.state_dict().items()} + initial_optimizer_states = [copy.deepcopy(opt.state_dict()) for opt in optimizers] + model.train() + for warmup_step in range(args.warmup_steps): + zero_grad_all() + for micro_step in range(grad_accum_steps): + if distributed: + model.require_backward_grad_sync = micro_step == grad_accum_steps - 1 + x, y, bigram_ids = train_loader.next_batch(args.train_batch_tokens, args.train_seq_len, grad_accum_steps) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + warmup_loss = model(x, y, bigram_ids=bigram_ids) + (warmup_loss * grad_scale).backward() + for opt in optimizers: + opt.step() + zero_grad_all() + if args.warmup_steps <= 20 or (warmup_step + 1) % 10 == 0 or warmup_step + 1 == args.warmup_steps: + log0(f"warmup_step:{warmup_step + 1}/{args.warmup_steps}") + base_model.load_state_dict(initial_model_state, strict=True) + for opt, state in zip(optimizers, initial_optimizer_states, strict=True): + opt.load_state_dict(state) + zero_grad_all() + if distributed: + model.require_backward_grad_sync = True + train_loader = DistributedTokenLoader(args.train_files, rank, world_size, device, bigram_hash_buckets=args.bigram_hash_buckets) + + # ----------------------------- + # MAIN TRAINING LOOP + # ----------------------------- + + training_time_ms = 0.0 + stop_after_step: int | None = None + torch.cuda.synchronize() + t0 = time.perf_counter() + + # SWA: accumulate weight averages for smoother, more quantization-friendly weights + swa_state: dict[str, Tensor] | None = None + swa_count = 0 + + # EMA: exponential moving average for smoother weights (much better than SWA for quantization) + ema_state: dict[str, Tensor] | None = None + if args.ema_enabled: + ema_state = {k: v.clone().float() for k, v in base_model.state_dict().items()} + log0(f"EMA enabled: decay={args.ema_decay}") + + qat_logged = False + + step = 0 + while True: + last_step = step == args.iterations or (stop_after_step is not None and step >= stop_after_step) + + should_validate = last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0) + if should_validate: + torch.cuda.synchronize() + training_time_ms += 1000.0 * (time.perf_counter() - t0) + val_loss, val_bpb = eval_val( + args, + model, + rank, + world_size, + device, + grad_accum_steps, + val_tokens, + base_bytes_lut, + has_leading_space_lut, + is_boundary_token_lut, + ) + log0( + f"step:{step}/{args.iterations} val_loss:{val_loss:.4f} val_bpb:{val_bpb:.4f} " + f"train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms / max(step, 1):.2f}ms" + ) + torch.cuda.synchronize() + t0 = time.perf_counter() + + if last_step: + if stop_after_step is not None and step < args.iterations: + log0( + f"stopping_early: wallclock_cap train_time:{training_time_ms:.0f}ms " + f"step:{step}/{args.iterations}" + ) + break + + elapsed_ms = training_time_ms + 1000.0 * (time.perf_counter() - t0) + scale = lr_mul(step, elapsed_ms) + + # Late QAT: fake-quantize weights during last N% of training (STE) + qat_active = False + if args.qat_enabled: + elapsed_frac = elapsed_ms / max(max_wallclock_ms or 600000, 1) + if elapsed_frac >= args.qat_start_frac: + qat_active = True + if not qat_logged: + log0(f"QAT activated at step {step} (elapsed_frac={elapsed_frac:.3f})") + 
qat_logged = True + fake_quantize_model(base_model) + + zero_grad_all() + train_loss = torch.zeros((), device=device) + for micro_step in range(grad_accum_steps): + if distributed: + model.require_backward_grad_sync = micro_step == grad_accum_steps - 1 + x, y, bigram_ids = train_loader.next_batch(args.train_batch_tokens, args.train_seq_len, grad_accum_steps) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + loss = model(x, y, bigram_ids=bigram_ids) + train_loss += loss.detach() + (loss * grad_scale).backward() + train_loss /= grad_accum_steps + + frac = min(step / args.muon_momentum_warmup_steps, 1.0) if args.muon_momentum_warmup_steps > 0 else 1.0 + muon_momentum = (1 - frac) * args.muon_momentum_warmup_start + frac * args.muon_momentum + for group in optimizer_muon.param_groups: + group["momentum"] = muon_momentum + + for opt in optimizers: + for group in opt.param_groups: + group["lr"] = group["base_lr"] * scale + + if args.grad_clip_norm > 0: + torch.nn.utils.clip_grad_norm_(base_model.parameters(), args.grad_clip_norm) + for opt in optimizers: + opt.step() + + # Decoupled weight decay for Muon matrix params (applied after optimizer step) + if args.muon_weight_decay > 0: + with torch.no_grad(): + for group in optimizer_muon.param_groups: + for p in group["params"]: + p.mul_(1.0 - args.muon_weight_decay * group["lr"]) + + # SWA: accumulate checkpoint averages from swa_start_frac of training onward + if args.swa_enabled: + elapsed_frac = (training_time_ms + 1000.0 * (time.perf_counter() - t0)) / max(max_wallclock_ms or 600000, 1) + if elapsed_frac >= args.swa_start_frac and step % args.swa_interval == 0: + with torch.no_grad(): + sd = base_model.state_dict() + if swa_state is None: + swa_state = {k: v.clone().float() for k, v in sd.items()} + swa_count = 1 + else: + swa_count += 1 + for k in swa_state: + swa_state[k] += sd[k].float() + + # EMA: exponential moving average update every step + if ema_state is not None: + with torch.no_grad(): + d = args.ema_decay + for k, v in base_model.state_dict().items(): + ema_state[k].mul_(d).add_(v.float(), alpha=1.0 - d) + + step += 1 + approx_training_time_ms = training_time_ms + 1000.0 * (time.perf_counter() - t0) + should_log_train = ( + args.train_log_every > 0 + and (step <= 10 or step % args.train_log_every == 0 or stop_after_step is not None) + ) + if should_log_train: + log0( + f"step:{step}/{args.iterations} train_loss:{train_loss.item():.4f} " + f"train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms / step:.2f}ms" + ) + + # Needed to sync whether we've reached the wallclock cap. 
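+        # A MAX all-reduce below promotes any single rank hitting the cap into a global
+        # stop signal, so every rank exits on the same step and DDP collectives stay in
+        # sync; stop_after_step then triggers one final validation pass before breaking.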
+ reached_cap = max_wallclock_ms is not None and approx_training_time_ms >= max_wallclock_ms + if distributed and max_wallclock_ms is not None: + reached_cap_tensor = torch.tensor(int(reached_cap), device=device) + dist.all_reduce(reached_cap_tensor, op=dist.ReduceOp.MAX) + reached_cap = bool(reached_cap_tensor.item()) + if stop_after_step is None and reached_cap: + stop_after_step = step + + log0( + f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB " + f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB" + ) + + # Apply EMA or SWA averaged weights + if ema_state is not None: + log0(f"EMA: loading exponential moving average weights (decay={args.ema_decay})") + with torch.no_grad(): + ema_sd = {k: v.to(dtype=base_model.state_dict()[k].dtype) for k, v in ema_state.items()} + base_model.load_state_dict(ema_sd, strict=True) + elif swa_state is not None and swa_count > 1: + log0(f"SWA: averaging {swa_count} checkpoints") + with torch.no_grad(): + avg_sd = {k: (v / swa_count).to(dtype=base_model.state_dict()[k].dtype) for k, v in swa_state.items()} + base_model.load_state_dict(avg_sd, strict=True) + else: + log0("No weight averaging applied, using final weights") + + # ----------------------------- + # SERIALIZATION + ROUNDTRIP VALIDATION + # ----------------------------- + + if master_process: + torch.save(base_model.state_dict(), "final_model.pt") + model_bytes = os.path.getsize("final_model.pt") + code_bytes = len(code.encode("utf-8")) + log0(f"Serialized model: {model_bytes} bytes") + log0(f"Code size: {code_bytes} bytes") + log0(f"Total submission size: {model_bytes + code_bytes} bytes") + + use_int6 = args.quant_bits == 6 + quant_obj, quant_stats = quantize_state_dict_int8(base_model.state_dict(), use_int6=use_int6) + quant_buf = io.BytesIO() + torch.save(quant_obj, quant_buf) + quant_raw = quant_buf.getvalue() + # Use zstd-22 for better compression (especially with int6's restricted value range) + cctx = zstd.ZstdCompressor(level=22) + quant_blob = cctx.compress(quant_raw) + quant_raw_bytes = len(quant_raw) + if master_process: + with open("final_model.int8.ptz", "wb") as f: + f.write(quant_blob) + quant_file_bytes = os.path.getsize("final_model.int8.ptz") + code_bytes = len(code.encode("utf-8")) + ratio = quant_stats["baseline_tensor_bytes"] / max(quant_stats["int8_payload_bytes"], 1) + log0( + f"Serialized model int5/int6+zstd: {quant_file_bytes} bytes " + f"(payload:{quant_stats['int8_payload_bytes']} raw_torch:{quant_raw_bytes} payload_ratio:{ratio:.2f}x " + f"int5_tensors:{quant_stats['int5_tensors']} int6_tensors:{quant_stats['int6_tensors']})" + ) + log0(f"Total submission size int5/int6+zstd: {quant_file_bytes + code_bytes} bytes") + + if distributed: + dist.barrier() + with open("final_model.int8.ptz", "rb") as f: + quant_blob_disk = f.read() + dctx = zstd.ZstdDecompressor() + quant_state = torch.load(io.BytesIO(dctx.decompress(quant_blob_disk)), map_location="cpu") + base_model.load_state_dict(dequantize_state_dict_int8(quant_state), strict=True) + torch.cuda.synchronize() + + # Standard eval (for comparison) + t_qeval = time.perf_counter() + q_val_loss, q_val_bpb = eval_val( + args, + model, + rank, + world_size, + device, + grad_accum_steps, + val_tokens, + base_bytes_lut, + has_leading_space_lut, + is_boundary_token_lut, + ) + torch.cuda.synchronize() + log0( + f"final_int6_zstd_roundtrip val_loss:{q_val_loss:.4f} val_bpb:{q_val_bpb:.4f} " + f"eval_time:{1000.0 * (time.perf_counter() - t_qeval):.0f}ms" + ) + 
log0(f"final_int6_zstd_roundtrip_exact val_loss:{q_val_loss:.8f} val_bpb:{q_val_bpb:.8f}") + + # Sliding window eval (the competitive score) + torch.cuda.synchronize() + t_sw = time.perf_counter() + sw_val_loss, sw_val_bpb = eval_val_sliding_window( + args, + model, + rank, + world_size, + device, + val_tokens, + base_bytes_lut, + has_leading_space_lut, + is_boundary_token_lut, + ) + torch.cuda.synchronize() + log0( + f"final_sliding_window val_loss:{sw_val_loss:.4f} val_bpb:{sw_val_bpb:.4f} " + f"eval_time:{1000.0 * (time.perf_counter() - t_sw):.0f}ms" + ) + log0(f"final_sliding_window_exact val_loss:{sw_val_loss:.8f} val_bpb:{sw_val_bpb:.8f}") + + # Full SGD test-time training (SOTA technique from PR #338) + if args.ttt_sgd_enabled: + torch._dynamo.reset() + ttt_model = GPT( + vocab_size=args.vocab_size, + num_layers=args.num_layers, + model_dim=args.model_dim, + num_heads=args.num_heads, + num_kv_heads=args.num_kv_heads, + mlp_mult=args.mlp_mult, + tie_embeddings=args.tie_embeddings, + tied_embed_init_std=args.tied_embed_init_std, + logit_softcap=args.logit_softcap, + rope_base=args.rope_base, + qk_gain_init=args.qk_gain_init, + smeargate_dim=args.smeargate_dim, + bigram_hash_buckets=args.bigram_hash_buckets, + bigram_embed_dim=args.bigram_embed_dim, + paired_head_layers=args.paired_head_layers, + xsa_last_n=args.xsa_last_n, + rope_frac=args.rope_frac, + backout_enabled=args.backout_enabled, + use_swiglu=args.use_swiglu, + value_residual=args.value_residual, + gated_attention=args.gated_attention, + ln_scale=args.ln_scale, + catalytic=args.catalytic, + ).to(device, dtype=torch.bfloat16) + ttt_model.load_state_dict(base_model.state_dict()) + torch.cuda.synchronize() + t_ttt = time.perf_counter() + ttt_val_loss, ttt_val_bpb = eval_val_ttt_sgd( + args, ttt_model, rank, world_size, device, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + ) + torch.cuda.synchronize() + log0( + f"final_ttt_sgd val_loss:{ttt_val_loss:.4f} val_bpb:{ttt_val_bpb:.4f} " + f"eval_time:{1000.0 * (time.perf_counter() - t_ttt):.0f}ms" + ) + else: + log0("TTT SGD evaluation skipped (set TTT_SGD_ENABLED=1 to enable)") + + # LoRA test-time training evaluation (legacy, disabled by default) + if args.ttt_enabled and not args.ttt_sgd_enabled: + torch._dynamo.reset() + ttt_model = GPT( + vocab_size=args.vocab_size, + num_layers=args.num_layers, + model_dim=args.model_dim, + num_heads=args.num_heads, + num_kv_heads=args.num_kv_heads, + mlp_mult=args.mlp_mult, + tie_embeddings=args.tie_embeddings, + tied_embed_init_std=args.tied_embed_init_std, + logit_softcap=args.logit_softcap, + rope_base=args.rope_base, + qk_gain_init=args.qk_gain_init, + smeargate_dim=args.smeargate_dim, + bigram_hash_buckets=args.bigram_hash_buckets, + bigram_embed_dim=args.bigram_embed_dim, + paired_head_layers=args.paired_head_layers, + xsa_last_n=args.xsa_last_n, + rope_frac=args.rope_frac, + backout_enabled=args.backout_enabled, + use_swiglu=args.use_swiglu, + value_residual=args.value_residual, + gated_attention=args.gated_attention, + ln_scale=args.ln_scale, + catalytic=args.catalytic, + ).to(device, dtype=torch.bfloat16) + ttt_model.load_state_dict(base_model.state_dict()) + torch.cuda.synchronize() + t_ttt = time.perf_counter() + ttt_val_loss, ttt_val_bpb = eval_val_ttt_lora( + args, ttt_model, rank, world_size, device, + base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + ) + torch.cuda.synchronize() + log0( + f"final_int8_ttt_lora val_loss:{ttt_val_loss:.4f} val_bpb:{ttt_val_bpb:.4f} " + 
f"eval_time:{1000.0 * (time.perf_counter() - t_ttt):.0f}ms" + ) + + if distributed: + dist.destroy_process_group() + + +if __name__ == "__main__": + main() diff --git a/records/track_10min_16mb/2026-03-23_11L_UNet_Catalytic_SwiGLU_SW64/train_seed1337.log b/records/track_10min_16mb/2026-03-23_11L_UNet_Catalytic_SwiGLU_SW64/train_seed1337.log new file mode 100644 index 000000000..aa2968334 --- /dev/null +++ b/records/track_10min_16mb/2026-03-23_11L_UNet_Catalytic_SwiGLU_SW64/train_seed1337.log @@ -0,0 +1,130 @@ +W0323 05:13:04.712000 139622980711040 torch/distributed/run.py:779] +W0323 05:13:04.712000 139622980711040 torch/distributed/run.py:779] ***************************************** +W0323 05:13:04.712000 139622980711040 torch/distributed/run.py:779] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0323 05:13:04.712000 139622980711040 torch/distributed/run.py:779] ***************************************** +logs/a5198072-8ece-4e87-975d-f2326a3dc452.txt +val_bpb:enabled tokenizer_kind=sentencepiece tokenizer_path=./data/tokenizers/fineweb_1024_bpe.model +train_loader:dataset:fineweb10B_sp1024 train_shards:80 +val_loader:shards pattern=./data/datasets/fineweb10B_sp1024/fineweb_val_*.bin tokens:62021632 +model_params:27105501 +world_size:8 grad_accum_steps:1 +sdp_backends:cudnn=False flash=True mem_efficient=False math=False +attention_mode:gqa num_heads:8 num_kv_heads:4 +tie_embeddings:True embed_lr:0.035 head_lr:0.0 matrix_lr:0.025 scalar_lr:0.025 +train_batch_tokens:524288 train_seq_len:2048 iterations:20000 warmup_steps:20 max_wallclock_seconds:600.000 +seed:1337 +warmup_step:1/20 +warmup_step:2/20 +warmup_step:3/20 +warmup_step:4/20 +warmup_step:5/20 +warmup_step:6/20 +warmup_step:7/20 +warmup_step:8/20 +warmup_step:9/20 +warmup_step:10/20 +warmup_step:11/20 +warmup_step:12/20 +warmup_step:13/20 +warmup_step:14/20 +warmup_step:15/20 +warmup_step:16/20 +warmup_step:17/20 +warmup_step:18/20 +warmup_step:19/20 +warmup_step:20/20 +EMA enabled: decay=0.9985 +step:0/20000 val_loss:6.9362 val_bpb:4.1080 train_time:3ms step_avg:2.72ms +step:1/20000 train_loss:6.9357 train_time:147ms step_avg:146.81ms +step:2/20000 train_loss:13.0909 train_time:233ms step_avg:116.36ms +step:3/20000 train_loss:7.8237 train_time:319ms step_avg:106.45ms +step:4/20000 train_loss:6.5013 train_time:406ms step_avg:101.53ms +step:5/20000 train_loss:7.0011 train_time:493ms step_avg:98.59ms +step:6/20000 train_loss:7.9147 train_time:580ms step_avg:96.63ms +step:7/20000 train_loss:7.1354 train_time:667ms step_avg:95.28ms +step:8/20000 train_loss:6.7093 train_time:754ms step_avg:94.20ms +step:9/20000 train_loss:6.2791 train_time:840ms step_avg:93.37ms +step:10/20000 train_loss:6.0417 train_time:927ms step_avg:92.71ms +step:200/20000 train_loss:2.6336 train_time:17448ms step_avg:87.24ms +step:400/20000 train_loss:2.1826 train_time:34826ms step_avg:87.07ms +step:600/20000 train_loss:2.4187 train_time:52214ms step_avg:87.02ms +step:800/20000 train_loss:2.1891 train_time:69612ms step_avg:87.01ms +step:1000/20000 train_loss:2.3053 train_time:87016ms step_avg:87.02ms +step:1000/20000 val_loss:2.2518 val_bpb:1.3336 train_time:87019ms step_avg:87.02ms +step:1200/20000 train_loss:2.3316 train_time:104417ms step_avg:87.01ms +step:1400/20000 train_loss:2.3634 train_time:121816ms step_avg:87.01ms +step:1600/20000 train_loss:2.0404 train_time:139209ms step_avg:87.01ms 
+step:1800/20000 train_loss:2.1482 train_time:156603ms step_avg:87.00ms +step:2000/20000 train_loss:2.1835 train_time:173998ms step_avg:87.00ms +step:2000/20000 val_loss:2.1693 val_bpb:1.2848 train_time:174001ms step_avg:87.00ms +step:2200/20000 train_loss:2.0076 train_time:191393ms step_avg:87.00ms +step:2400/20000 train_loss:2.1351 train_time:208889ms step_avg:87.04ms +step:2600/20000 train_loss:2.3631 train_time:226280ms step_avg:87.03ms +step:2800/20000 train_loss:2.1769 train_time:243663ms step_avg:87.02ms +step:3000/20000 train_loss:2.1679 train_time:261054ms step_avg:87.02ms +step:3000/20000 val_loss:2.1336 val_bpb:1.2636 train_time:261057ms step_avg:87.02ms +step:3200/20000 train_loss:2.1331 train_time:278442ms step_avg:87.01ms +step:3400/20000 train_loss:2.1108 train_time:295832ms step_avg:87.01ms +step:3600/20000 train_loss:2.0508 train_time:313220ms step_avg:87.01ms +step:3800/20000 train_loss:2.1528 train_time:330611ms step_avg:87.00ms +step:4000/20000 train_loss:2.1247 train_time:347990ms step_avg:87.00ms +step:4000/20000 val_loss:2.1139 val_bpb:1.2520 train_time:347993ms step_avg:87.00ms +step:4200/20000 train_loss:2.1132 train_time:365447ms step_avg:87.01ms +step:4400/20000 train_loss:2.0426 train_time:382846ms step_avg:87.01ms +step:4600/20000 train_loss:1.9042 train_time:400235ms step_avg:87.01ms +step:4800/20000 train_loss:2.1804 train_time:417620ms step_avg:87.00ms +step:5000/20000 train_loss:1.9379 train_time:435007ms step_avg:87.00ms +step:5000/20000 val_loss:2.0713 val_bpb:1.2267 train_time:435010ms step_avg:87.00ms +step:5200/20000 train_loss:2.0827 train_time:452396ms step_avg:87.00ms +step:5400/20000 train_loss:2.0948 train_time:469781ms step_avg:87.00ms +step:5600/20000 train_loss:2.0705 train_time:487177ms step_avg:87.00ms +step:5800/20000 train_loss:2.0235 train_time:504556ms step_avg:86.99ms +step:6000/20000 train_loss:2.0949 train_time:521947ms step_avg:86.99ms +step:6000/20000 val_loss:2.0188 val_bpb:1.1956 train_time:521950ms step_avg:86.99ms +step:6200/20000 train_loss:1.9538 train_time:539339ms step_avg:86.99ms +step:6400/20000 train_loss:2.0170 train_time:556726ms step_avg:86.99ms +step:6600/20000 train_loss:1.9586 train_time:574109ms step_avg:86.99ms +step:6800/20000 train_loss:2.0165 train_time:591496ms step_avg:86.98ms +step:6898/20000 val_loss:1.9596 val_bpb:1.1606 train_time:600047ms step_avg:86.99ms +stopping_early: wallclock_cap train_time:600047ms step:6898/20000 +peak memory allocated: 18478 MiB reserved: 19972 MiB +EMA: loading exponential moving average weights (decay=0.9985) +Serialized model: 106387888 bytes +Code size: 96259 bytes +Total submission size: 106484147 bytes +Serialized model int5/int6+zstd: 15096450 bytes (payload:27910986 raw_torch:27990248 payload_ratio:3.81x int5_tensors:34 int6_tensors:44) +Total submission size int5/int6+zstd: 15192709 bytes +/root/parameter-golf/train_gpt.py:2007: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. 
Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + quant_state = torch.load(io.BytesIO(dctx.decompress(quant_blob_disk)), map_location="cpu") +/root/parameter-golf/train_gpt.py:2007: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + quant_state = torch.load(io.BytesIO(dctx.decompress(quant_blob_disk)), map_location="cpu") +/root/parameter-golf/train_gpt.py:2007: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + quant_state = torch.load(io.BytesIO(dctx.decompress(quant_blob_disk)), map_location="cpu") +/root/parameter-golf/train_gpt.py:2007: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. 
+ quant_state = torch.load(io.BytesIO(dctx.decompress(quant_blob_disk)), map_location="cpu") +/root/parameter-golf/train_gpt.py:2007: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + quant_state = torch.load(io.BytesIO(dctx.decompress(quant_blob_disk)), map_location="cpu") +/root/parameter-golf/train_gpt.py:2007: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + quant_state = torch.load(io.BytesIO(dctx.decompress(quant_blob_disk)), map_location="cpu") +/root/parameter-golf/train_gpt.py:2007: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + quant_state = torch.load(io.BytesIO(dctx.decompress(quant_blob_disk)), map_location="cpu") +/root/parameter-golf/train_gpt.py:2007: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). 
In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + quant_state = torch.load(io.BytesIO(dctx.decompress(quant_blob_disk)), map_location="cpu") +final_int6_zstd_roundtrip val_loss:1.9793 val_bpb:1.1723 eval_time:2744ms +final_int6_zstd_roundtrip_exact val_loss:1.97930593 val_bpb:1.17225646 +sliding_window_eval: 121136 windows, batch_size=16, 7571 batches +final_sliding_window val_loss:1.9516 val_bpb:1.1558 eval_time:171694ms +final_sliding_window_exact val_loss:1.95156679 val_bpb:1.15582778 +TTT AdamW: 178 trainable param groups, 25,991,376 params, freezing first 0 blocks +TTT AdamW epoch 1/10: loss=2.1951 steps=119 (bs=32) +TTT AdamW epoch 2/10: loss=2.0892 steps=119 (bs=32) +TTT AdamW epoch 3/10: loss=2.0579 steps=119 (bs=32) +TTT AdamW epoch 4/10: loss=2.0419 steps=119 (bs=32) +TTT AdamW epoch 5/10: loss=2.0329 steps=119 (bs=32) +TTT AdamW epoch 6/10: loss=2.0276 steps=119 (bs=32) +TTT AdamW epoch 7/10: loss=2.0248 steps=119 (bs=32) +TTT AdamW epoch 8/10: loss=2.0234 steps=119 (bs=32) +TTT AdamW epoch 9/10: loss=2.0227 steps=119 (bs=32) +TTT AdamW epoch 10/10: loss=2.0225 steps=119 (bs=32) +sliding_window_eval: 121136 windows, batch_size=16, 7571 batches +final_ttt_sgd val_loss:2.0130 val_bpb:1.1922 eval_time:462279ms