diff --git a/experiments/A_wing/green/run.sh b/experiments/A_wing/green/run.sh new file mode 100755 index 000000000..299cf47c3 --- /dev/null +++ b/experiments/A_wing/green/run.sh @@ -0,0 +1,56 @@ +#!/bin/bash +set -euo pipefail +# A-WING GREEN: INT5 GPTQ (clip_range=15 vs INT6 clip_range=31) +# Base: bwing_IV (9-prime fix + fixed mults + entropy shift) +# Theory: more quant noise → higher entropy → n-gram rescues harder (#809 uses INT5) + +SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd -- "${SCRIPT_DIR}/../../.." && pwd)" +cd "${REPO_ROOT}" +export PYTHONPATH="${REPO_ROOT}/flash-attention/hopper:${PYTHONPATH:-}" + +SEED="${SEED:-1337}" +NPROC_PER_NODE="${NPROC_PER_NODE:-8}" + +echo "============================================" +echo " A-WING GREEN — INT5 GPTQ + 9-Prime" +echo " Seed: ${SEED}" +echo " GPTQ INT5 (clip_range=15), 9 hash primes" +echo " Fixed mults + entropy shift, no cubric" +echo "============================================" + +SEED="$SEED" \ +F1_CORR_RANK=0 \ +DISTILL_ENABLED=0 \ +MLP_ACT=leaky_relu_sq \ +MLP_LEAKY_SLOPE=0.5 \ +XSA_LAST_N=4 \ +BIGRAM_VOCAB_SIZE=1536 \ +TTT_EVAL_ENABLED=0 \ +ROPE_DIMS=24 \ +VAL_LOSS_EVERY=20000 \ +TRAIN_LOG_EVERY=1000 \ +SWA_EVERY=100 \ +COMPLEMENT_ALPHA=0.5 \ +NGRAM_EVAL_ORDER=9 \ +NGRAM_EVAL_MIN_ORDER=2 \ +NGRAM_EVAL_ADAPTIVE=1 \ +NGRAM_EVAL_ALPHA=0.30 \ +NGRAM_EVAL_ALPHA_MIN=0.05 \ +NGRAM_EVAL_ALPHA_MAX=0.60 \ +NGRAM_EVAL_ENTROPY_CENTER=3.0 \ +NGRAM_EVAL_ENTROPY_SCALE=2.0 \ +NGRAM_EVAL_MIN_COUNT=2 \ +NGRAM_EVAL_BUCKETS=8388608 \ +NGRAM_EVAL_MAX_SECONDS=0 \ +CUBRIC_CADENCE=0 \ +NGRAM_ENTROPY_SHIFT=1 \ +NGRAM_ORDER_MULTS="0.3,0.3,0.97,2.0,2.0,2.0,2.0,2.0" \ +COMPILE_FULLGRAPH=0 \ +torchrun --standalone --nproc_per_node="${NPROC_PER_NODE}" \ + "${SCRIPT_DIR}/train_gpt.py" \ + 2>&1 | tee "logs/awing_green_s${SEED}_$(date +%Y%m%d_%H%M%S).log" + +echo "============================================" +echo " DONE" +echo "============================================" diff --git a/experiments/A_wing/green/train_gpt.py b/experiments/A_wing/green/train_gpt.py new file mode 100644 index 000000000..5753e10b8 --- /dev/null +++ b/experiments/A_wing/green/train_gpt.py @@ -0,0 +1,1936 @@ +from __future__ import annotations +import copy +import glob +import io +import math +import os +import random +import subprocess +import sys +import time +import uuid +import zlib +from pathlib import Path +try: + import zstandard + _COMPRESSOR = "zstd" +except ImportError: + _COMPRESSOR = "zlib" +import numpy as np +import sentencepiece as spm +import torch +import torch.distributed as dist +import torch.nn.functional as F +from torch import Tensor, nn +from torch.nn.parallel import DistributedDataParallel as DDP +try: + from flash_attn_interface import flash_attn_func as flash_attn_3_func +except ImportError: + def flash_attn_3_func(q, k, v, causal=False): + # q: (B, T, Hq, D), k/v: (B, T, Hkv, D) — expand KV for GQA + q2 = q.transpose(1, 2) # (B, Hq, T, D) + k2 = k.transpose(1, 2) # (B, Hkv, T, D) + v2 = v.transpose(1, 2) + if k2.size(1) != q2.size(1): + rep = q2.size(1) // k2.size(1) + k2 = k2.repeat_interleave(rep, dim=1) + v2 = v2.repeat_interleave(rep, dim=1) + out = torch.nn.functional.scaled_dot_product_attention(q2, k2, v2, is_causal=causal) + return out.transpose(1, 2) +class Hyperparameters: + data_path = os.environ.get("DATA_PATH", "./data/datasets/fineweb10B_sp1024") + train_files = os.path.join(data_path, "fineweb_train_*.bin") + val_files = os.path.join(data_path, "fineweb_val_*.bin") + tokenizer_path = os.environ.get("TOKENIZER_PATH", "./data/tokenizers/fineweb_1024_bpe.model") + run_id = os.environ.get("RUN_ID", str(uuid.uuid4())) + seed = int(os.environ.get("SEED", 1337)) + val_batch_size = int(os.environ.get("VAL_BATCH_SIZE", 524_288)) + val_loss_every = int(os.environ.get("VAL_LOSS_EVERY", 4000)) + train_log_every = int(os.environ.get("TRAIN_LOG_EVERY", 500)) + iterations = int(os.environ.get("ITERATIONS", 20000)) + warmdown_iters = int(os.environ.get("WARMDOWN_ITERS", 3500)) + warmup_steps = int(os.environ.get("WARMUP_STEPS", 20)) + train_batch_tokens = int(os.environ.get("TRAIN_BATCH_TOKENS", 786_432)) + train_seq_len = int(os.environ.get("TRAIN_SEQ_LEN", 2048)) + eval_seq_len = int(os.environ.get("EVAL_SEQ_LEN", 2048)) + max_wallclock_seconds = float(os.environ.get("MAX_WALLCLOCK_SECONDS", 600.0)) + qk_gain_init = float(os.environ.get("QK_GAIN_INIT", 1.5)) + vocab_size = int(os.environ.get("VOCAB_SIZE", 1024)) + num_layers = int(os.environ.get("NUM_LAYERS", 11)) + num_kv_heads = int(os.environ.get("NUM_KV_HEADS", 4)) + model_dim = int(os.environ.get("MODEL_DIM", 512)) + num_heads = int(os.environ.get("NUM_HEADS", 8)) + mlp_mult = float(os.environ.get("MLP_MULT", 3.0)) + mlp_act = os.environ.get("MLP_ACT", "relu_sq").lower() + mlp_leaky_slope = float(os.environ.get("MLP_LEAKY_SLOPE", 0.5)) + tie_embeddings = bool(int(os.environ.get("TIE_EMBEDDINGS", "1"))) + rope_base = float(os.environ.get("ROPE_BASE", 10000.0)) + logit_softcap = float(os.environ.get("LOGIT_SOFTCAP", 30.0)) + embed_lr = float(os.environ.get("EMBED_LR", 0.6)) + head_lr = float(os.environ.get("HEAD_LR", 0.008)) + tied_embed_lr = float(os.environ.get("TIED_EMBED_LR", 0.035)) + tied_embed_init_std = float(os.environ.get("TIED_EMBED_INIT_STD", 0.005)) + matrix_lr = float(os.environ.get("MATRIX_LR", 0.025)) + scalar_lr = float(os.environ.get("SCALAR_LR", 0.025)) + muon_momentum = float(os.environ.get("MUON_MOMENTUM", 0.99)) + muon_backend_steps = int(os.environ.get("MUON_BACKEND_STEPS", 5)) + muon_momentum_warmup_start = float(os.environ.get("MUON_MOMENTUM_WARMUP_START", 0.92)) + muon_momentum_warmup_steps = int(os.environ.get("MUON_MOMENTUM_WARMUP_STEPS", 1500)) + beta1 = float(os.environ.get("BETA1", 0.9)) + beta2 = float(os.environ.get("BETA2", 0.95)) + adam_eps = float(os.environ.get("ADAM_EPS", 1e-8)) + grad_clip_norm = float(os.environ.get("GRAD_CLIP_NORM", 0.3)) + eval_stride = int(os.environ.get("EVAL_STRIDE", 64)) + mtp_num_heads = int(os.environ.get("MTP_NUM_HEADS", 0)) + mtp_loss_weight = float(os.environ.get("MTP_LOSS_WEIGHT", 0.2)) + muon_beta2 = float(os.environ.get("MUON_BETA2", 0.95)) + swa_enabled = bool(int(os.environ.get("SWA_ENABLED", "1"))) + swa_every = int(os.environ.get("SWA_EVERY", 50)) # tighter: collect more recent checkpoints + muon_wd = float(os.environ.get("MUON_WD", 0.04)) + adam_wd = float(os.environ.get("ADAM_WD", 0.04)) + qat_enabled = bool(int(os.environ.get("QAT_ENABLED", "0"))) + bigram_vocab_size = int(os.environ.get("BIGRAM_VOCAB_SIZE", 2048)) + bigram_dim = int(os.environ.get("BIGRAM_DIM", 128)) + xsa_last_n = int(os.environ.get("XSA_LAST_N", 11)) # XSA on ALL 11 layers + rope_dims = int(os.environ.get("ROPE_DIMS", 16)) + ln_scale = bool(int(os.environ.get("LN_SCALE", "1"))) + dtg_enabled = bool(int(os.environ.get("DTG_ENABLED", "0"))) + late_qat_threshold = float(os.environ.get("LATE_QAT_THRESHOLD", 0.5)) + ve_enabled = bool(int(os.environ.get("VE_ENABLED", "1"))) + ve_dim = int(os.environ.get("VE_DIM", 128)) + ve_layers = os.environ.get("VE_LAYERS", "9,10") + # Optional legal score-first hashed n-gram interpolation at eval time. + # Multi-order backoff (2..max_order) with entropy-adaptive alpha. + # Alpha depends only on model entropy (no target/label access). + ngram_eval_order = int(os.environ.get("NGRAM_EVAL_ORDER", 0)) # 0=off, max order for backoff + ngram_eval_min_order = int(os.environ.get("NGRAM_EVAL_MIN_ORDER", 2)) # min order for backoff + ngram_eval_alpha = float(os.environ.get("NGRAM_EVAL_ALPHA", 0.30)) # base alpha (or fixed if adaptive off) + ngram_eval_adaptive = bool(int(os.environ.get("NGRAM_EVAL_ADAPTIVE", "1"))) # entropy-adaptive alpha + ngram_eval_alpha_min = float(os.environ.get("NGRAM_EVAL_ALPHA_MIN", 0.05)) # alpha floor (confident model) + ngram_eval_alpha_max = float(os.environ.get("NGRAM_EVAL_ALPHA_MAX", 0.60)) # alpha ceiling (uncertain model) + ngram_eval_entropy_center = float(os.environ.get("NGRAM_EVAL_ENTROPY_CENTER", 4.0)) # sigmoid center + ngram_eval_entropy_scale = float(os.environ.get("NGRAM_EVAL_ENTROPY_SCALE", 2.0)) # sigmoid steepness + ngram_eval_min_count = int(os.environ.get("NGRAM_EVAL_MIN_COUNT", 2)) + ngram_eval_buckets = int(os.environ.get("NGRAM_EVAL_BUCKETS", 4_194_304)) + ngram_eval_max_seconds = float(os.environ.get("NGRAM_EVAL_MAX_SECONDS", 0.0)) + ngram_entropy_shift = bool(int(os.environ.get("NGRAM_ENTROPY_SHIFT", "0"))) # per-order center shift + ngram_order_mults_str = os.environ.get("NGRAM_ORDER_MULTS", "") # fixed per-order multipliers (comma-sep) + compile_enabled = bool(int(os.environ.get("COMPILE_ENABLED", "1"))) + compile_fullgraph = bool(int(os.environ.get("COMPILE_FULLGRAPH", "1"))) +def maybe_torch_compile(obj, args: Hyperparameters): + if not args.compile_enabled: + return obj + return torch.compile(obj, dynamic=False, fullgraph=args.compile_fullgraph) +class TrainNgramTracker: + """Complementary training: track bigram stats, downweight tokens n-grams can predict.""" + def __init__(self, vocab_size: int, device: torch.device, complement_alpha: float = 0.5): + self.V = vocab_size + self.alpha = complement_alpha + self.bi_counts = torch.zeros(vocab_size, vocab_size, device=device, dtype=torch.float32) + self.bi_totals = torch.zeros(vocab_size, device=device, dtype=torch.float32) + @torch.no_grad() + def update(self, x: Tensor, y: Tensor): + xf = x.reshape(-1) + yf = y.reshape(-1) + ones = torch.ones(xf.numel(), device=xf.device, dtype=torch.float32) + self.bi_counts.reshape(-1).scatter_add_(0, xf * self.V + yf, ones) + self.bi_totals.scatter_add_(0, xf, ones) + def get_weights(self, x: Tensor, y: Tensor) -> Tensor: + xf = x.reshape(-1) + yf = y.reshape(-1) + total = self.bi_totals[xf] + count = self.bi_counts.reshape(-1)[xf * self.V + yf] + ngram_prob = count / (total + 1) + return (1.0 - self.alpha * ngram_prob).clamp(min=0.1) +def zeropower_via_newtonschulz5(G: Tensor, steps: int = 10, eps: float = 1e-7) -> Tensor: + a, b, c = (3.4445, -4.7750, 2.0315) + X = G.bfloat16() + X /= X.norm() + eps + transposed = G.size(0) > G.size(1) + if transposed: + X = X.T + for _ in range(steps): + A = X @ X.T + B = b * A + c * A @ A + X = a * X + B @ X + return X.T if transposed else X +class Muon(torch.optim.Optimizer): + def __init__(self, params, lr: float, momentum: float, backend_steps: int, + nesterov: bool = True, weight_decay: float = 0.0): + super().__init__( + params, + dict(lr=lr, momentum=momentum, backend_steps=backend_steps, + nesterov=nesterov, weight_decay=weight_decay), + ) + @torch.no_grad() + def step(self, closure=None): + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + distributed = dist.is_available() and dist.is_initialized() + world_size = dist.get_world_size() if distributed else 1 + rank = dist.get_rank() if distributed else 0 + for group in self.param_groups: + params = group["params"] + if not params: + continue + lr = group["lr"] + momentum = group["momentum"] + backend_steps = group["backend_steps"] + nesterov = group["nesterov"] + total_params = sum(int(p.numel()) for p in params) + updates_flat = torch.zeros(total_params, device=params[0].device, dtype=torch.bfloat16) + curr = 0 + for i, p in enumerate(params): + if i % world_size == rank and p.grad is not None: + g = p.grad + state = self.state[p] + if "momentum_buffer" not in state: + state["momentum_buffer"] = torch.zeros_like(g) + buf = state["momentum_buffer"] + buf.mul_(momentum).add_(g) + if nesterov: + g = g.add(buf, alpha=momentum) + g = zeropower_via_newtonschulz5(g, steps=backend_steps) + g *= max(1, g.size(0) / g.size(1)) ** 0.5 + updates_flat[curr : curr + p.numel()] = g.reshape(-1) + curr += p.numel() + if distributed: + dist.all_reduce(updates_flat, op=dist.ReduceOp.SUM) + wd = group.get("weight_decay", 0.0) + curr = 0 + for p in params: + if wd > 0.0: + p.data.mul_(1.0 - lr * wd) + g = updates_flat[curr : curr + p.numel()].view_as(p).to(dtype=p.dtype) + p.add_(g, alpha=-lr) + curr += p.numel() + return loss +def build_sentencepiece_luts( + sp: spm.SentencePieceProcessor, vocab_size: int, device: torch.device +) -> tuple[Tensor, Tensor, Tensor]: + sp_vocab_size = int(sp.vocab_size()) + table_size = max(sp_vocab_size, vocab_size) + base_bytes_np = np.zeros((table_size,), dtype=np.int16) + has_leading_space_np = np.zeros((table_size,), dtype=np.bool_) + is_boundary_token_np = np.ones((table_size,), dtype=np.bool_) + for token_id in range(sp_vocab_size): + if sp.is_control(token_id) or sp.is_unknown(token_id) or sp.is_unused(token_id): + continue + is_boundary_token_np[token_id] = False + if sp.is_byte(token_id): + base_bytes_np[token_id] = 1 + continue + piece = sp.id_to_piece(token_id) + if piece.startswith("▁"): + has_leading_space_np[token_id] = True + piece = piece[1:] + base_bytes_np[token_id] = len(piece.encode("utf-8")) + return ( + torch.tensor(base_bytes_np, dtype=torch.int16, device=device), + torch.tensor(has_leading_space_np, dtype=torch.bool, device=device), + torch.tensor(is_boundary_token_np, dtype=torch.bool, device=device), + ) +def load_validation_tokens(pattern: str, seq_len: int) -> Tensor: + files = [Path(p) for p in sorted(glob.glob(pattern))] + if not files: + raise FileNotFoundError(f"No files found for pattern: {pattern}") + tokens = torch.cat([load_data_shard(file) for file in files]).contiguous() + usable = ((tokens.numel() - 1) // seq_len) * seq_len + if usable <= 0: + raise ValueError(f"Validation split is too short for TRAIN_SEQ_LEN={seq_len}") + return tokens[: usable + 1] +def eval_val( + args: Hyperparameters, + model: nn.Module, + rank: int, + world_size: int, + device: torch.device, + grad_accum_steps: int, + val_tokens: Tensor, + base_bytes_lut: Tensor, + has_leading_space_lut: Tensor, + is_boundary_token_lut: Tensor, + eval_seq_len: int | None = None, +) -> tuple[float, float]: + seq_len = eval_seq_len or args.train_seq_len + local_batch_tokens = args.val_batch_size // (world_size * grad_accum_steps) + if local_batch_tokens < seq_len: + raise ValueError( + "VAL_BATCH_SIZE must provide at least one sequence per rank; " + f"got VAL_BATCH_SIZE={args.val_batch_size}, WORLD_SIZE={world_size}, " + f"GRAD_ACCUM_STEPS={grad_accum_steps}, seq_len={seq_len}" + ) + local_batch_seqs = local_batch_tokens // seq_len + total_seqs = (val_tokens.numel() - 1) // seq_len + seq_start = (total_seqs * rank) // world_size + seq_end = (total_seqs * (rank + 1)) // world_size + val_loss_sum = torch.zeros((), device=device, dtype=torch.float64) + val_token_count = torch.zeros((), device=device, dtype=torch.float64) + val_byte_count = torch.zeros((), device=device, dtype=torch.float64) + model.eval() + with torch.inference_mode(): + for batch_seq_start in range(seq_start, seq_end, local_batch_seqs): + batch_seq_end = min(batch_seq_start + local_batch_seqs, seq_end) + raw_start = batch_seq_start * seq_len + raw_end = batch_seq_end * seq_len + 1 + local = val_tokens[raw_start:raw_end].to(device=device, dtype=torch.int64, non_blocking=True) + x = local[:-1].reshape(-1, seq_len) + y = local[1:].reshape(-1, seq_len) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + batch_loss = model(x, y).detach() + batch_token_count = float(y.numel()) + val_loss_sum += batch_loss.to(torch.float64) * batch_token_count + val_token_count += batch_token_count + prev_ids = x.reshape(-1) + tgt_ids = y.reshape(-1) + token_bytes = base_bytes_lut[tgt_ids].to(dtype=torch.int16) + token_bytes += (has_leading_space_lut[tgt_ids] & ~is_boundary_token_lut[prev_ids]).to(dtype=torch.int16) + val_byte_count += token_bytes.to(torch.float64).sum() + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(val_loss_sum, op=dist.ReduceOp.SUM) + dist.all_reduce(val_token_count, op=dist.ReduceOp.SUM) + dist.all_reduce(val_byte_count, op=dist.ReduceOp.SUM) + val_loss = val_loss_sum / val_token_count + bits_per_token = val_loss.item() / math.log(2.0) + tokens_per_byte = val_token_count.item() / val_byte_count.item() + model.train() + return float(val_loss.item()), float(bits_per_token * tokens_per_byte) +CONTROL_TENSOR_NAME_PATTERNS = tuple( + pattern + for pattern in os.environ.get( + "CONTROL_TENSOR_NAME_PATTERNS", + "attn_scale,attn_scales,mlp_scale,mlp_scales,resid_mix,resid_mixes,q_gain,skip_weight,skip_weights,smear,dtg_gate,ve_layer_scales,ve_shared.scale", + ).split(",") + if pattern +) +INT8_KEEP_FLOAT_FP32_NAME_PATTERNS = tuple( + pattern + for pattern in os.environ.get( + "INT8_KEEP_FLOAT_FP32_NAME_PATTERNS", + ",".join(CONTROL_TENSOR_NAME_PATTERNS), + ).split(",") + if pattern +) +INT8_KEEP_FLOAT_MAX_NUMEL = 65_536 +INT8_KEEP_FLOAT_STORE_DTYPE = torch.float16 +INT8_PER_ROW_SCALE_DTYPE = torch.float16 +INT8_CLIP_PERCENTILE = 99.99984 +INT8_CLIP_Q = INT8_CLIP_PERCENTILE / 100.0 +def tensor_nbytes(t: Tensor) -> int: + return int(t.numel()) * int(t.element_size()) +def keep_float_tensor(name: str, t: Tensor, passthrough_orig_dtypes: dict[str, str]) -> Tensor: + if any(pattern in name for pattern in INT8_KEEP_FLOAT_FP32_NAME_PATTERNS): + return t.float().contiguous() + if t.dtype in {torch.float32, torch.bfloat16}: + passthrough_orig_dtypes[name] = str(t.dtype).removeprefix("torch.") + return t.to(dtype=INT8_KEEP_FLOAT_STORE_DTYPE).contiguous() + return t +def quantize_float_tensor(t: Tensor) -> tuple[Tensor, Tensor]: + t32 = t.float() + if t32.ndim == 2: + clip_abs = ( + torch.quantile(t32.abs(), INT8_CLIP_Q, dim=1) + if t32.numel() + else torch.empty((t32.shape[0],), dtype=torch.float32) + ) + clipped = torch.maximum(torch.minimum(t32, clip_abs[:, None]), -clip_abs[:, None]) + scale = (clip_abs / 127.0).clamp_min(1.0 / 127.0) + q = torch.clamp(torch.round(clipped / scale[:, None]), -127, 127).to(torch.int8).contiguous() + return q, scale.to(dtype=INT8_PER_ROW_SCALE_DTYPE).contiguous() + clip_abs = float(torch.quantile(t32.abs().flatten(), INT8_CLIP_Q).item()) if t32.numel() else 0.0 + scale = torch.tensor(clip_abs / 127.0 if clip_abs > 0 else 1.0, dtype=torch.float32) + q = torch.clamp(torch.round(torch.clamp(t32, -clip_abs, clip_abs) / scale), -127, 127).to(torch.int8).contiguous() + return q, scale +def quantize_state_dict_int8(state_dict: dict[str, Tensor]): + quantized: dict[str, Tensor] = {} + scales: dict[str, Tensor] = {} + dtypes: dict[str, str] = {} + passthrough: dict[str, Tensor] = {} + passthrough_orig_dtypes: dict[str, str] = {} + qmeta: dict[str, dict[str, object]] = {} + stats = dict.fromkeys( + ("param_count", "num_tensors", "num_float_tensors", "num_nonfloat_tensors", "baseline_tensor_bytes", "int8_payload_bytes"), + 0, + ) + for name, tensor in state_dict.items(): + t = tensor.detach().to("cpu").contiguous() + stats["param_count"] += int(t.numel()) + stats["num_tensors"] += 1 + stats["baseline_tensor_bytes"] += tensor_nbytes(t) + if not t.is_floating_point(): + stats["num_nonfloat_tensors"] += 1 + passthrough[name] = t + stats["int8_payload_bytes"] += tensor_nbytes(t) + continue + if t.numel() <= INT8_KEEP_FLOAT_MAX_NUMEL: + kept = keep_float_tensor(name, t, passthrough_orig_dtypes) + passthrough[name] = kept + stats["int8_payload_bytes"] += tensor_nbytes(kept) + continue + stats["num_float_tensors"] += 1 + q, s = quantize_float_tensor(t) + if s.ndim > 0: + qmeta[name] = {"scheme": "per_row", "axis": 0} + quantized[name] = q + scales[name] = s + dtypes[name] = str(t.dtype).removeprefix("torch.") + stats["int8_payload_bytes"] += tensor_nbytes(q) + tensor_nbytes(s) + obj: dict[str, object] = { + "__quant_format__": "int8_clean_per_row_v1", + "quantized": quantized, + "scales": scales, + "dtypes": dtypes, + "passthrough": passthrough, + } + if qmeta: + obj["qmeta"] = qmeta + if passthrough_orig_dtypes: + obj["passthrough_orig_dtypes"] = passthrough_orig_dtypes + return obj, stats +def dequantize_state_dict_int8(obj: dict[str, object]) -> dict[str, Tensor]: + out: dict[str, Tensor] = {} + qmeta = obj.get("qmeta", {}) + passthrough_orig_dtypes = obj.get("passthrough_orig_dtypes", {}) + for name, q in obj["quantized"].items(): + dtype = getattr(torch, obj["dtypes"][name]) + s = obj["scales"][name] + if qmeta.get(name, {}).get("scheme") == "per_row" or s.ndim > 0: + s = s.to(dtype=torch.float32) + out[name] = (q.float() * s.view(q.shape[0], *([1] * (q.ndim - 1)))).to(dtype=dtype).contiguous() + else: + scale = float(s.item()) + out[name] = (q.float() * scale).to(dtype=dtype).contiguous() + for name, t in obj["passthrough"].items(): + out_t = t.detach().to("cpu").contiguous() + orig_dtype = passthrough_orig_dtypes.get(name) + if isinstance(orig_dtype, str): + out_t = out_t.to(dtype=getattr(torch, orig_dtype)).contiguous() + out[name] = out_t + return out +def load_data_shard(file: Path) -> Tensor: + header_bytes = 256 * np.dtype(" None: + self.file_idx = (self.file_idx + 1) % len(self.files) + self.tokens = load_data_shard(self.files[self.file_idx]) + self.pos = 0 + def take(self, n: int) -> Tensor: + chunks: list[Tensor] = [] + remaining = n + while remaining > 0: + avail = self.tokens.numel() - self.pos + if avail <= 0: + self._advance_file() + continue + k = min(remaining, avail) + chunks.append(self.tokens[self.pos : self.pos + k]) + self.pos += k + remaining -= k + return chunks[0] if len(chunks) == 1 else torch.cat(chunks) +class DistributedTokenLoader: + def __init__(self, pattern: str, rank: int, world_size: int, device: torch.device): + self.rank = rank + self.world_size = world_size + self.device = device + self.stream = TokenStream(pattern) + def next_batch(self, global_tokens: int, seq_len: int, grad_accum_steps: int) -> tuple[Tensor, Tensor]: + local_tokens = global_tokens // (self.world_size * grad_accum_steps) + per_rank_span = local_tokens + 1 + chunk = self.stream.take(per_rank_span * self.world_size) + start = self.rank * per_rank_span + local = chunk[start : start + per_rank_span].to(dtype=torch.int64) + x = local[:-1].reshape(-1, seq_len) + y = local[1:].reshape(-1, seq_len) + return x.to(self.device, non_blocking=True), y.to(self.device, non_blocking=True) +class RMSNorm(nn.Module): + def __init__(self, eps: float | None = None): + super().__init__() + self.eps = eps + def forward(self, x: Tensor) -> Tensor: + return F.rms_norm(x, (x.size(-1),), eps=self.eps) +class CastedLinear(nn.Linear): + _qat_enabled: bool = False + def forward(self, x: Tensor) -> Tensor: + w = self.weight.to(x.dtype) + if CastedLinear._qat_enabled and self.training and w.ndim == 2: + with torch.no_grad(): + w32 = self.weight.float() + # Use 99.95th percentile clipping to match GPTQ export quantizer + row_clip = torch.quantile(w32.abs(), 0.9995, dim=1) + scale = (row_clip / 31.0).clamp_min(1.0 / 31.0) + w_q = (torch.clamp(torch.round(w32 / scale[:, None]), -32, 31) * scale[:, None]).to(x.dtype) + w = w + (w_q - w).detach() + bias = self.bias.to(x.dtype) if self.bias is not None else None + return F.linear(x, w, bias) +def restore_low_dim_params_to_fp32(module: nn.Module) -> None: + with torch.no_grad(): + for name, param in module.named_parameters(): + if (param.ndim < 2 or any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS)) and param.dtype != torch.float32: + param.data = param.data.float() +class Rotary(nn.Module): + def __init__(self, dim: int, base: float = 10000.0, train_seq_len: int = 1024, rope_dims: int = 0): + super().__init__() + self.dim = dim + self.base = base + self.train_seq_len = train_seq_len + self.rope_dims = rope_dims if rope_dims > 0 else dim + inv_freq = 1.0 / (base ** (torch.arange(0, self.rope_dims, 2, dtype=torch.float32) / self.rope_dims)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self._seq_len_cached = 0 + self._cos_cached: Tensor | None = None + self._sin_cached: Tensor | None = None + def forward(self, seq_len: int, device: torch.device, dtype: torch.dtype) -> tuple[Tensor, Tensor]: + if ( + self._cos_cached is None + or self._sin_cached is None + or self._seq_len_cached != seq_len + or self._cos_cached.device != device + ): + rd = self.rope_dims + if seq_len > self.train_seq_len: + scale = seq_len / self.train_seq_len + new_base = self.base * (scale ** (rd / (rd - 2))) + inv_freq = 1.0 / (new_base ** (torch.arange(0, rd, 2, dtype=torch.float32, device=device) / rd)) + else: + inv_freq = self.inv_freq.to(device) + t = torch.arange(seq_len, device=device, dtype=inv_freq.dtype) + freqs = torch.outer(t, inv_freq) + self._cos_cached = freqs.cos()[None, :, None, :] + self._sin_cached = freqs.sin()[None, :, None, :] + self._seq_len_cached = seq_len + return self._cos_cached.to(dtype=dtype), self._sin_cached.to(dtype=dtype) +def apply_rotary_emb(x: Tensor, cos: Tensor, sin: Tensor, rope_dims: int = 0) -> Tensor: + if rope_dims > 0 and rope_dims < x.size(-1): + x_rope, x_pass = x[..., :rope_dims], x[..., rope_dims:] + half = rope_dims // 2 + x1, x2 = x_rope[..., :half], x_rope[..., half:] + x_rope = torch.cat((x1 * cos + x2 * sin, x1 * (-sin) + x2 * cos), dim=-1) + return torch.cat((x_rope, x_pass), dim=-1) + half = x.size(-1) // 2 + x1, x2 = x[..., :half], x[..., half:] + return torch.cat((x1 * cos + x2 * sin, x1 * (-sin) + x2 * cos), dim=-1) +class CausalSelfAttention(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + num_kv_heads: int, + rope_base: float, + qk_gain_init: float, + ): + super().__init__() + if dim % num_heads != 0: + raise ValueError("model_dim must be divisible by num_heads") + if num_heads % num_kv_heads != 0: + raise ValueError("num_heads must be divisible by num_kv_heads") + self.num_heads = num_heads + self.num_kv_heads = num_kv_heads + self.head_dim = dim // num_heads + if self.head_dim % 2 != 0: + raise ValueError("head_dim must be even for RoPE") + kv_dim = self.num_kv_heads * self.head_dim + self.c_q = CastedLinear(dim, dim, bias=False) + self.c_k = CastedLinear(dim, kv_dim, bias=False) + self.c_v = CastedLinear(dim, kv_dim, bias=False) + self.proj = CastedLinear(dim, dim, bias=False) + self.proj._zero_init = True + self.q_gain = nn.Parameter(torch.full((num_heads,), qk_gain_init, dtype=torch.float32)) + self.rope_dims = 0 # set by GPT.__init__ for partial RoPE + self.rotary = Rotary(self.head_dim, base=rope_base, train_seq_len=1024) + self.use_xsa = False # set by GPT.__init__ for deep layers only + def _xsa_efficient(self, y: Tensor, v: Tensor) -> Tensor: + """Efficient XSA: subtract self-value projection via GQA-aware reshape (no repeat_interleave). + y: [B, T, H, D], v: [B, T, Hkv, D]. H must be divisible by Hkv.""" + B, T, H, D = y.shape + Hkv = v.size(-2) + group = H // Hkv + y_g = y.reshape(B, T, Hkv, group, D) # [B, T, Hkv, group, D] + vn = F.normalize(v, dim=-1).unsqueeze(-2) # [B, T, Hkv, 1, D] — broadcast ready + proj = (y_g * vn).sum(dim=-1, keepdim=True) * vn + return (y_g - proj).reshape(B, T, H, D) + def forward(self, x: Tensor, v_embed: Tensor | None = None) -> Tensor: + bsz, seqlen, dim = x.shape + q = self.c_q(x).reshape(bsz, seqlen, self.num_heads, self.head_dim) + k = self.c_k(x).reshape(bsz, seqlen, self.num_kv_heads, self.head_dim) + v = self.c_v(x) + if v_embed is not None: + v = v + v_embed + v = v.reshape(bsz, seqlen, self.num_kv_heads, self.head_dim) + q = F.rms_norm(q, (q.size(-1),)) + k = F.rms_norm(k, (k.size(-1),)) + cos, sin = self.rotary(seqlen, x.device, q.dtype) + q = apply_rotary_emb(q, cos, sin, self.rope_dims) + k = apply_rotary_emb(k, cos, sin, self.rope_dims) + q = q * self.q_gain.to(dtype=q.dtype)[None, None, :, None] + y = flash_attn_3_func(q, k, v, causal=True) + if self.use_xsa: + y = self._xsa_efficient(y, v) + y = y.reshape(bsz, seqlen, dim) + return self.proj(y) +class SmearGate(nn.Module): + def __init__(self, dim: int): + super().__init__() + self.gate = nn.Parameter(torch.zeros(dim, dtype=torch.float32)) + def forward(self, x: Tensor) -> Tensor: + g = torch.sigmoid(self.gate.to(dtype=x.dtype))[None, None, :] + x_prev = torch.cat([torch.zeros_like(x[:, :1]), x[:, :-1]], dim=1) + return (1 - g) * x + g * x_prev +class BigramHashEmbedding(nn.Module): + def __init__(self, bigram_vocab_size: int, bigram_dim: int, model_dim: int): + super().__init__() + self.bigram_vocab_size = bigram_vocab_size + self.embed = nn.Embedding(bigram_vocab_size, bigram_dim) + nn.init.zeros_(self.embed.weight) + self.proj = CastedLinear(bigram_dim, model_dim, bias=False) if bigram_dim != model_dim else None + if self.proj is not None: + nn.init.zeros_(self.proj.weight) + self.scale = nn.Parameter(torch.tensor(0.05, dtype=torch.float32)) + def bigram_hash(self, tokens: Tensor) -> Tensor: + t = tokens.to(torch.int32) + mod = self.bigram_vocab_size - 1 + out = torch.empty_like(t) + out[..., 0] = mod + out[..., 1:] = torch.bitwise_xor(36313 * t[..., 1:], 27191 * t[..., :-1]) % mod + return out.long() + def forward(self, token_ids: Tensor) -> Tensor: + h = self.embed(self.bigram_hash(token_ids)) + if self.proj is not None: + h = self.proj(h) + return h * self.scale.to(dtype=h.dtype) +class ValueEmbedding(nn.Module): + """Reinject token identity into attention values at specific layers. + Each table maps vocab tokens to a low-dim embedding, projected to model_dim.""" + def __init__(self, vocab_size: int, ve_dim: int, model_dim: int): + super().__init__() + self.embed = nn.Embedding(vocab_size, ve_dim) + nn.init.normal_(self.embed.weight, std=0.01) + self.proj = CastedLinear(ve_dim, model_dim, bias=False) if ve_dim != model_dim else None + if self.proj is not None: + nn.init.zeros_(self.proj.weight) + self.scale = nn.Parameter(torch.tensor(0.1, dtype=torch.float32)) + def forward(self, token_ids: Tensor) -> Tensor: + h = self.embed(token_ids) + if self.proj is not None: + h = self.proj(h) + return h * self.scale.to(dtype=h.dtype) +class MLP(nn.Module): + def __init__(self, dim: int, mlp_mult: int, mlp_act: str = "relu_sq", mlp_leaky_slope: float = 0.5): + super().__init__() + hidden = int(mlp_mult * dim) + self.fc = CastedLinear(dim, hidden, bias=False) + self.proj = CastedLinear(hidden, dim, bias=False) + self.proj._zero_init = True + self.mlp_act = mlp_act + self.mlp_leaky_slope = mlp_leaky_slope + if self.mlp_act not in {"relu_sq", "leaky_relu_sq"}: + raise ValueError(f"Unsupported MLP_ACT '{self.mlp_act}'. Use 'relu_sq' or 'leaky_relu_sq'.") + def forward(self, x: Tensor) -> Tensor: + x = self.fc(x) + if self.mlp_act == "leaky_relu_sq": + x = F.leaky_relu(x, negative_slope=self.mlp_leaky_slope) + else: + x = F.relu(x) + return self.proj(x.square()) +class Block(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + num_kv_heads: int, + mlp_mult: int, + rope_base: float, + qk_gain_init: float, + layer_idx: int = 0, + ln_scale: bool = False, + dtg: bool = False, + mlp_act: str = "relu_sq", + mlp_leaky_slope: float = 0.5, + ): + super().__init__() + self.attn_norm = RMSNorm() + self.mlp_norm = RMSNorm() + self.attn = CausalSelfAttention(dim, num_heads, num_kv_heads, rope_base, qk_gain_init) + self.mlp = MLP(dim, mlp_mult, mlp_act=mlp_act, mlp_leaky_slope=mlp_leaky_slope) + self.attn_scale = nn.Parameter(torch.ones(dim, dtype=torch.float32)) + self.mlp_scale = nn.Parameter(torch.ones(dim, dtype=torch.float32)) + self.resid_mix = nn.Parameter(torch.stack((torch.ones(dim), torch.zeros(dim))).float()) + self.ln_scale_factor = 1.0 / math.sqrt(layer_idx + 1) if ln_scale else 1.0 + if dtg: + self.dtg_gate = nn.Linear(dim, 1, bias=True) + nn.init.zeros_(self.dtg_gate.weight) + nn.init.constant_(self.dtg_gate.bias, 2.0) + else: + self.dtg_gate = None + def forward(self, x: Tensor, x0: Tensor, v_embed: Tensor | None = None) -> Tensor: + mix = self.resid_mix.to(dtype=x.dtype) + x_in = mix[0][None, None, :] * x + mix[1][None, None, :] * x0 + attn_out = self.attn(self.attn_norm(x_in) * self.ln_scale_factor, v_embed=v_embed) + x_out = x_in + self.attn_scale.to(dtype=x_in.dtype)[None, None, :] * attn_out + x_out = x_out + self.mlp_scale.to(dtype=x_out.dtype)[None, None, :] * self.mlp(self.mlp_norm(x_out) * self.ln_scale_factor) + if self.dtg_gate is not None: + gate = torch.sigmoid(self.dtg_gate(x_in.detach())) + x_out = x_in + gate * (x_out - x_in) + return x_out +class GPT(nn.Module): + def __init__( + self, + vocab_size: int, + num_layers: int, + model_dim: int, + num_heads: int, + num_kv_heads: int, + mlp_mult: int, + tie_embeddings: bool, + tied_embed_init_std: float, + logit_softcap: float, + rope_base: float, + qk_gain_init: float, + mtp_num_heads: int = 0, + mtp_loss_weight: float = 0.1, + bigram_vocab_size: int = 0, + bigram_dim: int = 128, + xsa_last_n: int = 0, + rope_dims: int = 0, + ln_scale: bool = False, + dtg: bool = False, + ve_enabled: bool = False, + ve_dim: int = 128, + ve_layers: str = "9,10", + mlp_act: str = "relu_sq", + mlp_leaky_slope: float = 0.5, + ): + super().__init__() + self._ve_target_dim = num_kv_heads * (model_dim // num_heads) # kv_dim for value projection + if logit_softcap <= 0.0: + raise ValueError(f"logit_softcap must be positive, got {logit_softcap}") + self.tie_embeddings = tie_embeddings + self.tied_embed_init_std = tied_embed_init_std + self.logit_softcap = logit_softcap + self.mtp_num_heads = mtp_num_heads + self.mtp_loss_weight = mtp_loss_weight + self.tok_emb = nn.Embedding(vocab_size, model_dim) + self.bigram = BigramHashEmbedding(bigram_vocab_size, bigram_dim, model_dim) if bigram_vocab_size > 0 else None + self.smear = SmearGate(model_dim) + self.num_encoder_layers = num_layers // 2 + self.num_decoder_layers = num_layers - self.num_encoder_layers + self.num_skip_weights = min(self.num_encoder_layers, self.num_decoder_layers) + self.skip_weights = nn.Parameter(torch.ones(self.num_skip_weights, model_dim, dtype=torch.float32)) + self.blocks = nn.ModuleList( + [ + Block( + model_dim, + num_heads, + num_kv_heads, + mlp_mult, + rope_base, + qk_gain_init, + layer_idx=i, + ln_scale=ln_scale, + dtg=dtg, + mlp_act=mlp_act, + mlp_leaky_slope=mlp_leaky_slope, + ) + for i in range(num_layers) + ] + ) + if rope_dims > 0: + head_dim = model_dim // num_heads + for block in self.blocks: + block.attn.rope_dims = rope_dims + block.attn.rotary = Rotary(head_dim, base=rope_base, train_seq_len=1024, rope_dims=rope_dims) + self.ve_layer_indices = [int(x) for x in ve_layers.split(",") if x.strip()] if ve_enabled else [] + kv_dim = self._ve_target_dim + if self.ve_layer_indices: + self.ve_shared = ValueEmbedding(vocab_size, ve_dim, kv_dim) + self.ve_layer_scales = nn.ParameterList( + [nn.Parameter(torch.ones(1, dtype=torch.float32)) for _ in self.ve_layer_indices] + ) + else: + self.ve_shared = None + self.ve_layer_scales = nn.ParameterList() + self.value_embeds = nn.ModuleList() # keep empty for compat + self.final_norm = RMSNorm() + self.lm_head = None if tie_embeddings else CastedLinear(model_dim, vocab_size, bias=False) + if self.lm_head is not None: + self.lm_head._zero_init = True + self.mtp_heads = nn.ModuleList( + [CastedLinear(model_dim, vocab_size, bias=False) for _ in range(mtp_num_heads)] + ) + for head in self.mtp_heads: + head._zero_init = True + if xsa_last_n > 0: + for i in range(max(0, num_layers - xsa_last_n), num_layers): + self.blocks[i].attn.use_xsa = True + self._init_weights() + def _init_weights(self) -> None: + if self.tie_embeddings: + nn.init.normal_(self.tok_emb.weight, mean=0.0, std=self.tied_embed_init_std) + num_layers = len(self.blocks) + for name, module in self.named_modules(): + if isinstance(module, nn.Linear): + if getattr(module, "_zero_init", False): + nn.init.zeros_(module.weight) + elif module.weight.ndim == 2 and module.weight.shape[0] >= 64 and module.weight.shape[1] >= 64: + nn.init.orthogonal_(module.weight, gain=1.0) + if ".proj." in name or name.endswith(".proj"): + with torch.no_grad(): + module.weight.mul_(1.0 / math.sqrt(2 * num_layers)) + def _get_ve(self, layer_idx: int, input_ids: Tensor, ve_cache: dict | None = None) -> Tensor | None: + """Get value embedding for a specific layer using shared table + per-layer scale.""" + if self.ve_shared is None or layer_idx not in self.ve_layer_indices: + return None + if ve_cache is not None and 've' not in ve_cache: + ve_cache['ve'] = self.ve_shared(input_ids) + ve_base = ve_cache['ve'] if ve_cache is not None else self.ve_shared(input_ids) + ve_idx = self.ve_layer_indices.index(layer_idx) + return ve_base * self.ve_layer_scales[ve_idx].to(dtype=ve_base.dtype) + def forward(self, input_ids: Tensor, target_ids: Tensor) -> Tensor: + x = self.tok_emb(input_ids) + if self.bigram is not None: + x = x + self.bigram(input_ids) + x = F.rms_norm(x, (x.size(-1),)) + x = self.smear(x) + x0 = x + skips: list[Tensor] = [] + ve_cache: dict = {} + for i in range(self.num_encoder_layers): + ve = self._get_ve(i, input_ids, ve_cache) + x = self.blocks[i](x, x0, v_embed=ve) + skips.append(x) + for i in range(self.num_decoder_layers): + bi = self.num_encoder_layers + i + if skips: + x = x + self.skip_weights[i].to(dtype=x.dtype)[None, None, :] * skips.pop() + ve = self._get_ve(bi, input_ids, ve_cache) + x = self.blocks[bi](x, x0, v_embed=ve) + x = self.final_norm(x) + x_flat = x.reshape(-1, x.size(-1)) + targets = target_ids.reshape(-1) + if self.tie_embeddings: + logits_proj = F.linear(x_flat, self.tok_emb.weight) + else: + if self.lm_head is None: + raise RuntimeError("lm_head is required when tie_embeddings=False") + logits_proj = self.lm_head(x_flat) + logits = self.logit_softcap * torch.tanh(logits_proj / self.logit_softcap) + if hasattr(self, '_ngram_tracker') and self._ngram_tracker is not None and self.training: + per_tok_loss = F.cross_entropy(logits.float(), targets, reduction="none") + weights = self._ngram_tracker.get_weights(input_ids, target_ids) + main_loss = (per_tok_loss * weights).mean() + else: + main_loss = F.cross_entropy(logits.float(), targets, reduction="mean") + if self.training and self.mtp_num_heads > 0 and self.mtp_loss_weight > 0.0: + _, seqlen, dim = x.shape + mtp_loss_sum = x.new_zeros(()) + mtp_loss_count = 0 + for k, mtp_head in enumerate(self.mtp_heads): + valid_t = seqlen - (k + 1) + if valid_t <= 0: + continue + mtp_hidden = x[:, :valid_t, :].reshape(-1, dim) + mtp_targets = target_ids[:, k + 1 :].reshape(-1) + mtp_logits_proj = mtp_head(mtp_hidden) + mtp_logits = self.logit_softcap * torch.tanh(mtp_logits_proj / self.logit_softcap) + mtp_loss_sum = mtp_loss_sum + F.cross_entropy(mtp_logits.float(), mtp_targets, reduction="mean") + mtp_loss_count += 1 + if mtp_loss_count > 0: + main_loss = main_loss + self.mtp_loss_weight * (mtp_loss_sum / mtp_loss_count) + return main_loss + def forward_logits(self, input_ids: Tensor) -> Tensor: + """Return logits (bsz, seq_len, vocab) without computing loss.""" + x = self.tok_emb(input_ids) + if self.bigram is not None: + x = x + self.bigram(input_ids) + x = F.rms_norm(x, (x.size(-1),)) + x = self.smear(x) + x0 = x + skips: list[Tensor] = [] + ve_cache: dict = {} + for i in range(self.num_encoder_layers): + ve = self._get_ve(i, input_ids, ve_cache) + x = self.blocks[i](x, x0, v_embed=ve) + skips.append(x) + for i in range(self.num_decoder_layers): + bi = self.num_encoder_layers + i + if skips: + x = x + self.skip_weights[i].to(dtype=x.dtype)[None, None, :] * skips.pop() + ve = self._get_ve(bi, input_ids, ve_cache) + x = self.blocks[bi](x, x0, v_embed=ve) + x = self.final_norm(x) + if self.tie_embeddings: + logits_proj = F.linear(x, self.tok_emb.weight) + else: + logits_proj = self.lm_head(x) + return self.logit_softcap * torch.tanh(logits_proj / self.logit_softcap) +def eval_val_sliding( + args: Hyperparameters, + base_model: nn.Module, + rank: int, + world_size: int, + device: torch.device, + val_tokens: Tensor, + base_bytes_lut: Tensor, + has_leading_space_lut: Tensor, + is_boundary_token_lut: Tensor, + stride: int, + batch_seqs: int = 128, + eval_seq_len: int | None = None, +) -> tuple[float, float]: + """Sliding window evaluation: each token scored with maximum context.""" + seq_len = eval_seq_len or args.train_seq_len + total_tokens = val_tokens.numel() - 1 + window_starts = [ws for ws in range(0, total_tokens, stride) + if min(ws + seq_len, total_tokens) - ws >= 1] + total_windows = len(window_starts) + my_s = (total_windows * rank) // world_size + my_e = (total_windows * (rank + 1)) // world_size + my_windows = window_starts[my_s:my_e] + loss_sum = torch.zeros((), device=device, dtype=torch.float64) + token_count = torch.zeros((), device=device, dtype=torch.float64) + byte_count = torch.zeros((), device=device, dtype=torch.float64) + base_model.eval() + compiled_logits = maybe_torch_compile(base_model.forward_logits, args) + with torch.inference_mode(): + for bi in range(0, len(my_windows), batch_seqs): + batch_ws = my_windows[bi:bi + batch_seqs] + bsz = len(batch_ws) + x_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + y_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + wlens: list[int] = [] + for i, ws in enumerate(batch_ws): + end = min(ws + seq_len, total_tokens) + wlen = end - ws + wlens.append(wlen) + chunk = val_tokens[ws:end + 1].to(dtype=torch.int64, device=device) + x_batch[i, :wlen] = chunk[:-1] + y_batch[i, :wlen] = chunk[1:] + with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + logits = compiled_logits(x_batch) + nll = F.cross_entropy( + logits.reshape(-1, logits.size(-1)).float(), + y_batch.reshape(-1), + reduction="none", + ).reshape(bsz, seq_len) + for i, ws in enumerate(batch_ws): + wlen = wlens[i] + s = 0 if ws == 0 else max(wlen - stride, 0) + scored_nll = nll[i, s:wlen].to(torch.float64) + loss_sum += scored_nll.sum() + token_count += float(wlen - s) + tgt = y_batch[i, s:wlen] + prev = x_batch[i, s:wlen] + tb = base_bytes_lut[tgt].to(torch.float64) + tb += (has_leading_space_lut[tgt] & ~is_boundary_token_lut[prev]).to(torch.float64) + byte_count += tb.sum() + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(loss_sum, op=dist.ReduceOp.SUM) + dist.all_reduce(token_count, op=dist.ReduceOp.SUM) + dist.all_reduce(byte_count, op=dist.ReduceOp.SUM) + val_loss = (loss_sum / token_count).item() + bits_per_token = val_loss / math.log(2.0) + tokens_per_byte = token_count.item() / byte_count.item() + base_model.train() + return val_loss, bits_per_token * tokens_per_byte +def _ngram_bulk_update(val_np, start, end, ctx_tables, full_tables, + min_order, max_order, primes, mask): + """Bulk update n-gram tables with a contiguous range of tokens. + All ranks call this with the SAME token range -> identical tables everywhere.""" + t = val_np[start:end].astype(np.uint64) + n = len(t) + for order in range(min_order, max_order + 1): + if n < order: + continue + ctx_width = order - 1 + ctx_hash = np.zeros(n - order + 1, dtype=np.uint64) + for k in range(ctx_width): + ctx_hash ^= t[k:n - order + 1 + k] * primes[k % len(primes)] + ctx_key = (ctx_hash & mask).astype(np.int64) + tgt = t[order - 1:] + full_key = ((ctx_hash ^ (tgt * primes[ctx_width % len(primes)])) & mask).astype(np.int64) + ctx_tables[order] += np.bincount(ctx_key, minlength=len(ctx_tables[order])).astype(np.uint32) + full_tables[order] += np.bincount(full_key, minlength=len(full_tables[order])).astype(np.uint32) + +def eval_val_sliding_hashed_ngram( + args: Hyperparameters, + base_model: nn.Module, + rank: int, + world_size: int, + device: torch.device, + val_tokens: Tensor, + base_bytes_lut: Tensor, + has_leading_space_lut: Tensor, + is_boundary_token_lut: Tensor, + stride: int, + order: int, + alpha: float, + min_count: int, + buckets: int, + max_seconds: float = 0.0, + batch_seqs: int = 128, + eval_seq_len: int | None = None, +) -> tuple[float, float, float]: + """Score-first sliding eval with chunk-based SHARED n-gram tables. + + Key design: all ranks share identical n-gram tables via bulk chunk updates. + Each chunk's windows are distributed across ranks for scoring, then ALL ranks + update tables with the same contiguous token range. Every rank sees the full + n-gram picture (not 1/world_size like per-segment updates). + + Legal: entire chunk scored before its tokens update the tables. + """ + min_order = max(args.ngram_eval_min_order, 2) + max_order = max(order, min_order) + adaptive = args.ngram_eval_adaptive + alpha_min = args.ngram_eval_alpha_min + alpha_max = args.ngram_eval_alpha_max + ent_center = args.ngram_eval_entropy_center + ent_scale = args.ngram_eval_entropy_scale + + # Parse fixed per-order multipliers (PR #809 style) + _fixed_order_mults = None + if args.ngram_order_mults_str: + _fixed_order_mults = np.array([float(x) for x in args.ngram_order_mults_str.split(",")], dtype=np.float64) + + seq_len = eval_seq_len or args.train_seq_len + total_tokens = val_tokens.numel() - 1 + + # Build all windows and total scored tokens + all_window_starts = [ws for ws in range(0, total_tokens, stride) if min(ws + seq_len, total_tokens) - ws >= 1] + total_scored_tokens = 0.0 + for ws in all_window_starts: + end = min(ws + seq_len, total_tokens) + wlen = end - ws + s = 0 if ws == 0 else max(wlen - stride, 0) + total_scored_tokens += float(max(wlen - s, 0)) + + # Group windows into chunks by scored position -- all ranks share this grouping + chunk_tokens = int(os.environ.get("NGRAM_CHUNK_TOKENS", "1048576")) # 1M default + num_chunks = (total_tokens + chunk_tokens - 1) // chunk_tokens + chunk_windows: list[list[int]] = [[] for _ in range(num_chunks)] + for ws in all_window_starts: + end = min(ws + seq_len, total_tokens) + wlen = end - ws + s = 0 if ws == 0 else max(wlen - stride, 0) + scored_start = ws + s + ci = min(scored_start // chunk_tokens, num_chunks - 1) + chunk_windows[ci].append(ws) + + val_np = val_tokens.numpy() + ctx_tables = {n: np.zeros((buckets,), dtype=np.uint32) for n in range(min_order, max_order + 1)} + full_tables = {n: np.zeros((buckets,), dtype=np.uint32) for n in range(min_order, max_order + 1)} + mask = np.uint64(buckets - 1) + primes = np.array( + [np.uint64(36313), np.uint64(27191), np.uint64(51647), np.uint64(81929), + np.uint64(131071), np.uint64(174763), np.uint64(233017), np.uint64(283721), + np.uint64(347237)], + dtype=np.uint64, + ) + + loss_sum = 0.0 + token_count = 0.0 + byte_count = 0.0 + + base_model.eval() + compiled_logits = maybe_torch_compile(base_model.forward_logits, args) + t0 = time.perf_counter() + deadline = (t0 + max_seconds) if max_seconds > 0.0 else None + cutoff_hit = False + + if rank == 0: + print(f"ngram_eval:chunks={num_chunks} chunk_tokens={chunk_tokens} " + f"windows={len(all_window_starts)} shared_tables=True", flush=True) + + with torch.inference_mode(): + for ci in range(num_chunks): + if deadline is not None and time.perf_counter() >= deadline: + cutoff_hit = True + break + + windows = chunk_windows[ci] + if not windows: + continue + + # Distribute this chunk's windows across ranks + my_s = (len(windows) * rank) // world_size + my_e = (len(windows) * (rank + 1)) // world_size + my_windows = windows[my_s:my_e] + + # --- Phase 1: SCORE this chunk's windows --- + for bi in range(0, len(my_windows), batch_seqs): + batch_ws = my_windows[bi:bi + batch_seqs] + bsz = len(batch_ws) + x_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + y_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + wlens: list[int] = [] + for i, ws in enumerate(batch_ws): + end = min(ws + seq_len, total_tokens) + wlen = end - ws + wlens.append(wlen) + chunk = val_tokens[ws:end + 1].to(dtype=torch.int64, device=device) + x_batch[i, :wlen] = chunk[:-1] + y_batch[i, :wlen] = chunk[1:] + + with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + logits = compiled_logits(x_batch) + logits_f = logits.float() + nll = F.cross_entropy( + logits_f.reshape(-1, logits_f.size(-1)), + y_batch.reshape(-1), + reduction="none", + ).reshape(bsz, seq_len) + + for i, ws in enumerate(batch_ws): + wlen = wlens[i] + s = 0 if ws == 0 else max(wlen - stride, 0) + seg_len = wlen - s + if seg_len <= 0: + continue + + seg_nll = nll[i, s:wlen].to(torch.float64).cpu().numpy() + seg_model_p = np.exp(-seg_nll) + + if adaptive: + log_probs = F.log_softmax(logits_f[i, s:wlen], dim=-1) + probs_a = log_probs.exp() + entropy = -(probs_a * log_probs).sum(dim=-1).cpu().numpy() + sig = 1.0 / (1.0 + np.exp(-ent_scale * (entropy - ent_center))) + per_token_alpha = alpha_min + (alpha_max - alpha_min) * sig + else: + per_token_alpha = np.full(seg_len, alpha) + + global_j = np.arange(ws + s + 1, ws + wlen + 1, dtype=np.int64) + p_ng = np.zeros(seg_len, dtype=np.float64) + ng_matched = np.zeros(seg_len, dtype=np.bool_) + _ng_ord = np.zeros(seg_len, dtype=np.int32) + _ng_ctx_count = np.zeros(seg_len, dtype=np.float64) + tgt_np = val_np[global_j].astype(np.uint64) + + for n in range(max_order, min_order - 1, -1): + ctx_width = n - 1 + valid = (global_j >= ctx_width) & (~ng_matched) + if not valid.any(): + continue + v_idx = np.nonzero(valid)[0] + jv = global_j[v_idx] + ctx_hash = np.zeros(len(jv), dtype=np.uint64) + for k in range(ctx_width): + tok = val_np[jv - (ctx_width - k)].astype(np.uint64) + ctx_hash ^= tok * primes[k % len(primes)] + ctx_key = (ctx_hash & mask).astype(np.int64) + full_key = ((ctx_hash ^ (tgt_np[v_idx] * primes[ctx_width % len(primes)])) & mask).astype(np.int64) + ctx_counts = ctx_tables[n][ctx_key].astype(np.float64) + full_counts = full_tables[n][full_key].astype(np.float64) + has_data = ctx_counts >= float(min_count) + if has_data.any(): + p = np.minimum(full_counts, ctx_counts) / np.maximum(ctx_counts, 1.0) + p = np.clip(p, 0.0, 1.0) + hit_idx = v_idx[has_data] + p_ng[hit_idx] = p[has_data] + ng_matched[hit_idx] = True + _ng_ord[hit_idx] = n + _ng_ctx_count[hit_idx] = ctx_counts[has_data] + + # Mix where n-gram matched + if ng_matched.any(): + m_idx = np.nonzero(ng_matched)[0] + if adaptive and args.ngram_entropy_shift: + matched_ords = _ng_ord[m_idx].astype(np.float64) + shifted_centers = ent_center - 0.25 * (matched_ords - float(min_order)) + shifted_sig = 1.0 / (1.0 + np.exp(-ent_scale * (entropy[m_idx] - shifted_centers))) + per_token_alpha[m_idx] = alpha_min + (alpha_max - alpha_min) * shifted_sig + if _fixed_order_mults is not None: + a = per_token_alpha[m_idx].copy() + mult_indices = _ng_ord[m_idx] - min_order + mult_indices = np.clip(mult_indices, 0, len(_fixed_order_mults) - 1) + a *= _fixed_order_mults[mult_indices] + np.clip(a, 0.0, 0.95, out=a) + else: + a = per_token_alpha[m_idx] + seg_model_p[m_idx] = (1.0 - a) * seg_model_p[m_idx] + a * p_ng[m_idx] + + seg_nll = -np.log(np.clip(seg_model_p, 1e-12, 1.0)) + loss_sum += float(seg_nll.sum()) + token_count += float(seg_len) + tgt = y_batch[i, s:wlen] + prev = x_batch[i, s:wlen] + tb = base_bytes_lut[tgt].to(torch.float64) + tb += (has_leading_space_lut[tgt] & ~is_boundary_token_lut[prev]).to(torch.float64) + byte_count += float(tb.sum().item()) + + # --- Phase 2: SHARED UPDATE -- all ranks update with same chunk tokens --- + chunk_start = ci * chunk_tokens + chunk_end = min((ci + 1) * chunk_tokens, total_tokens) + _ngram_bulk_update(val_np, chunk_start, chunk_end + 1, + ctx_tables, full_tables, min_order, max_order, + primes, mask) + + # Progress + if rank == 0 and (ci % 10 == 0 or ci == num_chunks - 1 or ci < 3): + elapsed = time.perf_counter() - t0 + cur_bpb = (loss_sum / max(token_count, 1.0)) / math.log(2.0) * (token_count / max(byte_count, 1.0)) if token_count > 0 else 0.0 + print( + f"ngram_eval:chunk [{ci+1}/{num_chunks}] bpb={cur_bpb:.6f} t={elapsed:.0f}s", + flush=True, + ) + + # All-reduce across ranks + _loss = torch.tensor(loss_sum, device=device, dtype=torch.float64) + _toks = torch.tensor(token_count, device=device, dtype=torch.float64) + _bytes = torch.tensor(byte_count, device=device, dtype=torch.float64) + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(_loss, op=dist.ReduceOp.SUM) + dist.all_reduce(_toks, op=dist.ReduceOp.SUM) + dist.all_reduce(_bytes, op=dist.ReduceOp.SUM) + loss_sum = _loss.item() + token_count = _toks.item() + byte_count = _bytes.item() + + coverage = token_count / max(total_scored_tokens, 1.0) + if cutoff_hit: + elapsed = time.perf_counter() - t0 + print( + f"ngram_eval:cutoff max_seconds={max_seconds:.1f} " + f"coverage={coverage*100:.2f}% elapsed={elapsed:.0f}s", + flush=True, + ) + + val_loss = loss_sum / max(token_count, 1.0) + val_bpb = val_loss / math.log(2.0) * (token_count / max(byte_count, 1.0)) + base_model.train() + return val_loss, val_bpb, coverage +def _classify_param(name: str) -> str: + if "tok_emb" in name or "lm_head" in name: + return "embed" + if ".mlp." in name: + return "mlp" + if ".attn." in name or (".proj." in name and ".mlp." not in name): + return "attn" + return "other" +# --------------------------------------------------------------------------- +# GPTQ: Hessian-aware quantization with column-wise error compensation +# --------------------------------------------------------------------------- +def _find_best_row_scales(W: Tensor, clip_range: int = 15) -> Tensor: + """Find optimal per-row scales by searching percentile clipping thresholds.""" + t32 = W.float() + best_s = t32.abs().amax(dim=1) / clip_range + best_s = best_s.clamp_min(1.0 / clip_range) + best_err = torch.full((t32.shape[0],), float('inf')) + for pct in [0.9990, 0.9995, 0.9999, 0.99999, 1.0]: + if pct < 1.0: + row_clip = torch.quantile(t32.abs(), pct, dim=1) + else: + row_clip = t32.abs().amax(dim=1) + s = (row_clip / clip_range).clamp_min(1.0 / clip_range) + q = torch.clamp(torch.round(t32 / s[:, None]), -clip_range, clip_range) + recon = q * s[:, None] + err = (t32 - recon).pow(2).mean(dim=1) + improved = err < best_err + best_s[improved] = s[improved] + best_err[improved] = err[improved] + return best_s +def gptq_quantize_weight(W: Tensor, H: Tensor, clip_range: int = 15, + block_size: int = 64, percdamp: float = 0.002) -> tuple[Tensor, Tensor]: + """GPTQ: quantize weight matrix W using Hessian H = X^T X for error compensation. + Uses pre-computed per-row scales and column reordering by Hessian diagonal. + Returns (quantized_int8, scale_fp16) in int6 range [-clip_range, clip_range].""" + W = W.float().clone() + rows, cols = W.shape + # Pre-compute optimal per-row scales from the original weight matrix + row_scale = _find_best_row_scales(W, clip_range) + H = H.float().clone() + damp = percdamp * H.diag().mean() + H.diagonal().add_(damp) + # Column reordering: process least-important columns first (ascending H_diag) + perm = torch.argsort(H.diag()) + invperm = torch.argsort(perm) + W = W[:, perm] + H = H[perm][:, perm] + try: + L = torch.linalg.cholesky(H) + Hinv = torch.cholesky_inverse(L) + except torch._C._LinAlgError: + Hinv = torch.diag(1.0 / H.diag().clamp_min(1e-6)) + Q = torch.zeros(rows, cols, dtype=torch.int8) + for i1 in range(0, cols, block_size): + i2 = min(i1 + block_size, cols) + W_block = W[:, i1:i2].clone() + Hinv_block = Hinv[i1:i2, i1:i2] + Err = torch.zeros_like(W_block) + for j in range(i2 - i1): + w_col = W_block[:, j] + h_inv_jj = Hinv_block[j, j].clamp_min(1e-8) + # Quantize using pre-computed per-row scales + q_col = torch.clamp(torch.round(w_col / row_scale), -clip_range, clip_range) + deq_col = q_col * row_scale + Q[:, i1 + j] = q_col.to(torch.int8) + err = (w_col - deq_col) / h_inv_jj + Err[:, j] = err + if j + 1 < i2 - i1: + W_block[:, j + 1:] -= err.unsqueeze(1) * Hinv_block[j, j + 1:].unsqueeze(0) + if i2 < cols: + W[:, i2:] -= Err @ Hinv[i1:i2, i2:] + # Undo column reordering + Q = Q[:, invperm] + return Q, row_scale.to(torch.float16) +def gptq_calibrate(model: nn.Module, train_pattern: str, device: torch.device, + n_samples: int = 256, seq_len: int = 2048) -> dict[str, Tensor]: + """Collect Hessian H = X^T X for each linear layer using training data.""" + hessians: dict[str, Tensor] = {} + n_seen: dict[str, int] = {} + hooks = [] + def make_hook(name: str): + def hook_fn(module, inp, out): + x = inp[0].detach().float() + if x.ndim == 3: + x = x.reshape(-1, x.shape[-1]) + if name not in hessians: + hessians[name] = torch.zeros(x.shape[1], x.shape[1], device=x.device, dtype=torch.float32) + n_seen[name] = 0 + hessians[name].addmm_(x.t(), x) + n_seen[name] += x.shape[0] + return hook_fn + for name, module in model.named_modules(): + if isinstance(module, (nn.Linear, CastedLinear)): + hooks.append(module.register_forward_hook(make_hook(name))) + stream = TokenStream(train_pattern) + model.eval() + with torch.no_grad(): + for _ in range(n_samples): + tokens = stream.take(seq_len + 1).to(device=device, dtype=torch.int64) + x = tokens[:-1].unsqueeze(0) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + model.forward_logits(x) + for h in hooks: + h.remove() + for name in hessians: + hessians[name] /= max(n_seen[name], 1) + return hessians +def mixed_quantize_int6_gptq(state_dict: dict[str, Tensor], int6_cats: set[str], + hessians: dict[str, Tensor]) -> tuple[dict, dict]: + """Like mixed_quantize_int6 but uses GPTQ for int6 categories when Hessian available.""" + result: dict[str, Tensor] = {} + meta: dict[str, object] = {} + gptq_count, naive_count = 0, 0 + for name, tensor in state_dict.items(): + t = tensor.detach().cpu().contiguous() + cat = _classify_param(name) + if not t.is_floating_point() or t.numel() <= 65536: + result[name] = t.to(torch.float16) if t.is_floating_point() else t + meta[name] = "passthrough" + continue + if any(p in name for p in CONTROL_TENSOR_NAME_PATTERNS): + result[name] = t.float() + meta[name] = "passthrough_ctrl" + continue + if cat in int6_cats and t.ndim == 2: + module_name = name.rsplit(".weight", 1)[0] if name.endswith(".weight") else name + H = hessians.get(module_name) + if H is not None and H.shape[0] == t.shape[1]: + q, s = gptq_quantize_weight(t, H.cpu()) + gptq_count += 1 + else: + q, s = quantize_int6_per_row(t) + naive_count += 1 + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int6"} + elif cat in int6_cats and t.ndim >= 1: + q, s = quantize_int6_per_row(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int6"} + naive_count += 1 + else: + q, s = quantize_float_tensor(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int8"} + print(f"gptq_quantize: {gptq_count} GPTQ layers, {naive_count} naive layers", flush=True) + return result, meta +def quantize_int6_per_row(t: Tensor, clip_range: int = 15) -> tuple[Tensor, Tensor]: + t32 = t.float() + if t32.ndim == 2: + best_q, best_s, best_err = None, None, float('inf') + for pct in [0.9990, 0.9995, 0.9999, 0.99999, 1.0]: + if pct < 1.0: + row_clip = torch.quantile(t32.abs(), pct, dim=1) + else: + row_clip = t32.abs().amax(dim=1) + s = (row_clip / clip_range).clamp_min(1.0 / clip_range).to(torch.float16) + q = torch.clamp(torch.round(t32 / s.float()[:, None]), -clip_range, clip_range).to(torch.int8) + recon = q.float() * s.float()[:, None] + err = (t32 - recon).pow(2).mean().item() + if err < best_err: + best_q, best_s, best_err = q, s, err + return best_q, best_s + amax = t32.abs().max().item() + scale = torch.tensor(amax / clip_range if amax > 0 else 1.0, dtype=torch.float16) + q = torch.clamp(torch.round(t32 / scale.float()), -clip_range, clip_range).to(torch.int8) + return q, scale +def mixed_quantize_int6(state_dict: dict[str, Tensor], int6_cats: set[str]): + num_layers_total = max( + (int(k.split(".")[1]) for k in state_dict if k.startswith("blocks.")), + default=0, + ) + 1 + late_k_layers = set(range(num_layers_total - 2, num_layers_total)) + result: dict[str, Tensor] = {} + meta: dict[str, object] = {} + for name, tensor in state_dict.items(): + t = tensor.detach().cpu().contiguous() + cat = _classify_param(name) + if not t.is_floating_point() or t.numel() <= 65536: + result[name] = t.to(torch.float16) if t.is_floating_point() else t + meta[name] = "passthrough" + continue + if any(p in name for p in CONTROL_TENSOR_NAME_PATTERNS): + result[name] = t.float() + meta[name] = "passthrough_ctrl" + continue + if cat in int6_cats and t.ndim >= 1: + q, s = quantize_int6_per_row(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int6"} + else: + q, s = quantize_float_tensor(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int8"} + return result, meta +def dequantize_mixed_int6(result: dict[str, Tensor], meta: dict[str, object], + template_sd: dict[str, Tensor]) -> dict[str, Tensor]: + out: dict[str, Tensor] = {} + for name, orig in template_sd.items(): + info = meta.get(name) + if info is None: + continue + orig_dtype = orig.dtype + if info in ("passthrough", "passthrough_ctrl", "passthrough_fp16"): + t = result[name] + if t.dtype == torch.float16 and orig_dtype in (torch.float32, torch.bfloat16): + t = t.to(orig_dtype) + out[name] = t + continue + q, s = result[name + ".q"], result[name + ".scale"] + if s.ndim > 0: + out[name] = (q.float() * s.float().view(q.shape[0], *([1] * (q.ndim - 1)))).to(orig_dtype) + else: + out[name] = (q.float() * float(s.item())).to(orig_dtype) + return out +def main() -> None: + global zeropower_via_newtonschulz5 + code = Path(__file__).read_text(encoding="utf-8") + args = Hyperparameters() + if args.compile_enabled: + zeropower_via_newtonschulz5 = torch.compile(zeropower_via_newtonschulz5) + distributed = "RANK" in os.environ and "WORLD_SIZE" in os.environ + rank = int(os.environ.get("RANK", "0")) + world_size = int(os.environ.get("WORLD_SIZE", "1")) + local_rank = int(os.environ.get("LOCAL_RANK", "0")) + if world_size <= 0: + raise ValueError(f"WORLD_SIZE must be positive, got {world_size}") + if 8 % world_size != 0: + raise ValueError(f"WORLD_SIZE={world_size} must divide 8 so grad_accum_steps stays integral") + grad_accum_steps = 8 // world_size + grad_scale = 1.0 / grad_accum_steps + if not torch.cuda.is_available(): + raise RuntimeError("CUDA is required") + device = torch.device("cuda", local_rank) + torch.cuda.set_device(device) + if distributed: + dist.init_process_group(backend="nccl", device_id=device) + dist.barrier() + master_process = rank == 0 + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True + from torch.backends.cuda import enable_cudnn_sdp, enable_flash_sdp, enable_math_sdp, enable_mem_efficient_sdp + enable_cudnn_sdp(False) + enable_flash_sdp(True) + enable_mem_efficient_sdp(False) + enable_math_sdp(False) + logfile = None + if master_process: + os.makedirs("logs", exist_ok=True) + logfile = f"logs/{args.run_id}.txt" + print(logfile) + def log0(msg: str, console: bool = True) -> None: + if not master_process: + return + if console: + print(msg) + if logfile is not None: + with open(logfile, "a", encoding="utf-8") as f: + print(msg, file=f) + log0(code, console=False) + log0("=" * 100, console=False) + log0(f"Running Python {sys.version}", console=False) + log0(f"Running PyTorch {torch.__version__}", console=False) + log0( + subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=False).stdout, + console=False, + ) + log0("=" * 100, console=False) + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.cuda.manual_seed_all(args.seed) + if not args.tokenizer_path.endswith(".model"): + raise ValueError(f"Script only setup for SentencePiece .model file: {args.tokenizer_path}") + sp = spm.SentencePieceProcessor(model_file=args.tokenizer_path) + if int(sp.vocab_size()) != args.vocab_size: + raise ValueError( + f"VOCAB_SIZE={args.vocab_size} does not match tokenizer vocab_size={int(sp.vocab_size())}" + ) + dataset_dir = Path(args.data_path).resolve() + actual_train_files = len(list(dataset_dir.glob("fineweb_train_*.bin"))) + effective_eval_seq_len = args.eval_seq_len if args.eval_seq_len > 0 else args.train_seq_len + val_seq_len = max(args.train_seq_len, effective_eval_seq_len) + val_tokens = load_validation_tokens(args.val_files, val_seq_len) + base_bytes_lut, has_leading_space_lut, is_boundary_token_lut = build_sentencepiece_luts( + sp, args.vocab_size, device + ) + log0(f"val_bpb:enabled tokenizer_kind=sentencepiece tokenizer_path={args.tokenizer_path}") + log0(f"train_loader:dataset:{dataset_dir.name} train_shards:{actual_train_files}") + log0(f"val_loader:shards pattern={args.val_files} tokens:{val_tokens.numel() - 1}") + CastedLinear._qat_enabled = args.qat_enabled + base_model = GPT( + vocab_size=args.vocab_size, + num_layers=args.num_layers, + model_dim=args.model_dim, + num_heads=args.num_heads, + num_kv_heads=args.num_kv_heads, + mlp_mult=args.mlp_mult, + tie_embeddings=args.tie_embeddings, + tied_embed_init_std=args.tied_embed_init_std, + logit_softcap=args.logit_softcap, + rope_base=args.rope_base, + qk_gain_init=args.qk_gain_init, + mtp_num_heads=args.mtp_num_heads, + mtp_loss_weight=args.mtp_loss_weight, + bigram_vocab_size=args.bigram_vocab_size, + bigram_dim=args.bigram_dim, + xsa_last_n=args.xsa_last_n, + rope_dims=args.rope_dims, + ln_scale=args.ln_scale, + dtg=args.dtg_enabled, + ve_enabled=args.ve_enabled, + ve_dim=args.ve_dim, + ve_layers=args.ve_layers, + mlp_act=args.mlp_act, + mlp_leaky_slope=args.mlp_leaky_slope, + ).to(device).bfloat16() + for module in base_model.modules(): + if isinstance(module, CastedLinear): + module.float() + restore_low_dim_params_to_fp32(base_model) + # Complementary training: downweight tokens predictable by bigrams + complement_alpha = float(os.environ.get("COMPLEMENT_ALPHA", "0")) + if complement_alpha > 0: + tracker = TrainNgramTracker(args.vocab_size, device, complement_alpha=complement_alpha) + base_model._ngram_tracker = tracker + log0(f"complementary_training:alpha={complement_alpha}") + else: + base_model._ngram_tracker = None + compiled_model = maybe_torch_compile(base_model, args) + model: nn.Module = DDP(compiled_model, device_ids=[local_rank], broadcast_buffers=False) if distributed else compiled_model + block_named_params = list(base_model.blocks.named_parameters()) + matrix_params = [ + p + for name, p in block_named_params + if p.ndim == 2 and not any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS) + ] + if base_model.mtp_num_heads > 0: + matrix_params.extend([p for p in base_model.mtp_heads.parameters() if p.ndim == 2]) + scalar_params = [ + p + for name, p in block_named_params + if p.ndim < 2 or any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS) + ] + if base_model.skip_weights.numel() > 0: + scalar_params.append(base_model.skip_weights) + scalar_params.append(base_model.smear.gate) + if base_model.bigram is not None: + scalar_params.append(base_model.bigram.scale) + token_lr = args.tied_embed_lr if args.tie_embeddings else args.embed_lr + tok_params = [{"params": [base_model.tok_emb.weight], "lr": token_lr, "base_lr": token_lr}] + if base_model.bigram is not None: + tok_params.append({"params": [base_model.bigram.embed.weight], "lr": token_lr, "base_lr": token_lr}) + if base_model.bigram.proj is not None: + matrix_params.append(base_model.bigram.proj.weight) + if base_model.ve_shared is not None: + tok_params.append({"params": [base_model.ve_shared.embed.weight], "lr": token_lr, "base_lr": token_lr}) + if base_model.ve_shared.proj is not None: + matrix_params.append(base_model.ve_shared.proj.weight) + scalar_params.append(base_model.ve_shared.scale) + for s in base_model.ve_layer_scales: + scalar_params.append(s) + optimizer_tok = torch.optim.AdamW( + tok_params, + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + weight_decay=args.adam_wd, + fused=True, + ) + optimizer_muon = Muon( + matrix_params, + lr=args.matrix_lr, + momentum=args.muon_momentum, + backend_steps=args.muon_backend_steps, + weight_decay=args.muon_wd, + ) + for group in optimizer_muon.param_groups: + group["base_lr"] = args.matrix_lr + optimizer_scalar = torch.optim.AdamW( + [{"params": scalar_params, "lr": args.scalar_lr, "base_lr": args.scalar_lr}], + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + weight_decay=args.adam_wd, + fused=True, + ) + optimizers: list[torch.optim.Optimizer] = [optimizer_tok, optimizer_muon, optimizer_scalar] + if base_model.lm_head is not None: + optimizer_head = torch.optim.Adam( + [{"params": [base_model.lm_head.weight], "lr": args.head_lr, "base_lr": args.head_lr}], + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + fused=True, + ) + optimizers.insert(1, optimizer_head) + n_params = sum(p.numel() for p in base_model.parameters()) + log0(f"model_params:{n_params}") + log0(f"mlp_act:{args.mlp_act} mlp_leaky_slope:{args.mlp_leaky_slope}") + log0(f"XSA:last_{args.xsa_last_n} world_size:{world_size} grad_accum_steps:{grad_accum_steps}") + log0(f"num_heads:{args.num_heads} num_kv_heads:{args.num_kv_heads} embed_lr:{token_lr} matrix_lr:{args.matrix_lr}") + log0( + f"train_batch_tokens:{args.train_batch_tokens} train_seq_len:{args.train_seq_len} " + f"iterations:{args.iterations} warmup_steps:{args.warmup_steps} " + f"max_wallclock_seconds:{args.max_wallclock_seconds:.3f}" + ) + log0(f"compile:enabled={int(args.compile_enabled)} fullgraph={int(args.compile_fullgraph)}") + log0(f"seed:{args.seed}") + if args.ngram_eval_order >= 2: + log0( + f"ngram_eval:order={args.ngram_eval_order} alpha={args.ngram_eval_alpha} " + f"min_count={args.ngram_eval_min_count} buckets={args.ngram_eval_buckets}" + ) + train_loader = DistributedTokenLoader(args.train_files, rank, world_size, device) + def zero_grad_all() -> None: + for opt in optimizers: + opt.zero_grad(set_to_none=True) + max_wallclock_ms = 1000.0 * args.max_wallclock_seconds if args.max_wallclock_seconds > 0 else None + def lr_mul(step: int, elapsed_ms: float) -> float: + if args.warmdown_iters <= 0: + return 1.0 + if max_wallclock_ms is None: + warmdown_start = max(args.iterations - args.warmdown_iters, 0) + return max((args.iterations - step) / max(args.warmdown_iters, 1), 0.0) if warmdown_start <= step < args.iterations else 1.0 + step_ms = elapsed_ms / max(step, 1) + warmdown_ms = args.warmdown_iters * step_ms + remaining_ms = max(max_wallclock_ms - elapsed_ms, 0.0) + return remaining_ms / max(warmdown_ms, 1e-9) if remaining_ms <= warmdown_ms else 1.0 + if args.warmup_steps > 0: + initial_model_state = {name: tensor.detach().cpu().clone() for name, tensor in base_model.state_dict().items()} + initial_optimizer_states = [copy.deepcopy(opt.state_dict()) for opt in optimizers] + model.train() + for warmup_step in range(args.warmup_steps): + zero_grad_all() + for micro_step in range(grad_accum_steps): + if distributed: + model.require_backward_grad_sync = micro_step == grad_accum_steps - 1 + x, y = train_loader.next_batch(args.train_batch_tokens, args.train_seq_len, grad_accum_steps) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + warmup_loss = model(x, y) + (warmup_loss * grad_scale).backward() + for opt in optimizers: + opt.step() + zero_grad_all() + if args.warmup_steps <= 20 or (warmup_step + 1) % 10 == 0 or warmup_step + 1 == args.warmup_steps: + log0(f"warmup_step:{warmup_step + 1}/{args.warmup_steps}") + base_model.load_state_dict(initial_model_state, strict=True) + for opt, state in zip(optimizers, initial_optimizer_states, strict=True): + opt.load_state_dict(state) + zero_grad_all() + if distributed: + model.require_backward_grad_sync = True + train_loader = DistributedTokenLoader(args.train_files, rank, world_size, device) + swa_state: dict[str, Tensor] | None = None + swa_count = 0 + ema_state = {name: t.detach().float().clone() for name, t in base_model.state_dict().items()} + ema_decay = 0.997 + training_time_ms = 0.0 + stop_after_step: int | None = None + torch.cuda.synchronize() + t0 = time.perf_counter() + step = 0 + while True: + last_step = step == args.iterations or (stop_after_step is not None and step >= stop_after_step) + should_validate = last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0) + if should_validate: + torch.cuda.synchronize() + training_time_ms += 1000.0 * (time.perf_counter() - t0) + val_loss, val_bpb = eval_val( + args, + model, + rank, + world_size, + device, + grad_accum_steps, + val_tokens, + base_bytes_lut, + has_leading_space_lut, + is_boundary_token_lut, + ) + log0( + f"step:{step}/{args.iterations} val_loss:{val_loss:.4f} val_bpb:{val_bpb:.4f} " + f"train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms / max(step, 1):.2f}ms" + ) + torch.cuda.synchronize() + t0 = time.perf_counter() + if last_step: + if stop_after_step is not None and step < args.iterations: + log0( + f"stopping_early: wallclock_cap train_time:{training_time_ms:.0f}ms " + f"step:{step}/{args.iterations}" + ) + break + elapsed_ms = training_time_ms + 1000.0 * (time.perf_counter() - t0) + scale = lr_mul(step, elapsed_ms) + if args.late_qat_threshold > 0 and scale < args.late_qat_threshold and not CastedLinear._qat_enabled: + CastedLinear._qat_enabled = True + log0(f"late_qat:enabled step:{step} scale:{scale:.4f}") + zero_grad_all() + train_loss = torch.zeros((), device=device) + for micro_step in range(grad_accum_steps): + if distributed: + model.require_backward_grad_sync = micro_step == grad_accum_steps - 1 + x, y = train_loader.next_batch(args.train_batch_tokens, args.train_seq_len, grad_accum_steps) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + loss = model(x, y) + train_loss += loss.detach() + loss.backward() + if base_model._ngram_tracker is not None: + base_model._ngram_tracker.update(x, y) + train_loss /= grad_accum_steps + frac = min(step / args.muon_momentum_warmup_steps, 1.0) if args.muon_momentum_warmup_steps > 0 else 1.0 + muon_momentum = (1 - frac) * args.muon_momentum_warmup_start + frac * args.muon_momentum + for group in optimizer_muon.param_groups: + group["momentum"] = muon_momentum + for opt in optimizers: + for group in opt.param_groups: + group["lr"] = group["base_lr"] * scale + if args.grad_clip_norm > 0: + torch.nn.utils.clip_grad_norm_(base_model.parameters(), args.grad_clip_norm) + for opt in optimizers: + opt.step() + zero_grad_all() + # EMA update + with torch.no_grad(): + for name, t in base_model.state_dict().items(): + ema_state[name].mul_(ema_decay).add_(t.detach().float(), alpha=1.0 - ema_decay) + step += 1 + approx_training_time_ms = training_time_ms + 1000.0 * (time.perf_counter() - t0) + if args.swa_enabled and scale < 0.2 and step % args.swa_every == 0: + if swa_state is None: + swa_state = {name: t.detach().cpu().clone() for name, t in base_model.state_dict().items()} + swa_count = 1 + log0(f"swa:start step:{step}") + else: + for name, t in base_model.state_dict().items(): + swa_state[name] += t.detach().cpu() + swa_count += 1 + should_log_train = ( + args.train_log_every > 0 + and (step <= 10 or step % args.train_log_every == 0 or stop_after_step is not None) + ) + if should_log_train: + log0( + f"step:{step}/{args.iterations} train_loss:{train_loss.item():.4f} " + f"train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms / step:.2f}ms" + ) + reached_cap = max_wallclock_ms is not None and approx_training_time_ms >= max_wallclock_ms + if distributed and max_wallclock_ms is not None: + reached_cap_tensor = torch.tensor(int(reached_cap), device=device) + dist.all_reduce(reached_cap_tensor, op=dist.ReduceOp.MAX) + reached_cap = bool(reached_cap_tensor.item()) + if stop_after_step is None and reached_cap: + stop_after_step = step + log0( + f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB " + f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB" + ) + # GPTQ calibration: collect Hessians from training data DURING training phase + # (must happen before training ends to comply with eval-time data access rules) + log0("gptq:calibrating with training data...") + t_gptq = time.perf_counter() + gptq_hessians = gptq_calibrate(base_model, args.train_files, device, n_samples=256, seq_len=args.train_seq_len) + log0(f"gptq:calibrated {len(gptq_hessians)} layers in {time.perf_counter()-t_gptq:.1f}s") + # Apply EMA weights (better than SWA alone per PR#401) + log0("ema:applying EMA weights") + current_state = base_model.state_dict() + avg_state = {name: t.to(dtype=current_state[name].dtype) for name, t in ema_state.items()} + base_model.load_state_dict(avg_state, strict=True) + torch.cuda.synchronize() + t_diag = time.perf_counter() + diag_val_loss, diag_val_bpb = eval_val( + args, compiled_model, rank, world_size, device, grad_accum_steps, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + ) + torch.cuda.synchronize() + log0( + f"DIAGNOSTIC post_ema val_loss:{diag_val_loss:.4f} val_bpb:{diag_val_bpb:.4f} " + f"eval_time:{1000.0 * (time.perf_counter() - t_diag):.0f}ms" + ) + full_state_dict = base_model.state_dict() + export_sd = {k: v for k, v in full_state_dict.items() if "mtp_heads" not in k} + excluded_mtp = sum(int(t.numel()) for k, t in full_state_dict.items() if "mtp_heads" in k) + if excluded_mtp > 0: + log0(f"export_excluding_mtp_params:{excluded_mtp}") + if master_process: + torch.save(export_sd, "final_model.pt") + model_bytes = os.path.getsize("final_model.pt") + code_bytes = len(code.encode("utf-8")) + log0(f"Serialized model: {model_bytes} bytes") + log0(f"Code size: {code_bytes} bytes") + sd_cpu = {k: v.detach().cpu() for k, v in export_sd.items()} + # GPTQ quantization using Hessians collected during training phase (no training data access here) + quant_result, quant_meta = mixed_quantize_int6_gptq(sd_cpu, {"mlp", "attn", "aux"}, gptq_hessians) + quant_buf = io.BytesIO() + torch.save({"w": quant_result, "m": quant_meta}, quant_buf) + quant_raw = quant_buf.getvalue() + quant_blob = zstandard.ZstdCompressor(level=22).compress(quant_raw) if _COMPRESSOR == "zstd" else zlib.compress(quant_raw, 9) + if master_process: + with open("final_model.int6.ptz", "wb") as f: + f.write(quant_blob) + quant_file_bytes = len(quant_blob) + code_bytes = len(code.encode("utf-8")) + log0(f"Serialized model int5+{_COMPRESSOR}: {quant_file_bytes} bytes") + log0(f"Total submission size int5+{_COMPRESSOR}: {quant_file_bytes + code_bytes} bytes") + log0(f"Total submission size int8+zlib: {quant_file_bytes + code_bytes} bytes") + if distributed: + dist.barrier() + with open("final_model.int6.ptz", "rb") as f: + quant_blob_disk = f.read() + quant_state = torch.load( + io.BytesIO(zstandard.ZstdDecompressor().decompress(quant_blob_disk) if _COMPRESSOR == "zstd" else zlib.decompress(quant_blob_disk)), + map_location="cpu", + ) + deq_state = dequantize_mixed_int6(quant_state["w"], quant_state["m"], sd_cpu) + eval_model = GPT( + vocab_size=args.vocab_size, num_layers=args.num_layers, model_dim=args.model_dim, + num_heads=args.num_heads, num_kv_heads=args.num_kv_heads, mlp_mult=args.mlp_mult, + tie_embeddings=args.tie_embeddings, tied_embed_init_std=args.tied_embed_init_std, + logit_softcap=args.logit_softcap, rope_base=args.rope_base, qk_gain_init=args.qk_gain_init, + mtp_num_heads=0, mtp_loss_weight=0.0, + bigram_vocab_size=args.bigram_vocab_size, bigram_dim=args.bigram_dim, + xsa_last_n=args.xsa_last_n, # must match training model + rope_dims=args.rope_dims, ln_scale=args.ln_scale, dtg=args.dtg_enabled, + ve_enabled=args.ve_enabled, ve_dim=args.ve_dim, ve_layers=args.ve_layers, + mlp_act=args.mlp_act, mlp_leaky_slope=args.mlp_leaky_slope, + ).to(device).bfloat16() + for m in eval_model.modules(): + if isinstance(m, CastedLinear): + m.float() + restore_low_dim_params_to_fp32(eval_model) + eval_model.load_state_dict(deq_state, strict=True) + compiled_eval = maybe_torch_compile(eval_model, args) + torch.cuda.synchronize() + t_qeval = time.perf_counter() + q_val_loss, q_val_bpb = eval_val( + args, compiled_eval, rank, world_size, device, grad_accum_steps, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + eval_seq_len=effective_eval_seq_len, + ) + torch.cuda.synchronize() + log0( + f"final_int5_roundtrip val_loss:{q_val_loss:.4f} val_bpb:{q_val_bpb:.4f} " + f"eval_time:{1000.0 * (time.perf_counter() - t_qeval):.0f}ms" + ) + log0(f"final_int5_roundtrip_exact val_loss:{q_val_loss:.8f} val_bpb:{q_val_bpb:.8f}") + sw_seq_len = effective_eval_seq_len + if args.eval_stride > 0 and args.eval_stride < sw_seq_len: + torch.cuda.synchronize() + t_slide = time.perf_counter() + sw_val_loss, sw_val_bpb = eval_val_sliding( + args, eval_model, rank, world_size, device, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + stride=args.eval_stride, + eval_seq_len=sw_seq_len, + ) + torch.cuda.synchronize() + log0( + f"final_int5_sliding_window val_loss:{sw_val_loss:.4f} val_bpb:{sw_val_bpb:.4f} " + f"stride:{args.eval_stride} eval_time:{1000.0 * (time.perf_counter() - t_slide):.0f}ms" + ) + log0(f"final_int5_sliding_window_exact val_loss:{sw_val_loss:.8f} val_bpb:{sw_val_bpb:.8f}") + log0(f"final_int8_zlib_roundtrip_exact val_loss:{sw_val_loss:.8f} val_bpb:{sw_val_bpb:.8f}") + if args.ngram_eval_order >= 2: + if distributed: + dist.barrier() + torch.cuda.synchronize() + t_ng = time.perf_counter() + ng_loss, ng_bpb, ng_coverage = eval_val_sliding_hashed_ngram( + args, + eval_model, + rank, + world_size, + device, + val_tokens, + base_bytes_lut, + has_leading_space_lut, + is_boundary_token_lut, + stride=args.eval_stride, + order=args.ngram_eval_order, + alpha=args.ngram_eval_alpha, + min_count=args.ngram_eval_min_count, + buckets=args.ngram_eval_buckets, + max_seconds=args.ngram_eval_max_seconds, + eval_seq_len=sw_seq_len, + ) + if rank == 0: + torch.cuda.synchronize() + ng_eval_ms = 1000.0 * (time.perf_counter() - t_ng) + if ng_coverage >= 0.999999: + log0( + f"final_int5_sliding_window_ngram{args.ngram_eval_order} val_loss:{ng_loss:.4f} " + f"val_bpb:{ng_bpb:.4f} eval_time:{ng_eval_ms:.0f}ms" + ) + log0( + f"final_int5_sliding_window_ngram{args.ngram_eval_order}_exact " + f"val_loss:{ng_loss:.8f} val_bpb:{ng_bpb:.8f}" + ) + else: + log0( + f"final_int5_sliding_window_ngram{args.ngram_eval_order}_partial val_loss:{ng_loss:.4f} " + f"val_bpb:{ng_bpb:.4f} coverage:{ng_coverage:.4f} eval_time:{ng_eval_ms:.0f}ms" + ) + log0( + f"final_int5_sliding_window_ngram{args.ngram_eval_order}_partial_exact " + f"val_loss:{ng_loss:.8f} val_bpb:{ng_bpb:.8f} coverage:{ng_coverage:.8f}" + ) + if distributed: + dist.barrier() + if distributed: + dist.destroy_process_group() +if __name__ == "__main__": + main() diff --git a/experiments/A_wing/green/train_seed1337.log b/experiments/A_wing/green/train_seed1337.log new file mode 100644 index 000000000..973946291 --- /dev/null +++ b/experiments/A_wing/green/train_seed1337.log @@ -0,0 +1,103 @@ +============================================ + A-WING GREEN — INT5 GPTQ + 9-Prime + Seed: 1337 + GPTQ INT5 (clip_range=15), 9 hash primes + Fixed mults + entropy shift, no cubric +============================================ +W0326 07:30:47.033000 2016 torch/distributed/run.py:803] +W0326 07:30:47.033000 2016 torch/distributed/run.py:803] ***************************************** +W0326 07:30:47.033000 2016 torch/distributed/run.py:803] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0326 07:30:47.033000 2016 torch/distributed/run.py:803] ***************************************** +logs/dff55565-90ac-4982-824c-0cb07ccacd65.txt +val_bpb:enabled tokenizer_kind=sentencepiece tokenizer_path=./data/tokenizers/fineweb_1024_bpe.model +train_loader:dataset:fineweb10B_sp1024 train_shards:80 +val_loader:shards pattern=./data/datasets/fineweb10B_sp1024/fineweb_val_*.bin tokens:62021632 +complementary_training:alpha=0.5 +model_params:26928220 +f1_corr:rank=0 params=0 est_int6_bytes~0 +mlp_act:leaky_relu_sq mlp_leaky_slope:0.5 +XSA:last_4 world_size:8 grad_accum_steps:1 +num_heads:8 num_kv_heads:4 embed_lr:0.035 matrix_lr:0.025 +train_batch_tokens:786432 train_seq_len:2048 iterations:20000 warmup_steps:20 max_wallclock_seconds:600.000 +compile:enabled=1 fullgraph=0 +seed:1337 +ngram_eval:order=9 alpha=0.3 min_count=2 buckets=8388608 +warmup_step:1/20 +warmup_step:2/20 +warmup_step:3/20 +warmup_step:4/20 +warmup_step:5/20 +warmup_step:6/20 +warmup_step:7/20 +warmup_step:8/20 +warmup_step:9/20 +warmup_step:10/20 +warmup_step:11/20 +warmup_step:12/20 +warmup_step:13/20 +warmup_step:14/20 +warmup_step:15/20 +warmup_step:16/20 +warmup_step:17/20 +warmup_step:18/20 +warmup_step:19/20 +warmup_step:20/20 +step:0/20000 val_loss:6.9317 val_bpb:4.1054 train_time:0ms step_avg:0.02ms +step:1/20000 train_loss:6.9343 train_time:143ms step_avg:143.41ms +step:2/20000 train_loss:8.6212 train_time:226ms step_avg:113.04ms +step:3/20000 train_loss:7.8209 train_time:312ms step_avg:104.12ms +step:4/20000 train_loss:7.1064 train_time:398ms step_avg:99.50ms +step:5/20000 train_loss:6.8529 train_time:485ms step_avg:96.90ms +step:6/20000 train_loss:6.7961 train_time:570ms step_avg:94.93ms +step:7/20000 train_loss:6.6784 train_time:656ms step_avg:93.68ms +step:8/20000 train_loss:6.5596 train_time:742ms step_avg:92.71ms +step:9/20000 train_loss:6.2552 train_time:827ms step_avg:91.94ms +step:10/20000 train_loss:5.9363 train_time:913ms step_avg:91.32ms +step:1000/20000 train_loss:2.2345 train_time:87847ms step_avg:87.85ms +step:2000/20000 train_loss:2.0285 train_time:175893ms step_avg:87.95ms +step:3000/20000 train_loss:2.1264 train_time:263985ms step_avg:87.99ms +step:4000/20000 train_loss:1.9367 train_time:352016ms step_avg:88.00ms +step:5000/20000 train_loss:2.0641 train_time:440120ms step_avg:88.02ms +late_qat:enabled step:5067 scale:0.4999 +step:6000/20000 train_loss:1.9070 train_time:528137ms step_avg:88.02ms +swa:start step:6200 +step:6814/20000 val_loss:1.9225 val_bpb:1.1386 train_time:600027ms step_avg:88.06ms +stopping_early: wallclock_cap train_time:600027ms step:6814/20000 +peak memory allocated: 20677 MiB reserved: 20716 MiB +gptq:calibrating with training data... +gptq:calibrated 68 layers in 3.4s +ema:applying EMA weights +DIAGNOSTIC post_ema val_loss:1.9208 val_bpb:1.1376 eval_time:2240ms +Serialized model: 106047497 bytes +Code size: 106202 bytes +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +Serialized model int5+zlib: 13666914 bytes +Total submission size int5+zlib: 13773116 bytes +Total submission size int8+zlib: 13773116 bytes +final_int5_roundtrip val_loss:1.9689 val_bpb:1.1661 eval_time:37008ms +final_int5_roundtrip_exact val_loss:1.96888819 val_bpb:1.16608649 +final_int5_sliding_window val_loss:1.9264 val_bpb:1.1410 stride:64 eval_time:96465ms +final_int5_sliding_window_exact val_loss:1.92644292 val_bpb:1.14095103 +final_int8_zlib_roundtrip_exact val_loss:1.92644292 val_bpb:1.14095103 +ngram_eval:chunks=60 chunk_tokens=1048576 windows=969088 shared_tables=True +ngram_eval:chunk [1/60] bpb=1.152801 t=15s +ngram_eval:chunk [2/60] bpb=1.232931 t=18s +ngram_eval:chunk [3/60] bpb=1.257240 t=21s +ngram_eval:chunk [11/60] bpb=1.168507 t=43s +ngram_eval:chunk [21/60] bpb=0.891224 t=69s +ngram_eval:chunk [31/60] bpb=0.705693 t=95s +ngram_eval:chunk [41/60] bpb=0.584660 t=119s +ngram_eval:chunk [51/60] bpb=0.505440 t=144s +ngram_eval:chunk [60/60] bpb=0.457581 t=176s +final_int5_sliding_window_ngram9 val_loss:0.7726 val_bpb:0.4576 eval_time:176713ms +final_int5_sliding_window_ngram9_exact val_loss:0.77264878 val_bpb:0.45760734 +============================================ + DONE +============================================ diff --git a/experiments/A_wing/green_1/run.sh b/experiments/A_wing/green_1/run.sh new file mode 100755 index 000000000..fea8957c4 --- /dev/null +++ b/experiments/A_wing/green_1/run.sh @@ -0,0 +1,74 @@ +#!/bin/bash +set -euo pipefail +# A-WING GREEN_1: Oracle Alpha + 9-Prime Hash Fix +# Instead of entropy-adaptive alpha, directly compare model_p vs ngram_p +# per token. Soft sigmoid on log-ratio (steepness=8), clip 0.95. +# Base: SOTA bwing_full_port (0.4512 BPB) + +SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd -- "${SCRIPT_DIR}/../../.." && pwd)" +cd "${REPO_ROOT}" +export PYTHONPATH="${REPO_ROOT}/flash-attention/hopper:${PYTHONPATH:-}" + +SEED="${SEED:-1337}" +NPROC_PER_NODE="${NPROC_PER_NODE:-8}" + +# --- Pre-flight checks --- +echo "[preflight] checking zstandard..." +python3 -c "import zstandard; print(f' zstandard {zstandard.__version__} OK')" 2>/dev/null \ + || { echo " FATAL: zstandard not found. pip install zstandard"; exit 1; } + +echo "[preflight] checking flash_attn..." +python3 -c " +try: + import flash_attn_interface; print(' FA3 (hopper) OK') +except ImportError: + import flash_attn; v=flash_attn.__version__ + if v.startswith('3'): print(f' FA3 v{v} OK') + else: print(f' WARNING: FA{v[0]} detected — want FA3') +" 2>/dev/null || echo " WARNING: no flash_attn found" + +echo "============================================" +echo " A-WING GREEN_1 — Oracle Alpha + 9-Prime" +echo " Seed: ${SEED}" +echo " Oracle: alpha = sigmoid(8 * log(ngram_p/model_p)) * 0.95" +echo " 9 hash primes, INT6, no cubric" +echo " Training cap: 570s (30s reserved for GPTQ)" +echo "============================================" + +SEED="$SEED" \ +F1_CORR_RANK=0 \ +DISTILL_ENABLED=0 \ +MLP_ACT=leaky_relu_sq \ +MLP_LEAKY_SLOPE=0.5 \ +XSA_LAST_N=4 \ +BIGRAM_VOCAB_SIZE=1536 \ +TTT_EVAL_ENABLED=0 \ +ROPE_DIMS=24 \ +VAL_LOSS_EVERY=20000 \ +TRAIN_LOG_EVERY=1000 \ +SWA_EVERY=100 \ +COMPLEMENT_ALPHA=0.5 \ +NGRAM_EVAL_ORDER=9 \ +NGRAM_EVAL_MIN_ORDER=2 \ +NGRAM_EVAL_ADAPTIVE=1 \ +NGRAM_EVAL_ALPHA=0.30 \ +NGRAM_EVAL_ALPHA_MIN=0.05 \ +NGRAM_EVAL_ALPHA_MAX=0.60 \ +NGRAM_EVAL_ENTROPY_CENTER=3.0 \ +NGRAM_EVAL_ENTROPY_SCALE=2.0 \ +NGRAM_EVAL_MIN_COUNT=2 \ +NGRAM_EVAL_BUCKETS=8388608 \ +NGRAM_EVAL_MAX_SECONDS=0 \ +CUBRIC_CADENCE=0 \ +NGRAM_ENTROPY_SHIFT=1 \ +NGRAM_ORDER_MULTS="0.3,0.3,0.97,2.0,2.0,2.0,2.0,2.0" \ +MAX_WALLCLOCK_SECONDS=570 \ +COMPILE_FULLGRAPH=0 \ +torchrun --standalone --nproc_per_node="${NPROC_PER_NODE}" \ + "${SCRIPT_DIR}/train_gpt.py" \ + 2>&1 | tee "logs/awing_green1_s${SEED}_$(date +%Y%m%d_%H%M%S).log" + +echo "============================================" +echo " DONE" +echo "============================================" diff --git a/experiments/A_wing/green_1/train_gpt.py b/experiments/A_wing/green_1/train_gpt.py new file mode 100644 index 000000000..fdd2e23dc --- /dev/null +++ b/experiments/A_wing/green_1/train_gpt.py @@ -0,0 +1,2114 @@ +from __future__ import annotations +import copy +import glob +import io +import math +import os +import random +import subprocess +import sys +import time +import uuid +import zlib +from pathlib import Path +try: + import zstandard + _COMPRESSOR = "zstd" +except ImportError: + import warnings + warnings.warn("zstandard not found — falling back to zlib. Artifact will be ~1.5MB larger! pip install zstandard") + _COMPRESSOR = "zlib" +import numpy as np +import sentencepiece as spm +import torch +import torch.distributed as dist +import torch.nn.functional as F +from torch import Tensor, nn +from torch.nn.parallel import DistributedDataParallel as DDP +try: + from flash_attn_interface import flash_attn_func as flash_attn_3_func +except ImportError: + def flash_attn_3_func(q, k, v, causal=False): + # q: (B, T, Hq, D), k/v: (B, T, Hkv, D) — expand KV for GQA + q2 = q.transpose(1, 2) # (B, Hq, T, D) + k2 = k.transpose(1, 2) # (B, Hkv, T, D) + v2 = v.transpose(1, 2) + if k2.size(1) != q2.size(1): + rep = q2.size(1) // k2.size(1) + k2 = k2.repeat_interleave(rep, dim=1) + v2 = v2.repeat_interleave(rep, dim=1) + out = torch.nn.functional.scaled_dot_product_attention(q2, k2, v2, is_causal=causal) + return out.transpose(1, 2) +class Hyperparameters: + data_path = os.environ.get("DATA_PATH", "./data/datasets/fineweb10B_sp1024") + train_files = os.path.join(data_path, "fineweb_train_*.bin") + val_files = os.path.join(data_path, "fineweb_val_*.bin") + tokenizer_path = os.environ.get("TOKENIZER_PATH", "./data/tokenizers/fineweb_1024_bpe.model") + run_id = os.environ.get("RUN_ID", str(uuid.uuid4())) + seed = int(os.environ.get("SEED", 1337)) + val_batch_size = int(os.environ.get("VAL_BATCH_SIZE", 524_288)) + val_loss_every = int(os.environ.get("VAL_LOSS_EVERY", 4000)) + train_log_every = int(os.environ.get("TRAIN_LOG_EVERY", 500)) + iterations = int(os.environ.get("ITERATIONS", 20000)) + warmdown_iters = int(os.environ.get("WARMDOWN_ITERS", 3500)) + warmup_steps = int(os.environ.get("WARMUP_STEPS", 20)) + train_batch_tokens = int(os.environ.get("TRAIN_BATCH_TOKENS", 786_432)) + train_seq_len = int(os.environ.get("TRAIN_SEQ_LEN", 2048)) + eval_seq_len = int(os.environ.get("EVAL_SEQ_LEN", 2048)) + max_wallclock_seconds = float(os.environ.get("MAX_WALLCLOCK_SECONDS", 600.0)) + qk_gain_init = float(os.environ.get("QK_GAIN_INIT", 1.5)) + vocab_size = int(os.environ.get("VOCAB_SIZE", 1024)) + num_layers = int(os.environ.get("NUM_LAYERS", 11)) + num_kv_heads = int(os.environ.get("NUM_KV_HEADS", 4)) + model_dim = int(os.environ.get("MODEL_DIM", 512)) + num_heads = int(os.environ.get("NUM_HEADS", 8)) + mlp_mult = float(os.environ.get("MLP_MULT", 3.0)) + mlp_act = os.environ.get("MLP_ACT", "relu_sq").lower() + mlp_leaky_slope = float(os.environ.get("MLP_LEAKY_SLOPE", 0.5)) + tie_embeddings = bool(int(os.environ.get("TIE_EMBEDDINGS", "1"))) + rope_base = float(os.environ.get("ROPE_BASE", 10000.0)) + logit_softcap = float(os.environ.get("LOGIT_SOFTCAP", 30.0)) + embed_lr = float(os.environ.get("EMBED_LR", 0.6)) + head_lr = float(os.environ.get("HEAD_LR", 0.008)) + tied_embed_lr = float(os.environ.get("TIED_EMBED_LR", 0.035)) + tied_embed_init_std = float(os.environ.get("TIED_EMBED_INIT_STD", 0.005)) + matrix_lr = float(os.environ.get("MATRIX_LR", 0.025)) + scalar_lr = float(os.environ.get("SCALAR_LR", 0.025)) + muon_momentum = float(os.environ.get("MUON_MOMENTUM", 0.99)) + muon_backend_steps = int(os.environ.get("MUON_BACKEND_STEPS", 5)) + muon_momentum_warmup_start = float(os.environ.get("MUON_MOMENTUM_WARMUP_START", 0.92)) + muon_momentum_warmup_steps = int(os.environ.get("MUON_MOMENTUM_WARMUP_STEPS", 1500)) + beta1 = float(os.environ.get("BETA1", 0.9)) + beta2 = float(os.environ.get("BETA2", 0.95)) + adam_eps = float(os.environ.get("ADAM_EPS", 1e-8)) + grad_clip_norm = float(os.environ.get("GRAD_CLIP_NORM", 0.3)) + eval_stride = int(os.environ.get("EVAL_STRIDE", 64)) + mtp_num_heads = int(os.environ.get("MTP_NUM_HEADS", 0)) + mtp_loss_weight = float(os.environ.get("MTP_LOSS_WEIGHT", 0.2)) + muon_beta2 = float(os.environ.get("MUON_BETA2", 0.95)) + swa_enabled = bool(int(os.environ.get("SWA_ENABLED", "1"))) + swa_every = int(os.environ.get("SWA_EVERY", 50)) # tighter: collect more recent checkpoints + muon_wd = float(os.environ.get("MUON_WD", 0.04)) + adam_wd = float(os.environ.get("ADAM_WD", 0.04)) + qat_enabled = bool(int(os.environ.get("QAT_ENABLED", "0"))) + bigram_vocab_size = int(os.environ.get("BIGRAM_VOCAB_SIZE", 2048)) + bigram_dim = int(os.environ.get("BIGRAM_DIM", 128)) + xsa_last_n = int(os.environ.get("XSA_LAST_N", 11)) # XSA on ALL 11 layers + rope_dims = int(os.environ.get("ROPE_DIMS", 16)) + ln_scale = bool(int(os.environ.get("LN_SCALE", "1"))) + dtg_enabled = bool(int(os.environ.get("DTG_ENABLED", "0"))) + late_qat_threshold = float(os.environ.get("LATE_QAT_THRESHOLD", 0.5)) + ve_enabled = bool(int(os.environ.get("VE_ENABLED", "1"))) + ve_dim = int(os.environ.get("VE_DIM", 128)) + ve_layers = os.environ.get("VE_LAYERS", "9,10") + # F1 capacity add-on: low-rank correction head (active at inference). + # Approx extra params ~= rank * (model_dim + vocab_size). + f1_corr_rank = int(os.environ.get("F1_CORR_RANK", 0)) + f1_corr_scale_init = float(os.environ.get("F1_CORR_SCALE_INIT", 0.10)) + # Post-train self-distillation: EMA teacher -> student. + distill_enabled = bool(int(os.environ.get("DISTILL_ENABLED", "0"))) + distill_steps = int(os.environ.get("DISTILL_STEPS", 24)) + distill_lr_factor = float(os.environ.get("DISTILL_LR_FACTOR", 0.02)) + distill_temperature = float(os.environ.get("DISTILL_TEMPERATURE", 1.5)) + distill_alpha = float(os.environ.get("DISTILL_ALPHA", 0.60)) + distill_kl_clip = float(os.environ.get("DISTILL_KL_CLIP", 10.0)) + # Optional legal score-first hashed n-gram interpolation at eval time. + # Multi-order backoff (2..max_order) with entropy-adaptive alpha. + # Alpha depends only on model entropy (no target/label access). + ngram_eval_order = int(os.environ.get("NGRAM_EVAL_ORDER", 0)) # 0=off, max order for backoff + ngram_eval_min_order = int(os.environ.get("NGRAM_EVAL_MIN_ORDER", 2)) # min order for backoff + ngram_eval_alpha = float(os.environ.get("NGRAM_EVAL_ALPHA", 0.30)) # base alpha (or fixed if adaptive off) + ngram_eval_adaptive = bool(int(os.environ.get("NGRAM_EVAL_ADAPTIVE", "1"))) # entropy-adaptive alpha + ngram_eval_alpha_min = float(os.environ.get("NGRAM_EVAL_ALPHA_MIN", 0.05)) # alpha floor (confident model) + ngram_eval_alpha_max = float(os.environ.get("NGRAM_EVAL_ALPHA_MAX", 0.60)) # alpha ceiling (uncertain model) + ngram_eval_entropy_center = float(os.environ.get("NGRAM_EVAL_ENTROPY_CENTER", 4.0)) # sigmoid center + ngram_eval_entropy_scale = float(os.environ.get("NGRAM_EVAL_ENTROPY_SCALE", 2.0)) # sigmoid steepness + ngram_eval_min_count = int(os.environ.get("NGRAM_EVAL_MIN_COUNT", 2)) + ngram_eval_buckets = int(os.environ.get("NGRAM_EVAL_BUCKETS", 4_194_304)) + ngram_eval_max_seconds = float(os.environ.get("NGRAM_EVAL_MAX_SECONDS", 0.0)) + ngram_entropy_shift = bool(int(os.environ.get("NGRAM_ENTROPY_SHIFT", "0"))) # per-order center shift + ngram_order_mults_str = os.environ.get("NGRAM_ORDER_MULTS", "") # fixed per-order multipliers (comma-sep) + cubric_cadence = int(os.environ.get("CUBRIC_CADENCE", 0)) + compile_enabled = bool(int(os.environ.get("COMPILE_ENABLED", "1"))) + compile_fullgraph = bool(int(os.environ.get("COMPILE_FULLGRAPH", "1"))) +def maybe_torch_compile(obj, args: Hyperparameters): + if not args.compile_enabled: + return obj + return torch.compile(obj, dynamic=False, fullgraph=args.compile_fullgraph) +class TrainNgramTracker: + """Complementary training: track bigram stats, downweight tokens n-grams can predict.""" + def __init__(self, vocab_size: int, device: torch.device, complement_alpha: float = 0.5): + self.V = vocab_size + self.alpha = complement_alpha + self.bi_counts = torch.zeros(vocab_size, vocab_size, device=device, dtype=torch.float32) + self.bi_totals = torch.zeros(vocab_size, device=device, dtype=torch.float32) + @torch.no_grad() + def update(self, x: Tensor, y: Tensor): + xf = x.reshape(-1) + yf = y.reshape(-1) + ones = torch.ones(xf.numel(), device=xf.device, dtype=torch.float32) + self.bi_counts.reshape(-1).scatter_add_(0, xf * self.V + yf, ones) + self.bi_totals.scatter_add_(0, xf, ones) + def get_weights(self, x: Tensor, y: Tensor) -> Tensor: + xf = x.reshape(-1) + yf = y.reshape(-1) + total = self.bi_totals[xf] + count = self.bi_counts.reshape(-1)[xf * self.V + yf] + ngram_prob = count / (total + 1) + return (1.0 - self.alpha * ngram_prob).clamp(min=0.1) +def zeropower_via_newtonschulz5(G: Tensor, steps: int = 10, eps: float = 1e-7) -> Tensor: + a, b, c = (3.4445, -4.7750, 2.0315) + X = G.bfloat16() + X /= X.norm() + eps + transposed = G.size(0) > G.size(1) + if transposed: + X = X.T + for _ in range(steps): + A = X @ X.T + B = b * A + c * A @ A + X = a * X + B @ X + return X.T if transposed else X +class Muon(torch.optim.Optimizer): + def __init__(self, params, lr: float, momentum: float, backend_steps: int, + nesterov: bool = True, weight_decay: float = 0.0): + super().__init__( + params, + dict(lr=lr, momentum=momentum, backend_steps=backend_steps, + nesterov=nesterov, weight_decay=weight_decay), + ) + @torch.no_grad() + def step(self, closure=None): + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + distributed = dist.is_available() and dist.is_initialized() + world_size = dist.get_world_size() if distributed else 1 + rank = dist.get_rank() if distributed else 0 + for group in self.param_groups: + params = group["params"] + if not params: + continue + lr = group["lr"] + momentum = group["momentum"] + backend_steps = group["backend_steps"] + nesterov = group["nesterov"] + total_params = sum(int(p.numel()) for p in params) + updates_flat = torch.zeros(total_params, device=params[0].device, dtype=torch.bfloat16) + curr = 0 + for i, p in enumerate(params): + if i % world_size == rank and p.grad is not None: + g = p.grad + state = self.state[p] + if "momentum_buffer" not in state: + state["momentum_buffer"] = torch.zeros_like(g) + buf = state["momentum_buffer"] + buf.mul_(momentum).add_(g) + if nesterov: + g = g.add(buf, alpha=momentum) + g = zeropower_via_newtonschulz5(g, steps=backend_steps) + g *= max(1, g.size(0) / g.size(1)) ** 0.5 + updates_flat[curr : curr + p.numel()] = g.reshape(-1) + curr += p.numel() + if distributed: + dist.all_reduce(updates_flat, op=dist.ReduceOp.SUM) + wd = group.get("weight_decay", 0.0) + curr = 0 + for p in params: + if wd > 0.0: + p.data.mul_(1.0 - lr * wd) + g = updates_flat[curr : curr + p.numel()].view_as(p).to(dtype=p.dtype) + p.add_(g, alpha=-lr) + curr += p.numel() + return loss +def build_sentencepiece_luts( + sp: spm.SentencePieceProcessor, vocab_size: int, device: torch.device +) -> tuple[Tensor, Tensor, Tensor]: + sp_vocab_size = int(sp.vocab_size()) + table_size = max(sp_vocab_size, vocab_size) + base_bytes_np = np.zeros((table_size,), dtype=np.int16) + has_leading_space_np = np.zeros((table_size,), dtype=np.bool_) + is_boundary_token_np = np.ones((table_size,), dtype=np.bool_) + for token_id in range(sp_vocab_size): + if sp.is_control(token_id) or sp.is_unknown(token_id) or sp.is_unused(token_id): + continue + is_boundary_token_np[token_id] = False + if sp.is_byte(token_id): + base_bytes_np[token_id] = 1 + continue + piece = sp.id_to_piece(token_id) + if piece.startswith("▁"): + has_leading_space_np[token_id] = True + piece = piece[1:] + base_bytes_np[token_id] = len(piece.encode("utf-8")) + return ( + torch.tensor(base_bytes_np, dtype=torch.int16, device=device), + torch.tensor(has_leading_space_np, dtype=torch.bool, device=device), + torch.tensor(is_boundary_token_np, dtype=torch.bool, device=device), + ) +def load_validation_tokens(pattern: str, seq_len: int) -> Tensor: + files = [Path(p) for p in sorted(glob.glob(pattern))] + if not files: + raise FileNotFoundError(f"No files found for pattern: {pattern}") + tokens = torch.cat([load_data_shard(file) for file in files]).contiguous() + usable = ((tokens.numel() - 1) // seq_len) * seq_len + if usable <= 0: + raise ValueError(f"Validation split is too short for TRAIN_SEQ_LEN={seq_len}") + return tokens[: usable + 1] +def eval_val( + args: Hyperparameters, + model: nn.Module, + rank: int, + world_size: int, + device: torch.device, + grad_accum_steps: int, + val_tokens: Tensor, + base_bytes_lut: Tensor, + has_leading_space_lut: Tensor, + is_boundary_token_lut: Tensor, + eval_seq_len: int | None = None, +) -> tuple[float, float]: + seq_len = eval_seq_len or args.train_seq_len + local_batch_tokens = args.val_batch_size // (world_size * grad_accum_steps) + if local_batch_tokens < seq_len: + raise ValueError( + "VAL_BATCH_SIZE must provide at least one sequence per rank; " + f"got VAL_BATCH_SIZE={args.val_batch_size}, WORLD_SIZE={world_size}, " + f"GRAD_ACCUM_STEPS={grad_accum_steps}, seq_len={seq_len}" + ) + local_batch_seqs = local_batch_tokens // seq_len + total_seqs = (val_tokens.numel() - 1) // seq_len + seq_start = (total_seqs * rank) // world_size + seq_end = (total_seqs * (rank + 1)) // world_size + val_loss_sum = torch.zeros((), device=device, dtype=torch.float64) + val_token_count = torch.zeros((), device=device, dtype=torch.float64) + val_byte_count = torch.zeros((), device=device, dtype=torch.float64) + model.eval() + with torch.inference_mode(): + for batch_seq_start in range(seq_start, seq_end, local_batch_seqs): + batch_seq_end = min(batch_seq_start + local_batch_seqs, seq_end) + raw_start = batch_seq_start * seq_len + raw_end = batch_seq_end * seq_len + 1 + local = val_tokens[raw_start:raw_end].to(device=device, dtype=torch.int64, non_blocking=True) + x = local[:-1].reshape(-1, seq_len) + y = local[1:].reshape(-1, seq_len) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + batch_loss = model(x, y).detach() + batch_token_count = float(y.numel()) + val_loss_sum += batch_loss.to(torch.float64) * batch_token_count + val_token_count += batch_token_count + prev_ids = x.reshape(-1) + tgt_ids = y.reshape(-1) + token_bytes = base_bytes_lut[tgt_ids].to(dtype=torch.int16) + token_bytes += (has_leading_space_lut[tgt_ids] & ~is_boundary_token_lut[prev_ids]).to(dtype=torch.int16) + val_byte_count += token_bytes.to(torch.float64).sum() + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(val_loss_sum, op=dist.ReduceOp.SUM) + dist.all_reduce(val_token_count, op=dist.ReduceOp.SUM) + dist.all_reduce(val_byte_count, op=dist.ReduceOp.SUM) + val_loss = val_loss_sum / val_token_count + bits_per_token = val_loss.item() / math.log(2.0) + tokens_per_byte = val_token_count.item() / val_byte_count.item() + model.train() + return float(val_loss.item()), float(bits_per_token * tokens_per_byte) +CONTROL_TENSOR_NAME_PATTERNS = tuple( + pattern + for pattern in os.environ.get( + "CONTROL_TENSOR_NAME_PATTERNS", + "attn_scale,attn_scales,mlp_scale,mlp_scales,resid_mix,resid_mixes,q_gain,skip_weight,skip_weights,smear,dtg_gate,ve_layer_scales,ve_shared.scale", + ).split(",") + if pattern +) +INT8_KEEP_FLOAT_FP32_NAME_PATTERNS = tuple( + pattern + for pattern in os.environ.get( + "INT8_KEEP_FLOAT_FP32_NAME_PATTERNS", + ",".join(CONTROL_TENSOR_NAME_PATTERNS), + ).split(",") + if pattern +) +INT8_KEEP_FLOAT_MAX_NUMEL = 65_536 +INT8_KEEP_FLOAT_STORE_DTYPE = torch.float16 +INT8_PER_ROW_SCALE_DTYPE = torch.float16 +INT8_CLIP_PERCENTILE = 99.99984 +INT8_CLIP_Q = INT8_CLIP_PERCENTILE / 100.0 +def tensor_nbytes(t: Tensor) -> int: + return int(t.numel()) * int(t.element_size()) +def keep_float_tensor(name: str, t: Tensor, passthrough_orig_dtypes: dict[str, str]) -> Tensor: + if any(pattern in name for pattern in INT8_KEEP_FLOAT_FP32_NAME_PATTERNS): + return t.float().contiguous() + if t.dtype in {torch.float32, torch.bfloat16}: + passthrough_orig_dtypes[name] = str(t.dtype).removeprefix("torch.") + return t.to(dtype=INT8_KEEP_FLOAT_STORE_DTYPE).contiguous() + return t +def quantize_float_tensor(t: Tensor) -> tuple[Tensor, Tensor]: + t32 = t.float() + if t32.ndim == 2: + clip_abs = ( + torch.quantile(t32.abs(), INT8_CLIP_Q, dim=1) + if t32.numel() + else torch.empty((t32.shape[0],), dtype=torch.float32) + ) + clipped = torch.maximum(torch.minimum(t32, clip_abs[:, None]), -clip_abs[:, None]) + scale = (clip_abs / 127.0).clamp_min(1.0 / 127.0) + q = torch.clamp(torch.round(clipped / scale[:, None]), -127, 127).to(torch.int8).contiguous() + return q, scale.to(dtype=INT8_PER_ROW_SCALE_DTYPE).contiguous() + clip_abs = float(torch.quantile(t32.abs().flatten(), INT8_CLIP_Q).item()) if t32.numel() else 0.0 + scale = torch.tensor(clip_abs / 127.0 if clip_abs > 0 else 1.0, dtype=torch.float32) + q = torch.clamp(torch.round(torch.clamp(t32, -clip_abs, clip_abs) / scale), -127, 127).to(torch.int8).contiguous() + return q, scale +def quantize_state_dict_int8(state_dict: dict[str, Tensor]): + quantized: dict[str, Tensor] = {} + scales: dict[str, Tensor] = {} + dtypes: dict[str, str] = {} + passthrough: dict[str, Tensor] = {} + passthrough_orig_dtypes: dict[str, str] = {} + qmeta: dict[str, dict[str, object]] = {} + stats = dict.fromkeys( + ("param_count", "num_tensors", "num_float_tensors", "num_nonfloat_tensors", "baseline_tensor_bytes", "int8_payload_bytes"), + 0, + ) + for name, tensor in state_dict.items(): + t = tensor.detach().to("cpu").contiguous() + stats["param_count"] += int(t.numel()) + stats["num_tensors"] += 1 + stats["baseline_tensor_bytes"] += tensor_nbytes(t) + if not t.is_floating_point(): + stats["num_nonfloat_tensors"] += 1 + passthrough[name] = t + stats["int8_payload_bytes"] += tensor_nbytes(t) + continue + if t.numel() <= INT8_KEEP_FLOAT_MAX_NUMEL: + kept = keep_float_tensor(name, t, passthrough_orig_dtypes) + passthrough[name] = kept + stats["int8_payload_bytes"] += tensor_nbytes(kept) + continue + stats["num_float_tensors"] += 1 + q, s = quantize_float_tensor(t) + if s.ndim > 0: + qmeta[name] = {"scheme": "per_row", "axis": 0} + quantized[name] = q + scales[name] = s + dtypes[name] = str(t.dtype).removeprefix("torch.") + stats["int8_payload_bytes"] += tensor_nbytes(q) + tensor_nbytes(s) + obj: dict[str, object] = { + "__quant_format__": "int8_clean_per_row_v1", + "quantized": quantized, + "scales": scales, + "dtypes": dtypes, + "passthrough": passthrough, + } + if qmeta: + obj["qmeta"] = qmeta + if passthrough_orig_dtypes: + obj["passthrough_orig_dtypes"] = passthrough_orig_dtypes + return obj, stats +def dequantize_state_dict_int8(obj: dict[str, object]) -> dict[str, Tensor]: + out: dict[str, Tensor] = {} + qmeta = obj.get("qmeta", {}) + passthrough_orig_dtypes = obj.get("passthrough_orig_dtypes", {}) + for name, q in obj["quantized"].items(): + dtype = getattr(torch, obj["dtypes"][name]) + s = obj["scales"][name] + if qmeta.get(name, {}).get("scheme") == "per_row" or s.ndim > 0: + s = s.to(dtype=torch.float32) + out[name] = (q.float() * s.view(q.shape[0], *([1] * (q.ndim - 1)))).to(dtype=dtype).contiguous() + else: + scale = float(s.item()) + out[name] = (q.float() * scale).to(dtype=dtype).contiguous() + for name, t in obj["passthrough"].items(): + out_t = t.detach().to("cpu").contiguous() + orig_dtype = passthrough_orig_dtypes.get(name) + if isinstance(orig_dtype, str): + out_t = out_t.to(dtype=getattr(torch, orig_dtype)).contiguous() + out[name] = out_t + return out +def load_data_shard(file: Path) -> Tensor: + header_bytes = 256 * np.dtype(" None: + self.file_idx = (self.file_idx + 1) % len(self.files) + self.tokens = load_data_shard(self.files[self.file_idx]) + self.pos = 0 + def take(self, n: int) -> Tensor: + chunks: list[Tensor] = [] + remaining = n + while remaining > 0: + avail = self.tokens.numel() - self.pos + if avail <= 0: + self._advance_file() + continue + k = min(remaining, avail) + chunks.append(self.tokens[self.pos : self.pos + k]) + self.pos += k + remaining -= k + return chunks[0] if len(chunks) == 1 else torch.cat(chunks) +class DistributedTokenLoader: + def __init__(self, pattern: str, rank: int, world_size: int, device: torch.device): + self.rank = rank + self.world_size = world_size + self.device = device + self.stream = TokenStream(pattern) + def next_batch(self, global_tokens: int, seq_len: int, grad_accum_steps: int) -> tuple[Tensor, Tensor]: + local_tokens = global_tokens // (self.world_size * grad_accum_steps) + per_rank_span = local_tokens + 1 + chunk = self.stream.take(per_rank_span * self.world_size) + start = self.rank * per_rank_span + local = chunk[start : start + per_rank_span].to(dtype=torch.int64) + x = local[:-1].reshape(-1, seq_len) + y = local[1:].reshape(-1, seq_len) + return x.to(self.device, non_blocking=True), y.to(self.device, non_blocking=True) +class RMSNorm(nn.Module): + def __init__(self, eps: float | None = None): + super().__init__() + self.eps = eps + def forward(self, x: Tensor) -> Tensor: + return F.rms_norm(x, (x.size(-1),), eps=self.eps) +class CastedLinear(nn.Linear): + _qat_enabled: bool = False + def forward(self, x: Tensor) -> Tensor: + w = self.weight.to(x.dtype) + if CastedLinear._qat_enabled and self.training and w.ndim == 2: + with torch.no_grad(): + w32 = self.weight.float() + # Use 99.95th percentile clipping to match GPTQ export quantizer + row_clip = torch.quantile(w32.abs(), 0.9995, dim=1) + scale = (row_clip / 31.0).clamp_min(1.0 / 31.0) + w_q = (torch.clamp(torch.round(w32 / scale[:, None]), -32, 31) * scale[:, None]).to(x.dtype) + w = w + (w_q - w).detach() + bias = self.bias.to(x.dtype) if self.bias is not None else None + return F.linear(x, w, bias) +def restore_low_dim_params_to_fp32(module: nn.Module) -> None: + with torch.no_grad(): + for name, param in module.named_parameters(): + if (param.ndim < 2 or any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS)) and param.dtype != torch.float32: + param.data = param.data.float() +class Rotary(nn.Module): + def __init__(self, dim: int, base: float = 10000.0, train_seq_len: int = 1024, rope_dims: int = 0): + super().__init__() + self.dim = dim + self.base = base + self.train_seq_len = train_seq_len + self.rope_dims = rope_dims if rope_dims > 0 else dim + inv_freq = 1.0 / (base ** (torch.arange(0, self.rope_dims, 2, dtype=torch.float32) / self.rope_dims)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self._seq_len_cached = 0 + self._cos_cached: Tensor | None = None + self._sin_cached: Tensor | None = None + def forward(self, seq_len: int, device: torch.device, dtype: torch.dtype) -> tuple[Tensor, Tensor]: + if ( + self._cos_cached is None + or self._sin_cached is None + or self._seq_len_cached != seq_len + or self._cos_cached.device != device + ): + rd = self.rope_dims + if seq_len > self.train_seq_len: + scale = seq_len / self.train_seq_len + new_base = self.base * (scale ** (rd / (rd - 2))) + inv_freq = 1.0 / (new_base ** (torch.arange(0, rd, 2, dtype=torch.float32, device=device) / rd)) + else: + inv_freq = self.inv_freq.to(device) + t = torch.arange(seq_len, device=device, dtype=inv_freq.dtype) + freqs = torch.outer(t, inv_freq) + self._cos_cached = freqs.cos()[None, :, None, :] + self._sin_cached = freqs.sin()[None, :, None, :] + self._seq_len_cached = seq_len + return self._cos_cached.to(dtype=dtype), self._sin_cached.to(dtype=dtype) +def apply_rotary_emb(x: Tensor, cos: Tensor, sin: Tensor, rope_dims: int = 0) -> Tensor: + if rope_dims > 0 and rope_dims < x.size(-1): + x_rope, x_pass = x[..., :rope_dims], x[..., rope_dims:] + half = rope_dims // 2 + x1, x2 = x_rope[..., :half], x_rope[..., half:] + x_rope = torch.cat((x1 * cos + x2 * sin, x1 * (-sin) + x2 * cos), dim=-1) + return torch.cat((x_rope, x_pass), dim=-1) + half = x.size(-1) // 2 + x1, x2 = x[..., :half], x[..., half:] + return torch.cat((x1 * cos + x2 * sin, x1 * (-sin) + x2 * cos), dim=-1) +class CausalSelfAttention(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + num_kv_heads: int, + rope_base: float, + qk_gain_init: float, + ): + super().__init__() + if dim % num_heads != 0: + raise ValueError("model_dim must be divisible by num_heads") + if num_heads % num_kv_heads != 0: + raise ValueError("num_heads must be divisible by num_kv_heads") + self.num_heads = num_heads + self.num_kv_heads = num_kv_heads + self.head_dim = dim // num_heads + if self.head_dim % 2 != 0: + raise ValueError("head_dim must be even for RoPE") + kv_dim = self.num_kv_heads * self.head_dim + self.c_q = CastedLinear(dim, dim, bias=False) + self.c_k = CastedLinear(dim, kv_dim, bias=False) + self.c_v = CastedLinear(dim, kv_dim, bias=False) + self.proj = CastedLinear(dim, dim, bias=False) + self.proj._zero_init = True + self.q_gain = nn.Parameter(torch.full((num_heads,), qk_gain_init, dtype=torch.float32)) + self.rope_dims = 0 # set by GPT.__init__ for partial RoPE + self.rotary = Rotary(self.head_dim, base=rope_base, train_seq_len=1024) + self.use_xsa = False # set by GPT.__init__ for deep layers only + def _xsa_efficient(self, y: Tensor, v: Tensor) -> Tensor: + """Efficient XSA: subtract self-value projection via GQA-aware reshape (no repeat_interleave). + y: [B, T, H, D], v: [B, T, Hkv, D]. H must be divisible by Hkv.""" + B, T, H, D = y.shape + Hkv = v.size(-2) + group = H // Hkv + y_g = y.reshape(B, T, Hkv, group, D) # [B, T, Hkv, group, D] + vn = F.normalize(v, dim=-1).unsqueeze(-2) # [B, T, Hkv, 1, D] — broadcast ready + proj = (y_g * vn).sum(dim=-1, keepdim=True) * vn + return (y_g - proj).reshape(B, T, H, D) + def forward(self, x: Tensor, v_embed: Tensor | None = None) -> Tensor: + bsz, seqlen, dim = x.shape + q = self.c_q(x).reshape(bsz, seqlen, self.num_heads, self.head_dim) + k = self.c_k(x).reshape(bsz, seqlen, self.num_kv_heads, self.head_dim) + v = self.c_v(x) + if v_embed is not None: + v = v + v_embed + v = v.reshape(bsz, seqlen, self.num_kv_heads, self.head_dim) + q = F.rms_norm(q, (q.size(-1),)) + k = F.rms_norm(k, (k.size(-1),)) + cos, sin = self.rotary(seqlen, x.device, q.dtype) + q = apply_rotary_emb(q, cos, sin, self.rope_dims) + k = apply_rotary_emb(k, cos, sin, self.rope_dims) + q = q * self.q_gain.to(dtype=q.dtype)[None, None, :, None] + y = flash_attn_3_func(q, k, v, causal=True) + if self.use_xsa: + y = self._xsa_efficient(y, v) + y = y.reshape(bsz, seqlen, dim) + return self.proj(y) +class SmearGate(nn.Module): + def __init__(self, dim: int): + super().__init__() + self.gate = nn.Parameter(torch.zeros(dim, dtype=torch.float32)) + def forward(self, x: Tensor) -> Tensor: + g = torch.sigmoid(self.gate.to(dtype=x.dtype))[None, None, :] + x_prev = torch.cat([torch.zeros_like(x[:, :1]), x[:, :-1]], dim=1) + return (1 - g) * x + g * x_prev +class BigramHashEmbedding(nn.Module): + def __init__(self, bigram_vocab_size: int, bigram_dim: int, model_dim: int): + super().__init__() + self.bigram_vocab_size = bigram_vocab_size + self.embed = nn.Embedding(bigram_vocab_size, bigram_dim) + nn.init.zeros_(self.embed.weight) + self.proj = CastedLinear(bigram_dim, model_dim, bias=False) if bigram_dim != model_dim else None + if self.proj is not None: + nn.init.zeros_(self.proj.weight) + self.scale = nn.Parameter(torch.tensor(0.05, dtype=torch.float32)) + def bigram_hash(self, tokens: Tensor) -> Tensor: + t = tokens.to(torch.int32) + mod = self.bigram_vocab_size - 1 + out = torch.empty_like(t) + out[..., 0] = mod + out[..., 1:] = torch.bitwise_xor(36313 * t[..., 1:], 27191 * t[..., :-1]) % mod + return out.long() + def forward(self, token_ids: Tensor) -> Tensor: + h = self.embed(self.bigram_hash(token_ids)) + if self.proj is not None: + h = self.proj(h) + return h * self.scale.to(dtype=h.dtype) +class ValueEmbedding(nn.Module): + """Reinject token identity into attention values at specific layers. + Each table maps vocab tokens to a low-dim embedding, projected to model_dim.""" + def __init__(self, vocab_size: int, ve_dim: int, model_dim: int): + super().__init__() + self.embed = nn.Embedding(vocab_size, ve_dim) + nn.init.normal_(self.embed.weight, std=0.01) + self.proj = CastedLinear(ve_dim, model_dim, bias=False) if ve_dim != model_dim else None + if self.proj is not None: + nn.init.zeros_(self.proj.weight) + self.scale = nn.Parameter(torch.tensor(0.1, dtype=torch.float32)) + def forward(self, token_ids: Tensor) -> Tensor: + h = self.embed(token_ids) + if self.proj is not None: + h = self.proj(h) + return h * self.scale.to(dtype=h.dtype) +class MLP(nn.Module): + def __init__(self, dim: int, mlp_mult: int, mlp_act: str = "relu_sq", mlp_leaky_slope: float = 0.5): + super().__init__() + hidden = int(mlp_mult * dim) + self.fc = CastedLinear(dim, hidden, bias=False) + self.proj = CastedLinear(hidden, dim, bias=False) + self.proj._zero_init = True + self.mlp_act = mlp_act + self.mlp_leaky_slope = mlp_leaky_slope + if self.mlp_act not in {"relu_sq", "leaky_relu_sq"}: + raise ValueError(f"Unsupported MLP_ACT '{self.mlp_act}'. Use 'relu_sq' or 'leaky_relu_sq'.") + def forward(self, x: Tensor) -> Tensor: + x = self.fc(x) + if self.mlp_act == "leaky_relu_sq": + x = F.leaky_relu(x, negative_slope=self.mlp_leaky_slope) + else: + x = F.relu(x) + return self.proj(x.square()) +class Block(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + num_kv_heads: int, + mlp_mult: int, + rope_base: float, + qk_gain_init: float, + layer_idx: int = 0, + ln_scale: bool = False, + dtg: bool = False, + mlp_act: str = "relu_sq", + mlp_leaky_slope: float = 0.5, + ): + super().__init__() + self.attn_norm = RMSNorm() + self.mlp_norm = RMSNorm() + self.attn = CausalSelfAttention(dim, num_heads, num_kv_heads, rope_base, qk_gain_init) + self.mlp = MLP(dim, mlp_mult, mlp_act=mlp_act, mlp_leaky_slope=mlp_leaky_slope) + self.attn_scale = nn.Parameter(torch.ones(dim, dtype=torch.float32)) + self.mlp_scale = nn.Parameter(torch.ones(dim, dtype=torch.float32)) + self.resid_mix = nn.Parameter(torch.stack((torch.ones(dim), torch.zeros(dim))).float()) + self.ln_scale_factor = 1.0 / math.sqrt(layer_idx + 1) if ln_scale else 1.0 + if dtg: + self.dtg_gate = nn.Linear(dim, 1, bias=True) + nn.init.zeros_(self.dtg_gate.weight) + nn.init.constant_(self.dtg_gate.bias, 2.0) + else: + self.dtg_gate = None + def forward(self, x: Tensor, x0: Tensor, v_embed: Tensor | None = None) -> Tensor: + mix = self.resid_mix.to(dtype=x.dtype) + x_in = mix[0][None, None, :] * x + mix[1][None, None, :] * x0 + attn_out = self.attn(self.attn_norm(x_in) * self.ln_scale_factor, v_embed=v_embed) + x_out = x_in + self.attn_scale.to(dtype=x_in.dtype)[None, None, :] * attn_out + x_out = x_out + self.mlp_scale.to(dtype=x_out.dtype)[None, None, :] * self.mlp(self.mlp_norm(x_out) * self.ln_scale_factor) + if self.dtg_gate is not None: + gate = torch.sigmoid(self.dtg_gate(x_in.detach())) + x_out = x_in + gate * (x_out - x_in) + return x_out +class GPT(nn.Module): + def __init__( + self, + vocab_size: int, + num_layers: int, + model_dim: int, + num_heads: int, + num_kv_heads: int, + mlp_mult: int, + tie_embeddings: bool, + tied_embed_init_std: float, + logit_softcap: float, + rope_base: float, + qk_gain_init: float, + mtp_num_heads: int = 0, + mtp_loss_weight: float = 0.1, + bigram_vocab_size: int = 0, + bigram_dim: int = 128, + xsa_last_n: int = 0, + rope_dims: int = 0, + ln_scale: bool = False, + dtg: bool = False, + ve_enabled: bool = False, + ve_dim: int = 128, + ve_layers: str = "9,10", + mlp_act: str = "relu_sq", + mlp_leaky_slope: float = 0.5, + f1_corr_rank: int = 0, + f1_corr_scale_init: float = 0.10, + ): + super().__init__() + self._ve_target_dim = num_kv_heads * (model_dim // num_heads) # kv_dim for value projection + if logit_softcap <= 0.0: + raise ValueError(f"logit_softcap must be positive, got {logit_softcap}") + self.tie_embeddings = tie_embeddings + self.tied_embed_init_std = tied_embed_init_std + self.logit_softcap = logit_softcap + self.mtp_num_heads = mtp_num_heads + self.mtp_loss_weight = mtp_loss_weight + self.tok_emb = nn.Embedding(vocab_size, model_dim) + self.bigram = BigramHashEmbedding(bigram_vocab_size, bigram_dim, model_dim) if bigram_vocab_size > 0 else None + self.smear = SmearGate(model_dim) + self.num_encoder_layers = num_layers // 2 + self.num_decoder_layers = num_layers - self.num_encoder_layers + self.num_skip_weights = min(self.num_encoder_layers, self.num_decoder_layers) + self.skip_weights = nn.Parameter(torch.ones(self.num_skip_weights, model_dim, dtype=torch.float32)) + self.blocks = nn.ModuleList( + [ + Block( + model_dim, + num_heads, + num_kv_heads, + mlp_mult, + rope_base, + qk_gain_init, + layer_idx=i, + ln_scale=ln_scale, + dtg=dtg, + mlp_act=mlp_act, + mlp_leaky_slope=mlp_leaky_slope, + ) + for i in range(num_layers) + ] + ) + if rope_dims > 0: + head_dim = model_dim // num_heads + for block in self.blocks: + block.attn.rope_dims = rope_dims + block.attn.rotary = Rotary(head_dim, base=rope_base, train_seq_len=1024, rope_dims=rope_dims) + self.ve_layer_indices = [int(x) for x in ve_layers.split(",") if x.strip()] if ve_enabled else [] + kv_dim = self._ve_target_dim + if self.ve_layer_indices: + self.ve_shared = ValueEmbedding(vocab_size, ve_dim, kv_dim) + self.ve_layer_scales = nn.ParameterList( + [nn.Parameter(torch.ones(1, dtype=torch.float32)) for _ in self.ve_layer_indices] + ) + else: + self.ve_shared = None + self.ve_layer_scales = nn.ParameterList() + self.value_embeds = nn.ModuleList() # keep empty for compat + self.final_norm = RMSNorm() + self.lm_head = None if tie_embeddings else CastedLinear(model_dim, vocab_size, bias=False) + if self.lm_head is not None: + self.lm_head._zero_init = True + self.mtp_heads = nn.ModuleList( + [CastedLinear(model_dim, vocab_size, bias=False) for _ in range(mtp_num_heads)] + ) + for head in self.mtp_heads: + head._zero_init = True + # Low-rank correction path for extra capacity under size budget. + self.f1_corr_rank = f1_corr_rank + if f1_corr_rank > 0: + self.f1_corr_in = CastedLinear(model_dim, f1_corr_rank, bias=False) + self.f1_corr_out = CastedLinear(f1_corr_rank, vocab_size, bias=False) + self.f1_corr_out._zero_init = True + self.f1_corr_scale = nn.Parameter(torch.tensor(f1_corr_scale_init, dtype=torch.float32)) + else: + self.f1_corr_in = None + self.f1_corr_out = None + self.f1_corr_scale = None + if xsa_last_n > 0: + for i in range(max(0, num_layers - xsa_last_n), num_layers): + self.blocks[i].attn.use_xsa = True + self._init_weights() + def _init_weights(self) -> None: + if self.tie_embeddings: + nn.init.normal_(self.tok_emb.weight, mean=0.0, std=self.tied_embed_init_std) + num_layers = len(self.blocks) + for name, module in self.named_modules(): + if isinstance(module, nn.Linear): + if getattr(module, "_zero_init", False): + nn.init.zeros_(module.weight) + elif module.weight.ndim == 2 and module.weight.shape[0] >= 64 and module.weight.shape[1] >= 64: + nn.init.orthogonal_(module.weight, gain=1.0) + if ".proj." in name or name.endswith(".proj"): + with torch.no_grad(): + module.weight.mul_(1.0 / math.sqrt(2 * num_layers)) + def _get_ve(self, layer_idx: int, input_ids: Tensor, ve_cache: dict | None = None) -> Tensor | None: + """Get value embedding for a specific layer using shared table + per-layer scale.""" + if self.ve_shared is None or layer_idx not in self.ve_layer_indices: + return None + if ve_cache is not None and 've' not in ve_cache: + ve_cache['ve'] = self.ve_shared(input_ids) + ve_base = ve_cache['ve'] if ve_cache is not None else self.ve_shared(input_ids) + ve_idx = self.ve_layer_indices.index(layer_idx) + return ve_base * self.ve_layer_scales[ve_idx].to(dtype=ve_base.dtype) + def forward(self, input_ids: Tensor, target_ids: Tensor) -> Tensor: + x = self.tok_emb(input_ids) + if self.bigram is not None: + x = x + self.bigram(input_ids) + x = F.rms_norm(x, (x.size(-1),)) + x = self.smear(x) + x0 = x + skips: list[Tensor] = [] + ve_cache: dict = {} + for i in range(self.num_encoder_layers): + ve = self._get_ve(i, input_ids, ve_cache) + x = self.blocks[i](x, x0, v_embed=ve) + skips.append(x) + for i in range(self.num_decoder_layers): + bi = self.num_encoder_layers + i + if skips: + x = x + self.skip_weights[i].to(dtype=x.dtype)[None, None, :] * skips.pop() + ve = self._get_ve(bi, input_ids, ve_cache) + x = self.blocks[bi](x, x0, v_embed=ve) + x = self.final_norm(x) + x_flat = x.reshape(-1, x.size(-1)) + targets = target_ids.reshape(-1) + if self.tie_embeddings: + logits_proj = F.linear(x_flat, self.tok_emb.weight) + else: + if self.lm_head is None: + raise RuntimeError("lm_head is required when tie_embeddings=False") + logits_proj = self.lm_head(x_flat) + if self.f1_corr_in is not None and self.f1_corr_out is not None and self.f1_corr_scale is not None: + corr_hidden = F.silu(self.f1_corr_in(x_flat)) + corr_proj = self.f1_corr_out(corr_hidden) + logits_proj = logits_proj + self.f1_corr_scale.to(dtype=logits_proj.dtype) * corr_proj + logits = self.logit_softcap * torch.tanh(logits_proj / self.logit_softcap) + if hasattr(self, '_ngram_tracker') and self._ngram_tracker is not None and self.training: + per_tok_loss = F.cross_entropy(logits.float(), targets, reduction="none") + weights = self._ngram_tracker.get_weights(input_ids, target_ids) + main_loss = (per_tok_loss * weights).mean() + else: + main_loss = F.cross_entropy(logits.float(), targets, reduction="mean") + if self.training and self.mtp_num_heads > 0 and self.mtp_loss_weight > 0.0: + _, seqlen, dim = x.shape + mtp_loss_sum = x.new_zeros(()) + mtp_loss_count = 0 + for k, mtp_head in enumerate(self.mtp_heads): + valid_t = seqlen - (k + 1) + if valid_t <= 0: + continue + mtp_hidden = x[:, :valid_t, :].reshape(-1, dim) + mtp_targets = target_ids[:, k + 1 :].reshape(-1) + mtp_logits_proj = mtp_head(mtp_hidden) + mtp_logits = self.logit_softcap * torch.tanh(mtp_logits_proj / self.logit_softcap) + mtp_loss_sum = mtp_loss_sum + F.cross_entropy(mtp_logits.float(), mtp_targets, reduction="mean") + mtp_loss_count += 1 + if mtp_loss_count > 0: + main_loss = main_loss + self.mtp_loss_weight * (mtp_loss_sum / mtp_loss_count) + return main_loss + def forward_logits(self, input_ids: Tensor) -> Tensor: + """Return logits (bsz, seq_len, vocab) without computing loss.""" + x = self.tok_emb(input_ids) + if self.bigram is not None: + x = x + self.bigram(input_ids) + x = F.rms_norm(x, (x.size(-1),)) + x = self.smear(x) + x0 = x + skips: list[Tensor] = [] + ve_cache: dict = {} + for i in range(self.num_encoder_layers): + ve = self._get_ve(i, input_ids, ve_cache) + x = self.blocks[i](x, x0, v_embed=ve) + skips.append(x) + for i in range(self.num_decoder_layers): + bi = self.num_encoder_layers + i + if skips: + x = x + self.skip_weights[i].to(dtype=x.dtype)[None, None, :] * skips.pop() + ve = self._get_ve(bi, input_ids, ve_cache) + x = self.blocks[bi](x, x0, v_embed=ve) + x = self.final_norm(x) + if self.tie_embeddings: + logits_proj = F.linear(x, self.tok_emb.weight) + else: + logits_proj = self.lm_head(x) + if self.f1_corr_in is not None and self.f1_corr_out is not None and self.f1_corr_scale is not None: + corr_hidden = F.silu(self.f1_corr_in(x)) + corr_proj = self.f1_corr_out(corr_hidden) + logits_proj = logits_proj + self.f1_corr_scale.to(dtype=logits_proj.dtype) * corr_proj + return self.logit_softcap * torch.tanh(logits_proj / self.logit_softcap) +def eval_val_sliding( + args: Hyperparameters, + base_model: nn.Module, + rank: int, + world_size: int, + device: torch.device, + val_tokens: Tensor, + base_bytes_lut: Tensor, + has_leading_space_lut: Tensor, + is_boundary_token_lut: Tensor, + stride: int, + batch_seqs: int = 128, + eval_seq_len: int | None = None, +) -> tuple[float, float]: + """Sliding window evaluation: each token scored with maximum context.""" + seq_len = eval_seq_len or args.train_seq_len + total_tokens = val_tokens.numel() - 1 + window_starts = [ws for ws in range(0, total_tokens, stride) + if min(ws + seq_len, total_tokens) - ws >= 1] + total_windows = len(window_starts) + my_s = (total_windows * rank) // world_size + my_e = (total_windows * (rank + 1)) // world_size + my_windows = window_starts[my_s:my_e] + loss_sum = torch.zeros((), device=device, dtype=torch.float64) + token_count = torch.zeros((), device=device, dtype=torch.float64) + byte_count = torch.zeros((), device=device, dtype=torch.float64) + base_model.eval() + compiled_logits = maybe_torch_compile(base_model.forward_logits, args) + with torch.inference_mode(): + for bi in range(0, len(my_windows), batch_seqs): + batch_ws = my_windows[bi:bi + batch_seqs] + bsz = len(batch_ws) + x_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + y_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + wlens: list[int] = [] + for i, ws in enumerate(batch_ws): + end = min(ws + seq_len, total_tokens) + wlen = end - ws + wlens.append(wlen) + chunk = val_tokens[ws:end + 1].to(dtype=torch.int64, device=device) + x_batch[i, :wlen] = chunk[:-1] + y_batch[i, :wlen] = chunk[1:] + with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + logits = compiled_logits(x_batch) + nll = F.cross_entropy( + logits.reshape(-1, logits.size(-1)).float(), + y_batch.reshape(-1), + reduction="none", + ).reshape(bsz, seq_len) + for i, ws in enumerate(batch_ws): + wlen = wlens[i] + s = 0 if ws == 0 else max(wlen - stride, 0) + scored_nll = nll[i, s:wlen].to(torch.float64) + loss_sum += scored_nll.sum() + token_count += float(wlen - s) + tgt = y_batch[i, s:wlen] + prev = x_batch[i, s:wlen] + tb = base_bytes_lut[tgt].to(torch.float64) + tb += (has_leading_space_lut[tgt] & ~is_boundary_token_lut[prev]).to(torch.float64) + byte_count += tb.sum() + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(loss_sum, op=dist.ReduceOp.SUM) + dist.all_reduce(token_count, op=dist.ReduceOp.SUM) + dist.all_reduce(byte_count, op=dist.ReduceOp.SUM) + val_loss = (loss_sum / token_count).item() + bits_per_token = val_loss / math.log(2.0) + tokens_per_byte = token_count.item() / byte_count.item() + base_model.train() + return val_loss, bits_per_token * tokens_per_byte +def _ngram_bulk_update(val_np, start, end, ctx_tables, full_tables, + min_order, max_order, primes, mask): + """Bulk update n-gram tables with a contiguous range of tokens. + All ranks call this with the SAME token range -> identical tables everywhere.""" + t = val_np[start:end].astype(np.uint64) + n = len(t) + for order in range(min_order, max_order + 1): + if n < order: + continue + ctx_width = order - 1 + ctx_hash = np.zeros(n - order + 1, dtype=np.uint64) + for k in range(ctx_width): + ctx_hash ^= t[k:n - order + 1 + k] * primes[k % len(primes)] + ctx_key = (ctx_hash & mask).astype(np.int64) + tgt = t[order - 1:] + full_key = ((ctx_hash ^ (tgt * primes[ctx_width % len(primes)])) & mask).astype(np.int64) + ctx_tables[order] += np.bincount(ctx_key, minlength=len(ctx_tables[order])).astype(np.uint32) + full_tables[order] += np.bincount(full_key, minlength=len(full_tables[order])).astype(np.uint32) + +def eval_val_sliding_hashed_ngram( + args: Hyperparameters, + base_model: nn.Module, + rank: int, + world_size: int, + device: torch.device, + val_tokens: Tensor, + base_bytes_lut: Tensor, + has_leading_space_lut: Tensor, + is_boundary_token_lut: Tensor, + stride: int, + order: int, + alpha: float, + min_count: int, + buckets: int, + max_seconds: float = 0.0, + batch_seqs: int = 128, + eval_seq_len: int | None = None, +) -> tuple[float, float, float]: + """Score-first sliding eval with chunk-based SHARED n-gram tables + cubric. + + Key design: all ranks share identical n-gram tables via bulk chunk updates. + Each chunk's windows are distributed across ranks for scoring, then ALL ranks + update tables with the same contiguous token range. Every rank sees the full + n-gram picture (not 1/world_size like per-segment updates). + + Legal: entire chunk scored before its tokens update the tables. + """ + min_order = max(args.ngram_eval_min_order, 2) + max_order = max(order, min_order) + adaptive = args.ngram_eval_adaptive + alpha_min = args.ngram_eval_alpha_min + alpha_max = args.ngram_eval_alpha_max + ent_center = args.ngram_eval_entropy_center + ent_scale = args.ngram_eval_entropy_scale + + # Parse fixed per-order multipliers (PR #809 style) + _fixed_order_mults = None + if args.ngram_order_mults_str: + _fixed_order_mults = np.array([float(x) for x in args.ngram_order_mults_str.split(",")], dtype=np.float64) + + seq_len = eval_seq_len or args.train_seq_len + total_tokens = val_tokens.numel() - 1 + + # Build all windows and total scored tokens + all_window_starts = [ws for ws in range(0, total_tokens, stride) if min(ws + seq_len, total_tokens) - ws >= 1] + total_scored_tokens = 0.0 + for ws in all_window_starts: + end = min(ws + seq_len, total_tokens) + wlen = end - ws + s = 0 if ws == 0 else max(wlen - stride, 0) + total_scored_tokens += float(max(wlen - s, 0)) + + # Group windows into chunks by scored position -- all ranks share this grouping + chunk_tokens = int(os.environ.get("NGRAM_CHUNK_TOKENS", "1048576")) # 1M default + num_chunks = (total_tokens + chunk_tokens - 1) // chunk_tokens + chunk_windows: list[list[int]] = [[] for _ in range(num_chunks)] + for ws in all_window_starts: + end = min(ws + seq_len, total_tokens) + wlen = end - ws + s = 0 if ws == 0 else max(wlen - stride, 0) + scored_start = ws + s + ci = min(scored_start // chunk_tokens, num_chunks - 1) + chunk_windows[ci].append(ws) + + val_np = val_tokens.numpy() + ctx_tables = {n: np.zeros((buckets,), dtype=np.uint32) for n in range(min_order, max_order + 1)} + full_tables = {n: np.zeros((buckets,), dtype=np.uint32) for n in range(min_order, max_order + 1)} + mask = np.uint64(buckets - 1) + primes = np.array( + [np.uint64(36313), np.uint64(27191), np.uint64(51647), np.uint64(81929), + np.uint64(131071), np.uint64(174763), np.uint64(233017), np.uint64(283721), + np.uint64(347237)], + dtype=np.uint64, + ) + + loss_sum = 0.0 + token_count = 0.0 + byte_count = 0.0 + + # Cubric 3D: per (order × entropy_bin × count_bin) adaptive alpha scaling + _NUM_ENT_BINS = 3 # low / mid / high entropy + _NUM_CNT_BINS = 3 # low / mid / high count + _ENT_EDGES = np.array([ent_center - 1.0, ent_center + 1.0]) # [2.0, 4.0] for center=3.0 + _CNT_EDGES = np.array([5.0, 50.0]) # low=<5, mid=5-50, high=>50 context count + _TOTAL_CELLS = _NUM_ENT_BINS * _NUM_CNT_BINS # 9 cells per order = 54 total + _cc = getattr(args, 'cubric_cadence', 0); _con = _cc > 0; _cfired = 0 + if _con: + # Warm-start: proven converged values from 4+ runs (orders 2-7) + # All 9 cells per order get the same warm-start, 3D cubric refines from there + _WARM = {2: 0.45, 3: 0.30, 4: 0.45, 5: 1.88, 6: 2.00, 7: 2.00, 8: 2.00, 9: 2.00} + _c_alpha_mult = {n: [_WARM.get(n, 1.0)] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + _c_hits = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + _c_beats = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + + base_model.eval() + compiled_logits = maybe_torch_compile(base_model.forward_logits, args) + t0 = time.perf_counter() + deadline = (t0 + max_seconds) if max_seconds > 0.0 else None + cutoff_hit = False + + if rank == 0: + print(f"ngram_eval:chunks={num_chunks} chunk_tokens={chunk_tokens} " + f"windows={len(all_window_starts)} shared_tables=True", flush=True) + + with torch.inference_mode(): + for ci in range(num_chunks): + if deadline is not None and time.perf_counter() >= deadline: + cutoff_hit = True + break + + windows = chunk_windows[ci] + if not windows: + continue + + # Distribute this chunk's windows across ranks + my_s = (len(windows) * rank) // world_size + my_e = (len(windows) * (rank + 1)) // world_size + my_windows = windows[my_s:my_e] + + # --- Phase 1: SCORE this chunk's windows --- + for bi in range(0, len(my_windows), batch_seqs): + batch_ws = my_windows[bi:bi + batch_seqs] + bsz = len(batch_ws) + x_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + y_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + wlens: list[int] = [] + for i, ws in enumerate(batch_ws): + end = min(ws + seq_len, total_tokens) + wlen = end - ws + wlens.append(wlen) + chunk = val_tokens[ws:end + 1].to(dtype=torch.int64, device=device) + x_batch[i, :wlen] = chunk[:-1] + y_batch[i, :wlen] = chunk[1:] + + with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + logits = compiled_logits(x_batch) + logits_f = logits.float() + nll = F.cross_entropy( + logits_f.reshape(-1, logits_f.size(-1)), + y_batch.reshape(-1), + reduction="none", + ).reshape(bsz, seq_len) + + for i, ws in enumerate(batch_ws): + wlen = wlens[i] + s = 0 if ws == 0 else max(wlen - stride, 0) + seg_len = wlen - s + if seg_len <= 0: + continue + + seg_nll = nll[i, s:wlen].to(torch.float64).cpu().numpy() + seg_model_p = np.exp(-seg_nll) + + if adaptive: + log_probs = F.log_softmax(logits_f[i, s:wlen], dim=-1) + probs_a = log_probs.exp() + entropy = -(probs_a * log_probs).sum(dim=-1).cpu().numpy() + sig = 1.0 / (1.0 + np.exp(-ent_scale * (entropy - ent_center))) + per_token_alpha = alpha_min + (alpha_max - alpha_min) * sig + # Bin entropy for 2D cubric: 0=low, 1=mid, 2=high + _ent_bins = np.digitize(entropy, _ENT_EDGES).astype(np.int32) + else: + per_token_alpha = np.full(seg_len, alpha) + _ent_bins = np.ones(seg_len, dtype=np.int32) # all mid + + global_j = np.arange(ws + s + 1, ws + wlen + 1, dtype=np.int64) + p_ng = np.zeros(seg_len, dtype=np.float64) + ng_matched = np.zeros(seg_len, dtype=np.bool_) + _ng_ord = np.zeros(seg_len, dtype=np.int32) + _ng_ctx_count = np.zeros(seg_len, dtype=np.float64) + tgt_np = val_np[global_j].astype(np.uint64) + + for n in range(max_order, min_order - 1, -1): + ctx_width = n - 1 + valid = (global_j >= ctx_width) & (~ng_matched) + if not valid.any(): + continue + v_idx = np.nonzero(valid)[0] + jv = global_j[v_idx] + ctx_hash = np.zeros(len(jv), dtype=np.uint64) + for k in range(ctx_width): + tok = val_np[jv - (ctx_width - k)].astype(np.uint64) + ctx_hash ^= tok * primes[k % len(primes)] + ctx_key = (ctx_hash & mask).astype(np.int64) + full_key = ((ctx_hash ^ (tgt_np[v_idx] * primes[ctx_width % len(primes)])) & mask).astype(np.int64) + ctx_counts = ctx_tables[n][ctx_key].astype(np.float64) + full_counts = full_tables[n][full_key].astype(np.float64) + has_data = ctx_counts >= float(min_count) + if has_data.any(): + p = np.minimum(full_counts, ctx_counts) / np.maximum(ctx_counts, 1.0) + p = np.clip(p, 0.0, 1.0) + hit_idx = v_idx[has_data] + p_ng[hit_idx] = p[has_data] + ng_matched[hit_idx] = True + _ng_ord[hit_idx] = n + _ng_ctx_count[hit_idx] = ctx_counts[has_data] + + # Oracle alpha: use actual model_p vs ngram_p comparison + if ng_matched.any(): + m_idx = np.nonzero(ng_matched)[0] + mp = seg_model_p[m_idx] + np_val = p_ng[m_idx] + # Soft oracle: sigmoid on log-ratio, steepness=8 + log_ratio = np.log(np.maximum(np_val, 1e-12)) - np.log(np.maximum(mp, 1e-12)) + a = 0.95 / (1.0 + np.exp(-8.0 * log_ratio)) + seg_model_p[m_idx] = (1.0 - a) * mp + a * np_val + + seg_nll = -np.log(np.clip(seg_model_p, 1e-12, 1.0)) + loss_sum += float(seg_nll.sum()) + token_count += float(seg_len) + tgt = y_batch[i, s:wlen] + prev = x_batch[i, s:wlen] + tb = base_bytes_lut[tgt].to(torch.float64) + tb += (has_leading_space_lut[tgt] & ~is_boundary_token_lut[prev]).to(torch.float64) + byte_count += float(tb.sum().item()) + + # --- Phase 2: SHARED UPDATE -- all ranks update with same chunk tokens --- + chunk_start = ci * chunk_tokens + chunk_end = min((ci + 1) * chunk_tokens, total_tokens) + _ngram_bulk_update(val_np, chunk_start, chunk_end + 1, + ctx_tables, full_tables, min_order, max_order, + primes, mask) + + # Cubric 2D c-step: adapt per (order × entropy_bin) + if _con: + # Collect all (order, ent_bin, cnt_bin) cells with enough data + all_rates = [] + for n in range(min_order, max_order + 1): + for cell in range(_TOTAL_CELLS): + if _c_hits[n][cell] >= 8: + all_rates.append(_c_beats[n][cell] / _c_hits[n][cell]) + if len(all_rates) >= 4: + avg_rate = sum(all_rates) / len(all_rates) + for n in range(min_order, max_order + 1): + for cell in range(_TOTAL_CELLS): + if _c_hits[n][cell] >= 8: + rate = _c_beats[n][cell] / _c_hits[n][cell] + if rate > avg_rate + 0.05: + _c_alpha_mult[n][cell] = min(_c_alpha_mult[n][cell] * 1.03, 2.0) + elif rate < avg_rate - 0.05: + _c_alpha_mult[n][cell] = max(_c_alpha_mult[n][cell] * 0.97, 0.3) + _cfired += 1 + if rank == 0 and _cfired % 8 == 0: + parts = [] + for n in range(min_order, max_order + 1): + m = _c_alpha_mult[n] + avg_m = sum(m) / len(m) + parts.append(f"o{n}:avg={avg_m:.2f}") + print(f"cubric3d:step={_cfired} {' '.join(parts)}", flush=True) + _c_hits = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + _c_beats = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + + # Progress + if rank == 0 and (ci % 10 == 0 or ci == num_chunks - 1 or ci < 3): + elapsed = time.perf_counter() - t0 + cur_bpb = (loss_sum / max(token_count, 1.0)) / math.log(2.0) * (token_count / max(byte_count, 1.0)) if token_count > 0 else 0.0 + print( + f"ngram_eval:chunk [{ci+1}/{num_chunks}] bpb={cur_bpb:.6f} t={elapsed:.0f}s", + flush=True, + ) + + # All-reduce across ranks + _loss = torch.tensor(loss_sum, device=device, dtype=torch.float64) + _toks = torch.tensor(token_count, device=device, dtype=torch.float64) + _bytes = torch.tensor(byte_count, device=device, dtype=torch.float64) + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(_loss, op=dist.ReduceOp.SUM) + dist.all_reduce(_toks, op=dist.ReduceOp.SUM) + dist.all_reduce(_bytes, op=dist.ReduceOp.SUM) + loss_sum = _loss.item() + token_count = _toks.item() + byte_count = _bytes.item() + + coverage = token_count / max(total_scored_tokens, 1.0) + if cutoff_hit: + elapsed = time.perf_counter() - t0 + print( + f"ngram_eval:cutoff max_seconds={max_seconds:.1f} " + f"coverage={coverage*100:.2f}% elapsed={elapsed:.0f}s", + flush=True, + ) + + if _con and rank == 0: + print(f"cubric3d:final c_steps={_cfired} cells={_TOTAL_CELLS}x{max_order-min_order+1}={_TOTAL_CELLS*(max_order-min_order+1)}", flush=True) + for n in range(min_order, max_order + 1): + m = _c_alpha_mult[n] + row = " ".join(f"{m[cell]:.2f}" for cell in range(_TOTAL_CELLS)) + print(f" o{n}: [{row}]", flush=True) + val_loss = loss_sum / max(token_count, 1.0) + val_bpb = val_loss / math.log(2.0) * (token_count / max(byte_count, 1.0)) + base_model.train() + return val_loss, val_bpb, coverage +def _classify_param(name: str) -> str: + if "tok_emb" in name or "lm_head" in name: + return "embed" + if "f1_corr_in" in name or "f1_corr_out" in name: + return "aux" + if ".mlp." in name: + return "mlp" + if ".attn." in name or (".proj." in name and ".mlp." not in name): + return "attn" + return "other" +# --------------------------------------------------------------------------- +# GPTQ: Hessian-aware quantization with column-wise error compensation +# --------------------------------------------------------------------------- +def _find_best_row_scales(W: Tensor, clip_range: int = 31) -> Tensor: + """Find optimal per-row scales by searching percentile clipping thresholds.""" + t32 = W.float() + best_s = t32.abs().amax(dim=1) / clip_range + best_s = best_s.clamp_min(1.0 / clip_range) + best_err = torch.full((t32.shape[0],), float('inf')) + for pct in [0.9990, 0.9995, 0.9999, 0.99999, 1.0]: + if pct < 1.0: + row_clip = torch.quantile(t32.abs(), pct, dim=1) + else: + row_clip = t32.abs().amax(dim=1) + s = (row_clip / clip_range).clamp_min(1.0 / clip_range) + q = torch.clamp(torch.round(t32 / s[:, None]), -clip_range, clip_range) + recon = q * s[:, None] + err = (t32 - recon).pow(2).mean(dim=1) + improved = err < best_err + best_s[improved] = s[improved] + best_err[improved] = err[improved] + return best_s +def gptq_quantize_weight(W: Tensor, H: Tensor, clip_range: int = 31, + block_size: int = 64, percdamp: float = 0.002) -> tuple[Tensor, Tensor]: + """GPTQ: quantize weight matrix W using Hessian H = X^T X for error compensation. + Uses pre-computed per-row scales and column reordering by Hessian diagonal. + Returns (quantized_int8, scale_fp16) in int6 range [-clip_range, clip_range].""" + W = W.float().clone() + rows, cols = W.shape + # Pre-compute optimal per-row scales from the original weight matrix + row_scale = _find_best_row_scales(W, clip_range) + H = H.float().clone() + damp = percdamp * H.diag().mean() + H.diagonal().add_(damp) + # Column reordering: process least-important columns first (ascending H_diag) + perm = torch.argsort(H.diag()) + invperm = torch.argsort(perm) + W = W[:, perm] + H = H[perm][:, perm] + try: + L = torch.linalg.cholesky(H) + Hinv = torch.cholesky_inverse(L) + except torch._C._LinAlgError: + Hinv = torch.diag(1.0 / H.diag().clamp_min(1e-6)) + Q = torch.zeros(rows, cols, dtype=torch.int8) + for i1 in range(0, cols, block_size): + i2 = min(i1 + block_size, cols) + W_block = W[:, i1:i2].clone() + Hinv_block = Hinv[i1:i2, i1:i2] + Err = torch.zeros_like(W_block) + for j in range(i2 - i1): + w_col = W_block[:, j] + h_inv_jj = Hinv_block[j, j].clamp_min(1e-8) + # Quantize using pre-computed per-row scales + q_col = torch.clamp(torch.round(w_col / row_scale), -clip_range, clip_range) + deq_col = q_col * row_scale + Q[:, i1 + j] = q_col.to(torch.int8) + err = (w_col - deq_col) / h_inv_jj + Err[:, j] = err + if j + 1 < i2 - i1: + W_block[:, j + 1:] -= err.unsqueeze(1) * Hinv_block[j, j + 1:].unsqueeze(0) + if i2 < cols: + W[:, i2:] -= Err @ Hinv[i1:i2, i2:] + # Undo column reordering + Q = Q[:, invperm] + return Q, row_scale.to(torch.float16) +def gptq_calibrate(model: nn.Module, train_pattern: str, device: torch.device, + n_samples: int = 256, seq_len: int = 2048) -> dict[str, Tensor]: + """Collect Hessian H = X^T X for each linear layer using training data.""" + hessians: dict[str, Tensor] = {} + n_seen: dict[str, int] = {} + hooks = [] + def make_hook(name: str): + def hook_fn(module, inp, out): + x = inp[0].detach().float() + if x.ndim == 3: + x = x.reshape(-1, x.shape[-1]) + if name not in hessians: + hessians[name] = torch.zeros(x.shape[1], x.shape[1], device=x.device, dtype=torch.float32) + n_seen[name] = 0 + hessians[name].addmm_(x.t(), x) + n_seen[name] += x.shape[0] + return hook_fn + for name, module in model.named_modules(): + if isinstance(module, (nn.Linear, CastedLinear)): + hooks.append(module.register_forward_hook(make_hook(name))) + stream = TokenStream(train_pattern) + model.eval() + with torch.no_grad(): + for _ in range(n_samples): + tokens = stream.take(seq_len + 1).to(device=device, dtype=torch.int64) + x = tokens[:-1].unsqueeze(0) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + model.forward_logits(x) + for h in hooks: + h.remove() + for name in hessians: + hessians[name] /= max(n_seen[name], 1) + return hessians +def mixed_quantize_int6_gptq(state_dict: dict[str, Tensor], int6_cats: set[str], + hessians: dict[str, Tensor]) -> tuple[dict, dict]: + """Like mixed_quantize_int6 but uses GPTQ for int6 categories when Hessian available.""" + result: dict[str, Tensor] = {} + meta: dict[str, object] = {} + gptq_count, naive_count = 0, 0 + for name, tensor in state_dict.items(): + t = tensor.detach().cpu().contiguous() + cat = _classify_param(name) + if not t.is_floating_point() or t.numel() <= 65536: + result[name] = t.to(torch.float16) if t.is_floating_point() else t + meta[name] = "passthrough" + continue + if any(p in name for p in CONTROL_TENSOR_NAME_PATTERNS): + result[name] = t.float() + meta[name] = "passthrough_ctrl" + continue + if cat in int6_cats and t.ndim == 2: + module_name = name.rsplit(".weight", 1)[0] if name.endswith(".weight") else name + H = hessians.get(module_name) + if H is not None and H.shape[0] == t.shape[1]: + q, s = gptq_quantize_weight(t, H.cpu()) + gptq_count += 1 + else: + q, s = quantize_int6_per_row(t) + naive_count += 1 + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int6"} + elif cat in int6_cats and t.ndim >= 1: + q, s = quantize_int6_per_row(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int6"} + naive_count += 1 + else: + q, s = quantize_float_tensor(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int8"} + print(f"gptq_quantize: {gptq_count} GPTQ layers, {naive_count} naive layers", flush=True) + return result, meta +def quantize_int6_per_row(t: Tensor, clip_range: int = 31) -> tuple[Tensor, Tensor]: + t32 = t.float() + if t32.ndim == 2: + best_q, best_s, best_err = None, None, float('inf') + for pct in [0.9990, 0.9995, 0.9999, 0.99999, 1.0]: + if pct < 1.0: + row_clip = torch.quantile(t32.abs(), pct, dim=1) + else: + row_clip = t32.abs().amax(dim=1) + s = (row_clip / clip_range).clamp_min(1.0 / clip_range).to(torch.float16) + q = torch.clamp(torch.round(t32 / s.float()[:, None]), -clip_range, clip_range).to(torch.int8) + recon = q.float() * s.float()[:, None] + err = (t32 - recon).pow(2).mean().item() + if err < best_err: + best_q, best_s, best_err = q, s, err + return best_q, best_s + amax = t32.abs().max().item() + scale = torch.tensor(amax / clip_range if amax > 0 else 1.0, dtype=torch.float16) + q = torch.clamp(torch.round(t32 / scale.float()), -clip_range, clip_range).to(torch.int8) + return q, scale +def mixed_quantize_int6(state_dict: dict[str, Tensor], int6_cats: set[str]): + num_layers_total = max( + (int(k.split(".")[1]) for k in state_dict if k.startswith("blocks.")), + default=0, + ) + 1 + late_k_layers = set(range(num_layers_total - 2, num_layers_total)) + result: dict[str, Tensor] = {} + meta: dict[str, object] = {} + for name, tensor in state_dict.items(): + t = tensor.detach().cpu().contiguous() + cat = _classify_param(name) + if not t.is_floating_point() or t.numel() <= 65536: + result[name] = t.to(torch.float16) if t.is_floating_point() else t + meta[name] = "passthrough" + continue + if any(p in name for p in CONTROL_TENSOR_NAME_PATTERNS): + result[name] = t.float() + meta[name] = "passthrough_ctrl" + continue + if cat in int6_cats and t.ndim >= 1: + q, s = quantize_int6_per_row(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int6"} + else: + q, s = quantize_float_tensor(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int8"} + return result, meta +def dequantize_mixed_int6(result: dict[str, Tensor], meta: dict[str, object], + template_sd: dict[str, Tensor]) -> dict[str, Tensor]: + out: dict[str, Tensor] = {} + for name, orig in template_sd.items(): + info = meta.get(name) + if info is None: + continue + orig_dtype = orig.dtype + if info in ("passthrough", "passthrough_ctrl", "passthrough_fp16"): + t = result[name] + if t.dtype == torch.float16 and orig_dtype in (torch.float32, torch.bfloat16): + t = t.to(orig_dtype) + out[name] = t + continue + q, s = result[name + ".q"], result[name + ".scale"] + if s.ndim > 0: + out[name] = (q.float() * s.float().view(q.shape[0], *([1] * (q.ndim - 1)))).to(orig_dtype) + else: + out[name] = (q.float() * float(s.item())).to(orig_dtype) + return out +def main() -> None: + global zeropower_via_newtonschulz5 + code = Path(__file__).read_text(encoding="utf-8") + args = Hyperparameters() + if args.compile_enabled: + zeropower_via_newtonschulz5 = torch.compile(zeropower_via_newtonschulz5) + distributed = "RANK" in os.environ and "WORLD_SIZE" in os.environ + rank = int(os.environ.get("RANK", "0")) + world_size = int(os.environ.get("WORLD_SIZE", "1")) + local_rank = int(os.environ.get("LOCAL_RANK", "0")) + if world_size <= 0: + raise ValueError(f"WORLD_SIZE must be positive, got {world_size}") + if 8 % world_size != 0: + raise ValueError(f"WORLD_SIZE={world_size} must divide 8 so grad_accum_steps stays integral") + grad_accum_steps = 8 // world_size + grad_scale = 1.0 / grad_accum_steps + if not torch.cuda.is_available(): + raise RuntimeError("CUDA is required") + device = torch.device("cuda", local_rank) + torch.cuda.set_device(device) + if distributed: + dist.init_process_group(backend="nccl", device_id=device) + dist.barrier() + master_process = rank == 0 + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True + from torch.backends.cuda import enable_cudnn_sdp, enable_flash_sdp, enable_math_sdp, enable_mem_efficient_sdp + enable_cudnn_sdp(False) + enable_flash_sdp(True) + enable_mem_efficient_sdp(False) + enable_math_sdp(False) + logfile = None + if master_process: + os.makedirs("logs", exist_ok=True) + logfile = f"logs/{args.run_id}.txt" + print(logfile) + def log0(msg: str, console: bool = True) -> None: + if not master_process: + return + if console: + print(msg) + if logfile is not None: + with open(logfile, "a", encoding="utf-8") as f: + print(msg, file=f) + log0(code, console=False) + log0("=" * 100, console=False) + log0(f"Running Python {sys.version}", console=False) + log0(f"Running PyTorch {torch.__version__}", console=False) + log0( + subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=False).stdout, + console=False, + ) + log0("=" * 100, console=False) + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.cuda.manual_seed_all(args.seed) + if not args.tokenizer_path.endswith(".model"): + raise ValueError(f"Script only setup for SentencePiece .model file: {args.tokenizer_path}") + sp = spm.SentencePieceProcessor(model_file=args.tokenizer_path) + if int(sp.vocab_size()) != args.vocab_size: + raise ValueError( + f"VOCAB_SIZE={args.vocab_size} does not match tokenizer vocab_size={int(sp.vocab_size())}" + ) + dataset_dir = Path(args.data_path).resolve() + actual_train_files = len(list(dataset_dir.glob("fineweb_train_*.bin"))) + effective_eval_seq_len = args.eval_seq_len if args.eval_seq_len > 0 else args.train_seq_len + val_seq_len = max(args.train_seq_len, effective_eval_seq_len) + val_tokens = load_validation_tokens(args.val_files, val_seq_len) + base_bytes_lut, has_leading_space_lut, is_boundary_token_lut = build_sentencepiece_luts( + sp, args.vocab_size, device + ) + log0(f"val_bpb:enabled tokenizer_kind=sentencepiece tokenizer_path={args.tokenizer_path}") + log0(f"train_loader:dataset:{dataset_dir.name} train_shards:{actual_train_files}") + log0(f"val_loader:shards pattern={args.val_files} tokens:{val_tokens.numel() - 1}") + CastedLinear._qat_enabled = args.qat_enabled + base_model = GPT( + vocab_size=args.vocab_size, + num_layers=args.num_layers, + model_dim=args.model_dim, + num_heads=args.num_heads, + num_kv_heads=args.num_kv_heads, + mlp_mult=args.mlp_mult, + tie_embeddings=args.tie_embeddings, + tied_embed_init_std=args.tied_embed_init_std, + logit_softcap=args.logit_softcap, + rope_base=args.rope_base, + qk_gain_init=args.qk_gain_init, + mtp_num_heads=args.mtp_num_heads, + mtp_loss_weight=args.mtp_loss_weight, + bigram_vocab_size=args.bigram_vocab_size, + bigram_dim=args.bigram_dim, + xsa_last_n=args.xsa_last_n, + rope_dims=args.rope_dims, + ln_scale=args.ln_scale, + dtg=args.dtg_enabled, + ve_enabled=args.ve_enabled, + ve_dim=args.ve_dim, + ve_layers=args.ve_layers, + mlp_act=args.mlp_act, + mlp_leaky_slope=args.mlp_leaky_slope, + f1_corr_rank=args.f1_corr_rank, + f1_corr_scale_init=args.f1_corr_scale_init, + ).to(device).bfloat16() + for module in base_model.modules(): + if isinstance(module, CastedLinear): + module.float() + restore_low_dim_params_to_fp32(base_model) + # Complementary training: downweight tokens predictable by bigrams + complement_alpha = float(os.environ.get("COMPLEMENT_ALPHA", "0")) + if complement_alpha > 0: + tracker = TrainNgramTracker(args.vocab_size, device, complement_alpha=complement_alpha) + base_model._ngram_tracker = tracker + log0(f"complementary_training:alpha={complement_alpha}") + else: + base_model._ngram_tracker = None + compiled_model = maybe_torch_compile(base_model, args) + model: nn.Module = DDP(compiled_model, device_ids=[local_rank], broadcast_buffers=False) if distributed else compiled_model + block_named_params = list(base_model.blocks.named_parameters()) + matrix_params = [ + p + for name, p in block_named_params + if p.ndim == 2 and not any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS) + ] + if base_model.mtp_num_heads > 0: + matrix_params.extend([p for p in base_model.mtp_heads.parameters() if p.ndim == 2]) + if base_model.f1_corr_in is not None and base_model.f1_corr_out is not None: + matrix_params.append(base_model.f1_corr_in.weight) + matrix_params.append(base_model.f1_corr_out.weight) + scalar_params = [ + p + for name, p in block_named_params + if p.ndim < 2 or any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS) + ] + if base_model.skip_weights.numel() > 0: + scalar_params.append(base_model.skip_weights) + scalar_params.append(base_model.smear.gate) + if base_model.bigram is not None: + scalar_params.append(base_model.bigram.scale) + if base_model.f1_corr_scale is not None: + scalar_params.append(base_model.f1_corr_scale) + token_lr = args.tied_embed_lr if args.tie_embeddings else args.embed_lr + tok_params = [{"params": [base_model.tok_emb.weight], "lr": token_lr, "base_lr": token_lr}] + if base_model.bigram is not None: + tok_params.append({"params": [base_model.bigram.embed.weight], "lr": token_lr, "base_lr": token_lr}) + if base_model.bigram.proj is not None: + matrix_params.append(base_model.bigram.proj.weight) + if base_model.ve_shared is not None: + tok_params.append({"params": [base_model.ve_shared.embed.weight], "lr": token_lr, "base_lr": token_lr}) + if base_model.ve_shared.proj is not None: + matrix_params.append(base_model.ve_shared.proj.weight) + scalar_params.append(base_model.ve_shared.scale) + for s in base_model.ve_layer_scales: + scalar_params.append(s) + optimizer_tok = torch.optim.AdamW( + tok_params, + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + weight_decay=args.adam_wd, + fused=True, + ) + optimizer_muon = Muon( + matrix_params, + lr=args.matrix_lr, + momentum=args.muon_momentum, + backend_steps=args.muon_backend_steps, + weight_decay=args.muon_wd, + ) + for group in optimizer_muon.param_groups: + group["base_lr"] = args.matrix_lr + optimizer_scalar = torch.optim.AdamW( + [{"params": scalar_params, "lr": args.scalar_lr, "base_lr": args.scalar_lr}], + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + weight_decay=args.adam_wd, + fused=True, + ) + optimizers: list[torch.optim.Optimizer] = [optimizer_tok, optimizer_muon, optimizer_scalar] + if base_model.lm_head is not None: + optimizer_head = torch.optim.Adam( + [{"params": [base_model.lm_head.weight], "lr": args.head_lr, "base_lr": args.head_lr}], + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + fused=True, + ) + optimizers.insert(1, optimizer_head) + n_params = sum(p.numel() for p in base_model.parameters()) + f1_corr_params = 0 + if base_model.f1_corr_in is not None and base_model.f1_corr_out is not None: + f1_corr_params = int(base_model.f1_corr_in.weight.numel() + base_model.f1_corr_out.weight.numel()) + est_corr_int6_bytes = 0 + if args.f1_corr_rank > 0: + # int8 payload stores int6 values + per-row fp16 scales. + est_corr_int6_bytes = ( + args.f1_corr_rank * (args.model_dim + args.vocab_size) + + 2 * (args.f1_corr_rank + args.vocab_size) + ) + log0(f"model_params:{n_params}") + log0( + f"f1_corr:rank={args.f1_corr_rank} params={f1_corr_params} " + f"est_int6_bytes~{est_corr_int6_bytes}" + ) + log0(f"mlp_act:{args.mlp_act} mlp_leaky_slope:{args.mlp_leaky_slope}") + log0(f"XSA:last_{args.xsa_last_n} world_size:{world_size} grad_accum_steps:{grad_accum_steps}") + log0(f"num_heads:{args.num_heads} num_kv_heads:{args.num_kv_heads} embed_lr:{token_lr} matrix_lr:{args.matrix_lr}") + log0( + f"train_batch_tokens:{args.train_batch_tokens} train_seq_len:{args.train_seq_len} " + f"iterations:{args.iterations} warmup_steps:{args.warmup_steps} " + f"max_wallclock_seconds:{args.max_wallclock_seconds:.3f}" + ) + log0(f"compile:enabled={int(args.compile_enabled)} fullgraph={int(args.compile_fullgraph)}") + log0(f"seed:{args.seed}") + if args.ngram_eval_order >= 2: + log0( + f"ngram_eval:order={args.ngram_eval_order} alpha={args.ngram_eval_alpha} " + f"min_count={args.ngram_eval_min_count} buckets={args.ngram_eval_buckets}" + ) + train_loader = DistributedTokenLoader(args.train_files, rank, world_size, device) + def zero_grad_all() -> None: + for opt in optimizers: + opt.zero_grad(set_to_none=True) + max_wallclock_ms = 1000.0 * args.max_wallclock_seconds if args.max_wallclock_seconds > 0 else None + def lr_mul(step: int, elapsed_ms: float) -> float: + if args.warmdown_iters <= 0: + return 1.0 + if max_wallclock_ms is None: + warmdown_start = max(args.iterations - args.warmdown_iters, 0) + return max((args.iterations - step) / max(args.warmdown_iters, 1), 0.0) if warmdown_start <= step < args.iterations else 1.0 + step_ms = elapsed_ms / max(step, 1) + warmdown_ms = args.warmdown_iters * step_ms + remaining_ms = max(max_wallclock_ms - elapsed_ms, 0.0) + return remaining_ms / max(warmdown_ms, 1e-9) if remaining_ms <= warmdown_ms else 1.0 + if args.warmup_steps > 0: + initial_model_state = {name: tensor.detach().cpu().clone() for name, tensor in base_model.state_dict().items()} + initial_optimizer_states = [copy.deepcopy(opt.state_dict()) for opt in optimizers] + model.train() + for warmup_step in range(args.warmup_steps): + zero_grad_all() + for micro_step in range(grad_accum_steps): + if distributed: + model.require_backward_grad_sync = micro_step == grad_accum_steps - 1 + x, y = train_loader.next_batch(args.train_batch_tokens, args.train_seq_len, grad_accum_steps) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + warmup_loss = model(x, y) + (warmup_loss * grad_scale).backward() + for opt in optimizers: + opt.step() + zero_grad_all() + if args.warmup_steps <= 20 or (warmup_step + 1) % 10 == 0 or warmup_step + 1 == args.warmup_steps: + log0(f"warmup_step:{warmup_step + 1}/{args.warmup_steps}") + base_model.load_state_dict(initial_model_state, strict=True) + for opt, state in zip(optimizers, initial_optimizer_states, strict=True): + opt.load_state_dict(state) + zero_grad_all() + if distributed: + model.require_backward_grad_sync = True + train_loader = DistributedTokenLoader(args.train_files, rank, world_size, device) + swa_state: dict[str, Tensor] | None = None + swa_count = 0 + ema_state = {name: t.detach().float().clone() for name, t in base_model.state_dict().items()} + ema_decay = 0.997 + training_time_ms = 0.0 + stop_after_step: int | None = None + torch.cuda.synchronize() + t0 = time.perf_counter() + step = 0 + while True: + last_step = step == args.iterations or (stop_after_step is not None and step >= stop_after_step) + should_validate = last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0) + if should_validate: + torch.cuda.synchronize() + training_time_ms += 1000.0 * (time.perf_counter() - t0) + val_loss, val_bpb = eval_val( + args, + model, + rank, + world_size, + device, + grad_accum_steps, + val_tokens, + base_bytes_lut, + has_leading_space_lut, + is_boundary_token_lut, + ) + log0( + f"step:{step}/{args.iterations} val_loss:{val_loss:.4f} val_bpb:{val_bpb:.4f} " + f"train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms / max(step, 1):.2f}ms" + ) + torch.cuda.synchronize() + t0 = time.perf_counter() + if last_step: + if stop_after_step is not None and step < args.iterations: + log0( + f"stopping_early: wallclock_cap train_time:{training_time_ms:.0f}ms " + f"step:{step}/{args.iterations}" + ) + break + elapsed_ms = training_time_ms + 1000.0 * (time.perf_counter() - t0) + scale = lr_mul(step, elapsed_ms) + if args.late_qat_threshold > 0 and scale < args.late_qat_threshold and not CastedLinear._qat_enabled: + CastedLinear._qat_enabled = True + log0(f"late_qat:enabled step:{step} scale:{scale:.4f}") + zero_grad_all() + train_loss = torch.zeros((), device=device) + for micro_step in range(grad_accum_steps): + if distributed: + model.require_backward_grad_sync = micro_step == grad_accum_steps - 1 + x, y = train_loader.next_batch(args.train_batch_tokens, args.train_seq_len, grad_accum_steps) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + loss = model(x, y) + train_loss += loss.detach() + loss.backward() + if base_model._ngram_tracker is not None: + base_model._ngram_tracker.update(x, y) + train_loss /= grad_accum_steps + frac = min(step / args.muon_momentum_warmup_steps, 1.0) if args.muon_momentum_warmup_steps > 0 else 1.0 + muon_momentum = (1 - frac) * args.muon_momentum_warmup_start + frac * args.muon_momentum + for group in optimizer_muon.param_groups: + group["momentum"] = muon_momentum + for opt in optimizers: + for group in opt.param_groups: + group["lr"] = group["base_lr"] * scale + if args.grad_clip_norm > 0: + torch.nn.utils.clip_grad_norm_(base_model.parameters(), args.grad_clip_norm) + for opt in optimizers: + opt.step() + zero_grad_all() + # EMA update + with torch.no_grad(): + for name, t in base_model.state_dict().items(): + ema_state[name].mul_(ema_decay).add_(t.detach().float(), alpha=1.0 - ema_decay) + step += 1 + approx_training_time_ms = training_time_ms + 1000.0 * (time.perf_counter() - t0) + if args.swa_enabled and scale < 0.2 and step % args.swa_every == 0: + if swa_state is None: + swa_state = {name: t.detach().cpu().clone() for name, t in base_model.state_dict().items()} + swa_count = 1 + log0(f"swa:start step:{step}") + else: + for name, t in base_model.state_dict().items(): + swa_state[name] += t.detach().cpu() + swa_count += 1 + should_log_train = ( + args.train_log_every > 0 + and (step <= 10 or step % args.train_log_every == 0 or stop_after_step is not None) + ) + if should_log_train: + log0( + f"step:{step}/{args.iterations} train_loss:{train_loss.item():.4f} " + f"train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms / step:.2f}ms" + ) + reached_cap = max_wallclock_ms is not None and approx_training_time_ms >= max_wallclock_ms + if distributed and max_wallclock_ms is not None: + reached_cap_tensor = torch.tensor(int(reached_cap), device=device) + dist.all_reduce(reached_cap_tensor, op=dist.ReduceOp.MAX) + reached_cap = bool(reached_cap_tensor.item()) + if stop_after_step is None and reached_cap: + stop_after_step = step + log0( + f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB " + f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB" + ) + # GPTQ calibration: collect Hessians from training data DURING training phase + # (must happen before training ends to comply with eval-time data access rules) + log0("gptq:calibrating with training data...") + t_gptq = time.perf_counter() + gptq_hessians = gptq_calibrate(base_model, args.train_files, device, n_samples=256, seq_len=args.train_seq_len) + log0(f"gptq:calibrated {len(gptq_hessians)} layers in {time.perf_counter()-t_gptq:.1f}s") + if args.distill_enabled and args.distill_steps > 0: + log0( + f"distill:start steps:{args.distill_steps} lr_factor:{args.distill_lr_factor} " + f"temp:{args.distill_temperature} alpha:{args.distill_alpha} kl_clip:{args.distill_kl_clip}" + ) + current_state = base_model.state_dict() + teacher_state = {name: t.to(dtype=current_state[name].dtype) for name, t in ema_state.items()} + teacher_model = GPT( + vocab_size=args.vocab_size, num_layers=args.num_layers, model_dim=args.model_dim, + num_heads=args.num_heads, num_kv_heads=args.num_kv_heads, mlp_mult=args.mlp_mult, + tie_embeddings=args.tie_embeddings, tied_embed_init_std=args.tied_embed_init_std, + logit_softcap=args.logit_softcap, rope_base=args.rope_base, qk_gain_init=args.qk_gain_init, + mtp_num_heads=args.mtp_num_heads, mtp_loss_weight=args.mtp_loss_weight, + bigram_vocab_size=args.bigram_vocab_size, bigram_dim=args.bigram_dim, + xsa_last_n=args.xsa_last_n, rope_dims=args.rope_dims, ln_scale=args.ln_scale, dtg=args.dtg_enabled, + ve_enabled=args.ve_enabled, ve_dim=args.ve_dim, ve_layers=args.ve_layers, + mlp_act=args.mlp_act, mlp_leaky_slope=args.mlp_leaky_slope, + f1_corr_rank=args.f1_corr_rank, f1_corr_scale_init=args.f1_corr_scale_init, + ).to(device).bfloat16() + for m in teacher_model.modules(): + if isinstance(m, CastedLinear): + m.float() + restore_low_dim_params_to_fp32(teacher_model) + teacher_model.load_state_dict(teacher_state, strict=True) + teacher_model.eval() + for p in teacher_model.parameters(): + p.requires_grad_(False) + compiled_teacher_logits = maybe_torch_compile(teacher_model.forward_logits, args) + model.train() + T = args.distill_temperature + alpha = args.distill_alpha + for d_step in range(args.distill_steps): + zero_grad_all() + for opt in optimizers: + for group in opt.param_groups: + group["lr"] = group["base_lr"] * args.distill_lr_factor + x, y = train_loader.next_batch(args.train_batch_tokens, args.train_seq_len, grad_accum_steps) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + student_logits = base_model.forward_logits(x) + with torch.no_grad(): + teacher_logits = compiled_teacher_logits(x) + student_log_probs = F.log_softmax(student_logits.float() / T, dim=-1) + teacher_probs = F.softmax(teacher_logits.float() / T, dim=-1) + token_kl = F.kl_div(student_log_probs, teacher_probs, reduction="none").sum(dim=-1) + kl_loss = token_kl.mean() * (T * T) + if args.distill_kl_clip > 0: + kl_loss = torch.clamp(kl_loss, max=args.distill_kl_clip) + ce_loss = F.cross_entropy( + student_logits.reshape(-1, student_logits.size(-1)).float(), + y.reshape(-1), + reduction="mean", + ) + loss = alpha * kl_loss + (1.0 - alpha) * ce_loss + (loss * grad_scale).backward() + if world_size > 1: + for p in base_model.parameters(): + if p.grad is not None: + dist.all_reduce(p.grad, op=dist.ReduceOp.AVG) + if args.grad_clip_norm > 0: + torch.nn.utils.clip_grad_norm_(base_model.parameters(), args.grad_clip_norm) + for opt in optimizers: + opt.step() + zero_grad_all() + with torch.no_grad(): + for name, t in base_model.state_dict().items(): + ema_state[name].mul_(ema_decay).add_(t.detach().float(), alpha=1.0 - ema_decay) + if (d_step + 1) % 8 == 0 or d_step == 0: + log0( + f"distill:step:{d_step + 1}/{args.distill_steps} " + f"kl:{kl_loss.item():.4f} ce:{ce_loss.item():.4f} total:{loss.item():.4f}" + ) + del teacher_model, compiled_teacher_logits + torch.cuda.empty_cache() + log0("distill:done") + # Apply EMA weights (better than SWA alone per PR#401) + log0("ema:applying EMA weights") + current_state = base_model.state_dict() + avg_state = {name: t.to(dtype=current_state[name].dtype) for name, t in ema_state.items()} + base_model.load_state_dict(avg_state, strict=True) + torch.cuda.synchronize() + t_diag = time.perf_counter() + diag_val_loss, diag_val_bpb = eval_val( + args, compiled_model, rank, world_size, device, grad_accum_steps, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + ) + torch.cuda.synchronize() + log0( + f"DIAGNOSTIC post_ema val_loss:{diag_val_loss:.4f} val_bpb:{diag_val_bpb:.4f} " + f"eval_time:{1000.0 * (time.perf_counter() - t_diag):.0f}ms" + ) + full_state_dict = base_model.state_dict() + export_sd = {k: v for k, v in full_state_dict.items() if "mtp_heads" not in k} + excluded_mtp = sum(int(t.numel()) for k, t in full_state_dict.items() if "mtp_heads" in k) + if excluded_mtp > 0: + log0(f"export_excluding_mtp_params:{excluded_mtp}") + if master_process: + torch.save(export_sd, "final_model.pt") + model_bytes = os.path.getsize("final_model.pt") + code_bytes = len(code.encode("utf-8")) + log0(f"Serialized model: {model_bytes} bytes") + log0(f"Code size: {code_bytes} bytes") + sd_cpu = {k: v.detach().cpu() for k, v in export_sd.items()} + # GPTQ quantization using Hessians collected during training phase (no training data access here) + quant_result, quant_meta = mixed_quantize_int6_gptq(sd_cpu, {"mlp", "attn", "aux"}, gptq_hessians) + quant_buf = io.BytesIO() + torch.save({"w": quant_result, "m": quant_meta}, quant_buf) + quant_raw = quant_buf.getvalue() + quant_blob = zstandard.ZstdCompressor(level=22).compress(quant_raw) if _COMPRESSOR == "zstd" else zlib.compress(quant_raw, 9) + if master_process: + with open("final_model.int6.ptz", "wb") as f: + f.write(quant_blob) + quant_file_bytes = len(quant_blob) + code_bytes = len(code.encode("utf-8")) + log0(f"Serialized model int6+{_COMPRESSOR}: {quant_file_bytes} bytes") + log0(f"Total submission size int6+{_COMPRESSOR}: {quant_file_bytes + code_bytes} bytes") + log0(f"Total submission size int8+zlib: {quant_file_bytes + code_bytes} bytes") + if distributed: + dist.barrier() + with open("final_model.int6.ptz", "rb") as f: + quant_blob_disk = f.read() + quant_state = torch.load( + io.BytesIO(zstandard.ZstdDecompressor().decompress(quant_blob_disk) if _COMPRESSOR == "zstd" else zlib.decompress(quant_blob_disk)), + map_location="cpu", + ) + deq_state = dequantize_mixed_int6(quant_state["w"], quant_state["m"], sd_cpu) + eval_model = GPT( + vocab_size=args.vocab_size, num_layers=args.num_layers, model_dim=args.model_dim, + num_heads=args.num_heads, num_kv_heads=args.num_kv_heads, mlp_mult=args.mlp_mult, + tie_embeddings=args.tie_embeddings, tied_embed_init_std=args.tied_embed_init_std, + logit_softcap=args.logit_softcap, rope_base=args.rope_base, qk_gain_init=args.qk_gain_init, + mtp_num_heads=0, mtp_loss_weight=0.0, + bigram_vocab_size=args.bigram_vocab_size, bigram_dim=args.bigram_dim, + xsa_last_n=args.xsa_last_n, # must match training model + rope_dims=args.rope_dims, ln_scale=args.ln_scale, dtg=args.dtg_enabled, + ve_enabled=args.ve_enabled, ve_dim=args.ve_dim, ve_layers=args.ve_layers, + mlp_act=args.mlp_act, mlp_leaky_slope=args.mlp_leaky_slope, + f1_corr_rank=args.f1_corr_rank, f1_corr_scale_init=args.f1_corr_scale_init, + ).to(device).bfloat16() + for m in eval_model.modules(): + if isinstance(m, CastedLinear): + m.float() + restore_low_dim_params_to_fp32(eval_model) + eval_model.load_state_dict(deq_state, strict=True) + compiled_eval = maybe_torch_compile(eval_model, args) + torch.cuda.synchronize() + t_qeval = time.perf_counter() + q_val_loss, q_val_bpb = eval_val( + args, compiled_eval, rank, world_size, device, grad_accum_steps, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + eval_seq_len=effective_eval_seq_len, + ) + torch.cuda.synchronize() + log0( + f"final_int6_roundtrip val_loss:{q_val_loss:.4f} val_bpb:{q_val_bpb:.4f} " + f"eval_time:{1000.0 * (time.perf_counter() - t_qeval):.0f}ms" + ) + log0(f"final_int6_roundtrip_exact val_loss:{q_val_loss:.8f} val_bpb:{q_val_bpb:.8f}") + sw_seq_len = effective_eval_seq_len + if args.eval_stride > 0 and args.eval_stride < sw_seq_len: + torch.cuda.synchronize() + t_slide = time.perf_counter() + sw_val_loss, sw_val_bpb = eval_val_sliding( + args, eval_model, rank, world_size, device, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + stride=args.eval_stride, + eval_seq_len=sw_seq_len, + ) + torch.cuda.synchronize() + log0( + f"final_int6_sliding_window val_loss:{sw_val_loss:.4f} val_bpb:{sw_val_bpb:.4f} " + f"stride:{args.eval_stride} eval_time:{1000.0 * (time.perf_counter() - t_slide):.0f}ms" + ) + log0(f"final_int6_sliding_window_exact val_loss:{sw_val_loss:.8f} val_bpb:{sw_val_bpb:.8f}") + log0(f"final_int8_zlib_roundtrip_exact val_loss:{sw_val_loss:.8f} val_bpb:{sw_val_bpb:.8f}") + if args.ngram_eval_order >= 2: + if distributed: + dist.barrier() + torch.cuda.synchronize() + t_ng = time.perf_counter() + ng_loss, ng_bpb, ng_coverage = eval_val_sliding_hashed_ngram( + args, + eval_model, + rank, + world_size, + device, + val_tokens, + base_bytes_lut, + has_leading_space_lut, + is_boundary_token_lut, + stride=args.eval_stride, + order=args.ngram_eval_order, + alpha=args.ngram_eval_alpha, + min_count=args.ngram_eval_min_count, + buckets=args.ngram_eval_buckets, + max_seconds=args.ngram_eval_max_seconds, + eval_seq_len=sw_seq_len, + ) + if rank == 0: + torch.cuda.synchronize() + ng_eval_ms = 1000.0 * (time.perf_counter() - t_ng) + if ng_coverage >= 0.999999: + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order} val_loss:{ng_loss:.4f} " + f"val_bpb:{ng_bpb:.4f} eval_time:{ng_eval_ms:.0f}ms" + ) + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order}_exact " + f"val_loss:{ng_loss:.8f} val_bpb:{ng_bpb:.8f}" + ) + else: + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order}_partial val_loss:{ng_loss:.4f} " + f"val_bpb:{ng_bpb:.4f} coverage:{ng_coverage:.4f} eval_time:{ng_eval_ms:.0f}ms" + ) + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order}_partial_exact " + f"val_loss:{ng_loss:.8f} val_bpb:{ng_bpb:.8f} coverage:{ng_coverage:.8f}" + ) + if distributed: + dist.barrier() + if distributed: + dist.destroy_process_group() +if __name__ == "__main__": + main() diff --git a/experiments/A_wing/green_2/run.sh b/experiments/A_wing/green_2/run.sh new file mode 100755 index 000000000..d7df475cc --- /dev/null +++ b/experiments/A_wing/green_2/run.sh @@ -0,0 +1,74 @@ +#!/bin/bash +set -euo pipefail +# A-WING GREEN_2: Oracle Alpha + 9-Prime + LoRA TTT +# Oracle alpha (model_p vs ngram_p) + LoRA TTT adaptation before n-gram eval +# TTT adapts Q/V projections with rank-8 LoRA on already-scored val tokens + +SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd -- "${SCRIPT_DIR}/../../.." && pwd)" +cd "${REPO_ROOT}" +export PYTHONPATH="${REPO_ROOT}/flash-attention/hopper:${PYTHONPATH:-}" + +SEED="${SEED:-1337}" +NPROC_PER_NODE="${NPROC_PER_NODE:-8}" + +# --- Pre-flight checks --- +echo "[preflight] checking zstandard..." +python3 -c "import zstandard; print(f' zstandard {zstandard.__version__} OK')" 2>/dev/null \ + || { echo " FATAL: zstandard not found. pip install zstandard"; exit 1; } + +echo "[preflight] checking flash_attn..." +python3 -c " +try: + import flash_attn_interface; print(' FA3 (hopper) OK') +except ImportError: + import flash_attn; v=flash_attn.__version__ + if v.startswith('3'): print(f' FA3 v{v} OK') + else: print(f' WARNING: FA{v[0]} detected — want FA3') +" 2>/dev/null || echo " WARNING: no flash_attn found" + +echo "============================================" +echo " A-WING GREEN_2 — Oracle Alpha + TTT + 9-Prime" +echo " Seed: ${SEED}" +echo " Oracle alpha + LoRA TTT (rank 8, AdamW)" +echo " Training cap: 570s (30s reserved for GPTQ)" +echo "============================================" + +SEED="$SEED" \ +F1_CORR_RANK=0 \ +DISTILL_ENABLED=0 \ +MLP_ACT=leaky_relu_sq \ +MLP_LEAKY_SLOPE=0.5 \ +XSA_LAST_N=4 \ +BIGRAM_VOCAB_SIZE=1536 \ +TTT_EVAL_ENABLED=1 \ +TTT_LORA_RANK=8 \ +TTT_LR=3e-4 \ +ROPE_DIMS=24 \ +VAL_LOSS_EVERY=20000 \ +TRAIN_LOG_EVERY=1000 \ +SWA_EVERY=100 \ +COMPLEMENT_ALPHA=0.5 \ +NGRAM_EVAL_ORDER=9 \ +NGRAM_EVAL_MIN_ORDER=2 \ +NGRAM_EVAL_ADAPTIVE=1 \ +NGRAM_EVAL_ALPHA=0.30 \ +NGRAM_EVAL_ALPHA_MIN=0.05 \ +NGRAM_EVAL_ALPHA_MAX=0.60 \ +NGRAM_EVAL_ENTROPY_CENTER=3.0 \ +NGRAM_EVAL_ENTROPY_SCALE=2.0 \ +NGRAM_EVAL_MIN_COUNT=2 \ +NGRAM_EVAL_BUCKETS=8388608 \ +NGRAM_EVAL_MAX_SECONDS=0 \ +CUBRIC_CADENCE=0 \ +NGRAM_ENTROPY_SHIFT=1 \ +NGRAM_ORDER_MULTS="0.3,0.3,0.97,2.0,2.0,2.0,2.0,2.0" \ +MAX_WALLCLOCK_SECONDS=570 \ +COMPILE_FULLGRAPH=0 \ +torchrun --standalone --nproc_per_node="${NPROC_PER_NODE}" \ + "${SCRIPT_DIR}/train_gpt.py" \ + 2>&1 | tee "logs/awing_green2_s${SEED}_$(date +%Y%m%d_%H%M%S).log" + +echo "============================================" +echo " DONE" +echo "============================================" diff --git a/experiments/A_wing/green_2/train_gpt.py b/experiments/A_wing/green_2/train_gpt.py new file mode 100644 index 000000000..48720e00a --- /dev/null +++ b/experiments/A_wing/green_2/train_gpt.py @@ -0,0 +1,2220 @@ +from __future__ import annotations +import copy +import glob +import io +import math +import os +import random +import subprocess +import sys +import time +import uuid +import zlib +from pathlib import Path +try: + import zstandard + _COMPRESSOR = "zstd" +except ImportError: + import warnings + warnings.warn("zstandard not found — falling back to zlib. Artifact will be ~1.5MB larger! pip install zstandard") + _COMPRESSOR = "zlib" +import numpy as np +import sentencepiece as spm +import torch +import torch.distributed as dist +import torch.nn.functional as F +from torch import Tensor, nn +from torch.nn.parallel import DistributedDataParallel as DDP +try: + from flash_attn_interface import flash_attn_func as flash_attn_3_func +except ImportError: + def flash_attn_3_func(q, k, v, causal=False): + # q: (B, T, Hq, D), k/v: (B, T, Hkv, D) — expand KV for GQA + q2 = q.transpose(1, 2) # (B, Hq, T, D) + k2 = k.transpose(1, 2) # (B, Hkv, T, D) + v2 = v.transpose(1, 2) + if k2.size(1) != q2.size(1): + rep = q2.size(1) // k2.size(1) + k2 = k2.repeat_interleave(rep, dim=1) + v2 = v2.repeat_interleave(rep, dim=1) + out = torch.nn.functional.scaled_dot_product_attention(q2, k2, v2, is_causal=causal) + return out.transpose(1, 2) +class Hyperparameters: + data_path = os.environ.get("DATA_PATH", "./data/datasets/fineweb10B_sp1024") + train_files = os.path.join(data_path, "fineweb_train_*.bin") + val_files = os.path.join(data_path, "fineweb_val_*.bin") + tokenizer_path = os.environ.get("TOKENIZER_PATH", "./data/tokenizers/fineweb_1024_bpe.model") + run_id = os.environ.get("RUN_ID", str(uuid.uuid4())) + seed = int(os.environ.get("SEED", 1337)) + val_batch_size = int(os.environ.get("VAL_BATCH_SIZE", 524_288)) + val_loss_every = int(os.environ.get("VAL_LOSS_EVERY", 4000)) + train_log_every = int(os.environ.get("TRAIN_LOG_EVERY", 500)) + iterations = int(os.environ.get("ITERATIONS", 20000)) + warmdown_iters = int(os.environ.get("WARMDOWN_ITERS", 3500)) + warmup_steps = int(os.environ.get("WARMUP_STEPS", 20)) + train_batch_tokens = int(os.environ.get("TRAIN_BATCH_TOKENS", 786_432)) + train_seq_len = int(os.environ.get("TRAIN_SEQ_LEN", 2048)) + eval_seq_len = int(os.environ.get("EVAL_SEQ_LEN", 2048)) + max_wallclock_seconds = float(os.environ.get("MAX_WALLCLOCK_SECONDS", 600.0)) + qk_gain_init = float(os.environ.get("QK_GAIN_INIT", 1.5)) + vocab_size = int(os.environ.get("VOCAB_SIZE", 1024)) + num_layers = int(os.environ.get("NUM_LAYERS", 11)) + num_kv_heads = int(os.environ.get("NUM_KV_HEADS", 4)) + model_dim = int(os.environ.get("MODEL_DIM", 512)) + num_heads = int(os.environ.get("NUM_HEADS", 8)) + mlp_mult = float(os.environ.get("MLP_MULT", 3.0)) + mlp_act = os.environ.get("MLP_ACT", "relu_sq").lower() + mlp_leaky_slope = float(os.environ.get("MLP_LEAKY_SLOPE", 0.5)) + tie_embeddings = bool(int(os.environ.get("TIE_EMBEDDINGS", "1"))) + rope_base = float(os.environ.get("ROPE_BASE", 10000.0)) + logit_softcap = float(os.environ.get("LOGIT_SOFTCAP", 30.0)) + embed_lr = float(os.environ.get("EMBED_LR", 0.6)) + head_lr = float(os.environ.get("HEAD_LR", 0.008)) + tied_embed_lr = float(os.environ.get("TIED_EMBED_LR", 0.035)) + tied_embed_init_std = float(os.environ.get("TIED_EMBED_INIT_STD", 0.005)) + matrix_lr = float(os.environ.get("MATRIX_LR", 0.025)) + scalar_lr = float(os.environ.get("SCALAR_LR", 0.025)) + muon_momentum = float(os.environ.get("MUON_MOMENTUM", 0.99)) + muon_backend_steps = int(os.environ.get("MUON_BACKEND_STEPS", 5)) + muon_momentum_warmup_start = float(os.environ.get("MUON_MOMENTUM_WARMUP_START", 0.92)) + muon_momentum_warmup_steps = int(os.environ.get("MUON_MOMENTUM_WARMUP_STEPS", 1500)) + beta1 = float(os.environ.get("BETA1", 0.9)) + beta2 = float(os.environ.get("BETA2", 0.95)) + adam_eps = float(os.environ.get("ADAM_EPS", 1e-8)) + grad_clip_norm = float(os.environ.get("GRAD_CLIP_NORM", 0.3)) + eval_stride = int(os.environ.get("EVAL_STRIDE", 64)) + mtp_num_heads = int(os.environ.get("MTP_NUM_HEADS", 0)) + mtp_loss_weight = float(os.environ.get("MTP_LOSS_WEIGHT", 0.2)) + muon_beta2 = float(os.environ.get("MUON_BETA2", 0.95)) + swa_enabled = bool(int(os.environ.get("SWA_ENABLED", "1"))) + swa_every = int(os.environ.get("SWA_EVERY", 50)) # tighter: collect more recent checkpoints + muon_wd = float(os.environ.get("MUON_WD", 0.04)) + adam_wd = float(os.environ.get("ADAM_WD", 0.04)) + qat_enabled = bool(int(os.environ.get("QAT_ENABLED", "0"))) + bigram_vocab_size = int(os.environ.get("BIGRAM_VOCAB_SIZE", 2048)) + bigram_dim = int(os.environ.get("BIGRAM_DIM", 128)) + xsa_last_n = int(os.environ.get("XSA_LAST_N", 11)) # XSA on ALL 11 layers + rope_dims = int(os.environ.get("ROPE_DIMS", 16)) + ln_scale = bool(int(os.environ.get("LN_SCALE", "1"))) + dtg_enabled = bool(int(os.environ.get("DTG_ENABLED", "0"))) + late_qat_threshold = float(os.environ.get("LATE_QAT_THRESHOLD", 0.5)) + ve_enabled = bool(int(os.environ.get("VE_ENABLED", "1"))) + ve_dim = int(os.environ.get("VE_DIM", 128)) + ve_layers = os.environ.get("VE_LAYERS", "9,10") + # F1 capacity add-on: low-rank correction head (active at inference). + # Approx extra params ~= rank * (model_dim + vocab_size). + f1_corr_rank = int(os.environ.get("F1_CORR_RANK", 0)) + f1_corr_scale_init = float(os.environ.get("F1_CORR_SCALE_INIT", 0.10)) + # Post-train self-distillation: EMA teacher -> student. + distill_enabled = bool(int(os.environ.get("DISTILL_ENABLED", "0"))) + distill_steps = int(os.environ.get("DISTILL_STEPS", 24)) + distill_lr_factor = float(os.environ.get("DISTILL_LR_FACTOR", 0.02)) + distill_temperature = float(os.environ.get("DISTILL_TEMPERATURE", 1.5)) + distill_alpha = float(os.environ.get("DISTILL_ALPHA", 0.60)) + distill_kl_clip = float(os.environ.get("DISTILL_KL_CLIP", 10.0)) + # Optional legal score-first hashed n-gram interpolation at eval time. + # Multi-order backoff (2..max_order) with entropy-adaptive alpha. + # Alpha depends only on model entropy (no target/label access). + ngram_eval_order = int(os.environ.get("NGRAM_EVAL_ORDER", 0)) # 0=off, max order for backoff + ngram_eval_min_order = int(os.environ.get("NGRAM_EVAL_MIN_ORDER", 2)) # min order for backoff + ngram_eval_alpha = float(os.environ.get("NGRAM_EVAL_ALPHA", 0.30)) # base alpha (or fixed if adaptive off) + ngram_eval_adaptive = bool(int(os.environ.get("NGRAM_EVAL_ADAPTIVE", "1"))) # entropy-adaptive alpha + ngram_eval_alpha_min = float(os.environ.get("NGRAM_EVAL_ALPHA_MIN", 0.05)) # alpha floor (confident model) + ngram_eval_alpha_max = float(os.environ.get("NGRAM_EVAL_ALPHA_MAX", 0.60)) # alpha ceiling (uncertain model) + ngram_eval_entropy_center = float(os.environ.get("NGRAM_EVAL_ENTROPY_CENTER", 4.0)) # sigmoid center + ngram_eval_entropy_scale = float(os.environ.get("NGRAM_EVAL_ENTROPY_SCALE", 2.0)) # sigmoid steepness + ngram_eval_min_count = int(os.environ.get("NGRAM_EVAL_MIN_COUNT", 2)) + ngram_eval_buckets = int(os.environ.get("NGRAM_EVAL_BUCKETS", 4_194_304)) + ngram_eval_max_seconds = float(os.environ.get("NGRAM_EVAL_MAX_SECONDS", 0.0)) + ngram_entropy_shift = bool(int(os.environ.get("NGRAM_ENTROPY_SHIFT", "0"))) # per-order center shift + ngram_order_mults_str = os.environ.get("NGRAM_ORDER_MULTS", "") # fixed per-order multipliers (comma-sep) + cubric_cadence = int(os.environ.get("CUBRIC_CADENCE", 0)) + compile_enabled = bool(int(os.environ.get("COMPILE_ENABLED", "1"))) + compile_fullgraph = bool(int(os.environ.get("COMPILE_FULLGRAPH", "1"))) +def maybe_torch_compile(obj, args: Hyperparameters): + if not args.compile_enabled: + return obj + return torch.compile(obj, dynamic=False, fullgraph=args.compile_fullgraph) +class TrainNgramTracker: + """Complementary training: track bigram stats, downweight tokens n-grams can predict.""" + def __init__(self, vocab_size: int, device: torch.device, complement_alpha: float = 0.5): + self.V = vocab_size + self.alpha = complement_alpha + self.bi_counts = torch.zeros(vocab_size, vocab_size, device=device, dtype=torch.float32) + self.bi_totals = torch.zeros(vocab_size, device=device, dtype=torch.float32) + @torch.no_grad() + def update(self, x: Tensor, y: Tensor): + xf = x.reshape(-1) + yf = y.reshape(-1) + ones = torch.ones(xf.numel(), device=xf.device, dtype=torch.float32) + self.bi_counts.reshape(-1).scatter_add_(0, xf * self.V + yf, ones) + self.bi_totals.scatter_add_(0, xf, ones) + def get_weights(self, x: Tensor, y: Tensor) -> Tensor: + xf = x.reshape(-1) + yf = y.reshape(-1) + total = self.bi_totals[xf] + count = self.bi_counts.reshape(-1)[xf * self.V + yf] + ngram_prob = count / (total + 1) + return (1.0 - self.alpha * ngram_prob).clamp(min=0.1) +def zeropower_via_newtonschulz5(G: Tensor, steps: int = 10, eps: float = 1e-7) -> Tensor: + a, b, c = (3.4445, -4.7750, 2.0315) + X = G.bfloat16() + X /= X.norm() + eps + transposed = G.size(0) > G.size(1) + if transposed: + X = X.T + for _ in range(steps): + A = X @ X.T + B = b * A + c * A @ A + X = a * X + B @ X + return X.T if transposed else X +class Muon(torch.optim.Optimizer): + def __init__(self, params, lr: float, momentum: float, backend_steps: int, + nesterov: bool = True, weight_decay: float = 0.0): + super().__init__( + params, + dict(lr=lr, momentum=momentum, backend_steps=backend_steps, + nesterov=nesterov, weight_decay=weight_decay), + ) + @torch.no_grad() + def step(self, closure=None): + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + distributed = dist.is_available() and dist.is_initialized() + world_size = dist.get_world_size() if distributed else 1 + rank = dist.get_rank() if distributed else 0 + for group in self.param_groups: + params = group["params"] + if not params: + continue + lr = group["lr"] + momentum = group["momentum"] + backend_steps = group["backend_steps"] + nesterov = group["nesterov"] + total_params = sum(int(p.numel()) for p in params) + updates_flat = torch.zeros(total_params, device=params[0].device, dtype=torch.bfloat16) + curr = 0 + for i, p in enumerate(params): + if i % world_size == rank and p.grad is not None: + g = p.grad + state = self.state[p] + if "momentum_buffer" not in state: + state["momentum_buffer"] = torch.zeros_like(g) + buf = state["momentum_buffer"] + buf.mul_(momentum).add_(g) + if nesterov: + g = g.add(buf, alpha=momentum) + g = zeropower_via_newtonschulz5(g, steps=backend_steps) + g *= max(1, g.size(0) / g.size(1)) ** 0.5 + updates_flat[curr : curr + p.numel()] = g.reshape(-1) + curr += p.numel() + if distributed: + dist.all_reduce(updates_flat, op=dist.ReduceOp.SUM) + wd = group.get("weight_decay", 0.0) + curr = 0 + for p in params: + if wd > 0.0: + p.data.mul_(1.0 - lr * wd) + g = updates_flat[curr : curr + p.numel()].view_as(p).to(dtype=p.dtype) + p.add_(g, alpha=-lr) + curr += p.numel() + return loss +def build_sentencepiece_luts( + sp: spm.SentencePieceProcessor, vocab_size: int, device: torch.device +) -> tuple[Tensor, Tensor, Tensor]: + sp_vocab_size = int(sp.vocab_size()) + table_size = max(sp_vocab_size, vocab_size) + base_bytes_np = np.zeros((table_size,), dtype=np.int16) + has_leading_space_np = np.zeros((table_size,), dtype=np.bool_) + is_boundary_token_np = np.ones((table_size,), dtype=np.bool_) + for token_id in range(sp_vocab_size): + if sp.is_control(token_id) or sp.is_unknown(token_id) or sp.is_unused(token_id): + continue + is_boundary_token_np[token_id] = False + if sp.is_byte(token_id): + base_bytes_np[token_id] = 1 + continue + piece = sp.id_to_piece(token_id) + if piece.startswith("▁"): + has_leading_space_np[token_id] = True + piece = piece[1:] + base_bytes_np[token_id] = len(piece.encode("utf-8")) + return ( + torch.tensor(base_bytes_np, dtype=torch.int16, device=device), + torch.tensor(has_leading_space_np, dtype=torch.bool, device=device), + torch.tensor(is_boundary_token_np, dtype=torch.bool, device=device), + ) +def load_validation_tokens(pattern: str, seq_len: int) -> Tensor: + files = [Path(p) for p in sorted(glob.glob(pattern))] + if not files: + raise FileNotFoundError(f"No files found for pattern: {pattern}") + tokens = torch.cat([load_data_shard(file) for file in files]).contiguous() + usable = ((tokens.numel() - 1) // seq_len) * seq_len + if usable <= 0: + raise ValueError(f"Validation split is too short for TRAIN_SEQ_LEN={seq_len}") + return tokens[: usable + 1] +def eval_val( + args: Hyperparameters, + model: nn.Module, + rank: int, + world_size: int, + device: torch.device, + grad_accum_steps: int, + val_tokens: Tensor, + base_bytes_lut: Tensor, + has_leading_space_lut: Tensor, + is_boundary_token_lut: Tensor, + eval_seq_len: int | None = None, +) -> tuple[float, float]: + seq_len = eval_seq_len or args.train_seq_len + local_batch_tokens = args.val_batch_size // (world_size * grad_accum_steps) + if local_batch_tokens < seq_len: + raise ValueError( + "VAL_BATCH_SIZE must provide at least one sequence per rank; " + f"got VAL_BATCH_SIZE={args.val_batch_size}, WORLD_SIZE={world_size}, " + f"GRAD_ACCUM_STEPS={grad_accum_steps}, seq_len={seq_len}" + ) + local_batch_seqs = local_batch_tokens // seq_len + total_seqs = (val_tokens.numel() - 1) // seq_len + seq_start = (total_seqs * rank) // world_size + seq_end = (total_seqs * (rank + 1)) // world_size + val_loss_sum = torch.zeros((), device=device, dtype=torch.float64) + val_token_count = torch.zeros((), device=device, dtype=torch.float64) + val_byte_count = torch.zeros((), device=device, dtype=torch.float64) + model.eval() + with torch.inference_mode(): + for batch_seq_start in range(seq_start, seq_end, local_batch_seqs): + batch_seq_end = min(batch_seq_start + local_batch_seqs, seq_end) + raw_start = batch_seq_start * seq_len + raw_end = batch_seq_end * seq_len + 1 + local = val_tokens[raw_start:raw_end].to(device=device, dtype=torch.int64, non_blocking=True) + x = local[:-1].reshape(-1, seq_len) + y = local[1:].reshape(-1, seq_len) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + batch_loss = model(x, y).detach() + batch_token_count = float(y.numel()) + val_loss_sum += batch_loss.to(torch.float64) * batch_token_count + val_token_count += batch_token_count + prev_ids = x.reshape(-1) + tgt_ids = y.reshape(-1) + token_bytes = base_bytes_lut[tgt_ids].to(dtype=torch.int16) + token_bytes += (has_leading_space_lut[tgt_ids] & ~is_boundary_token_lut[prev_ids]).to(dtype=torch.int16) + val_byte_count += token_bytes.to(torch.float64).sum() + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(val_loss_sum, op=dist.ReduceOp.SUM) + dist.all_reduce(val_token_count, op=dist.ReduceOp.SUM) + dist.all_reduce(val_byte_count, op=dist.ReduceOp.SUM) + val_loss = val_loss_sum / val_token_count + bits_per_token = val_loss.item() / math.log(2.0) + tokens_per_byte = val_token_count.item() / val_byte_count.item() + model.train() + return float(val_loss.item()), float(bits_per_token * tokens_per_byte) +CONTROL_TENSOR_NAME_PATTERNS = tuple( + pattern + for pattern in os.environ.get( + "CONTROL_TENSOR_NAME_PATTERNS", + "attn_scale,attn_scales,mlp_scale,mlp_scales,resid_mix,resid_mixes,q_gain,skip_weight,skip_weights,smear,dtg_gate,ve_layer_scales,ve_shared.scale", + ).split(",") + if pattern +) +INT8_KEEP_FLOAT_FP32_NAME_PATTERNS = tuple( + pattern + for pattern in os.environ.get( + "INT8_KEEP_FLOAT_FP32_NAME_PATTERNS", + ",".join(CONTROL_TENSOR_NAME_PATTERNS), + ).split(",") + if pattern +) +INT8_KEEP_FLOAT_MAX_NUMEL = 65_536 +INT8_KEEP_FLOAT_STORE_DTYPE = torch.float16 +INT8_PER_ROW_SCALE_DTYPE = torch.float16 +INT8_CLIP_PERCENTILE = 99.99984 +INT8_CLIP_Q = INT8_CLIP_PERCENTILE / 100.0 +def tensor_nbytes(t: Tensor) -> int: + return int(t.numel()) * int(t.element_size()) +def keep_float_tensor(name: str, t: Tensor, passthrough_orig_dtypes: dict[str, str]) -> Tensor: + if any(pattern in name for pattern in INT8_KEEP_FLOAT_FP32_NAME_PATTERNS): + return t.float().contiguous() + if t.dtype in {torch.float32, torch.bfloat16}: + passthrough_orig_dtypes[name] = str(t.dtype).removeprefix("torch.") + return t.to(dtype=INT8_KEEP_FLOAT_STORE_DTYPE).contiguous() + return t +def quantize_float_tensor(t: Tensor) -> tuple[Tensor, Tensor]: + t32 = t.float() + if t32.ndim == 2: + clip_abs = ( + torch.quantile(t32.abs(), INT8_CLIP_Q, dim=1) + if t32.numel() + else torch.empty((t32.shape[0],), dtype=torch.float32) + ) + clipped = torch.maximum(torch.minimum(t32, clip_abs[:, None]), -clip_abs[:, None]) + scale = (clip_abs / 127.0).clamp_min(1.0 / 127.0) + q = torch.clamp(torch.round(clipped / scale[:, None]), -127, 127).to(torch.int8).contiguous() + return q, scale.to(dtype=INT8_PER_ROW_SCALE_DTYPE).contiguous() + clip_abs = float(torch.quantile(t32.abs().flatten(), INT8_CLIP_Q).item()) if t32.numel() else 0.0 + scale = torch.tensor(clip_abs / 127.0 if clip_abs > 0 else 1.0, dtype=torch.float32) + q = torch.clamp(torch.round(torch.clamp(t32, -clip_abs, clip_abs) / scale), -127, 127).to(torch.int8).contiguous() + return q, scale +def quantize_state_dict_int8(state_dict: dict[str, Tensor]): + quantized: dict[str, Tensor] = {} + scales: dict[str, Tensor] = {} + dtypes: dict[str, str] = {} + passthrough: dict[str, Tensor] = {} + passthrough_orig_dtypes: dict[str, str] = {} + qmeta: dict[str, dict[str, object]] = {} + stats = dict.fromkeys( + ("param_count", "num_tensors", "num_float_tensors", "num_nonfloat_tensors", "baseline_tensor_bytes", "int8_payload_bytes"), + 0, + ) + for name, tensor in state_dict.items(): + t = tensor.detach().to("cpu").contiguous() + stats["param_count"] += int(t.numel()) + stats["num_tensors"] += 1 + stats["baseline_tensor_bytes"] += tensor_nbytes(t) + if not t.is_floating_point(): + stats["num_nonfloat_tensors"] += 1 + passthrough[name] = t + stats["int8_payload_bytes"] += tensor_nbytes(t) + continue + if t.numel() <= INT8_KEEP_FLOAT_MAX_NUMEL: + kept = keep_float_tensor(name, t, passthrough_orig_dtypes) + passthrough[name] = kept + stats["int8_payload_bytes"] += tensor_nbytes(kept) + continue + stats["num_float_tensors"] += 1 + q, s = quantize_float_tensor(t) + if s.ndim > 0: + qmeta[name] = {"scheme": "per_row", "axis": 0} + quantized[name] = q + scales[name] = s + dtypes[name] = str(t.dtype).removeprefix("torch.") + stats["int8_payload_bytes"] += tensor_nbytes(q) + tensor_nbytes(s) + obj: dict[str, object] = { + "__quant_format__": "int8_clean_per_row_v1", + "quantized": quantized, + "scales": scales, + "dtypes": dtypes, + "passthrough": passthrough, + } + if qmeta: + obj["qmeta"] = qmeta + if passthrough_orig_dtypes: + obj["passthrough_orig_dtypes"] = passthrough_orig_dtypes + return obj, stats +def dequantize_state_dict_int8(obj: dict[str, object]) -> dict[str, Tensor]: + out: dict[str, Tensor] = {} + qmeta = obj.get("qmeta", {}) + passthrough_orig_dtypes = obj.get("passthrough_orig_dtypes", {}) + for name, q in obj["quantized"].items(): + dtype = getattr(torch, obj["dtypes"][name]) + s = obj["scales"][name] + if qmeta.get(name, {}).get("scheme") == "per_row" or s.ndim > 0: + s = s.to(dtype=torch.float32) + out[name] = (q.float() * s.view(q.shape[0], *([1] * (q.ndim - 1)))).to(dtype=dtype).contiguous() + else: + scale = float(s.item()) + out[name] = (q.float() * scale).to(dtype=dtype).contiguous() + for name, t in obj["passthrough"].items(): + out_t = t.detach().to("cpu").contiguous() + orig_dtype = passthrough_orig_dtypes.get(name) + if isinstance(orig_dtype, str): + out_t = out_t.to(dtype=getattr(torch, orig_dtype)).contiguous() + out[name] = out_t + return out +def load_data_shard(file: Path) -> Tensor: + header_bytes = 256 * np.dtype(" None: + self.file_idx = (self.file_idx + 1) % len(self.files) + self.tokens = load_data_shard(self.files[self.file_idx]) + self.pos = 0 + def take(self, n: int) -> Tensor: + chunks: list[Tensor] = [] + remaining = n + while remaining > 0: + avail = self.tokens.numel() - self.pos + if avail <= 0: + self._advance_file() + continue + k = min(remaining, avail) + chunks.append(self.tokens[self.pos : self.pos + k]) + self.pos += k + remaining -= k + return chunks[0] if len(chunks) == 1 else torch.cat(chunks) +class DistributedTokenLoader: + def __init__(self, pattern: str, rank: int, world_size: int, device: torch.device): + self.rank = rank + self.world_size = world_size + self.device = device + self.stream = TokenStream(pattern) + def next_batch(self, global_tokens: int, seq_len: int, grad_accum_steps: int) -> tuple[Tensor, Tensor]: + local_tokens = global_tokens // (self.world_size * grad_accum_steps) + per_rank_span = local_tokens + 1 + chunk = self.stream.take(per_rank_span * self.world_size) + start = self.rank * per_rank_span + local = chunk[start : start + per_rank_span].to(dtype=torch.int64) + x = local[:-1].reshape(-1, seq_len) + y = local[1:].reshape(-1, seq_len) + return x.to(self.device, non_blocking=True), y.to(self.device, non_blocking=True) +class RMSNorm(nn.Module): + def __init__(self, eps: float | None = None): + super().__init__() + self.eps = eps + def forward(self, x: Tensor) -> Tensor: + return F.rms_norm(x, (x.size(-1),), eps=self.eps) +class CastedLinear(nn.Linear): + _qat_enabled: bool = False + def forward(self, x: Tensor) -> Tensor: + w = self.weight.to(x.dtype) + if CastedLinear._qat_enabled and self.training and w.ndim == 2: + with torch.no_grad(): + w32 = self.weight.float() + # Use 99.95th percentile clipping to match GPTQ export quantizer + row_clip = torch.quantile(w32.abs(), 0.9995, dim=1) + scale = (row_clip / 31.0).clamp_min(1.0 / 31.0) + w_q = (torch.clamp(torch.round(w32 / scale[:, None]), -32, 31) * scale[:, None]).to(x.dtype) + w = w + (w_q - w).detach() + bias = self.bias.to(x.dtype) if self.bias is not None else None + return F.linear(x, w, bias) +def restore_low_dim_params_to_fp32(module: nn.Module) -> None: + with torch.no_grad(): + for name, param in module.named_parameters(): + if (param.ndim < 2 or any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS)) and param.dtype != torch.float32: + param.data = param.data.float() +class Rotary(nn.Module): + def __init__(self, dim: int, base: float = 10000.0, train_seq_len: int = 1024, rope_dims: int = 0): + super().__init__() + self.dim = dim + self.base = base + self.train_seq_len = train_seq_len + self.rope_dims = rope_dims if rope_dims > 0 else dim + inv_freq = 1.0 / (base ** (torch.arange(0, self.rope_dims, 2, dtype=torch.float32) / self.rope_dims)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self._seq_len_cached = 0 + self._cos_cached: Tensor | None = None + self._sin_cached: Tensor | None = None + def forward(self, seq_len: int, device: torch.device, dtype: torch.dtype) -> tuple[Tensor, Tensor]: + if ( + self._cos_cached is None + or self._sin_cached is None + or self._seq_len_cached != seq_len + or self._cos_cached.device != device + ): + rd = self.rope_dims + if seq_len > self.train_seq_len: + scale = seq_len / self.train_seq_len + new_base = self.base * (scale ** (rd / (rd - 2))) + inv_freq = 1.0 / (new_base ** (torch.arange(0, rd, 2, dtype=torch.float32, device=device) / rd)) + else: + inv_freq = self.inv_freq.to(device) + t = torch.arange(seq_len, device=device, dtype=inv_freq.dtype) + freqs = torch.outer(t, inv_freq) + self._cos_cached = freqs.cos()[None, :, None, :] + self._sin_cached = freqs.sin()[None, :, None, :] + self._seq_len_cached = seq_len + return self._cos_cached.to(dtype=dtype), self._sin_cached.to(dtype=dtype) +def apply_rotary_emb(x: Tensor, cos: Tensor, sin: Tensor, rope_dims: int = 0) -> Tensor: + if rope_dims > 0 and rope_dims < x.size(-1): + x_rope, x_pass = x[..., :rope_dims], x[..., rope_dims:] + half = rope_dims // 2 + x1, x2 = x_rope[..., :half], x_rope[..., half:] + x_rope = torch.cat((x1 * cos + x2 * sin, x1 * (-sin) + x2 * cos), dim=-1) + return torch.cat((x_rope, x_pass), dim=-1) + half = x.size(-1) // 2 + x1, x2 = x[..., :half], x[..., half:] + return torch.cat((x1 * cos + x2 * sin, x1 * (-sin) + x2 * cos), dim=-1) +class CausalSelfAttention(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + num_kv_heads: int, + rope_base: float, + qk_gain_init: float, + ): + super().__init__() + if dim % num_heads != 0: + raise ValueError("model_dim must be divisible by num_heads") + if num_heads % num_kv_heads != 0: + raise ValueError("num_heads must be divisible by num_kv_heads") + self.num_heads = num_heads + self.num_kv_heads = num_kv_heads + self.head_dim = dim // num_heads + if self.head_dim % 2 != 0: + raise ValueError("head_dim must be even for RoPE") + kv_dim = self.num_kv_heads * self.head_dim + self.c_q = CastedLinear(dim, dim, bias=False) + self.c_k = CastedLinear(dim, kv_dim, bias=False) + self.c_v = CastedLinear(dim, kv_dim, bias=False) + self.proj = CastedLinear(dim, dim, bias=False) + self.proj._zero_init = True + self.q_gain = nn.Parameter(torch.full((num_heads,), qk_gain_init, dtype=torch.float32)) + self.rope_dims = 0 # set by GPT.__init__ for partial RoPE + self.rotary = Rotary(self.head_dim, base=rope_base, train_seq_len=1024) + self.use_xsa = False # set by GPT.__init__ for deep layers only + def _xsa_efficient(self, y: Tensor, v: Tensor) -> Tensor: + """Efficient XSA: subtract self-value projection via GQA-aware reshape (no repeat_interleave). + y: [B, T, H, D], v: [B, T, Hkv, D]. H must be divisible by Hkv.""" + B, T, H, D = y.shape + Hkv = v.size(-2) + group = H // Hkv + y_g = y.reshape(B, T, Hkv, group, D) # [B, T, Hkv, group, D] + vn = F.normalize(v, dim=-1).unsqueeze(-2) # [B, T, Hkv, 1, D] — broadcast ready + proj = (y_g * vn).sum(dim=-1, keepdim=True) * vn + return (y_g - proj).reshape(B, T, H, D) + def forward(self, x: Tensor, v_embed: Tensor | None = None) -> Tensor: + bsz, seqlen, dim = x.shape + q = self.c_q(x).reshape(bsz, seqlen, self.num_heads, self.head_dim) + k = self.c_k(x).reshape(bsz, seqlen, self.num_kv_heads, self.head_dim) + v = self.c_v(x) + if v_embed is not None: + v = v + v_embed + v = v.reshape(bsz, seqlen, self.num_kv_heads, self.head_dim) + q = F.rms_norm(q, (q.size(-1),)) + k = F.rms_norm(k, (k.size(-1),)) + cos, sin = self.rotary(seqlen, x.device, q.dtype) + q = apply_rotary_emb(q, cos, sin, self.rope_dims) + k = apply_rotary_emb(k, cos, sin, self.rope_dims) + q = q * self.q_gain.to(dtype=q.dtype)[None, None, :, None] + y = flash_attn_3_func(q, k, v, causal=True) + if self.use_xsa: + y = self._xsa_efficient(y, v) + y = y.reshape(bsz, seqlen, dim) + return self.proj(y) +class SmearGate(nn.Module): + def __init__(self, dim: int): + super().__init__() + self.gate = nn.Parameter(torch.zeros(dim, dtype=torch.float32)) + def forward(self, x: Tensor) -> Tensor: + g = torch.sigmoid(self.gate.to(dtype=x.dtype))[None, None, :] + x_prev = torch.cat([torch.zeros_like(x[:, :1]), x[:, :-1]], dim=1) + return (1 - g) * x + g * x_prev +class BigramHashEmbedding(nn.Module): + def __init__(self, bigram_vocab_size: int, bigram_dim: int, model_dim: int): + super().__init__() + self.bigram_vocab_size = bigram_vocab_size + self.embed = nn.Embedding(bigram_vocab_size, bigram_dim) + nn.init.zeros_(self.embed.weight) + self.proj = CastedLinear(bigram_dim, model_dim, bias=False) if bigram_dim != model_dim else None + if self.proj is not None: + nn.init.zeros_(self.proj.weight) + self.scale = nn.Parameter(torch.tensor(0.05, dtype=torch.float32)) + def bigram_hash(self, tokens: Tensor) -> Tensor: + t = tokens.to(torch.int32) + mod = self.bigram_vocab_size - 1 + out = torch.empty_like(t) + out[..., 0] = mod + out[..., 1:] = torch.bitwise_xor(36313 * t[..., 1:], 27191 * t[..., :-1]) % mod + return out.long() + def forward(self, token_ids: Tensor) -> Tensor: + h = self.embed(self.bigram_hash(token_ids)) + if self.proj is not None: + h = self.proj(h) + return h * self.scale.to(dtype=h.dtype) +class ValueEmbedding(nn.Module): + """Reinject token identity into attention values at specific layers. + Each table maps vocab tokens to a low-dim embedding, projected to model_dim.""" + def __init__(self, vocab_size: int, ve_dim: int, model_dim: int): + super().__init__() + self.embed = nn.Embedding(vocab_size, ve_dim) + nn.init.normal_(self.embed.weight, std=0.01) + self.proj = CastedLinear(ve_dim, model_dim, bias=False) if ve_dim != model_dim else None + if self.proj is not None: + nn.init.zeros_(self.proj.weight) + self.scale = nn.Parameter(torch.tensor(0.1, dtype=torch.float32)) + def forward(self, token_ids: Tensor) -> Tensor: + h = self.embed(token_ids) + if self.proj is not None: + h = self.proj(h) + return h * self.scale.to(dtype=h.dtype) +class MLP(nn.Module): + def __init__(self, dim: int, mlp_mult: int, mlp_act: str = "relu_sq", mlp_leaky_slope: float = 0.5): + super().__init__() + hidden = int(mlp_mult * dim) + self.fc = CastedLinear(dim, hidden, bias=False) + self.proj = CastedLinear(hidden, dim, bias=False) + self.proj._zero_init = True + self.mlp_act = mlp_act + self.mlp_leaky_slope = mlp_leaky_slope + if self.mlp_act not in {"relu_sq", "leaky_relu_sq"}: + raise ValueError(f"Unsupported MLP_ACT '{self.mlp_act}'. Use 'relu_sq' or 'leaky_relu_sq'.") + def forward(self, x: Tensor) -> Tensor: + x = self.fc(x) + if self.mlp_act == "leaky_relu_sq": + x = F.leaky_relu(x, negative_slope=self.mlp_leaky_slope) + else: + x = F.relu(x) + return self.proj(x.square()) +class Block(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + num_kv_heads: int, + mlp_mult: int, + rope_base: float, + qk_gain_init: float, + layer_idx: int = 0, + ln_scale: bool = False, + dtg: bool = False, + mlp_act: str = "relu_sq", + mlp_leaky_slope: float = 0.5, + ): + super().__init__() + self.attn_norm = RMSNorm() + self.mlp_norm = RMSNorm() + self.attn = CausalSelfAttention(dim, num_heads, num_kv_heads, rope_base, qk_gain_init) + self.mlp = MLP(dim, mlp_mult, mlp_act=mlp_act, mlp_leaky_slope=mlp_leaky_slope) + self.attn_scale = nn.Parameter(torch.ones(dim, dtype=torch.float32)) + self.mlp_scale = nn.Parameter(torch.ones(dim, dtype=torch.float32)) + self.resid_mix = nn.Parameter(torch.stack((torch.ones(dim), torch.zeros(dim))).float()) + self.ln_scale_factor = 1.0 / math.sqrt(layer_idx + 1) if ln_scale else 1.0 + if dtg: + self.dtg_gate = nn.Linear(dim, 1, bias=True) + nn.init.zeros_(self.dtg_gate.weight) + nn.init.constant_(self.dtg_gate.bias, 2.0) + else: + self.dtg_gate = None + def forward(self, x: Tensor, x0: Tensor, v_embed: Tensor | None = None) -> Tensor: + mix = self.resid_mix.to(dtype=x.dtype) + x_in = mix[0][None, None, :] * x + mix[1][None, None, :] * x0 + attn_out = self.attn(self.attn_norm(x_in) * self.ln_scale_factor, v_embed=v_embed) + x_out = x_in + self.attn_scale.to(dtype=x_in.dtype)[None, None, :] * attn_out + x_out = x_out + self.mlp_scale.to(dtype=x_out.dtype)[None, None, :] * self.mlp(self.mlp_norm(x_out) * self.ln_scale_factor) + if self.dtg_gate is not None: + gate = torch.sigmoid(self.dtg_gate(x_in.detach())) + x_out = x_in + gate * (x_out - x_in) + return x_out +class GPT(nn.Module): + def __init__( + self, + vocab_size: int, + num_layers: int, + model_dim: int, + num_heads: int, + num_kv_heads: int, + mlp_mult: int, + tie_embeddings: bool, + tied_embed_init_std: float, + logit_softcap: float, + rope_base: float, + qk_gain_init: float, + mtp_num_heads: int = 0, + mtp_loss_weight: float = 0.1, + bigram_vocab_size: int = 0, + bigram_dim: int = 128, + xsa_last_n: int = 0, + rope_dims: int = 0, + ln_scale: bool = False, + dtg: bool = False, + ve_enabled: bool = False, + ve_dim: int = 128, + ve_layers: str = "9,10", + mlp_act: str = "relu_sq", + mlp_leaky_slope: float = 0.5, + f1_corr_rank: int = 0, + f1_corr_scale_init: float = 0.10, + ): + super().__init__() + self._ve_target_dim = num_kv_heads * (model_dim // num_heads) # kv_dim for value projection + if logit_softcap <= 0.0: + raise ValueError(f"logit_softcap must be positive, got {logit_softcap}") + self.tie_embeddings = tie_embeddings + self.tied_embed_init_std = tied_embed_init_std + self.logit_softcap = logit_softcap + self.mtp_num_heads = mtp_num_heads + self.mtp_loss_weight = mtp_loss_weight + self.tok_emb = nn.Embedding(vocab_size, model_dim) + self.bigram = BigramHashEmbedding(bigram_vocab_size, bigram_dim, model_dim) if bigram_vocab_size > 0 else None + self.smear = SmearGate(model_dim) + self.num_encoder_layers = num_layers // 2 + self.num_decoder_layers = num_layers - self.num_encoder_layers + self.num_skip_weights = min(self.num_encoder_layers, self.num_decoder_layers) + self.skip_weights = nn.Parameter(torch.ones(self.num_skip_weights, model_dim, dtype=torch.float32)) + self.blocks = nn.ModuleList( + [ + Block( + model_dim, + num_heads, + num_kv_heads, + mlp_mult, + rope_base, + qk_gain_init, + layer_idx=i, + ln_scale=ln_scale, + dtg=dtg, + mlp_act=mlp_act, + mlp_leaky_slope=mlp_leaky_slope, + ) + for i in range(num_layers) + ] + ) + if rope_dims > 0: + head_dim = model_dim // num_heads + for block in self.blocks: + block.attn.rope_dims = rope_dims + block.attn.rotary = Rotary(head_dim, base=rope_base, train_seq_len=1024, rope_dims=rope_dims) + self.ve_layer_indices = [int(x) for x in ve_layers.split(",") if x.strip()] if ve_enabled else [] + kv_dim = self._ve_target_dim + if self.ve_layer_indices: + self.ve_shared = ValueEmbedding(vocab_size, ve_dim, kv_dim) + self.ve_layer_scales = nn.ParameterList( + [nn.Parameter(torch.ones(1, dtype=torch.float32)) for _ in self.ve_layer_indices] + ) + else: + self.ve_shared = None + self.ve_layer_scales = nn.ParameterList() + self.value_embeds = nn.ModuleList() # keep empty for compat + self.final_norm = RMSNorm() + self.lm_head = None if tie_embeddings else CastedLinear(model_dim, vocab_size, bias=False) + if self.lm_head is not None: + self.lm_head._zero_init = True + self.mtp_heads = nn.ModuleList( + [CastedLinear(model_dim, vocab_size, bias=False) for _ in range(mtp_num_heads)] + ) + for head in self.mtp_heads: + head._zero_init = True + # Low-rank correction path for extra capacity under size budget. + self.f1_corr_rank = f1_corr_rank + if f1_corr_rank > 0: + self.f1_corr_in = CastedLinear(model_dim, f1_corr_rank, bias=False) + self.f1_corr_out = CastedLinear(f1_corr_rank, vocab_size, bias=False) + self.f1_corr_out._zero_init = True + self.f1_corr_scale = nn.Parameter(torch.tensor(f1_corr_scale_init, dtype=torch.float32)) + else: + self.f1_corr_in = None + self.f1_corr_out = None + self.f1_corr_scale = None + if xsa_last_n > 0: + for i in range(max(0, num_layers - xsa_last_n), num_layers): + self.blocks[i].attn.use_xsa = True + self._init_weights() + def _init_weights(self) -> None: + if self.tie_embeddings: + nn.init.normal_(self.tok_emb.weight, mean=0.0, std=self.tied_embed_init_std) + num_layers = len(self.blocks) + for name, module in self.named_modules(): + if isinstance(module, nn.Linear): + if getattr(module, "_zero_init", False): + nn.init.zeros_(module.weight) + elif module.weight.ndim == 2 and module.weight.shape[0] >= 64 and module.weight.shape[1] >= 64: + nn.init.orthogonal_(module.weight, gain=1.0) + if ".proj." in name or name.endswith(".proj"): + with torch.no_grad(): + module.weight.mul_(1.0 / math.sqrt(2 * num_layers)) + def _get_ve(self, layer_idx: int, input_ids: Tensor, ve_cache: dict | None = None) -> Tensor | None: + """Get value embedding for a specific layer using shared table + per-layer scale.""" + if self.ve_shared is None or layer_idx not in self.ve_layer_indices: + return None + if ve_cache is not None and 've' not in ve_cache: + ve_cache['ve'] = self.ve_shared(input_ids) + ve_base = ve_cache['ve'] if ve_cache is not None else self.ve_shared(input_ids) + ve_idx = self.ve_layer_indices.index(layer_idx) + return ve_base * self.ve_layer_scales[ve_idx].to(dtype=ve_base.dtype) + def forward(self, input_ids: Tensor, target_ids: Tensor) -> Tensor: + x = self.tok_emb(input_ids) + if self.bigram is not None: + x = x + self.bigram(input_ids) + x = F.rms_norm(x, (x.size(-1),)) + x = self.smear(x) + x0 = x + skips: list[Tensor] = [] + ve_cache: dict = {} + for i in range(self.num_encoder_layers): + ve = self._get_ve(i, input_ids, ve_cache) + x = self.blocks[i](x, x0, v_embed=ve) + skips.append(x) + for i in range(self.num_decoder_layers): + bi = self.num_encoder_layers + i + if skips: + x = x + self.skip_weights[i].to(dtype=x.dtype)[None, None, :] * skips.pop() + ve = self._get_ve(bi, input_ids, ve_cache) + x = self.blocks[bi](x, x0, v_embed=ve) + x = self.final_norm(x) + x_flat = x.reshape(-1, x.size(-1)) + targets = target_ids.reshape(-1) + if self.tie_embeddings: + logits_proj = F.linear(x_flat, self.tok_emb.weight) + else: + if self.lm_head is None: + raise RuntimeError("lm_head is required when tie_embeddings=False") + logits_proj = self.lm_head(x_flat) + if self.f1_corr_in is not None and self.f1_corr_out is not None and self.f1_corr_scale is not None: + corr_hidden = F.silu(self.f1_corr_in(x_flat)) + corr_proj = self.f1_corr_out(corr_hidden) + logits_proj = logits_proj + self.f1_corr_scale.to(dtype=logits_proj.dtype) * corr_proj + logits = self.logit_softcap * torch.tanh(logits_proj / self.logit_softcap) + if hasattr(self, '_ngram_tracker') and self._ngram_tracker is not None and self.training: + per_tok_loss = F.cross_entropy(logits.float(), targets, reduction="none") + weights = self._ngram_tracker.get_weights(input_ids, target_ids) + main_loss = (per_tok_loss * weights).mean() + else: + main_loss = F.cross_entropy(logits.float(), targets, reduction="mean") + if self.training and self.mtp_num_heads > 0 and self.mtp_loss_weight > 0.0: + _, seqlen, dim = x.shape + mtp_loss_sum = x.new_zeros(()) + mtp_loss_count = 0 + for k, mtp_head in enumerate(self.mtp_heads): + valid_t = seqlen - (k + 1) + if valid_t <= 0: + continue + mtp_hidden = x[:, :valid_t, :].reshape(-1, dim) + mtp_targets = target_ids[:, k + 1 :].reshape(-1) + mtp_logits_proj = mtp_head(mtp_hidden) + mtp_logits = self.logit_softcap * torch.tanh(mtp_logits_proj / self.logit_softcap) + mtp_loss_sum = mtp_loss_sum + F.cross_entropy(mtp_logits.float(), mtp_targets, reduction="mean") + mtp_loss_count += 1 + if mtp_loss_count > 0: + main_loss = main_loss + self.mtp_loss_weight * (mtp_loss_sum / mtp_loss_count) + return main_loss + def forward_logits(self, input_ids: Tensor) -> Tensor: + """Return logits (bsz, seq_len, vocab) without computing loss.""" + x = self.tok_emb(input_ids) + if self.bigram is not None: + x = x + self.bigram(input_ids) + x = F.rms_norm(x, (x.size(-1),)) + x = self.smear(x) + x0 = x + skips: list[Tensor] = [] + ve_cache: dict = {} + for i in range(self.num_encoder_layers): + ve = self._get_ve(i, input_ids, ve_cache) + x = self.blocks[i](x, x0, v_embed=ve) + skips.append(x) + for i in range(self.num_decoder_layers): + bi = self.num_encoder_layers + i + if skips: + x = x + self.skip_weights[i].to(dtype=x.dtype)[None, None, :] * skips.pop() + ve = self._get_ve(bi, input_ids, ve_cache) + x = self.blocks[bi](x, x0, v_embed=ve) + x = self.final_norm(x) + if self.tie_embeddings: + logits_proj = F.linear(x, self.tok_emb.weight) + else: + logits_proj = self.lm_head(x) + if self.f1_corr_in is not None and self.f1_corr_out is not None and self.f1_corr_scale is not None: + corr_hidden = F.silu(self.f1_corr_in(x)) + corr_proj = self.f1_corr_out(corr_hidden) + logits_proj = logits_proj + self.f1_corr_scale.to(dtype=logits_proj.dtype) * corr_proj + return self.logit_softcap * torch.tanh(logits_proj / self.logit_softcap) +def eval_val_sliding( + args: Hyperparameters, + base_model: nn.Module, + rank: int, + world_size: int, + device: torch.device, + val_tokens: Tensor, + base_bytes_lut: Tensor, + has_leading_space_lut: Tensor, + is_boundary_token_lut: Tensor, + stride: int, + batch_seqs: int = 128, + eval_seq_len: int | None = None, +) -> tuple[float, float]: + """Sliding window evaluation: each token scored with maximum context.""" + seq_len = eval_seq_len or args.train_seq_len + total_tokens = val_tokens.numel() - 1 + window_starts = [ws for ws in range(0, total_tokens, stride) + if min(ws + seq_len, total_tokens) - ws >= 1] + total_windows = len(window_starts) + my_s = (total_windows * rank) // world_size + my_e = (total_windows * (rank + 1)) // world_size + my_windows = window_starts[my_s:my_e] + loss_sum = torch.zeros((), device=device, dtype=torch.float64) + token_count = torch.zeros((), device=device, dtype=torch.float64) + byte_count = torch.zeros((), device=device, dtype=torch.float64) + base_model.eval() + compiled_logits = maybe_torch_compile(base_model.forward_logits, args) + with torch.inference_mode(): + for bi in range(0, len(my_windows), batch_seqs): + batch_ws = my_windows[bi:bi + batch_seqs] + bsz = len(batch_ws) + x_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + y_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + wlens: list[int] = [] + for i, ws in enumerate(batch_ws): + end = min(ws + seq_len, total_tokens) + wlen = end - ws + wlens.append(wlen) + chunk = val_tokens[ws:end + 1].to(dtype=torch.int64, device=device) + x_batch[i, :wlen] = chunk[:-1] + y_batch[i, :wlen] = chunk[1:] + with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + logits = compiled_logits(x_batch) + nll = F.cross_entropy( + logits.reshape(-1, logits.size(-1)).float(), + y_batch.reshape(-1), + reduction="none", + ).reshape(bsz, seq_len) + for i, ws in enumerate(batch_ws): + wlen = wlens[i] + s = 0 if ws == 0 else max(wlen - stride, 0) + scored_nll = nll[i, s:wlen].to(torch.float64) + loss_sum += scored_nll.sum() + token_count += float(wlen - s) + tgt = y_batch[i, s:wlen] + prev = x_batch[i, s:wlen] + tb = base_bytes_lut[tgt].to(torch.float64) + tb += (has_leading_space_lut[tgt] & ~is_boundary_token_lut[prev]).to(torch.float64) + byte_count += tb.sum() + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(loss_sum, op=dist.ReduceOp.SUM) + dist.all_reduce(token_count, op=dist.ReduceOp.SUM) + dist.all_reduce(byte_count, op=dist.ReduceOp.SUM) + val_loss = (loss_sum / token_count).item() + bits_per_token = val_loss / math.log(2.0) + tokens_per_byte = token_count.item() / byte_count.item() + base_model.train() + return val_loss, bits_per_token * tokens_per_byte +def eval_ttt_lora( + model: nn.Module, + rank: int, + world_size: int, + device: torch.device, + val_tokens: Tensor, + *, + lora_rank: int = 8, + lr: float = 3e-4, + weight_decay: float = 0.0, + seq_len: int = 2048, + stride: int = 64, + polyak_decay: float = 0.998, +) -> nn.Module: + """Score-first LoRA TTT: adapt Q/V projections on already-evaluated val tokens. + Returns the adapted model (base weights frozen, LoRA deltas active).""" + total_tokens = val_tokens.numel() - 1 + model.eval() + for p in model.parameters(): + p.requires_grad_(False) + # Inject LoRA adapters into Q and V projections + lora_params = [] + lora_modules = [] + for block in model.blocks: + attn = block.attn + for proj_name in ("c_q", "c_v"): + base = getattr(attn, proj_name) + in_f = base.weight.shape[1] + out_f = base.weight.shape[0] + lora_A = nn.Parameter(torch.randn(in_f, lora_rank, device=device, dtype=torch.float32) * 0.01) + lora_B = nn.Parameter(torch.zeros(lora_rank, out_f, device=device, dtype=torch.float32)) + lora_params.extend([lora_A, lora_B]) + lora_modules.append((attn, proj_name, base, lora_A, lora_B)) + # Monkey-patch forward to include LoRA delta + orig_forwards = {} + for attn, proj_name, base, lora_A, lora_B in lora_modules: + orig_forward = base.forward + orig_forwards[(id(attn), proj_name)] = orig_forward + def make_lora_forward(orig_fn, A, B): + def lora_forward(x): + return orig_fn(x) + (x.float() @ A @ B).to(x.dtype) + return lora_forward + base.forward = make_lora_forward(orig_forward, lora_A, lora_B) + # Polyak-averaged copies + polyak_state = [p.detach().clone() for p in lora_params] + optimizer = torch.optim.AdamW(lora_params, lr=lr, weight_decay=weight_decay) + # Score-first TTT: slide over val tokens, score then adapt + t0 = time.perf_counter() + steps = 0 + for ws in range(0, total_tokens, stride * 16): # coarse stride for speed + end = min(ws + seq_len, total_tokens) + wlen = end - ws + if wlen < 2: + continue + x = val_tokens[ws:end].unsqueeze(0).to(device=device, dtype=torch.int64) + y = val_tokens[ws + 1:end + 1].unsqueeze(0).to(device=device, dtype=torch.int64) + optimizer.zero_grad() + with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + logits = model.forward_logits(x) + loss = F.cross_entropy(logits.float().reshape(-1, logits.size(-1)), y.reshape(-1)) + loss.backward() + optimizer.step() + # Polyak average + with torch.no_grad(): + for i, p in enumerate(lora_params): + polyak_state[i].mul_(polyak_decay).add_(p.data, alpha=1.0 - polyak_decay) + steps += 1 + # Apply Polyak-averaged weights + with torch.no_grad(): + for i, p in enumerate(lora_params): + p.data.copy_(polyak_state[i]) + elapsed = time.perf_counter() - t0 + if rank == 0: + print(f"ttt_lora:done steps={steps} rank={lora_rank} lr={lr} " + f"polyak={polyak_decay} time={elapsed:.1f}s", flush=True) + model.eval() + return model + + +def _ngram_bulk_update(val_np, start, end, ctx_tables, full_tables, + min_order, max_order, primes, mask): + """Bulk update n-gram tables with a contiguous range of tokens. + All ranks call this with the SAME token range -> identical tables everywhere.""" + t = val_np[start:end].astype(np.uint64) + n = len(t) + for order in range(min_order, max_order + 1): + if n < order: + continue + ctx_width = order - 1 + ctx_hash = np.zeros(n - order + 1, dtype=np.uint64) + for k in range(ctx_width): + ctx_hash ^= t[k:n - order + 1 + k] * primes[k % len(primes)] + ctx_key = (ctx_hash & mask).astype(np.int64) + tgt = t[order - 1:] + full_key = ((ctx_hash ^ (tgt * primes[ctx_width % len(primes)])) & mask).astype(np.int64) + ctx_tables[order] += np.bincount(ctx_key, minlength=len(ctx_tables[order])).astype(np.uint32) + full_tables[order] += np.bincount(full_key, minlength=len(full_tables[order])).astype(np.uint32) + +def eval_val_sliding_hashed_ngram( + args: Hyperparameters, + base_model: nn.Module, + rank: int, + world_size: int, + device: torch.device, + val_tokens: Tensor, + base_bytes_lut: Tensor, + has_leading_space_lut: Tensor, + is_boundary_token_lut: Tensor, + stride: int, + order: int, + alpha: float, + min_count: int, + buckets: int, + max_seconds: float = 0.0, + batch_seqs: int = 128, + eval_seq_len: int | None = None, +) -> tuple[float, float, float]: + """Score-first sliding eval with chunk-based SHARED n-gram tables + cubric. + + Key design: all ranks share identical n-gram tables via bulk chunk updates. + Each chunk's windows are distributed across ranks for scoring, then ALL ranks + update tables with the same contiguous token range. Every rank sees the full + n-gram picture (not 1/world_size like per-segment updates). + + Legal: entire chunk scored before its tokens update the tables. + """ + min_order = max(args.ngram_eval_min_order, 2) + max_order = max(order, min_order) + adaptive = args.ngram_eval_adaptive + alpha_min = args.ngram_eval_alpha_min + alpha_max = args.ngram_eval_alpha_max + ent_center = args.ngram_eval_entropy_center + ent_scale = args.ngram_eval_entropy_scale + + # Parse fixed per-order multipliers (PR #809 style) + _fixed_order_mults = None + if args.ngram_order_mults_str: + _fixed_order_mults = np.array([float(x) for x in args.ngram_order_mults_str.split(",")], dtype=np.float64) + + seq_len = eval_seq_len or args.train_seq_len + total_tokens = val_tokens.numel() - 1 + + # Build all windows and total scored tokens + all_window_starts = [ws for ws in range(0, total_tokens, stride) if min(ws + seq_len, total_tokens) - ws >= 1] + total_scored_tokens = 0.0 + for ws in all_window_starts: + end = min(ws + seq_len, total_tokens) + wlen = end - ws + s = 0 if ws == 0 else max(wlen - stride, 0) + total_scored_tokens += float(max(wlen - s, 0)) + + # Group windows into chunks by scored position -- all ranks share this grouping + chunk_tokens = int(os.environ.get("NGRAM_CHUNK_TOKENS", "1048576")) # 1M default + num_chunks = (total_tokens + chunk_tokens - 1) // chunk_tokens + chunk_windows: list[list[int]] = [[] for _ in range(num_chunks)] + for ws in all_window_starts: + end = min(ws + seq_len, total_tokens) + wlen = end - ws + s = 0 if ws == 0 else max(wlen - stride, 0) + scored_start = ws + s + ci = min(scored_start // chunk_tokens, num_chunks - 1) + chunk_windows[ci].append(ws) + + val_np = val_tokens.numpy() + ctx_tables = {n: np.zeros((buckets,), dtype=np.uint32) for n in range(min_order, max_order + 1)} + full_tables = {n: np.zeros((buckets,), dtype=np.uint32) for n in range(min_order, max_order + 1)} + mask = np.uint64(buckets - 1) + primes = np.array( + [np.uint64(36313), np.uint64(27191), np.uint64(51647), np.uint64(81929), + np.uint64(131071), np.uint64(174763), np.uint64(233017), np.uint64(283721), + np.uint64(347237)], + dtype=np.uint64, + ) + + loss_sum = 0.0 + token_count = 0.0 + byte_count = 0.0 + + # Cubric 3D: per (order × entropy_bin × count_bin) adaptive alpha scaling + _NUM_ENT_BINS = 3 # low / mid / high entropy + _NUM_CNT_BINS = 3 # low / mid / high count + _ENT_EDGES = np.array([ent_center - 1.0, ent_center + 1.0]) # [2.0, 4.0] for center=3.0 + _CNT_EDGES = np.array([5.0, 50.0]) # low=<5, mid=5-50, high=>50 context count + _TOTAL_CELLS = _NUM_ENT_BINS * _NUM_CNT_BINS # 9 cells per order = 54 total + _cc = getattr(args, 'cubric_cadence', 0); _con = _cc > 0; _cfired = 0 + if _con: + # Warm-start: proven converged values from 4+ runs (orders 2-7) + # All 9 cells per order get the same warm-start, 3D cubric refines from there + _WARM = {2: 0.45, 3: 0.30, 4: 0.45, 5: 1.88, 6: 2.00, 7: 2.00, 8: 2.00, 9: 2.00} + _c_alpha_mult = {n: [_WARM.get(n, 1.0)] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + _c_hits = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + _c_beats = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + + base_model.eval() + compiled_logits = maybe_torch_compile(base_model.forward_logits, args) + t0 = time.perf_counter() + deadline = (t0 + max_seconds) if max_seconds > 0.0 else None + cutoff_hit = False + + if rank == 0: + print(f"ngram_eval:chunks={num_chunks} chunk_tokens={chunk_tokens} " + f"windows={len(all_window_starts)} shared_tables=True", flush=True) + + with torch.inference_mode(): + for ci in range(num_chunks): + if deadline is not None and time.perf_counter() >= deadline: + cutoff_hit = True + break + + windows = chunk_windows[ci] + if not windows: + continue + + # Distribute this chunk's windows across ranks + my_s = (len(windows) * rank) // world_size + my_e = (len(windows) * (rank + 1)) // world_size + my_windows = windows[my_s:my_e] + + # --- Phase 1: SCORE this chunk's windows --- + for bi in range(0, len(my_windows), batch_seqs): + batch_ws = my_windows[bi:bi + batch_seqs] + bsz = len(batch_ws) + x_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + y_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + wlens: list[int] = [] + for i, ws in enumerate(batch_ws): + end = min(ws + seq_len, total_tokens) + wlen = end - ws + wlens.append(wlen) + chunk = val_tokens[ws:end + 1].to(dtype=torch.int64, device=device) + x_batch[i, :wlen] = chunk[:-1] + y_batch[i, :wlen] = chunk[1:] + + with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + logits = compiled_logits(x_batch) + logits_f = logits.float() + nll = F.cross_entropy( + logits_f.reshape(-1, logits_f.size(-1)), + y_batch.reshape(-1), + reduction="none", + ).reshape(bsz, seq_len) + + for i, ws in enumerate(batch_ws): + wlen = wlens[i] + s = 0 if ws == 0 else max(wlen - stride, 0) + seg_len = wlen - s + if seg_len <= 0: + continue + + seg_nll = nll[i, s:wlen].to(torch.float64).cpu().numpy() + seg_model_p = np.exp(-seg_nll) + + if adaptive: + log_probs = F.log_softmax(logits_f[i, s:wlen], dim=-1) + probs_a = log_probs.exp() + entropy = -(probs_a * log_probs).sum(dim=-1).cpu().numpy() + sig = 1.0 / (1.0 + np.exp(-ent_scale * (entropy - ent_center))) + per_token_alpha = alpha_min + (alpha_max - alpha_min) * sig + # Bin entropy for 2D cubric: 0=low, 1=mid, 2=high + _ent_bins = np.digitize(entropy, _ENT_EDGES).astype(np.int32) + else: + per_token_alpha = np.full(seg_len, alpha) + _ent_bins = np.ones(seg_len, dtype=np.int32) # all mid + + global_j = np.arange(ws + s + 1, ws + wlen + 1, dtype=np.int64) + p_ng = np.zeros(seg_len, dtype=np.float64) + ng_matched = np.zeros(seg_len, dtype=np.bool_) + _ng_ord = np.zeros(seg_len, dtype=np.int32) + _ng_ctx_count = np.zeros(seg_len, dtype=np.float64) + tgt_np = val_np[global_j].astype(np.uint64) + + for n in range(max_order, min_order - 1, -1): + ctx_width = n - 1 + valid = (global_j >= ctx_width) & (~ng_matched) + if not valid.any(): + continue + v_idx = np.nonzero(valid)[0] + jv = global_j[v_idx] + ctx_hash = np.zeros(len(jv), dtype=np.uint64) + for k in range(ctx_width): + tok = val_np[jv - (ctx_width - k)].astype(np.uint64) + ctx_hash ^= tok * primes[k % len(primes)] + ctx_key = (ctx_hash & mask).astype(np.int64) + full_key = ((ctx_hash ^ (tgt_np[v_idx] * primes[ctx_width % len(primes)])) & mask).astype(np.int64) + ctx_counts = ctx_tables[n][ctx_key].astype(np.float64) + full_counts = full_tables[n][full_key].astype(np.float64) + has_data = ctx_counts >= float(min_count) + if has_data.any(): + p = np.minimum(full_counts, ctx_counts) / np.maximum(ctx_counts, 1.0) + p = np.clip(p, 0.0, 1.0) + hit_idx = v_idx[has_data] + p_ng[hit_idx] = p[has_data] + ng_matched[hit_idx] = True + _ng_ord[hit_idx] = n + _ng_ctx_count[hit_idx] = ctx_counts[has_data] + + # Oracle alpha: use actual model_p vs ngram_p comparison + if ng_matched.any(): + m_idx = np.nonzero(ng_matched)[0] + mp = seg_model_p[m_idx] + np_val = p_ng[m_idx] + # Soft oracle: sigmoid on log-ratio, steepness=8 + log_ratio = np.log(np.maximum(np_val, 1e-12)) - np.log(np.maximum(mp, 1e-12)) + a = 0.95 / (1.0 + np.exp(-8.0 * log_ratio)) + seg_model_p[m_idx] = (1.0 - a) * mp + a * np_val + + seg_nll = -np.log(np.clip(seg_model_p, 1e-12, 1.0)) + loss_sum += float(seg_nll.sum()) + token_count += float(seg_len) + tgt = y_batch[i, s:wlen] + prev = x_batch[i, s:wlen] + tb = base_bytes_lut[tgt].to(torch.float64) + tb += (has_leading_space_lut[tgt] & ~is_boundary_token_lut[prev]).to(torch.float64) + byte_count += float(tb.sum().item()) + + # --- Phase 2: SHARED UPDATE -- all ranks update with same chunk tokens --- + chunk_start = ci * chunk_tokens + chunk_end = min((ci + 1) * chunk_tokens, total_tokens) + _ngram_bulk_update(val_np, chunk_start, chunk_end + 1, + ctx_tables, full_tables, min_order, max_order, + primes, mask) + + # Cubric 2D c-step: adapt per (order × entropy_bin) + if _con: + # Collect all (order, ent_bin, cnt_bin) cells with enough data + all_rates = [] + for n in range(min_order, max_order + 1): + for cell in range(_TOTAL_CELLS): + if _c_hits[n][cell] >= 8: + all_rates.append(_c_beats[n][cell] / _c_hits[n][cell]) + if len(all_rates) >= 4: + avg_rate = sum(all_rates) / len(all_rates) + for n in range(min_order, max_order + 1): + for cell in range(_TOTAL_CELLS): + if _c_hits[n][cell] >= 8: + rate = _c_beats[n][cell] / _c_hits[n][cell] + if rate > avg_rate + 0.05: + _c_alpha_mult[n][cell] = min(_c_alpha_mult[n][cell] * 1.03, 2.0) + elif rate < avg_rate - 0.05: + _c_alpha_mult[n][cell] = max(_c_alpha_mult[n][cell] * 0.97, 0.3) + _cfired += 1 + if rank == 0 and _cfired % 8 == 0: + parts = [] + for n in range(min_order, max_order + 1): + m = _c_alpha_mult[n] + avg_m = sum(m) / len(m) + parts.append(f"o{n}:avg={avg_m:.2f}") + print(f"cubric3d:step={_cfired} {' '.join(parts)}", flush=True) + _c_hits = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + _c_beats = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + + # Progress + if rank == 0 and (ci % 10 == 0 or ci == num_chunks - 1 or ci < 3): + elapsed = time.perf_counter() - t0 + cur_bpb = (loss_sum / max(token_count, 1.0)) / math.log(2.0) * (token_count / max(byte_count, 1.0)) if token_count > 0 else 0.0 + print( + f"ngram_eval:chunk [{ci+1}/{num_chunks}] bpb={cur_bpb:.6f} t={elapsed:.0f}s", + flush=True, + ) + + # All-reduce across ranks + _loss = torch.tensor(loss_sum, device=device, dtype=torch.float64) + _toks = torch.tensor(token_count, device=device, dtype=torch.float64) + _bytes = torch.tensor(byte_count, device=device, dtype=torch.float64) + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(_loss, op=dist.ReduceOp.SUM) + dist.all_reduce(_toks, op=dist.ReduceOp.SUM) + dist.all_reduce(_bytes, op=dist.ReduceOp.SUM) + loss_sum = _loss.item() + token_count = _toks.item() + byte_count = _bytes.item() + + coverage = token_count / max(total_scored_tokens, 1.0) + if cutoff_hit: + elapsed = time.perf_counter() - t0 + print( + f"ngram_eval:cutoff max_seconds={max_seconds:.1f} " + f"coverage={coverage*100:.2f}% elapsed={elapsed:.0f}s", + flush=True, + ) + + if _con and rank == 0: + print(f"cubric3d:final c_steps={_cfired} cells={_TOTAL_CELLS}x{max_order-min_order+1}={_TOTAL_CELLS*(max_order-min_order+1)}", flush=True) + for n in range(min_order, max_order + 1): + m = _c_alpha_mult[n] + row = " ".join(f"{m[cell]:.2f}" for cell in range(_TOTAL_CELLS)) + print(f" o{n}: [{row}]", flush=True) + val_loss = loss_sum / max(token_count, 1.0) + val_bpb = val_loss / math.log(2.0) * (token_count / max(byte_count, 1.0)) + base_model.train() + return val_loss, val_bpb, coverage +def _classify_param(name: str) -> str: + if "tok_emb" in name or "lm_head" in name: + return "embed" + if "f1_corr_in" in name or "f1_corr_out" in name: + return "aux" + if ".mlp." in name: + return "mlp" + if ".attn." in name or (".proj." in name and ".mlp." not in name): + return "attn" + return "other" +# --------------------------------------------------------------------------- +# GPTQ: Hessian-aware quantization with column-wise error compensation +# --------------------------------------------------------------------------- +def _find_best_row_scales(W: Tensor, clip_range: int = 31) -> Tensor: + """Find optimal per-row scales by searching percentile clipping thresholds.""" + t32 = W.float() + best_s = t32.abs().amax(dim=1) / clip_range + best_s = best_s.clamp_min(1.0 / clip_range) + best_err = torch.full((t32.shape[0],), float('inf')) + for pct in [0.9990, 0.9995, 0.9999, 0.99999, 1.0]: + if pct < 1.0: + row_clip = torch.quantile(t32.abs(), pct, dim=1) + else: + row_clip = t32.abs().amax(dim=1) + s = (row_clip / clip_range).clamp_min(1.0 / clip_range) + q = torch.clamp(torch.round(t32 / s[:, None]), -clip_range, clip_range) + recon = q * s[:, None] + err = (t32 - recon).pow(2).mean(dim=1) + improved = err < best_err + best_s[improved] = s[improved] + best_err[improved] = err[improved] + return best_s +def gptq_quantize_weight(W: Tensor, H: Tensor, clip_range: int = 31, + block_size: int = 64, percdamp: float = 0.002) -> tuple[Tensor, Tensor]: + """GPTQ: quantize weight matrix W using Hessian H = X^T X for error compensation. + Uses pre-computed per-row scales and column reordering by Hessian diagonal. + Returns (quantized_int8, scale_fp16) in int6 range [-clip_range, clip_range].""" + W = W.float().clone() + rows, cols = W.shape + # Pre-compute optimal per-row scales from the original weight matrix + row_scale = _find_best_row_scales(W, clip_range) + H = H.float().clone() + damp = percdamp * H.diag().mean() + H.diagonal().add_(damp) + # Column reordering: process least-important columns first (ascending H_diag) + perm = torch.argsort(H.diag()) + invperm = torch.argsort(perm) + W = W[:, perm] + H = H[perm][:, perm] + try: + L = torch.linalg.cholesky(H) + Hinv = torch.cholesky_inverse(L) + except torch._C._LinAlgError: + Hinv = torch.diag(1.0 / H.diag().clamp_min(1e-6)) + Q = torch.zeros(rows, cols, dtype=torch.int8) + for i1 in range(0, cols, block_size): + i2 = min(i1 + block_size, cols) + W_block = W[:, i1:i2].clone() + Hinv_block = Hinv[i1:i2, i1:i2] + Err = torch.zeros_like(W_block) + for j in range(i2 - i1): + w_col = W_block[:, j] + h_inv_jj = Hinv_block[j, j].clamp_min(1e-8) + # Quantize using pre-computed per-row scales + q_col = torch.clamp(torch.round(w_col / row_scale), -clip_range, clip_range) + deq_col = q_col * row_scale + Q[:, i1 + j] = q_col.to(torch.int8) + err = (w_col - deq_col) / h_inv_jj + Err[:, j] = err + if j + 1 < i2 - i1: + W_block[:, j + 1:] -= err.unsqueeze(1) * Hinv_block[j, j + 1:].unsqueeze(0) + if i2 < cols: + W[:, i2:] -= Err @ Hinv[i1:i2, i2:] + # Undo column reordering + Q = Q[:, invperm] + return Q, row_scale.to(torch.float16) +def gptq_calibrate(model: nn.Module, train_pattern: str, device: torch.device, + n_samples: int = 256, seq_len: int = 2048) -> dict[str, Tensor]: + """Collect Hessian H = X^T X for each linear layer using training data.""" + hessians: dict[str, Tensor] = {} + n_seen: dict[str, int] = {} + hooks = [] + def make_hook(name: str): + def hook_fn(module, inp, out): + x = inp[0].detach().float() + if x.ndim == 3: + x = x.reshape(-1, x.shape[-1]) + if name not in hessians: + hessians[name] = torch.zeros(x.shape[1], x.shape[1], device=x.device, dtype=torch.float32) + n_seen[name] = 0 + hessians[name].addmm_(x.t(), x) + n_seen[name] += x.shape[0] + return hook_fn + for name, module in model.named_modules(): + if isinstance(module, (nn.Linear, CastedLinear)): + hooks.append(module.register_forward_hook(make_hook(name))) + stream = TokenStream(train_pattern) + model.eval() + with torch.no_grad(): + for _ in range(n_samples): + tokens = stream.take(seq_len + 1).to(device=device, dtype=torch.int64) + x = tokens[:-1].unsqueeze(0) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + model.forward_logits(x) + for h in hooks: + h.remove() + for name in hessians: + hessians[name] /= max(n_seen[name], 1) + return hessians +def mixed_quantize_int6_gptq(state_dict: dict[str, Tensor], int6_cats: set[str], + hessians: dict[str, Tensor]) -> tuple[dict, dict]: + """Like mixed_quantize_int6 but uses GPTQ for int6 categories when Hessian available.""" + result: dict[str, Tensor] = {} + meta: dict[str, object] = {} + gptq_count, naive_count = 0, 0 + for name, tensor in state_dict.items(): + t = tensor.detach().cpu().contiguous() + cat = _classify_param(name) + if not t.is_floating_point() or t.numel() <= 65536: + result[name] = t.to(torch.float16) if t.is_floating_point() else t + meta[name] = "passthrough" + continue + if any(p in name for p in CONTROL_TENSOR_NAME_PATTERNS): + result[name] = t.float() + meta[name] = "passthrough_ctrl" + continue + if cat in int6_cats and t.ndim == 2: + module_name = name.rsplit(".weight", 1)[0] if name.endswith(".weight") else name + H = hessians.get(module_name) + if H is not None and H.shape[0] == t.shape[1]: + q, s = gptq_quantize_weight(t, H.cpu()) + gptq_count += 1 + else: + q, s = quantize_int6_per_row(t) + naive_count += 1 + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int6"} + elif cat in int6_cats and t.ndim >= 1: + q, s = quantize_int6_per_row(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int6"} + naive_count += 1 + else: + q, s = quantize_float_tensor(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int8"} + print(f"gptq_quantize: {gptq_count} GPTQ layers, {naive_count} naive layers", flush=True) + return result, meta +def quantize_int6_per_row(t: Tensor, clip_range: int = 31) -> tuple[Tensor, Tensor]: + t32 = t.float() + if t32.ndim == 2: + best_q, best_s, best_err = None, None, float('inf') + for pct in [0.9990, 0.9995, 0.9999, 0.99999, 1.0]: + if pct < 1.0: + row_clip = torch.quantile(t32.abs(), pct, dim=1) + else: + row_clip = t32.abs().amax(dim=1) + s = (row_clip / clip_range).clamp_min(1.0 / clip_range).to(torch.float16) + q = torch.clamp(torch.round(t32 / s.float()[:, None]), -clip_range, clip_range).to(torch.int8) + recon = q.float() * s.float()[:, None] + err = (t32 - recon).pow(2).mean().item() + if err < best_err: + best_q, best_s, best_err = q, s, err + return best_q, best_s + amax = t32.abs().max().item() + scale = torch.tensor(amax / clip_range if amax > 0 else 1.0, dtype=torch.float16) + q = torch.clamp(torch.round(t32 / scale.float()), -clip_range, clip_range).to(torch.int8) + return q, scale +def mixed_quantize_int6(state_dict: dict[str, Tensor], int6_cats: set[str]): + num_layers_total = max( + (int(k.split(".")[1]) for k in state_dict if k.startswith("blocks.")), + default=0, + ) + 1 + late_k_layers = set(range(num_layers_total - 2, num_layers_total)) + result: dict[str, Tensor] = {} + meta: dict[str, object] = {} + for name, tensor in state_dict.items(): + t = tensor.detach().cpu().contiguous() + cat = _classify_param(name) + if not t.is_floating_point() or t.numel() <= 65536: + result[name] = t.to(torch.float16) if t.is_floating_point() else t + meta[name] = "passthrough" + continue + if any(p in name for p in CONTROL_TENSOR_NAME_PATTERNS): + result[name] = t.float() + meta[name] = "passthrough_ctrl" + continue + if cat in int6_cats and t.ndim >= 1: + q, s = quantize_int6_per_row(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int6"} + else: + q, s = quantize_float_tensor(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int8"} + return result, meta +def dequantize_mixed_int6(result: dict[str, Tensor], meta: dict[str, object], + template_sd: dict[str, Tensor]) -> dict[str, Tensor]: + out: dict[str, Tensor] = {} + for name, orig in template_sd.items(): + info = meta.get(name) + if info is None: + continue + orig_dtype = orig.dtype + if info in ("passthrough", "passthrough_ctrl", "passthrough_fp16"): + t = result[name] + if t.dtype == torch.float16 and orig_dtype in (torch.float32, torch.bfloat16): + t = t.to(orig_dtype) + out[name] = t + continue + q, s = result[name + ".q"], result[name + ".scale"] + if s.ndim > 0: + out[name] = (q.float() * s.float().view(q.shape[0], *([1] * (q.ndim - 1)))).to(orig_dtype) + else: + out[name] = (q.float() * float(s.item())).to(orig_dtype) + return out +def main() -> None: + global zeropower_via_newtonschulz5 + code = Path(__file__).read_text(encoding="utf-8") + args = Hyperparameters() + if args.compile_enabled: + zeropower_via_newtonschulz5 = torch.compile(zeropower_via_newtonschulz5) + distributed = "RANK" in os.environ and "WORLD_SIZE" in os.environ + rank = int(os.environ.get("RANK", "0")) + world_size = int(os.environ.get("WORLD_SIZE", "1")) + local_rank = int(os.environ.get("LOCAL_RANK", "0")) + if world_size <= 0: + raise ValueError(f"WORLD_SIZE must be positive, got {world_size}") + if 8 % world_size != 0: + raise ValueError(f"WORLD_SIZE={world_size} must divide 8 so grad_accum_steps stays integral") + grad_accum_steps = 8 // world_size + grad_scale = 1.0 / grad_accum_steps + if not torch.cuda.is_available(): + raise RuntimeError("CUDA is required") + device = torch.device("cuda", local_rank) + torch.cuda.set_device(device) + if distributed: + dist.init_process_group(backend="nccl", device_id=device) + dist.barrier() + master_process = rank == 0 + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True + from torch.backends.cuda import enable_cudnn_sdp, enable_flash_sdp, enable_math_sdp, enable_mem_efficient_sdp + enable_cudnn_sdp(False) + enable_flash_sdp(True) + enable_mem_efficient_sdp(False) + enable_math_sdp(False) + logfile = None + if master_process: + os.makedirs("logs", exist_ok=True) + logfile = f"logs/{args.run_id}.txt" + print(logfile) + def log0(msg: str, console: bool = True) -> None: + if not master_process: + return + if console: + print(msg) + if logfile is not None: + with open(logfile, "a", encoding="utf-8") as f: + print(msg, file=f) + log0(code, console=False) + log0("=" * 100, console=False) + log0(f"Running Python {sys.version}", console=False) + log0(f"Running PyTorch {torch.__version__}", console=False) + log0( + subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=False).stdout, + console=False, + ) + log0("=" * 100, console=False) + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.cuda.manual_seed_all(args.seed) + if not args.tokenizer_path.endswith(".model"): + raise ValueError(f"Script only setup for SentencePiece .model file: {args.tokenizer_path}") + sp = spm.SentencePieceProcessor(model_file=args.tokenizer_path) + if int(sp.vocab_size()) != args.vocab_size: + raise ValueError( + f"VOCAB_SIZE={args.vocab_size} does not match tokenizer vocab_size={int(sp.vocab_size())}" + ) + dataset_dir = Path(args.data_path).resolve() + actual_train_files = len(list(dataset_dir.glob("fineweb_train_*.bin"))) + effective_eval_seq_len = args.eval_seq_len if args.eval_seq_len > 0 else args.train_seq_len + val_seq_len = max(args.train_seq_len, effective_eval_seq_len) + val_tokens = load_validation_tokens(args.val_files, val_seq_len) + base_bytes_lut, has_leading_space_lut, is_boundary_token_lut = build_sentencepiece_luts( + sp, args.vocab_size, device + ) + log0(f"val_bpb:enabled tokenizer_kind=sentencepiece tokenizer_path={args.tokenizer_path}") + log0(f"train_loader:dataset:{dataset_dir.name} train_shards:{actual_train_files}") + log0(f"val_loader:shards pattern={args.val_files} tokens:{val_tokens.numel() - 1}") + CastedLinear._qat_enabled = args.qat_enabled + base_model = GPT( + vocab_size=args.vocab_size, + num_layers=args.num_layers, + model_dim=args.model_dim, + num_heads=args.num_heads, + num_kv_heads=args.num_kv_heads, + mlp_mult=args.mlp_mult, + tie_embeddings=args.tie_embeddings, + tied_embed_init_std=args.tied_embed_init_std, + logit_softcap=args.logit_softcap, + rope_base=args.rope_base, + qk_gain_init=args.qk_gain_init, + mtp_num_heads=args.mtp_num_heads, + mtp_loss_weight=args.mtp_loss_weight, + bigram_vocab_size=args.bigram_vocab_size, + bigram_dim=args.bigram_dim, + xsa_last_n=args.xsa_last_n, + rope_dims=args.rope_dims, + ln_scale=args.ln_scale, + dtg=args.dtg_enabled, + ve_enabled=args.ve_enabled, + ve_dim=args.ve_dim, + ve_layers=args.ve_layers, + mlp_act=args.mlp_act, + mlp_leaky_slope=args.mlp_leaky_slope, + f1_corr_rank=args.f1_corr_rank, + f1_corr_scale_init=args.f1_corr_scale_init, + ).to(device).bfloat16() + for module in base_model.modules(): + if isinstance(module, CastedLinear): + module.float() + restore_low_dim_params_to_fp32(base_model) + # Complementary training: downweight tokens predictable by bigrams + complement_alpha = float(os.environ.get("COMPLEMENT_ALPHA", "0")) + if complement_alpha > 0: + tracker = TrainNgramTracker(args.vocab_size, device, complement_alpha=complement_alpha) + base_model._ngram_tracker = tracker + log0(f"complementary_training:alpha={complement_alpha}") + else: + base_model._ngram_tracker = None + compiled_model = maybe_torch_compile(base_model, args) + model: nn.Module = DDP(compiled_model, device_ids=[local_rank], broadcast_buffers=False) if distributed else compiled_model + block_named_params = list(base_model.blocks.named_parameters()) + matrix_params = [ + p + for name, p in block_named_params + if p.ndim == 2 and not any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS) + ] + if base_model.mtp_num_heads > 0: + matrix_params.extend([p for p in base_model.mtp_heads.parameters() if p.ndim == 2]) + if base_model.f1_corr_in is not None and base_model.f1_corr_out is not None: + matrix_params.append(base_model.f1_corr_in.weight) + matrix_params.append(base_model.f1_corr_out.weight) + scalar_params = [ + p + for name, p in block_named_params + if p.ndim < 2 or any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS) + ] + if base_model.skip_weights.numel() > 0: + scalar_params.append(base_model.skip_weights) + scalar_params.append(base_model.smear.gate) + if base_model.bigram is not None: + scalar_params.append(base_model.bigram.scale) + if base_model.f1_corr_scale is not None: + scalar_params.append(base_model.f1_corr_scale) + token_lr = args.tied_embed_lr if args.tie_embeddings else args.embed_lr + tok_params = [{"params": [base_model.tok_emb.weight], "lr": token_lr, "base_lr": token_lr}] + if base_model.bigram is not None: + tok_params.append({"params": [base_model.bigram.embed.weight], "lr": token_lr, "base_lr": token_lr}) + if base_model.bigram.proj is not None: + matrix_params.append(base_model.bigram.proj.weight) + if base_model.ve_shared is not None: + tok_params.append({"params": [base_model.ve_shared.embed.weight], "lr": token_lr, "base_lr": token_lr}) + if base_model.ve_shared.proj is not None: + matrix_params.append(base_model.ve_shared.proj.weight) + scalar_params.append(base_model.ve_shared.scale) + for s in base_model.ve_layer_scales: + scalar_params.append(s) + optimizer_tok = torch.optim.AdamW( + tok_params, + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + weight_decay=args.adam_wd, + fused=True, + ) + optimizer_muon = Muon( + matrix_params, + lr=args.matrix_lr, + momentum=args.muon_momentum, + backend_steps=args.muon_backend_steps, + weight_decay=args.muon_wd, + ) + for group in optimizer_muon.param_groups: + group["base_lr"] = args.matrix_lr + optimizer_scalar = torch.optim.AdamW( + [{"params": scalar_params, "lr": args.scalar_lr, "base_lr": args.scalar_lr}], + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + weight_decay=args.adam_wd, + fused=True, + ) + optimizers: list[torch.optim.Optimizer] = [optimizer_tok, optimizer_muon, optimizer_scalar] + if base_model.lm_head is not None: + optimizer_head = torch.optim.Adam( + [{"params": [base_model.lm_head.weight], "lr": args.head_lr, "base_lr": args.head_lr}], + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + fused=True, + ) + optimizers.insert(1, optimizer_head) + n_params = sum(p.numel() for p in base_model.parameters()) + f1_corr_params = 0 + if base_model.f1_corr_in is not None and base_model.f1_corr_out is not None: + f1_corr_params = int(base_model.f1_corr_in.weight.numel() + base_model.f1_corr_out.weight.numel()) + est_corr_int6_bytes = 0 + if args.f1_corr_rank > 0: + # int8 payload stores int6 values + per-row fp16 scales. + est_corr_int6_bytes = ( + args.f1_corr_rank * (args.model_dim + args.vocab_size) + + 2 * (args.f1_corr_rank + args.vocab_size) + ) + log0(f"model_params:{n_params}") + log0( + f"f1_corr:rank={args.f1_corr_rank} params={f1_corr_params} " + f"est_int6_bytes~{est_corr_int6_bytes}" + ) + log0(f"mlp_act:{args.mlp_act} mlp_leaky_slope:{args.mlp_leaky_slope}") + log0(f"XSA:last_{args.xsa_last_n} world_size:{world_size} grad_accum_steps:{grad_accum_steps}") + log0(f"num_heads:{args.num_heads} num_kv_heads:{args.num_kv_heads} embed_lr:{token_lr} matrix_lr:{args.matrix_lr}") + log0( + f"train_batch_tokens:{args.train_batch_tokens} train_seq_len:{args.train_seq_len} " + f"iterations:{args.iterations} warmup_steps:{args.warmup_steps} " + f"max_wallclock_seconds:{args.max_wallclock_seconds:.3f}" + ) + log0(f"compile:enabled={int(args.compile_enabled)} fullgraph={int(args.compile_fullgraph)}") + log0(f"seed:{args.seed}") + if args.ngram_eval_order >= 2: + log0( + f"ngram_eval:order={args.ngram_eval_order} alpha={args.ngram_eval_alpha} " + f"min_count={args.ngram_eval_min_count} buckets={args.ngram_eval_buckets}" + ) + train_loader = DistributedTokenLoader(args.train_files, rank, world_size, device) + def zero_grad_all() -> None: + for opt in optimizers: + opt.zero_grad(set_to_none=True) + max_wallclock_ms = 1000.0 * args.max_wallclock_seconds if args.max_wallclock_seconds > 0 else None + def lr_mul(step: int, elapsed_ms: float) -> float: + if args.warmdown_iters <= 0: + return 1.0 + if max_wallclock_ms is None: + warmdown_start = max(args.iterations - args.warmdown_iters, 0) + return max((args.iterations - step) / max(args.warmdown_iters, 1), 0.0) if warmdown_start <= step < args.iterations else 1.0 + step_ms = elapsed_ms / max(step, 1) + warmdown_ms = args.warmdown_iters * step_ms + remaining_ms = max(max_wallclock_ms - elapsed_ms, 0.0) + return remaining_ms / max(warmdown_ms, 1e-9) if remaining_ms <= warmdown_ms else 1.0 + if args.warmup_steps > 0: + initial_model_state = {name: tensor.detach().cpu().clone() for name, tensor in base_model.state_dict().items()} + initial_optimizer_states = [copy.deepcopy(opt.state_dict()) for opt in optimizers] + model.train() + for warmup_step in range(args.warmup_steps): + zero_grad_all() + for micro_step in range(grad_accum_steps): + if distributed: + model.require_backward_grad_sync = micro_step == grad_accum_steps - 1 + x, y = train_loader.next_batch(args.train_batch_tokens, args.train_seq_len, grad_accum_steps) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + warmup_loss = model(x, y) + (warmup_loss * grad_scale).backward() + for opt in optimizers: + opt.step() + zero_grad_all() + if args.warmup_steps <= 20 or (warmup_step + 1) % 10 == 0 or warmup_step + 1 == args.warmup_steps: + log0(f"warmup_step:{warmup_step + 1}/{args.warmup_steps}") + base_model.load_state_dict(initial_model_state, strict=True) + for opt, state in zip(optimizers, initial_optimizer_states, strict=True): + opt.load_state_dict(state) + zero_grad_all() + if distributed: + model.require_backward_grad_sync = True + train_loader = DistributedTokenLoader(args.train_files, rank, world_size, device) + swa_state: dict[str, Tensor] | None = None + swa_count = 0 + ema_state = {name: t.detach().float().clone() for name, t in base_model.state_dict().items()} + ema_decay = 0.997 + training_time_ms = 0.0 + stop_after_step: int | None = None + torch.cuda.synchronize() + t0 = time.perf_counter() + step = 0 + while True: + last_step = step == args.iterations or (stop_after_step is not None and step >= stop_after_step) + should_validate = last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0) + if should_validate: + torch.cuda.synchronize() + training_time_ms += 1000.0 * (time.perf_counter() - t0) + val_loss, val_bpb = eval_val( + args, + model, + rank, + world_size, + device, + grad_accum_steps, + val_tokens, + base_bytes_lut, + has_leading_space_lut, + is_boundary_token_lut, + ) + log0( + f"step:{step}/{args.iterations} val_loss:{val_loss:.4f} val_bpb:{val_bpb:.4f} " + f"train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms / max(step, 1):.2f}ms" + ) + torch.cuda.synchronize() + t0 = time.perf_counter() + if last_step: + if stop_after_step is not None and step < args.iterations: + log0( + f"stopping_early: wallclock_cap train_time:{training_time_ms:.0f}ms " + f"step:{step}/{args.iterations}" + ) + break + elapsed_ms = training_time_ms + 1000.0 * (time.perf_counter() - t0) + scale = lr_mul(step, elapsed_ms) + if args.late_qat_threshold > 0 and scale < args.late_qat_threshold and not CastedLinear._qat_enabled: + CastedLinear._qat_enabled = True + log0(f"late_qat:enabled step:{step} scale:{scale:.4f}") + zero_grad_all() + train_loss = torch.zeros((), device=device) + for micro_step in range(grad_accum_steps): + if distributed: + model.require_backward_grad_sync = micro_step == grad_accum_steps - 1 + x, y = train_loader.next_batch(args.train_batch_tokens, args.train_seq_len, grad_accum_steps) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + loss = model(x, y) + train_loss += loss.detach() + loss.backward() + if base_model._ngram_tracker is not None: + base_model._ngram_tracker.update(x, y) + train_loss /= grad_accum_steps + frac = min(step / args.muon_momentum_warmup_steps, 1.0) if args.muon_momentum_warmup_steps > 0 else 1.0 + muon_momentum = (1 - frac) * args.muon_momentum_warmup_start + frac * args.muon_momentum + for group in optimizer_muon.param_groups: + group["momentum"] = muon_momentum + for opt in optimizers: + for group in opt.param_groups: + group["lr"] = group["base_lr"] * scale + if args.grad_clip_norm > 0: + torch.nn.utils.clip_grad_norm_(base_model.parameters(), args.grad_clip_norm) + for opt in optimizers: + opt.step() + zero_grad_all() + # EMA update + with torch.no_grad(): + for name, t in base_model.state_dict().items(): + ema_state[name].mul_(ema_decay).add_(t.detach().float(), alpha=1.0 - ema_decay) + step += 1 + approx_training_time_ms = training_time_ms + 1000.0 * (time.perf_counter() - t0) + if args.swa_enabled and scale < 0.2 and step % args.swa_every == 0: + if swa_state is None: + swa_state = {name: t.detach().cpu().clone() for name, t in base_model.state_dict().items()} + swa_count = 1 + log0(f"swa:start step:{step}") + else: + for name, t in base_model.state_dict().items(): + swa_state[name] += t.detach().cpu() + swa_count += 1 + should_log_train = ( + args.train_log_every > 0 + and (step <= 10 or step % args.train_log_every == 0 or stop_after_step is not None) + ) + if should_log_train: + log0( + f"step:{step}/{args.iterations} train_loss:{train_loss.item():.4f} " + f"train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms / step:.2f}ms" + ) + reached_cap = max_wallclock_ms is not None and approx_training_time_ms >= max_wallclock_ms + if distributed and max_wallclock_ms is not None: + reached_cap_tensor = torch.tensor(int(reached_cap), device=device) + dist.all_reduce(reached_cap_tensor, op=dist.ReduceOp.MAX) + reached_cap = bool(reached_cap_tensor.item()) + if stop_after_step is None and reached_cap: + stop_after_step = step + log0( + f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB " + f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB" + ) + # GPTQ calibration: collect Hessians from training data DURING training phase + # (must happen before training ends to comply with eval-time data access rules) + log0("gptq:calibrating with training data...") + t_gptq = time.perf_counter() + gptq_hessians = gptq_calibrate(base_model, args.train_files, device, n_samples=256, seq_len=args.train_seq_len) + log0(f"gptq:calibrated {len(gptq_hessians)} layers in {time.perf_counter()-t_gptq:.1f}s") + if args.distill_enabled and args.distill_steps > 0: + log0( + f"distill:start steps:{args.distill_steps} lr_factor:{args.distill_lr_factor} " + f"temp:{args.distill_temperature} alpha:{args.distill_alpha} kl_clip:{args.distill_kl_clip}" + ) + current_state = base_model.state_dict() + teacher_state = {name: t.to(dtype=current_state[name].dtype) for name, t in ema_state.items()} + teacher_model = GPT( + vocab_size=args.vocab_size, num_layers=args.num_layers, model_dim=args.model_dim, + num_heads=args.num_heads, num_kv_heads=args.num_kv_heads, mlp_mult=args.mlp_mult, + tie_embeddings=args.tie_embeddings, tied_embed_init_std=args.tied_embed_init_std, + logit_softcap=args.logit_softcap, rope_base=args.rope_base, qk_gain_init=args.qk_gain_init, + mtp_num_heads=args.mtp_num_heads, mtp_loss_weight=args.mtp_loss_weight, + bigram_vocab_size=args.bigram_vocab_size, bigram_dim=args.bigram_dim, + xsa_last_n=args.xsa_last_n, rope_dims=args.rope_dims, ln_scale=args.ln_scale, dtg=args.dtg_enabled, + ve_enabled=args.ve_enabled, ve_dim=args.ve_dim, ve_layers=args.ve_layers, + mlp_act=args.mlp_act, mlp_leaky_slope=args.mlp_leaky_slope, + f1_corr_rank=args.f1_corr_rank, f1_corr_scale_init=args.f1_corr_scale_init, + ).to(device).bfloat16() + for m in teacher_model.modules(): + if isinstance(m, CastedLinear): + m.float() + restore_low_dim_params_to_fp32(teacher_model) + teacher_model.load_state_dict(teacher_state, strict=True) + teacher_model.eval() + for p in teacher_model.parameters(): + p.requires_grad_(False) + compiled_teacher_logits = maybe_torch_compile(teacher_model.forward_logits, args) + model.train() + T = args.distill_temperature + alpha = args.distill_alpha + for d_step in range(args.distill_steps): + zero_grad_all() + for opt in optimizers: + for group in opt.param_groups: + group["lr"] = group["base_lr"] * args.distill_lr_factor + x, y = train_loader.next_batch(args.train_batch_tokens, args.train_seq_len, grad_accum_steps) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + student_logits = base_model.forward_logits(x) + with torch.no_grad(): + teacher_logits = compiled_teacher_logits(x) + student_log_probs = F.log_softmax(student_logits.float() / T, dim=-1) + teacher_probs = F.softmax(teacher_logits.float() / T, dim=-1) + token_kl = F.kl_div(student_log_probs, teacher_probs, reduction="none").sum(dim=-1) + kl_loss = token_kl.mean() * (T * T) + if args.distill_kl_clip > 0: + kl_loss = torch.clamp(kl_loss, max=args.distill_kl_clip) + ce_loss = F.cross_entropy( + student_logits.reshape(-1, student_logits.size(-1)).float(), + y.reshape(-1), + reduction="mean", + ) + loss = alpha * kl_loss + (1.0 - alpha) * ce_loss + (loss * grad_scale).backward() + if world_size > 1: + for p in base_model.parameters(): + if p.grad is not None: + dist.all_reduce(p.grad, op=dist.ReduceOp.AVG) + if args.grad_clip_norm > 0: + torch.nn.utils.clip_grad_norm_(base_model.parameters(), args.grad_clip_norm) + for opt in optimizers: + opt.step() + zero_grad_all() + with torch.no_grad(): + for name, t in base_model.state_dict().items(): + ema_state[name].mul_(ema_decay).add_(t.detach().float(), alpha=1.0 - ema_decay) + if (d_step + 1) % 8 == 0 or d_step == 0: + log0( + f"distill:step:{d_step + 1}/{args.distill_steps} " + f"kl:{kl_loss.item():.4f} ce:{ce_loss.item():.4f} total:{loss.item():.4f}" + ) + del teacher_model, compiled_teacher_logits + torch.cuda.empty_cache() + log0("distill:done") + # Apply EMA weights (better than SWA alone per PR#401) + log0("ema:applying EMA weights") + current_state = base_model.state_dict() + avg_state = {name: t.to(dtype=current_state[name].dtype) for name, t in ema_state.items()} + base_model.load_state_dict(avg_state, strict=True) + torch.cuda.synchronize() + t_diag = time.perf_counter() + diag_val_loss, diag_val_bpb = eval_val( + args, compiled_model, rank, world_size, device, grad_accum_steps, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + ) + torch.cuda.synchronize() + log0( + f"DIAGNOSTIC post_ema val_loss:{diag_val_loss:.4f} val_bpb:{diag_val_bpb:.4f} " + f"eval_time:{1000.0 * (time.perf_counter() - t_diag):.0f}ms" + ) + full_state_dict = base_model.state_dict() + export_sd = {k: v for k, v in full_state_dict.items() if "mtp_heads" not in k} + excluded_mtp = sum(int(t.numel()) for k, t in full_state_dict.items() if "mtp_heads" in k) + if excluded_mtp > 0: + log0(f"export_excluding_mtp_params:{excluded_mtp}") + if master_process: + torch.save(export_sd, "final_model.pt") + model_bytes = os.path.getsize("final_model.pt") + code_bytes = len(code.encode("utf-8")) + log0(f"Serialized model: {model_bytes} bytes") + log0(f"Code size: {code_bytes} bytes") + sd_cpu = {k: v.detach().cpu() for k, v in export_sd.items()} + # GPTQ quantization using Hessians collected during training phase (no training data access here) + quant_result, quant_meta = mixed_quantize_int6_gptq(sd_cpu, {"mlp", "attn", "aux"}, gptq_hessians) + quant_buf = io.BytesIO() + torch.save({"w": quant_result, "m": quant_meta}, quant_buf) + quant_raw = quant_buf.getvalue() + quant_blob = zstandard.ZstdCompressor(level=22).compress(quant_raw) if _COMPRESSOR == "zstd" else zlib.compress(quant_raw, 9) + if master_process: + with open("final_model.int6.ptz", "wb") as f: + f.write(quant_blob) + quant_file_bytes = len(quant_blob) + code_bytes = len(code.encode("utf-8")) + log0(f"Serialized model int6+{_COMPRESSOR}: {quant_file_bytes} bytes") + log0(f"Total submission size int6+{_COMPRESSOR}: {quant_file_bytes + code_bytes} bytes") + log0(f"Total submission size int8+zlib: {quant_file_bytes + code_bytes} bytes") + if distributed: + dist.barrier() + with open("final_model.int6.ptz", "rb") as f: + quant_blob_disk = f.read() + quant_state = torch.load( + io.BytesIO(zstandard.ZstdDecompressor().decompress(quant_blob_disk) if _COMPRESSOR == "zstd" else zlib.decompress(quant_blob_disk)), + map_location="cpu", + ) + deq_state = dequantize_mixed_int6(quant_state["w"], quant_state["m"], sd_cpu) + eval_model = GPT( + vocab_size=args.vocab_size, num_layers=args.num_layers, model_dim=args.model_dim, + num_heads=args.num_heads, num_kv_heads=args.num_kv_heads, mlp_mult=args.mlp_mult, + tie_embeddings=args.tie_embeddings, tied_embed_init_std=args.tied_embed_init_std, + logit_softcap=args.logit_softcap, rope_base=args.rope_base, qk_gain_init=args.qk_gain_init, + mtp_num_heads=0, mtp_loss_weight=0.0, + bigram_vocab_size=args.bigram_vocab_size, bigram_dim=args.bigram_dim, + xsa_last_n=args.xsa_last_n, # must match training model + rope_dims=args.rope_dims, ln_scale=args.ln_scale, dtg=args.dtg_enabled, + ve_enabled=args.ve_enabled, ve_dim=args.ve_dim, ve_layers=args.ve_layers, + mlp_act=args.mlp_act, mlp_leaky_slope=args.mlp_leaky_slope, + f1_corr_rank=args.f1_corr_rank, f1_corr_scale_init=args.f1_corr_scale_init, + ).to(device).bfloat16() + for m in eval_model.modules(): + if isinstance(m, CastedLinear): + m.float() + restore_low_dim_params_to_fp32(eval_model) + eval_model.load_state_dict(deq_state, strict=True) + compiled_eval = maybe_torch_compile(eval_model, args) + torch.cuda.synchronize() + t_qeval = time.perf_counter() + q_val_loss, q_val_bpb = eval_val( + args, compiled_eval, rank, world_size, device, grad_accum_steps, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + eval_seq_len=effective_eval_seq_len, + ) + torch.cuda.synchronize() + log0( + f"final_int6_roundtrip val_loss:{q_val_loss:.4f} val_bpb:{q_val_bpb:.4f} " + f"eval_time:{1000.0 * (time.perf_counter() - t_qeval):.0f}ms" + ) + log0(f"final_int6_roundtrip_exact val_loss:{q_val_loss:.8f} val_bpb:{q_val_bpb:.8f}") + sw_seq_len = effective_eval_seq_len + if args.eval_stride > 0 and args.eval_stride < sw_seq_len: + torch.cuda.synchronize() + t_slide = time.perf_counter() + sw_val_loss, sw_val_bpb = eval_val_sliding( + args, eval_model, rank, world_size, device, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + stride=args.eval_stride, + eval_seq_len=sw_seq_len, + ) + torch.cuda.synchronize() + log0( + f"final_int6_sliding_window val_loss:{sw_val_loss:.4f} val_bpb:{sw_val_bpb:.4f} " + f"stride:{args.eval_stride} eval_time:{1000.0 * (time.perf_counter() - t_slide):.0f}ms" + ) + log0(f"final_int6_sliding_window_exact val_loss:{sw_val_loss:.8f} val_bpb:{sw_val_bpb:.8f}") + log0(f"final_int8_zlib_roundtrip_exact val_loss:{sw_val_loss:.8f} val_bpb:{sw_val_bpb:.8f}") + # TTT: adapt model with LoRA before n-gram eval + ttt_enabled = bool(int(os.environ.get("TTT_EVAL_ENABLED", "0"))) + if ttt_enabled: + if distributed: + dist.barrier() + torch.cuda.synchronize() + t_ttt = time.perf_counter() + eval_model = eval_ttt_lora( + eval_model, rank, world_size, device, val_tokens, + lora_rank=int(os.environ.get("TTT_LORA_RANK", "8")), + lr=float(os.environ.get("TTT_LR", "3e-4")), + seq_len=sw_seq_len, + stride=args.eval_stride, + ) + torch.cuda.synchronize() + ttt_ms = 1000.0 * (time.perf_counter() - t_ttt) + # Measure TTT-adapted model BPB + ttt_loss, ttt_bpb = eval_val_sliding( + args, eval_model, rank, world_size, device, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + stride=args.eval_stride, eval_seq_len=sw_seq_len, + ) + log0(f"final_ttt_sliding val_loss:{ttt_loss:.4f} val_bpb:{ttt_bpb:.4f} " + f"ttt_time:{ttt_ms:.0f}ms") + log0(f"final_ttt_sliding_exact val_loss:{ttt_loss:.8f} val_bpb:{ttt_bpb:.8f}") + if distributed: + dist.barrier() + if args.ngram_eval_order >= 2: + if distributed: + dist.barrier() + torch.cuda.synchronize() + t_ng = time.perf_counter() + ng_loss, ng_bpb, ng_coverage = eval_val_sliding_hashed_ngram( + args, + eval_model, + rank, + world_size, + device, + val_tokens, + base_bytes_lut, + has_leading_space_lut, + is_boundary_token_lut, + stride=args.eval_stride, + order=args.ngram_eval_order, + alpha=args.ngram_eval_alpha, + min_count=args.ngram_eval_min_count, + buckets=args.ngram_eval_buckets, + max_seconds=args.ngram_eval_max_seconds, + eval_seq_len=sw_seq_len, + ) + if rank == 0: + torch.cuda.synchronize() + ng_eval_ms = 1000.0 * (time.perf_counter() - t_ng) + if ng_coverage >= 0.999999: + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order} val_loss:{ng_loss:.4f} " + f"val_bpb:{ng_bpb:.4f} eval_time:{ng_eval_ms:.0f}ms" + ) + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order}_exact " + f"val_loss:{ng_loss:.8f} val_bpb:{ng_bpb:.8f}" + ) + else: + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order}_partial val_loss:{ng_loss:.4f} " + f"val_bpb:{ng_bpb:.4f} coverage:{ng_coverage:.4f} eval_time:{ng_eval_ms:.0f}ms" + ) + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order}_partial_exact " + f"val_loss:{ng_loss:.8f} val_bpb:{ng_bpb:.8f} coverage:{ng_coverage:.8f}" + ) + if distributed: + dist.barrier() + if distributed: + dist.destroy_process_group() +if __name__ == "__main__": + main() diff --git a/experiments/A_wing/purple/run.sh b/experiments/A_wing/purple/run.sh new file mode 100755 index 000000000..3e917da2d --- /dev/null +++ b/experiments/A_wing/purple/run.sh @@ -0,0 +1,81 @@ +#!/bin/bash +set -euo pipefail +# A-WING PURPLE: Learned Mixer Head — Legal N-gram Ceiling Finder +# Trains a Linear(512→12) head to predict per-token expert weights +# (neural + 11 n-gram orders 2-12). Training oracle prefilled from +# training data. Eval uses backward-looking val-data cache. +# Base: Green_1 SOTA 0.3200 BPB (neural 1.1195) + +SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd -- "${SCRIPT_DIR}/../../.." && pwd)" +cd "${REPO_ROOT}" +export PYTHONPATH="${REPO_ROOT}/flash-attention/hopper:${PYTHONPATH:-}" + +SEED="${SEED:-1337}" +NPROC_PER_NODE="${NPROC_PER_NODE:-8}" + +# --- Pre-flight checks --- +echo "[preflight] checking zstandard..." +python3 -c "import zstandard; print(f' zstandard {zstandard.__version__} OK')" 2>/dev/null \ + || { echo " FATAL: zstandard not found. pip install zstandard"; exit 1; } + +echo "[preflight] checking flash_attn..." +python3 -c " +try: + import flash_attn_interface; print(' FA3 (hopper) OK') +except ImportError: + import flash_attn; v=flash_attn.__version__ + if v.startswith('3'): print(f' FA3 v{v} OK') + else: print(f' WARNING: FA{v[0]} detected — want FA3') +" 2>/dev/null || echo " WARNING: no flash_attn found" + +echo "============================================" +echo " A-WING PURPLE — Learned Mixer Head" +echo " Seed: ${SEED}" +echo " Mixer: Linear(512→12), 11 n-gram orders 2-12" +echo " 12 hash primes, INT6, no cubric" +echo " Training cap: 540s (60s reserved for GPTQ + eval)" +echo "============================================" + +SEED="$SEED" \ +F1_CORR_RANK=0 \ +DISTILL_ENABLED=0 \ +MLP_ACT=leaky_relu_sq \ +MLP_LEAKY_SLOPE=0.5 \ +XSA_LAST_N=4 \ +BIGRAM_VOCAB_SIZE=1536 \ +TTT_EVAL_ENABLED=0 \ +ROPE_DIMS=24 \ +VAL_LOSS_EVERY=20000 \ +TRAIN_LOG_EVERY=1000 \ +SWA_EVERY=100 \ +COMPLEMENT_ALPHA=0.5 \ +MIXER_ENABLED=1 \ +MIXER_N_ORDERS=11 \ +MIXER_LOSS_WEIGHT=0.1 \ +MIXER_NEURAL_FLOOR=0.05 \ +MIXER_BUCKETS=8388608 \ +MIXER_PREFILL_MAX_SHARDS=80 \ +NGRAM_EVAL_ORDER=12 \ +NGRAM_EVAL_MIN_ORDER=2 \ +NGRAM_EVAL_ADAPTIVE=1 \ +NGRAM_EVAL_ALPHA=0.30 \ +NGRAM_EVAL_ALPHA_MIN=0.05 \ +NGRAM_EVAL_ALPHA_MAX=0.60 \ +NGRAM_EVAL_ENTROPY_CENTER=3.0 \ +NGRAM_EVAL_ENTROPY_SCALE=2.0 \ +NGRAM_EVAL_MIN_COUNT=2 \ +NGRAM_EVAL_BUCKETS=8388608 \ +NGRAM_EVAL_MAX_SECONDS=0 \ +CUBRIC_CADENCE=0 \ +NGRAM_ENTROPY_SHIFT=1 \ +NGRAM_ORDER_MULTS="" \ +MAX_WALLCLOCK_SECONDS=540 \ +COMPILE_FULLGRAPH=0 \ +torchrun --standalone --nproc_per_node="${NPROC_PER_NODE}" \ + "${SCRIPT_DIR}/train_gpt.py" \ + 2>&1 | tee "logs/awing_purple_s${SEED}_$(date +%Y%m%d_%H%M%S).log" + +echo "============================================" +echo " DONE" +echo "============================================" diff --git a/experiments/A_wing/purple/train_gpt.py b/experiments/A_wing/purple/train_gpt.py new file mode 100644 index 000000000..001c00dca --- /dev/null +++ b/experiments/A_wing/purple/train_gpt.py @@ -0,0 +1,2365 @@ +from __future__ import annotations +import copy +import glob +import io +import math +import os +import random +import subprocess +import sys +import time +import uuid +import zlib +from pathlib import Path +try: + import zstandard + _COMPRESSOR = "zstd" +except ImportError: + import warnings + warnings.warn("zstandard not found — falling back to zlib. Artifact will be ~1.5MB larger! pip install zstandard") + _COMPRESSOR = "zlib" +import numpy as np +import sentencepiece as spm +import torch +import torch.distributed as dist +import torch.nn.functional as F +from torch import Tensor, nn +from torch.nn.parallel import DistributedDataParallel as DDP +try: + from flash_attn_interface import flash_attn_func as flash_attn_3_func +except ImportError: + def flash_attn_3_func(q, k, v, causal=False): + # q: (B, T, Hq, D), k/v: (B, T, Hkv, D) — expand KV for GQA + q2 = q.transpose(1, 2) # (B, Hq, T, D) + k2 = k.transpose(1, 2) # (B, Hkv, T, D) + v2 = v.transpose(1, 2) + if k2.size(1) != q2.size(1): + rep = q2.size(1) // k2.size(1) + k2 = k2.repeat_interleave(rep, dim=1) + v2 = v2.repeat_interleave(rep, dim=1) + out = torch.nn.functional.scaled_dot_product_attention(q2, k2, v2, is_causal=causal) + return out.transpose(1, 2) +class Hyperparameters: + data_path = os.environ.get("DATA_PATH", "./data/datasets/fineweb10B_sp1024") + train_files = os.path.join(data_path, "fineweb_train_*.bin") + val_files = os.path.join(data_path, "fineweb_val_*.bin") + tokenizer_path = os.environ.get("TOKENIZER_PATH", "./data/tokenizers/fineweb_1024_bpe.model") + run_id = os.environ.get("RUN_ID", str(uuid.uuid4())) + seed = int(os.environ.get("SEED", 1337)) + val_batch_size = int(os.environ.get("VAL_BATCH_SIZE", 524_288)) + val_loss_every = int(os.environ.get("VAL_LOSS_EVERY", 4000)) + train_log_every = int(os.environ.get("TRAIN_LOG_EVERY", 500)) + iterations = int(os.environ.get("ITERATIONS", 20000)) + warmdown_iters = int(os.environ.get("WARMDOWN_ITERS", 3500)) + warmup_steps = int(os.environ.get("WARMUP_STEPS", 20)) + train_batch_tokens = int(os.environ.get("TRAIN_BATCH_TOKENS", 786_432)) + train_seq_len = int(os.environ.get("TRAIN_SEQ_LEN", 2048)) + eval_seq_len = int(os.environ.get("EVAL_SEQ_LEN", 2048)) + max_wallclock_seconds = float(os.environ.get("MAX_WALLCLOCK_SECONDS", 600.0)) + qk_gain_init = float(os.environ.get("QK_GAIN_INIT", 1.5)) + vocab_size = int(os.environ.get("VOCAB_SIZE", 1024)) + num_layers = int(os.environ.get("NUM_LAYERS", 11)) + num_kv_heads = int(os.environ.get("NUM_KV_HEADS", 4)) + model_dim = int(os.environ.get("MODEL_DIM", 512)) + num_heads = int(os.environ.get("NUM_HEADS", 8)) + mlp_mult = float(os.environ.get("MLP_MULT", 3.0)) + mlp_act = os.environ.get("MLP_ACT", "relu_sq").lower() + mlp_leaky_slope = float(os.environ.get("MLP_LEAKY_SLOPE", 0.5)) + tie_embeddings = bool(int(os.environ.get("TIE_EMBEDDINGS", "1"))) + rope_base = float(os.environ.get("ROPE_BASE", 10000.0)) + logit_softcap = float(os.environ.get("LOGIT_SOFTCAP", 30.0)) + embed_lr = float(os.environ.get("EMBED_LR", 0.6)) + head_lr = float(os.environ.get("HEAD_LR", 0.008)) + tied_embed_lr = float(os.environ.get("TIED_EMBED_LR", 0.035)) + tied_embed_init_std = float(os.environ.get("TIED_EMBED_INIT_STD", 0.005)) + matrix_lr = float(os.environ.get("MATRIX_LR", 0.025)) + scalar_lr = float(os.environ.get("SCALAR_LR", 0.025)) + muon_momentum = float(os.environ.get("MUON_MOMENTUM", 0.99)) + muon_backend_steps = int(os.environ.get("MUON_BACKEND_STEPS", 5)) + muon_momentum_warmup_start = float(os.environ.get("MUON_MOMENTUM_WARMUP_START", 0.92)) + muon_momentum_warmup_steps = int(os.environ.get("MUON_MOMENTUM_WARMUP_STEPS", 1500)) + beta1 = float(os.environ.get("BETA1", 0.9)) + beta2 = float(os.environ.get("BETA2", 0.95)) + adam_eps = float(os.environ.get("ADAM_EPS", 1e-8)) + grad_clip_norm = float(os.environ.get("GRAD_CLIP_NORM", 0.3)) + eval_stride = int(os.environ.get("EVAL_STRIDE", 64)) + mtp_num_heads = int(os.environ.get("MTP_NUM_HEADS", 0)) + mtp_loss_weight = float(os.environ.get("MTP_LOSS_WEIGHT", 0.2)) + muon_beta2 = float(os.environ.get("MUON_BETA2", 0.95)) + swa_enabled = bool(int(os.environ.get("SWA_ENABLED", "1"))) + swa_every = int(os.environ.get("SWA_EVERY", 50)) # tighter: collect more recent checkpoints + muon_wd = float(os.environ.get("MUON_WD", 0.04)) + adam_wd = float(os.environ.get("ADAM_WD", 0.04)) + qat_enabled = bool(int(os.environ.get("QAT_ENABLED", "0"))) + bigram_vocab_size = int(os.environ.get("BIGRAM_VOCAB_SIZE", 2048)) + bigram_dim = int(os.environ.get("BIGRAM_DIM", 128)) + xsa_last_n = int(os.environ.get("XSA_LAST_N", 11)) # XSA on ALL 11 layers + rope_dims = int(os.environ.get("ROPE_DIMS", 16)) + ln_scale = bool(int(os.environ.get("LN_SCALE", "1"))) + dtg_enabled = bool(int(os.environ.get("DTG_ENABLED", "0"))) + late_qat_threshold = float(os.environ.get("LATE_QAT_THRESHOLD", 0.5)) + ve_enabled = bool(int(os.environ.get("VE_ENABLED", "1"))) + ve_dim = int(os.environ.get("VE_DIM", 128)) + ve_layers = os.environ.get("VE_LAYERS", "9,10") + # F1 capacity add-on: low-rank correction head (active at inference). + # Approx extra params ~= rank * (model_dim + vocab_size). + f1_corr_rank = int(os.environ.get("F1_CORR_RANK", 0)) + f1_corr_scale_init = float(os.environ.get("F1_CORR_SCALE_INIT", 0.10)) + # Post-train self-distillation: EMA teacher -> student. + distill_enabled = bool(int(os.environ.get("DISTILL_ENABLED", "0"))) + distill_steps = int(os.environ.get("DISTILL_STEPS", 24)) + distill_lr_factor = float(os.environ.get("DISTILL_LR_FACTOR", 0.02)) + distill_temperature = float(os.environ.get("DISTILL_TEMPERATURE", 1.5)) + distill_alpha = float(os.environ.get("DISTILL_ALPHA", 0.60)) + distill_kl_clip = float(os.environ.get("DISTILL_KL_CLIP", 10.0)) + # Optional legal score-first hashed n-gram interpolation at eval time. + # Multi-order backoff (2..max_order) with entropy-adaptive alpha. + # Alpha depends only on model entropy (no target/label access). + ngram_eval_order = int(os.environ.get("NGRAM_EVAL_ORDER", 0)) # 0=off, max order for backoff + ngram_eval_min_order = int(os.environ.get("NGRAM_EVAL_MIN_ORDER", 2)) # min order for backoff + ngram_eval_alpha = float(os.environ.get("NGRAM_EVAL_ALPHA", 0.30)) # base alpha (or fixed if adaptive off) + ngram_eval_adaptive = bool(int(os.environ.get("NGRAM_EVAL_ADAPTIVE", "1"))) # entropy-adaptive alpha + ngram_eval_alpha_min = float(os.environ.get("NGRAM_EVAL_ALPHA_MIN", 0.05)) # alpha floor (confident model) + ngram_eval_alpha_max = float(os.environ.get("NGRAM_EVAL_ALPHA_MAX", 0.60)) # alpha ceiling (uncertain model) + ngram_eval_entropy_center = float(os.environ.get("NGRAM_EVAL_ENTROPY_CENTER", 4.0)) # sigmoid center + ngram_eval_entropy_scale = float(os.environ.get("NGRAM_EVAL_ENTROPY_SCALE", 2.0)) # sigmoid steepness + ngram_eval_min_count = int(os.environ.get("NGRAM_EVAL_MIN_COUNT", 2)) + ngram_eval_buckets = int(os.environ.get("NGRAM_EVAL_BUCKETS", 4_194_304)) + ngram_eval_max_seconds = float(os.environ.get("NGRAM_EVAL_MAX_SECONDS", 0.0)) + ngram_entropy_shift = bool(int(os.environ.get("NGRAM_ENTROPY_SHIFT", "0"))) # per-order center shift + ngram_order_mults_str = os.environ.get("NGRAM_ORDER_MULTS", "") # fixed per-order multipliers (comma-sep) + cubric_cadence = int(os.environ.get("CUBRIC_CADENCE", 0)) + # Learned mixer head: train a tiny linear head to predict per-token expert weights + mixer_enabled = bool(int(os.environ.get("MIXER_ENABLED", "0"))) + mixer_n_orders = int(os.environ.get("MIXER_N_ORDERS", 11)) # n-gram orders 2..12 + mixer_loss_weight = float(os.environ.get("MIXER_LOSS_WEIGHT", 0.1)) + mixer_neural_floor = float(os.environ.get("MIXER_NEURAL_FLOOR", 0.05)) + mixer_buckets = int(os.environ.get("MIXER_BUCKETS", 8_388_608)) # 8M for training oracle + mixer_prefill_max_shards = int(os.environ.get("MIXER_PREFILL_MAX_SHARDS", 80)) + compile_enabled = bool(int(os.environ.get("COMPILE_ENABLED", "1"))) + compile_fullgraph = bool(int(os.environ.get("COMPILE_FULLGRAPH", "1"))) +def maybe_torch_compile(obj, args: Hyperparameters): + if not args.compile_enabled: + return obj + return torch.compile(obj, dynamic=False, fullgraph=args.compile_fullgraph) +class TrainNgramTracker: + """Complementary training: track bigram stats, downweight tokens n-grams can predict.""" + def __init__(self, vocab_size: int, device: torch.device, complement_alpha: float = 0.5): + self.V = vocab_size + self.alpha = complement_alpha + self.bi_counts = torch.zeros(vocab_size, vocab_size, device=device, dtype=torch.float32) + self.bi_totals = torch.zeros(vocab_size, device=device, dtype=torch.float32) + @torch.no_grad() + def update(self, x: Tensor, y: Tensor): + xf = x.reshape(-1) + yf = y.reshape(-1) + ones = torch.ones(xf.numel(), device=xf.device, dtype=torch.float32) + self.bi_counts.reshape(-1).scatter_add_(0, xf * self.V + yf, ones) + self.bi_totals.scatter_add_(0, xf, ones) + def get_weights(self, x: Tensor, y: Tensor) -> Tensor: + xf = x.reshape(-1) + yf = y.reshape(-1) + total = self.bi_totals[xf] + count = self.bi_counts.reshape(-1)[xf * self.V + yf] + ngram_prob = count / (total + 1) + return (1.0 - self.alpha * ngram_prob).clamp(min=0.1) +def zeropower_via_newtonschulz5(G: Tensor, steps: int = 10, eps: float = 1e-7) -> Tensor: + a, b, c = (3.4445, -4.7750, 2.0315) + X = G.bfloat16() + X /= X.norm() + eps + transposed = G.size(0) > G.size(1) + if transposed: + X = X.T + for _ in range(steps): + A = X @ X.T + B = b * A + c * A @ A + X = a * X + B @ X + return X.T if transposed else X +class Muon(torch.optim.Optimizer): + def __init__(self, params, lr: float, momentum: float, backend_steps: int, + nesterov: bool = True, weight_decay: float = 0.0): + super().__init__( + params, + dict(lr=lr, momentum=momentum, backend_steps=backend_steps, + nesterov=nesterov, weight_decay=weight_decay), + ) + @torch.no_grad() + def step(self, closure=None): + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + distributed = dist.is_available() and dist.is_initialized() + world_size = dist.get_world_size() if distributed else 1 + rank = dist.get_rank() if distributed else 0 + for group in self.param_groups: + params = group["params"] + if not params: + continue + lr = group["lr"] + momentum = group["momentum"] + backend_steps = group["backend_steps"] + nesterov = group["nesterov"] + total_params = sum(int(p.numel()) for p in params) + updates_flat = torch.zeros(total_params, device=params[0].device, dtype=torch.bfloat16) + curr = 0 + for i, p in enumerate(params): + if i % world_size == rank and p.grad is not None: + g = p.grad + state = self.state[p] + if "momentum_buffer" not in state: + state["momentum_buffer"] = torch.zeros_like(g) + buf = state["momentum_buffer"] + buf.mul_(momentum).add_(g) + if nesterov: + g = g.add(buf, alpha=momentum) + g = zeropower_via_newtonschulz5(g, steps=backend_steps) + g *= max(1, g.size(0) / g.size(1)) ** 0.5 + updates_flat[curr : curr + p.numel()] = g.reshape(-1) + curr += p.numel() + if distributed: + dist.all_reduce(updates_flat, op=dist.ReduceOp.SUM) + wd = group.get("weight_decay", 0.0) + curr = 0 + for p in params: + if wd > 0.0: + p.data.mul_(1.0 - lr * wd) + g = updates_flat[curr : curr + p.numel()].view_as(p).to(dtype=p.dtype) + p.add_(g, alpha=-lr) + curr += p.numel() + return loss +def build_sentencepiece_luts( + sp: spm.SentencePieceProcessor, vocab_size: int, device: torch.device +) -> tuple[Tensor, Tensor, Tensor]: + sp_vocab_size = int(sp.vocab_size()) + table_size = max(sp_vocab_size, vocab_size) + base_bytes_np = np.zeros((table_size,), dtype=np.int16) + has_leading_space_np = np.zeros((table_size,), dtype=np.bool_) + is_boundary_token_np = np.ones((table_size,), dtype=np.bool_) + for token_id in range(sp_vocab_size): + if sp.is_control(token_id) or sp.is_unknown(token_id) or sp.is_unused(token_id): + continue + is_boundary_token_np[token_id] = False + if sp.is_byte(token_id): + base_bytes_np[token_id] = 1 + continue + piece = sp.id_to_piece(token_id) + if piece.startswith("▁"): + has_leading_space_np[token_id] = True + piece = piece[1:] + base_bytes_np[token_id] = len(piece.encode("utf-8")) + return ( + torch.tensor(base_bytes_np, dtype=torch.int16, device=device), + torch.tensor(has_leading_space_np, dtype=torch.bool, device=device), + torch.tensor(is_boundary_token_np, dtype=torch.bool, device=device), + ) +def load_validation_tokens(pattern: str, seq_len: int) -> Tensor: + files = [Path(p) for p in sorted(glob.glob(pattern))] + if not files: + raise FileNotFoundError(f"No files found for pattern: {pattern}") + tokens = torch.cat([load_data_shard(file) for file in files]).contiguous() + usable = ((tokens.numel() - 1) // seq_len) * seq_len + if usable <= 0: + raise ValueError(f"Validation split is too short for TRAIN_SEQ_LEN={seq_len}") + return tokens[: usable + 1] +def eval_val( + args: Hyperparameters, + model: nn.Module, + rank: int, + world_size: int, + device: torch.device, + grad_accum_steps: int, + val_tokens: Tensor, + base_bytes_lut: Tensor, + has_leading_space_lut: Tensor, + is_boundary_token_lut: Tensor, + eval_seq_len: int | None = None, +) -> tuple[float, float]: + seq_len = eval_seq_len or args.train_seq_len + local_batch_tokens = args.val_batch_size // (world_size * grad_accum_steps) + if local_batch_tokens < seq_len: + raise ValueError( + "VAL_BATCH_SIZE must provide at least one sequence per rank; " + f"got VAL_BATCH_SIZE={args.val_batch_size}, WORLD_SIZE={world_size}, " + f"GRAD_ACCUM_STEPS={grad_accum_steps}, seq_len={seq_len}" + ) + local_batch_seqs = local_batch_tokens // seq_len + total_seqs = (val_tokens.numel() - 1) // seq_len + seq_start = (total_seqs * rank) // world_size + seq_end = (total_seqs * (rank + 1)) // world_size + val_loss_sum = torch.zeros((), device=device, dtype=torch.float64) + val_token_count = torch.zeros((), device=device, dtype=torch.float64) + val_byte_count = torch.zeros((), device=device, dtype=torch.float64) + model.eval() + with torch.inference_mode(): + for batch_seq_start in range(seq_start, seq_end, local_batch_seqs): + batch_seq_end = min(batch_seq_start + local_batch_seqs, seq_end) + raw_start = batch_seq_start * seq_len + raw_end = batch_seq_end * seq_len + 1 + local = val_tokens[raw_start:raw_end].to(device=device, dtype=torch.int64, non_blocking=True) + x = local[:-1].reshape(-1, seq_len) + y = local[1:].reshape(-1, seq_len) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + batch_loss = model(x, y).detach() + batch_token_count = float(y.numel()) + val_loss_sum += batch_loss.to(torch.float64) * batch_token_count + val_token_count += batch_token_count + prev_ids = x.reshape(-1) + tgt_ids = y.reshape(-1) + token_bytes = base_bytes_lut[tgt_ids].to(dtype=torch.int16) + token_bytes += (has_leading_space_lut[tgt_ids] & ~is_boundary_token_lut[prev_ids]).to(dtype=torch.int16) + val_byte_count += token_bytes.to(torch.float64).sum() + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(val_loss_sum, op=dist.ReduceOp.SUM) + dist.all_reduce(val_token_count, op=dist.ReduceOp.SUM) + dist.all_reduce(val_byte_count, op=dist.ReduceOp.SUM) + val_loss = val_loss_sum / val_token_count + bits_per_token = val_loss.item() / math.log(2.0) + tokens_per_byte = val_token_count.item() / val_byte_count.item() + model.train() + return float(val_loss.item()), float(bits_per_token * tokens_per_byte) +CONTROL_TENSOR_NAME_PATTERNS = tuple( + pattern + for pattern in os.environ.get( + "CONTROL_TENSOR_NAME_PATTERNS", + "attn_scale,attn_scales,mlp_scale,mlp_scales,resid_mix,resid_mixes,q_gain,skip_weight,skip_weights,smear,dtg_gate,ve_layer_scales,ve_shared.scale", + ).split(",") + if pattern +) +INT8_KEEP_FLOAT_FP32_NAME_PATTERNS = tuple( + pattern + for pattern in os.environ.get( + "INT8_KEEP_FLOAT_FP32_NAME_PATTERNS", + ",".join(CONTROL_TENSOR_NAME_PATTERNS), + ).split(",") + if pattern +) +INT8_KEEP_FLOAT_MAX_NUMEL = 65_536 +INT8_KEEP_FLOAT_STORE_DTYPE = torch.float16 +INT8_PER_ROW_SCALE_DTYPE = torch.float16 +INT8_CLIP_PERCENTILE = 99.99984 +INT8_CLIP_Q = INT8_CLIP_PERCENTILE / 100.0 +def tensor_nbytes(t: Tensor) -> int: + return int(t.numel()) * int(t.element_size()) +def keep_float_tensor(name: str, t: Tensor, passthrough_orig_dtypes: dict[str, str]) -> Tensor: + if any(pattern in name for pattern in INT8_KEEP_FLOAT_FP32_NAME_PATTERNS): + return t.float().contiguous() + if t.dtype in {torch.float32, torch.bfloat16}: + passthrough_orig_dtypes[name] = str(t.dtype).removeprefix("torch.") + return t.to(dtype=INT8_KEEP_FLOAT_STORE_DTYPE).contiguous() + return t +def quantize_float_tensor(t: Tensor) -> tuple[Tensor, Tensor]: + t32 = t.float() + if t32.ndim == 2: + clip_abs = ( + torch.quantile(t32.abs(), INT8_CLIP_Q, dim=1) + if t32.numel() + else torch.empty((t32.shape[0],), dtype=torch.float32) + ) + clipped = torch.maximum(torch.minimum(t32, clip_abs[:, None]), -clip_abs[:, None]) + scale = (clip_abs / 127.0).clamp_min(1.0 / 127.0) + q = torch.clamp(torch.round(clipped / scale[:, None]), -127, 127).to(torch.int8).contiguous() + return q, scale.to(dtype=INT8_PER_ROW_SCALE_DTYPE).contiguous() + clip_abs = float(torch.quantile(t32.abs().flatten(), INT8_CLIP_Q).item()) if t32.numel() else 0.0 + scale = torch.tensor(clip_abs / 127.0 if clip_abs > 0 else 1.0, dtype=torch.float32) + q = torch.clamp(torch.round(torch.clamp(t32, -clip_abs, clip_abs) / scale), -127, 127).to(torch.int8).contiguous() + return q, scale +def quantize_state_dict_int8(state_dict: dict[str, Tensor]): + quantized: dict[str, Tensor] = {} + scales: dict[str, Tensor] = {} + dtypes: dict[str, str] = {} + passthrough: dict[str, Tensor] = {} + passthrough_orig_dtypes: dict[str, str] = {} + qmeta: dict[str, dict[str, object]] = {} + stats = dict.fromkeys( + ("param_count", "num_tensors", "num_float_tensors", "num_nonfloat_tensors", "baseline_tensor_bytes", "int8_payload_bytes"), + 0, + ) + for name, tensor in state_dict.items(): + t = tensor.detach().to("cpu").contiguous() + stats["param_count"] += int(t.numel()) + stats["num_tensors"] += 1 + stats["baseline_tensor_bytes"] += tensor_nbytes(t) + if not t.is_floating_point(): + stats["num_nonfloat_tensors"] += 1 + passthrough[name] = t + stats["int8_payload_bytes"] += tensor_nbytes(t) + continue + if t.numel() <= INT8_KEEP_FLOAT_MAX_NUMEL: + kept = keep_float_tensor(name, t, passthrough_orig_dtypes) + passthrough[name] = kept + stats["int8_payload_bytes"] += tensor_nbytes(kept) + continue + stats["num_float_tensors"] += 1 + q, s = quantize_float_tensor(t) + if s.ndim > 0: + qmeta[name] = {"scheme": "per_row", "axis": 0} + quantized[name] = q + scales[name] = s + dtypes[name] = str(t.dtype).removeprefix("torch.") + stats["int8_payload_bytes"] += tensor_nbytes(q) + tensor_nbytes(s) + obj: dict[str, object] = { + "__quant_format__": "int8_clean_per_row_v1", + "quantized": quantized, + "scales": scales, + "dtypes": dtypes, + "passthrough": passthrough, + } + if qmeta: + obj["qmeta"] = qmeta + if passthrough_orig_dtypes: + obj["passthrough_orig_dtypes"] = passthrough_orig_dtypes + return obj, stats +def dequantize_state_dict_int8(obj: dict[str, object]) -> dict[str, Tensor]: + out: dict[str, Tensor] = {} + qmeta = obj.get("qmeta", {}) + passthrough_orig_dtypes = obj.get("passthrough_orig_dtypes", {}) + for name, q in obj["quantized"].items(): + dtype = getattr(torch, obj["dtypes"][name]) + s = obj["scales"][name] + if qmeta.get(name, {}).get("scheme") == "per_row" or s.ndim > 0: + s = s.to(dtype=torch.float32) + out[name] = (q.float() * s.view(q.shape[0], *([1] * (q.ndim - 1)))).to(dtype=dtype).contiguous() + else: + scale = float(s.item()) + out[name] = (q.float() * scale).to(dtype=dtype).contiguous() + for name, t in obj["passthrough"].items(): + out_t = t.detach().to("cpu").contiguous() + orig_dtype = passthrough_orig_dtypes.get(name) + if isinstance(orig_dtype, str): + out_t = out_t.to(dtype=getattr(torch, orig_dtype)).contiguous() + out[name] = out_t + return out +def load_data_shard(file: Path) -> Tensor: + header_bytes = 256 * np.dtype(" None: + self.file_idx = (self.file_idx + 1) % len(self.files) + self.tokens = load_data_shard(self.files[self.file_idx]) + self.pos = 0 + def take(self, n: int) -> Tensor: + chunks: list[Tensor] = [] + remaining = n + while remaining > 0: + avail = self.tokens.numel() - self.pos + if avail <= 0: + self._advance_file() + continue + k = min(remaining, avail) + chunks.append(self.tokens[self.pos : self.pos + k]) + self.pos += k + remaining -= k + return chunks[0] if len(chunks) == 1 else torch.cat(chunks) +class DistributedTokenLoader: + def __init__(self, pattern: str, rank: int, world_size: int, device: torch.device): + self.rank = rank + self.world_size = world_size + self.device = device + self.stream = TokenStream(pattern) + def next_batch(self, global_tokens: int, seq_len: int, grad_accum_steps: int) -> tuple[Tensor, Tensor]: + local_tokens = global_tokens // (self.world_size * grad_accum_steps) + per_rank_span = local_tokens + 1 + chunk = self.stream.take(per_rank_span * self.world_size) + start = self.rank * per_rank_span + local = chunk[start : start + per_rank_span].to(dtype=torch.int64) + x = local[:-1].reshape(-1, seq_len) + y = local[1:].reshape(-1, seq_len) + return x.to(self.device, non_blocking=True), y.to(self.device, non_blocking=True) +class RMSNorm(nn.Module): + def __init__(self, eps: float | None = None): + super().__init__() + self.eps = eps + def forward(self, x: Tensor) -> Tensor: + return F.rms_norm(x, (x.size(-1),), eps=self.eps) +class CastedLinear(nn.Linear): + _qat_enabled: bool = False + def forward(self, x: Tensor) -> Tensor: + w = self.weight.to(x.dtype) + if CastedLinear._qat_enabled and self.training and w.ndim == 2: + with torch.no_grad(): + w32 = self.weight.float() + # Use 99.95th percentile clipping to match GPTQ export quantizer + row_clip = torch.quantile(w32.abs(), 0.9995, dim=1) + scale = (row_clip / 31.0).clamp_min(1.0 / 31.0) + w_q = (torch.clamp(torch.round(w32 / scale[:, None]), -32, 31) * scale[:, None]).to(x.dtype) + w = w + (w_q - w).detach() + bias = self.bias.to(x.dtype) if self.bias is not None else None + return F.linear(x, w, bias) +def restore_low_dim_params_to_fp32(module: nn.Module) -> None: + with torch.no_grad(): + for name, param in module.named_parameters(): + if (param.ndim < 2 or any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS)) and param.dtype != torch.float32: + param.data = param.data.float() +class Rotary(nn.Module): + def __init__(self, dim: int, base: float = 10000.0, train_seq_len: int = 1024, rope_dims: int = 0): + super().__init__() + self.dim = dim + self.base = base + self.train_seq_len = train_seq_len + self.rope_dims = rope_dims if rope_dims > 0 else dim + inv_freq = 1.0 / (base ** (torch.arange(0, self.rope_dims, 2, dtype=torch.float32) / self.rope_dims)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self._seq_len_cached = 0 + self._cos_cached: Tensor | None = None + self._sin_cached: Tensor | None = None + def forward(self, seq_len: int, device: torch.device, dtype: torch.dtype) -> tuple[Tensor, Tensor]: + if ( + self._cos_cached is None + or self._sin_cached is None + or self._seq_len_cached != seq_len + or self._cos_cached.device != device + ): + rd = self.rope_dims + if seq_len > self.train_seq_len: + scale = seq_len / self.train_seq_len + new_base = self.base * (scale ** (rd / (rd - 2))) + inv_freq = 1.0 / (new_base ** (torch.arange(0, rd, 2, dtype=torch.float32, device=device) / rd)) + else: + inv_freq = self.inv_freq.to(device) + t = torch.arange(seq_len, device=device, dtype=inv_freq.dtype) + freqs = torch.outer(t, inv_freq) + self._cos_cached = freqs.cos()[None, :, None, :] + self._sin_cached = freqs.sin()[None, :, None, :] + self._seq_len_cached = seq_len + return self._cos_cached.to(dtype=dtype), self._sin_cached.to(dtype=dtype) +def apply_rotary_emb(x: Tensor, cos: Tensor, sin: Tensor, rope_dims: int = 0) -> Tensor: + if rope_dims > 0 and rope_dims < x.size(-1): + x_rope, x_pass = x[..., :rope_dims], x[..., rope_dims:] + half = rope_dims // 2 + x1, x2 = x_rope[..., :half], x_rope[..., half:] + x_rope = torch.cat((x1 * cos + x2 * sin, x1 * (-sin) + x2 * cos), dim=-1) + return torch.cat((x_rope, x_pass), dim=-1) + half = x.size(-1) // 2 + x1, x2 = x[..., :half], x[..., half:] + return torch.cat((x1 * cos + x2 * sin, x1 * (-sin) + x2 * cos), dim=-1) +class CausalSelfAttention(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + num_kv_heads: int, + rope_base: float, + qk_gain_init: float, + ): + super().__init__() + if dim % num_heads != 0: + raise ValueError("model_dim must be divisible by num_heads") + if num_heads % num_kv_heads != 0: + raise ValueError("num_heads must be divisible by num_kv_heads") + self.num_heads = num_heads + self.num_kv_heads = num_kv_heads + self.head_dim = dim // num_heads + if self.head_dim % 2 != 0: + raise ValueError("head_dim must be even for RoPE") + kv_dim = self.num_kv_heads * self.head_dim + self.c_q = CastedLinear(dim, dim, bias=False) + self.c_k = CastedLinear(dim, kv_dim, bias=False) + self.c_v = CastedLinear(dim, kv_dim, bias=False) + self.proj = CastedLinear(dim, dim, bias=False) + self.proj._zero_init = True + self.q_gain = nn.Parameter(torch.full((num_heads,), qk_gain_init, dtype=torch.float32)) + self.rope_dims = 0 # set by GPT.__init__ for partial RoPE + self.rotary = Rotary(self.head_dim, base=rope_base, train_seq_len=1024) + self.use_xsa = False # set by GPT.__init__ for deep layers only + def _xsa_efficient(self, y: Tensor, v: Tensor) -> Tensor: + """Efficient XSA: subtract self-value projection via GQA-aware reshape (no repeat_interleave). + y: [B, T, H, D], v: [B, T, Hkv, D]. H must be divisible by Hkv.""" + B, T, H, D = y.shape + Hkv = v.size(-2) + group = H // Hkv + y_g = y.reshape(B, T, Hkv, group, D) # [B, T, Hkv, group, D] + vn = F.normalize(v, dim=-1).unsqueeze(-2) # [B, T, Hkv, 1, D] — broadcast ready + proj = (y_g * vn).sum(dim=-1, keepdim=True) * vn + return (y_g - proj).reshape(B, T, H, D) + def forward(self, x: Tensor, v_embed: Tensor | None = None) -> Tensor: + bsz, seqlen, dim = x.shape + q = self.c_q(x).reshape(bsz, seqlen, self.num_heads, self.head_dim) + k = self.c_k(x).reshape(bsz, seqlen, self.num_kv_heads, self.head_dim) + v = self.c_v(x) + if v_embed is not None: + v = v + v_embed + v = v.reshape(bsz, seqlen, self.num_kv_heads, self.head_dim) + q = F.rms_norm(q, (q.size(-1),)) + k = F.rms_norm(k, (k.size(-1),)) + cos, sin = self.rotary(seqlen, x.device, q.dtype) + q = apply_rotary_emb(q, cos, sin, self.rope_dims) + k = apply_rotary_emb(k, cos, sin, self.rope_dims) + q = q * self.q_gain.to(dtype=q.dtype)[None, None, :, None] + y = flash_attn_3_func(q, k, v, causal=True) + if self.use_xsa: + y = self._xsa_efficient(y, v) + y = y.reshape(bsz, seqlen, dim) + return self.proj(y) +class SmearGate(nn.Module): + def __init__(self, dim: int): + super().__init__() + self.gate = nn.Parameter(torch.zeros(dim, dtype=torch.float32)) + def forward(self, x: Tensor) -> Tensor: + g = torch.sigmoid(self.gate.to(dtype=x.dtype))[None, None, :] + x_prev = torch.cat([torch.zeros_like(x[:, :1]), x[:, :-1]], dim=1) + return (1 - g) * x + g * x_prev +class BigramHashEmbedding(nn.Module): + def __init__(self, bigram_vocab_size: int, bigram_dim: int, model_dim: int): + super().__init__() + self.bigram_vocab_size = bigram_vocab_size + self.embed = nn.Embedding(bigram_vocab_size, bigram_dim) + nn.init.zeros_(self.embed.weight) + self.proj = CastedLinear(bigram_dim, model_dim, bias=False) if bigram_dim != model_dim else None + if self.proj is not None: + nn.init.zeros_(self.proj.weight) + self.scale = nn.Parameter(torch.tensor(0.05, dtype=torch.float32)) + def bigram_hash(self, tokens: Tensor) -> Tensor: + t = tokens.to(torch.int32) + mod = self.bigram_vocab_size - 1 + out = torch.empty_like(t) + out[..., 0] = mod + out[..., 1:] = torch.bitwise_xor(36313 * t[..., 1:], 27191 * t[..., :-1]) % mod + return out.long() + def forward(self, token_ids: Tensor) -> Tensor: + h = self.embed(self.bigram_hash(token_ids)) + if self.proj is not None: + h = self.proj(h) + return h * self.scale.to(dtype=h.dtype) +class ValueEmbedding(nn.Module): + """Reinject token identity into attention values at specific layers. + Each table maps vocab tokens to a low-dim embedding, projected to model_dim.""" + def __init__(self, vocab_size: int, ve_dim: int, model_dim: int): + super().__init__() + self.embed = nn.Embedding(vocab_size, ve_dim) + nn.init.normal_(self.embed.weight, std=0.01) + self.proj = CastedLinear(ve_dim, model_dim, bias=False) if ve_dim != model_dim else None + if self.proj is not None: + nn.init.zeros_(self.proj.weight) + self.scale = nn.Parameter(torch.tensor(0.1, dtype=torch.float32)) + def forward(self, token_ids: Tensor) -> Tensor: + h = self.embed(token_ids) + if self.proj is not None: + h = self.proj(h) + return h * self.scale.to(dtype=h.dtype) +class MLP(nn.Module): + def __init__(self, dim: int, mlp_mult: int, mlp_act: str = "relu_sq", mlp_leaky_slope: float = 0.5): + super().__init__() + hidden = int(mlp_mult * dim) + self.fc = CastedLinear(dim, hidden, bias=False) + self.proj = CastedLinear(hidden, dim, bias=False) + self.proj._zero_init = True + self.mlp_act = mlp_act + self.mlp_leaky_slope = mlp_leaky_slope + if self.mlp_act not in {"relu_sq", "leaky_relu_sq"}: + raise ValueError(f"Unsupported MLP_ACT '{self.mlp_act}'. Use 'relu_sq' or 'leaky_relu_sq'.") + def forward(self, x: Tensor) -> Tensor: + x = self.fc(x) + if self.mlp_act == "leaky_relu_sq": + x = F.leaky_relu(x, negative_slope=self.mlp_leaky_slope) + else: + x = F.relu(x) + return self.proj(x.square()) +class Block(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + num_kv_heads: int, + mlp_mult: int, + rope_base: float, + qk_gain_init: float, + layer_idx: int = 0, + ln_scale: bool = False, + dtg: bool = False, + mlp_act: str = "relu_sq", + mlp_leaky_slope: float = 0.5, + ): + super().__init__() + self.attn_norm = RMSNorm() + self.mlp_norm = RMSNorm() + self.attn = CausalSelfAttention(dim, num_heads, num_kv_heads, rope_base, qk_gain_init) + self.mlp = MLP(dim, mlp_mult, mlp_act=mlp_act, mlp_leaky_slope=mlp_leaky_slope) + self.attn_scale = nn.Parameter(torch.ones(dim, dtype=torch.float32)) + self.mlp_scale = nn.Parameter(torch.ones(dim, dtype=torch.float32)) + self.resid_mix = nn.Parameter(torch.stack((torch.ones(dim), torch.zeros(dim))).float()) + self.ln_scale_factor = 1.0 / math.sqrt(layer_idx + 1) if ln_scale else 1.0 + if dtg: + self.dtg_gate = nn.Linear(dim, 1, bias=True) + nn.init.zeros_(self.dtg_gate.weight) + nn.init.constant_(self.dtg_gate.bias, 2.0) + else: + self.dtg_gate = None + def forward(self, x: Tensor, x0: Tensor, v_embed: Tensor | None = None) -> Tensor: + mix = self.resid_mix.to(dtype=x.dtype) + x_in = mix[0][None, None, :] * x + mix[1][None, None, :] * x0 + attn_out = self.attn(self.attn_norm(x_in) * self.ln_scale_factor, v_embed=v_embed) + x_out = x_in + self.attn_scale.to(dtype=x_in.dtype)[None, None, :] * attn_out + x_out = x_out + self.mlp_scale.to(dtype=x_out.dtype)[None, None, :] * self.mlp(self.mlp_norm(x_out) * self.ln_scale_factor) + if self.dtg_gate is not None: + gate = torch.sigmoid(self.dtg_gate(x_in.detach())) + x_out = x_in + gate * (x_out - x_in) + return x_out +# 12 primes for XOR hashing — shared between training oracle and eval tables +NGRAM_PRIMES = np.array( + [np.uint64(36313), np.uint64(27191), np.uint64(51647), np.uint64(81929), + np.uint64(131071), np.uint64(174763), np.uint64(233017), np.uint64(283721), + np.uint64(347237), np.uint64(401519), np.uint64(479909), np.uint64(541267)], + dtype=np.uint64, +) + +class TrainNgramOracle: + """Training-time n-gram oracle: prefilled from training data, frozen during training. + Used to supervise the learned mixer head — NOT used at eval time.""" + def __init__(self, buckets: int, min_order: int = 2, max_order: int = 12, min_count: int = 2): + self.buckets = buckets + self.min_order = min_order + self.max_order = max_order + self.min_count = min_count + self.mask = np.uint64(buckets - 1) + self.primes = NGRAM_PRIMES + self.n_orders = max_order - min_order + 1 + self.ctx_tables = {n: np.zeros(buckets, dtype=np.uint32) for n in range(min_order, max_order + 1)} + self.full_tables = {n: np.zeros(buckets, dtype=np.uint32) for n in range(min_order, max_order + 1)} + self.total_tokens = 0 + + def prefill_shard(self, filepath: str) -> int: + """Load a training shard and update hash tables. Returns token count.""" + raw = np.fromfile(filepath, dtype=np.uint16) + t = raw.astype(np.uint64) + n = len(t) + self.total_tokens += n + for order in range(self.min_order, self.max_order + 1): + if n < order: + continue + ctx_width = order - 1 + length = n - order + 1 + ctx_hash = np.zeros(length, dtype=np.uint64) + for k in range(ctx_width): + ctx_hash ^= t[k:k + length] * self.primes[k % len(self.primes)] + ctx_key = (ctx_hash & self.mask).astype(np.int64) + tgt = t[order - 1:order - 1 + length] + full_key = ((ctx_hash ^ (tgt * self.primes[ctx_width % len(self.primes)])) & self.mask).astype(np.int64) + self.ctx_tables[order] += np.bincount(ctx_key, minlength=self.buckets).astype(np.uint32) + self.full_tables[order] += np.bincount(full_key, minlength=self.buckets).astype(np.uint32) + return n + + def get_ngram_probs(self, x_batch: Tensor, y_batch: Tensor) -> tuple[Tensor, Tensor]: + """Get per-order n-gram probabilities for a training batch. + Returns (order_p, order_valid) both shaped (bsz, seq_len, n_orders). + order_p[..., i] is probability from order (min_order+i). + order_valid[..., i] is True where ctx_count >= min_count.""" + x_np = x_batch.cpu().numpy().astype(np.uint64) + y_np = y_batch.cpu().numpy().astype(np.uint64) + bsz, slen = x_np.shape + order_p = np.full((bsz, slen, self.n_orders), 1.0 / 1024.0, dtype=np.float32) + order_valid = np.zeros((bsz, slen, self.n_orders), dtype=np.bool_) + for oi, order in enumerate(range(self.min_order, self.max_order + 1)): + ctx_width = order - 1 + if slen < ctx_width: + continue + # Build context hash from x_batch (context tokens) + # For order n, context is x[pos-cw+1:pos+1], target is y[pos] + # x_batch[b, j] is input at position j, y_batch[b, j] is target at position j + # Context for position j: tokens at positions j-cw+1 .. j (= x[j-cw+1], ..., x[j]) + # But x_batch is the input sequence, where x[j] predicts y[j] + # For n-gram: we need the last (order-1) input tokens as context, and y[j] as target + ctx_hash = np.zeros((bsz, slen), dtype=np.uint64) + for k in range(ctx_width): + shift = ctx_width - 1 - k + if shift > 0: + ctx_hash[:, shift:] ^= x_np[:, :slen - shift] * self.primes[k % len(self.primes)] + else: + ctx_hash ^= x_np * self.primes[k % len(self.primes)] + ctx_key = (ctx_hash & self.mask).astype(np.int64) + full_key = ((ctx_hash ^ (y_np * self.primes[ctx_width % len(self.primes)])) & self.mask).astype(np.int64) + ctx_c = self.ctx_tables[order][ctx_key.ravel()].astype(np.float32).reshape(bsz, slen) + full_c = self.full_tables[order][full_key.ravel()].astype(np.float32).reshape(bsz, slen) + p = np.minimum(full_c, ctx_c) / np.maximum(ctx_c, 1.0) + p = np.clip(p, 0.0, 1.0) + valid = ctx_c >= self.min_count + if ctx_width > 0: + valid[:, :ctx_width] = False + order_p[:, :, oi] = np.where(valid, p, order_p[:, :, oi]) + order_valid[:, :, oi] = valid + return ( + torch.from_numpy(order_p), + torch.from_numpy(order_valid), + ) + +class GPT(nn.Module): + def __init__( + self, + vocab_size: int, + num_layers: int, + model_dim: int, + num_heads: int, + num_kv_heads: int, + mlp_mult: int, + tie_embeddings: bool, + tied_embed_init_std: float, + logit_softcap: float, + rope_base: float, + qk_gain_init: float, + mtp_num_heads: int = 0, + mtp_loss_weight: float = 0.1, + bigram_vocab_size: int = 0, + bigram_dim: int = 128, + xsa_last_n: int = 0, + rope_dims: int = 0, + ln_scale: bool = False, + dtg: bool = False, + ve_enabled: bool = False, + ve_dim: int = 128, + ve_layers: str = "9,10", + mlp_act: str = "relu_sq", + mlp_leaky_slope: float = 0.5, + f1_corr_rank: int = 0, + f1_corr_scale_init: float = 0.10, + mixer_n_experts: int = 0, + mixer_loss_weight: float = 0.1, + mixer_neural_floor: float = 0.05, + ): + super().__init__() + self._ve_target_dim = num_kv_heads * (model_dim // num_heads) # kv_dim for value projection + if logit_softcap <= 0.0: + raise ValueError(f"logit_softcap must be positive, got {logit_softcap}") + self.tie_embeddings = tie_embeddings + self.tied_embed_init_std = tied_embed_init_std + self.logit_softcap = logit_softcap + self.mtp_num_heads = mtp_num_heads + self.mtp_loss_weight = mtp_loss_weight + self.tok_emb = nn.Embedding(vocab_size, model_dim) + self.bigram = BigramHashEmbedding(bigram_vocab_size, bigram_dim, model_dim) if bigram_vocab_size > 0 else None + self.smear = SmearGate(model_dim) + self.num_encoder_layers = num_layers // 2 + self.num_decoder_layers = num_layers - self.num_encoder_layers + self.num_skip_weights = min(self.num_encoder_layers, self.num_decoder_layers) + self.skip_weights = nn.Parameter(torch.ones(self.num_skip_weights, model_dim, dtype=torch.float32)) + self.blocks = nn.ModuleList( + [ + Block( + model_dim, + num_heads, + num_kv_heads, + mlp_mult, + rope_base, + qk_gain_init, + layer_idx=i, + ln_scale=ln_scale, + dtg=dtg, + mlp_act=mlp_act, + mlp_leaky_slope=mlp_leaky_slope, + ) + for i in range(num_layers) + ] + ) + if rope_dims > 0: + head_dim = model_dim // num_heads + for block in self.blocks: + block.attn.rope_dims = rope_dims + block.attn.rotary = Rotary(head_dim, base=rope_base, train_seq_len=1024, rope_dims=rope_dims) + self.ve_layer_indices = [int(x) for x in ve_layers.split(",") if x.strip()] if ve_enabled else [] + kv_dim = self._ve_target_dim + if self.ve_layer_indices: + self.ve_shared = ValueEmbedding(vocab_size, ve_dim, kv_dim) + self.ve_layer_scales = nn.ParameterList( + [nn.Parameter(torch.ones(1, dtype=torch.float32)) for _ in self.ve_layer_indices] + ) + else: + self.ve_shared = None + self.ve_layer_scales = nn.ParameterList() + self.value_embeds = nn.ModuleList() # keep empty for compat + self.final_norm = RMSNorm() + self.lm_head = None if tie_embeddings else CastedLinear(model_dim, vocab_size, bias=False) + if self.lm_head is not None: + self.lm_head._zero_init = True + self.mtp_heads = nn.ModuleList( + [CastedLinear(model_dim, vocab_size, bias=False) for _ in range(mtp_num_heads)] + ) + for head in self.mtp_heads: + head._zero_init = True + # Low-rank correction path for extra capacity under size budget. + self.f1_corr_rank = f1_corr_rank + if f1_corr_rank > 0: + self.f1_corr_in = CastedLinear(model_dim, f1_corr_rank, bias=False) + self.f1_corr_out = CastedLinear(f1_corr_rank, vocab_size, bias=False) + self.f1_corr_out._zero_init = True + self.f1_corr_scale = nn.Parameter(torch.tensor(f1_corr_scale_init, dtype=torch.float32)) + else: + self.f1_corr_in = None + self.f1_corr_out = None + self.f1_corr_scale = None + # Learned mixer head: predicts per-token expert weights for n-gram blending + self.mixer_n_experts = mixer_n_experts + self.mixer_loss_weight = mixer_loss_weight + self.mixer_neural_floor = mixer_neural_floor + if mixer_n_experts > 0: + self.alpha_head = nn.Linear(model_dim, mixer_n_experts, bias=True) + else: + self.alpha_head = None + if xsa_last_n > 0: + for i in range(max(0, num_layers - xsa_last_n), num_layers): + self.blocks[i].attn.use_xsa = True + self._init_weights() + # Special init for alpha_head: zeros + bias[0]=2.0 (favor neural initially) + if self.alpha_head is not None: + nn.init.zeros_(self.alpha_head.weight) + nn.init.zeros_(self.alpha_head.bias) + with torch.no_grad(): + self.alpha_head.bias[0] = 2.0 + def _init_weights(self) -> None: + if self.tie_embeddings: + nn.init.normal_(self.tok_emb.weight, mean=0.0, std=self.tied_embed_init_std) + num_layers = len(self.blocks) + for name, module in self.named_modules(): + if isinstance(module, nn.Linear): + if getattr(module, "_zero_init", False): + nn.init.zeros_(module.weight) + elif module.weight.ndim == 2 and module.weight.shape[0] >= 64 and module.weight.shape[1] >= 64: + nn.init.orthogonal_(module.weight, gain=1.0) + if ".proj." in name or name.endswith(".proj"): + with torch.no_grad(): + module.weight.mul_(1.0 / math.sqrt(2 * num_layers)) + def _get_ve(self, layer_idx: int, input_ids: Tensor, ve_cache: dict | None = None) -> Tensor | None: + """Get value embedding for a specific layer using shared table + per-layer scale.""" + if self.ve_shared is None or layer_idx not in self.ve_layer_indices: + return None + if ve_cache is not None and 've' not in ve_cache: + ve_cache['ve'] = self.ve_shared(input_ids) + ve_base = ve_cache['ve'] if ve_cache is not None else self.ve_shared(input_ids) + ve_idx = self.ve_layer_indices.index(layer_idx) + return ve_base * self.ve_layer_scales[ve_idx].to(dtype=ve_base.dtype) + def forward(self, input_ids: Tensor, target_ids: Tensor, + ngram_expert_p: Tensor | None = None, ngram_valid_mask: Tensor | None = None) -> Tensor: + x = self.tok_emb(input_ids) + if self.bigram is not None: + x = x + self.bigram(input_ids) + x = F.rms_norm(x, (x.size(-1),)) + x = self.smear(x) + x0 = x + skips: list[Tensor] = [] + ve_cache: dict = {} + for i in range(self.num_encoder_layers): + ve = self._get_ve(i, input_ids, ve_cache) + x = self.blocks[i](x, x0, v_embed=ve) + skips.append(x) + for i in range(self.num_decoder_layers): + bi = self.num_encoder_layers + i + if skips: + x = x + self.skip_weights[i].to(dtype=x.dtype)[None, None, :] * skips.pop() + ve = self._get_ve(bi, input_ids, ve_cache) + x = self.blocks[bi](x, x0, v_embed=ve) + x = self.final_norm(x) + x_flat = x.reshape(-1, x.size(-1)) + targets = target_ids.reshape(-1) + if self.tie_embeddings: + logits_proj = F.linear(x_flat, self.tok_emb.weight) + else: + if self.lm_head is None: + raise RuntimeError("lm_head is required when tie_embeddings=False") + logits_proj = self.lm_head(x_flat) + if self.f1_corr_in is not None and self.f1_corr_out is not None and self.f1_corr_scale is not None: + corr_hidden = F.silu(self.f1_corr_in(x_flat)) + corr_proj = self.f1_corr_out(corr_hidden) + logits_proj = logits_proj + self.f1_corr_scale.to(dtype=logits_proj.dtype) * corr_proj + logits = self.logit_softcap * torch.tanh(logits_proj / self.logit_softcap) + if hasattr(self, '_ngram_tracker') and self._ngram_tracker is not None and self.training: + per_tok_loss = F.cross_entropy(logits.float(), targets, reduction="none") + weights = self._ngram_tracker.get_weights(input_ids, target_ids) + main_loss = (per_tok_loss * weights).mean() + else: + main_loss = F.cross_entropy(logits.float(), targets, reduction="mean") + if self.training and self.mtp_num_heads > 0 and self.mtp_loss_weight > 0.0: + _, seqlen, dim = x.shape + mtp_loss_sum = x.new_zeros(()) + mtp_loss_count = 0 + for k, mtp_head in enumerate(self.mtp_heads): + valid_t = seqlen - (k + 1) + if valid_t <= 0: + continue + mtp_hidden = x[:, :valid_t, :].reshape(-1, dim) + mtp_targets = target_ids[:, k + 1 :].reshape(-1) + mtp_logits_proj = mtp_head(mtp_hidden) + mtp_logits = self.logit_softcap * torch.tanh(mtp_logits_proj / self.logit_softcap) + mtp_loss_sum = mtp_loss_sum + F.cross_entropy(mtp_logits.float(), mtp_targets, reduction="mean") + mtp_loss_count += 1 + if mtp_loss_count > 0: + main_loss = main_loss + self.mtp_loss_weight * (mtp_loss_sum / mtp_loss_count) + # Mixer loss: train alpha_head to blend neural + n-gram experts + if (self.training and self.alpha_head is not None and self.mixer_loss_weight > 0 + and ngram_expert_p is not None and ngram_valid_mask is not None): + alpha_raw = self.alpha_head(x_flat.float()) # (N, n_experts) + # Neural probability for the correct target token + with torch.no_grad(): + neural_p = F.softmax(logits.float(), dim=-1).gather(1, targets.unsqueeze(1)).squeeze(1) + # Stack experts: [neural, order2, order3, ..., orderN] + ngram_p_flat = ngram_expert_p.reshape(-1, ngram_expert_p.size(-1)) # (N, n_orders) + ngram_v_flat = ngram_valid_mask.reshape(-1, ngram_valid_mask.size(-1)) # (N, n_orders) + expert_p = torch.cat([neural_p.unsqueeze(1), ngram_p_flat.to(dtype=neural_p.dtype)], dim=1) + full_mask = torch.cat([ + torch.ones(targets.size(0), 1, device=targets.device, dtype=torch.bool), + ngram_v_flat.to(device=targets.device), + ], dim=1) + gate = alpha_raw.masked_fill(~full_mask, -1e9) + weights = F.softmax(gate, dim=-1) + # Neural floor: ensure ≥ mixer_neural_floor for neural expert + nf = self.mixer_neural_floor + neural_w = nf + (1.0 - nf) * weights[:, :1] + other_w = (1.0 - nf) * weights[:, 1:] + weights = torch.cat([neural_w, other_w], dim=1) + mixed_p = (weights * expert_p.clamp(min=1e-12)).sum(dim=1) + mixer_loss = -torch.log(mixed_p.clamp(min=1e-12)).mean() + main_loss = main_loss + self.mixer_loss_weight * mixer_loss + return main_loss + def forward_logits(self, input_ids: Tensor) -> Tensor: + """Return logits (bsz, seq_len, vocab) without computing loss.""" + x = self.tok_emb(input_ids) + if self.bigram is not None: + x = x + self.bigram(input_ids) + x = F.rms_norm(x, (x.size(-1),)) + x = self.smear(x) + x0 = x + skips: list[Tensor] = [] + ve_cache: dict = {} + for i in range(self.num_encoder_layers): + ve = self._get_ve(i, input_ids, ve_cache) + x = self.blocks[i](x, x0, v_embed=ve) + skips.append(x) + for i in range(self.num_decoder_layers): + bi = self.num_encoder_layers + i + if skips: + x = x + self.skip_weights[i].to(dtype=x.dtype)[None, None, :] * skips.pop() + ve = self._get_ve(bi, input_ids, ve_cache) + x = self.blocks[bi](x, x0, v_embed=ve) + x = self.final_norm(x) + if self.tie_embeddings: + logits_proj = F.linear(x, self.tok_emb.weight) + else: + logits_proj = self.lm_head(x) + if self.f1_corr_in is not None and self.f1_corr_out is not None and self.f1_corr_scale is not None: + corr_hidden = F.silu(self.f1_corr_in(x)) + corr_proj = self.f1_corr_out(corr_hidden) + logits_proj = logits_proj + self.f1_corr_scale.to(dtype=logits_proj.dtype) * corr_proj + return self.logit_softcap * torch.tanh(logits_proj / self.logit_softcap) + def forward_logits_and_alpha(self, input_ids: Tensor) -> tuple[Tensor, Tensor | None]: + """Return (logits, alpha_raw) — alpha_raw is gate logits for mixer head.""" + x = self.tok_emb(input_ids) + if self.bigram is not None: + x = x + self.bigram(input_ids) + x = F.rms_norm(x, (x.size(-1),)) + x = self.smear(x) + x0 = x + skips: list[Tensor] = [] + ve_cache: dict = {} + for i in range(self.num_encoder_layers): + ve = self._get_ve(i, input_ids, ve_cache) + x = self.blocks[i](x, x0, v_embed=ve) + skips.append(x) + for i in range(self.num_decoder_layers): + bi = self.num_encoder_layers + i + if skips: + x = x + self.skip_weights[i].to(dtype=x.dtype)[None, None, :] * skips.pop() + ve = self._get_ve(bi, input_ids, ve_cache) + x = self.blocks[bi](x, x0, v_embed=ve) + x = self.final_norm(x) + if self.tie_embeddings: + logits_proj = F.linear(x, self.tok_emb.weight) + else: + logits_proj = self.lm_head(x) + if self.f1_corr_in is not None and self.f1_corr_out is not None and self.f1_corr_scale is not None: + corr_hidden = F.silu(self.f1_corr_in(x)) + corr_proj = self.f1_corr_out(corr_hidden) + logits_proj = logits_proj + self.f1_corr_scale.to(dtype=logits_proj.dtype) * corr_proj + logits = self.logit_softcap * torch.tanh(logits_proj / self.logit_softcap) + alpha_raw = self.alpha_head(x.float()) if self.alpha_head is not None else None + return logits, alpha_raw +def eval_val_sliding( + args: Hyperparameters, + base_model: nn.Module, + rank: int, + world_size: int, + device: torch.device, + val_tokens: Tensor, + base_bytes_lut: Tensor, + has_leading_space_lut: Tensor, + is_boundary_token_lut: Tensor, + stride: int, + batch_seqs: int = 128, + eval_seq_len: int | None = None, +) -> tuple[float, float]: + """Sliding window evaluation: each token scored with maximum context.""" + seq_len = eval_seq_len or args.train_seq_len + total_tokens = val_tokens.numel() - 1 + window_starts = [ws for ws in range(0, total_tokens, stride) + if min(ws + seq_len, total_tokens) - ws >= 1] + total_windows = len(window_starts) + my_s = (total_windows * rank) // world_size + my_e = (total_windows * (rank + 1)) // world_size + my_windows = window_starts[my_s:my_e] + loss_sum = torch.zeros((), device=device, dtype=torch.float64) + token_count = torch.zeros((), device=device, dtype=torch.float64) + byte_count = torch.zeros((), device=device, dtype=torch.float64) + base_model.eval() + compiled_logits = maybe_torch_compile(base_model.forward_logits, args) + with torch.inference_mode(): + for bi in range(0, len(my_windows), batch_seqs): + batch_ws = my_windows[bi:bi + batch_seqs] + bsz = len(batch_ws) + x_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + y_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + wlens: list[int] = [] + for i, ws in enumerate(batch_ws): + end = min(ws + seq_len, total_tokens) + wlen = end - ws + wlens.append(wlen) + chunk = val_tokens[ws:end + 1].to(dtype=torch.int64, device=device) + x_batch[i, :wlen] = chunk[:-1] + y_batch[i, :wlen] = chunk[1:] + with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + logits = compiled_logits(x_batch) + nll = F.cross_entropy( + logits.reshape(-1, logits.size(-1)).float(), + y_batch.reshape(-1), + reduction="none", + ).reshape(bsz, seq_len) + for i, ws in enumerate(batch_ws): + wlen = wlens[i] + s = 0 if ws == 0 else max(wlen - stride, 0) + scored_nll = nll[i, s:wlen].to(torch.float64) + loss_sum += scored_nll.sum() + token_count += float(wlen - s) + tgt = y_batch[i, s:wlen] + prev = x_batch[i, s:wlen] + tb = base_bytes_lut[tgt].to(torch.float64) + tb += (has_leading_space_lut[tgt] & ~is_boundary_token_lut[prev]).to(torch.float64) + byte_count += tb.sum() + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(loss_sum, op=dist.ReduceOp.SUM) + dist.all_reduce(token_count, op=dist.ReduceOp.SUM) + dist.all_reduce(byte_count, op=dist.ReduceOp.SUM) + val_loss = (loss_sum / token_count).item() + bits_per_token = val_loss / math.log(2.0) + tokens_per_byte = token_count.item() / byte_count.item() + base_model.train() + return val_loss, bits_per_token * tokens_per_byte +def _ngram_bulk_update(val_np, start, end, ctx_tables, full_tables, + min_order, max_order, primes, mask): + """Bulk update n-gram tables with a contiguous range of tokens. + All ranks call this with the SAME token range -> identical tables everywhere.""" + t = val_np[start:end].astype(np.uint64) + n = len(t) + for order in range(min_order, max_order + 1): + if n < order: + continue + ctx_width = order - 1 + ctx_hash = np.zeros(n - order + 1, dtype=np.uint64) + for k in range(ctx_width): + ctx_hash ^= t[k:n - order + 1 + k] * primes[k % len(primes)] + ctx_key = (ctx_hash & mask).astype(np.int64) + tgt = t[order - 1:] + full_key = ((ctx_hash ^ (tgt * primes[ctx_width % len(primes)])) & mask).astype(np.int64) + ctx_tables[order] += np.bincount(ctx_key, minlength=len(ctx_tables[order])).astype(np.uint32) + full_tables[order] += np.bincount(full_key, minlength=len(full_tables[order])).astype(np.uint32) + +def eval_val_sliding_hashed_ngram( + args: Hyperparameters, + base_model: nn.Module, + rank: int, + world_size: int, + device: torch.device, + val_tokens: Tensor, + base_bytes_lut: Tensor, + has_leading_space_lut: Tensor, + is_boundary_token_lut: Tensor, + stride: int, + order: int, + alpha: float, + min_count: int, + buckets: int, + max_seconds: float = 0.0, + batch_seqs: int = 128, + eval_seq_len: int | None = None, +) -> tuple[float, float, float]: + """Score-first sliding eval with chunk-based SHARED n-gram tables + cubric. + + Key design: all ranks share identical n-gram tables via bulk chunk updates. + Each chunk's windows are distributed across ranks for scoring, then ALL ranks + update tables with the same contiguous token range. Every rank sees the full + n-gram picture (not 1/world_size like per-segment updates). + + Legal: entire chunk scored before its tokens update the tables. + """ + min_order = max(args.ngram_eval_min_order, 2) + max_order = max(order, min_order) + adaptive = args.ngram_eval_adaptive + alpha_min = args.ngram_eval_alpha_min + alpha_max = args.ngram_eval_alpha_max + ent_center = args.ngram_eval_entropy_center + ent_scale = args.ngram_eval_entropy_scale + + # Parse fixed per-order multipliers (PR #809 style) + _fixed_order_mults = None + if args.ngram_order_mults_str: + _fixed_order_mults = np.array([float(x) for x in args.ngram_order_mults_str.split(",")], dtype=np.float64) + + seq_len = eval_seq_len or args.train_seq_len + total_tokens = val_tokens.numel() - 1 + + # Build all windows and total scored tokens + all_window_starts = [ws for ws in range(0, total_tokens, stride) if min(ws + seq_len, total_tokens) - ws >= 1] + total_scored_tokens = 0.0 + for ws in all_window_starts: + end = min(ws + seq_len, total_tokens) + wlen = end - ws + s = 0 if ws == 0 else max(wlen - stride, 0) + total_scored_tokens += float(max(wlen - s, 0)) + + # Group windows into chunks by scored position -- all ranks share this grouping + chunk_tokens = int(os.environ.get("NGRAM_CHUNK_TOKENS", "1048576")) # 1M default + num_chunks = (total_tokens + chunk_tokens - 1) // chunk_tokens + chunk_windows: list[list[int]] = [[] for _ in range(num_chunks)] + for ws in all_window_starts: + end = min(ws + seq_len, total_tokens) + wlen = end - ws + s = 0 if ws == 0 else max(wlen - stride, 0) + scored_start = ws + s + ci = min(scored_start // chunk_tokens, num_chunks - 1) + chunk_windows[ci].append(ws) + + val_np = val_tokens.numpy() + ctx_tables = {n: np.zeros((buckets,), dtype=np.uint32) for n in range(min_order, max_order + 1)} + full_tables = {n: np.zeros((buckets,), dtype=np.uint32) for n in range(min_order, max_order + 1)} + mask = np.uint64(buckets - 1) + primes = NGRAM_PRIMES + + loss_sum = 0.0 + token_count = 0.0 + byte_count = 0.0 + + # Cubric 3D: per (order × entropy_bin × count_bin) adaptive alpha scaling + _NUM_ENT_BINS = 3 # low / mid / high entropy + _NUM_CNT_BINS = 3 # low / mid / high count + _ENT_EDGES = np.array([ent_center - 1.0, ent_center + 1.0]) # [2.0, 4.0] for center=3.0 + _CNT_EDGES = np.array([5.0, 50.0]) # low=<5, mid=5-50, high=>50 context count + _TOTAL_CELLS = _NUM_ENT_BINS * _NUM_CNT_BINS # 9 cells per order = 54 total + _cc = getattr(args, 'cubric_cadence', 0); _con = _cc > 0; _cfired = 0 + if _con: + # Warm-start: proven converged values from 4+ runs (orders 2-7) + # All 9 cells per order get the same warm-start, 3D cubric refines from there + _WARM = {2: 0.45, 3: 0.30, 4: 0.45, 5: 1.88, 6: 2.00, 7: 2.00, 8: 2.00, 9: 2.00} + _c_alpha_mult = {n: [_WARM.get(n, 1.0)] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + _c_hits = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + _c_beats = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + + base_model.eval() + _use_learned_alpha = (hasattr(base_model, 'alpha_head') and base_model.alpha_head is not None) + if _use_learned_alpha: + _compiled_la = maybe_torch_compile(base_model.forward_logits_and_alpha, args) + compiled_logits = maybe_torch_compile(base_model.forward_logits, args) + t0 = time.perf_counter() + deadline = (t0 + max_seconds) if max_seconds > 0.0 else None + cutoff_hit = False + + if rank == 0: + print(f"ngram_eval:chunks={num_chunks} chunk_tokens={chunk_tokens} " + f"windows={len(all_window_starts)} shared_tables=True", flush=True) + + with torch.inference_mode(): + for ci in range(num_chunks): + if deadline is not None and time.perf_counter() >= deadline: + cutoff_hit = True + break + + windows = chunk_windows[ci] + if not windows: + continue + + # Distribute this chunk's windows across ranks + my_s = (len(windows) * rank) // world_size + my_e = (len(windows) * (rank + 1)) // world_size + my_windows = windows[my_s:my_e] + + # --- Phase 1: SCORE this chunk's windows --- + for bi in range(0, len(my_windows), batch_seqs): + batch_ws = my_windows[bi:bi + batch_seqs] + bsz = len(batch_ws) + x_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + y_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + wlens: list[int] = [] + for i, ws in enumerate(batch_ws): + end = min(ws + seq_len, total_tokens) + wlen = end - ws + wlens.append(wlen) + chunk = val_tokens[ws:end + 1].to(dtype=torch.int64, device=device) + x_batch[i, :wlen] = chunk[:-1] + y_batch[i, :wlen] = chunk[1:] + + with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + if _use_learned_alpha: + logits, alpha_raw_batch = _compiled_la(x_batch) + else: + logits = compiled_logits(x_batch) + alpha_raw_batch = None + logits_f = logits.float() + nll = F.cross_entropy( + logits_f.reshape(-1, logits_f.size(-1)), + y_batch.reshape(-1), + reduction="none", + ).reshape(bsz, seq_len) + + for i, ws in enumerate(batch_ws): + wlen = wlens[i] + s = 0 if ws == 0 else max(wlen - stride, 0) + seg_len = wlen - s + if seg_len <= 0: + continue + + seg_nll = nll[i, s:wlen].to(torch.float64).cpu().numpy() + seg_model_p = np.exp(-seg_nll) + + if not _use_learned_alpha and adaptive: + log_probs = F.log_softmax(logits_f[i, s:wlen], dim=-1) + probs_a = log_probs.exp() + entropy = -(probs_a * log_probs).sum(dim=-1).cpu().numpy() + sig = 1.0 / (1.0 + np.exp(-ent_scale * (entropy - ent_center))) + per_token_alpha = alpha_min + (alpha_max - alpha_min) * sig + # Bin entropy for 2D cubric: 0=low, 1=mid, 2=high + _ent_bins = np.digitize(entropy, _ENT_EDGES).astype(np.int32) + elif not _use_learned_alpha: + per_token_alpha = np.full(seg_len, alpha) + _ent_bins = np.ones(seg_len, dtype=np.int32) # all mid + + global_j = np.arange(ws + s + 1, ws + wlen + 1, dtype=np.int64) + tgt_np = val_np[global_j].astype(np.uint64) + + if _use_learned_alpha: + # Learned mixer: get per-order probs and blend with learned weights + n_orders = max_order - min_order + 1 + order_p = np.full((seg_len, n_orders), 1.0 / 1024.0, dtype=np.float64) + order_valid = np.zeros((seg_len, n_orders), dtype=np.bool_) + for oi, n in enumerate(range(min_order, max_order + 1)): + ctx_width = n - 1 + valid = global_j >= ctx_width + if not valid.any(): + continue + v_idx = np.nonzero(valid)[0] + jv = global_j[v_idx] + ctx_hash = np.zeros(len(jv), dtype=np.uint64) + for k in range(ctx_width): + tok = val_np[jv - (ctx_width - k)].astype(np.uint64) + ctx_hash ^= tok * primes[k % len(primes)] + ctx_key = (ctx_hash & mask).astype(np.int64) + full_key = ((ctx_hash ^ (tgt_np[v_idx] * primes[ctx_width % len(primes)])) & mask).astype(np.int64) + ctx_c = ctx_tables[n][ctx_key].astype(np.float64) + full_c = full_tables[n][full_key].astype(np.float64) + has_data = ctx_c >= float(min_count) + if has_data.any(): + p = np.minimum(full_c[has_data], ctx_c[has_data]) / np.maximum(ctx_c[has_data], 1.0) + hit_idx = v_idx[has_data] + order_p[hit_idx, oi] = np.clip(p, 0.0, 1.0) + order_valid[hit_idx, oi] = True + # Build expert_p: [neural_p, order2_p, ..., orderN_p] + expert_p = np.concatenate([seg_model_p[:, None], order_p], axis=1) # (seg_len, 1+n_orders) + # Get learned alpha weights for this segment + seg_alpha = alpha_raw_batch[i, s:wlen].float().cpu().numpy() # (seg_len, n_experts) + # Masked softmax + full_mask = np.concatenate([ + np.ones((seg_len, 1), dtype=np.bool_), + order_valid, + ], axis=1) + seg_alpha_masked = np.where(full_mask, seg_alpha, -1e9) + # Softmax + seg_alpha_masked -= seg_alpha_masked.max(axis=1, keepdims=True) + exp_a = np.exp(seg_alpha_masked) + weights = exp_a / exp_a.sum(axis=1, keepdims=True) + # Neural floor + nf = getattr(base_model, 'mixer_neural_floor', 0.05) + weights[:, 0] = nf + (1.0 - nf) * weights[:, 0] + weights[:, 1:] = (1.0 - nf) * weights[:, 1:] + # Renormalize + weights /= weights.sum(axis=1, keepdims=True) + # Blend + seg_model_p = np.clip((weights * expert_p).sum(axis=1), 1e-12, 1.0) + else: + # Original backoff: highest matching order wins + p_ng = np.zeros(seg_len, dtype=np.float64) + ng_matched = np.zeros(seg_len, dtype=np.bool_) + _ng_ord = np.zeros(seg_len, dtype=np.int32) + _ng_ctx_count = np.zeros(seg_len, dtype=np.float64) + for n in range(max_order, min_order - 1, -1): + ctx_width = n - 1 + valid = (global_j >= ctx_width) & (~ng_matched) + if not valid.any(): + continue + v_idx = np.nonzero(valid)[0] + jv = global_j[v_idx] + ctx_hash = np.zeros(len(jv), dtype=np.uint64) + for k in range(ctx_width): + tok = val_np[jv - (ctx_width - k)].astype(np.uint64) + ctx_hash ^= tok * primes[k % len(primes)] + ctx_key = (ctx_hash & mask).astype(np.int64) + full_key = ((ctx_hash ^ (tgt_np[v_idx] * primes[ctx_width % len(primes)])) & mask).astype(np.int64) + ctx_counts = ctx_tables[n][ctx_key].astype(np.float64) + full_counts = full_tables[n][full_key].astype(np.float64) + has_data = ctx_counts >= float(min_count) + if has_data.any(): + p = np.minimum(full_counts, ctx_counts) / np.maximum(ctx_counts, 1.0) + p = np.clip(p, 0.0, 1.0) + hit_idx = v_idx[has_data] + p_ng[hit_idx] = p[has_data] + ng_matched[hit_idx] = True + _ng_ord[hit_idx] = n + _ng_ctx_count[hit_idx] = ctx_counts[has_data] + # Oracle alpha: use actual model_p vs ngram_p comparison + if ng_matched.any(): + m_idx = np.nonzero(ng_matched)[0] + mp = seg_model_p[m_idx] + np_val = p_ng[m_idx] + log_ratio = np.log(np.maximum(np_val, 1e-12)) - np.log(np.maximum(mp, 1e-12)) + a = 0.95 / (1.0 + np.exp(-8.0 * log_ratio)) + seg_model_p[m_idx] = (1.0 - a) * mp + a * np_val + + seg_nll = -np.log(np.clip(seg_model_p, 1e-12, 1.0)) + loss_sum += float(seg_nll.sum()) + token_count += float(seg_len) + tgt = y_batch[i, s:wlen] + prev = x_batch[i, s:wlen] + tb = base_bytes_lut[tgt].to(torch.float64) + tb += (has_leading_space_lut[tgt] & ~is_boundary_token_lut[prev]).to(torch.float64) + byte_count += float(tb.sum().item()) + + # --- Phase 2: SHARED UPDATE -- all ranks update with same chunk tokens --- + chunk_start = ci * chunk_tokens + chunk_end = min((ci + 1) * chunk_tokens, total_tokens) + _ngram_bulk_update(val_np, chunk_start, chunk_end + 1, + ctx_tables, full_tables, min_order, max_order, + primes, mask) + + # Cubric 2D c-step: adapt per (order × entropy_bin) + if _con: + # Collect all (order, ent_bin, cnt_bin) cells with enough data + all_rates = [] + for n in range(min_order, max_order + 1): + for cell in range(_TOTAL_CELLS): + if _c_hits[n][cell] >= 8: + all_rates.append(_c_beats[n][cell] / _c_hits[n][cell]) + if len(all_rates) >= 4: + avg_rate = sum(all_rates) / len(all_rates) + for n in range(min_order, max_order + 1): + for cell in range(_TOTAL_CELLS): + if _c_hits[n][cell] >= 8: + rate = _c_beats[n][cell] / _c_hits[n][cell] + if rate > avg_rate + 0.05: + _c_alpha_mult[n][cell] = min(_c_alpha_mult[n][cell] * 1.03, 2.0) + elif rate < avg_rate - 0.05: + _c_alpha_mult[n][cell] = max(_c_alpha_mult[n][cell] * 0.97, 0.3) + _cfired += 1 + if rank == 0 and _cfired % 8 == 0: + parts = [] + for n in range(min_order, max_order + 1): + m = _c_alpha_mult[n] + avg_m = sum(m) / len(m) + parts.append(f"o{n}:avg={avg_m:.2f}") + print(f"cubric3d:step={_cfired} {' '.join(parts)}", flush=True) + _c_hits = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + _c_beats = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + + # Progress + if rank == 0 and (ci % 10 == 0 or ci == num_chunks - 1 or ci < 3): + elapsed = time.perf_counter() - t0 + cur_bpb = (loss_sum / max(token_count, 1.0)) / math.log(2.0) * (token_count / max(byte_count, 1.0)) if token_count > 0 else 0.0 + print( + f"ngram_eval:chunk [{ci+1}/{num_chunks}] bpb={cur_bpb:.6f} t={elapsed:.0f}s", + flush=True, + ) + + # All-reduce across ranks + _loss = torch.tensor(loss_sum, device=device, dtype=torch.float64) + _toks = torch.tensor(token_count, device=device, dtype=torch.float64) + _bytes = torch.tensor(byte_count, device=device, dtype=torch.float64) + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(_loss, op=dist.ReduceOp.SUM) + dist.all_reduce(_toks, op=dist.ReduceOp.SUM) + dist.all_reduce(_bytes, op=dist.ReduceOp.SUM) + loss_sum = _loss.item() + token_count = _toks.item() + byte_count = _bytes.item() + + coverage = token_count / max(total_scored_tokens, 1.0) + if cutoff_hit: + elapsed = time.perf_counter() - t0 + print( + f"ngram_eval:cutoff max_seconds={max_seconds:.1f} " + f"coverage={coverage*100:.2f}% elapsed={elapsed:.0f}s", + flush=True, + ) + + if _con and rank == 0: + print(f"cubric3d:final c_steps={_cfired} cells={_TOTAL_CELLS}x{max_order-min_order+1}={_TOTAL_CELLS*(max_order-min_order+1)}", flush=True) + for n in range(min_order, max_order + 1): + m = _c_alpha_mult[n] + row = " ".join(f"{m[cell]:.2f}" for cell in range(_TOTAL_CELLS)) + print(f" o{n}: [{row}]", flush=True) + val_loss = loss_sum / max(token_count, 1.0) + val_bpb = val_loss / math.log(2.0) * (token_count / max(byte_count, 1.0)) + base_model.train() + return val_loss, val_bpb, coverage +def _classify_param(name: str) -> str: + if "tok_emb" in name or "lm_head" in name: + return "embed" + if "f1_corr_in" in name or "f1_corr_out" in name: + return "aux" + if ".mlp." in name: + return "mlp" + if ".attn." in name or (".proj." in name and ".mlp." not in name): + return "attn" + return "other" +# --------------------------------------------------------------------------- +# GPTQ: Hessian-aware quantization with column-wise error compensation +# --------------------------------------------------------------------------- +def _find_best_row_scales(W: Tensor, clip_range: int = 31) -> Tensor: + """Find optimal per-row scales by searching percentile clipping thresholds.""" + t32 = W.float() + best_s = t32.abs().amax(dim=1) / clip_range + best_s = best_s.clamp_min(1.0 / clip_range) + best_err = torch.full((t32.shape[0],), float('inf')) + for pct in [0.9990, 0.9995, 0.9999, 0.99999, 1.0]: + if pct < 1.0: + row_clip = torch.quantile(t32.abs(), pct, dim=1) + else: + row_clip = t32.abs().amax(dim=1) + s = (row_clip / clip_range).clamp_min(1.0 / clip_range) + q = torch.clamp(torch.round(t32 / s[:, None]), -clip_range, clip_range) + recon = q * s[:, None] + err = (t32 - recon).pow(2).mean(dim=1) + improved = err < best_err + best_s[improved] = s[improved] + best_err[improved] = err[improved] + return best_s +def gptq_quantize_weight(W: Tensor, H: Tensor, clip_range: int = 31, + block_size: int = 64, percdamp: float = 0.002) -> tuple[Tensor, Tensor]: + """GPTQ: quantize weight matrix W using Hessian H = X^T X for error compensation. + Uses pre-computed per-row scales and column reordering by Hessian diagonal. + Returns (quantized_int8, scale_fp16) in int6 range [-clip_range, clip_range].""" + W = W.float().clone() + rows, cols = W.shape + # Pre-compute optimal per-row scales from the original weight matrix + row_scale = _find_best_row_scales(W, clip_range) + H = H.float().clone() + damp = percdamp * H.diag().mean() + H.diagonal().add_(damp) + # Column reordering: process least-important columns first (ascending H_diag) + perm = torch.argsort(H.diag()) + invperm = torch.argsort(perm) + W = W[:, perm] + H = H[perm][:, perm] + try: + L = torch.linalg.cholesky(H) + Hinv = torch.cholesky_inverse(L) + except torch._C._LinAlgError: + Hinv = torch.diag(1.0 / H.diag().clamp_min(1e-6)) + Q = torch.zeros(rows, cols, dtype=torch.int8) + for i1 in range(0, cols, block_size): + i2 = min(i1 + block_size, cols) + W_block = W[:, i1:i2].clone() + Hinv_block = Hinv[i1:i2, i1:i2] + Err = torch.zeros_like(W_block) + for j in range(i2 - i1): + w_col = W_block[:, j] + h_inv_jj = Hinv_block[j, j].clamp_min(1e-8) + # Quantize using pre-computed per-row scales + q_col = torch.clamp(torch.round(w_col / row_scale), -clip_range, clip_range) + deq_col = q_col * row_scale + Q[:, i1 + j] = q_col.to(torch.int8) + err = (w_col - deq_col) / h_inv_jj + Err[:, j] = err + if j + 1 < i2 - i1: + W_block[:, j + 1:] -= err.unsqueeze(1) * Hinv_block[j, j + 1:].unsqueeze(0) + if i2 < cols: + W[:, i2:] -= Err @ Hinv[i1:i2, i2:] + # Undo column reordering + Q = Q[:, invperm] + return Q, row_scale.to(torch.float16) +def gptq_calibrate(model: nn.Module, train_pattern: str, device: torch.device, + n_samples: int = 256, seq_len: int = 2048) -> dict[str, Tensor]: + """Collect Hessian H = X^T X for each linear layer using training data.""" + hessians: dict[str, Tensor] = {} + n_seen: dict[str, int] = {} + hooks = [] + def make_hook(name: str): + def hook_fn(module, inp, out): + x = inp[0].detach().float() + if x.ndim == 3: + x = x.reshape(-1, x.shape[-1]) + if name not in hessians: + hessians[name] = torch.zeros(x.shape[1], x.shape[1], device=x.device, dtype=torch.float32) + n_seen[name] = 0 + hessians[name].addmm_(x.t(), x) + n_seen[name] += x.shape[0] + return hook_fn + for name, module in model.named_modules(): + if isinstance(module, (nn.Linear, CastedLinear)): + hooks.append(module.register_forward_hook(make_hook(name))) + stream = TokenStream(train_pattern) + model.eval() + with torch.no_grad(): + for _ in range(n_samples): + tokens = stream.take(seq_len + 1).to(device=device, dtype=torch.int64) + x = tokens[:-1].unsqueeze(0) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + model.forward_logits(x) + for h in hooks: + h.remove() + for name in hessians: + hessians[name] /= max(n_seen[name], 1) + return hessians +def mixed_quantize_int6_gptq(state_dict: dict[str, Tensor], int6_cats: set[str], + hessians: dict[str, Tensor]) -> tuple[dict, dict]: + """Like mixed_quantize_int6 but uses GPTQ for int6 categories when Hessian available.""" + result: dict[str, Tensor] = {} + meta: dict[str, object] = {} + gptq_count, naive_count = 0, 0 + for name, tensor in state_dict.items(): + t = tensor.detach().cpu().contiguous() + cat = _classify_param(name) + if not t.is_floating_point() or t.numel() <= 65536: + result[name] = t.to(torch.float16) if t.is_floating_point() else t + meta[name] = "passthrough" + continue + if any(p in name for p in CONTROL_TENSOR_NAME_PATTERNS): + result[name] = t.float() + meta[name] = "passthrough_ctrl" + continue + if cat in int6_cats and t.ndim == 2: + module_name = name.rsplit(".weight", 1)[0] if name.endswith(".weight") else name + H = hessians.get(module_name) + if H is not None and H.shape[0] == t.shape[1]: + q, s = gptq_quantize_weight(t, H.cpu()) + gptq_count += 1 + else: + q, s = quantize_int6_per_row(t) + naive_count += 1 + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int6"} + elif cat in int6_cats and t.ndim >= 1: + q, s = quantize_int6_per_row(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int6"} + naive_count += 1 + else: + q, s = quantize_float_tensor(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int8"} + print(f"gptq_quantize: {gptq_count} GPTQ layers, {naive_count} naive layers", flush=True) + return result, meta +def quantize_int6_per_row(t: Tensor, clip_range: int = 31) -> tuple[Tensor, Tensor]: + t32 = t.float() + if t32.ndim == 2: + best_q, best_s, best_err = None, None, float('inf') + for pct in [0.9990, 0.9995, 0.9999, 0.99999, 1.0]: + if pct < 1.0: + row_clip = torch.quantile(t32.abs(), pct, dim=1) + else: + row_clip = t32.abs().amax(dim=1) + s = (row_clip / clip_range).clamp_min(1.0 / clip_range).to(torch.float16) + q = torch.clamp(torch.round(t32 / s.float()[:, None]), -clip_range, clip_range).to(torch.int8) + recon = q.float() * s.float()[:, None] + err = (t32 - recon).pow(2).mean().item() + if err < best_err: + best_q, best_s, best_err = q, s, err + return best_q, best_s + amax = t32.abs().max().item() + scale = torch.tensor(amax / clip_range if amax > 0 else 1.0, dtype=torch.float16) + q = torch.clamp(torch.round(t32 / scale.float()), -clip_range, clip_range).to(torch.int8) + return q, scale +def mixed_quantize_int6(state_dict: dict[str, Tensor], int6_cats: set[str]): + num_layers_total = max( + (int(k.split(".")[1]) for k in state_dict if k.startswith("blocks.")), + default=0, + ) + 1 + late_k_layers = set(range(num_layers_total - 2, num_layers_total)) + result: dict[str, Tensor] = {} + meta: dict[str, object] = {} + for name, tensor in state_dict.items(): + t = tensor.detach().cpu().contiguous() + cat = _classify_param(name) + if not t.is_floating_point() or t.numel() <= 65536: + result[name] = t.to(torch.float16) if t.is_floating_point() else t + meta[name] = "passthrough" + continue + if any(p in name for p in CONTROL_TENSOR_NAME_PATTERNS): + result[name] = t.float() + meta[name] = "passthrough_ctrl" + continue + if cat in int6_cats and t.ndim >= 1: + q, s = quantize_int6_per_row(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int6"} + else: + q, s = quantize_float_tensor(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int8"} + return result, meta +def dequantize_mixed_int6(result: dict[str, Tensor], meta: dict[str, object], + template_sd: dict[str, Tensor]) -> dict[str, Tensor]: + out: dict[str, Tensor] = {} + for name, orig in template_sd.items(): + info = meta.get(name) + if info is None: + continue + orig_dtype = orig.dtype + if info in ("passthrough", "passthrough_ctrl", "passthrough_fp16"): + t = result[name] + if t.dtype == torch.float16 and orig_dtype in (torch.float32, torch.bfloat16): + t = t.to(orig_dtype) + out[name] = t + continue + q, s = result[name + ".q"], result[name + ".scale"] + if s.ndim > 0: + out[name] = (q.float() * s.float().view(q.shape[0], *([1] * (q.ndim - 1)))).to(orig_dtype) + else: + out[name] = (q.float() * float(s.item())).to(orig_dtype) + return out +def main() -> None: + global zeropower_via_newtonschulz5 + code = Path(__file__).read_text(encoding="utf-8") + args = Hyperparameters() + if args.compile_enabled: + zeropower_via_newtonschulz5 = torch.compile(zeropower_via_newtonschulz5) + distributed = "RANK" in os.environ and "WORLD_SIZE" in os.environ + rank = int(os.environ.get("RANK", "0")) + world_size = int(os.environ.get("WORLD_SIZE", "1")) + local_rank = int(os.environ.get("LOCAL_RANK", "0")) + if world_size <= 0: + raise ValueError(f"WORLD_SIZE must be positive, got {world_size}") + if 8 % world_size != 0: + raise ValueError(f"WORLD_SIZE={world_size} must divide 8 so grad_accum_steps stays integral") + grad_accum_steps = 8 // world_size + grad_scale = 1.0 / grad_accum_steps + if not torch.cuda.is_available(): + raise RuntimeError("CUDA is required") + device = torch.device("cuda", local_rank) + torch.cuda.set_device(device) + if distributed: + dist.init_process_group(backend="nccl", device_id=device) + dist.barrier() + master_process = rank == 0 + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True + from torch.backends.cuda import enable_cudnn_sdp, enable_flash_sdp, enable_math_sdp, enable_mem_efficient_sdp + enable_cudnn_sdp(False) + enable_flash_sdp(True) + enable_mem_efficient_sdp(False) + enable_math_sdp(False) + logfile = None + if master_process: + os.makedirs("logs", exist_ok=True) + logfile = f"logs/{args.run_id}.txt" + print(logfile) + def log0(msg: str, console: bool = True) -> None: + if not master_process: + return + if console: + print(msg) + if logfile is not None: + with open(logfile, "a", encoding="utf-8") as f: + print(msg, file=f) + log0(code, console=False) + log0("=" * 100, console=False) + log0(f"Running Python {sys.version}", console=False) + log0(f"Running PyTorch {torch.__version__}", console=False) + log0( + subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=False).stdout, + console=False, + ) + log0("=" * 100, console=False) + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.cuda.manual_seed_all(args.seed) + if not args.tokenizer_path.endswith(".model"): + raise ValueError(f"Script only setup for SentencePiece .model file: {args.tokenizer_path}") + sp = spm.SentencePieceProcessor(model_file=args.tokenizer_path) + if int(sp.vocab_size()) != args.vocab_size: + raise ValueError( + f"VOCAB_SIZE={args.vocab_size} does not match tokenizer vocab_size={int(sp.vocab_size())}" + ) + dataset_dir = Path(args.data_path).resolve() + actual_train_files = len(list(dataset_dir.glob("fineweb_train_*.bin"))) + effective_eval_seq_len = args.eval_seq_len if args.eval_seq_len > 0 else args.train_seq_len + val_seq_len = max(args.train_seq_len, effective_eval_seq_len) + val_tokens = load_validation_tokens(args.val_files, val_seq_len) + base_bytes_lut, has_leading_space_lut, is_boundary_token_lut = build_sentencepiece_luts( + sp, args.vocab_size, device + ) + log0(f"val_bpb:enabled tokenizer_kind=sentencepiece tokenizer_path={args.tokenizer_path}") + log0(f"train_loader:dataset:{dataset_dir.name} train_shards:{actual_train_files}") + log0(f"val_loader:shards pattern={args.val_files} tokens:{val_tokens.numel() - 1}") + CastedLinear._qat_enabled = args.qat_enabled + mixer_n_experts = (1 + args.mixer_n_orders) if args.mixer_enabled else 0 + base_model = GPT( + vocab_size=args.vocab_size, + num_layers=args.num_layers, + model_dim=args.model_dim, + num_heads=args.num_heads, + num_kv_heads=args.num_kv_heads, + mlp_mult=args.mlp_mult, + tie_embeddings=args.tie_embeddings, + tied_embed_init_std=args.tied_embed_init_std, + logit_softcap=args.logit_softcap, + rope_base=args.rope_base, + qk_gain_init=args.qk_gain_init, + mtp_num_heads=args.mtp_num_heads, + mtp_loss_weight=args.mtp_loss_weight, + bigram_vocab_size=args.bigram_vocab_size, + bigram_dim=args.bigram_dim, + xsa_last_n=args.xsa_last_n, + rope_dims=args.rope_dims, + ln_scale=args.ln_scale, + dtg=args.dtg_enabled, + ve_enabled=args.ve_enabled, + ve_dim=args.ve_dim, + ve_layers=args.ve_layers, + mlp_act=args.mlp_act, + mlp_leaky_slope=args.mlp_leaky_slope, + f1_corr_rank=args.f1_corr_rank, + f1_corr_scale_init=args.f1_corr_scale_init, + mixer_n_experts=mixer_n_experts, + mixer_loss_weight=args.mixer_loss_weight, + mixer_neural_floor=args.mixer_neural_floor, + ).to(device).bfloat16() + for module in base_model.modules(): + if isinstance(module, CastedLinear): + module.float() + restore_low_dim_params_to_fp32(base_model) + # Complementary training: downweight tokens predictable by bigrams + complement_alpha = float(os.environ.get("COMPLEMENT_ALPHA", "0")) + if complement_alpha > 0: + tracker = TrainNgramTracker(args.vocab_size, device, complement_alpha=complement_alpha) + base_model._ngram_tracker = tracker + log0(f"complementary_training:alpha={complement_alpha}") + else: + base_model._ngram_tracker = None + # Learned mixer: prefill training-data n-gram oracle + train_mixer: TrainNgramOracle | None = None + if args.mixer_enabled: + mixer_max_order = args.ngram_eval_min_order + args.mixer_n_orders - 1 + train_mixer = TrainNgramOracle( + buckets=args.mixer_buckets, + min_order=args.ngram_eval_min_order, + max_order=mixer_max_order, + min_count=args.ngram_eval_min_count, + ) + train_files = sorted(glob.glob(args.train_files))[:args.mixer_prefill_max_shards] + log0(f"mixer:prefilling from {len(train_files)} shards, orders {args.ngram_eval_min_order}..{mixer_max_order}...") + t_prefill = time.perf_counter() + for fi, f in enumerate(train_files): + train_mixer.prefill_shard(f) + if rank == 0 and (fi + 1) % 20 == 0: + print(f" mixer:prefill {fi+1}/{len(train_files)} shards, {train_mixer.total_tokens:,} tokens", flush=True) + prefill_s = time.perf_counter() - t_prefill + log0(f"mixer:prefilled {train_mixer.total_tokens:,} tokens in {prefill_s:.1f}s") + compiled_model = maybe_torch_compile(base_model, args) + model: nn.Module = DDP(compiled_model, device_ids=[local_rank], broadcast_buffers=False) if distributed else compiled_model + block_named_params = list(base_model.blocks.named_parameters()) + matrix_params = [ + p + for name, p in block_named_params + if p.ndim == 2 and not any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS) + ] + if base_model.mtp_num_heads > 0: + matrix_params.extend([p for p in base_model.mtp_heads.parameters() if p.ndim == 2]) + if base_model.f1_corr_in is not None and base_model.f1_corr_out is not None: + matrix_params.append(base_model.f1_corr_in.weight) + matrix_params.append(base_model.f1_corr_out.weight) + scalar_params = [ + p + for name, p in block_named_params + if p.ndim < 2 or any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS) + ] + if base_model.skip_weights.numel() > 0: + scalar_params.append(base_model.skip_weights) + scalar_params.append(base_model.smear.gate) + if base_model.bigram is not None: + scalar_params.append(base_model.bigram.scale) + if base_model.f1_corr_scale is not None: + scalar_params.append(base_model.f1_corr_scale) + if base_model.alpha_head is not None: + scalar_params.extend(list(base_model.alpha_head.parameters())) + token_lr = args.tied_embed_lr if args.tie_embeddings else args.embed_lr + tok_params = [{"params": [base_model.tok_emb.weight], "lr": token_lr, "base_lr": token_lr}] + if base_model.bigram is not None: + tok_params.append({"params": [base_model.bigram.embed.weight], "lr": token_lr, "base_lr": token_lr}) + if base_model.bigram.proj is not None: + matrix_params.append(base_model.bigram.proj.weight) + if base_model.ve_shared is not None: + tok_params.append({"params": [base_model.ve_shared.embed.weight], "lr": token_lr, "base_lr": token_lr}) + if base_model.ve_shared.proj is not None: + matrix_params.append(base_model.ve_shared.proj.weight) + scalar_params.append(base_model.ve_shared.scale) + for s in base_model.ve_layer_scales: + scalar_params.append(s) + optimizer_tok = torch.optim.AdamW( + tok_params, + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + weight_decay=args.adam_wd, + fused=True, + ) + optimizer_muon = Muon( + matrix_params, + lr=args.matrix_lr, + momentum=args.muon_momentum, + backend_steps=args.muon_backend_steps, + weight_decay=args.muon_wd, + ) + for group in optimizer_muon.param_groups: + group["base_lr"] = args.matrix_lr + optimizer_scalar = torch.optim.AdamW( + [{"params": scalar_params, "lr": args.scalar_lr, "base_lr": args.scalar_lr}], + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + weight_decay=args.adam_wd, + fused=True, + ) + optimizers: list[torch.optim.Optimizer] = [optimizer_tok, optimizer_muon, optimizer_scalar] + if base_model.lm_head is not None: + optimizer_head = torch.optim.Adam( + [{"params": [base_model.lm_head.weight], "lr": args.head_lr, "base_lr": args.head_lr}], + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + fused=True, + ) + optimizers.insert(1, optimizer_head) + n_params = sum(p.numel() for p in base_model.parameters()) + f1_corr_params = 0 + if base_model.f1_corr_in is not None and base_model.f1_corr_out is not None: + f1_corr_params = int(base_model.f1_corr_in.weight.numel() + base_model.f1_corr_out.weight.numel()) + est_corr_int6_bytes = 0 + if args.f1_corr_rank > 0: + # int8 payload stores int6 values + per-row fp16 scales. + est_corr_int6_bytes = ( + args.f1_corr_rank * (args.model_dim + args.vocab_size) + + 2 * (args.f1_corr_rank + args.vocab_size) + ) + log0(f"model_params:{n_params}") + log0( + f"f1_corr:rank={args.f1_corr_rank} params={f1_corr_params} " + f"est_int6_bytes~{est_corr_int6_bytes}" + ) + log0(f"mlp_act:{args.mlp_act} mlp_leaky_slope:{args.mlp_leaky_slope}") + log0(f"XSA:last_{args.xsa_last_n} world_size:{world_size} grad_accum_steps:{grad_accum_steps}") + log0(f"num_heads:{args.num_heads} num_kv_heads:{args.num_kv_heads} embed_lr:{token_lr} matrix_lr:{args.matrix_lr}") + log0( + f"train_batch_tokens:{args.train_batch_tokens} train_seq_len:{args.train_seq_len} " + f"iterations:{args.iterations} warmup_steps:{args.warmup_steps} " + f"max_wallclock_seconds:{args.max_wallclock_seconds:.3f}" + ) + log0(f"compile:enabled={int(args.compile_enabled)} fullgraph={int(args.compile_fullgraph)}") + log0(f"seed:{args.seed}") + if args.ngram_eval_order >= 2: + log0( + f"ngram_eval:order={args.ngram_eval_order} alpha={args.ngram_eval_alpha} " + f"min_count={args.ngram_eval_min_count} buckets={args.ngram_eval_buckets}" + ) + train_loader = DistributedTokenLoader(args.train_files, rank, world_size, device) + def zero_grad_all() -> None: + for opt in optimizers: + opt.zero_grad(set_to_none=True) + max_wallclock_ms = 1000.0 * args.max_wallclock_seconds if args.max_wallclock_seconds > 0 else None + def lr_mul(step: int, elapsed_ms: float) -> float: + if args.warmdown_iters <= 0: + return 1.0 + if max_wallclock_ms is None: + warmdown_start = max(args.iterations - args.warmdown_iters, 0) + return max((args.iterations - step) / max(args.warmdown_iters, 1), 0.0) if warmdown_start <= step < args.iterations else 1.0 + step_ms = elapsed_ms / max(step, 1) + warmdown_ms = args.warmdown_iters * step_ms + remaining_ms = max(max_wallclock_ms - elapsed_ms, 0.0) + return remaining_ms / max(warmdown_ms, 1e-9) if remaining_ms <= warmdown_ms else 1.0 + if args.warmup_steps > 0: + initial_model_state = {name: tensor.detach().cpu().clone() for name, tensor in base_model.state_dict().items()} + initial_optimizer_states = [copy.deepcopy(opt.state_dict()) for opt in optimizers] + model.train() + for warmup_step in range(args.warmup_steps): + zero_grad_all() + for micro_step in range(grad_accum_steps): + if distributed: + model.require_backward_grad_sync = micro_step == grad_accum_steps - 1 + x, y = train_loader.next_batch(args.train_batch_tokens, args.train_seq_len, grad_accum_steps) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + warmup_loss = model(x, y) + (warmup_loss * grad_scale).backward() + for opt in optimizers: + opt.step() + zero_grad_all() + if args.warmup_steps <= 20 or (warmup_step + 1) % 10 == 0 or warmup_step + 1 == args.warmup_steps: + log0(f"warmup_step:{warmup_step + 1}/{args.warmup_steps}") + base_model.load_state_dict(initial_model_state, strict=True) + for opt, state in zip(optimizers, initial_optimizer_states, strict=True): + opt.load_state_dict(state) + zero_grad_all() + if distributed: + model.require_backward_grad_sync = True + train_loader = DistributedTokenLoader(args.train_files, rank, world_size, device) + swa_state: dict[str, Tensor] | None = None + swa_count = 0 + ema_state = {name: t.detach().float().clone() for name, t in base_model.state_dict().items()} + ema_decay = 0.997 + training_time_ms = 0.0 + stop_after_step: int | None = None + torch.cuda.synchronize() + t0 = time.perf_counter() + step = 0 + while True: + last_step = step == args.iterations or (stop_after_step is not None and step >= stop_after_step) + should_validate = last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0) + if should_validate: + torch.cuda.synchronize() + training_time_ms += 1000.0 * (time.perf_counter() - t0) + val_loss, val_bpb = eval_val( + args, + model, + rank, + world_size, + device, + grad_accum_steps, + val_tokens, + base_bytes_lut, + has_leading_space_lut, + is_boundary_token_lut, + ) + log0( + f"step:{step}/{args.iterations} val_loss:{val_loss:.4f} val_bpb:{val_bpb:.4f} " + f"train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms / max(step, 1):.2f}ms" + ) + torch.cuda.synchronize() + t0 = time.perf_counter() + if last_step: + if stop_after_step is not None and step < args.iterations: + log0( + f"stopping_early: wallclock_cap train_time:{training_time_ms:.0f}ms " + f"step:{step}/{args.iterations}" + ) + break + elapsed_ms = training_time_ms + 1000.0 * (time.perf_counter() - t0) + scale = lr_mul(step, elapsed_ms) + if args.late_qat_threshold > 0 and scale < args.late_qat_threshold and not CastedLinear._qat_enabled: + CastedLinear._qat_enabled = True + log0(f"late_qat:enabled step:{step} scale:{scale:.4f}") + zero_grad_all() + train_loss = torch.zeros((), device=device) + for micro_step in range(grad_accum_steps): + if distributed: + model.require_backward_grad_sync = micro_step == grad_accum_steps - 1 + x, y = train_loader.next_batch(args.train_batch_tokens, args.train_seq_len, grad_accum_steps) + # Mixer: get n-gram probs from training oracle (CPU, outside compiled model) + _mx_p, _mx_v = None, None + if train_mixer is not None: + _mx_p_cpu, _mx_v_cpu = train_mixer.get_ngram_probs(x, y) + _mx_p = _mx_p_cpu.to(device=device, dtype=torch.bfloat16) + _mx_v = _mx_v_cpu.to(device=device) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + loss = model(x, y, ngram_expert_p=_mx_p, ngram_valid_mask=_mx_v) + train_loss += loss.detach() + loss.backward() + if base_model._ngram_tracker is not None: + base_model._ngram_tracker.update(x, y) + train_loss /= grad_accum_steps + frac = min(step / args.muon_momentum_warmup_steps, 1.0) if args.muon_momentum_warmup_steps > 0 else 1.0 + muon_momentum = (1 - frac) * args.muon_momentum_warmup_start + frac * args.muon_momentum + for group in optimizer_muon.param_groups: + group["momentum"] = muon_momentum + for opt in optimizers: + for group in opt.param_groups: + group["lr"] = group["base_lr"] * scale + if args.grad_clip_norm > 0: + torch.nn.utils.clip_grad_norm_(base_model.parameters(), args.grad_clip_norm) + for opt in optimizers: + opt.step() + zero_grad_all() + # EMA update + with torch.no_grad(): + for name, t in base_model.state_dict().items(): + ema_state[name].mul_(ema_decay).add_(t.detach().float(), alpha=1.0 - ema_decay) + step += 1 + approx_training_time_ms = training_time_ms + 1000.0 * (time.perf_counter() - t0) + if args.swa_enabled and scale < 0.2 and step % args.swa_every == 0: + if swa_state is None: + swa_state = {name: t.detach().cpu().clone() for name, t in base_model.state_dict().items()} + swa_count = 1 + log0(f"swa:start step:{step}") + else: + for name, t in base_model.state_dict().items(): + swa_state[name] += t.detach().cpu() + swa_count += 1 + should_log_train = ( + args.train_log_every > 0 + and (step <= 10 or step % args.train_log_every == 0 or stop_after_step is not None) + ) + if should_log_train: + log0( + f"step:{step}/{args.iterations} train_loss:{train_loss.item():.4f} " + f"train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms / step:.2f}ms" + ) + reached_cap = max_wallclock_ms is not None and approx_training_time_ms >= max_wallclock_ms + if distributed and max_wallclock_ms is not None: + reached_cap_tensor = torch.tensor(int(reached_cap), device=device) + dist.all_reduce(reached_cap_tensor, op=dist.ReduceOp.MAX) + reached_cap = bool(reached_cap_tensor.item()) + if stop_after_step is None and reached_cap: + stop_after_step = step + log0( + f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB " + f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB" + ) + # GPTQ calibration: collect Hessians from training data DURING training phase + # (must happen before training ends to comply with eval-time data access rules) + log0("gptq:calibrating with training data...") + t_gptq = time.perf_counter() + gptq_hessians = gptq_calibrate(base_model, args.train_files, device, n_samples=256, seq_len=args.train_seq_len) + log0(f"gptq:calibrated {len(gptq_hessians)} layers in {time.perf_counter()-t_gptq:.1f}s") + if args.distill_enabled and args.distill_steps > 0: + log0( + f"distill:start steps:{args.distill_steps} lr_factor:{args.distill_lr_factor} " + f"temp:{args.distill_temperature} alpha:{args.distill_alpha} kl_clip:{args.distill_kl_clip}" + ) + current_state = base_model.state_dict() + teacher_state = {name: t.to(dtype=current_state[name].dtype) for name, t in ema_state.items()} + teacher_model = GPT( + vocab_size=args.vocab_size, num_layers=args.num_layers, model_dim=args.model_dim, + num_heads=args.num_heads, num_kv_heads=args.num_kv_heads, mlp_mult=args.mlp_mult, + tie_embeddings=args.tie_embeddings, tied_embed_init_std=args.tied_embed_init_std, + logit_softcap=args.logit_softcap, rope_base=args.rope_base, qk_gain_init=args.qk_gain_init, + mtp_num_heads=args.mtp_num_heads, mtp_loss_weight=args.mtp_loss_weight, + bigram_vocab_size=args.bigram_vocab_size, bigram_dim=args.bigram_dim, + xsa_last_n=args.xsa_last_n, rope_dims=args.rope_dims, ln_scale=args.ln_scale, dtg=args.dtg_enabled, + ve_enabled=args.ve_enabled, ve_dim=args.ve_dim, ve_layers=args.ve_layers, + mlp_act=args.mlp_act, mlp_leaky_slope=args.mlp_leaky_slope, + f1_corr_rank=args.f1_corr_rank, f1_corr_scale_init=args.f1_corr_scale_init, + ).to(device).bfloat16() + for m in teacher_model.modules(): + if isinstance(m, CastedLinear): + m.float() + restore_low_dim_params_to_fp32(teacher_model) + teacher_model.load_state_dict(teacher_state, strict=True) + teacher_model.eval() + for p in teacher_model.parameters(): + p.requires_grad_(False) + compiled_teacher_logits = maybe_torch_compile(teacher_model.forward_logits, args) + model.train() + T = args.distill_temperature + alpha = args.distill_alpha + for d_step in range(args.distill_steps): + zero_grad_all() + for opt in optimizers: + for group in opt.param_groups: + group["lr"] = group["base_lr"] * args.distill_lr_factor + x, y = train_loader.next_batch(args.train_batch_tokens, args.train_seq_len, grad_accum_steps) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + student_logits = base_model.forward_logits(x) + with torch.no_grad(): + teacher_logits = compiled_teacher_logits(x) + student_log_probs = F.log_softmax(student_logits.float() / T, dim=-1) + teacher_probs = F.softmax(teacher_logits.float() / T, dim=-1) + token_kl = F.kl_div(student_log_probs, teacher_probs, reduction="none").sum(dim=-1) + kl_loss = token_kl.mean() * (T * T) + if args.distill_kl_clip > 0: + kl_loss = torch.clamp(kl_loss, max=args.distill_kl_clip) + ce_loss = F.cross_entropy( + student_logits.reshape(-1, student_logits.size(-1)).float(), + y.reshape(-1), + reduction="mean", + ) + loss = alpha * kl_loss + (1.0 - alpha) * ce_loss + (loss * grad_scale).backward() + if world_size > 1: + for p in base_model.parameters(): + if p.grad is not None: + dist.all_reduce(p.grad, op=dist.ReduceOp.AVG) + if args.grad_clip_norm > 0: + torch.nn.utils.clip_grad_norm_(base_model.parameters(), args.grad_clip_norm) + for opt in optimizers: + opt.step() + zero_grad_all() + with torch.no_grad(): + for name, t in base_model.state_dict().items(): + ema_state[name].mul_(ema_decay).add_(t.detach().float(), alpha=1.0 - ema_decay) + if (d_step + 1) % 8 == 0 or d_step == 0: + log0( + f"distill:step:{d_step + 1}/{args.distill_steps} " + f"kl:{kl_loss.item():.4f} ce:{ce_loss.item():.4f} total:{loss.item():.4f}" + ) + del teacher_model, compiled_teacher_logits + torch.cuda.empty_cache() + log0("distill:done") + # Apply EMA weights (better than SWA alone per PR#401) + log0("ema:applying EMA weights") + current_state = base_model.state_dict() + avg_state = {name: t.to(dtype=current_state[name].dtype) for name, t in ema_state.items()} + base_model.load_state_dict(avg_state, strict=True) + torch.cuda.synchronize() + t_diag = time.perf_counter() + diag_val_loss, diag_val_bpb = eval_val( + args, compiled_model, rank, world_size, device, grad_accum_steps, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + ) + torch.cuda.synchronize() + log0( + f"DIAGNOSTIC post_ema val_loss:{diag_val_loss:.4f} val_bpb:{diag_val_bpb:.4f} " + f"eval_time:{1000.0 * (time.perf_counter() - t_diag):.0f}ms" + ) + full_state_dict = base_model.state_dict() + export_sd = {k: v for k, v in full_state_dict.items() if "mtp_heads" not in k} + excluded_mtp = sum(int(t.numel()) for k, t in full_state_dict.items() if "mtp_heads" in k) + if excluded_mtp > 0: + log0(f"export_excluding_mtp_params:{excluded_mtp}") + if master_process: + torch.save(export_sd, "final_model.pt") + model_bytes = os.path.getsize("final_model.pt") + code_bytes = len(code.encode("utf-8")) + log0(f"Serialized model: {model_bytes} bytes") + log0(f"Code size: {code_bytes} bytes") + sd_cpu = {k: v.detach().cpu() for k, v in export_sd.items()} + # GPTQ quantization using Hessians collected during training phase (no training data access here) + quant_result, quant_meta = mixed_quantize_int6_gptq(sd_cpu, {"mlp", "attn", "aux"}, gptq_hessians) + quant_buf = io.BytesIO() + torch.save({"w": quant_result, "m": quant_meta}, quant_buf) + quant_raw = quant_buf.getvalue() + quant_blob = zstandard.ZstdCompressor(level=22).compress(quant_raw) if _COMPRESSOR == "zstd" else zlib.compress(quant_raw, 9) + if master_process: + with open("final_model.int6.ptz", "wb") as f: + f.write(quant_blob) + quant_file_bytes = len(quant_blob) + code_bytes = len(code.encode("utf-8")) + log0(f"Serialized model int6+{_COMPRESSOR}: {quant_file_bytes} bytes") + log0(f"Total submission size int6+{_COMPRESSOR}: {quant_file_bytes + code_bytes} bytes") + log0(f"Total submission size int8+zlib: {quant_file_bytes + code_bytes} bytes") + if distributed: + dist.barrier() + with open("final_model.int6.ptz", "rb") as f: + quant_blob_disk = f.read() + quant_state = torch.load( + io.BytesIO(zstandard.ZstdDecompressor().decompress(quant_blob_disk) if _COMPRESSOR == "zstd" else zlib.decompress(quant_blob_disk)), + map_location="cpu", + ) + deq_state = dequantize_mixed_int6(quant_state["w"], quant_state["m"], sd_cpu) + eval_model = GPT( + vocab_size=args.vocab_size, num_layers=args.num_layers, model_dim=args.model_dim, + num_heads=args.num_heads, num_kv_heads=args.num_kv_heads, mlp_mult=args.mlp_mult, + tie_embeddings=args.tie_embeddings, tied_embed_init_std=args.tied_embed_init_std, + logit_softcap=args.logit_softcap, rope_base=args.rope_base, qk_gain_init=args.qk_gain_init, + mtp_num_heads=0, mtp_loss_weight=0.0, + bigram_vocab_size=args.bigram_vocab_size, bigram_dim=args.bigram_dim, + xsa_last_n=args.xsa_last_n, # must match training model + rope_dims=args.rope_dims, ln_scale=args.ln_scale, dtg=args.dtg_enabled, + ve_enabled=args.ve_enabled, ve_dim=args.ve_dim, ve_layers=args.ve_layers, + mlp_act=args.mlp_act, mlp_leaky_slope=args.mlp_leaky_slope, + f1_corr_rank=args.f1_corr_rank, f1_corr_scale_init=args.f1_corr_scale_init, + mixer_n_experts=mixer_n_experts, mixer_neural_floor=args.mixer_neural_floor, + ).to(device).bfloat16() + for m in eval_model.modules(): + if isinstance(m, CastedLinear): + m.float() + restore_low_dim_params_to_fp32(eval_model) + eval_model.load_state_dict(deq_state, strict=True) + compiled_eval = maybe_torch_compile(eval_model, args) + torch.cuda.synchronize() + t_qeval = time.perf_counter() + q_val_loss, q_val_bpb = eval_val( + args, compiled_eval, rank, world_size, device, grad_accum_steps, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + eval_seq_len=effective_eval_seq_len, + ) + torch.cuda.synchronize() + log0( + f"final_int6_roundtrip val_loss:{q_val_loss:.4f} val_bpb:{q_val_bpb:.4f} " + f"eval_time:{1000.0 * (time.perf_counter() - t_qeval):.0f}ms" + ) + log0(f"final_int6_roundtrip_exact val_loss:{q_val_loss:.8f} val_bpb:{q_val_bpb:.8f}") + sw_seq_len = effective_eval_seq_len + if args.eval_stride > 0 and args.eval_stride < sw_seq_len: + torch.cuda.synchronize() + t_slide = time.perf_counter() + sw_val_loss, sw_val_bpb = eval_val_sliding( + args, eval_model, rank, world_size, device, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + stride=args.eval_stride, + eval_seq_len=sw_seq_len, + ) + torch.cuda.synchronize() + log0( + f"final_int6_sliding_window val_loss:{sw_val_loss:.4f} val_bpb:{sw_val_bpb:.4f} " + f"stride:{args.eval_stride} eval_time:{1000.0 * (time.perf_counter() - t_slide):.0f}ms" + ) + log0(f"final_int6_sliding_window_exact val_loss:{sw_val_loss:.8f} val_bpb:{sw_val_bpb:.8f}") + log0(f"final_int8_zlib_roundtrip_exact val_loss:{sw_val_loss:.8f} val_bpb:{sw_val_bpb:.8f}") + if args.ngram_eval_order >= 2: + if distributed: + dist.barrier() + torch.cuda.synchronize() + t_ng = time.perf_counter() + ng_loss, ng_bpb, ng_coverage = eval_val_sliding_hashed_ngram( + args, + eval_model, + rank, + world_size, + device, + val_tokens, + base_bytes_lut, + has_leading_space_lut, + is_boundary_token_lut, + stride=args.eval_stride, + order=args.ngram_eval_order, + alpha=args.ngram_eval_alpha, + min_count=args.ngram_eval_min_count, + buckets=args.ngram_eval_buckets, + max_seconds=args.ngram_eval_max_seconds, + eval_seq_len=sw_seq_len, + ) + if rank == 0: + torch.cuda.synchronize() + ng_eval_ms = 1000.0 * (time.perf_counter() - t_ng) + if ng_coverage >= 0.999999: + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order} val_loss:{ng_loss:.4f} " + f"val_bpb:{ng_bpb:.4f} eval_time:{ng_eval_ms:.0f}ms" + ) + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order}_exact " + f"val_loss:{ng_loss:.8f} val_bpb:{ng_bpb:.8f}" + ) + else: + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order}_partial val_loss:{ng_loss:.4f} " + f"val_bpb:{ng_bpb:.4f} coverage:{ng_coverage:.4f} eval_time:{ng_eval_ms:.0f}ms" + ) + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order}_partial_exact " + f"val_loss:{ng_loss:.8f} val_bpb:{ng_bpb:.8f} coverage:{ng_coverage:.8f}" + ) + if distributed: + dist.barrier() + if distributed: + dist.destroy_process_group() +if __name__ == "__main__": + main() diff --git a/experiments/A_wing/red/run.sh b/experiments/A_wing/red/run.sh new file mode 100755 index 000000000..d607d9cb0 --- /dev/null +++ b/experiments/A_wing/red/run.sh @@ -0,0 +1,74 @@ +#!/bin/bash +set -euo pipefail +# A-WING RED: Oracle Alpha + 9-Prime Hash Fix +# Instead of entropy-adaptive alpha, directly compare model_p vs ngram_p +# per token. Soft sigmoid on log-ratio (steepness=8), clip 0.95. +# Base: SOTA bwing_full_port (0.4512 BPB) + +SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd -- "${SCRIPT_DIR}/../../.." && pwd)" +cd "${REPO_ROOT}" +export PYTHONPATH="${REPO_ROOT}/flash-attention/hopper:${PYTHONPATH:-}" + +SEED="${SEED:-1337}" +NPROC_PER_NODE="${NPROC_PER_NODE:-8}" + +# --- Pre-flight checks --- +echo "[preflight] checking zstandard..." +python3 -c "import zstandard; print(f' zstandard {zstandard.__version__} OK')" 2>/dev/null \ + || { echo " FATAL: zstandard not found. pip install zstandard"; exit 1; } + +echo "[preflight] checking flash_attn..." +python3 -c " +try: + import flash_attn_interface; print(' FA3 (hopper) OK') +except ImportError: + import flash_attn; v=flash_attn.__version__ + if v.startswith('3'): print(f' FA3 v{v} OK') + else: print(f' WARNING: FA{v[0]} detected — want FA3') +" 2>/dev/null || echo " WARNING: no flash_attn found" + +echo "============================================" +echo " A-WING RED — Oracle Alpha + 9-Prime" +echo " Seed: ${SEED}" +echo " Oracle: alpha = sigmoid(8 * log(ngram_p/model_p)) * 0.95" +echo " 9 hash primes, INT6, no cubric" +echo " Training cap: 570s (30s reserved for GPTQ)" +echo "============================================" + +SEED="$SEED" \ +F1_CORR_RANK=0 \ +DISTILL_ENABLED=0 \ +MLP_ACT=leaky_relu_sq \ +MLP_LEAKY_SLOPE=0.5 \ +XSA_LAST_N=4 \ +BIGRAM_VOCAB_SIZE=1536 \ +TTT_EVAL_ENABLED=0 \ +ROPE_DIMS=24 \ +VAL_LOSS_EVERY=20000 \ +TRAIN_LOG_EVERY=1000 \ +SWA_EVERY=100 \ +COMPLEMENT_ALPHA=0.5 \ +NGRAM_EVAL_ORDER=9 \ +NGRAM_EVAL_MIN_ORDER=2 \ +NGRAM_EVAL_ADAPTIVE=1 \ +NGRAM_EVAL_ALPHA=0.30 \ +NGRAM_EVAL_ALPHA_MIN=0.05 \ +NGRAM_EVAL_ALPHA_MAX=0.60 \ +NGRAM_EVAL_ENTROPY_CENTER=3.0 \ +NGRAM_EVAL_ENTROPY_SCALE=2.0 \ +NGRAM_EVAL_MIN_COUNT=2 \ +NGRAM_EVAL_BUCKETS=8388608 \ +NGRAM_EVAL_MAX_SECONDS=0 \ +CUBRIC_CADENCE=0 \ +NGRAM_ENTROPY_SHIFT=1 \ +NGRAM_ORDER_MULTS="0.3,0.3,0.97,2.0,2.0,2.0,2.0,2.0" \ +MAX_WALLCLOCK_SECONDS=570 \ +COMPILE_FULLGRAPH=0 \ +torchrun --standalone --nproc_per_node="${NPROC_PER_NODE}" \ + "${SCRIPT_DIR}/train_gpt.py" \ + 2>&1 | tee "logs/awing_red_s${SEED}_$(date +%Y%m%d_%H%M%S).log" + +echo "============================================" +echo " DONE" +echo "============================================" diff --git a/experiments/A_wing/red/train_gpt.py b/experiments/A_wing/red/train_gpt.py new file mode 100644 index 000000000..fdd2e23dc --- /dev/null +++ b/experiments/A_wing/red/train_gpt.py @@ -0,0 +1,2114 @@ +from __future__ import annotations +import copy +import glob +import io +import math +import os +import random +import subprocess +import sys +import time +import uuid +import zlib +from pathlib import Path +try: + import zstandard + _COMPRESSOR = "zstd" +except ImportError: + import warnings + warnings.warn("zstandard not found — falling back to zlib. Artifact will be ~1.5MB larger! pip install zstandard") + _COMPRESSOR = "zlib" +import numpy as np +import sentencepiece as spm +import torch +import torch.distributed as dist +import torch.nn.functional as F +from torch import Tensor, nn +from torch.nn.parallel import DistributedDataParallel as DDP +try: + from flash_attn_interface import flash_attn_func as flash_attn_3_func +except ImportError: + def flash_attn_3_func(q, k, v, causal=False): + # q: (B, T, Hq, D), k/v: (B, T, Hkv, D) — expand KV for GQA + q2 = q.transpose(1, 2) # (B, Hq, T, D) + k2 = k.transpose(1, 2) # (B, Hkv, T, D) + v2 = v.transpose(1, 2) + if k2.size(1) != q2.size(1): + rep = q2.size(1) // k2.size(1) + k2 = k2.repeat_interleave(rep, dim=1) + v2 = v2.repeat_interleave(rep, dim=1) + out = torch.nn.functional.scaled_dot_product_attention(q2, k2, v2, is_causal=causal) + return out.transpose(1, 2) +class Hyperparameters: + data_path = os.environ.get("DATA_PATH", "./data/datasets/fineweb10B_sp1024") + train_files = os.path.join(data_path, "fineweb_train_*.bin") + val_files = os.path.join(data_path, "fineweb_val_*.bin") + tokenizer_path = os.environ.get("TOKENIZER_PATH", "./data/tokenizers/fineweb_1024_bpe.model") + run_id = os.environ.get("RUN_ID", str(uuid.uuid4())) + seed = int(os.environ.get("SEED", 1337)) + val_batch_size = int(os.environ.get("VAL_BATCH_SIZE", 524_288)) + val_loss_every = int(os.environ.get("VAL_LOSS_EVERY", 4000)) + train_log_every = int(os.environ.get("TRAIN_LOG_EVERY", 500)) + iterations = int(os.environ.get("ITERATIONS", 20000)) + warmdown_iters = int(os.environ.get("WARMDOWN_ITERS", 3500)) + warmup_steps = int(os.environ.get("WARMUP_STEPS", 20)) + train_batch_tokens = int(os.environ.get("TRAIN_BATCH_TOKENS", 786_432)) + train_seq_len = int(os.environ.get("TRAIN_SEQ_LEN", 2048)) + eval_seq_len = int(os.environ.get("EVAL_SEQ_LEN", 2048)) + max_wallclock_seconds = float(os.environ.get("MAX_WALLCLOCK_SECONDS", 600.0)) + qk_gain_init = float(os.environ.get("QK_GAIN_INIT", 1.5)) + vocab_size = int(os.environ.get("VOCAB_SIZE", 1024)) + num_layers = int(os.environ.get("NUM_LAYERS", 11)) + num_kv_heads = int(os.environ.get("NUM_KV_HEADS", 4)) + model_dim = int(os.environ.get("MODEL_DIM", 512)) + num_heads = int(os.environ.get("NUM_HEADS", 8)) + mlp_mult = float(os.environ.get("MLP_MULT", 3.0)) + mlp_act = os.environ.get("MLP_ACT", "relu_sq").lower() + mlp_leaky_slope = float(os.environ.get("MLP_LEAKY_SLOPE", 0.5)) + tie_embeddings = bool(int(os.environ.get("TIE_EMBEDDINGS", "1"))) + rope_base = float(os.environ.get("ROPE_BASE", 10000.0)) + logit_softcap = float(os.environ.get("LOGIT_SOFTCAP", 30.0)) + embed_lr = float(os.environ.get("EMBED_LR", 0.6)) + head_lr = float(os.environ.get("HEAD_LR", 0.008)) + tied_embed_lr = float(os.environ.get("TIED_EMBED_LR", 0.035)) + tied_embed_init_std = float(os.environ.get("TIED_EMBED_INIT_STD", 0.005)) + matrix_lr = float(os.environ.get("MATRIX_LR", 0.025)) + scalar_lr = float(os.environ.get("SCALAR_LR", 0.025)) + muon_momentum = float(os.environ.get("MUON_MOMENTUM", 0.99)) + muon_backend_steps = int(os.environ.get("MUON_BACKEND_STEPS", 5)) + muon_momentum_warmup_start = float(os.environ.get("MUON_MOMENTUM_WARMUP_START", 0.92)) + muon_momentum_warmup_steps = int(os.environ.get("MUON_MOMENTUM_WARMUP_STEPS", 1500)) + beta1 = float(os.environ.get("BETA1", 0.9)) + beta2 = float(os.environ.get("BETA2", 0.95)) + adam_eps = float(os.environ.get("ADAM_EPS", 1e-8)) + grad_clip_norm = float(os.environ.get("GRAD_CLIP_NORM", 0.3)) + eval_stride = int(os.environ.get("EVAL_STRIDE", 64)) + mtp_num_heads = int(os.environ.get("MTP_NUM_HEADS", 0)) + mtp_loss_weight = float(os.environ.get("MTP_LOSS_WEIGHT", 0.2)) + muon_beta2 = float(os.environ.get("MUON_BETA2", 0.95)) + swa_enabled = bool(int(os.environ.get("SWA_ENABLED", "1"))) + swa_every = int(os.environ.get("SWA_EVERY", 50)) # tighter: collect more recent checkpoints + muon_wd = float(os.environ.get("MUON_WD", 0.04)) + adam_wd = float(os.environ.get("ADAM_WD", 0.04)) + qat_enabled = bool(int(os.environ.get("QAT_ENABLED", "0"))) + bigram_vocab_size = int(os.environ.get("BIGRAM_VOCAB_SIZE", 2048)) + bigram_dim = int(os.environ.get("BIGRAM_DIM", 128)) + xsa_last_n = int(os.environ.get("XSA_LAST_N", 11)) # XSA on ALL 11 layers + rope_dims = int(os.environ.get("ROPE_DIMS", 16)) + ln_scale = bool(int(os.environ.get("LN_SCALE", "1"))) + dtg_enabled = bool(int(os.environ.get("DTG_ENABLED", "0"))) + late_qat_threshold = float(os.environ.get("LATE_QAT_THRESHOLD", 0.5)) + ve_enabled = bool(int(os.environ.get("VE_ENABLED", "1"))) + ve_dim = int(os.environ.get("VE_DIM", 128)) + ve_layers = os.environ.get("VE_LAYERS", "9,10") + # F1 capacity add-on: low-rank correction head (active at inference). + # Approx extra params ~= rank * (model_dim + vocab_size). + f1_corr_rank = int(os.environ.get("F1_CORR_RANK", 0)) + f1_corr_scale_init = float(os.environ.get("F1_CORR_SCALE_INIT", 0.10)) + # Post-train self-distillation: EMA teacher -> student. + distill_enabled = bool(int(os.environ.get("DISTILL_ENABLED", "0"))) + distill_steps = int(os.environ.get("DISTILL_STEPS", 24)) + distill_lr_factor = float(os.environ.get("DISTILL_LR_FACTOR", 0.02)) + distill_temperature = float(os.environ.get("DISTILL_TEMPERATURE", 1.5)) + distill_alpha = float(os.environ.get("DISTILL_ALPHA", 0.60)) + distill_kl_clip = float(os.environ.get("DISTILL_KL_CLIP", 10.0)) + # Optional legal score-first hashed n-gram interpolation at eval time. + # Multi-order backoff (2..max_order) with entropy-adaptive alpha. + # Alpha depends only on model entropy (no target/label access). + ngram_eval_order = int(os.environ.get("NGRAM_EVAL_ORDER", 0)) # 0=off, max order for backoff + ngram_eval_min_order = int(os.environ.get("NGRAM_EVAL_MIN_ORDER", 2)) # min order for backoff + ngram_eval_alpha = float(os.environ.get("NGRAM_EVAL_ALPHA", 0.30)) # base alpha (or fixed if adaptive off) + ngram_eval_adaptive = bool(int(os.environ.get("NGRAM_EVAL_ADAPTIVE", "1"))) # entropy-adaptive alpha + ngram_eval_alpha_min = float(os.environ.get("NGRAM_EVAL_ALPHA_MIN", 0.05)) # alpha floor (confident model) + ngram_eval_alpha_max = float(os.environ.get("NGRAM_EVAL_ALPHA_MAX", 0.60)) # alpha ceiling (uncertain model) + ngram_eval_entropy_center = float(os.environ.get("NGRAM_EVAL_ENTROPY_CENTER", 4.0)) # sigmoid center + ngram_eval_entropy_scale = float(os.environ.get("NGRAM_EVAL_ENTROPY_SCALE", 2.0)) # sigmoid steepness + ngram_eval_min_count = int(os.environ.get("NGRAM_EVAL_MIN_COUNT", 2)) + ngram_eval_buckets = int(os.environ.get("NGRAM_EVAL_BUCKETS", 4_194_304)) + ngram_eval_max_seconds = float(os.environ.get("NGRAM_EVAL_MAX_SECONDS", 0.0)) + ngram_entropy_shift = bool(int(os.environ.get("NGRAM_ENTROPY_SHIFT", "0"))) # per-order center shift + ngram_order_mults_str = os.environ.get("NGRAM_ORDER_MULTS", "") # fixed per-order multipliers (comma-sep) + cubric_cadence = int(os.environ.get("CUBRIC_CADENCE", 0)) + compile_enabled = bool(int(os.environ.get("COMPILE_ENABLED", "1"))) + compile_fullgraph = bool(int(os.environ.get("COMPILE_FULLGRAPH", "1"))) +def maybe_torch_compile(obj, args: Hyperparameters): + if not args.compile_enabled: + return obj + return torch.compile(obj, dynamic=False, fullgraph=args.compile_fullgraph) +class TrainNgramTracker: + """Complementary training: track bigram stats, downweight tokens n-grams can predict.""" + def __init__(self, vocab_size: int, device: torch.device, complement_alpha: float = 0.5): + self.V = vocab_size + self.alpha = complement_alpha + self.bi_counts = torch.zeros(vocab_size, vocab_size, device=device, dtype=torch.float32) + self.bi_totals = torch.zeros(vocab_size, device=device, dtype=torch.float32) + @torch.no_grad() + def update(self, x: Tensor, y: Tensor): + xf = x.reshape(-1) + yf = y.reshape(-1) + ones = torch.ones(xf.numel(), device=xf.device, dtype=torch.float32) + self.bi_counts.reshape(-1).scatter_add_(0, xf * self.V + yf, ones) + self.bi_totals.scatter_add_(0, xf, ones) + def get_weights(self, x: Tensor, y: Tensor) -> Tensor: + xf = x.reshape(-1) + yf = y.reshape(-1) + total = self.bi_totals[xf] + count = self.bi_counts.reshape(-1)[xf * self.V + yf] + ngram_prob = count / (total + 1) + return (1.0 - self.alpha * ngram_prob).clamp(min=0.1) +def zeropower_via_newtonschulz5(G: Tensor, steps: int = 10, eps: float = 1e-7) -> Tensor: + a, b, c = (3.4445, -4.7750, 2.0315) + X = G.bfloat16() + X /= X.norm() + eps + transposed = G.size(0) > G.size(1) + if transposed: + X = X.T + for _ in range(steps): + A = X @ X.T + B = b * A + c * A @ A + X = a * X + B @ X + return X.T if transposed else X +class Muon(torch.optim.Optimizer): + def __init__(self, params, lr: float, momentum: float, backend_steps: int, + nesterov: bool = True, weight_decay: float = 0.0): + super().__init__( + params, + dict(lr=lr, momentum=momentum, backend_steps=backend_steps, + nesterov=nesterov, weight_decay=weight_decay), + ) + @torch.no_grad() + def step(self, closure=None): + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + distributed = dist.is_available() and dist.is_initialized() + world_size = dist.get_world_size() if distributed else 1 + rank = dist.get_rank() if distributed else 0 + for group in self.param_groups: + params = group["params"] + if not params: + continue + lr = group["lr"] + momentum = group["momentum"] + backend_steps = group["backend_steps"] + nesterov = group["nesterov"] + total_params = sum(int(p.numel()) for p in params) + updates_flat = torch.zeros(total_params, device=params[0].device, dtype=torch.bfloat16) + curr = 0 + for i, p in enumerate(params): + if i % world_size == rank and p.grad is not None: + g = p.grad + state = self.state[p] + if "momentum_buffer" not in state: + state["momentum_buffer"] = torch.zeros_like(g) + buf = state["momentum_buffer"] + buf.mul_(momentum).add_(g) + if nesterov: + g = g.add(buf, alpha=momentum) + g = zeropower_via_newtonschulz5(g, steps=backend_steps) + g *= max(1, g.size(0) / g.size(1)) ** 0.5 + updates_flat[curr : curr + p.numel()] = g.reshape(-1) + curr += p.numel() + if distributed: + dist.all_reduce(updates_flat, op=dist.ReduceOp.SUM) + wd = group.get("weight_decay", 0.0) + curr = 0 + for p in params: + if wd > 0.0: + p.data.mul_(1.0 - lr * wd) + g = updates_flat[curr : curr + p.numel()].view_as(p).to(dtype=p.dtype) + p.add_(g, alpha=-lr) + curr += p.numel() + return loss +def build_sentencepiece_luts( + sp: spm.SentencePieceProcessor, vocab_size: int, device: torch.device +) -> tuple[Tensor, Tensor, Tensor]: + sp_vocab_size = int(sp.vocab_size()) + table_size = max(sp_vocab_size, vocab_size) + base_bytes_np = np.zeros((table_size,), dtype=np.int16) + has_leading_space_np = np.zeros((table_size,), dtype=np.bool_) + is_boundary_token_np = np.ones((table_size,), dtype=np.bool_) + for token_id in range(sp_vocab_size): + if sp.is_control(token_id) or sp.is_unknown(token_id) or sp.is_unused(token_id): + continue + is_boundary_token_np[token_id] = False + if sp.is_byte(token_id): + base_bytes_np[token_id] = 1 + continue + piece = sp.id_to_piece(token_id) + if piece.startswith("▁"): + has_leading_space_np[token_id] = True + piece = piece[1:] + base_bytes_np[token_id] = len(piece.encode("utf-8")) + return ( + torch.tensor(base_bytes_np, dtype=torch.int16, device=device), + torch.tensor(has_leading_space_np, dtype=torch.bool, device=device), + torch.tensor(is_boundary_token_np, dtype=torch.bool, device=device), + ) +def load_validation_tokens(pattern: str, seq_len: int) -> Tensor: + files = [Path(p) for p in sorted(glob.glob(pattern))] + if not files: + raise FileNotFoundError(f"No files found for pattern: {pattern}") + tokens = torch.cat([load_data_shard(file) for file in files]).contiguous() + usable = ((tokens.numel() - 1) // seq_len) * seq_len + if usable <= 0: + raise ValueError(f"Validation split is too short for TRAIN_SEQ_LEN={seq_len}") + return tokens[: usable + 1] +def eval_val( + args: Hyperparameters, + model: nn.Module, + rank: int, + world_size: int, + device: torch.device, + grad_accum_steps: int, + val_tokens: Tensor, + base_bytes_lut: Tensor, + has_leading_space_lut: Tensor, + is_boundary_token_lut: Tensor, + eval_seq_len: int | None = None, +) -> tuple[float, float]: + seq_len = eval_seq_len or args.train_seq_len + local_batch_tokens = args.val_batch_size // (world_size * grad_accum_steps) + if local_batch_tokens < seq_len: + raise ValueError( + "VAL_BATCH_SIZE must provide at least one sequence per rank; " + f"got VAL_BATCH_SIZE={args.val_batch_size}, WORLD_SIZE={world_size}, " + f"GRAD_ACCUM_STEPS={grad_accum_steps}, seq_len={seq_len}" + ) + local_batch_seqs = local_batch_tokens // seq_len + total_seqs = (val_tokens.numel() - 1) // seq_len + seq_start = (total_seqs * rank) // world_size + seq_end = (total_seqs * (rank + 1)) // world_size + val_loss_sum = torch.zeros((), device=device, dtype=torch.float64) + val_token_count = torch.zeros((), device=device, dtype=torch.float64) + val_byte_count = torch.zeros((), device=device, dtype=torch.float64) + model.eval() + with torch.inference_mode(): + for batch_seq_start in range(seq_start, seq_end, local_batch_seqs): + batch_seq_end = min(batch_seq_start + local_batch_seqs, seq_end) + raw_start = batch_seq_start * seq_len + raw_end = batch_seq_end * seq_len + 1 + local = val_tokens[raw_start:raw_end].to(device=device, dtype=torch.int64, non_blocking=True) + x = local[:-1].reshape(-1, seq_len) + y = local[1:].reshape(-1, seq_len) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + batch_loss = model(x, y).detach() + batch_token_count = float(y.numel()) + val_loss_sum += batch_loss.to(torch.float64) * batch_token_count + val_token_count += batch_token_count + prev_ids = x.reshape(-1) + tgt_ids = y.reshape(-1) + token_bytes = base_bytes_lut[tgt_ids].to(dtype=torch.int16) + token_bytes += (has_leading_space_lut[tgt_ids] & ~is_boundary_token_lut[prev_ids]).to(dtype=torch.int16) + val_byte_count += token_bytes.to(torch.float64).sum() + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(val_loss_sum, op=dist.ReduceOp.SUM) + dist.all_reduce(val_token_count, op=dist.ReduceOp.SUM) + dist.all_reduce(val_byte_count, op=dist.ReduceOp.SUM) + val_loss = val_loss_sum / val_token_count + bits_per_token = val_loss.item() / math.log(2.0) + tokens_per_byte = val_token_count.item() / val_byte_count.item() + model.train() + return float(val_loss.item()), float(bits_per_token * tokens_per_byte) +CONTROL_TENSOR_NAME_PATTERNS = tuple( + pattern + for pattern in os.environ.get( + "CONTROL_TENSOR_NAME_PATTERNS", + "attn_scale,attn_scales,mlp_scale,mlp_scales,resid_mix,resid_mixes,q_gain,skip_weight,skip_weights,smear,dtg_gate,ve_layer_scales,ve_shared.scale", + ).split(",") + if pattern +) +INT8_KEEP_FLOAT_FP32_NAME_PATTERNS = tuple( + pattern + for pattern in os.environ.get( + "INT8_KEEP_FLOAT_FP32_NAME_PATTERNS", + ",".join(CONTROL_TENSOR_NAME_PATTERNS), + ).split(",") + if pattern +) +INT8_KEEP_FLOAT_MAX_NUMEL = 65_536 +INT8_KEEP_FLOAT_STORE_DTYPE = torch.float16 +INT8_PER_ROW_SCALE_DTYPE = torch.float16 +INT8_CLIP_PERCENTILE = 99.99984 +INT8_CLIP_Q = INT8_CLIP_PERCENTILE / 100.0 +def tensor_nbytes(t: Tensor) -> int: + return int(t.numel()) * int(t.element_size()) +def keep_float_tensor(name: str, t: Tensor, passthrough_orig_dtypes: dict[str, str]) -> Tensor: + if any(pattern in name for pattern in INT8_KEEP_FLOAT_FP32_NAME_PATTERNS): + return t.float().contiguous() + if t.dtype in {torch.float32, torch.bfloat16}: + passthrough_orig_dtypes[name] = str(t.dtype).removeprefix("torch.") + return t.to(dtype=INT8_KEEP_FLOAT_STORE_DTYPE).contiguous() + return t +def quantize_float_tensor(t: Tensor) -> tuple[Tensor, Tensor]: + t32 = t.float() + if t32.ndim == 2: + clip_abs = ( + torch.quantile(t32.abs(), INT8_CLIP_Q, dim=1) + if t32.numel() + else torch.empty((t32.shape[0],), dtype=torch.float32) + ) + clipped = torch.maximum(torch.minimum(t32, clip_abs[:, None]), -clip_abs[:, None]) + scale = (clip_abs / 127.0).clamp_min(1.0 / 127.0) + q = torch.clamp(torch.round(clipped / scale[:, None]), -127, 127).to(torch.int8).contiguous() + return q, scale.to(dtype=INT8_PER_ROW_SCALE_DTYPE).contiguous() + clip_abs = float(torch.quantile(t32.abs().flatten(), INT8_CLIP_Q).item()) if t32.numel() else 0.0 + scale = torch.tensor(clip_abs / 127.0 if clip_abs > 0 else 1.0, dtype=torch.float32) + q = torch.clamp(torch.round(torch.clamp(t32, -clip_abs, clip_abs) / scale), -127, 127).to(torch.int8).contiguous() + return q, scale +def quantize_state_dict_int8(state_dict: dict[str, Tensor]): + quantized: dict[str, Tensor] = {} + scales: dict[str, Tensor] = {} + dtypes: dict[str, str] = {} + passthrough: dict[str, Tensor] = {} + passthrough_orig_dtypes: dict[str, str] = {} + qmeta: dict[str, dict[str, object]] = {} + stats = dict.fromkeys( + ("param_count", "num_tensors", "num_float_tensors", "num_nonfloat_tensors", "baseline_tensor_bytes", "int8_payload_bytes"), + 0, + ) + for name, tensor in state_dict.items(): + t = tensor.detach().to("cpu").contiguous() + stats["param_count"] += int(t.numel()) + stats["num_tensors"] += 1 + stats["baseline_tensor_bytes"] += tensor_nbytes(t) + if not t.is_floating_point(): + stats["num_nonfloat_tensors"] += 1 + passthrough[name] = t + stats["int8_payload_bytes"] += tensor_nbytes(t) + continue + if t.numel() <= INT8_KEEP_FLOAT_MAX_NUMEL: + kept = keep_float_tensor(name, t, passthrough_orig_dtypes) + passthrough[name] = kept + stats["int8_payload_bytes"] += tensor_nbytes(kept) + continue + stats["num_float_tensors"] += 1 + q, s = quantize_float_tensor(t) + if s.ndim > 0: + qmeta[name] = {"scheme": "per_row", "axis": 0} + quantized[name] = q + scales[name] = s + dtypes[name] = str(t.dtype).removeprefix("torch.") + stats["int8_payload_bytes"] += tensor_nbytes(q) + tensor_nbytes(s) + obj: dict[str, object] = { + "__quant_format__": "int8_clean_per_row_v1", + "quantized": quantized, + "scales": scales, + "dtypes": dtypes, + "passthrough": passthrough, + } + if qmeta: + obj["qmeta"] = qmeta + if passthrough_orig_dtypes: + obj["passthrough_orig_dtypes"] = passthrough_orig_dtypes + return obj, stats +def dequantize_state_dict_int8(obj: dict[str, object]) -> dict[str, Tensor]: + out: dict[str, Tensor] = {} + qmeta = obj.get("qmeta", {}) + passthrough_orig_dtypes = obj.get("passthrough_orig_dtypes", {}) + for name, q in obj["quantized"].items(): + dtype = getattr(torch, obj["dtypes"][name]) + s = obj["scales"][name] + if qmeta.get(name, {}).get("scheme") == "per_row" or s.ndim > 0: + s = s.to(dtype=torch.float32) + out[name] = (q.float() * s.view(q.shape[0], *([1] * (q.ndim - 1)))).to(dtype=dtype).contiguous() + else: + scale = float(s.item()) + out[name] = (q.float() * scale).to(dtype=dtype).contiguous() + for name, t in obj["passthrough"].items(): + out_t = t.detach().to("cpu").contiguous() + orig_dtype = passthrough_orig_dtypes.get(name) + if isinstance(orig_dtype, str): + out_t = out_t.to(dtype=getattr(torch, orig_dtype)).contiguous() + out[name] = out_t + return out +def load_data_shard(file: Path) -> Tensor: + header_bytes = 256 * np.dtype(" None: + self.file_idx = (self.file_idx + 1) % len(self.files) + self.tokens = load_data_shard(self.files[self.file_idx]) + self.pos = 0 + def take(self, n: int) -> Tensor: + chunks: list[Tensor] = [] + remaining = n + while remaining > 0: + avail = self.tokens.numel() - self.pos + if avail <= 0: + self._advance_file() + continue + k = min(remaining, avail) + chunks.append(self.tokens[self.pos : self.pos + k]) + self.pos += k + remaining -= k + return chunks[0] if len(chunks) == 1 else torch.cat(chunks) +class DistributedTokenLoader: + def __init__(self, pattern: str, rank: int, world_size: int, device: torch.device): + self.rank = rank + self.world_size = world_size + self.device = device + self.stream = TokenStream(pattern) + def next_batch(self, global_tokens: int, seq_len: int, grad_accum_steps: int) -> tuple[Tensor, Tensor]: + local_tokens = global_tokens // (self.world_size * grad_accum_steps) + per_rank_span = local_tokens + 1 + chunk = self.stream.take(per_rank_span * self.world_size) + start = self.rank * per_rank_span + local = chunk[start : start + per_rank_span].to(dtype=torch.int64) + x = local[:-1].reshape(-1, seq_len) + y = local[1:].reshape(-1, seq_len) + return x.to(self.device, non_blocking=True), y.to(self.device, non_blocking=True) +class RMSNorm(nn.Module): + def __init__(self, eps: float | None = None): + super().__init__() + self.eps = eps + def forward(self, x: Tensor) -> Tensor: + return F.rms_norm(x, (x.size(-1),), eps=self.eps) +class CastedLinear(nn.Linear): + _qat_enabled: bool = False + def forward(self, x: Tensor) -> Tensor: + w = self.weight.to(x.dtype) + if CastedLinear._qat_enabled and self.training and w.ndim == 2: + with torch.no_grad(): + w32 = self.weight.float() + # Use 99.95th percentile clipping to match GPTQ export quantizer + row_clip = torch.quantile(w32.abs(), 0.9995, dim=1) + scale = (row_clip / 31.0).clamp_min(1.0 / 31.0) + w_q = (torch.clamp(torch.round(w32 / scale[:, None]), -32, 31) * scale[:, None]).to(x.dtype) + w = w + (w_q - w).detach() + bias = self.bias.to(x.dtype) if self.bias is not None else None + return F.linear(x, w, bias) +def restore_low_dim_params_to_fp32(module: nn.Module) -> None: + with torch.no_grad(): + for name, param in module.named_parameters(): + if (param.ndim < 2 or any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS)) and param.dtype != torch.float32: + param.data = param.data.float() +class Rotary(nn.Module): + def __init__(self, dim: int, base: float = 10000.0, train_seq_len: int = 1024, rope_dims: int = 0): + super().__init__() + self.dim = dim + self.base = base + self.train_seq_len = train_seq_len + self.rope_dims = rope_dims if rope_dims > 0 else dim + inv_freq = 1.0 / (base ** (torch.arange(0, self.rope_dims, 2, dtype=torch.float32) / self.rope_dims)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self._seq_len_cached = 0 + self._cos_cached: Tensor | None = None + self._sin_cached: Tensor | None = None + def forward(self, seq_len: int, device: torch.device, dtype: torch.dtype) -> tuple[Tensor, Tensor]: + if ( + self._cos_cached is None + or self._sin_cached is None + or self._seq_len_cached != seq_len + or self._cos_cached.device != device + ): + rd = self.rope_dims + if seq_len > self.train_seq_len: + scale = seq_len / self.train_seq_len + new_base = self.base * (scale ** (rd / (rd - 2))) + inv_freq = 1.0 / (new_base ** (torch.arange(0, rd, 2, dtype=torch.float32, device=device) / rd)) + else: + inv_freq = self.inv_freq.to(device) + t = torch.arange(seq_len, device=device, dtype=inv_freq.dtype) + freqs = torch.outer(t, inv_freq) + self._cos_cached = freqs.cos()[None, :, None, :] + self._sin_cached = freqs.sin()[None, :, None, :] + self._seq_len_cached = seq_len + return self._cos_cached.to(dtype=dtype), self._sin_cached.to(dtype=dtype) +def apply_rotary_emb(x: Tensor, cos: Tensor, sin: Tensor, rope_dims: int = 0) -> Tensor: + if rope_dims > 0 and rope_dims < x.size(-1): + x_rope, x_pass = x[..., :rope_dims], x[..., rope_dims:] + half = rope_dims // 2 + x1, x2 = x_rope[..., :half], x_rope[..., half:] + x_rope = torch.cat((x1 * cos + x2 * sin, x1 * (-sin) + x2 * cos), dim=-1) + return torch.cat((x_rope, x_pass), dim=-1) + half = x.size(-1) // 2 + x1, x2 = x[..., :half], x[..., half:] + return torch.cat((x1 * cos + x2 * sin, x1 * (-sin) + x2 * cos), dim=-1) +class CausalSelfAttention(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + num_kv_heads: int, + rope_base: float, + qk_gain_init: float, + ): + super().__init__() + if dim % num_heads != 0: + raise ValueError("model_dim must be divisible by num_heads") + if num_heads % num_kv_heads != 0: + raise ValueError("num_heads must be divisible by num_kv_heads") + self.num_heads = num_heads + self.num_kv_heads = num_kv_heads + self.head_dim = dim // num_heads + if self.head_dim % 2 != 0: + raise ValueError("head_dim must be even for RoPE") + kv_dim = self.num_kv_heads * self.head_dim + self.c_q = CastedLinear(dim, dim, bias=False) + self.c_k = CastedLinear(dim, kv_dim, bias=False) + self.c_v = CastedLinear(dim, kv_dim, bias=False) + self.proj = CastedLinear(dim, dim, bias=False) + self.proj._zero_init = True + self.q_gain = nn.Parameter(torch.full((num_heads,), qk_gain_init, dtype=torch.float32)) + self.rope_dims = 0 # set by GPT.__init__ for partial RoPE + self.rotary = Rotary(self.head_dim, base=rope_base, train_seq_len=1024) + self.use_xsa = False # set by GPT.__init__ for deep layers only + def _xsa_efficient(self, y: Tensor, v: Tensor) -> Tensor: + """Efficient XSA: subtract self-value projection via GQA-aware reshape (no repeat_interleave). + y: [B, T, H, D], v: [B, T, Hkv, D]. H must be divisible by Hkv.""" + B, T, H, D = y.shape + Hkv = v.size(-2) + group = H // Hkv + y_g = y.reshape(B, T, Hkv, group, D) # [B, T, Hkv, group, D] + vn = F.normalize(v, dim=-1).unsqueeze(-2) # [B, T, Hkv, 1, D] — broadcast ready + proj = (y_g * vn).sum(dim=-1, keepdim=True) * vn + return (y_g - proj).reshape(B, T, H, D) + def forward(self, x: Tensor, v_embed: Tensor | None = None) -> Tensor: + bsz, seqlen, dim = x.shape + q = self.c_q(x).reshape(bsz, seqlen, self.num_heads, self.head_dim) + k = self.c_k(x).reshape(bsz, seqlen, self.num_kv_heads, self.head_dim) + v = self.c_v(x) + if v_embed is not None: + v = v + v_embed + v = v.reshape(bsz, seqlen, self.num_kv_heads, self.head_dim) + q = F.rms_norm(q, (q.size(-1),)) + k = F.rms_norm(k, (k.size(-1),)) + cos, sin = self.rotary(seqlen, x.device, q.dtype) + q = apply_rotary_emb(q, cos, sin, self.rope_dims) + k = apply_rotary_emb(k, cos, sin, self.rope_dims) + q = q * self.q_gain.to(dtype=q.dtype)[None, None, :, None] + y = flash_attn_3_func(q, k, v, causal=True) + if self.use_xsa: + y = self._xsa_efficient(y, v) + y = y.reshape(bsz, seqlen, dim) + return self.proj(y) +class SmearGate(nn.Module): + def __init__(self, dim: int): + super().__init__() + self.gate = nn.Parameter(torch.zeros(dim, dtype=torch.float32)) + def forward(self, x: Tensor) -> Tensor: + g = torch.sigmoid(self.gate.to(dtype=x.dtype))[None, None, :] + x_prev = torch.cat([torch.zeros_like(x[:, :1]), x[:, :-1]], dim=1) + return (1 - g) * x + g * x_prev +class BigramHashEmbedding(nn.Module): + def __init__(self, bigram_vocab_size: int, bigram_dim: int, model_dim: int): + super().__init__() + self.bigram_vocab_size = bigram_vocab_size + self.embed = nn.Embedding(bigram_vocab_size, bigram_dim) + nn.init.zeros_(self.embed.weight) + self.proj = CastedLinear(bigram_dim, model_dim, bias=False) if bigram_dim != model_dim else None + if self.proj is not None: + nn.init.zeros_(self.proj.weight) + self.scale = nn.Parameter(torch.tensor(0.05, dtype=torch.float32)) + def bigram_hash(self, tokens: Tensor) -> Tensor: + t = tokens.to(torch.int32) + mod = self.bigram_vocab_size - 1 + out = torch.empty_like(t) + out[..., 0] = mod + out[..., 1:] = torch.bitwise_xor(36313 * t[..., 1:], 27191 * t[..., :-1]) % mod + return out.long() + def forward(self, token_ids: Tensor) -> Tensor: + h = self.embed(self.bigram_hash(token_ids)) + if self.proj is not None: + h = self.proj(h) + return h * self.scale.to(dtype=h.dtype) +class ValueEmbedding(nn.Module): + """Reinject token identity into attention values at specific layers. + Each table maps vocab tokens to a low-dim embedding, projected to model_dim.""" + def __init__(self, vocab_size: int, ve_dim: int, model_dim: int): + super().__init__() + self.embed = nn.Embedding(vocab_size, ve_dim) + nn.init.normal_(self.embed.weight, std=0.01) + self.proj = CastedLinear(ve_dim, model_dim, bias=False) if ve_dim != model_dim else None + if self.proj is not None: + nn.init.zeros_(self.proj.weight) + self.scale = nn.Parameter(torch.tensor(0.1, dtype=torch.float32)) + def forward(self, token_ids: Tensor) -> Tensor: + h = self.embed(token_ids) + if self.proj is not None: + h = self.proj(h) + return h * self.scale.to(dtype=h.dtype) +class MLP(nn.Module): + def __init__(self, dim: int, mlp_mult: int, mlp_act: str = "relu_sq", mlp_leaky_slope: float = 0.5): + super().__init__() + hidden = int(mlp_mult * dim) + self.fc = CastedLinear(dim, hidden, bias=False) + self.proj = CastedLinear(hidden, dim, bias=False) + self.proj._zero_init = True + self.mlp_act = mlp_act + self.mlp_leaky_slope = mlp_leaky_slope + if self.mlp_act not in {"relu_sq", "leaky_relu_sq"}: + raise ValueError(f"Unsupported MLP_ACT '{self.mlp_act}'. Use 'relu_sq' or 'leaky_relu_sq'.") + def forward(self, x: Tensor) -> Tensor: + x = self.fc(x) + if self.mlp_act == "leaky_relu_sq": + x = F.leaky_relu(x, negative_slope=self.mlp_leaky_slope) + else: + x = F.relu(x) + return self.proj(x.square()) +class Block(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + num_kv_heads: int, + mlp_mult: int, + rope_base: float, + qk_gain_init: float, + layer_idx: int = 0, + ln_scale: bool = False, + dtg: bool = False, + mlp_act: str = "relu_sq", + mlp_leaky_slope: float = 0.5, + ): + super().__init__() + self.attn_norm = RMSNorm() + self.mlp_norm = RMSNorm() + self.attn = CausalSelfAttention(dim, num_heads, num_kv_heads, rope_base, qk_gain_init) + self.mlp = MLP(dim, mlp_mult, mlp_act=mlp_act, mlp_leaky_slope=mlp_leaky_slope) + self.attn_scale = nn.Parameter(torch.ones(dim, dtype=torch.float32)) + self.mlp_scale = nn.Parameter(torch.ones(dim, dtype=torch.float32)) + self.resid_mix = nn.Parameter(torch.stack((torch.ones(dim), torch.zeros(dim))).float()) + self.ln_scale_factor = 1.0 / math.sqrt(layer_idx + 1) if ln_scale else 1.0 + if dtg: + self.dtg_gate = nn.Linear(dim, 1, bias=True) + nn.init.zeros_(self.dtg_gate.weight) + nn.init.constant_(self.dtg_gate.bias, 2.0) + else: + self.dtg_gate = None + def forward(self, x: Tensor, x0: Tensor, v_embed: Tensor | None = None) -> Tensor: + mix = self.resid_mix.to(dtype=x.dtype) + x_in = mix[0][None, None, :] * x + mix[1][None, None, :] * x0 + attn_out = self.attn(self.attn_norm(x_in) * self.ln_scale_factor, v_embed=v_embed) + x_out = x_in + self.attn_scale.to(dtype=x_in.dtype)[None, None, :] * attn_out + x_out = x_out + self.mlp_scale.to(dtype=x_out.dtype)[None, None, :] * self.mlp(self.mlp_norm(x_out) * self.ln_scale_factor) + if self.dtg_gate is not None: + gate = torch.sigmoid(self.dtg_gate(x_in.detach())) + x_out = x_in + gate * (x_out - x_in) + return x_out +class GPT(nn.Module): + def __init__( + self, + vocab_size: int, + num_layers: int, + model_dim: int, + num_heads: int, + num_kv_heads: int, + mlp_mult: int, + tie_embeddings: bool, + tied_embed_init_std: float, + logit_softcap: float, + rope_base: float, + qk_gain_init: float, + mtp_num_heads: int = 0, + mtp_loss_weight: float = 0.1, + bigram_vocab_size: int = 0, + bigram_dim: int = 128, + xsa_last_n: int = 0, + rope_dims: int = 0, + ln_scale: bool = False, + dtg: bool = False, + ve_enabled: bool = False, + ve_dim: int = 128, + ve_layers: str = "9,10", + mlp_act: str = "relu_sq", + mlp_leaky_slope: float = 0.5, + f1_corr_rank: int = 0, + f1_corr_scale_init: float = 0.10, + ): + super().__init__() + self._ve_target_dim = num_kv_heads * (model_dim // num_heads) # kv_dim for value projection + if logit_softcap <= 0.0: + raise ValueError(f"logit_softcap must be positive, got {logit_softcap}") + self.tie_embeddings = tie_embeddings + self.tied_embed_init_std = tied_embed_init_std + self.logit_softcap = logit_softcap + self.mtp_num_heads = mtp_num_heads + self.mtp_loss_weight = mtp_loss_weight + self.tok_emb = nn.Embedding(vocab_size, model_dim) + self.bigram = BigramHashEmbedding(bigram_vocab_size, bigram_dim, model_dim) if bigram_vocab_size > 0 else None + self.smear = SmearGate(model_dim) + self.num_encoder_layers = num_layers // 2 + self.num_decoder_layers = num_layers - self.num_encoder_layers + self.num_skip_weights = min(self.num_encoder_layers, self.num_decoder_layers) + self.skip_weights = nn.Parameter(torch.ones(self.num_skip_weights, model_dim, dtype=torch.float32)) + self.blocks = nn.ModuleList( + [ + Block( + model_dim, + num_heads, + num_kv_heads, + mlp_mult, + rope_base, + qk_gain_init, + layer_idx=i, + ln_scale=ln_scale, + dtg=dtg, + mlp_act=mlp_act, + mlp_leaky_slope=mlp_leaky_slope, + ) + for i in range(num_layers) + ] + ) + if rope_dims > 0: + head_dim = model_dim // num_heads + for block in self.blocks: + block.attn.rope_dims = rope_dims + block.attn.rotary = Rotary(head_dim, base=rope_base, train_seq_len=1024, rope_dims=rope_dims) + self.ve_layer_indices = [int(x) for x in ve_layers.split(",") if x.strip()] if ve_enabled else [] + kv_dim = self._ve_target_dim + if self.ve_layer_indices: + self.ve_shared = ValueEmbedding(vocab_size, ve_dim, kv_dim) + self.ve_layer_scales = nn.ParameterList( + [nn.Parameter(torch.ones(1, dtype=torch.float32)) for _ in self.ve_layer_indices] + ) + else: + self.ve_shared = None + self.ve_layer_scales = nn.ParameterList() + self.value_embeds = nn.ModuleList() # keep empty for compat + self.final_norm = RMSNorm() + self.lm_head = None if tie_embeddings else CastedLinear(model_dim, vocab_size, bias=False) + if self.lm_head is not None: + self.lm_head._zero_init = True + self.mtp_heads = nn.ModuleList( + [CastedLinear(model_dim, vocab_size, bias=False) for _ in range(mtp_num_heads)] + ) + for head in self.mtp_heads: + head._zero_init = True + # Low-rank correction path for extra capacity under size budget. + self.f1_corr_rank = f1_corr_rank + if f1_corr_rank > 0: + self.f1_corr_in = CastedLinear(model_dim, f1_corr_rank, bias=False) + self.f1_corr_out = CastedLinear(f1_corr_rank, vocab_size, bias=False) + self.f1_corr_out._zero_init = True + self.f1_corr_scale = nn.Parameter(torch.tensor(f1_corr_scale_init, dtype=torch.float32)) + else: + self.f1_corr_in = None + self.f1_corr_out = None + self.f1_corr_scale = None + if xsa_last_n > 0: + for i in range(max(0, num_layers - xsa_last_n), num_layers): + self.blocks[i].attn.use_xsa = True + self._init_weights() + def _init_weights(self) -> None: + if self.tie_embeddings: + nn.init.normal_(self.tok_emb.weight, mean=0.0, std=self.tied_embed_init_std) + num_layers = len(self.blocks) + for name, module in self.named_modules(): + if isinstance(module, nn.Linear): + if getattr(module, "_zero_init", False): + nn.init.zeros_(module.weight) + elif module.weight.ndim == 2 and module.weight.shape[0] >= 64 and module.weight.shape[1] >= 64: + nn.init.orthogonal_(module.weight, gain=1.0) + if ".proj." in name or name.endswith(".proj"): + with torch.no_grad(): + module.weight.mul_(1.0 / math.sqrt(2 * num_layers)) + def _get_ve(self, layer_idx: int, input_ids: Tensor, ve_cache: dict | None = None) -> Tensor | None: + """Get value embedding for a specific layer using shared table + per-layer scale.""" + if self.ve_shared is None or layer_idx not in self.ve_layer_indices: + return None + if ve_cache is not None and 've' not in ve_cache: + ve_cache['ve'] = self.ve_shared(input_ids) + ve_base = ve_cache['ve'] if ve_cache is not None else self.ve_shared(input_ids) + ve_idx = self.ve_layer_indices.index(layer_idx) + return ve_base * self.ve_layer_scales[ve_idx].to(dtype=ve_base.dtype) + def forward(self, input_ids: Tensor, target_ids: Tensor) -> Tensor: + x = self.tok_emb(input_ids) + if self.bigram is not None: + x = x + self.bigram(input_ids) + x = F.rms_norm(x, (x.size(-1),)) + x = self.smear(x) + x0 = x + skips: list[Tensor] = [] + ve_cache: dict = {} + for i in range(self.num_encoder_layers): + ve = self._get_ve(i, input_ids, ve_cache) + x = self.blocks[i](x, x0, v_embed=ve) + skips.append(x) + for i in range(self.num_decoder_layers): + bi = self.num_encoder_layers + i + if skips: + x = x + self.skip_weights[i].to(dtype=x.dtype)[None, None, :] * skips.pop() + ve = self._get_ve(bi, input_ids, ve_cache) + x = self.blocks[bi](x, x0, v_embed=ve) + x = self.final_norm(x) + x_flat = x.reshape(-1, x.size(-1)) + targets = target_ids.reshape(-1) + if self.tie_embeddings: + logits_proj = F.linear(x_flat, self.tok_emb.weight) + else: + if self.lm_head is None: + raise RuntimeError("lm_head is required when tie_embeddings=False") + logits_proj = self.lm_head(x_flat) + if self.f1_corr_in is not None and self.f1_corr_out is not None and self.f1_corr_scale is not None: + corr_hidden = F.silu(self.f1_corr_in(x_flat)) + corr_proj = self.f1_corr_out(corr_hidden) + logits_proj = logits_proj + self.f1_corr_scale.to(dtype=logits_proj.dtype) * corr_proj + logits = self.logit_softcap * torch.tanh(logits_proj / self.logit_softcap) + if hasattr(self, '_ngram_tracker') and self._ngram_tracker is not None and self.training: + per_tok_loss = F.cross_entropy(logits.float(), targets, reduction="none") + weights = self._ngram_tracker.get_weights(input_ids, target_ids) + main_loss = (per_tok_loss * weights).mean() + else: + main_loss = F.cross_entropy(logits.float(), targets, reduction="mean") + if self.training and self.mtp_num_heads > 0 and self.mtp_loss_weight > 0.0: + _, seqlen, dim = x.shape + mtp_loss_sum = x.new_zeros(()) + mtp_loss_count = 0 + for k, mtp_head in enumerate(self.mtp_heads): + valid_t = seqlen - (k + 1) + if valid_t <= 0: + continue + mtp_hidden = x[:, :valid_t, :].reshape(-1, dim) + mtp_targets = target_ids[:, k + 1 :].reshape(-1) + mtp_logits_proj = mtp_head(mtp_hidden) + mtp_logits = self.logit_softcap * torch.tanh(mtp_logits_proj / self.logit_softcap) + mtp_loss_sum = mtp_loss_sum + F.cross_entropy(mtp_logits.float(), mtp_targets, reduction="mean") + mtp_loss_count += 1 + if mtp_loss_count > 0: + main_loss = main_loss + self.mtp_loss_weight * (mtp_loss_sum / mtp_loss_count) + return main_loss + def forward_logits(self, input_ids: Tensor) -> Tensor: + """Return logits (bsz, seq_len, vocab) without computing loss.""" + x = self.tok_emb(input_ids) + if self.bigram is not None: + x = x + self.bigram(input_ids) + x = F.rms_norm(x, (x.size(-1),)) + x = self.smear(x) + x0 = x + skips: list[Tensor] = [] + ve_cache: dict = {} + for i in range(self.num_encoder_layers): + ve = self._get_ve(i, input_ids, ve_cache) + x = self.blocks[i](x, x0, v_embed=ve) + skips.append(x) + for i in range(self.num_decoder_layers): + bi = self.num_encoder_layers + i + if skips: + x = x + self.skip_weights[i].to(dtype=x.dtype)[None, None, :] * skips.pop() + ve = self._get_ve(bi, input_ids, ve_cache) + x = self.blocks[bi](x, x0, v_embed=ve) + x = self.final_norm(x) + if self.tie_embeddings: + logits_proj = F.linear(x, self.tok_emb.weight) + else: + logits_proj = self.lm_head(x) + if self.f1_corr_in is not None and self.f1_corr_out is not None and self.f1_corr_scale is not None: + corr_hidden = F.silu(self.f1_corr_in(x)) + corr_proj = self.f1_corr_out(corr_hidden) + logits_proj = logits_proj + self.f1_corr_scale.to(dtype=logits_proj.dtype) * corr_proj + return self.logit_softcap * torch.tanh(logits_proj / self.logit_softcap) +def eval_val_sliding( + args: Hyperparameters, + base_model: nn.Module, + rank: int, + world_size: int, + device: torch.device, + val_tokens: Tensor, + base_bytes_lut: Tensor, + has_leading_space_lut: Tensor, + is_boundary_token_lut: Tensor, + stride: int, + batch_seqs: int = 128, + eval_seq_len: int | None = None, +) -> tuple[float, float]: + """Sliding window evaluation: each token scored with maximum context.""" + seq_len = eval_seq_len or args.train_seq_len + total_tokens = val_tokens.numel() - 1 + window_starts = [ws for ws in range(0, total_tokens, stride) + if min(ws + seq_len, total_tokens) - ws >= 1] + total_windows = len(window_starts) + my_s = (total_windows * rank) // world_size + my_e = (total_windows * (rank + 1)) // world_size + my_windows = window_starts[my_s:my_e] + loss_sum = torch.zeros((), device=device, dtype=torch.float64) + token_count = torch.zeros((), device=device, dtype=torch.float64) + byte_count = torch.zeros((), device=device, dtype=torch.float64) + base_model.eval() + compiled_logits = maybe_torch_compile(base_model.forward_logits, args) + with torch.inference_mode(): + for bi in range(0, len(my_windows), batch_seqs): + batch_ws = my_windows[bi:bi + batch_seqs] + bsz = len(batch_ws) + x_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + y_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + wlens: list[int] = [] + for i, ws in enumerate(batch_ws): + end = min(ws + seq_len, total_tokens) + wlen = end - ws + wlens.append(wlen) + chunk = val_tokens[ws:end + 1].to(dtype=torch.int64, device=device) + x_batch[i, :wlen] = chunk[:-1] + y_batch[i, :wlen] = chunk[1:] + with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + logits = compiled_logits(x_batch) + nll = F.cross_entropy( + logits.reshape(-1, logits.size(-1)).float(), + y_batch.reshape(-1), + reduction="none", + ).reshape(bsz, seq_len) + for i, ws in enumerate(batch_ws): + wlen = wlens[i] + s = 0 if ws == 0 else max(wlen - stride, 0) + scored_nll = nll[i, s:wlen].to(torch.float64) + loss_sum += scored_nll.sum() + token_count += float(wlen - s) + tgt = y_batch[i, s:wlen] + prev = x_batch[i, s:wlen] + tb = base_bytes_lut[tgt].to(torch.float64) + tb += (has_leading_space_lut[tgt] & ~is_boundary_token_lut[prev]).to(torch.float64) + byte_count += tb.sum() + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(loss_sum, op=dist.ReduceOp.SUM) + dist.all_reduce(token_count, op=dist.ReduceOp.SUM) + dist.all_reduce(byte_count, op=dist.ReduceOp.SUM) + val_loss = (loss_sum / token_count).item() + bits_per_token = val_loss / math.log(2.0) + tokens_per_byte = token_count.item() / byte_count.item() + base_model.train() + return val_loss, bits_per_token * tokens_per_byte +def _ngram_bulk_update(val_np, start, end, ctx_tables, full_tables, + min_order, max_order, primes, mask): + """Bulk update n-gram tables with a contiguous range of tokens. + All ranks call this with the SAME token range -> identical tables everywhere.""" + t = val_np[start:end].astype(np.uint64) + n = len(t) + for order in range(min_order, max_order + 1): + if n < order: + continue + ctx_width = order - 1 + ctx_hash = np.zeros(n - order + 1, dtype=np.uint64) + for k in range(ctx_width): + ctx_hash ^= t[k:n - order + 1 + k] * primes[k % len(primes)] + ctx_key = (ctx_hash & mask).astype(np.int64) + tgt = t[order - 1:] + full_key = ((ctx_hash ^ (tgt * primes[ctx_width % len(primes)])) & mask).astype(np.int64) + ctx_tables[order] += np.bincount(ctx_key, minlength=len(ctx_tables[order])).astype(np.uint32) + full_tables[order] += np.bincount(full_key, minlength=len(full_tables[order])).astype(np.uint32) + +def eval_val_sliding_hashed_ngram( + args: Hyperparameters, + base_model: nn.Module, + rank: int, + world_size: int, + device: torch.device, + val_tokens: Tensor, + base_bytes_lut: Tensor, + has_leading_space_lut: Tensor, + is_boundary_token_lut: Tensor, + stride: int, + order: int, + alpha: float, + min_count: int, + buckets: int, + max_seconds: float = 0.0, + batch_seqs: int = 128, + eval_seq_len: int | None = None, +) -> tuple[float, float, float]: + """Score-first sliding eval with chunk-based SHARED n-gram tables + cubric. + + Key design: all ranks share identical n-gram tables via bulk chunk updates. + Each chunk's windows are distributed across ranks for scoring, then ALL ranks + update tables with the same contiguous token range. Every rank sees the full + n-gram picture (not 1/world_size like per-segment updates). + + Legal: entire chunk scored before its tokens update the tables. + """ + min_order = max(args.ngram_eval_min_order, 2) + max_order = max(order, min_order) + adaptive = args.ngram_eval_adaptive + alpha_min = args.ngram_eval_alpha_min + alpha_max = args.ngram_eval_alpha_max + ent_center = args.ngram_eval_entropy_center + ent_scale = args.ngram_eval_entropy_scale + + # Parse fixed per-order multipliers (PR #809 style) + _fixed_order_mults = None + if args.ngram_order_mults_str: + _fixed_order_mults = np.array([float(x) for x in args.ngram_order_mults_str.split(",")], dtype=np.float64) + + seq_len = eval_seq_len or args.train_seq_len + total_tokens = val_tokens.numel() - 1 + + # Build all windows and total scored tokens + all_window_starts = [ws for ws in range(0, total_tokens, stride) if min(ws + seq_len, total_tokens) - ws >= 1] + total_scored_tokens = 0.0 + for ws in all_window_starts: + end = min(ws + seq_len, total_tokens) + wlen = end - ws + s = 0 if ws == 0 else max(wlen - stride, 0) + total_scored_tokens += float(max(wlen - s, 0)) + + # Group windows into chunks by scored position -- all ranks share this grouping + chunk_tokens = int(os.environ.get("NGRAM_CHUNK_TOKENS", "1048576")) # 1M default + num_chunks = (total_tokens + chunk_tokens - 1) // chunk_tokens + chunk_windows: list[list[int]] = [[] for _ in range(num_chunks)] + for ws in all_window_starts: + end = min(ws + seq_len, total_tokens) + wlen = end - ws + s = 0 if ws == 0 else max(wlen - stride, 0) + scored_start = ws + s + ci = min(scored_start // chunk_tokens, num_chunks - 1) + chunk_windows[ci].append(ws) + + val_np = val_tokens.numpy() + ctx_tables = {n: np.zeros((buckets,), dtype=np.uint32) for n in range(min_order, max_order + 1)} + full_tables = {n: np.zeros((buckets,), dtype=np.uint32) for n in range(min_order, max_order + 1)} + mask = np.uint64(buckets - 1) + primes = np.array( + [np.uint64(36313), np.uint64(27191), np.uint64(51647), np.uint64(81929), + np.uint64(131071), np.uint64(174763), np.uint64(233017), np.uint64(283721), + np.uint64(347237)], + dtype=np.uint64, + ) + + loss_sum = 0.0 + token_count = 0.0 + byte_count = 0.0 + + # Cubric 3D: per (order × entropy_bin × count_bin) adaptive alpha scaling + _NUM_ENT_BINS = 3 # low / mid / high entropy + _NUM_CNT_BINS = 3 # low / mid / high count + _ENT_EDGES = np.array([ent_center - 1.0, ent_center + 1.0]) # [2.0, 4.0] for center=3.0 + _CNT_EDGES = np.array([5.0, 50.0]) # low=<5, mid=5-50, high=>50 context count + _TOTAL_CELLS = _NUM_ENT_BINS * _NUM_CNT_BINS # 9 cells per order = 54 total + _cc = getattr(args, 'cubric_cadence', 0); _con = _cc > 0; _cfired = 0 + if _con: + # Warm-start: proven converged values from 4+ runs (orders 2-7) + # All 9 cells per order get the same warm-start, 3D cubric refines from there + _WARM = {2: 0.45, 3: 0.30, 4: 0.45, 5: 1.88, 6: 2.00, 7: 2.00, 8: 2.00, 9: 2.00} + _c_alpha_mult = {n: [_WARM.get(n, 1.0)] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + _c_hits = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + _c_beats = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + + base_model.eval() + compiled_logits = maybe_torch_compile(base_model.forward_logits, args) + t0 = time.perf_counter() + deadline = (t0 + max_seconds) if max_seconds > 0.0 else None + cutoff_hit = False + + if rank == 0: + print(f"ngram_eval:chunks={num_chunks} chunk_tokens={chunk_tokens} " + f"windows={len(all_window_starts)} shared_tables=True", flush=True) + + with torch.inference_mode(): + for ci in range(num_chunks): + if deadline is not None and time.perf_counter() >= deadline: + cutoff_hit = True + break + + windows = chunk_windows[ci] + if not windows: + continue + + # Distribute this chunk's windows across ranks + my_s = (len(windows) * rank) // world_size + my_e = (len(windows) * (rank + 1)) // world_size + my_windows = windows[my_s:my_e] + + # --- Phase 1: SCORE this chunk's windows --- + for bi in range(0, len(my_windows), batch_seqs): + batch_ws = my_windows[bi:bi + batch_seqs] + bsz = len(batch_ws) + x_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + y_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + wlens: list[int] = [] + for i, ws in enumerate(batch_ws): + end = min(ws + seq_len, total_tokens) + wlen = end - ws + wlens.append(wlen) + chunk = val_tokens[ws:end + 1].to(dtype=torch.int64, device=device) + x_batch[i, :wlen] = chunk[:-1] + y_batch[i, :wlen] = chunk[1:] + + with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + logits = compiled_logits(x_batch) + logits_f = logits.float() + nll = F.cross_entropy( + logits_f.reshape(-1, logits_f.size(-1)), + y_batch.reshape(-1), + reduction="none", + ).reshape(bsz, seq_len) + + for i, ws in enumerate(batch_ws): + wlen = wlens[i] + s = 0 if ws == 0 else max(wlen - stride, 0) + seg_len = wlen - s + if seg_len <= 0: + continue + + seg_nll = nll[i, s:wlen].to(torch.float64).cpu().numpy() + seg_model_p = np.exp(-seg_nll) + + if adaptive: + log_probs = F.log_softmax(logits_f[i, s:wlen], dim=-1) + probs_a = log_probs.exp() + entropy = -(probs_a * log_probs).sum(dim=-1).cpu().numpy() + sig = 1.0 / (1.0 + np.exp(-ent_scale * (entropy - ent_center))) + per_token_alpha = alpha_min + (alpha_max - alpha_min) * sig + # Bin entropy for 2D cubric: 0=low, 1=mid, 2=high + _ent_bins = np.digitize(entropy, _ENT_EDGES).astype(np.int32) + else: + per_token_alpha = np.full(seg_len, alpha) + _ent_bins = np.ones(seg_len, dtype=np.int32) # all mid + + global_j = np.arange(ws + s + 1, ws + wlen + 1, dtype=np.int64) + p_ng = np.zeros(seg_len, dtype=np.float64) + ng_matched = np.zeros(seg_len, dtype=np.bool_) + _ng_ord = np.zeros(seg_len, dtype=np.int32) + _ng_ctx_count = np.zeros(seg_len, dtype=np.float64) + tgt_np = val_np[global_j].astype(np.uint64) + + for n in range(max_order, min_order - 1, -1): + ctx_width = n - 1 + valid = (global_j >= ctx_width) & (~ng_matched) + if not valid.any(): + continue + v_idx = np.nonzero(valid)[0] + jv = global_j[v_idx] + ctx_hash = np.zeros(len(jv), dtype=np.uint64) + for k in range(ctx_width): + tok = val_np[jv - (ctx_width - k)].astype(np.uint64) + ctx_hash ^= tok * primes[k % len(primes)] + ctx_key = (ctx_hash & mask).astype(np.int64) + full_key = ((ctx_hash ^ (tgt_np[v_idx] * primes[ctx_width % len(primes)])) & mask).astype(np.int64) + ctx_counts = ctx_tables[n][ctx_key].astype(np.float64) + full_counts = full_tables[n][full_key].astype(np.float64) + has_data = ctx_counts >= float(min_count) + if has_data.any(): + p = np.minimum(full_counts, ctx_counts) / np.maximum(ctx_counts, 1.0) + p = np.clip(p, 0.0, 1.0) + hit_idx = v_idx[has_data] + p_ng[hit_idx] = p[has_data] + ng_matched[hit_idx] = True + _ng_ord[hit_idx] = n + _ng_ctx_count[hit_idx] = ctx_counts[has_data] + + # Oracle alpha: use actual model_p vs ngram_p comparison + if ng_matched.any(): + m_idx = np.nonzero(ng_matched)[0] + mp = seg_model_p[m_idx] + np_val = p_ng[m_idx] + # Soft oracle: sigmoid on log-ratio, steepness=8 + log_ratio = np.log(np.maximum(np_val, 1e-12)) - np.log(np.maximum(mp, 1e-12)) + a = 0.95 / (1.0 + np.exp(-8.0 * log_ratio)) + seg_model_p[m_idx] = (1.0 - a) * mp + a * np_val + + seg_nll = -np.log(np.clip(seg_model_p, 1e-12, 1.0)) + loss_sum += float(seg_nll.sum()) + token_count += float(seg_len) + tgt = y_batch[i, s:wlen] + prev = x_batch[i, s:wlen] + tb = base_bytes_lut[tgt].to(torch.float64) + tb += (has_leading_space_lut[tgt] & ~is_boundary_token_lut[prev]).to(torch.float64) + byte_count += float(tb.sum().item()) + + # --- Phase 2: SHARED UPDATE -- all ranks update with same chunk tokens --- + chunk_start = ci * chunk_tokens + chunk_end = min((ci + 1) * chunk_tokens, total_tokens) + _ngram_bulk_update(val_np, chunk_start, chunk_end + 1, + ctx_tables, full_tables, min_order, max_order, + primes, mask) + + # Cubric 2D c-step: adapt per (order × entropy_bin) + if _con: + # Collect all (order, ent_bin, cnt_bin) cells with enough data + all_rates = [] + for n in range(min_order, max_order + 1): + for cell in range(_TOTAL_CELLS): + if _c_hits[n][cell] >= 8: + all_rates.append(_c_beats[n][cell] / _c_hits[n][cell]) + if len(all_rates) >= 4: + avg_rate = sum(all_rates) / len(all_rates) + for n in range(min_order, max_order + 1): + for cell in range(_TOTAL_CELLS): + if _c_hits[n][cell] >= 8: + rate = _c_beats[n][cell] / _c_hits[n][cell] + if rate > avg_rate + 0.05: + _c_alpha_mult[n][cell] = min(_c_alpha_mult[n][cell] * 1.03, 2.0) + elif rate < avg_rate - 0.05: + _c_alpha_mult[n][cell] = max(_c_alpha_mult[n][cell] * 0.97, 0.3) + _cfired += 1 + if rank == 0 and _cfired % 8 == 0: + parts = [] + for n in range(min_order, max_order + 1): + m = _c_alpha_mult[n] + avg_m = sum(m) / len(m) + parts.append(f"o{n}:avg={avg_m:.2f}") + print(f"cubric3d:step={_cfired} {' '.join(parts)}", flush=True) + _c_hits = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + _c_beats = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + + # Progress + if rank == 0 and (ci % 10 == 0 or ci == num_chunks - 1 or ci < 3): + elapsed = time.perf_counter() - t0 + cur_bpb = (loss_sum / max(token_count, 1.0)) / math.log(2.0) * (token_count / max(byte_count, 1.0)) if token_count > 0 else 0.0 + print( + f"ngram_eval:chunk [{ci+1}/{num_chunks}] bpb={cur_bpb:.6f} t={elapsed:.0f}s", + flush=True, + ) + + # All-reduce across ranks + _loss = torch.tensor(loss_sum, device=device, dtype=torch.float64) + _toks = torch.tensor(token_count, device=device, dtype=torch.float64) + _bytes = torch.tensor(byte_count, device=device, dtype=torch.float64) + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(_loss, op=dist.ReduceOp.SUM) + dist.all_reduce(_toks, op=dist.ReduceOp.SUM) + dist.all_reduce(_bytes, op=dist.ReduceOp.SUM) + loss_sum = _loss.item() + token_count = _toks.item() + byte_count = _bytes.item() + + coverage = token_count / max(total_scored_tokens, 1.0) + if cutoff_hit: + elapsed = time.perf_counter() - t0 + print( + f"ngram_eval:cutoff max_seconds={max_seconds:.1f} " + f"coverage={coverage*100:.2f}% elapsed={elapsed:.0f}s", + flush=True, + ) + + if _con and rank == 0: + print(f"cubric3d:final c_steps={_cfired} cells={_TOTAL_CELLS}x{max_order-min_order+1}={_TOTAL_CELLS*(max_order-min_order+1)}", flush=True) + for n in range(min_order, max_order + 1): + m = _c_alpha_mult[n] + row = " ".join(f"{m[cell]:.2f}" for cell in range(_TOTAL_CELLS)) + print(f" o{n}: [{row}]", flush=True) + val_loss = loss_sum / max(token_count, 1.0) + val_bpb = val_loss / math.log(2.0) * (token_count / max(byte_count, 1.0)) + base_model.train() + return val_loss, val_bpb, coverage +def _classify_param(name: str) -> str: + if "tok_emb" in name or "lm_head" in name: + return "embed" + if "f1_corr_in" in name or "f1_corr_out" in name: + return "aux" + if ".mlp." in name: + return "mlp" + if ".attn." in name or (".proj." in name and ".mlp." not in name): + return "attn" + return "other" +# --------------------------------------------------------------------------- +# GPTQ: Hessian-aware quantization with column-wise error compensation +# --------------------------------------------------------------------------- +def _find_best_row_scales(W: Tensor, clip_range: int = 31) -> Tensor: + """Find optimal per-row scales by searching percentile clipping thresholds.""" + t32 = W.float() + best_s = t32.abs().amax(dim=1) / clip_range + best_s = best_s.clamp_min(1.0 / clip_range) + best_err = torch.full((t32.shape[0],), float('inf')) + for pct in [0.9990, 0.9995, 0.9999, 0.99999, 1.0]: + if pct < 1.0: + row_clip = torch.quantile(t32.abs(), pct, dim=1) + else: + row_clip = t32.abs().amax(dim=1) + s = (row_clip / clip_range).clamp_min(1.0 / clip_range) + q = torch.clamp(torch.round(t32 / s[:, None]), -clip_range, clip_range) + recon = q * s[:, None] + err = (t32 - recon).pow(2).mean(dim=1) + improved = err < best_err + best_s[improved] = s[improved] + best_err[improved] = err[improved] + return best_s +def gptq_quantize_weight(W: Tensor, H: Tensor, clip_range: int = 31, + block_size: int = 64, percdamp: float = 0.002) -> tuple[Tensor, Tensor]: + """GPTQ: quantize weight matrix W using Hessian H = X^T X for error compensation. + Uses pre-computed per-row scales and column reordering by Hessian diagonal. + Returns (quantized_int8, scale_fp16) in int6 range [-clip_range, clip_range].""" + W = W.float().clone() + rows, cols = W.shape + # Pre-compute optimal per-row scales from the original weight matrix + row_scale = _find_best_row_scales(W, clip_range) + H = H.float().clone() + damp = percdamp * H.diag().mean() + H.diagonal().add_(damp) + # Column reordering: process least-important columns first (ascending H_diag) + perm = torch.argsort(H.diag()) + invperm = torch.argsort(perm) + W = W[:, perm] + H = H[perm][:, perm] + try: + L = torch.linalg.cholesky(H) + Hinv = torch.cholesky_inverse(L) + except torch._C._LinAlgError: + Hinv = torch.diag(1.0 / H.diag().clamp_min(1e-6)) + Q = torch.zeros(rows, cols, dtype=torch.int8) + for i1 in range(0, cols, block_size): + i2 = min(i1 + block_size, cols) + W_block = W[:, i1:i2].clone() + Hinv_block = Hinv[i1:i2, i1:i2] + Err = torch.zeros_like(W_block) + for j in range(i2 - i1): + w_col = W_block[:, j] + h_inv_jj = Hinv_block[j, j].clamp_min(1e-8) + # Quantize using pre-computed per-row scales + q_col = torch.clamp(torch.round(w_col / row_scale), -clip_range, clip_range) + deq_col = q_col * row_scale + Q[:, i1 + j] = q_col.to(torch.int8) + err = (w_col - deq_col) / h_inv_jj + Err[:, j] = err + if j + 1 < i2 - i1: + W_block[:, j + 1:] -= err.unsqueeze(1) * Hinv_block[j, j + 1:].unsqueeze(0) + if i2 < cols: + W[:, i2:] -= Err @ Hinv[i1:i2, i2:] + # Undo column reordering + Q = Q[:, invperm] + return Q, row_scale.to(torch.float16) +def gptq_calibrate(model: nn.Module, train_pattern: str, device: torch.device, + n_samples: int = 256, seq_len: int = 2048) -> dict[str, Tensor]: + """Collect Hessian H = X^T X for each linear layer using training data.""" + hessians: dict[str, Tensor] = {} + n_seen: dict[str, int] = {} + hooks = [] + def make_hook(name: str): + def hook_fn(module, inp, out): + x = inp[0].detach().float() + if x.ndim == 3: + x = x.reshape(-1, x.shape[-1]) + if name not in hessians: + hessians[name] = torch.zeros(x.shape[1], x.shape[1], device=x.device, dtype=torch.float32) + n_seen[name] = 0 + hessians[name].addmm_(x.t(), x) + n_seen[name] += x.shape[0] + return hook_fn + for name, module in model.named_modules(): + if isinstance(module, (nn.Linear, CastedLinear)): + hooks.append(module.register_forward_hook(make_hook(name))) + stream = TokenStream(train_pattern) + model.eval() + with torch.no_grad(): + for _ in range(n_samples): + tokens = stream.take(seq_len + 1).to(device=device, dtype=torch.int64) + x = tokens[:-1].unsqueeze(0) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + model.forward_logits(x) + for h in hooks: + h.remove() + for name in hessians: + hessians[name] /= max(n_seen[name], 1) + return hessians +def mixed_quantize_int6_gptq(state_dict: dict[str, Tensor], int6_cats: set[str], + hessians: dict[str, Tensor]) -> tuple[dict, dict]: + """Like mixed_quantize_int6 but uses GPTQ for int6 categories when Hessian available.""" + result: dict[str, Tensor] = {} + meta: dict[str, object] = {} + gptq_count, naive_count = 0, 0 + for name, tensor in state_dict.items(): + t = tensor.detach().cpu().contiguous() + cat = _classify_param(name) + if not t.is_floating_point() or t.numel() <= 65536: + result[name] = t.to(torch.float16) if t.is_floating_point() else t + meta[name] = "passthrough" + continue + if any(p in name for p in CONTROL_TENSOR_NAME_PATTERNS): + result[name] = t.float() + meta[name] = "passthrough_ctrl" + continue + if cat in int6_cats and t.ndim == 2: + module_name = name.rsplit(".weight", 1)[0] if name.endswith(".weight") else name + H = hessians.get(module_name) + if H is not None and H.shape[0] == t.shape[1]: + q, s = gptq_quantize_weight(t, H.cpu()) + gptq_count += 1 + else: + q, s = quantize_int6_per_row(t) + naive_count += 1 + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int6"} + elif cat in int6_cats and t.ndim >= 1: + q, s = quantize_int6_per_row(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int6"} + naive_count += 1 + else: + q, s = quantize_float_tensor(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int8"} + print(f"gptq_quantize: {gptq_count} GPTQ layers, {naive_count} naive layers", flush=True) + return result, meta +def quantize_int6_per_row(t: Tensor, clip_range: int = 31) -> tuple[Tensor, Tensor]: + t32 = t.float() + if t32.ndim == 2: + best_q, best_s, best_err = None, None, float('inf') + for pct in [0.9990, 0.9995, 0.9999, 0.99999, 1.0]: + if pct < 1.0: + row_clip = torch.quantile(t32.abs(), pct, dim=1) + else: + row_clip = t32.abs().amax(dim=1) + s = (row_clip / clip_range).clamp_min(1.0 / clip_range).to(torch.float16) + q = torch.clamp(torch.round(t32 / s.float()[:, None]), -clip_range, clip_range).to(torch.int8) + recon = q.float() * s.float()[:, None] + err = (t32 - recon).pow(2).mean().item() + if err < best_err: + best_q, best_s, best_err = q, s, err + return best_q, best_s + amax = t32.abs().max().item() + scale = torch.tensor(amax / clip_range if amax > 0 else 1.0, dtype=torch.float16) + q = torch.clamp(torch.round(t32 / scale.float()), -clip_range, clip_range).to(torch.int8) + return q, scale +def mixed_quantize_int6(state_dict: dict[str, Tensor], int6_cats: set[str]): + num_layers_total = max( + (int(k.split(".")[1]) for k in state_dict if k.startswith("blocks.")), + default=0, + ) + 1 + late_k_layers = set(range(num_layers_total - 2, num_layers_total)) + result: dict[str, Tensor] = {} + meta: dict[str, object] = {} + for name, tensor in state_dict.items(): + t = tensor.detach().cpu().contiguous() + cat = _classify_param(name) + if not t.is_floating_point() or t.numel() <= 65536: + result[name] = t.to(torch.float16) if t.is_floating_point() else t + meta[name] = "passthrough" + continue + if any(p in name for p in CONTROL_TENSOR_NAME_PATTERNS): + result[name] = t.float() + meta[name] = "passthrough_ctrl" + continue + if cat in int6_cats and t.ndim >= 1: + q, s = quantize_int6_per_row(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int6"} + else: + q, s = quantize_float_tensor(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int8"} + return result, meta +def dequantize_mixed_int6(result: dict[str, Tensor], meta: dict[str, object], + template_sd: dict[str, Tensor]) -> dict[str, Tensor]: + out: dict[str, Tensor] = {} + for name, orig in template_sd.items(): + info = meta.get(name) + if info is None: + continue + orig_dtype = orig.dtype + if info in ("passthrough", "passthrough_ctrl", "passthrough_fp16"): + t = result[name] + if t.dtype == torch.float16 and orig_dtype in (torch.float32, torch.bfloat16): + t = t.to(orig_dtype) + out[name] = t + continue + q, s = result[name + ".q"], result[name + ".scale"] + if s.ndim > 0: + out[name] = (q.float() * s.float().view(q.shape[0], *([1] * (q.ndim - 1)))).to(orig_dtype) + else: + out[name] = (q.float() * float(s.item())).to(orig_dtype) + return out +def main() -> None: + global zeropower_via_newtonschulz5 + code = Path(__file__).read_text(encoding="utf-8") + args = Hyperparameters() + if args.compile_enabled: + zeropower_via_newtonschulz5 = torch.compile(zeropower_via_newtonschulz5) + distributed = "RANK" in os.environ and "WORLD_SIZE" in os.environ + rank = int(os.environ.get("RANK", "0")) + world_size = int(os.environ.get("WORLD_SIZE", "1")) + local_rank = int(os.environ.get("LOCAL_RANK", "0")) + if world_size <= 0: + raise ValueError(f"WORLD_SIZE must be positive, got {world_size}") + if 8 % world_size != 0: + raise ValueError(f"WORLD_SIZE={world_size} must divide 8 so grad_accum_steps stays integral") + grad_accum_steps = 8 // world_size + grad_scale = 1.0 / grad_accum_steps + if not torch.cuda.is_available(): + raise RuntimeError("CUDA is required") + device = torch.device("cuda", local_rank) + torch.cuda.set_device(device) + if distributed: + dist.init_process_group(backend="nccl", device_id=device) + dist.barrier() + master_process = rank == 0 + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True + from torch.backends.cuda import enable_cudnn_sdp, enable_flash_sdp, enable_math_sdp, enable_mem_efficient_sdp + enable_cudnn_sdp(False) + enable_flash_sdp(True) + enable_mem_efficient_sdp(False) + enable_math_sdp(False) + logfile = None + if master_process: + os.makedirs("logs", exist_ok=True) + logfile = f"logs/{args.run_id}.txt" + print(logfile) + def log0(msg: str, console: bool = True) -> None: + if not master_process: + return + if console: + print(msg) + if logfile is not None: + with open(logfile, "a", encoding="utf-8") as f: + print(msg, file=f) + log0(code, console=False) + log0("=" * 100, console=False) + log0(f"Running Python {sys.version}", console=False) + log0(f"Running PyTorch {torch.__version__}", console=False) + log0( + subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=False).stdout, + console=False, + ) + log0("=" * 100, console=False) + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.cuda.manual_seed_all(args.seed) + if not args.tokenizer_path.endswith(".model"): + raise ValueError(f"Script only setup for SentencePiece .model file: {args.tokenizer_path}") + sp = spm.SentencePieceProcessor(model_file=args.tokenizer_path) + if int(sp.vocab_size()) != args.vocab_size: + raise ValueError( + f"VOCAB_SIZE={args.vocab_size} does not match tokenizer vocab_size={int(sp.vocab_size())}" + ) + dataset_dir = Path(args.data_path).resolve() + actual_train_files = len(list(dataset_dir.glob("fineweb_train_*.bin"))) + effective_eval_seq_len = args.eval_seq_len if args.eval_seq_len > 0 else args.train_seq_len + val_seq_len = max(args.train_seq_len, effective_eval_seq_len) + val_tokens = load_validation_tokens(args.val_files, val_seq_len) + base_bytes_lut, has_leading_space_lut, is_boundary_token_lut = build_sentencepiece_luts( + sp, args.vocab_size, device + ) + log0(f"val_bpb:enabled tokenizer_kind=sentencepiece tokenizer_path={args.tokenizer_path}") + log0(f"train_loader:dataset:{dataset_dir.name} train_shards:{actual_train_files}") + log0(f"val_loader:shards pattern={args.val_files} tokens:{val_tokens.numel() - 1}") + CastedLinear._qat_enabled = args.qat_enabled + base_model = GPT( + vocab_size=args.vocab_size, + num_layers=args.num_layers, + model_dim=args.model_dim, + num_heads=args.num_heads, + num_kv_heads=args.num_kv_heads, + mlp_mult=args.mlp_mult, + tie_embeddings=args.tie_embeddings, + tied_embed_init_std=args.tied_embed_init_std, + logit_softcap=args.logit_softcap, + rope_base=args.rope_base, + qk_gain_init=args.qk_gain_init, + mtp_num_heads=args.mtp_num_heads, + mtp_loss_weight=args.mtp_loss_weight, + bigram_vocab_size=args.bigram_vocab_size, + bigram_dim=args.bigram_dim, + xsa_last_n=args.xsa_last_n, + rope_dims=args.rope_dims, + ln_scale=args.ln_scale, + dtg=args.dtg_enabled, + ve_enabled=args.ve_enabled, + ve_dim=args.ve_dim, + ve_layers=args.ve_layers, + mlp_act=args.mlp_act, + mlp_leaky_slope=args.mlp_leaky_slope, + f1_corr_rank=args.f1_corr_rank, + f1_corr_scale_init=args.f1_corr_scale_init, + ).to(device).bfloat16() + for module in base_model.modules(): + if isinstance(module, CastedLinear): + module.float() + restore_low_dim_params_to_fp32(base_model) + # Complementary training: downweight tokens predictable by bigrams + complement_alpha = float(os.environ.get("COMPLEMENT_ALPHA", "0")) + if complement_alpha > 0: + tracker = TrainNgramTracker(args.vocab_size, device, complement_alpha=complement_alpha) + base_model._ngram_tracker = tracker + log0(f"complementary_training:alpha={complement_alpha}") + else: + base_model._ngram_tracker = None + compiled_model = maybe_torch_compile(base_model, args) + model: nn.Module = DDP(compiled_model, device_ids=[local_rank], broadcast_buffers=False) if distributed else compiled_model + block_named_params = list(base_model.blocks.named_parameters()) + matrix_params = [ + p + for name, p in block_named_params + if p.ndim == 2 and not any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS) + ] + if base_model.mtp_num_heads > 0: + matrix_params.extend([p for p in base_model.mtp_heads.parameters() if p.ndim == 2]) + if base_model.f1_corr_in is not None and base_model.f1_corr_out is not None: + matrix_params.append(base_model.f1_corr_in.weight) + matrix_params.append(base_model.f1_corr_out.weight) + scalar_params = [ + p + for name, p in block_named_params + if p.ndim < 2 or any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS) + ] + if base_model.skip_weights.numel() > 0: + scalar_params.append(base_model.skip_weights) + scalar_params.append(base_model.smear.gate) + if base_model.bigram is not None: + scalar_params.append(base_model.bigram.scale) + if base_model.f1_corr_scale is not None: + scalar_params.append(base_model.f1_corr_scale) + token_lr = args.tied_embed_lr if args.tie_embeddings else args.embed_lr + tok_params = [{"params": [base_model.tok_emb.weight], "lr": token_lr, "base_lr": token_lr}] + if base_model.bigram is not None: + tok_params.append({"params": [base_model.bigram.embed.weight], "lr": token_lr, "base_lr": token_lr}) + if base_model.bigram.proj is not None: + matrix_params.append(base_model.bigram.proj.weight) + if base_model.ve_shared is not None: + tok_params.append({"params": [base_model.ve_shared.embed.weight], "lr": token_lr, "base_lr": token_lr}) + if base_model.ve_shared.proj is not None: + matrix_params.append(base_model.ve_shared.proj.weight) + scalar_params.append(base_model.ve_shared.scale) + for s in base_model.ve_layer_scales: + scalar_params.append(s) + optimizer_tok = torch.optim.AdamW( + tok_params, + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + weight_decay=args.adam_wd, + fused=True, + ) + optimizer_muon = Muon( + matrix_params, + lr=args.matrix_lr, + momentum=args.muon_momentum, + backend_steps=args.muon_backend_steps, + weight_decay=args.muon_wd, + ) + for group in optimizer_muon.param_groups: + group["base_lr"] = args.matrix_lr + optimizer_scalar = torch.optim.AdamW( + [{"params": scalar_params, "lr": args.scalar_lr, "base_lr": args.scalar_lr}], + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + weight_decay=args.adam_wd, + fused=True, + ) + optimizers: list[torch.optim.Optimizer] = [optimizer_tok, optimizer_muon, optimizer_scalar] + if base_model.lm_head is not None: + optimizer_head = torch.optim.Adam( + [{"params": [base_model.lm_head.weight], "lr": args.head_lr, "base_lr": args.head_lr}], + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + fused=True, + ) + optimizers.insert(1, optimizer_head) + n_params = sum(p.numel() for p in base_model.parameters()) + f1_corr_params = 0 + if base_model.f1_corr_in is not None and base_model.f1_corr_out is not None: + f1_corr_params = int(base_model.f1_corr_in.weight.numel() + base_model.f1_corr_out.weight.numel()) + est_corr_int6_bytes = 0 + if args.f1_corr_rank > 0: + # int8 payload stores int6 values + per-row fp16 scales. + est_corr_int6_bytes = ( + args.f1_corr_rank * (args.model_dim + args.vocab_size) + + 2 * (args.f1_corr_rank + args.vocab_size) + ) + log0(f"model_params:{n_params}") + log0( + f"f1_corr:rank={args.f1_corr_rank} params={f1_corr_params} " + f"est_int6_bytes~{est_corr_int6_bytes}" + ) + log0(f"mlp_act:{args.mlp_act} mlp_leaky_slope:{args.mlp_leaky_slope}") + log0(f"XSA:last_{args.xsa_last_n} world_size:{world_size} grad_accum_steps:{grad_accum_steps}") + log0(f"num_heads:{args.num_heads} num_kv_heads:{args.num_kv_heads} embed_lr:{token_lr} matrix_lr:{args.matrix_lr}") + log0( + f"train_batch_tokens:{args.train_batch_tokens} train_seq_len:{args.train_seq_len} " + f"iterations:{args.iterations} warmup_steps:{args.warmup_steps} " + f"max_wallclock_seconds:{args.max_wallclock_seconds:.3f}" + ) + log0(f"compile:enabled={int(args.compile_enabled)} fullgraph={int(args.compile_fullgraph)}") + log0(f"seed:{args.seed}") + if args.ngram_eval_order >= 2: + log0( + f"ngram_eval:order={args.ngram_eval_order} alpha={args.ngram_eval_alpha} " + f"min_count={args.ngram_eval_min_count} buckets={args.ngram_eval_buckets}" + ) + train_loader = DistributedTokenLoader(args.train_files, rank, world_size, device) + def zero_grad_all() -> None: + for opt in optimizers: + opt.zero_grad(set_to_none=True) + max_wallclock_ms = 1000.0 * args.max_wallclock_seconds if args.max_wallclock_seconds > 0 else None + def lr_mul(step: int, elapsed_ms: float) -> float: + if args.warmdown_iters <= 0: + return 1.0 + if max_wallclock_ms is None: + warmdown_start = max(args.iterations - args.warmdown_iters, 0) + return max((args.iterations - step) / max(args.warmdown_iters, 1), 0.0) if warmdown_start <= step < args.iterations else 1.0 + step_ms = elapsed_ms / max(step, 1) + warmdown_ms = args.warmdown_iters * step_ms + remaining_ms = max(max_wallclock_ms - elapsed_ms, 0.0) + return remaining_ms / max(warmdown_ms, 1e-9) if remaining_ms <= warmdown_ms else 1.0 + if args.warmup_steps > 0: + initial_model_state = {name: tensor.detach().cpu().clone() for name, tensor in base_model.state_dict().items()} + initial_optimizer_states = [copy.deepcopy(opt.state_dict()) for opt in optimizers] + model.train() + for warmup_step in range(args.warmup_steps): + zero_grad_all() + for micro_step in range(grad_accum_steps): + if distributed: + model.require_backward_grad_sync = micro_step == grad_accum_steps - 1 + x, y = train_loader.next_batch(args.train_batch_tokens, args.train_seq_len, grad_accum_steps) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + warmup_loss = model(x, y) + (warmup_loss * grad_scale).backward() + for opt in optimizers: + opt.step() + zero_grad_all() + if args.warmup_steps <= 20 or (warmup_step + 1) % 10 == 0 or warmup_step + 1 == args.warmup_steps: + log0(f"warmup_step:{warmup_step + 1}/{args.warmup_steps}") + base_model.load_state_dict(initial_model_state, strict=True) + for opt, state in zip(optimizers, initial_optimizer_states, strict=True): + opt.load_state_dict(state) + zero_grad_all() + if distributed: + model.require_backward_grad_sync = True + train_loader = DistributedTokenLoader(args.train_files, rank, world_size, device) + swa_state: dict[str, Tensor] | None = None + swa_count = 0 + ema_state = {name: t.detach().float().clone() for name, t in base_model.state_dict().items()} + ema_decay = 0.997 + training_time_ms = 0.0 + stop_after_step: int | None = None + torch.cuda.synchronize() + t0 = time.perf_counter() + step = 0 + while True: + last_step = step == args.iterations or (stop_after_step is not None and step >= stop_after_step) + should_validate = last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0) + if should_validate: + torch.cuda.synchronize() + training_time_ms += 1000.0 * (time.perf_counter() - t0) + val_loss, val_bpb = eval_val( + args, + model, + rank, + world_size, + device, + grad_accum_steps, + val_tokens, + base_bytes_lut, + has_leading_space_lut, + is_boundary_token_lut, + ) + log0( + f"step:{step}/{args.iterations} val_loss:{val_loss:.4f} val_bpb:{val_bpb:.4f} " + f"train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms / max(step, 1):.2f}ms" + ) + torch.cuda.synchronize() + t0 = time.perf_counter() + if last_step: + if stop_after_step is not None and step < args.iterations: + log0( + f"stopping_early: wallclock_cap train_time:{training_time_ms:.0f}ms " + f"step:{step}/{args.iterations}" + ) + break + elapsed_ms = training_time_ms + 1000.0 * (time.perf_counter() - t0) + scale = lr_mul(step, elapsed_ms) + if args.late_qat_threshold > 0 and scale < args.late_qat_threshold and not CastedLinear._qat_enabled: + CastedLinear._qat_enabled = True + log0(f"late_qat:enabled step:{step} scale:{scale:.4f}") + zero_grad_all() + train_loss = torch.zeros((), device=device) + for micro_step in range(grad_accum_steps): + if distributed: + model.require_backward_grad_sync = micro_step == grad_accum_steps - 1 + x, y = train_loader.next_batch(args.train_batch_tokens, args.train_seq_len, grad_accum_steps) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + loss = model(x, y) + train_loss += loss.detach() + loss.backward() + if base_model._ngram_tracker is not None: + base_model._ngram_tracker.update(x, y) + train_loss /= grad_accum_steps + frac = min(step / args.muon_momentum_warmup_steps, 1.0) if args.muon_momentum_warmup_steps > 0 else 1.0 + muon_momentum = (1 - frac) * args.muon_momentum_warmup_start + frac * args.muon_momentum + for group in optimizer_muon.param_groups: + group["momentum"] = muon_momentum + for opt in optimizers: + for group in opt.param_groups: + group["lr"] = group["base_lr"] * scale + if args.grad_clip_norm > 0: + torch.nn.utils.clip_grad_norm_(base_model.parameters(), args.grad_clip_norm) + for opt in optimizers: + opt.step() + zero_grad_all() + # EMA update + with torch.no_grad(): + for name, t in base_model.state_dict().items(): + ema_state[name].mul_(ema_decay).add_(t.detach().float(), alpha=1.0 - ema_decay) + step += 1 + approx_training_time_ms = training_time_ms + 1000.0 * (time.perf_counter() - t0) + if args.swa_enabled and scale < 0.2 and step % args.swa_every == 0: + if swa_state is None: + swa_state = {name: t.detach().cpu().clone() for name, t in base_model.state_dict().items()} + swa_count = 1 + log0(f"swa:start step:{step}") + else: + for name, t in base_model.state_dict().items(): + swa_state[name] += t.detach().cpu() + swa_count += 1 + should_log_train = ( + args.train_log_every > 0 + and (step <= 10 or step % args.train_log_every == 0 or stop_after_step is not None) + ) + if should_log_train: + log0( + f"step:{step}/{args.iterations} train_loss:{train_loss.item():.4f} " + f"train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms / step:.2f}ms" + ) + reached_cap = max_wallclock_ms is not None and approx_training_time_ms >= max_wallclock_ms + if distributed and max_wallclock_ms is not None: + reached_cap_tensor = torch.tensor(int(reached_cap), device=device) + dist.all_reduce(reached_cap_tensor, op=dist.ReduceOp.MAX) + reached_cap = bool(reached_cap_tensor.item()) + if stop_after_step is None and reached_cap: + stop_after_step = step + log0( + f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB " + f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB" + ) + # GPTQ calibration: collect Hessians from training data DURING training phase + # (must happen before training ends to comply with eval-time data access rules) + log0("gptq:calibrating with training data...") + t_gptq = time.perf_counter() + gptq_hessians = gptq_calibrate(base_model, args.train_files, device, n_samples=256, seq_len=args.train_seq_len) + log0(f"gptq:calibrated {len(gptq_hessians)} layers in {time.perf_counter()-t_gptq:.1f}s") + if args.distill_enabled and args.distill_steps > 0: + log0( + f"distill:start steps:{args.distill_steps} lr_factor:{args.distill_lr_factor} " + f"temp:{args.distill_temperature} alpha:{args.distill_alpha} kl_clip:{args.distill_kl_clip}" + ) + current_state = base_model.state_dict() + teacher_state = {name: t.to(dtype=current_state[name].dtype) for name, t in ema_state.items()} + teacher_model = GPT( + vocab_size=args.vocab_size, num_layers=args.num_layers, model_dim=args.model_dim, + num_heads=args.num_heads, num_kv_heads=args.num_kv_heads, mlp_mult=args.mlp_mult, + tie_embeddings=args.tie_embeddings, tied_embed_init_std=args.tied_embed_init_std, + logit_softcap=args.logit_softcap, rope_base=args.rope_base, qk_gain_init=args.qk_gain_init, + mtp_num_heads=args.mtp_num_heads, mtp_loss_weight=args.mtp_loss_weight, + bigram_vocab_size=args.bigram_vocab_size, bigram_dim=args.bigram_dim, + xsa_last_n=args.xsa_last_n, rope_dims=args.rope_dims, ln_scale=args.ln_scale, dtg=args.dtg_enabled, + ve_enabled=args.ve_enabled, ve_dim=args.ve_dim, ve_layers=args.ve_layers, + mlp_act=args.mlp_act, mlp_leaky_slope=args.mlp_leaky_slope, + f1_corr_rank=args.f1_corr_rank, f1_corr_scale_init=args.f1_corr_scale_init, + ).to(device).bfloat16() + for m in teacher_model.modules(): + if isinstance(m, CastedLinear): + m.float() + restore_low_dim_params_to_fp32(teacher_model) + teacher_model.load_state_dict(teacher_state, strict=True) + teacher_model.eval() + for p in teacher_model.parameters(): + p.requires_grad_(False) + compiled_teacher_logits = maybe_torch_compile(teacher_model.forward_logits, args) + model.train() + T = args.distill_temperature + alpha = args.distill_alpha + for d_step in range(args.distill_steps): + zero_grad_all() + for opt in optimizers: + for group in opt.param_groups: + group["lr"] = group["base_lr"] * args.distill_lr_factor + x, y = train_loader.next_batch(args.train_batch_tokens, args.train_seq_len, grad_accum_steps) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + student_logits = base_model.forward_logits(x) + with torch.no_grad(): + teacher_logits = compiled_teacher_logits(x) + student_log_probs = F.log_softmax(student_logits.float() / T, dim=-1) + teacher_probs = F.softmax(teacher_logits.float() / T, dim=-1) + token_kl = F.kl_div(student_log_probs, teacher_probs, reduction="none").sum(dim=-1) + kl_loss = token_kl.mean() * (T * T) + if args.distill_kl_clip > 0: + kl_loss = torch.clamp(kl_loss, max=args.distill_kl_clip) + ce_loss = F.cross_entropy( + student_logits.reshape(-1, student_logits.size(-1)).float(), + y.reshape(-1), + reduction="mean", + ) + loss = alpha * kl_loss + (1.0 - alpha) * ce_loss + (loss * grad_scale).backward() + if world_size > 1: + for p in base_model.parameters(): + if p.grad is not None: + dist.all_reduce(p.grad, op=dist.ReduceOp.AVG) + if args.grad_clip_norm > 0: + torch.nn.utils.clip_grad_norm_(base_model.parameters(), args.grad_clip_norm) + for opt in optimizers: + opt.step() + zero_grad_all() + with torch.no_grad(): + for name, t in base_model.state_dict().items(): + ema_state[name].mul_(ema_decay).add_(t.detach().float(), alpha=1.0 - ema_decay) + if (d_step + 1) % 8 == 0 or d_step == 0: + log0( + f"distill:step:{d_step + 1}/{args.distill_steps} " + f"kl:{kl_loss.item():.4f} ce:{ce_loss.item():.4f} total:{loss.item():.4f}" + ) + del teacher_model, compiled_teacher_logits + torch.cuda.empty_cache() + log0("distill:done") + # Apply EMA weights (better than SWA alone per PR#401) + log0("ema:applying EMA weights") + current_state = base_model.state_dict() + avg_state = {name: t.to(dtype=current_state[name].dtype) for name, t in ema_state.items()} + base_model.load_state_dict(avg_state, strict=True) + torch.cuda.synchronize() + t_diag = time.perf_counter() + diag_val_loss, diag_val_bpb = eval_val( + args, compiled_model, rank, world_size, device, grad_accum_steps, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + ) + torch.cuda.synchronize() + log0( + f"DIAGNOSTIC post_ema val_loss:{diag_val_loss:.4f} val_bpb:{diag_val_bpb:.4f} " + f"eval_time:{1000.0 * (time.perf_counter() - t_diag):.0f}ms" + ) + full_state_dict = base_model.state_dict() + export_sd = {k: v for k, v in full_state_dict.items() if "mtp_heads" not in k} + excluded_mtp = sum(int(t.numel()) for k, t in full_state_dict.items() if "mtp_heads" in k) + if excluded_mtp > 0: + log0(f"export_excluding_mtp_params:{excluded_mtp}") + if master_process: + torch.save(export_sd, "final_model.pt") + model_bytes = os.path.getsize("final_model.pt") + code_bytes = len(code.encode("utf-8")) + log0(f"Serialized model: {model_bytes} bytes") + log0(f"Code size: {code_bytes} bytes") + sd_cpu = {k: v.detach().cpu() for k, v in export_sd.items()} + # GPTQ quantization using Hessians collected during training phase (no training data access here) + quant_result, quant_meta = mixed_quantize_int6_gptq(sd_cpu, {"mlp", "attn", "aux"}, gptq_hessians) + quant_buf = io.BytesIO() + torch.save({"w": quant_result, "m": quant_meta}, quant_buf) + quant_raw = quant_buf.getvalue() + quant_blob = zstandard.ZstdCompressor(level=22).compress(quant_raw) if _COMPRESSOR == "zstd" else zlib.compress(quant_raw, 9) + if master_process: + with open("final_model.int6.ptz", "wb") as f: + f.write(quant_blob) + quant_file_bytes = len(quant_blob) + code_bytes = len(code.encode("utf-8")) + log0(f"Serialized model int6+{_COMPRESSOR}: {quant_file_bytes} bytes") + log0(f"Total submission size int6+{_COMPRESSOR}: {quant_file_bytes + code_bytes} bytes") + log0(f"Total submission size int8+zlib: {quant_file_bytes + code_bytes} bytes") + if distributed: + dist.barrier() + with open("final_model.int6.ptz", "rb") as f: + quant_blob_disk = f.read() + quant_state = torch.load( + io.BytesIO(zstandard.ZstdDecompressor().decompress(quant_blob_disk) if _COMPRESSOR == "zstd" else zlib.decompress(quant_blob_disk)), + map_location="cpu", + ) + deq_state = dequantize_mixed_int6(quant_state["w"], quant_state["m"], sd_cpu) + eval_model = GPT( + vocab_size=args.vocab_size, num_layers=args.num_layers, model_dim=args.model_dim, + num_heads=args.num_heads, num_kv_heads=args.num_kv_heads, mlp_mult=args.mlp_mult, + tie_embeddings=args.tie_embeddings, tied_embed_init_std=args.tied_embed_init_std, + logit_softcap=args.logit_softcap, rope_base=args.rope_base, qk_gain_init=args.qk_gain_init, + mtp_num_heads=0, mtp_loss_weight=0.0, + bigram_vocab_size=args.bigram_vocab_size, bigram_dim=args.bigram_dim, + xsa_last_n=args.xsa_last_n, # must match training model + rope_dims=args.rope_dims, ln_scale=args.ln_scale, dtg=args.dtg_enabled, + ve_enabled=args.ve_enabled, ve_dim=args.ve_dim, ve_layers=args.ve_layers, + mlp_act=args.mlp_act, mlp_leaky_slope=args.mlp_leaky_slope, + f1_corr_rank=args.f1_corr_rank, f1_corr_scale_init=args.f1_corr_scale_init, + ).to(device).bfloat16() + for m in eval_model.modules(): + if isinstance(m, CastedLinear): + m.float() + restore_low_dim_params_to_fp32(eval_model) + eval_model.load_state_dict(deq_state, strict=True) + compiled_eval = maybe_torch_compile(eval_model, args) + torch.cuda.synchronize() + t_qeval = time.perf_counter() + q_val_loss, q_val_bpb = eval_val( + args, compiled_eval, rank, world_size, device, grad_accum_steps, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + eval_seq_len=effective_eval_seq_len, + ) + torch.cuda.synchronize() + log0( + f"final_int6_roundtrip val_loss:{q_val_loss:.4f} val_bpb:{q_val_bpb:.4f} " + f"eval_time:{1000.0 * (time.perf_counter() - t_qeval):.0f}ms" + ) + log0(f"final_int6_roundtrip_exact val_loss:{q_val_loss:.8f} val_bpb:{q_val_bpb:.8f}") + sw_seq_len = effective_eval_seq_len + if args.eval_stride > 0 and args.eval_stride < sw_seq_len: + torch.cuda.synchronize() + t_slide = time.perf_counter() + sw_val_loss, sw_val_bpb = eval_val_sliding( + args, eval_model, rank, world_size, device, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + stride=args.eval_stride, + eval_seq_len=sw_seq_len, + ) + torch.cuda.synchronize() + log0( + f"final_int6_sliding_window val_loss:{sw_val_loss:.4f} val_bpb:{sw_val_bpb:.4f} " + f"stride:{args.eval_stride} eval_time:{1000.0 * (time.perf_counter() - t_slide):.0f}ms" + ) + log0(f"final_int6_sliding_window_exact val_loss:{sw_val_loss:.8f} val_bpb:{sw_val_bpb:.8f}") + log0(f"final_int8_zlib_roundtrip_exact val_loss:{sw_val_loss:.8f} val_bpb:{sw_val_bpb:.8f}") + if args.ngram_eval_order >= 2: + if distributed: + dist.barrier() + torch.cuda.synchronize() + t_ng = time.perf_counter() + ng_loss, ng_bpb, ng_coverage = eval_val_sliding_hashed_ngram( + args, + eval_model, + rank, + world_size, + device, + val_tokens, + base_bytes_lut, + has_leading_space_lut, + is_boundary_token_lut, + stride=args.eval_stride, + order=args.ngram_eval_order, + alpha=args.ngram_eval_alpha, + min_count=args.ngram_eval_min_count, + buckets=args.ngram_eval_buckets, + max_seconds=args.ngram_eval_max_seconds, + eval_seq_len=sw_seq_len, + ) + if rank == 0: + torch.cuda.synchronize() + ng_eval_ms = 1000.0 * (time.perf_counter() - t_ng) + if ng_coverage >= 0.999999: + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order} val_loss:{ng_loss:.4f} " + f"val_bpb:{ng_bpb:.4f} eval_time:{ng_eval_ms:.0f}ms" + ) + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order}_exact " + f"val_loss:{ng_loss:.8f} val_bpb:{ng_bpb:.8f}" + ) + else: + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order}_partial val_loss:{ng_loss:.4f} " + f"val_bpb:{ng_bpb:.4f} coverage:{ng_coverage:.4f} eval_time:{ng_eval_ms:.0f}ms" + ) + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order}_partial_exact " + f"val_loss:{ng_loss:.8f} val_bpb:{ng_bpb:.8f} coverage:{ng_coverage:.8f}" + ) + if distributed: + dist.barrier() + if distributed: + dist.destroy_process_group() +if __name__ == "__main__": + main() diff --git a/experiments/B_wing/bwing_II/HYPOTHESIS.md b/experiments/B_wing/bwing_II/HYPOTHESIS.md new file mode 100644 index 000000000..24eedfaf3 --- /dev/null +++ b/experiments/B_wing/bwing_II/HYPOTHESIS.md @@ -0,0 +1,23 @@ +# B-WING II — Cubric + Entropy Shift + Fast TTT + +## Hypothesis +Stack everything: +1. Cubric 3D ON with warm-start (our edge — per entropy×count adaptation) +2. Per-order entropy shift from #809 (-0.25 per order above min) +3. Alpha 0.05-0.60, clip 0.95 from #809 +4. Our sliding-window TTT (score-first, SGD, 1 epoch for speed) + +TTT adapts the model BEFORE n-gram eval runs. The n-gram cache +then operates on improved model probabilities. + +## Changes from bwing_full_port +- CUBRIC_CADENCE=32 (was 0 — cubric back ON) +- NGRAM_ORDER_MULTS removed (cubric handles per-order scaling) +- TTT_ENABLED=1 (fast: 1 epoch, freeze 2 blocks, SGD+momentum) +- NGRAM_EVAL_MAX_SECONDS=0 (no time limit on n-gram eval) + +## Expected timing +- Training: ~600s +- TTT: ~30-60s (1 epoch, fast SGD) +- N-gram: ~180s +- Total eval: ~250-300s (within 600s budget) diff --git a/experiments/B_wing/bwing_II/run.sh b/experiments/B_wing/bwing_II/run.sh new file mode 100644 index 000000000..9a0309cb4 --- /dev/null +++ b/experiments/B_wing/bwing_II/run.sh @@ -0,0 +1,62 @@ +#!/bin/bash +set -euo pipefail +# B-WING II: Cubric ON + entropy shift + alpha fix + fast TTT +# Best of both worlds: our cubric 3D + #809 entropy/alpha + our sliding TTT + +SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd -- "${SCRIPT_DIR}/../../.." && pwd)" +cd "${REPO_ROOT}" +export PYTHONPATH="${REPO_ROOT}/flash-attention/hopper:${PYTHONPATH:-}" + +SEED="${SEED:-1337}" +NPROC_PER_NODE="${NPROC_PER_NODE:-8}" + +echo "============================================" +echo " B-WING II — Cubric + Entropy Shift + TTT" +echo " Seed: ${SEED}" +echo " Cubric 3D ON + entropy shift + clip 0.95" +echo " Fast TTT: 1 epoch, SGD, freeze 2 blocks" +echo " Eval alpha: 0.05-0.60 clip=0.95 | Orders: 2-9" +echo "============================================" + +SEED="$SEED" \ +F1_CORR_RANK=0 \ +DISTILL_ENABLED=0 \ +MLP_ACT=leaky_relu_sq \ +MLP_LEAKY_SLOPE=0.5 \ +XSA_LAST_N=4 \ +BIGRAM_VOCAB_SIZE=1536 \ +ROPE_DIMS=24 \ +VAL_LOSS_EVERY=20000 \ +TRAIN_LOG_EVERY=1000 \ +SWA_EVERY=100 \ +COMPLEMENT_ALPHA=0.5 \ +NGRAM_EVAL_ORDER=9 \ +NGRAM_EVAL_MIN_ORDER=2 \ +NGRAM_EVAL_ADAPTIVE=1 \ +NGRAM_EVAL_ALPHA=0.30 \ +NGRAM_EVAL_ALPHA_MIN=0.05 \ +NGRAM_EVAL_ALPHA_MAX=0.60 \ +NGRAM_EVAL_ENTROPY_CENTER=3.0 \ +NGRAM_EVAL_ENTROPY_SCALE=2.0 \ +NGRAM_EVAL_MIN_COUNT=2 \ +NGRAM_EVAL_BUCKETS=8388608 \ +NGRAM_EVAL_MAX_SECONDS=0 \ +NGRAM_ENTROPY_SHIFT=1 \ +CUBRIC_CADENCE=32 \ +TTT_ENABLED=1 \ +TTT_LR=0.002 \ +TTT_EPOCHS=1 \ +TTT_CHUNK_TOKENS=32768 \ +TTT_FREEZE_BLOCKS=2 \ +TTT_MOMENTUM=0.9 \ +TTT_BATCH_SEQS=32 \ +TTT_GRAD_CLIP=1.0 \ +COMPILE_FULLGRAPH=0 \ +torchrun --standalone --nproc_per_node="${NPROC_PER_NODE}" \ + "${SCRIPT_DIR}/train_gpt.py" \ + 2>&1 | tee "logs/bwing_II_s${SEED}_$(date +%Y%m%d_%H%M%S).log" + +echo "============================================" +echo " DONE" +echo "============================================" diff --git a/experiments/B_wing/bwing_II/train_gpt.py b/experiments/B_wing/bwing_II/train_gpt.py new file mode 100644 index 000000000..b2beb87a5 --- /dev/null +++ b/experiments/B_wing/bwing_II/train_gpt.py @@ -0,0 +1,2321 @@ +from __future__ import annotations +import copy +import glob +import io +import math +import os +import random +import subprocess +import sys +import time +import uuid +import zlib +from pathlib import Path +try: + import zstandard + _COMPRESSOR = "zstd" +except ImportError: + _COMPRESSOR = "zlib" +import numpy as np +import sentencepiece as spm +import torch +import torch.distributed as dist +import torch.nn.functional as F +from torch import Tensor, nn +from torch.nn.parallel import DistributedDataParallel as DDP +try: + from flash_attn_interface import flash_attn_func as flash_attn_3_func +except ImportError: + def flash_attn_3_func(q, k, v, causal=False): + # q: (B, T, Hq, D), k/v: (B, T, Hkv, D) — expand KV for GQA + q2 = q.transpose(1, 2) # (B, Hq, T, D) + k2 = k.transpose(1, 2) # (B, Hkv, T, D) + v2 = v.transpose(1, 2) + if k2.size(1) != q2.size(1): + rep = q2.size(1) // k2.size(1) + k2 = k2.repeat_interleave(rep, dim=1) + v2 = v2.repeat_interleave(rep, dim=1) + out = torch.nn.functional.scaled_dot_product_attention(q2, k2, v2, is_causal=causal) + return out.transpose(1, 2) +class Hyperparameters: + data_path = os.environ.get("DATA_PATH", "./data/datasets/fineweb10B_sp1024") + train_files = os.path.join(data_path, "fineweb_train_*.bin") + val_files = os.path.join(data_path, "fineweb_val_*.bin") + tokenizer_path = os.environ.get("TOKENIZER_PATH", "./data/tokenizers/fineweb_1024_bpe.model") + run_id = os.environ.get("RUN_ID", str(uuid.uuid4())) + seed = int(os.environ.get("SEED", 1337)) + val_batch_size = int(os.environ.get("VAL_BATCH_SIZE", 524_288)) + val_loss_every = int(os.environ.get("VAL_LOSS_EVERY", 4000)) + train_log_every = int(os.environ.get("TRAIN_LOG_EVERY", 500)) + iterations = int(os.environ.get("ITERATIONS", 20000)) + warmdown_iters = int(os.environ.get("WARMDOWN_ITERS", 3500)) + warmup_steps = int(os.environ.get("WARMUP_STEPS", 20)) + train_batch_tokens = int(os.environ.get("TRAIN_BATCH_TOKENS", 786_432)) + train_seq_len = int(os.environ.get("TRAIN_SEQ_LEN", 2048)) + eval_seq_len = int(os.environ.get("EVAL_SEQ_LEN", 2048)) + max_wallclock_seconds = float(os.environ.get("MAX_WALLCLOCK_SECONDS", 600.0)) + qk_gain_init = float(os.environ.get("QK_GAIN_INIT", 1.5)) + vocab_size = int(os.environ.get("VOCAB_SIZE", 1024)) + num_layers = int(os.environ.get("NUM_LAYERS", 11)) + num_kv_heads = int(os.environ.get("NUM_KV_HEADS", 4)) + model_dim = int(os.environ.get("MODEL_DIM", 512)) + num_heads = int(os.environ.get("NUM_HEADS", 8)) + mlp_mult = float(os.environ.get("MLP_MULT", 3.0)) + mlp_act = os.environ.get("MLP_ACT", "relu_sq").lower() + mlp_leaky_slope = float(os.environ.get("MLP_LEAKY_SLOPE", 0.5)) + tie_embeddings = bool(int(os.environ.get("TIE_EMBEDDINGS", "1"))) + rope_base = float(os.environ.get("ROPE_BASE", 10000.0)) + logit_softcap = float(os.environ.get("LOGIT_SOFTCAP", 30.0)) + embed_lr = float(os.environ.get("EMBED_LR", 0.6)) + head_lr = float(os.environ.get("HEAD_LR", 0.008)) + tied_embed_lr = float(os.environ.get("TIED_EMBED_LR", 0.035)) + tied_embed_init_std = float(os.environ.get("TIED_EMBED_INIT_STD", 0.005)) + matrix_lr = float(os.environ.get("MATRIX_LR", 0.025)) + scalar_lr = float(os.environ.get("SCALAR_LR", 0.025)) + muon_momentum = float(os.environ.get("MUON_MOMENTUM", 0.99)) + muon_backend_steps = int(os.environ.get("MUON_BACKEND_STEPS", 5)) + muon_momentum_warmup_start = float(os.environ.get("MUON_MOMENTUM_WARMUP_START", 0.92)) + muon_momentum_warmup_steps = int(os.environ.get("MUON_MOMENTUM_WARMUP_STEPS", 1500)) + beta1 = float(os.environ.get("BETA1", 0.9)) + beta2 = float(os.environ.get("BETA2", 0.95)) + adam_eps = float(os.environ.get("ADAM_EPS", 1e-8)) + grad_clip_norm = float(os.environ.get("GRAD_CLIP_NORM", 0.3)) + eval_stride = int(os.environ.get("EVAL_STRIDE", 64)) + mtp_num_heads = int(os.environ.get("MTP_NUM_HEADS", 0)) + mtp_loss_weight = float(os.environ.get("MTP_LOSS_WEIGHT", 0.2)) + muon_beta2 = float(os.environ.get("MUON_BETA2", 0.95)) + swa_enabled = bool(int(os.environ.get("SWA_ENABLED", "1"))) + swa_every = int(os.environ.get("SWA_EVERY", 50)) # tighter: collect more recent checkpoints + muon_wd = float(os.environ.get("MUON_WD", 0.04)) + adam_wd = float(os.environ.get("ADAM_WD", 0.04)) + qat_enabled = bool(int(os.environ.get("QAT_ENABLED", "0"))) + bigram_vocab_size = int(os.environ.get("BIGRAM_VOCAB_SIZE", 2048)) + bigram_dim = int(os.environ.get("BIGRAM_DIM", 128)) + xsa_last_n = int(os.environ.get("XSA_LAST_N", 11)) # XSA on ALL 11 layers + rope_dims = int(os.environ.get("ROPE_DIMS", 16)) + ln_scale = bool(int(os.environ.get("LN_SCALE", "1"))) + dtg_enabled = bool(int(os.environ.get("DTG_ENABLED", "0"))) + late_qat_threshold = float(os.environ.get("LATE_QAT_THRESHOLD", 0.5)) + ve_enabled = bool(int(os.environ.get("VE_ENABLED", "1"))) + ve_dim = int(os.environ.get("VE_DIM", 128)) + ve_layers = os.environ.get("VE_LAYERS", "9,10") + # F1 capacity add-on: low-rank correction head (active at inference). + # Approx extra params ~= rank * (model_dim + vocab_size). + f1_corr_rank = int(os.environ.get("F1_CORR_RANK", 0)) + f1_corr_scale_init = float(os.environ.get("F1_CORR_SCALE_INIT", 0.10)) + # Post-train self-distillation: EMA teacher -> student. + distill_enabled = bool(int(os.environ.get("DISTILL_ENABLED", "0"))) + distill_steps = int(os.environ.get("DISTILL_STEPS", 24)) + distill_lr_factor = float(os.environ.get("DISTILL_LR_FACTOR", 0.02)) + distill_temperature = float(os.environ.get("DISTILL_TEMPERATURE", 1.5)) + distill_alpha = float(os.environ.get("DISTILL_ALPHA", 0.60)) + distill_kl_clip = float(os.environ.get("DISTILL_KL_CLIP", 10.0)) + # Optional legal score-first hashed n-gram interpolation at eval time. + # Multi-order backoff (2..max_order) with entropy-adaptive alpha. + # Alpha depends only on model entropy (no target/label access). + ngram_eval_order = int(os.environ.get("NGRAM_EVAL_ORDER", 0)) # 0=off, max order for backoff + ngram_eval_min_order = int(os.environ.get("NGRAM_EVAL_MIN_ORDER", 2)) # min order for backoff + ngram_eval_alpha = float(os.environ.get("NGRAM_EVAL_ALPHA", 0.30)) # base alpha (or fixed if adaptive off) + ngram_eval_adaptive = bool(int(os.environ.get("NGRAM_EVAL_ADAPTIVE", "1"))) # entropy-adaptive alpha + ngram_eval_alpha_min = float(os.environ.get("NGRAM_EVAL_ALPHA_MIN", 0.05)) # alpha floor (confident model) + ngram_eval_alpha_max = float(os.environ.get("NGRAM_EVAL_ALPHA_MAX", 0.60)) # alpha ceiling (uncertain model) + ngram_eval_entropy_center = float(os.environ.get("NGRAM_EVAL_ENTROPY_CENTER", 4.0)) # sigmoid center + ngram_eval_entropy_scale = float(os.environ.get("NGRAM_EVAL_ENTROPY_SCALE", 2.0)) # sigmoid steepness + ngram_eval_min_count = int(os.environ.get("NGRAM_EVAL_MIN_COUNT", 2)) + ngram_eval_buckets = int(os.environ.get("NGRAM_EVAL_BUCKETS", 4_194_304)) + ngram_eval_max_seconds = float(os.environ.get("NGRAM_EVAL_MAX_SECONDS", 0.0)) + ngram_entropy_shift = bool(int(os.environ.get("NGRAM_ENTROPY_SHIFT", "0"))) # per-order center shift + ngram_order_mults_str = os.environ.get("NGRAM_ORDER_MULTS", "") # fixed per-order multipliers (comma-sep) + cubric_cadence = int(os.environ.get("CUBRIC_CADENCE", 0)) + # Legal score-first TTT + ttt_enabled = bool(int(os.environ.get("TTT_ENABLED", "0"))) + ttt_lr = float(os.environ.get("TTT_LR", 0.002)) + ttt_epochs = int(os.environ.get("TTT_EPOCHS", 1)) # fast: 1 epoch + ttt_chunk_tokens = int(os.environ.get("TTT_CHUNK_TOKENS", 32768)) + ttt_freeze_blocks = int(os.environ.get("TTT_FREEZE_BLOCKS", 2)) + ttt_momentum = float(os.environ.get("TTT_MOMENTUM", 0.9)) + ttt_batch_seqs = int(os.environ.get("TTT_BATCH_SEQS", 32)) + ttt_grad_clip = float(os.environ.get("TTT_GRAD_CLIP", 1.0)) + compile_enabled = bool(int(os.environ.get("COMPILE_ENABLED", "1"))) + compile_fullgraph = bool(int(os.environ.get("COMPILE_FULLGRAPH", "1"))) +def maybe_torch_compile(obj, args: Hyperparameters): + if not args.compile_enabled: + return obj + return torch.compile(obj, dynamic=False, fullgraph=args.compile_fullgraph) +class TrainNgramTracker: + """Complementary training: track bigram stats, downweight tokens n-grams can predict.""" + def __init__(self, vocab_size: int, device: torch.device, complement_alpha: float = 0.5): + self.V = vocab_size + self.alpha = complement_alpha + self.bi_counts = torch.zeros(vocab_size, vocab_size, device=device, dtype=torch.float32) + self.bi_totals = torch.zeros(vocab_size, device=device, dtype=torch.float32) + @torch.no_grad() + def update(self, x: Tensor, y: Tensor): + xf = x.reshape(-1) + yf = y.reshape(-1) + ones = torch.ones(xf.numel(), device=xf.device, dtype=torch.float32) + self.bi_counts.reshape(-1).scatter_add_(0, xf * self.V + yf, ones) + self.bi_totals.scatter_add_(0, xf, ones) + def get_weights(self, x: Tensor, y: Tensor) -> Tensor: + xf = x.reshape(-1) + yf = y.reshape(-1) + total = self.bi_totals[xf] + count = self.bi_counts.reshape(-1)[xf * self.V + yf] + ngram_prob = count / (total + 1) + return (1.0 - self.alpha * ngram_prob).clamp(min=0.1) +def zeropower_via_newtonschulz5(G: Tensor, steps: int = 10, eps: float = 1e-7) -> Tensor: + a, b, c = (3.4445, -4.7750, 2.0315) + X = G.bfloat16() + X /= X.norm() + eps + transposed = G.size(0) > G.size(1) + if transposed: + X = X.T + for _ in range(steps): + A = X @ X.T + B = b * A + c * A @ A + X = a * X + B @ X + return X.T if transposed else X +class Muon(torch.optim.Optimizer): + def __init__(self, params, lr: float, momentum: float, backend_steps: int, + nesterov: bool = True, weight_decay: float = 0.0): + super().__init__( + params, + dict(lr=lr, momentum=momentum, backend_steps=backend_steps, + nesterov=nesterov, weight_decay=weight_decay), + ) + @torch.no_grad() + def step(self, closure=None): + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + distributed = dist.is_available() and dist.is_initialized() + world_size = dist.get_world_size() if distributed else 1 + rank = dist.get_rank() if distributed else 0 + for group in self.param_groups: + params = group["params"] + if not params: + continue + lr = group["lr"] + momentum = group["momentum"] + backend_steps = group["backend_steps"] + nesterov = group["nesterov"] + total_params = sum(int(p.numel()) for p in params) + updates_flat = torch.zeros(total_params, device=params[0].device, dtype=torch.bfloat16) + curr = 0 + for i, p in enumerate(params): + if i % world_size == rank and p.grad is not None: + g = p.grad + state = self.state[p] + if "momentum_buffer" not in state: + state["momentum_buffer"] = torch.zeros_like(g) + buf = state["momentum_buffer"] + buf.mul_(momentum).add_(g) + if nesterov: + g = g.add(buf, alpha=momentum) + g = zeropower_via_newtonschulz5(g, steps=backend_steps) + g *= max(1, g.size(0) / g.size(1)) ** 0.5 + updates_flat[curr : curr + p.numel()] = g.reshape(-1) + curr += p.numel() + if distributed: + dist.all_reduce(updates_flat, op=dist.ReduceOp.SUM) + wd = group.get("weight_decay", 0.0) + curr = 0 + for p in params: + if wd > 0.0: + p.data.mul_(1.0 - lr * wd) + g = updates_flat[curr : curr + p.numel()].view_as(p).to(dtype=p.dtype) + p.add_(g, alpha=-lr) + curr += p.numel() + return loss +def build_sentencepiece_luts( + sp: spm.SentencePieceProcessor, vocab_size: int, device: torch.device +) -> tuple[Tensor, Tensor, Tensor]: + sp_vocab_size = int(sp.vocab_size()) + table_size = max(sp_vocab_size, vocab_size) + base_bytes_np = np.zeros((table_size,), dtype=np.int16) + has_leading_space_np = np.zeros((table_size,), dtype=np.bool_) + is_boundary_token_np = np.ones((table_size,), dtype=np.bool_) + for token_id in range(sp_vocab_size): + if sp.is_control(token_id) or sp.is_unknown(token_id) or sp.is_unused(token_id): + continue + is_boundary_token_np[token_id] = False + if sp.is_byte(token_id): + base_bytes_np[token_id] = 1 + continue + piece = sp.id_to_piece(token_id) + if piece.startswith("▁"): + has_leading_space_np[token_id] = True + piece = piece[1:] + base_bytes_np[token_id] = len(piece.encode("utf-8")) + return ( + torch.tensor(base_bytes_np, dtype=torch.int16, device=device), + torch.tensor(has_leading_space_np, dtype=torch.bool, device=device), + torch.tensor(is_boundary_token_np, dtype=torch.bool, device=device), + ) +def load_validation_tokens(pattern: str, seq_len: int) -> Tensor: + files = [Path(p) for p in sorted(glob.glob(pattern))] + if not files: + raise FileNotFoundError(f"No files found for pattern: {pattern}") + tokens = torch.cat([load_data_shard(file) for file in files]).contiguous() + usable = ((tokens.numel() - 1) // seq_len) * seq_len + if usable <= 0: + raise ValueError(f"Validation split is too short for TRAIN_SEQ_LEN={seq_len}") + return tokens[: usable + 1] +def eval_val( + args: Hyperparameters, + model: nn.Module, + rank: int, + world_size: int, + device: torch.device, + grad_accum_steps: int, + val_tokens: Tensor, + base_bytes_lut: Tensor, + has_leading_space_lut: Tensor, + is_boundary_token_lut: Tensor, + eval_seq_len: int | None = None, +) -> tuple[float, float]: + seq_len = eval_seq_len or args.train_seq_len + local_batch_tokens = args.val_batch_size // (world_size * grad_accum_steps) + if local_batch_tokens < seq_len: + raise ValueError( + "VAL_BATCH_SIZE must provide at least one sequence per rank; " + f"got VAL_BATCH_SIZE={args.val_batch_size}, WORLD_SIZE={world_size}, " + f"GRAD_ACCUM_STEPS={grad_accum_steps}, seq_len={seq_len}" + ) + local_batch_seqs = local_batch_tokens // seq_len + total_seqs = (val_tokens.numel() - 1) // seq_len + seq_start = (total_seqs * rank) // world_size + seq_end = (total_seqs * (rank + 1)) // world_size + val_loss_sum = torch.zeros((), device=device, dtype=torch.float64) + val_token_count = torch.zeros((), device=device, dtype=torch.float64) + val_byte_count = torch.zeros((), device=device, dtype=torch.float64) + model.eval() + with torch.inference_mode(): + for batch_seq_start in range(seq_start, seq_end, local_batch_seqs): + batch_seq_end = min(batch_seq_start + local_batch_seqs, seq_end) + raw_start = batch_seq_start * seq_len + raw_end = batch_seq_end * seq_len + 1 + local = val_tokens[raw_start:raw_end].to(device=device, dtype=torch.int64, non_blocking=True) + x = local[:-1].reshape(-1, seq_len) + y = local[1:].reshape(-1, seq_len) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + batch_loss = model(x, y).detach() + batch_token_count = float(y.numel()) + val_loss_sum += batch_loss.to(torch.float64) * batch_token_count + val_token_count += batch_token_count + prev_ids = x.reshape(-1) + tgt_ids = y.reshape(-1) + token_bytes = base_bytes_lut[tgt_ids].to(dtype=torch.int16) + token_bytes += (has_leading_space_lut[tgt_ids] & ~is_boundary_token_lut[prev_ids]).to(dtype=torch.int16) + val_byte_count += token_bytes.to(torch.float64).sum() + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(val_loss_sum, op=dist.ReduceOp.SUM) + dist.all_reduce(val_token_count, op=dist.ReduceOp.SUM) + dist.all_reduce(val_byte_count, op=dist.ReduceOp.SUM) + val_loss = val_loss_sum / val_token_count + bits_per_token = val_loss.item() / math.log(2.0) + tokens_per_byte = val_token_count.item() / val_byte_count.item() + model.train() + return float(val_loss.item()), float(bits_per_token * tokens_per_byte) +CONTROL_TENSOR_NAME_PATTERNS = tuple( + pattern + for pattern in os.environ.get( + "CONTROL_TENSOR_NAME_PATTERNS", + "attn_scale,attn_scales,mlp_scale,mlp_scales,resid_mix,resid_mixes,q_gain,skip_weight,skip_weights,smear,dtg_gate,ve_layer_scales,ve_shared.scale", + ).split(",") + if pattern +) +INT8_KEEP_FLOAT_FP32_NAME_PATTERNS = tuple( + pattern + for pattern in os.environ.get( + "INT8_KEEP_FLOAT_FP32_NAME_PATTERNS", + ",".join(CONTROL_TENSOR_NAME_PATTERNS), + ).split(",") + if pattern +) +INT8_KEEP_FLOAT_MAX_NUMEL = 65_536 +INT8_KEEP_FLOAT_STORE_DTYPE = torch.float16 +INT8_PER_ROW_SCALE_DTYPE = torch.float16 +INT8_CLIP_PERCENTILE = 99.99984 +INT8_CLIP_Q = INT8_CLIP_PERCENTILE / 100.0 +def tensor_nbytes(t: Tensor) -> int: + return int(t.numel()) * int(t.element_size()) +def keep_float_tensor(name: str, t: Tensor, passthrough_orig_dtypes: dict[str, str]) -> Tensor: + if any(pattern in name for pattern in INT8_KEEP_FLOAT_FP32_NAME_PATTERNS): + return t.float().contiguous() + if t.dtype in {torch.float32, torch.bfloat16}: + passthrough_orig_dtypes[name] = str(t.dtype).removeprefix("torch.") + return t.to(dtype=INT8_KEEP_FLOAT_STORE_DTYPE).contiguous() + return t +def quantize_float_tensor(t: Tensor) -> tuple[Tensor, Tensor]: + t32 = t.float() + if t32.ndim == 2: + clip_abs = ( + torch.quantile(t32.abs(), INT8_CLIP_Q, dim=1) + if t32.numel() + else torch.empty((t32.shape[0],), dtype=torch.float32) + ) + clipped = torch.maximum(torch.minimum(t32, clip_abs[:, None]), -clip_abs[:, None]) + scale = (clip_abs / 127.0).clamp_min(1.0 / 127.0) + q = torch.clamp(torch.round(clipped / scale[:, None]), -127, 127).to(torch.int8).contiguous() + return q, scale.to(dtype=INT8_PER_ROW_SCALE_DTYPE).contiguous() + clip_abs = float(torch.quantile(t32.abs().flatten(), INT8_CLIP_Q).item()) if t32.numel() else 0.0 + scale = torch.tensor(clip_abs / 127.0 if clip_abs > 0 else 1.0, dtype=torch.float32) + q = torch.clamp(torch.round(torch.clamp(t32, -clip_abs, clip_abs) / scale), -127, 127).to(torch.int8).contiguous() + return q, scale +def quantize_state_dict_int8(state_dict: dict[str, Tensor]): + quantized: dict[str, Tensor] = {} + scales: dict[str, Tensor] = {} + dtypes: dict[str, str] = {} + passthrough: dict[str, Tensor] = {} + passthrough_orig_dtypes: dict[str, str] = {} + qmeta: dict[str, dict[str, object]] = {} + stats = dict.fromkeys( + ("param_count", "num_tensors", "num_float_tensors", "num_nonfloat_tensors", "baseline_tensor_bytes", "int8_payload_bytes"), + 0, + ) + for name, tensor in state_dict.items(): + t = tensor.detach().to("cpu").contiguous() + stats["param_count"] += int(t.numel()) + stats["num_tensors"] += 1 + stats["baseline_tensor_bytes"] += tensor_nbytes(t) + if not t.is_floating_point(): + stats["num_nonfloat_tensors"] += 1 + passthrough[name] = t + stats["int8_payload_bytes"] += tensor_nbytes(t) + continue + if t.numel() <= INT8_KEEP_FLOAT_MAX_NUMEL: + kept = keep_float_tensor(name, t, passthrough_orig_dtypes) + passthrough[name] = kept + stats["int8_payload_bytes"] += tensor_nbytes(kept) + continue + stats["num_float_tensors"] += 1 + q, s = quantize_float_tensor(t) + if s.ndim > 0: + qmeta[name] = {"scheme": "per_row", "axis": 0} + quantized[name] = q + scales[name] = s + dtypes[name] = str(t.dtype).removeprefix("torch.") + stats["int8_payload_bytes"] += tensor_nbytes(q) + tensor_nbytes(s) + obj: dict[str, object] = { + "__quant_format__": "int8_clean_per_row_v1", + "quantized": quantized, + "scales": scales, + "dtypes": dtypes, + "passthrough": passthrough, + } + if qmeta: + obj["qmeta"] = qmeta + if passthrough_orig_dtypes: + obj["passthrough_orig_dtypes"] = passthrough_orig_dtypes + return obj, stats +def dequantize_state_dict_int8(obj: dict[str, object]) -> dict[str, Tensor]: + out: dict[str, Tensor] = {} + qmeta = obj.get("qmeta", {}) + passthrough_orig_dtypes = obj.get("passthrough_orig_dtypes", {}) + for name, q in obj["quantized"].items(): + dtype = getattr(torch, obj["dtypes"][name]) + s = obj["scales"][name] + if qmeta.get(name, {}).get("scheme") == "per_row" or s.ndim > 0: + s = s.to(dtype=torch.float32) + out[name] = (q.float() * s.view(q.shape[0], *([1] * (q.ndim - 1)))).to(dtype=dtype).contiguous() + else: + scale = float(s.item()) + out[name] = (q.float() * scale).to(dtype=dtype).contiguous() + for name, t in obj["passthrough"].items(): + out_t = t.detach().to("cpu").contiguous() + orig_dtype = passthrough_orig_dtypes.get(name) + if isinstance(orig_dtype, str): + out_t = out_t.to(dtype=getattr(torch, orig_dtype)).contiguous() + out[name] = out_t + return out +def load_data_shard(file: Path) -> Tensor: + header_bytes = 256 * np.dtype(" None: + self.file_idx = (self.file_idx + 1) % len(self.files) + self.tokens = load_data_shard(self.files[self.file_idx]) + self.pos = 0 + def take(self, n: int) -> Tensor: + chunks: list[Tensor] = [] + remaining = n + while remaining > 0: + avail = self.tokens.numel() - self.pos + if avail <= 0: + self._advance_file() + continue + k = min(remaining, avail) + chunks.append(self.tokens[self.pos : self.pos + k]) + self.pos += k + remaining -= k + return chunks[0] if len(chunks) == 1 else torch.cat(chunks) +class DistributedTokenLoader: + def __init__(self, pattern: str, rank: int, world_size: int, device: torch.device): + self.rank = rank + self.world_size = world_size + self.device = device + self.stream = TokenStream(pattern) + def next_batch(self, global_tokens: int, seq_len: int, grad_accum_steps: int) -> tuple[Tensor, Tensor]: + local_tokens = global_tokens // (self.world_size * grad_accum_steps) + per_rank_span = local_tokens + 1 + chunk = self.stream.take(per_rank_span * self.world_size) + start = self.rank * per_rank_span + local = chunk[start : start + per_rank_span].to(dtype=torch.int64) + x = local[:-1].reshape(-1, seq_len) + y = local[1:].reshape(-1, seq_len) + return x.to(self.device, non_blocking=True), y.to(self.device, non_blocking=True) +class RMSNorm(nn.Module): + def __init__(self, eps: float | None = None): + super().__init__() + self.eps = eps + def forward(self, x: Tensor) -> Tensor: + return F.rms_norm(x, (x.size(-1),), eps=self.eps) +class CastedLinear(nn.Linear): + _qat_enabled: bool = False + def forward(self, x: Tensor) -> Tensor: + w = self.weight.to(x.dtype) + if CastedLinear._qat_enabled and self.training and w.ndim == 2: + with torch.no_grad(): + w32 = self.weight.float() + # Use 99.95th percentile clipping to match GPTQ export quantizer + row_clip = torch.quantile(w32.abs(), 0.9995, dim=1) + scale = (row_clip / 31.0).clamp_min(1.0 / 31.0) + w_q = (torch.clamp(torch.round(w32 / scale[:, None]), -32, 31) * scale[:, None]).to(x.dtype) + w = w + (w_q - w).detach() + bias = self.bias.to(x.dtype) if self.bias is not None else None + return F.linear(x, w, bias) +def restore_low_dim_params_to_fp32(module: nn.Module) -> None: + with torch.no_grad(): + for name, param in module.named_parameters(): + if (param.ndim < 2 or any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS)) and param.dtype != torch.float32: + param.data = param.data.float() +class Rotary(nn.Module): + def __init__(self, dim: int, base: float = 10000.0, train_seq_len: int = 1024, rope_dims: int = 0): + super().__init__() + self.dim = dim + self.base = base + self.train_seq_len = train_seq_len + self.rope_dims = rope_dims if rope_dims > 0 else dim + inv_freq = 1.0 / (base ** (torch.arange(0, self.rope_dims, 2, dtype=torch.float32) / self.rope_dims)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self._seq_len_cached = 0 + self._cos_cached: Tensor | None = None + self._sin_cached: Tensor | None = None + def forward(self, seq_len: int, device: torch.device, dtype: torch.dtype) -> tuple[Tensor, Tensor]: + if ( + self._cos_cached is None + or self._sin_cached is None + or self._seq_len_cached != seq_len + or self._cos_cached.device != device + ): + rd = self.rope_dims + if seq_len > self.train_seq_len: + scale = seq_len / self.train_seq_len + new_base = self.base * (scale ** (rd / (rd - 2))) + inv_freq = 1.0 / (new_base ** (torch.arange(0, rd, 2, dtype=torch.float32, device=device) / rd)) + else: + inv_freq = self.inv_freq.to(device) + t = torch.arange(seq_len, device=device, dtype=inv_freq.dtype) + freqs = torch.outer(t, inv_freq) + self._cos_cached = freqs.cos()[None, :, None, :] + self._sin_cached = freqs.sin()[None, :, None, :] + self._seq_len_cached = seq_len + return self._cos_cached.to(dtype=dtype), self._sin_cached.to(dtype=dtype) +def apply_rotary_emb(x: Tensor, cos: Tensor, sin: Tensor, rope_dims: int = 0) -> Tensor: + if rope_dims > 0 and rope_dims < x.size(-1): + x_rope, x_pass = x[..., :rope_dims], x[..., rope_dims:] + half = rope_dims // 2 + x1, x2 = x_rope[..., :half], x_rope[..., half:] + x_rope = torch.cat((x1 * cos + x2 * sin, x1 * (-sin) + x2 * cos), dim=-1) + return torch.cat((x_rope, x_pass), dim=-1) + half = x.size(-1) // 2 + x1, x2 = x[..., :half], x[..., half:] + return torch.cat((x1 * cos + x2 * sin, x1 * (-sin) + x2 * cos), dim=-1) +class CausalSelfAttention(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + num_kv_heads: int, + rope_base: float, + qk_gain_init: float, + ): + super().__init__() + if dim % num_heads != 0: + raise ValueError("model_dim must be divisible by num_heads") + if num_heads % num_kv_heads != 0: + raise ValueError("num_heads must be divisible by num_kv_heads") + self.num_heads = num_heads + self.num_kv_heads = num_kv_heads + self.head_dim = dim // num_heads + if self.head_dim % 2 != 0: + raise ValueError("head_dim must be even for RoPE") + kv_dim = self.num_kv_heads * self.head_dim + self.c_q = CastedLinear(dim, dim, bias=False) + self.c_k = CastedLinear(dim, kv_dim, bias=False) + self.c_v = CastedLinear(dim, kv_dim, bias=False) + self.proj = CastedLinear(dim, dim, bias=False) + self.proj._zero_init = True + self.q_gain = nn.Parameter(torch.full((num_heads,), qk_gain_init, dtype=torch.float32)) + self.rope_dims = 0 # set by GPT.__init__ for partial RoPE + self.rotary = Rotary(self.head_dim, base=rope_base, train_seq_len=1024) + self.use_xsa = False # set by GPT.__init__ for deep layers only + def _xsa_efficient(self, y: Tensor, v: Tensor) -> Tensor: + """Efficient XSA: subtract self-value projection via GQA-aware reshape (no repeat_interleave). + y: [B, T, H, D], v: [B, T, Hkv, D]. H must be divisible by Hkv.""" + B, T, H, D = y.shape + Hkv = v.size(-2) + group = H // Hkv + y_g = y.reshape(B, T, Hkv, group, D) # [B, T, Hkv, group, D] + vn = F.normalize(v, dim=-1).unsqueeze(-2) # [B, T, Hkv, 1, D] — broadcast ready + proj = (y_g * vn).sum(dim=-1, keepdim=True) * vn + return (y_g - proj).reshape(B, T, H, D) + def forward(self, x: Tensor, v_embed: Tensor | None = None) -> Tensor: + bsz, seqlen, dim = x.shape + q = self.c_q(x).reshape(bsz, seqlen, self.num_heads, self.head_dim) + k = self.c_k(x).reshape(bsz, seqlen, self.num_kv_heads, self.head_dim) + v = self.c_v(x) + if v_embed is not None: + v = v + v_embed + v = v.reshape(bsz, seqlen, self.num_kv_heads, self.head_dim) + q = F.rms_norm(q, (q.size(-1),)) + k = F.rms_norm(k, (k.size(-1),)) + cos, sin = self.rotary(seqlen, x.device, q.dtype) + q = apply_rotary_emb(q, cos, sin, self.rope_dims) + k = apply_rotary_emb(k, cos, sin, self.rope_dims) + q = q * self.q_gain.to(dtype=q.dtype)[None, None, :, None] + y = flash_attn_3_func(q, k, v, causal=True) + if self.use_xsa: + y = self._xsa_efficient(y, v) + y = y.reshape(bsz, seqlen, dim) + return self.proj(y) +class SmearGate(nn.Module): + def __init__(self, dim: int): + super().__init__() + self.gate = nn.Parameter(torch.zeros(dim, dtype=torch.float32)) + def forward(self, x: Tensor) -> Tensor: + g = torch.sigmoid(self.gate.to(dtype=x.dtype))[None, None, :] + x_prev = torch.cat([torch.zeros_like(x[:, :1]), x[:, :-1]], dim=1) + return (1 - g) * x + g * x_prev +class BigramHashEmbedding(nn.Module): + def __init__(self, bigram_vocab_size: int, bigram_dim: int, model_dim: int): + super().__init__() + self.bigram_vocab_size = bigram_vocab_size + self.embed = nn.Embedding(bigram_vocab_size, bigram_dim) + nn.init.zeros_(self.embed.weight) + self.proj = CastedLinear(bigram_dim, model_dim, bias=False) if bigram_dim != model_dim else None + if self.proj is not None: + nn.init.zeros_(self.proj.weight) + self.scale = nn.Parameter(torch.tensor(0.05, dtype=torch.float32)) + def bigram_hash(self, tokens: Tensor) -> Tensor: + t = tokens.to(torch.int32) + mod = self.bigram_vocab_size - 1 + out = torch.empty_like(t) + out[..., 0] = mod + out[..., 1:] = torch.bitwise_xor(36313 * t[..., 1:], 27191 * t[..., :-1]) % mod + return out.long() + def forward(self, token_ids: Tensor) -> Tensor: + h = self.embed(self.bigram_hash(token_ids)) + if self.proj is not None: + h = self.proj(h) + return h * self.scale.to(dtype=h.dtype) +class ValueEmbedding(nn.Module): + """Reinject token identity into attention values at specific layers. + Each table maps vocab tokens to a low-dim embedding, projected to model_dim.""" + def __init__(self, vocab_size: int, ve_dim: int, model_dim: int): + super().__init__() + self.embed = nn.Embedding(vocab_size, ve_dim) + nn.init.normal_(self.embed.weight, std=0.01) + self.proj = CastedLinear(ve_dim, model_dim, bias=False) if ve_dim != model_dim else None + if self.proj is not None: + nn.init.zeros_(self.proj.weight) + self.scale = nn.Parameter(torch.tensor(0.1, dtype=torch.float32)) + def forward(self, token_ids: Tensor) -> Tensor: + h = self.embed(token_ids) + if self.proj is not None: + h = self.proj(h) + return h * self.scale.to(dtype=h.dtype) +class MLP(nn.Module): + def __init__(self, dim: int, mlp_mult: int, mlp_act: str = "relu_sq", mlp_leaky_slope: float = 0.5): + super().__init__() + hidden = int(mlp_mult * dim) + self.fc = CastedLinear(dim, hidden, bias=False) + self.proj = CastedLinear(hidden, dim, bias=False) + self.proj._zero_init = True + self.mlp_act = mlp_act + self.mlp_leaky_slope = mlp_leaky_slope + if self.mlp_act not in {"relu_sq", "leaky_relu_sq"}: + raise ValueError(f"Unsupported MLP_ACT '{self.mlp_act}'. Use 'relu_sq' or 'leaky_relu_sq'.") + def forward(self, x: Tensor) -> Tensor: + x = self.fc(x) + if self.mlp_act == "leaky_relu_sq": + x = F.leaky_relu(x, negative_slope=self.mlp_leaky_slope) + else: + x = F.relu(x) + return self.proj(x.square()) +class Block(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + num_kv_heads: int, + mlp_mult: int, + rope_base: float, + qk_gain_init: float, + layer_idx: int = 0, + ln_scale: bool = False, + dtg: bool = False, + mlp_act: str = "relu_sq", + mlp_leaky_slope: float = 0.5, + ): + super().__init__() + self.attn_norm = RMSNorm() + self.mlp_norm = RMSNorm() + self.attn = CausalSelfAttention(dim, num_heads, num_kv_heads, rope_base, qk_gain_init) + self.mlp = MLP(dim, mlp_mult, mlp_act=mlp_act, mlp_leaky_slope=mlp_leaky_slope) + self.attn_scale = nn.Parameter(torch.ones(dim, dtype=torch.float32)) + self.mlp_scale = nn.Parameter(torch.ones(dim, dtype=torch.float32)) + self.resid_mix = nn.Parameter(torch.stack((torch.ones(dim), torch.zeros(dim))).float()) + self.ln_scale_factor = 1.0 / math.sqrt(layer_idx + 1) if ln_scale else 1.0 + if dtg: + self.dtg_gate = nn.Linear(dim, 1, bias=True) + nn.init.zeros_(self.dtg_gate.weight) + nn.init.constant_(self.dtg_gate.bias, 2.0) + else: + self.dtg_gate = None + def forward(self, x: Tensor, x0: Tensor, v_embed: Tensor | None = None) -> Tensor: + mix = self.resid_mix.to(dtype=x.dtype) + x_in = mix[0][None, None, :] * x + mix[1][None, None, :] * x0 + attn_out = self.attn(self.attn_norm(x_in) * self.ln_scale_factor, v_embed=v_embed) + x_out = x_in + self.attn_scale.to(dtype=x_in.dtype)[None, None, :] * attn_out + x_out = x_out + self.mlp_scale.to(dtype=x_out.dtype)[None, None, :] * self.mlp(self.mlp_norm(x_out) * self.ln_scale_factor) + if self.dtg_gate is not None: + gate = torch.sigmoid(self.dtg_gate(x_in.detach())) + x_out = x_in + gate * (x_out - x_in) + return x_out +class GPT(nn.Module): + def __init__( + self, + vocab_size: int, + num_layers: int, + model_dim: int, + num_heads: int, + num_kv_heads: int, + mlp_mult: int, + tie_embeddings: bool, + tied_embed_init_std: float, + logit_softcap: float, + rope_base: float, + qk_gain_init: float, + mtp_num_heads: int = 0, + mtp_loss_weight: float = 0.1, + bigram_vocab_size: int = 0, + bigram_dim: int = 128, + xsa_last_n: int = 0, + rope_dims: int = 0, + ln_scale: bool = False, + dtg: bool = False, + ve_enabled: bool = False, + ve_dim: int = 128, + ve_layers: str = "9,10", + mlp_act: str = "relu_sq", + mlp_leaky_slope: float = 0.5, + f1_corr_rank: int = 0, + f1_corr_scale_init: float = 0.10, + ): + super().__init__() + self._ve_target_dim = num_kv_heads * (model_dim // num_heads) # kv_dim for value projection + if logit_softcap <= 0.0: + raise ValueError(f"logit_softcap must be positive, got {logit_softcap}") + self.tie_embeddings = tie_embeddings + self.tied_embed_init_std = tied_embed_init_std + self.logit_softcap = logit_softcap + self.mtp_num_heads = mtp_num_heads + self.mtp_loss_weight = mtp_loss_weight + self.tok_emb = nn.Embedding(vocab_size, model_dim) + self.bigram = BigramHashEmbedding(bigram_vocab_size, bigram_dim, model_dim) if bigram_vocab_size > 0 else None + self.smear = SmearGate(model_dim) + self.num_encoder_layers = num_layers // 2 + self.num_decoder_layers = num_layers - self.num_encoder_layers + self.num_skip_weights = min(self.num_encoder_layers, self.num_decoder_layers) + self.skip_weights = nn.Parameter(torch.ones(self.num_skip_weights, model_dim, dtype=torch.float32)) + self.blocks = nn.ModuleList( + [ + Block( + model_dim, + num_heads, + num_kv_heads, + mlp_mult, + rope_base, + qk_gain_init, + layer_idx=i, + ln_scale=ln_scale, + dtg=dtg, + mlp_act=mlp_act, + mlp_leaky_slope=mlp_leaky_slope, + ) + for i in range(num_layers) + ] + ) + if rope_dims > 0: + head_dim = model_dim // num_heads + for block in self.blocks: + block.attn.rope_dims = rope_dims + block.attn.rotary = Rotary(head_dim, base=rope_base, train_seq_len=1024, rope_dims=rope_dims) + self.ve_layer_indices = [int(x) for x in ve_layers.split(",") if x.strip()] if ve_enabled else [] + kv_dim = self._ve_target_dim + if self.ve_layer_indices: + self.ve_shared = ValueEmbedding(vocab_size, ve_dim, kv_dim) + self.ve_layer_scales = nn.ParameterList( + [nn.Parameter(torch.ones(1, dtype=torch.float32)) for _ in self.ve_layer_indices] + ) + else: + self.ve_shared = None + self.ve_layer_scales = nn.ParameterList() + self.value_embeds = nn.ModuleList() # keep empty for compat + self.final_norm = RMSNorm() + self.lm_head = None if tie_embeddings else CastedLinear(model_dim, vocab_size, bias=False) + if self.lm_head is not None: + self.lm_head._zero_init = True + self.mtp_heads = nn.ModuleList( + [CastedLinear(model_dim, vocab_size, bias=False) for _ in range(mtp_num_heads)] + ) + for head in self.mtp_heads: + head._zero_init = True + # Low-rank correction path for extra capacity under size budget. + self.f1_corr_rank = f1_corr_rank + if f1_corr_rank > 0: + self.f1_corr_in = CastedLinear(model_dim, f1_corr_rank, bias=False) + self.f1_corr_out = CastedLinear(f1_corr_rank, vocab_size, bias=False) + self.f1_corr_out._zero_init = True + self.f1_corr_scale = nn.Parameter(torch.tensor(f1_corr_scale_init, dtype=torch.float32)) + else: + self.f1_corr_in = None + self.f1_corr_out = None + self.f1_corr_scale = None + if xsa_last_n > 0: + for i in range(max(0, num_layers - xsa_last_n), num_layers): + self.blocks[i].attn.use_xsa = True + self._init_weights() + def _init_weights(self) -> None: + if self.tie_embeddings: + nn.init.normal_(self.tok_emb.weight, mean=0.0, std=self.tied_embed_init_std) + num_layers = len(self.blocks) + for name, module in self.named_modules(): + if isinstance(module, nn.Linear): + if getattr(module, "_zero_init", False): + nn.init.zeros_(module.weight) + elif module.weight.ndim == 2 and module.weight.shape[0] >= 64 and module.weight.shape[1] >= 64: + nn.init.orthogonal_(module.weight, gain=1.0) + if ".proj." in name or name.endswith(".proj"): + with torch.no_grad(): + module.weight.mul_(1.0 / math.sqrt(2 * num_layers)) + def _get_ve(self, layer_idx: int, input_ids: Tensor, ve_cache: dict | None = None) -> Tensor | None: + """Get value embedding for a specific layer using shared table + per-layer scale.""" + if self.ve_shared is None or layer_idx not in self.ve_layer_indices: + return None + if ve_cache is not None and 've' not in ve_cache: + ve_cache['ve'] = self.ve_shared(input_ids) + ve_base = ve_cache['ve'] if ve_cache is not None else self.ve_shared(input_ids) + ve_idx = self.ve_layer_indices.index(layer_idx) + return ve_base * self.ve_layer_scales[ve_idx].to(dtype=ve_base.dtype) + def forward(self, input_ids: Tensor, target_ids: Tensor) -> Tensor: + x = self.tok_emb(input_ids) + if self.bigram is not None: + x = x + self.bigram(input_ids) + x = F.rms_norm(x, (x.size(-1),)) + x = self.smear(x) + x0 = x + skips: list[Tensor] = [] + ve_cache: dict = {} + for i in range(self.num_encoder_layers): + ve = self._get_ve(i, input_ids, ve_cache) + x = self.blocks[i](x, x0, v_embed=ve) + skips.append(x) + for i in range(self.num_decoder_layers): + bi = self.num_encoder_layers + i + if skips: + x = x + self.skip_weights[i].to(dtype=x.dtype)[None, None, :] * skips.pop() + ve = self._get_ve(bi, input_ids, ve_cache) + x = self.blocks[bi](x, x0, v_embed=ve) + x = self.final_norm(x) + x_flat = x.reshape(-1, x.size(-1)) + targets = target_ids.reshape(-1) + if self.tie_embeddings: + logits_proj = F.linear(x_flat, self.tok_emb.weight) + else: + if self.lm_head is None: + raise RuntimeError("lm_head is required when tie_embeddings=False") + logits_proj = self.lm_head(x_flat) + if self.f1_corr_in is not None and self.f1_corr_out is not None and self.f1_corr_scale is not None: + corr_hidden = F.silu(self.f1_corr_in(x_flat)) + corr_proj = self.f1_corr_out(corr_hidden) + logits_proj = logits_proj + self.f1_corr_scale.to(dtype=logits_proj.dtype) * corr_proj + logits = self.logit_softcap * torch.tanh(logits_proj / self.logit_softcap) + if hasattr(self, '_ngram_tracker') and self._ngram_tracker is not None and self.training: + per_tok_loss = F.cross_entropy(logits.float(), targets, reduction="none") + weights = self._ngram_tracker.get_weights(input_ids, target_ids) + main_loss = (per_tok_loss * weights).mean() + else: + main_loss = F.cross_entropy(logits.float(), targets, reduction="mean") + if self.training and self.mtp_num_heads > 0 and self.mtp_loss_weight > 0.0: + _, seqlen, dim = x.shape + mtp_loss_sum = x.new_zeros(()) + mtp_loss_count = 0 + for k, mtp_head in enumerate(self.mtp_heads): + valid_t = seqlen - (k + 1) + if valid_t <= 0: + continue + mtp_hidden = x[:, :valid_t, :].reshape(-1, dim) + mtp_targets = target_ids[:, k + 1 :].reshape(-1) + mtp_logits_proj = mtp_head(mtp_hidden) + mtp_logits = self.logit_softcap * torch.tanh(mtp_logits_proj / self.logit_softcap) + mtp_loss_sum = mtp_loss_sum + F.cross_entropy(mtp_logits.float(), mtp_targets, reduction="mean") + mtp_loss_count += 1 + if mtp_loss_count > 0: + main_loss = main_loss + self.mtp_loss_weight * (mtp_loss_sum / mtp_loss_count) + return main_loss + def forward_logits(self, input_ids: Tensor) -> Tensor: + """Return logits (bsz, seq_len, vocab) without computing loss.""" + x = self.tok_emb(input_ids) + if self.bigram is not None: + x = x + self.bigram(input_ids) + x = F.rms_norm(x, (x.size(-1),)) + x = self.smear(x) + x0 = x + skips: list[Tensor] = [] + ve_cache: dict = {} + for i in range(self.num_encoder_layers): + ve = self._get_ve(i, input_ids, ve_cache) + x = self.blocks[i](x, x0, v_embed=ve) + skips.append(x) + for i in range(self.num_decoder_layers): + bi = self.num_encoder_layers + i + if skips: + x = x + self.skip_weights[i].to(dtype=x.dtype)[None, None, :] * skips.pop() + ve = self._get_ve(bi, input_ids, ve_cache) + x = self.blocks[bi](x, x0, v_embed=ve) + x = self.final_norm(x) + if self.tie_embeddings: + logits_proj = F.linear(x, self.tok_emb.weight) + else: + logits_proj = self.lm_head(x) + if self.f1_corr_in is not None and self.f1_corr_out is not None and self.f1_corr_scale is not None: + corr_hidden = F.silu(self.f1_corr_in(x)) + corr_proj = self.f1_corr_out(corr_hidden) + logits_proj = logits_proj + self.f1_corr_scale.to(dtype=logits_proj.dtype) * corr_proj + return self.logit_softcap * torch.tanh(logits_proj / self.logit_softcap) +def eval_val_sliding( + args: Hyperparameters, + base_model: nn.Module, + rank: int, + world_size: int, + device: torch.device, + val_tokens: Tensor, + base_bytes_lut: Tensor, + has_leading_space_lut: Tensor, + is_boundary_token_lut: Tensor, + stride: int, + batch_seqs: int = 128, + eval_seq_len: int | None = None, +) -> tuple[float, float]: + """Sliding window evaluation: each token scored with maximum context.""" + seq_len = eval_seq_len or args.train_seq_len + total_tokens = val_tokens.numel() - 1 + window_starts = [ws for ws in range(0, total_tokens, stride) + if min(ws + seq_len, total_tokens) - ws >= 1] + total_windows = len(window_starts) + my_s = (total_windows * rank) // world_size + my_e = (total_windows * (rank + 1)) // world_size + my_windows = window_starts[my_s:my_e] + loss_sum = torch.zeros((), device=device, dtype=torch.float64) + token_count = torch.zeros((), device=device, dtype=torch.float64) + byte_count = torch.zeros((), device=device, dtype=torch.float64) + base_model.eval() + compiled_logits = maybe_torch_compile(base_model.forward_logits, args) + with torch.inference_mode(): + for bi in range(0, len(my_windows), batch_seqs): + batch_ws = my_windows[bi:bi + batch_seqs] + bsz = len(batch_ws) + x_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + y_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + wlens: list[int] = [] + for i, ws in enumerate(batch_ws): + end = min(ws + seq_len, total_tokens) + wlen = end - ws + wlens.append(wlen) + chunk = val_tokens[ws:end + 1].to(dtype=torch.int64, device=device) + x_batch[i, :wlen] = chunk[:-1] + y_batch[i, :wlen] = chunk[1:] + with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + logits = compiled_logits(x_batch) + nll = F.cross_entropy( + logits.reshape(-1, logits.size(-1)).float(), + y_batch.reshape(-1), + reduction="none", + ).reshape(bsz, seq_len) + for i, ws in enumerate(batch_ws): + wlen = wlens[i] + s = 0 if ws == 0 else max(wlen - stride, 0) + scored_nll = nll[i, s:wlen].to(torch.float64) + loss_sum += scored_nll.sum() + token_count += float(wlen - s) + tgt = y_batch[i, s:wlen] + prev = x_batch[i, s:wlen] + tb = base_bytes_lut[tgt].to(torch.float64) + tb += (has_leading_space_lut[tgt] & ~is_boundary_token_lut[prev]).to(torch.float64) + byte_count += tb.sum() + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(loss_sum, op=dist.ReduceOp.SUM) + dist.all_reduce(token_count, op=dist.ReduceOp.SUM) + dist.all_reduce(byte_count, op=dist.ReduceOp.SUM) + val_loss = (loss_sum / token_count).item() + bits_per_token = val_loss / math.log(2.0) + tokens_per_byte = token_count.item() / byte_count.item() + base_model.train() + return val_loss, bits_per_token * tokens_per_byte + +def eval_val_sliding_ttt( + args: Hyperparameters, base_model: nn.Module, rank: int, world_size: int, + device: torch.device, val_tokens: Tensor, base_bytes_lut: Tensor, + has_leading_space_lut: Tensor, is_boundary_token_lut: Tensor, + stride: int, batch_seqs: int = 32, log0=print, +) -> tuple[float, float]: + """Legal score-first TTT: score each chunk with sliding windows, + then train on it. Every token scored BEFORE any update that could use it.""" + seq_len = args.train_seq_len + total_tokens = val_tokens.numel() - 1 + ttt_chunk = args.ttt_chunk_tokens + + window_starts = [ws for ws in range(0, total_tokens, stride) + if min(ws + seq_len, total_tokens) - ws >= stride or ws == 0] + + num_chunks = (total_tokens + ttt_chunk - 1) // ttt_chunk + chunk_windows: list[list[int]] = [[] for _ in range(num_chunks)] + for ws in window_starts: + end = min(ws + seq_len, total_tokens) + wlen = end - ws + s = 0 if ws == 0 else max(wlen - stride, 0) + scored_start = ws + s + ci = min(scored_start // ttt_chunk, num_chunks - 1) + chunk_windows[ci].append(ws) + + log0(f"ttt_sliding:start chunks={num_chunks} chunk_tokens={ttt_chunk} " + f"total_windows={len(window_starts)} stride={stride} " + f"ttt_lr={args.ttt_lr} ttt_epochs={args.ttt_epochs} " + f"freeze_blocks={args.ttt_freeze_blocks}") + + loss_sum = torch.zeros((), device=device, dtype=torch.float64) + token_count = torch.zeros((), device=device, dtype=torch.float64) + byte_count = torch.zeros((), device=device, dtype=torch.float64) + + # Freeze first N blocks + frozen_block_ids = set(range(min(args.ttt_freeze_blocks, len(base_model.blocks)))) + ttt_params = [] + for name, p in base_model.named_parameters(): + freeze = False + for bi in frozen_block_ids: + if f"blocks.{bi}." in name: + freeze = True + break + if freeze: + p.requires_grad_(False) + else: + p.requires_grad_(True) + ttt_params.append(p) + + log0(f"ttt_sliding:params unfrozen={sum(p.numel() for p in ttt_params)} " + f"frozen={sum(p.numel() for p in base_model.parameters() if not p.requires_grad)}") + + optimizer = torch.optim.SGD(ttt_params, lr=args.ttt_lr, momentum=args.ttt_momentum) + t0 = time.perf_counter() + + for ci in range(num_chunks): + windows = chunk_windows[ci] + if not windows: + continue + chunk_start = ci * ttt_chunk + chunk_end = min((ci + 1) * ttt_chunk, total_tokens) + + # --- Phase 1: SCORE (inference_mode) --- + my_s = (len(windows) * rank) // world_size + my_e = (len(windows) * (rank + 1)) // world_size + my_windows = windows[my_s:my_e] + + base_model.eval() + with torch.inference_mode(): + for bi in range(0, len(my_windows), batch_seqs): + batch_ws = my_windows[bi:bi + batch_seqs] + bsz = len(batch_ws) + x_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + y_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + wlens: list[int] = [] + for i, ws in enumerate(batch_ws): + end = min(ws + seq_len, total_tokens) + wlen = end - ws + wlens.append(wlen) + chunk_tok = val_tokens[ws:end + 1].to(dtype=torch.int64, device=device) + x_batch[i, :wlen] = chunk_tok[:-1] + y_batch[i, :wlen] = chunk_tok[1:] + with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + logits = base_model.forward_logits(x_batch) + nll = F.cross_entropy( + logits.reshape(-1, logits.size(-1)).float(), + y_batch.reshape(-1), reduction="none", + ).reshape(bsz, seq_len) + for i, ws in enumerate(batch_ws): + wlen = wlens[i] + s = 0 if ws == 0 else max(wlen - stride, 0) + scored_nll = nll[i, s:wlen].to(torch.float64) + loss_sum += scored_nll.sum() + token_count += float(wlen - s) + tgt, prev = y_batch[i, s:wlen], x_batch[i, s:wlen] + tb = base_bytes_lut[tgt].to(torch.float64) + tb += (has_leading_space_lut[tgt] & ~is_boundary_token_lut[prev]).to(torch.float64) + byte_count += tb.sum() + + # --- Phase 2: TRAIN on scored chunk (legal) --- + is_last_chunk = (ci == num_chunks - 1) + if not is_last_chunk and args.ttt_epochs > 0: + base_model.train() + chunk_seqs = (chunk_end - chunk_start) // seq_len + if chunk_seqs > 0: + cos_lr = args.ttt_lr * 0.5 * (1.0 + math.cos(math.pi * ci / max(num_chunks - 1, 1))) + for pg in optimizer.param_groups: + pg['lr'] = cos_lr + my_seq_s = (chunk_seqs * rank) // world_size + my_seq_e = (chunk_seqs * (rank + 1)) // world_size + my_chunk_seqs = my_seq_e - my_seq_s + for _ep in range(args.ttt_epochs): + for bs in range(0, my_chunk_seqs, args.ttt_batch_seqs): + be = min(bs + args.ttt_batch_seqs, my_chunk_seqs) + actual_bs = my_seq_s + bs + start_tok = chunk_start + actual_bs * seq_len + end_tok = chunk_start + (my_seq_s + be) * seq_len + 1 + if end_tok > val_tokens.numel(): + continue + local = val_tokens[start_tok:end_tok].to(device=device, dtype=torch.int64) + x = local[:-1].reshape(-1, seq_len) + y = local[1:].reshape(-1, seq_len) + optimizer.zero_grad(set_to_none=True) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + loss = base_model(x, y) + loss.backward() + if world_size > 1: + for p in ttt_params: + if p.grad is not None: + dist.all_reduce(p.grad, op=dist.ReduceOp.AVG) + torch.nn.utils.clip_grad_norm_(ttt_params, args.ttt_grad_clip) + optimizer.step() + + if rank == 0 and (ci % 10 == 0 or ci == num_chunks - 1): + elapsed = time.perf_counter() - t0 + rl = loss_sum.item() / max(token_count.item(), 1) + rbpb = rl / math.log(2.0) * (token_count.item() / max(byte_count.item(), 1)) if token_count.item() > 0 else 0.0 + log0(f" ttt_chunk [{ci+1}/{num_chunks}] bpb={rbpb:.6f} time={elapsed:.1f}s") + + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(loss_sum, op=dist.ReduceOp.SUM) + dist.all_reduce(token_count, op=dist.ReduceOp.SUM) + dist.all_reduce(byte_count, op=dist.ReduceOp.SUM) + + val_loss = (loss_sum / token_count).item() + val_bpb = val_loss / math.log(2.0) * (token_count.item() / byte_count.item()) + + for p in base_model.parameters(): + p.requires_grad_(True) + base_model.eval() + + log0(f"ttt_sliding:done val_loss={val_loss:.6f} val_bpb={val_bpb:.6f} " + f"elapsed={time.perf_counter() - t0:.1f}s") + return val_loss, val_bpb + +def _ngram_bulk_update(val_np, start, end, ctx_tables, full_tables, + min_order, max_order, primes, mask): + """Bulk update n-gram tables with a contiguous range of tokens. + All ranks call this with the SAME token range -> identical tables everywhere.""" + t = val_np[start:end].astype(np.uint64) + n = len(t) + for order in range(min_order, max_order + 1): + if n < order: + continue + ctx_width = order - 1 + ctx_hash = np.zeros(n - order + 1, dtype=np.uint64) + for k in range(ctx_width): + ctx_hash ^= t[k:n - order + 1 + k] * primes[k % len(primes)] + ctx_key = (ctx_hash & mask).astype(np.int64) + tgt = t[order - 1:] + full_key = ((ctx_hash ^ (tgt * primes[ctx_width % len(primes)])) & mask).astype(np.int64) + ctx_tables[order] += np.bincount(ctx_key, minlength=len(ctx_tables[order])).astype(np.uint32) + full_tables[order] += np.bincount(full_key, minlength=len(full_tables[order])).astype(np.uint32) + +def eval_val_sliding_hashed_ngram( + args: Hyperparameters, + base_model: nn.Module, + rank: int, + world_size: int, + device: torch.device, + val_tokens: Tensor, + base_bytes_lut: Tensor, + has_leading_space_lut: Tensor, + is_boundary_token_lut: Tensor, + stride: int, + order: int, + alpha: float, + min_count: int, + buckets: int, + max_seconds: float = 0.0, + batch_seqs: int = 128, + eval_seq_len: int | None = None, +) -> tuple[float, float, float]: + """Score-first sliding eval with chunk-based SHARED n-gram tables + cubric. + + Key design: all ranks share identical n-gram tables via bulk chunk updates. + Each chunk's windows are distributed across ranks for scoring, then ALL ranks + update tables with the same contiguous token range. Every rank sees the full + n-gram picture (not 1/world_size like per-segment updates). + + Legal: entire chunk scored before its tokens update the tables. + """ + min_order = max(args.ngram_eval_min_order, 2) + max_order = max(order, min_order) + adaptive = args.ngram_eval_adaptive + alpha_min = args.ngram_eval_alpha_min + alpha_max = args.ngram_eval_alpha_max + ent_center = args.ngram_eval_entropy_center + ent_scale = args.ngram_eval_entropy_scale + + # Parse fixed per-order multipliers (PR #809 style) + _fixed_order_mults = None + if args.ngram_order_mults_str: + _fixed_order_mults = np.array([float(x) for x in args.ngram_order_mults_str.split(",")], dtype=np.float64) + + seq_len = eval_seq_len or args.train_seq_len + total_tokens = val_tokens.numel() - 1 + + # Build all windows and total scored tokens + all_window_starts = [ws for ws in range(0, total_tokens, stride) if min(ws + seq_len, total_tokens) - ws >= 1] + total_scored_tokens = 0.0 + for ws in all_window_starts: + end = min(ws + seq_len, total_tokens) + wlen = end - ws + s = 0 if ws == 0 else max(wlen - stride, 0) + total_scored_tokens += float(max(wlen - s, 0)) + + # Group windows into chunks by scored position -- all ranks share this grouping + chunk_tokens = int(os.environ.get("NGRAM_CHUNK_TOKENS", "1048576")) # 1M default + num_chunks = (total_tokens + chunk_tokens - 1) // chunk_tokens + chunk_windows: list[list[int]] = [[] for _ in range(num_chunks)] + for ws in all_window_starts: + end = min(ws + seq_len, total_tokens) + wlen = end - ws + s = 0 if ws == 0 else max(wlen - stride, 0) + scored_start = ws + s + ci = min(scored_start // chunk_tokens, num_chunks - 1) + chunk_windows[ci].append(ws) + + val_np = val_tokens.numpy() + ctx_tables = {n: np.zeros((buckets,), dtype=np.uint32) for n in range(min_order, max_order + 1)} + full_tables = {n: np.zeros((buckets,), dtype=np.uint32) for n in range(min_order, max_order + 1)} + mask = np.uint64(buckets - 1) + primes = np.array( + [np.uint64(36313), np.uint64(27191), np.uint64(51647), np.uint64(81929), + np.uint64(131071), np.uint64(174763), np.uint64(233017)], + dtype=np.uint64, + ) + + loss_sum = 0.0 + token_count = 0.0 + byte_count = 0.0 + + # Cubric 3D: per (order × entropy_bin × count_bin) adaptive alpha scaling + _NUM_ENT_BINS = 3 # low / mid / high entropy + _NUM_CNT_BINS = 3 # low / mid / high count + _ENT_EDGES = np.array([ent_center - 1.0, ent_center + 1.0]) # [2.0, 4.0] for center=3.0 + _CNT_EDGES = np.array([5.0, 50.0]) # low=<5, mid=5-50, high=>50 context count + _TOTAL_CELLS = _NUM_ENT_BINS * _NUM_CNT_BINS # 9 cells per order = 54 total + _cc = getattr(args, 'cubric_cadence', 0); _con = _cc > 0; _cfired = 0 + if _con: + # Warm-start: proven converged values from 4+ runs (orders 2-7) + # All 9 cells per order get the same warm-start, 3D cubric refines from there + _WARM = {2: 0.45, 3: 0.30, 4: 0.45, 5: 1.88, 6: 2.00, 7: 2.00, 8: 2.00, 9: 2.00} + _c_alpha_mult = {n: [_WARM.get(n, 1.0)] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + _c_hits = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + _c_beats = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + + base_model.eval() + compiled_logits = maybe_torch_compile(base_model.forward_logits, args) + t0 = time.perf_counter() + deadline = (t0 + max_seconds) if max_seconds > 0.0 else None + cutoff_hit = False + + if rank == 0: + print(f"ngram_eval:chunks={num_chunks} chunk_tokens={chunk_tokens} " + f"windows={len(all_window_starts)} shared_tables=True", flush=True) + + with torch.inference_mode(): + for ci in range(num_chunks): + if deadline is not None and time.perf_counter() >= deadline: + cutoff_hit = True + break + + windows = chunk_windows[ci] + if not windows: + continue + + # Distribute this chunk's windows across ranks + my_s = (len(windows) * rank) // world_size + my_e = (len(windows) * (rank + 1)) // world_size + my_windows = windows[my_s:my_e] + + # --- Phase 1: SCORE this chunk's windows --- + for bi in range(0, len(my_windows), batch_seqs): + batch_ws = my_windows[bi:bi + batch_seqs] + bsz = len(batch_ws) + x_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + y_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + wlens: list[int] = [] + for i, ws in enumerate(batch_ws): + end = min(ws + seq_len, total_tokens) + wlen = end - ws + wlens.append(wlen) + chunk = val_tokens[ws:end + 1].to(dtype=torch.int64, device=device) + x_batch[i, :wlen] = chunk[:-1] + y_batch[i, :wlen] = chunk[1:] + + with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + logits = compiled_logits(x_batch) + logits_f = logits.float() + nll = F.cross_entropy( + logits_f.reshape(-1, logits_f.size(-1)), + y_batch.reshape(-1), + reduction="none", + ).reshape(bsz, seq_len) + + for i, ws in enumerate(batch_ws): + wlen = wlens[i] + s = 0 if ws == 0 else max(wlen - stride, 0) + seg_len = wlen - s + if seg_len <= 0: + continue + + seg_nll = nll[i, s:wlen].to(torch.float64).cpu().numpy() + seg_model_p = np.exp(-seg_nll) + + if adaptive: + log_probs = F.log_softmax(logits_f[i, s:wlen], dim=-1) + probs_a = log_probs.exp() + entropy = -(probs_a * log_probs).sum(dim=-1).cpu().numpy() + sig = 1.0 / (1.0 + np.exp(-ent_scale * (entropy - ent_center))) + per_token_alpha = alpha_min + (alpha_max - alpha_min) * sig + # Bin entropy for 2D cubric: 0=low, 1=mid, 2=high + _ent_bins = np.digitize(entropy, _ENT_EDGES).astype(np.int32) + else: + per_token_alpha = np.full(seg_len, alpha) + _ent_bins = np.ones(seg_len, dtype=np.int32) # all mid + + global_j = np.arange(ws + s + 1, ws + wlen + 1, dtype=np.int64) + p_ng = np.zeros(seg_len, dtype=np.float64) + ng_matched = np.zeros(seg_len, dtype=np.bool_) + _ng_ord = np.zeros(seg_len, dtype=np.int32) + _ng_ctx_count = np.zeros(seg_len, dtype=np.float64) + tgt_np = val_np[global_j].astype(np.uint64) + + for n in range(max_order, min_order - 1, -1): + ctx_width = n - 1 + valid = (global_j >= ctx_width) & (~ng_matched) + if not valid.any(): + continue + v_idx = np.nonzero(valid)[0] + jv = global_j[v_idx] + ctx_hash = np.zeros(len(jv), dtype=np.uint64) + for k in range(ctx_width): + tok = val_np[jv - (ctx_width - k)].astype(np.uint64) + ctx_hash ^= tok * primes[k % len(primes)] + ctx_key = (ctx_hash & mask).astype(np.int64) + full_key = ((ctx_hash ^ (tgt_np[v_idx] * primes[ctx_width % len(primes)])) & mask).astype(np.int64) + ctx_counts = ctx_tables[n][ctx_key].astype(np.float64) + full_counts = full_tables[n][full_key].astype(np.float64) + has_data = ctx_counts >= float(min_count) + if has_data.any(): + p = np.minimum(full_counts, ctx_counts) / np.maximum(ctx_counts, 1.0) + p = np.clip(p, 0.0, 1.0) + hit_idx = v_idx[has_data] + p_ng[hit_idx] = p[has_data] + ng_matched[hit_idx] = True + _ng_ord[hit_idx] = n + _ng_ctx_count[hit_idx] = ctx_counts[has_data] + + # Mix where n-gram matched (PR #809 style or cubric 3D fallback) + if ng_matched.any(): + m_idx = np.nonzero(ng_matched)[0] + # Per-order entropy center shift (PR #809) + if adaptive and args.ngram_entropy_shift: + matched_ords = _ng_ord[m_idx].astype(np.float64) + shifted_centers = ent_center - 0.25 * (matched_ords - float(min_order)) + shifted_sig = 1.0 / (1.0 + np.exp(-ent_scale * (entropy[m_idx] - shifted_centers))) + per_token_alpha[m_idx] = alpha_min + (alpha_max - alpha_min) * shifted_sig + if _fixed_order_mults is not None: + # PR #809 fixed order multipliers (replaces cubric) + a = per_token_alpha[m_idx].copy() + mult_indices = _ng_ord[m_idx] - min_order + mult_indices = np.clip(mult_indices, 0, len(_fixed_order_mults) - 1) + a *= _fixed_order_mults[mult_indices] + np.clip(a, 0.0, 0.95, out=a) + elif _con: + a = per_token_alpha[m_idx].copy() + m_ent_bins = _ent_bins[m_idx] + m_cnt_bins = np.digitize(_ng_ctx_count[m_idx], _CNT_EDGES).astype(np.int32) + for n in range(min_order, max_order + 1): + om = _ng_ord[m_idx] == n + if not om.any(): + continue + for eb in range(_NUM_ENT_BINS): + for cb in range(_NUM_CNT_BINS): + cell = eb * _NUM_CNT_BINS + cb + mask_ecb = om & (m_ent_bins == eb) & (m_cnt_bins == cb) + if mask_ecb.any(): + _c_hits[n][cell] += int(mask_ecb.sum()) + _c_beats[n][cell] += int((p_ng[m_idx[mask_ecb]] > seg_model_p[m_idx[mask_ecb]]).sum()) + a[mask_ecb] *= _c_alpha_mult[n][cell] + np.clip(a, 0.0, 0.95, out=a) + else: + a = per_token_alpha[m_idx] + seg_model_p[m_idx] = (1.0 - a) * seg_model_p[m_idx] + a * p_ng[m_idx] + + seg_nll = -np.log(np.clip(seg_model_p, 1e-12, 1.0)) + loss_sum += float(seg_nll.sum()) + token_count += float(seg_len) + tgt = y_batch[i, s:wlen] + prev = x_batch[i, s:wlen] + tb = base_bytes_lut[tgt].to(torch.float64) + tb += (has_leading_space_lut[tgt] & ~is_boundary_token_lut[prev]).to(torch.float64) + byte_count += float(tb.sum().item()) + + # --- Phase 2: SHARED UPDATE -- all ranks update with same chunk tokens --- + chunk_start = ci * chunk_tokens + chunk_end = min((ci + 1) * chunk_tokens, total_tokens) + _ngram_bulk_update(val_np, chunk_start, chunk_end + 1, + ctx_tables, full_tables, min_order, max_order, + primes, mask) + + # Cubric 2D c-step: adapt per (order × entropy_bin) + if _con: + # Collect all (order, ent_bin, cnt_bin) cells with enough data + all_rates = [] + for n in range(min_order, max_order + 1): + for cell in range(_TOTAL_CELLS): + if _c_hits[n][cell] >= 8: + all_rates.append(_c_beats[n][cell] / _c_hits[n][cell]) + if len(all_rates) >= 4: + avg_rate = sum(all_rates) / len(all_rates) + for n in range(min_order, max_order + 1): + for cell in range(_TOTAL_CELLS): + if _c_hits[n][cell] >= 8: + rate = _c_beats[n][cell] / _c_hits[n][cell] + if rate > avg_rate + 0.05: + _c_alpha_mult[n][cell] = min(_c_alpha_mult[n][cell] * 1.03, 2.0) + elif rate < avg_rate - 0.05: + _c_alpha_mult[n][cell] = max(_c_alpha_mult[n][cell] * 0.97, 0.3) + _cfired += 1 + if rank == 0 and _cfired % 8 == 0: + parts = [] + for n in range(min_order, max_order + 1): + m = _c_alpha_mult[n] + avg_m = sum(m) / len(m) + parts.append(f"o{n}:avg={avg_m:.2f}") + print(f"cubric3d:step={_cfired} {' '.join(parts)}", flush=True) + _c_hits = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + _c_beats = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + + # Progress + if rank == 0 and (ci % 10 == 0 or ci == num_chunks - 1 or ci < 3): + elapsed = time.perf_counter() - t0 + cur_bpb = (loss_sum / max(token_count, 1.0)) / math.log(2.0) * (token_count / max(byte_count, 1.0)) if token_count > 0 else 0.0 + print( + f"ngram_eval:chunk [{ci+1}/{num_chunks}] bpb={cur_bpb:.6f} t={elapsed:.0f}s", + flush=True, + ) + + # All-reduce across ranks + _loss = torch.tensor(loss_sum, device=device, dtype=torch.float64) + _toks = torch.tensor(token_count, device=device, dtype=torch.float64) + _bytes = torch.tensor(byte_count, device=device, dtype=torch.float64) + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(_loss, op=dist.ReduceOp.SUM) + dist.all_reduce(_toks, op=dist.ReduceOp.SUM) + dist.all_reduce(_bytes, op=dist.ReduceOp.SUM) + loss_sum = _loss.item() + token_count = _toks.item() + byte_count = _bytes.item() + + coverage = token_count / max(total_scored_tokens, 1.0) + if cutoff_hit: + elapsed = time.perf_counter() - t0 + print( + f"ngram_eval:cutoff max_seconds={max_seconds:.1f} " + f"coverage={coverage*100:.2f}% elapsed={elapsed:.0f}s", + flush=True, + ) + + if _con and rank == 0: + print(f"cubric3d:final c_steps={_cfired} cells={_TOTAL_CELLS}x{max_order-min_order+1}={_TOTAL_CELLS*(max_order-min_order+1)}", flush=True) + for n in range(min_order, max_order + 1): + m = _c_alpha_mult[n] + row = " ".join(f"{m[cell]:.2f}" for cell in range(_TOTAL_CELLS)) + print(f" o{n}: [{row}]", flush=True) + val_loss = loss_sum / max(token_count, 1.0) + val_bpb = val_loss / math.log(2.0) * (token_count / max(byte_count, 1.0)) + base_model.train() + return val_loss, val_bpb, coverage +def _classify_param(name: str) -> str: + if "tok_emb" in name or "lm_head" in name: + return "embed" + if "f1_corr_in" in name or "f1_corr_out" in name: + return "aux" + if ".mlp." in name: + return "mlp" + if ".attn." in name or (".proj." in name and ".mlp." not in name): + return "attn" + return "other" +# --------------------------------------------------------------------------- +# GPTQ: Hessian-aware quantization with column-wise error compensation +# --------------------------------------------------------------------------- +def _find_best_row_scales(W: Tensor, clip_range: int = 31) -> Tensor: + """Find optimal per-row scales by searching percentile clipping thresholds.""" + t32 = W.float() + best_s = t32.abs().amax(dim=1) / clip_range + best_s = best_s.clamp_min(1.0 / clip_range) + best_err = torch.full((t32.shape[0],), float('inf')) + for pct in [0.9990, 0.9995, 0.9999, 0.99999, 1.0]: + if pct < 1.0: + row_clip = torch.quantile(t32.abs(), pct, dim=1) + else: + row_clip = t32.abs().amax(dim=1) + s = (row_clip / clip_range).clamp_min(1.0 / clip_range) + q = torch.clamp(torch.round(t32 / s[:, None]), -clip_range, clip_range) + recon = q * s[:, None] + err = (t32 - recon).pow(2).mean(dim=1) + improved = err < best_err + best_s[improved] = s[improved] + best_err[improved] = err[improved] + return best_s +def gptq_quantize_weight(W: Tensor, H: Tensor, clip_range: int = 31, + block_size: int = 64, percdamp: float = 0.002) -> tuple[Tensor, Tensor]: + """GPTQ: quantize weight matrix W using Hessian H = X^T X for error compensation. + Uses pre-computed per-row scales and column reordering by Hessian diagonal. + Returns (quantized_int8, scale_fp16) in int6 range [-clip_range, clip_range].""" + W = W.float().clone() + rows, cols = W.shape + # Pre-compute optimal per-row scales from the original weight matrix + row_scale = _find_best_row_scales(W, clip_range) + H = H.float().clone() + damp = percdamp * H.diag().mean() + H.diagonal().add_(damp) + # Column reordering: process least-important columns first (ascending H_diag) + perm = torch.argsort(H.diag()) + invperm = torch.argsort(perm) + W = W[:, perm] + H = H[perm][:, perm] + try: + L = torch.linalg.cholesky(H) + Hinv = torch.cholesky_inverse(L) + except torch._C._LinAlgError: + Hinv = torch.diag(1.0 / H.diag().clamp_min(1e-6)) + Q = torch.zeros(rows, cols, dtype=torch.int8) + for i1 in range(0, cols, block_size): + i2 = min(i1 + block_size, cols) + W_block = W[:, i1:i2].clone() + Hinv_block = Hinv[i1:i2, i1:i2] + Err = torch.zeros_like(W_block) + for j in range(i2 - i1): + w_col = W_block[:, j] + h_inv_jj = Hinv_block[j, j].clamp_min(1e-8) + # Quantize using pre-computed per-row scales + q_col = torch.clamp(torch.round(w_col / row_scale), -clip_range, clip_range) + deq_col = q_col * row_scale + Q[:, i1 + j] = q_col.to(torch.int8) + err = (w_col - deq_col) / h_inv_jj + Err[:, j] = err + if j + 1 < i2 - i1: + W_block[:, j + 1:] -= err.unsqueeze(1) * Hinv_block[j, j + 1:].unsqueeze(0) + if i2 < cols: + W[:, i2:] -= Err @ Hinv[i1:i2, i2:] + # Undo column reordering + Q = Q[:, invperm] + return Q, row_scale.to(torch.float16) +def gptq_calibrate(model: nn.Module, train_pattern: str, device: torch.device, + n_samples: int = 256, seq_len: int = 2048) -> dict[str, Tensor]: + """Collect Hessian H = X^T X for each linear layer using training data.""" + hessians: dict[str, Tensor] = {} + n_seen: dict[str, int] = {} + hooks = [] + def make_hook(name: str): + def hook_fn(module, inp, out): + x = inp[0].detach().float() + if x.ndim == 3: + x = x.reshape(-1, x.shape[-1]) + if name not in hessians: + hessians[name] = torch.zeros(x.shape[1], x.shape[1], device=x.device, dtype=torch.float32) + n_seen[name] = 0 + hessians[name].addmm_(x.t(), x) + n_seen[name] += x.shape[0] + return hook_fn + for name, module in model.named_modules(): + if isinstance(module, (nn.Linear, CastedLinear)): + hooks.append(module.register_forward_hook(make_hook(name))) + stream = TokenStream(train_pattern) + model.eval() + with torch.no_grad(): + for _ in range(n_samples): + tokens = stream.take(seq_len + 1).to(device=device, dtype=torch.int64) + x = tokens[:-1].unsqueeze(0) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + model.forward_logits(x) + for h in hooks: + h.remove() + for name in hessians: + hessians[name] /= max(n_seen[name], 1) + return hessians +def mixed_quantize_int6_gptq(state_dict: dict[str, Tensor], int6_cats: set[str], + hessians: dict[str, Tensor]) -> tuple[dict, dict]: + """Like mixed_quantize_int6 but uses GPTQ for int6 categories when Hessian available.""" + result: dict[str, Tensor] = {} + meta: dict[str, object] = {} + gptq_count, naive_count = 0, 0 + for name, tensor in state_dict.items(): + t = tensor.detach().cpu().contiguous() + cat = _classify_param(name) + if not t.is_floating_point() or t.numel() <= 65536: + result[name] = t.to(torch.float16) if t.is_floating_point() else t + meta[name] = "passthrough" + continue + if any(p in name for p in CONTROL_TENSOR_NAME_PATTERNS): + result[name] = t.float() + meta[name] = "passthrough_ctrl" + continue + if cat in int6_cats and t.ndim == 2: + module_name = name.rsplit(".weight", 1)[0] if name.endswith(".weight") else name + H = hessians.get(module_name) + if H is not None and H.shape[0] == t.shape[1]: + q, s = gptq_quantize_weight(t, H.cpu()) + gptq_count += 1 + else: + q, s = quantize_int6_per_row(t) + naive_count += 1 + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int6"} + elif cat in int6_cats and t.ndim >= 1: + q, s = quantize_int6_per_row(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int6"} + naive_count += 1 + else: + q, s = quantize_float_tensor(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int8"} + print(f"gptq_quantize: {gptq_count} GPTQ layers, {naive_count} naive layers", flush=True) + return result, meta +def quantize_int6_per_row(t: Tensor, clip_range: int = 31) -> tuple[Tensor, Tensor]: + t32 = t.float() + if t32.ndim == 2: + best_q, best_s, best_err = None, None, float('inf') + for pct in [0.9990, 0.9995, 0.9999, 0.99999, 1.0]: + if pct < 1.0: + row_clip = torch.quantile(t32.abs(), pct, dim=1) + else: + row_clip = t32.abs().amax(dim=1) + s = (row_clip / clip_range).clamp_min(1.0 / clip_range).to(torch.float16) + q = torch.clamp(torch.round(t32 / s.float()[:, None]), -clip_range, clip_range).to(torch.int8) + recon = q.float() * s.float()[:, None] + err = (t32 - recon).pow(2).mean().item() + if err < best_err: + best_q, best_s, best_err = q, s, err + return best_q, best_s + amax = t32.abs().max().item() + scale = torch.tensor(amax / clip_range if amax > 0 else 1.0, dtype=torch.float16) + q = torch.clamp(torch.round(t32 / scale.float()), -clip_range, clip_range).to(torch.int8) + return q, scale +def mixed_quantize_int6(state_dict: dict[str, Tensor], int6_cats: set[str]): + num_layers_total = max( + (int(k.split(".")[1]) for k in state_dict if k.startswith("blocks.")), + default=0, + ) + 1 + late_k_layers = set(range(num_layers_total - 2, num_layers_total)) + result: dict[str, Tensor] = {} + meta: dict[str, object] = {} + for name, tensor in state_dict.items(): + t = tensor.detach().cpu().contiguous() + cat = _classify_param(name) + if not t.is_floating_point() or t.numel() <= 65536: + result[name] = t.to(torch.float16) if t.is_floating_point() else t + meta[name] = "passthrough" + continue + if any(p in name for p in CONTROL_TENSOR_NAME_PATTERNS): + result[name] = t.float() + meta[name] = "passthrough_ctrl" + continue + if cat in int6_cats and t.ndim >= 1: + q, s = quantize_int6_per_row(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int6"} + else: + q, s = quantize_float_tensor(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int8"} + return result, meta +def dequantize_mixed_int6(result: dict[str, Tensor], meta: dict[str, object], + template_sd: dict[str, Tensor]) -> dict[str, Tensor]: + out: dict[str, Tensor] = {} + for name, orig in template_sd.items(): + info = meta.get(name) + if info is None: + continue + orig_dtype = orig.dtype + if info in ("passthrough", "passthrough_ctrl", "passthrough_fp16"): + t = result[name] + if t.dtype == torch.float16 and orig_dtype in (torch.float32, torch.bfloat16): + t = t.to(orig_dtype) + out[name] = t + continue + q, s = result[name + ".q"], result[name + ".scale"] + if s.ndim > 0: + out[name] = (q.float() * s.float().view(q.shape[0], *([1] * (q.ndim - 1)))).to(orig_dtype) + else: + out[name] = (q.float() * float(s.item())).to(orig_dtype) + return out +def main() -> None: + global zeropower_via_newtonschulz5 + code = Path(__file__).read_text(encoding="utf-8") + args = Hyperparameters() + if args.compile_enabled: + zeropower_via_newtonschulz5 = torch.compile(zeropower_via_newtonschulz5) + distributed = "RANK" in os.environ and "WORLD_SIZE" in os.environ + rank = int(os.environ.get("RANK", "0")) + world_size = int(os.environ.get("WORLD_SIZE", "1")) + local_rank = int(os.environ.get("LOCAL_RANK", "0")) + if world_size <= 0: + raise ValueError(f"WORLD_SIZE must be positive, got {world_size}") + if 8 % world_size != 0: + raise ValueError(f"WORLD_SIZE={world_size} must divide 8 so grad_accum_steps stays integral") + grad_accum_steps = 8 // world_size + grad_scale = 1.0 / grad_accum_steps + if not torch.cuda.is_available(): + raise RuntimeError("CUDA is required") + device = torch.device("cuda", local_rank) + torch.cuda.set_device(device) + if distributed: + dist.init_process_group(backend="nccl", device_id=device) + dist.barrier() + master_process = rank == 0 + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True + from torch.backends.cuda import enable_cudnn_sdp, enable_flash_sdp, enable_math_sdp, enable_mem_efficient_sdp + enable_cudnn_sdp(False) + enable_flash_sdp(True) + enable_mem_efficient_sdp(False) + enable_math_sdp(False) + logfile = None + if master_process: + os.makedirs("logs", exist_ok=True) + logfile = f"logs/{args.run_id}.txt" + print(logfile) + def log0(msg: str, console: bool = True) -> None: + if not master_process: + return + if console: + print(msg) + if logfile is not None: + with open(logfile, "a", encoding="utf-8") as f: + print(msg, file=f) + log0(code, console=False) + log0("=" * 100, console=False) + log0(f"Running Python {sys.version}", console=False) + log0(f"Running PyTorch {torch.__version__}", console=False) + log0( + subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=False).stdout, + console=False, + ) + log0("=" * 100, console=False) + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.cuda.manual_seed_all(args.seed) + if not args.tokenizer_path.endswith(".model"): + raise ValueError(f"Script only setup for SentencePiece .model file: {args.tokenizer_path}") + sp = spm.SentencePieceProcessor(model_file=args.tokenizer_path) + if int(sp.vocab_size()) != args.vocab_size: + raise ValueError( + f"VOCAB_SIZE={args.vocab_size} does not match tokenizer vocab_size={int(sp.vocab_size())}" + ) + dataset_dir = Path(args.data_path).resolve() + actual_train_files = len(list(dataset_dir.glob("fineweb_train_*.bin"))) + effective_eval_seq_len = args.eval_seq_len if args.eval_seq_len > 0 else args.train_seq_len + val_seq_len = max(args.train_seq_len, effective_eval_seq_len) + val_tokens = load_validation_tokens(args.val_files, val_seq_len) + base_bytes_lut, has_leading_space_lut, is_boundary_token_lut = build_sentencepiece_luts( + sp, args.vocab_size, device + ) + log0(f"val_bpb:enabled tokenizer_kind=sentencepiece tokenizer_path={args.tokenizer_path}") + log0(f"train_loader:dataset:{dataset_dir.name} train_shards:{actual_train_files}") + log0(f"val_loader:shards pattern={args.val_files} tokens:{val_tokens.numel() - 1}") + CastedLinear._qat_enabled = args.qat_enabled + base_model = GPT( + vocab_size=args.vocab_size, + num_layers=args.num_layers, + model_dim=args.model_dim, + num_heads=args.num_heads, + num_kv_heads=args.num_kv_heads, + mlp_mult=args.mlp_mult, + tie_embeddings=args.tie_embeddings, + tied_embed_init_std=args.tied_embed_init_std, + logit_softcap=args.logit_softcap, + rope_base=args.rope_base, + qk_gain_init=args.qk_gain_init, + mtp_num_heads=args.mtp_num_heads, + mtp_loss_weight=args.mtp_loss_weight, + bigram_vocab_size=args.bigram_vocab_size, + bigram_dim=args.bigram_dim, + xsa_last_n=args.xsa_last_n, + rope_dims=args.rope_dims, + ln_scale=args.ln_scale, + dtg=args.dtg_enabled, + ve_enabled=args.ve_enabled, + ve_dim=args.ve_dim, + ve_layers=args.ve_layers, + mlp_act=args.mlp_act, + mlp_leaky_slope=args.mlp_leaky_slope, + f1_corr_rank=args.f1_corr_rank, + f1_corr_scale_init=args.f1_corr_scale_init, + ).to(device).bfloat16() + for module in base_model.modules(): + if isinstance(module, CastedLinear): + module.float() + restore_low_dim_params_to_fp32(base_model) + # Complementary training: downweight tokens predictable by bigrams + complement_alpha = float(os.environ.get("COMPLEMENT_ALPHA", "0")) + if complement_alpha > 0: + tracker = TrainNgramTracker(args.vocab_size, device, complement_alpha=complement_alpha) + base_model._ngram_tracker = tracker + log0(f"complementary_training:alpha={complement_alpha}") + else: + base_model._ngram_tracker = None + compiled_model = maybe_torch_compile(base_model, args) + model: nn.Module = DDP(compiled_model, device_ids=[local_rank], broadcast_buffers=False) if distributed else compiled_model + block_named_params = list(base_model.blocks.named_parameters()) + matrix_params = [ + p + for name, p in block_named_params + if p.ndim == 2 and not any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS) + ] + if base_model.mtp_num_heads > 0: + matrix_params.extend([p for p in base_model.mtp_heads.parameters() if p.ndim == 2]) + if base_model.f1_corr_in is not None and base_model.f1_corr_out is not None: + matrix_params.append(base_model.f1_corr_in.weight) + matrix_params.append(base_model.f1_corr_out.weight) + scalar_params = [ + p + for name, p in block_named_params + if p.ndim < 2 or any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS) + ] + if base_model.skip_weights.numel() > 0: + scalar_params.append(base_model.skip_weights) + scalar_params.append(base_model.smear.gate) + if base_model.bigram is not None: + scalar_params.append(base_model.bigram.scale) + if base_model.f1_corr_scale is not None: + scalar_params.append(base_model.f1_corr_scale) + token_lr = args.tied_embed_lr if args.tie_embeddings else args.embed_lr + tok_params = [{"params": [base_model.tok_emb.weight], "lr": token_lr, "base_lr": token_lr}] + if base_model.bigram is not None: + tok_params.append({"params": [base_model.bigram.embed.weight], "lr": token_lr, "base_lr": token_lr}) + if base_model.bigram.proj is not None: + matrix_params.append(base_model.bigram.proj.weight) + if base_model.ve_shared is not None: + tok_params.append({"params": [base_model.ve_shared.embed.weight], "lr": token_lr, "base_lr": token_lr}) + if base_model.ve_shared.proj is not None: + matrix_params.append(base_model.ve_shared.proj.weight) + scalar_params.append(base_model.ve_shared.scale) + for s in base_model.ve_layer_scales: + scalar_params.append(s) + optimizer_tok = torch.optim.AdamW( + tok_params, + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + weight_decay=args.adam_wd, + fused=True, + ) + optimizer_muon = Muon( + matrix_params, + lr=args.matrix_lr, + momentum=args.muon_momentum, + backend_steps=args.muon_backend_steps, + weight_decay=args.muon_wd, + ) + for group in optimizer_muon.param_groups: + group["base_lr"] = args.matrix_lr + optimizer_scalar = torch.optim.AdamW( + [{"params": scalar_params, "lr": args.scalar_lr, "base_lr": args.scalar_lr}], + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + weight_decay=args.adam_wd, + fused=True, + ) + optimizers: list[torch.optim.Optimizer] = [optimizer_tok, optimizer_muon, optimizer_scalar] + if base_model.lm_head is not None: + optimizer_head = torch.optim.Adam( + [{"params": [base_model.lm_head.weight], "lr": args.head_lr, "base_lr": args.head_lr}], + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + fused=True, + ) + optimizers.insert(1, optimizer_head) + n_params = sum(p.numel() for p in base_model.parameters()) + f1_corr_params = 0 + if base_model.f1_corr_in is not None and base_model.f1_corr_out is not None: + f1_corr_params = int(base_model.f1_corr_in.weight.numel() + base_model.f1_corr_out.weight.numel()) + est_corr_int6_bytes = 0 + if args.f1_corr_rank > 0: + # int8 payload stores int6 values + per-row fp16 scales. + est_corr_int6_bytes = ( + args.f1_corr_rank * (args.model_dim + args.vocab_size) + + 2 * (args.f1_corr_rank + args.vocab_size) + ) + log0(f"model_params:{n_params}") + log0( + f"f1_corr:rank={args.f1_corr_rank} params={f1_corr_params} " + f"est_int6_bytes~{est_corr_int6_bytes}" + ) + log0(f"mlp_act:{args.mlp_act} mlp_leaky_slope:{args.mlp_leaky_slope}") + log0(f"XSA:last_{args.xsa_last_n} world_size:{world_size} grad_accum_steps:{grad_accum_steps}") + log0(f"num_heads:{args.num_heads} num_kv_heads:{args.num_kv_heads} embed_lr:{token_lr} matrix_lr:{args.matrix_lr}") + log0( + f"train_batch_tokens:{args.train_batch_tokens} train_seq_len:{args.train_seq_len} " + f"iterations:{args.iterations} warmup_steps:{args.warmup_steps} " + f"max_wallclock_seconds:{args.max_wallclock_seconds:.3f}" + ) + log0(f"compile:enabled={int(args.compile_enabled)} fullgraph={int(args.compile_fullgraph)}") + log0(f"seed:{args.seed}") + if args.ngram_eval_order >= 2: + log0( + f"ngram_eval:order={args.ngram_eval_order} alpha={args.ngram_eval_alpha} " + f"min_count={args.ngram_eval_min_count} buckets={args.ngram_eval_buckets}" + ) + train_loader = DistributedTokenLoader(args.train_files, rank, world_size, device) + def zero_grad_all() -> None: + for opt in optimizers: + opt.zero_grad(set_to_none=True) + max_wallclock_ms = 1000.0 * args.max_wallclock_seconds if args.max_wallclock_seconds > 0 else None + def lr_mul(step: int, elapsed_ms: float) -> float: + if args.warmdown_iters <= 0: + return 1.0 + if max_wallclock_ms is None: + warmdown_start = max(args.iterations - args.warmdown_iters, 0) + return max((args.iterations - step) / max(args.warmdown_iters, 1), 0.0) if warmdown_start <= step < args.iterations else 1.0 + step_ms = elapsed_ms / max(step, 1) + warmdown_ms = args.warmdown_iters * step_ms + remaining_ms = max(max_wallclock_ms - elapsed_ms, 0.0) + return remaining_ms / max(warmdown_ms, 1e-9) if remaining_ms <= warmdown_ms else 1.0 + if args.warmup_steps > 0: + initial_model_state = {name: tensor.detach().cpu().clone() for name, tensor in base_model.state_dict().items()} + initial_optimizer_states = [copy.deepcopy(opt.state_dict()) for opt in optimizers] + model.train() + for warmup_step in range(args.warmup_steps): + zero_grad_all() + for micro_step in range(grad_accum_steps): + if distributed: + model.require_backward_grad_sync = micro_step == grad_accum_steps - 1 + x, y = train_loader.next_batch(args.train_batch_tokens, args.train_seq_len, grad_accum_steps) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + warmup_loss = model(x, y) + (warmup_loss * grad_scale).backward() + for opt in optimizers: + opt.step() + zero_grad_all() + if args.warmup_steps <= 20 or (warmup_step + 1) % 10 == 0 or warmup_step + 1 == args.warmup_steps: + log0(f"warmup_step:{warmup_step + 1}/{args.warmup_steps}") + base_model.load_state_dict(initial_model_state, strict=True) + for opt, state in zip(optimizers, initial_optimizer_states, strict=True): + opt.load_state_dict(state) + zero_grad_all() + if distributed: + model.require_backward_grad_sync = True + train_loader = DistributedTokenLoader(args.train_files, rank, world_size, device) + swa_state: dict[str, Tensor] | None = None + swa_count = 0 + ema_state = {name: t.detach().float().clone() for name, t in base_model.state_dict().items()} + ema_decay = 0.997 + training_time_ms = 0.0 + stop_after_step: int | None = None + torch.cuda.synchronize() + t0 = time.perf_counter() + step = 0 + while True: + last_step = step == args.iterations or (stop_after_step is not None and step >= stop_after_step) + should_validate = last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0) + if should_validate: + torch.cuda.synchronize() + training_time_ms += 1000.0 * (time.perf_counter() - t0) + val_loss, val_bpb = eval_val( + args, + model, + rank, + world_size, + device, + grad_accum_steps, + val_tokens, + base_bytes_lut, + has_leading_space_lut, + is_boundary_token_lut, + ) + log0( + f"step:{step}/{args.iterations} val_loss:{val_loss:.4f} val_bpb:{val_bpb:.4f} " + f"train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms / max(step, 1):.2f}ms" + ) + torch.cuda.synchronize() + t0 = time.perf_counter() + if last_step: + if stop_after_step is not None and step < args.iterations: + log0( + f"stopping_early: wallclock_cap train_time:{training_time_ms:.0f}ms " + f"step:{step}/{args.iterations}" + ) + break + elapsed_ms = training_time_ms + 1000.0 * (time.perf_counter() - t0) + scale = lr_mul(step, elapsed_ms) + if args.late_qat_threshold > 0 and scale < args.late_qat_threshold and not CastedLinear._qat_enabled: + CastedLinear._qat_enabled = True + log0(f"late_qat:enabled step:{step} scale:{scale:.4f}") + zero_grad_all() + train_loss = torch.zeros((), device=device) + for micro_step in range(grad_accum_steps): + if distributed: + model.require_backward_grad_sync = micro_step == grad_accum_steps - 1 + x, y = train_loader.next_batch(args.train_batch_tokens, args.train_seq_len, grad_accum_steps) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + loss = model(x, y) + train_loss += loss.detach() + loss.backward() + if base_model._ngram_tracker is not None: + base_model._ngram_tracker.update(x, y) + train_loss /= grad_accum_steps + frac = min(step / args.muon_momentum_warmup_steps, 1.0) if args.muon_momentum_warmup_steps > 0 else 1.0 + muon_momentum = (1 - frac) * args.muon_momentum_warmup_start + frac * args.muon_momentum + for group in optimizer_muon.param_groups: + group["momentum"] = muon_momentum + for opt in optimizers: + for group in opt.param_groups: + group["lr"] = group["base_lr"] * scale + if args.grad_clip_norm > 0: + torch.nn.utils.clip_grad_norm_(base_model.parameters(), args.grad_clip_norm) + for opt in optimizers: + opt.step() + zero_grad_all() + # EMA update + with torch.no_grad(): + for name, t in base_model.state_dict().items(): + ema_state[name].mul_(ema_decay).add_(t.detach().float(), alpha=1.0 - ema_decay) + step += 1 + approx_training_time_ms = training_time_ms + 1000.0 * (time.perf_counter() - t0) + if args.swa_enabled and scale < 0.2 and step % args.swa_every == 0: + if swa_state is None: + swa_state = {name: t.detach().cpu().clone() for name, t in base_model.state_dict().items()} + swa_count = 1 + log0(f"swa:start step:{step}") + else: + for name, t in base_model.state_dict().items(): + swa_state[name] += t.detach().cpu() + swa_count += 1 + should_log_train = ( + args.train_log_every > 0 + and (step <= 10 or step % args.train_log_every == 0 or stop_after_step is not None) + ) + if should_log_train: + log0( + f"step:{step}/{args.iterations} train_loss:{train_loss.item():.4f} " + f"train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms / step:.2f}ms" + ) + reached_cap = max_wallclock_ms is not None and approx_training_time_ms >= max_wallclock_ms + if distributed and max_wallclock_ms is not None: + reached_cap_tensor = torch.tensor(int(reached_cap), device=device) + dist.all_reduce(reached_cap_tensor, op=dist.ReduceOp.MAX) + reached_cap = bool(reached_cap_tensor.item()) + if stop_after_step is None and reached_cap: + stop_after_step = step + log0( + f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB " + f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB" + ) + # GPTQ calibration: collect Hessians from training data DURING training phase + # (must happen before training ends to comply with eval-time data access rules) + log0("gptq:calibrating with training data...") + t_gptq = time.perf_counter() + gptq_hessians = gptq_calibrate(base_model, args.train_files, device, n_samples=256, seq_len=args.train_seq_len) + log0(f"gptq:calibrated {len(gptq_hessians)} layers in {time.perf_counter()-t_gptq:.1f}s") + if args.distill_enabled and args.distill_steps > 0: + log0( + f"distill:start steps:{args.distill_steps} lr_factor:{args.distill_lr_factor} " + f"temp:{args.distill_temperature} alpha:{args.distill_alpha} kl_clip:{args.distill_kl_clip}" + ) + current_state = base_model.state_dict() + teacher_state = {name: t.to(dtype=current_state[name].dtype) for name, t in ema_state.items()} + teacher_model = GPT( + vocab_size=args.vocab_size, num_layers=args.num_layers, model_dim=args.model_dim, + num_heads=args.num_heads, num_kv_heads=args.num_kv_heads, mlp_mult=args.mlp_mult, + tie_embeddings=args.tie_embeddings, tied_embed_init_std=args.tied_embed_init_std, + logit_softcap=args.logit_softcap, rope_base=args.rope_base, qk_gain_init=args.qk_gain_init, + mtp_num_heads=args.mtp_num_heads, mtp_loss_weight=args.mtp_loss_weight, + bigram_vocab_size=args.bigram_vocab_size, bigram_dim=args.bigram_dim, + xsa_last_n=args.xsa_last_n, rope_dims=args.rope_dims, ln_scale=args.ln_scale, dtg=args.dtg_enabled, + ve_enabled=args.ve_enabled, ve_dim=args.ve_dim, ve_layers=args.ve_layers, + mlp_act=args.mlp_act, mlp_leaky_slope=args.mlp_leaky_slope, + f1_corr_rank=args.f1_corr_rank, f1_corr_scale_init=args.f1_corr_scale_init, + ).to(device).bfloat16() + for m in teacher_model.modules(): + if isinstance(m, CastedLinear): + m.float() + restore_low_dim_params_to_fp32(teacher_model) + teacher_model.load_state_dict(teacher_state, strict=True) + teacher_model.eval() + for p in teacher_model.parameters(): + p.requires_grad_(False) + compiled_teacher_logits = maybe_torch_compile(teacher_model.forward_logits, args) + model.train() + T = args.distill_temperature + alpha = args.distill_alpha + for d_step in range(args.distill_steps): + zero_grad_all() + for opt in optimizers: + for group in opt.param_groups: + group["lr"] = group["base_lr"] * args.distill_lr_factor + x, y = train_loader.next_batch(args.train_batch_tokens, args.train_seq_len, grad_accum_steps) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + student_logits = base_model.forward_logits(x) + with torch.no_grad(): + teacher_logits = compiled_teacher_logits(x) + student_log_probs = F.log_softmax(student_logits.float() / T, dim=-1) + teacher_probs = F.softmax(teacher_logits.float() / T, dim=-1) + token_kl = F.kl_div(student_log_probs, teacher_probs, reduction="none").sum(dim=-1) + kl_loss = token_kl.mean() * (T * T) + if args.distill_kl_clip > 0: + kl_loss = torch.clamp(kl_loss, max=args.distill_kl_clip) + ce_loss = F.cross_entropy( + student_logits.reshape(-1, student_logits.size(-1)).float(), + y.reshape(-1), + reduction="mean", + ) + loss = alpha * kl_loss + (1.0 - alpha) * ce_loss + (loss * grad_scale).backward() + if world_size > 1: + for p in base_model.parameters(): + if p.grad is not None: + dist.all_reduce(p.grad, op=dist.ReduceOp.AVG) + if args.grad_clip_norm > 0: + torch.nn.utils.clip_grad_norm_(base_model.parameters(), args.grad_clip_norm) + for opt in optimizers: + opt.step() + zero_grad_all() + with torch.no_grad(): + for name, t in base_model.state_dict().items(): + ema_state[name].mul_(ema_decay).add_(t.detach().float(), alpha=1.0 - ema_decay) + if (d_step + 1) % 8 == 0 or d_step == 0: + log0( + f"distill:step:{d_step + 1}/{args.distill_steps} " + f"kl:{kl_loss.item():.4f} ce:{ce_loss.item():.4f} total:{loss.item():.4f}" + ) + del teacher_model, compiled_teacher_logits + torch.cuda.empty_cache() + log0("distill:done") + # Apply EMA weights (better than SWA alone per PR#401) + log0("ema:applying EMA weights") + current_state = base_model.state_dict() + avg_state = {name: t.to(dtype=current_state[name].dtype) for name, t in ema_state.items()} + base_model.load_state_dict(avg_state, strict=True) + torch.cuda.synchronize() + t_diag = time.perf_counter() + diag_val_loss, diag_val_bpb = eval_val( + args, compiled_model, rank, world_size, device, grad_accum_steps, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + ) + torch.cuda.synchronize() + log0( + f"DIAGNOSTIC post_ema val_loss:{diag_val_loss:.4f} val_bpb:{diag_val_bpb:.4f} " + f"eval_time:{1000.0 * (time.perf_counter() - t_diag):.0f}ms" + ) + full_state_dict = base_model.state_dict() + export_sd = {k: v for k, v in full_state_dict.items() if "mtp_heads" not in k} + excluded_mtp = sum(int(t.numel()) for k, t in full_state_dict.items() if "mtp_heads" in k) + if excluded_mtp > 0: + log0(f"export_excluding_mtp_params:{excluded_mtp}") + if master_process: + torch.save(export_sd, "final_model.pt") + model_bytes = os.path.getsize("final_model.pt") + code_bytes = len(code.encode("utf-8")) + log0(f"Serialized model: {model_bytes} bytes") + log0(f"Code size: {code_bytes} bytes") + sd_cpu = {k: v.detach().cpu() for k, v in export_sd.items()} + # GPTQ quantization using Hessians collected during training phase (no training data access here) + quant_result, quant_meta = mixed_quantize_int6_gptq(sd_cpu, {"mlp", "attn", "aux"}, gptq_hessians) + quant_buf = io.BytesIO() + torch.save({"w": quant_result, "m": quant_meta}, quant_buf) + quant_raw = quant_buf.getvalue() + quant_blob = zstandard.ZstdCompressor(level=22).compress(quant_raw) if _COMPRESSOR == "zstd" else zlib.compress(quant_raw, 9) + if master_process: + with open("final_model.int6.ptz", "wb") as f: + f.write(quant_blob) + quant_file_bytes = len(quant_blob) + code_bytes = len(code.encode("utf-8")) + log0(f"Serialized model int6+{_COMPRESSOR}: {quant_file_bytes} bytes") + log0(f"Total submission size int6+{_COMPRESSOR}: {quant_file_bytes + code_bytes} bytes") + log0(f"Total submission size int8+zlib: {quant_file_bytes + code_bytes} bytes") + if distributed: + dist.barrier() + with open("final_model.int6.ptz", "rb") as f: + quant_blob_disk = f.read() + quant_state = torch.load( + io.BytesIO(zstandard.ZstdDecompressor().decompress(quant_blob_disk) if _COMPRESSOR == "zstd" else zlib.decompress(quant_blob_disk)), + map_location="cpu", + ) + deq_state = dequantize_mixed_int6(quant_state["w"], quant_state["m"], sd_cpu) + eval_model = GPT( + vocab_size=args.vocab_size, num_layers=args.num_layers, model_dim=args.model_dim, + num_heads=args.num_heads, num_kv_heads=args.num_kv_heads, mlp_mult=args.mlp_mult, + tie_embeddings=args.tie_embeddings, tied_embed_init_std=args.tied_embed_init_std, + logit_softcap=args.logit_softcap, rope_base=args.rope_base, qk_gain_init=args.qk_gain_init, + mtp_num_heads=0, mtp_loss_weight=0.0, + bigram_vocab_size=args.bigram_vocab_size, bigram_dim=args.bigram_dim, + xsa_last_n=args.xsa_last_n, # must match training model + rope_dims=args.rope_dims, ln_scale=args.ln_scale, dtg=args.dtg_enabled, + ve_enabled=args.ve_enabled, ve_dim=args.ve_dim, ve_layers=args.ve_layers, + mlp_act=args.mlp_act, mlp_leaky_slope=args.mlp_leaky_slope, + f1_corr_rank=args.f1_corr_rank, f1_corr_scale_init=args.f1_corr_scale_init, + ).to(device).bfloat16() + for m in eval_model.modules(): + if isinstance(m, CastedLinear): + m.float() + restore_low_dim_params_to_fp32(eval_model) + eval_model.load_state_dict(deq_state, strict=True) + compiled_eval = maybe_torch_compile(eval_model, args) + torch.cuda.synchronize() + t_qeval = time.perf_counter() + q_val_loss, q_val_bpb = eval_val( + args, compiled_eval, rank, world_size, device, grad_accum_steps, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + eval_seq_len=effective_eval_seq_len, + ) + torch.cuda.synchronize() + log0( + f"final_int6_roundtrip val_loss:{q_val_loss:.4f} val_bpb:{q_val_bpb:.4f} " + f"eval_time:{1000.0 * (time.perf_counter() - t_qeval):.0f}ms" + ) + log0(f"final_int6_roundtrip_exact val_loss:{q_val_loss:.8f} val_bpb:{q_val_bpb:.8f}") + sw_seq_len = effective_eval_seq_len + if args.eval_stride > 0 and args.eval_stride < sw_seq_len: + torch.cuda.synchronize() + t_slide = time.perf_counter() + sw_val_loss, sw_val_bpb = eval_val_sliding( + args, eval_model, rank, world_size, device, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + stride=args.eval_stride, + eval_seq_len=sw_seq_len, + ) + torch.cuda.synchronize() + log0( + f"final_int6_sliding_window val_loss:{sw_val_loss:.4f} val_bpb:{sw_val_bpb:.4f} " + f"stride:{args.eval_stride} eval_time:{1000.0 * (time.perf_counter() - t_slide):.0f}ms" + ) + log0(f"final_int6_sliding_window_exact val_loss:{sw_val_loss:.8f} val_bpb:{sw_val_bpb:.8f}") + log0(f"final_int8_zlib_roundtrip_exact val_loss:{sw_val_loss:.8f} val_bpb:{sw_val_bpb:.8f}") + # --- TTT: adapt model BEFORE n-gram eval --- + if args.ttt_enabled: + if distributed: + dist.barrier() + torch.cuda.synchronize() + t_ttt = time.perf_counter() + ttt_loss, ttt_bpb = eval_val_sliding_ttt( + args, eval_model, rank, world_size, device, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + stride=args.eval_stride, batch_seqs=args.ttt_batch_seqs, log0=log0, + ) + if rank == 0: + torch.cuda.synchronize() + ttt_ms = 1000.0 * (time.perf_counter() - t_ttt) + log0(f"final_ttt val_loss:{ttt_loss:.4f} val_bpb:{ttt_bpb:.4f} eval_time:{ttt_ms:.0f}ms") + log0(f"final_ttt_exact val_loss:{ttt_loss:.8f} val_bpb:{ttt_bpb:.8f}") + if distributed: + dist.barrier() + if args.ngram_eval_order >= 2: + if distributed: + dist.barrier() + torch.cuda.synchronize() + t_ng = time.perf_counter() + ng_loss, ng_bpb, ng_coverage = eval_val_sliding_hashed_ngram( + args, + eval_model, + rank, + world_size, + device, + val_tokens, + base_bytes_lut, + has_leading_space_lut, + is_boundary_token_lut, + stride=args.eval_stride, + order=args.ngram_eval_order, + alpha=args.ngram_eval_alpha, + min_count=args.ngram_eval_min_count, + buckets=args.ngram_eval_buckets, + max_seconds=args.ngram_eval_max_seconds, + eval_seq_len=sw_seq_len, + ) + if rank == 0: + torch.cuda.synchronize() + ng_eval_ms = 1000.0 * (time.perf_counter() - t_ng) + if ng_coverage >= 0.999999: + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order} val_loss:{ng_loss:.4f} " + f"val_bpb:{ng_bpb:.4f} eval_time:{ng_eval_ms:.0f}ms" + ) + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order}_exact " + f"val_loss:{ng_loss:.8f} val_bpb:{ng_bpb:.8f}" + ) + else: + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order}_partial val_loss:{ng_loss:.4f} " + f"val_bpb:{ng_bpb:.4f} coverage:{ng_coverage:.4f} eval_time:{ng_eval_ms:.0f}ms" + ) + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order}_partial_exact " + f"val_loss:{ng_loss:.8f} val_bpb:{ng_bpb:.8f} coverage:{ng_coverage:.8f}" + ) + if distributed: + dist.barrier() + if distributed: + dist.destroy_process_group() +if __name__ == "__main__": + main() diff --git a/experiments/B_wing/bwing_III/HYPOTHESIS.md b/experiments/B_wing/bwing_III/HYPOTHESIS.md new file mode 100644 index 000000000..21e11f8d9 --- /dev/null +++ b/experiments/B_wing/bwing_III/HYPOTHESIS.md @@ -0,0 +1,28 @@ +# B-WING FULL PORT — All #809 N-gram Techniques + +## Hypothesis +Combine all three key innovations from PR #809 onto our X-WING base: +1. Alpha curve: min=0.05, max=0.60, clip=0.95 +2. Per-order entropy center shift: -0.25*(order - min_order) +3. Fixed order multipliers: (0.3, 0.3, 0.97, 2.0, 2.0, 2.0, 2.0, 2.0) + → replaces cubric 3D adaptive system + +This is the "kitchen sink" variant. If bwing_alpha and bwing_entropy_shift +each show gains, this should stack them. + +## Changes from X-WING baseline +1. NGRAM_EVAL_ALPHA_MIN: 0.20 → 0.05 +2. NGRAM_EVAL_ALPHA_MAX: 0.75 → 0.60 +3. Alpha CLIP: 0.75 → 0.95 +4. Per-order entropy center shift +5. Fixed order multipliers replacing cubric 3D +6. Order 4 mult: 0.45 → 0.97 (big change) +7. Order 2 mult: 0.45 → 0.30 + +## Risk +Removing cubric 3D loses per-entropy-bin adaptation. But their fixed mults +work at 0.295 BPB so the risk is bounded. + +## Expected impact +Should approach their 0.295 while keeping our better base model (~1.12 vs 1.14). +Target: sub-0.30 BPB. diff --git a/experiments/B_wing/bwing_III/run.sh b/experiments/B_wing/bwing_III/run.sh new file mode 100755 index 000000000..0d9cf56f2 --- /dev/null +++ b/experiments/B_wing/bwing_III/run.sh @@ -0,0 +1,56 @@ +#!/bin/bash +set -euo pipefail +# B-WING FULL PORT: All PR #809 n-gram innovations on our X-WING base +# Changes: alpha 0.05-0.60 clip=0.95, entropy shift, fixed order mults (no cubric) + +SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd -- "${SCRIPT_DIR}/../../.." && pwd)" +cd "${REPO_ROOT}" +export PYTHONPATH="${REPO_ROOT}/flash-attention/hopper:${PYTHONPATH:-}" + +SEED="${SEED:-1337}" +NPROC_PER_NODE="${NPROC_PER_NODE:-8}" + +echo "============================================" +echo " B-WING FULL PORT — #809 N-gram Techniques" +echo " Seed: ${SEED}" +echo " Fixed order mults (no cubric)" +echo " Complementary training: alpha=0.5" +echo " Eval alpha: 0.05-0.60 clip=0.95 + entropy shift | Orders: 2-9" +echo "============================================" + +SEED="$SEED" \ +F1_CORR_RANK=0 \ +DISTILL_ENABLED=0 \ +MLP_ACT=leaky_relu_sq \ +MLP_LEAKY_SLOPE=0.5 \ +XSA_LAST_N=4 \ +BIGRAM_VOCAB_SIZE=1536 \ +TTT_EVAL_ENABLED=0 \ +ROPE_DIMS=24 \ +VAL_LOSS_EVERY=20000 \ +TRAIN_LOG_EVERY=1000 \ +SWA_EVERY=100 \ +COMPLEMENT_ALPHA=0.5 \ +NGRAM_EVAL_ORDER=9 \ +NGRAM_EVAL_MIN_ORDER=2 \ +NGRAM_EVAL_ADAPTIVE=1 \ +NGRAM_EVAL_ALPHA=0.30 \ +NGRAM_EVAL_ALPHA_MIN=0.05 \ +NGRAM_EVAL_ALPHA_MAX=0.60 \ +NGRAM_EVAL_ENTROPY_CENTER=3.0 \ +NGRAM_EVAL_ENTROPY_SCALE=2.0 \ +NGRAM_EVAL_MIN_COUNT=2 \ +NGRAM_EVAL_BUCKETS=8388608 \ +NGRAM_EVAL_MAX_SECONDS=300 \ +CUBRIC_CADENCE=0 \ +NGRAM_ENTROPY_SHIFT=1 \ +NGRAM_ORDER_MULTS="0.3,0.3,0.97,2.0,2.0,2.0,2.0,2.0" \ +COMPILE_FULLGRAPH=0 \ +torchrun --standalone --nproc_per_node="${NPROC_PER_NODE}" \ + "${SCRIPT_DIR}/train_gpt.py" \ + 2>&1 | tee "logs/bwing_fullport_s${SEED}_$(date +%Y%m%d_%H%M%S).log" + +echo "============================================" +echo " DONE" +echo "============================================" diff --git a/experiments/B_wing/bwing_III/train_gpt.py b/experiments/B_wing/bwing_III/train_gpt.py new file mode 100644 index 000000000..fadf6073d --- /dev/null +++ b/experiments/B_wing/bwing_III/train_gpt.py @@ -0,0 +1,2138 @@ +from __future__ import annotations +import copy +import glob +import io +import math +import os +import random +import subprocess +import sys +import time +import uuid +import zlib +from pathlib import Path +try: + import zstandard + _COMPRESSOR = "zstd" +except ImportError: + _COMPRESSOR = "zlib" +import numpy as np +import sentencepiece as spm +import torch +import torch.distributed as dist +import torch.nn.functional as F +from torch import Tensor, nn +from torch.nn.parallel import DistributedDataParallel as DDP +try: + from flash_attn_interface import flash_attn_func as flash_attn_3_func +except ImportError: + def flash_attn_3_func(q, k, v, causal=False): + # q: (B, T, Hq, D), k/v: (B, T, Hkv, D) — expand KV for GQA + q2 = q.transpose(1, 2) # (B, Hq, T, D) + k2 = k.transpose(1, 2) # (B, Hkv, T, D) + v2 = v.transpose(1, 2) + if k2.size(1) != q2.size(1): + rep = q2.size(1) // k2.size(1) + k2 = k2.repeat_interleave(rep, dim=1) + v2 = v2.repeat_interleave(rep, dim=1) + out = torch.nn.functional.scaled_dot_product_attention(q2, k2, v2, is_causal=causal) + return out.transpose(1, 2) +class Hyperparameters: + data_path = os.environ.get("DATA_PATH", "./data/datasets/fineweb10B_sp1024") + train_files = os.path.join(data_path, "fineweb_train_*.bin") + val_files = os.path.join(data_path, "fineweb_val_*.bin") + tokenizer_path = os.environ.get("TOKENIZER_PATH", "./data/tokenizers/fineweb_1024_bpe.model") + run_id = os.environ.get("RUN_ID", str(uuid.uuid4())) + seed = int(os.environ.get("SEED", 1337)) + val_batch_size = int(os.environ.get("VAL_BATCH_SIZE", 524_288)) + val_loss_every = int(os.environ.get("VAL_LOSS_EVERY", 4000)) + train_log_every = int(os.environ.get("TRAIN_LOG_EVERY", 500)) + iterations = int(os.environ.get("ITERATIONS", 20000)) + warmdown_iters = int(os.environ.get("WARMDOWN_ITERS", 3500)) + warmup_steps = int(os.environ.get("WARMUP_STEPS", 20)) + train_batch_tokens = int(os.environ.get("TRAIN_BATCH_TOKENS", 786_432)) + train_seq_len = int(os.environ.get("TRAIN_SEQ_LEN", 2048)) + eval_seq_len = int(os.environ.get("EVAL_SEQ_LEN", 2048)) + max_wallclock_seconds = float(os.environ.get("MAX_WALLCLOCK_SECONDS", 600.0)) + qk_gain_init = float(os.environ.get("QK_GAIN_INIT", 1.5)) + vocab_size = int(os.environ.get("VOCAB_SIZE", 1024)) + num_layers = int(os.environ.get("NUM_LAYERS", 11)) + num_kv_heads = int(os.environ.get("NUM_KV_HEADS", 4)) + model_dim = int(os.environ.get("MODEL_DIM", 512)) + num_heads = int(os.environ.get("NUM_HEADS", 8)) + mlp_mult = float(os.environ.get("MLP_MULT", 3.0)) + mlp_act = os.environ.get("MLP_ACT", "relu_sq").lower() + mlp_leaky_slope = float(os.environ.get("MLP_LEAKY_SLOPE", 0.5)) + tie_embeddings = bool(int(os.environ.get("TIE_EMBEDDINGS", "1"))) + rope_base = float(os.environ.get("ROPE_BASE", 10000.0)) + logit_softcap = float(os.environ.get("LOGIT_SOFTCAP", 30.0)) + embed_lr = float(os.environ.get("EMBED_LR", 0.6)) + head_lr = float(os.environ.get("HEAD_LR", 0.008)) + tied_embed_lr = float(os.environ.get("TIED_EMBED_LR", 0.035)) + tied_embed_init_std = float(os.environ.get("TIED_EMBED_INIT_STD", 0.005)) + matrix_lr = float(os.environ.get("MATRIX_LR", 0.025)) + scalar_lr = float(os.environ.get("SCALAR_LR", 0.025)) + muon_momentum = float(os.environ.get("MUON_MOMENTUM", 0.99)) + muon_backend_steps = int(os.environ.get("MUON_BACKEND_STEPS", 5)) + muon_momentum_warmup_start = float(os.environ.get("MUON_MOMENTUM_WARMUP_START", 0.92)) + muon_momentum_warmup_steps = int(os.environ.get("MUON_MOMENTUM_WARMUP_STEPS", 1500)) + beta1 = float(os.environ.get("BETA1", 0.9)) + beta2 = float(os.environ.get("BETA2", 0.95)) + adam_eps = float(os.environ.get("ADAM_EPS", 1e-8)) + grad_clip_norm = float(os.environ.get("GRAD_CLIP_NORM", 0.3)) + eval_stride = int(os.environ.get("EVAL_STRIDE", 64)) + mtp_num_heads = int(os.environ.get("MTP_NUM_HEADS", 0)) + mtp_loss_weight = float(os.environ.get("MTP_LOSS_WEIGHT", 0.2)) + muon_beta2 = float(os.environ.get("MUON_BETA2", 0.95)) + swa_enabled = bool(int(os.environ.get("SWA_ENABLED", "1"))) + swa_every = int(os.environ.get("SWA_EVERY", 50)) # tighter: collect more recent checkpoints + muon_wd = float(os.environ.get("MUON_WD", 0.04)) + adam_wd = float(os.environ.get("ADAM_WD", 0.04)) + qat_enabled = bool(int(os.environ.get("QAT_ENABLED", "0"))) + bigram_vocab_size = int(os.environ.get("BIGRAM_VOCAB_SIZE", 2048)) + bigram_dim = int(os.environ.get("BIGRAM_DIM", 128)) + xsa_last_n = int(os.environ.get("XSA_LAST_N", 11)) # XSA on ALL 11 layers + rope_dims = int(os.environ.get("ROPE_DIMS", 16)) + ln_scale = bool(int(os.environ.get("LN_SCALE", "1"))) + dtg_enabled = bool(int(os.environ.get("DTG_ENABLED", "0"))) + late_qat_threshold = float(os.environ.get("LATE_QAT_THRESHOLD", 0.5)) + ve_enabled = bool(int(os.environ.get("VE_ENABLED", "1"))) + ve_dim = int(os.environ.get("VE_DIM", 128)) + ve_layers = os.environ.get("VE_LAYERS", "9,10") + # F1 capacity add-on: low-rank correction head (active at inference). + # Approx extra params ~= rank * (model_dim + vocab_size). + f1_corr_rank = int(os.environ.get("F1_CORR_RANK", 0)) + f1_corr_scale_init = float(os.environ.get("F1_CORR_SCALE_INIT", 0.10)) + # Post-train self-distillation: EMA teacher -> student. + distill_enabled = bool(int(os.environ.get("DISTILL_ENABLED", "0"))) + distill_steps = int(os.environ.get("DISTILL_STEPS", 24)) + distill_lr_factor = float(os.environ.get("DISTILL_LR_FACTOR", 0.02)) + distill_temperature = float(os.environ.get("DISTILL_TEMPERATURE", 1.5)) + distill_alpha = float(os.environ.get("DISTILL_ALPHA", 0.60)) + distill_kl_clip = float(os.environ.get("DISTILL_KL_CLIP", 10.0)) + # Optional legal score-first hashed n-gram interpolation at eval time. + # Multi-order backoff (2..max_order) with entropy-adaptive alpha. + # Alpha depends only on model entropy (no target/label access). + ngram_eval_order = int(os.environ.get("NGRAM_EVAL_ORDER", 0)) # 0=off, max order for backoff + ngram_eval_min_order = int(os.environ.get("NGRAM_EVAL_MIN_ORDER", 2)) # min order for backoff + ngram_eval_alpha = float(os.environ.get("NGRAM_EVAL_ALPHA", 0.30)) # base alpha (or fixed if adaptive off) + ngram_eval_adaptive = bool(int(os.environ.get("NGRAM_EVAL_ADAPTIVE", "1"))) # entropy-adaptive alpha + ngram_eval_alpha_min = float(os.environ.get("NGRAM_EVAL_ALPHA_MIN", 0.05)) # alpha floor (confident model) + ngram_eval_alpha_max = float(os.environ.get("NGRAM_EVAL_ALPHA_MAX", 0.60)) # alpha ceiling (uncertain model) + ngram_eval_entropy_center = float(os.environ.get("NGRAM_EVAL_ENTROPY_CENTER", 4.0)) # sigmoid center + ngram_eval_entropy_scale = float(os.environ.get("NGRAM_EVAL_ENTROPY_SCALE", 2.0)) # sigmoid steepness + ngram_eval_min_count = int(os.environ.get("NGRAM_EVAL_MIN_COUNT", 2)) + ngram_eval_buckets = int(os.environ.get("NGRAM_EVAL_BUCKETS", 4_194_304)) + ngram_eval_max_seconds = float(os.environ.get("NGRAM_EVAL_MAX_SECONDS", 0.0)) + ngram_entropy_shift = bool(int(os.environ.get("NGRAM_ENTROPY_SHIFT", "0"))) # per-order center shift + ngram_order_mults_str = os.environ.get("NGRAM_ORDER_MULTS", "") # fixed per-order multipliers (comma-sep) + cubric_cadence = int(os.environ.get("CUBRIC_CADENCE", 0)) + compile_enabled = bool(int(os.environ.get("COMPILE_ENABLED", "1"))) + compile_fullgraph = bool(int(os.environ.get("COMPILE_FULLGRAPH", "1"))) +def maybe_torch_compile(obj, args: Hyperparameters): + if not args.compile_enabled: + return obj + return torch.compile(obj, dynamic=False, fullgraph=args.compile_fullgraph) +class TrainNgramTracker: + """Complementary training: track bigram stats, downweight tokens n-grams can predict.""" + def __init__(self, vocab_size: int, device: torch.device, complement_alpha: float = 0.5): + self.V = vocab_size + self.alpha = complement_alpha + self.bi_counts = torch.zeros(vocab_size, vocab_size, device=device, dtype=torch.float32) + self.bi_totals = torch.zeros(vocab_size, device=device, dtype=torch.float32) + @torch.no_grad() + def update(self, x: Tensor, y: Tensor): + xf = x.reshape(-1) + yf = y.reshape(-1) + ones = torch.ones(xf.numel(), device=xf.device, dtype=torch.float32) + self.bi_counts.reshape(-1).scatter_add_(0, xf * self.V + yf, ones) + self.bi_totals.scatter_add_(0, xf, ones) + def get_weights(self, x: Tensor, y: Tensor) -> Tensor: + xf = x.reshape(-1) + yf = y.reshape(-1) + total = self.bi_totals[xf] + count = self.bi_counts.reshape(-1)[xf * self.V + yf] + ngram_prob = count / (total + 1) + return (1.0 - self.alpha * ngram_prob).clamp(min=0.1) +def zeropower_via_newtonschulz5(G: Tensor, steps: int = 10, eps: float = 1e-7) -> Tensor: + a, b, c = (3.4445, -4.7750, 2.0315) + X = G.bfloat16() + X /= X.norm() + eps + transposed = G.size(0) > G.size(1) + if transposed: + X = X.T + for _ in range(steps): + A = X @ X.T + B = b * A + c * A @ A + X = a * X + B @ X + return X.T if transposed else X +class Muon(torch.optim.Optimizer): + def __init__(self, params, lr: float, momentum: float, backend_steps: int, + nesterov: bool = True, weight_decay: float = 0.0): + super().__init__( + params, + dict(lr=lr, momentum=momentum, backend_steps=backend_steps, + nesterov=nesterov, weight_decay=weight_decay), + ) + @torch.no_grad() + def step(self, closure=None): + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + distributed = dist.is_available() and dist.is_initialized() + world_size = dist.get_world_size() if distributed else 1 + rank = dist.get_rank() if distributed else 0 + for group in self.param_groups: + params = group["params"] + if not params: + continue + lr = group["lr"] + momentum = group["momentum"] + backend_steps = group["backend_steps"] + nesterov = group["nesterov"] + total_params = sum(int(p.numel()) for p in params) + updates_flat = torch.zeros(total_params, device=params[0].device, dtype=torch.bfloat16) + curr = 0 + for i, p in enumerate(params): + if i % world_size == rank and p.grad is not None: + g = p.grad + state = self.state[p] + if "momentum_buffer" not in state: + state["momentum_buffer"] = torch.zeros_like(g) + buf = state["momentum_buffer"] + buf.mul_(momentum).add_(g) + if nesterov: + g = g.add(buf, alpha=momentum) + g = zeropower_via_newtonschulz5(g, steps=backend_steps) + g *= max(1, g.size(0) / g.size(1)) ** 0.5 + updates_flat[curr : curr + p.numel()] = g.reshape(-1) + curr += p.numel() + if distributed: + dist.all_reduce(updates_flat, op=dist.ReduceOp.SUM) + wd = group.get("weight_decay", 0.0) + curr = 0 + for p in params: + if wd > 0.0: + p.data.mul_(1.0 - lr * wd) + g = updates_flat[curr : curr + p.numel()].view_as(p).to(dtype=p.dtype) + p.add_(g, alpha=-lr) + curr += p.numel() + return loss +def build_sentencepiece_luts( + sp: spm.SentencePieceProcessor, vocab_size: int, device: torch.device +) -> tuple[Tensor, Tensor, Tensor]: + sp_vocab_size = int(sp.vocab_size()) + table_size = max(sp_vocab_size, vocab_size) + base_bytes_np = np.zeros((table_size,), dtype=np.int16) + has_leading_space_np = np.zeros((table_size,), dtype=np.bool_) + is_boundary_token_np = np.ones((table_size,), dtype=np.bool_) + for token_id in range(sp_vocab_size): + if sp.is_control(token_id) or sp.is_unknown(token_id) or sp.is_unused(token_id): + continue + is_boundary_token_np[token_id] = False + if sp.is_byte(token_id): + base_bytes_np[token_id] = 1 + continue + piece = sp.id_to_piece(token_id) + if piece.startswith("▁"): + has_leading_space_np[token_id] = True + piece = piece[1:] + base_bytes_np[token_id] = len(piece.encode("utf-8")) + return ( + torch.tensor(base_bytes_np, dtype=torch.int16, device=device), + torch.tensor(has_leading_space_np, dtype=torch.bool, device=device), + torch.tensor(is_boundary_token_np, dtype=torch.bool, device=device), + ) +def load_validation_tokens(pattern: str, seq_len: int) -> Tensor: + files = [Path(p) for p in sorted(glob.glob(pattern))] + if not files: + raise FileNotFoundError(f"No files found for pattern: {pattern}") + tokens = torch.cat([load_data_shard(file) for file in files]).contiguous() + usable = ((tokens.numel() - 1) // seq_len) * seq_len + if usable <= 0: + raise ValueError(f"Validation split is too short for TRAIN_SEQ_LEN={seq_len}") + return tokens[: usable + 1] +def eval_val( + args: Hyperparameters, + model: nn.Module, + rank: int, + world_size: int, + device: torch.device, + grad_accum_steps: int, + val_tokens: Tensor, + base_bytes_lut: Tensor, + has_leading_space_lut: Tensor, + is_boundary_token_lut: Tensor, + eval_seq_len: int | None = None, +) -> tuple[float, float]: + seq_len = eval_seq_len or args.train_seq_len + local_batch_tokens = args.val_batch_size // (world_size * grad_accum_steps) + if local_batch_tokens < seq_len: + raise ValueError( + "VAL_BATCH_SIZE must provide at least one sequence per rank; " + f"got VAL_BATCH_SIZE={args.val_batch_size}, WORLD_SIZE={world_size}, " + f"GRAD_ACCUM_STEPS={grad_accum_steps}, seq_len={seq_len}" + ) + local_batch_seqs = local_batch_tokens // seq_len + total_seqs = (val_tokens.numel() - 1) // seq_len + seq_start = (total_seqs * rank) // world_size + seq_end = (total_seqs * (rank + 1)) // world_size + val_loss_sum = torch.zeros((), device=device, dtype=torch.float64) + val_token_count = torch.zeros((), device=device, dtype=torch.float64) + val_byte_count = torch.zeros((), device=device, dtype=torch.float64) + model.eval() + with torch.inference_mode(): + for batch_seq_start in range(seq_start, seq_end, local_batch_seqs): + batch_seq_end = min(batch_seq_start + local_batch_seqs, seq_end) + raw_start = batch_seq_start * seq_len + raw_end = batch_seq_end * seq_len + 1 + local = val_tokens[raw_start:raw_end].to(device=device, dtype=torch.int64, non_blocking=True) + x = local[:-1].reshape(-1, seq_len) + y = local[1:].reshape(-1, seq_len) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + batch_loss = model(x, y).detach() + batch_token_count = float(y.numel()) + val_loss_sum += batch_loss.to(torch.float64) * batch_token_count + val_token_count += batch_token_count + prev_ids = x.reshape(-1) + tgt_ids = y.reshape(-1) + token_bytes = base_bytes_lut[tgt_ids].to(dtype=torch.int16) + token_bytes += (has_leading_space_lut[tgt_ids] & ~is_boundary_token_lut[prev_ids]).to(dtype=torch.int16) + val_byte_count += token_bytes.to(torch.float64).sum() + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(val_loss_sum, op=dist.ReduceOp.SUM) + dist.all_reduce(val_token_count, op=dist.ReduceOp.SUM) + dist.all_reduce(val_byte_count, op=dist.ReduceOp.SUM) + val_loss = val_loss_sum / val_token_count + bits_per_token = val_loss.item() / math.log(2.0) + tokens_per_byte = val_token_count.item() / val_byte_count.item() + model.train() + return float(val_loss.item()), float(bits_per_token * tokens_per_byte) +CONTROL_TENSOR_NAME_PATTERNS = tuple( + pattern + for pattern in os.environ.get( + "CONTROL_TENSOR_NAME_PATTERNS", + "attn_scale,attn_scales,mlp_scale,mlp_scales,resid_mix,resid_mixes,q_gain,skip_weight,skip_weights,smear,dtg_gate,ve_layer_scales,ve_shared.scale", + ).split(",") + if pattern +) +INT8_KEEP_FLOAT_FP32_NAME_PATTERNS = tuple( + pattern + for pattern in os.environ.get( + "INT8_KEEP_FLOAT_FP32_NAME_PATTERNS", + ",".join(CONTROL_TENSOR_NAME_PATTERNS), + ).split(",") + if pattern +) +INT8_KEEP_FLOAT_MAX_NUMEL = 65_536 +INT8_KEEP_FLOAT_STORE_DTYPE = torch.float16 +INT8_PER_ROW_SCALE_DTYPE = torch.float16 +INT8_CLIP_PERCENTILE = 99.99984 +INT8_CLIP_Q = INT8_CLIP_PERCENTILE / 100.0 +def tensor_nbytes(t: Tensor) -> int: + return int(t.numel()) * int(t.element_size()) +def keep_float_tensor(name: str, t: Tensor, passthrough_orig_dtypes: dict[str, str]) -> Tensor: + if any(pattern in name for pattern in INT8_KEEP_FLOAT_FP32_NAME_PATTERNS): + return t.float().contiguous() + if t.dtype in {torch.float32, torch.bfloat16}: + passthrough_orig_dtypes[name] = str(t.dtype).removeprefix("torch.") + return t.to(dtype=INT8_KEEP_FLOAT_STORE_DTYPE).contiguous() + return t +def quantize_float_tensor(t: Tensor) -> tuple[Tensor, Tensor]: + t32 = t.float() + if t32.ndim == 2: + clip_abs = ( + torch.quantile(t32.abs(), INT8_CLIP_Q, dim=1) + if t32.numel() + else torch.empty((t32.shape[0],), dtype=torch.float32) + ) + clipped = torch.maximum(torch.minimum(t32, clip_abs[:, None]), -clip_abs[:, None]) + scale = (clip_abs / 127.0).clamp_min(1.0 / 127.0) + q = torch.clamp(torch.round(clipped / scale[:, None]), -127, 127).to(torch.int8).contiguous() + return q, scale.to(dtype=INT8_PER_ROW_SCALE_DTYPE).contiguous() + clip_abs = float(torch.quantile(t32.abs().flatten(), INT8_CLIP_Q).item()) if t32.numel() else 0.0 + scale = torch.tensor(clip_abs / 127.0 if clip_abs > 0 else 1.0, dtype=torch.float32) + q = torch.clamp(torch.round(torch.clamp(t32, -clip_abs, clip_abs) / scale), -127, 127).to(torch.int8).contiguous() + return q, scale +def quantize_state_dict_int8(state_dict: dict[str, Tensor]): + quantized: dict[str, Tensor] = {} + scales: dict[str, Tensor] = {} + dtypes: dict[str, str] = {} + passthrough: dict[str, Tensor] = {} + passthrough_orig_dtypes: dict[str, str] = {} + qmeta: dict[str, dict[str, object]] = {} + stats = dict.fromkeys( + ("param_count", "num_tensors", "num_float_tensors", "num_nonfloat_tensors", "baseline_tensor_bytes", "int8_payload_bytes"), + 0, + ) + for name, tensor in state_dict.items(): + t = tensor.detach().to("cpu").contiguous() + stats["param_count"] += int(t.numel()) + stats["num_tensors"] += 1 + stats["baseline_tensor_bytes"] += tensor_nbytes(t) + if not t.is_floating_point(): + stats["num_nonfloat_tensors"] += 1 + passthrough[name] = t + stats["int8_payload_bytes"] += tensor_nbytes(t) + continue + if t.numel() <= INT8_KEEP_FLOAT_MAX_NUMEL: + kept = keep_float_tensor(name, t, passthrough_orig_dtypes) + passthrough[name] = kept + stats["int8_payload_bytes"] += tensor_nbytes(kept) + continue + stats["num_float_tensors"] += 1 + q, s = quantize_float_tensor(t) + if s.ndim > 0: + qmeta[name] = {"scheme": "per_row", "axis": 0} + quantized[name] = q + scales[name] = s + dtypes[name] = str(t.dtype).removeprefix("torch.") + stats["int8_payload_bytes"] += tensor_nbytes(q) + tensor_nbytes(s) + obj: dict[str, object] = { + "__quant_format__": "int8_clean_per_row_v1", + "quantized": quantized, + "scales": scales, + "dtypes": dtypes, + "passthrough": passthrough, + } + if qmeta: + obj["qmeta"] = qmeta + if passthrough_orig_dtypes: + obj["passthrough_orig_dtypes"] = passthrough_orig_dtypes + return obj, stats +def dequantize_state_dict_int8(obj: dict[str, object]) -> dict[str, Tensor]: + out: dict[str, Tensor] = {} + qmeta = obj.get("qmeta", {}) + passthrough_orig_dtypes = obj.get("passthrough_orig_dtypes", {}) + for name, q in obj["quantized"].items(): + dtype = getattr(torch, obj["dtypes"][name]) + s = obj["scales"][name] + if qmeta.get(name, {}).get("scheme") == "per_row" or s.ndim > 0: + s = s.to(dtype=torch.float32) + out[name] = (q.float() * s.view(q.shape[0], *([1] * (q.ndim - 1)))).to(dtype=dtype).contiguous() + else: + scale = float(s.item()) + out[name] = (q.float() * scale).to(dtype=dtype).contiguous() + for name, t in obj["passthrough"].items(): + out_t = t.detach().to("cpu").contiguous() + orig_dtype = passthrough_orig_dtypes.get(name) + if isinstance(orig_dtype, str): + out_t = out_t.to(dtype=getattr(torch, orig_dtype)).contiguous() + out[name] = out_t + return out +def load_data_shard(file: Path) -> Tensor: + header_bytes = 256 * np.dtype(" None: + self.file_idx = (self.file_idx + 1) % len(self.files) + self.tokens = load_data_shard(self.files[self.file_idx]) + self.pos = 0 + def take(self, n: int) -> Tensor: + chunks: list[Tensor] = [] + remaining = n + while remaining > 0: + avail = self.tokens.numel() - self.pos + if avail <= 0: + self._advance_file() + continue + k = min(remaining, avail) + chunks.append(self.tokens[self.pos : self.pos + k]) + self.pos += k + remaining -= k + return chunks[0] if len(chunks) == 1 else torch.cat(chunks) +class DistributedTokenLoader: + def __init__(self, pattern: str, rank: int, world_size: int, device: torch.device): + self.rank = rank + self.world_size = world_size + self.device = device + self.stream = TokenStream(pattern) + def next_batch(self, global_tokens: int, seq_len: int, grad_accum_steps: int) -> tuple[Tensor, Tensor]: + local_tokens = global_tokens // (self.world_size * grad_accum_steps) + per_rank_span = local_tokens + 1 + chunk = self.stream.take(per_rank_span * self.world_size) + start = self.rank * per_rank_span + local = chunk[start : start + per_rank_span].to(dtype=torch.int64) + x = local[:-1].reshape(-1, seq_len) + y = local[1:].reshape(-1, seq_len) + return x.to(self.device, non_blocking=True), y.to(self.device, non_blocking=True) +class RMSNorm(nn.Module): + def __init__(self, eps: float | None = None): + super().__init__() + self.eps = eps + def forward(self, x: Tensor) -> Tensor: + return F.rms_norm(x, (x.size(-1),), eps=self.eps) +class CastedLinear(nn.Linear): + _qat_enabled: bool = False + def forward(self, x: Tensor) -> Tensor: + w = self.weight.to(x.dtype) + if CastedLinear._qat_enabled and self.training and w.ndim == 2: + with torch.no_grad(): + w32 = self.weight.float() + # Use 99.95th percentile clipping to match GPTQ export quantizer + row_clip = torch.quantile(w32.abs(), 0.9995, dim=1) + scale = (row_clip / 31.0).clamp_min(1.0 / 31.0) + w_q = (torch.clamp(torch.round(w32 / scale[:, None]), -32, 31) * scale[:, None]).to(x.dtype) + w = w + (w_q - w).detach() + bias = self.bias.to(x.dtype) if self.bias is not None else None + return F.linear(x, w, bias) +def restore_low_dim_params_to_fp32(module: nn.Module) -> None: + with torch.no_grad(): + for name, param in module.named_parameters(): + if (param.ndim < 2 or any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS)) and param.dtype != torch.float32: + param.data = param.data.float() +class Rotary(nn.Module): + def __init__(self, dim: int, base: float = 10000.0, train_seq_len: int = 1024, rope_dims: int = 0): + super().__init__() + self.dim = dim + self.base = base + self.train_seq_len = train_seq_len + self.rope_dims = rope_dims if rope_dims > 0 else dim + inv_freq = 1.0 / (base ** (torch.arange(0, self.rope_dims, 2, dtype=torch.float32) / self.rope_dims)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self._seq_len_cached = 0 + self._cos_cached: Tensor | None = None + self._sin_cached: Tensor | None = None + def forward(self, seq_len: int, device: torch.device, dtype: torch.dtype) -> tuple[Tensor, Tensor]: + if ( + self._cos_cached is None + or self._sin_cached is None + or self._seq_len_cached != seq_len + or self._cos_cached.device != device + ): + rd = self.rope_dims + if seq_len > self.train_seq_len: + scale = seq_len / self.train_seq_len + new_base = self.base * (scale ** (rd / (rd - 2))) + inv_freq = 1.0 / (new_base ** (torch.arange(0, rd, 2, dtype=torch.float32, device=device) / rd)) + else: + inv_freq = self.inv_freq.to(device) + t = torch.arange(seq_len, device=device, dtype=inv_freq.dtype) + freqs = torch.outer(t, inv_freq) + self._cos_cached = freqs.cos()[None, :, None, :] + self._sin_cached = freqs.sin()[None, :, None, :] + self._seq_len_cached = seq_len + return self._cos_cached.to(dtype=dtype), self._sin_cached.to(dtype=dtype) +def apply_rotary_emb(x: Tensor, cos: Tensor, sin: Tensor, rope_dims: int = 0) -> Tensor: + if rope_dims > 0 and rope_dims < x.size(-1): + x_rope, x_pass = x[..., :rope_dims], x[..., rope_dims:] + half = rope_dims // 2 + x1, x2 = x_rope[..., :half], x_rope[..., half:] + x_rope = torch.cat((x1 * cos + x2 * sin, x1 * (-sin) + x2 * cos), dim=-1) + return torch.cat((x_rope, x_pass), dim=-1) + half = x.size(-1) // 2 + x1, x2 = x[..., :half], x[..., half:] + return torch.cat((x1 * cos + x2 * sin, x1 * (-sin) + x2 * cos), dim=-1) +class CausalSelfAttention(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + num_kv_heads: int, + rope_base: float, + qk_gain_init: float, + ): + super().__init__() + if dim % num_heads != 0: + raise ValueError("model_dim must be divisible by num_heads") + if num_heads % num_kv_heads != 0: + raise ValueError("num_heads must be divisible by num_kv_heads") + self.num_heads = num_heads + self.num_kv_heads = num_kv_heads + self.head_dim = dim // num_heads + if self.head_dim % 2 != 0: + raise ValueError("head_dim must be even for RoPE") + kv_dim = self.num_kv_heads * self.head_dim + self.c_q = CastedLinear(dim, dim, bias=False) + self.c_k = CastedLinear(dim, kv_dim, bias=False) + self.c_v = CastedLinear(dim, kv_dim, bias=False) + self.proj = CastedLinear(dim, dim, bias=False) + self.proj._zero_init = True + self.q_gain = nn.Parameter(torch.full((num_heads,), qk_gain_init, dtype=torch.float32)) + self.rope_dims = 0 # set by GPT.__init__ for partial RoPE + self.rotary = Rotary(self.head_dim, base=rope_base, train_seq_len=1024) + self.use_xsa = False # set by GPT.__init__ for deep layers only + def _xsa_efficient(self, y: Tensor, v: Tensor) -> Tensor: + """Efficient XSA: subtract self-value projection via GQA-aware reshape (no repeat_interleave). + y: [B, T, H, D], v: [B, T, Hkv, D]. H must be divisible by Hkv.""" + B, T, H, D = y.shape + Hkv = v.size(-2) + group = H // Hkv + y_g = y.reshape(B, T, Hkv, group, D) # [B, T, Hkv, group, D] + vn = F.normalize(v, dim=-1).unsqueeze(-2) # [B, T, Hkv, 1, D] — broadcast ready + proj = (y_g * vn).sum(dim=-1, keepdim=True) * vn + return (y_g - proj).reshape(B, T, H, D) + def forward(self, x: Tensor, v_embed: Tensor | None = None) -> Tensor: + bsz, seqlen, dim = x.shape + q = self.c_q(x).reshape(bsz, seqlen, self.num_heads, self.head_dim) + k = self.c_k(x).reshape(bsz, seqlen, self.num_kv_heads, self.head_dim) + v = self.c_v(x) + if v_embed is not None: + v = v + v_embed + v = v.reshape(bsz, seqlen, self.num_kv_heads, self.head_dim) + q = F.rms_norm(q, (q.size(-1),)) + k = F.rms_norm(k, (k.size(-1),)) + cos, sin = self.rotary(seqlen, x.device, q.dtype) + q = apply_rotary_emb(q, cos, sin, self.rope_dims) + k = apply_rotary_emb(k, cos, sin, self.rope_dims) + q = q * self.q_gain.to(dtype=q.dtype)[None, None, :, None] + y = flash_attn_3_func(q, k, v, causal=True) + if self.use_xsa: + y = self._xsa_efficient(y, v) + y = y.reshape(bsz, seqlen, dim) + return self.proj(y) +class SmearGate(nn.Module): + def __init__(self, dim: int): + super().__init__() + self.gate = nn.Parameter(torch.zeros(dim, dtype=torch.float32)) + def forward(self, x: Tensor) -> Tensor: + g = torch.sigmoid(self.gate.to(dtype=x.dtype))[None, None, :] + x_prev = torch.cat([torch.zeros_like(x[:, :1]), x[:, :-1]], dim=1) + return (1 - g) * x + g * x_prev +class BigramHashEmbedding(nn.Module): + def __init__(self, bigram_vocab_size: int, bigram_dim: int, model_dim: int): + super().__init__() + self.bigram_vocab_size = bigram_vocab_size + self.embed = nn.Embedding(bigram_vocab_size, bigram_dim) + nn.init.zeros_(self.embed.weight) + self.proj = CastedLinear(bigram_dim, model_dim, bias=False) if bigram_dim != model_dim else None + if self.proj is not None: + nn.init.zeros_(self.proj.weight) + self.scale = nn.Parameter(torch.tensor(0.05, dtype=torch.float32)) + def bigram_hash(self, tokens: Tensor) -> Tensor: + t = tokens.to(torch.int32) + mod = self.bigram_vocab_size - 1 + out = torch.empty_like(t) + out[..., 0] = mod + out[..., 1:] = torch.bitwise_xor(36313 * t[..., 1:], 27191 * t[..., :-1]) % mod + return out.long() + def forward(self, token_ids: Tensor) -> Tensor: + h = self.embed(self.bigram_hash(token_ids)) + if self.proj is not None: + h = self.proj(h) + return h * self.scale.to(dtype=h.dtype) +class ValueEmbedding(nn.Module): + """Reinject token identity into attention values at specific layers. + Each table maps vocab tokens to a low-dim embedding, projected to model_dim.""" + def __init__(self, vocab_size: int, ve_dim: int, model_dim: int): + super().__init__() + self.embed = nn.Embedding(vocab_size, ve_dim) + nn.init.normal_(self.embed.weight, std=0.01) + self.proj = CastedLinear(ve_dim, model_dim, bias=False) if ve_dim != model_dim else None + if self.proj is not None: + nn.init.zeros_(self.proj.weight) + self.scale = nn.Parameter(torch.tensor(0.1, dtype=torch.float32)) + def forward(self, token_ids: Tensor) -> Tensor: + h = self.embed(token_ids) + if self.proj is not None: + h = self.proj(h) + return h * self.scale.to(dtype=h.dtype) +class MLP(nn.Module): + def __init__(self, dim: int, mlp_mult: int, mlp_act: str = "relu_sq", mlp_leaky_slope: float = 0.5): + super().__init__() + hidden = int(mlp_mult * dim) + self.fc = CastedLinear(dim, hidden, bias=False) + self.proj = CastedLinear(hidden, dim, bias=False) + self.proj._zero_init = True + self.mlp_act = mlp_act + self.mlp_leaky_slope = mlp_leaky_slope + if self.mlp_act not in {"relu_sq", "leaky_relu_sq"}: + raise ValueError(f"Unsupported MLP_ACT '{self.mlp_act}'. Use 'relu_sq' or 'leaky_relu_sq'.") + def forward(self, x: Tensor) -> Tensor: + x = self.fc(x) + if self.mlp_act == "leaky_relu_sq": + x = F.leaky_relu(x, negative_slope=self.mlp_leaky_slope) + else: + x = F.relu(x) + return self.proj(x.square()) +class Block(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + num_kv_heads: int, + mlp_mult: int, + rope_base: float, + qk_gain_init: float, + layer_idx: int = 0, + ln_scale: bool = False, + dtg: bool = False, + mlp_act: str = "relu_sq", + mlp_leaky_slope: float = 0.5, + ): + super().__init__() + self.attn_norm = RMSNorm() + self.mlp_norm = RMSNorm() + self.attn = CausalSelfAttention(dim, num_heads, num_kv_heads, rope_base, qk_gain_init) + self.mlp = MLP(dim, mlp_mult, mlp_act=mlp_act, mlp_leaky_slope=mlp_leaky_slope) + self.attn_scale = nn.Parameter(torch.ones(dim, dtype=torch.float32)) + self.mlp_scale = nn.Parameter(torch.ones(dim, dtype=torch.float32)) + self.resid_mix = nn.Parameter(torch.stack((torch.ones(dim), torch.zeros(dim))).float()) + self.ln_scale_factor = 1.0 / math.sqrt(layer_idx + 1) if ln_scale else 1.0 + if dtg: + self.dtg_gate = nn.Linear(dim, 1, bias=True) + nn.init.zeros_(self.dtg_gate.weight) + nn.init.constant_(self.dtg_gate.bias, 2.0) + else: + self.dtg_gate = None + def forward(self, x: Tensor, x0: Tensor, v_embed: Tensor | None = None) -> Tensor: + mix = self.resid_mix.to(dtype=x.dtype) + x_in = mix[0][None, None, :] * x + mix[1][None, None, :] * x0 + attn_out = self.attn(self.attn_norm(x_in) * self.ln_scale_factor, v_embed=v_embed) + x_out = x_in + self.attn_scale.to(dtype=x_in.dtype)[None, None, :] * attn_out + x_out = x_out + self.mlp_scale.to(dtype=x_out.dtype)[None, None, :] * self.mlp(self.mlp_norm(x_out) * self.ln_scale_factor) + if self.dtg_gate is not None: + gate = torch.sigmoid(self.dtg_gate(x_in.detach())) + x_out = x_in + gate * (x_out - x_in) + return x_out +class GPT(nn.Module): + def __init__( + self, + vocab_size: int, + num_layers: int, + model_dim: int, + num_heads: int, + num_kv_heads: int, + mlp_mult: int, + tie_embeddings: bool, + tied_embed_init_std: float, + logit_softcap: float, + rope_base: float, + qk_gain_init: float, + mtp_num_heads: int = 0, + mtp_loss_weight: float = 0.1, + bigram_vocab_size: int = 0, + bigram_dim: int = 128, + xsa_last_n: int = 0, + rope_dims: int = 0, + ln_scale: bool = False, + dtg: bool = False, + ve_enabled: bool = False, + ve_dim: int = 128, + ve_layers: str = "9,10", + mlp_act: str = "relu_sq", + mlp_leaky_slope: float = 0.5, + f1_corr_rank: int = 0, + f1_corr_scale_init: float = 0.10, + ): + super().__init__() + self._ve_target_dim = num_kv_heads * (model_dim // num_heads) # kv_dim for value projection + if logit_softcap <= 0.0: + raise ValueError(f"logit_softcap must be positive, got {logit_softcap}") + self.tie_embeddings = tie_embeddings + self.tied_embed_init_std = tied_embed_init_std + self.logit_softcap = logit_softcap + self.mtp_num_heads = mtp_num_heads + self.mtp_loss_weight = mtp_loss_weight + self.tok_emb = nn.Embedding(vocab_size, model_dim) + self.bigram = BigramHashEmbedding(bigram_vocab_size, bigram_dim, model_dim) if bigram_vocab_size > 0 else None + self.smear = SmearGate(model_dim) + self.num_encoder_layers = num_layers // 2 + self.num_decoder_layers = num_layers - self.num_encoder_layers + self.num_skip_weights = min(self.num_encoder_layers, self.num_decoder_layers) + self.skip_weights = nn.Parameter(torch.ones(self.num_skip_weights, model_dim, dtype=torch.float32)) + self.blocks = nn.ModuleList( + [ + Block( + model_dim, + num_heads, + num_kv_heads, + mlp_mult, + rope_base, + qk_gain_init, + layer_idx=i, + ln_scale=ln_scale, + dtg=dtg, + mlp_act=mlp_act, + mlp_leaky_slope=mlp_leaky_slope, + ) + for i in range(num_layers) + ] + ) + if rope_dims > 0: + head_dim = model_dim // num_heads + for block in self.blocks: + block.attn.rope_dims = rope_dims + block.attn.rotary = Rotary(head_dim, base=rope_base, train_seq_len=1024, rope_dims=rope_dims) + self.ve_layer_indices = [int(x) for x in ve_layers.split(",") if x.strip()] if ve_enabled else [] + kv_dim = self._ve_target_dim + if self.ve_layer_indices: + self.ve_shared = ValueEmbedding(vocab_size, ve_dim, kv_dim) + self.ve_layer_scales = nn.ParameterList( + [nn.Parameter(torch.ones(1, dtype=torch.float32)) for _ in self.ve_layer_indices] + ) + else: + self.ve_shared = None + self.ve_layer_scales = nn.ParameterList() + self.value_embeds = nn.ModuleList() # keep empty for compat + self.final_norm = RMSNorm() + self.lm_head = None if tie_embeddings else CastedLinear(model_dim, vocab_size, bias=False) + if self.lm_head is not None: + self.lm_head._zero_init = True + self.mtp_heads = nn.ModuleList( + [CastedLinear(model_dim, vocab_size, bias=False) for _ in range(mtp_num_heads)] + ) + for head in self.mtp_heads: + head._zero_init = True + # Low-rank correction path for extra capacity under size budget. + self.f1_corr_rank = f1_corr_rank + if f1_corr_rank > 0: + self.f1_corr_in = CastedLinear(model_dim, f1_corr_rank, bias=False) + self.f1_corr_out = CastedLinear(f1_corr_rank, vocab_size, bias=False) + self.f1_corr_out._zero_init = True + self.f1_corr_scale = nn.Parameter(torch.tensor(f1_corr_scale_init, dtype=torch.float32)) + else: + self.f1_corr_in = None + self.f1_corr_out = None + self.f1_corr_scale = None + if xsa_last_n > 0: + for i in range(max(0, num_layers - xsa_last_n), num_layers): + self.blocks[i].attn.use_xsa = True + self._init_weights() + def _init_weights(self) -> None: + if self.tie_embeddings: + nn.init.normal_(self.tok_emb.weight, mean=0.0, std=self.tied_embed_init_std) + num_layers = len(self.blocks) + for name, module in self.named_modules(): + if isinstance(module, nn.Linear): + if getattr(module, "_zero_init", False): + nn.init.zeros_(module.weight) + elif module.weight.ndim == 2 and module.weight.shape[0] >= 64 and module.weight.shape[1] >= 64: + nn.init.orthogonal_(module.weight, gain=1.0) + if ".proj." in name or name.endswith(".proj"): + with torch.no_grad(): + module.weight.mul_(1.0 / math.sqrt(2 * num_layers)) + def _get_ve(self, layer_idx: int, input_ids: Tensor, ve_cache: dict | None = None) -> Tensor | None: + """Get value embedding for a specific layer using shared table + per-layer scale.""" + if self.ve_shared is None or layer_idx not in self.ve_layer_indices: + return None + if ve_cache is not None and 've' not in ve_cache: + ve_cache['ve'] = self.ve_shared(input_ids) + ve_base = ve_cache['ve'] if ve_cache is not None else self.ve_shared(input_ids) + ve_idx = self.ve_layer_indices.index(layer_idx) + return ve_base * self.ve_layer_scales[ve_idx].to(dtype=ve_base.dtype) + def forward(self, input_ids: Tensor, target_ids: Tensor) -> Tensor: + x = self.tok_emb(input_ids) + if self.bigram is not None: + x = x + self.bigram(input_ids) + x = F.rms_norm(x, (x.size(-1),)) + x = self.smear(x) + x0 = x + skips: list[Tensor] = [] + ve_cache: dict = {} + for i in range(self.num_encoder_layers): + ve = self._get_ve(i, input_ids, ve_cache) + x = self.blocks[i](x, x0, v_embed=ve) + skips.append(x) + for i in range(self.num_decoder_layers): + bi = self.num_encoder_layers + i + if skips: + x = x + self.skip_weights[i].to(dtype=x.dtype)[None, None, :] * skips.pop() + ve = self._get_ve(bi, input_ids, ve_cache) + x = self.blocks[bi](x, x0, v_embed=ve) + x = self.final_norm(x) + x_flat = x.reshape(-1, x.size(-1)) + targets = target_ids.reshape(-1) + if self.tie_embeddings: + logits_proj = F.linear(x_flat, self.tok_emb.weight) + else: + if self.lm_head is None: + raise RuntimeError("lm_head is required when tie_embeddings=False") + logits_proj = self.lm_head(x_flat) + if self.f1_corr_in is not None and self.f1_corr_out is not None and self.f1_corr_scale is not None: + corr_hidden = F.silu(self.f1_corr_in(x_flat)) + corr_proj = self.f1_corr_out(corr_hidden) + logits_proj = logits_proj + self.f1_corr_scale.to(dtype=logits_proj.dtype) * corr_proj + logits = self.logit_softcap * torch.tanh(logits_proj / self.logit_softcap) + if hasattr(self, '_ngram_tracker') and self._ngram_tracker is not None and self.training: + per_tok_loss = F.cross_entropy(logits.float(), targets, reduction="none") + weights = self._ngram_tracker.get_weights(input_ids, target_ids) + main_loss = (per_tok_loss * weights).mean() + else: + main_loss = F.cross_entropy(logits.float(), targets, reduction="mean") + if self.training and self.mtp_num_heads > 0 and self.mtp_loss_weight > 0.0: + _, seqlen, dim = x.shape + mtp_loss_sum = x.new_zeros(()) + mtp_loss_count = 0 + for k, mtp_head in enumerate(self.mtp_heads): + valid_t = seqlen - (k + 1) + if valid_t <= 0: + continue + mtp_hidden = x[:, :valid_t, :].reshape(-1, dim) + mtp_targets = target_ids[:, k + 1 :].reshape(-1) + mtp_logits_proj = mtp_head(mtp_hidden) + mtp_logits = self.logit_softcap * torch.tanh(mtp_logits_proj / self.logit_softcap) + mtp_loss_sum = mtp_loss_sum + F.cross_entropy(mtp_logits.float(), mtp_targets, reduction="mean") + mtp_loss_count += 1 + if mtp_loss_count > 0: + main_loss = main_loss + self.mtp_loss_weight * (mtp_loss_sum / mtp_loss_count) + return main_loss + def forward_logits(self, input_ids: Tensor) -> Tensor: + """Return logits (bsz, seq_len, vocab) without computing loss.""" + x = self.tok_emb(input_ids) + if self.bigram is not None: + x = x + self.bigram(input_ids) + x = F.rms_norm(x, (x.size(-1),)) + x = self.smear(x) + x0 = x + skips: list[Tensor] = [] + ve_cache: dict = {} + for i in range(self.num_encoder_layers): + ve = self._get_ve(i, input_ids, ve_cache) + x = self.blocks[i](x, x0, v_embed=ve) + skips.append(x) + for i in range(self.num_decoder_layers): + bi = self.num_encoder_layers + i + if skips: + x = x + self.skip_weights[i].to(dtype=x.dtype)[None, None, :] * skips.pop() + ve = self._get_ve(bi, input_ids, ve_cache) + x = self.blocks[bi](x, x0, v_embed=ve) + x = self.final_norm(x) + if self.tie_embeddings: + logits_proj = F.linear(x, self.tok_emb.weight) + else: + logits_proj = self.lm_head(x) + if self.f1_corr_in is not None and self.f1_corr_out is not None and self.f1_corr_scale is not None: + corr_hidden = F.silu(self.f1_corr_in(x)) + corr_proj = self.f1_corr_out(corr_hidden) + logits_proj = logits_proj + self.f1_corr_scale.to(dtype=logits_proj.dtype) * corr_proj + return self.logit_softcap * torch.tanh(logits_proj / self.logit_softcap) +def eval_val_sliding( + args: Hyperparameters, + base_model: nn.Module, + rank: int, + world_size: int, + device: torch.device, + val_tokens: Tensor, + base_bytes_lut: Tensor, + has_leading_space_lut: Tensor, + is_boundary_token_lut: Tensor, + stride: int, + batch_seqs: int = 128, + eval_seq_len: int | None = None, +) -> tuple[float, float]: + """Sliding window evaluation: each token scored with maximum context.""" + seq_len = eval_seq_len or args.train_seq_len + total_tokens = val_tokens.numel() - 1 + window_starts = [ws for ws in range(0, total_tokens, stride) + if min(ws + seq_len, total_tokens) - ws >= 1] + total_windows = len(window_starts) + my_s = (total_windows * rank) // world_size + my_e = (total_windows * (rank + 1)) // world_size + my_windows = window_starts[my_s:my_e] + loss_sum = torch.zeros((), device=device, dtype=torch.float64) + token_count = torch.zeros((), device=device, dtype=torch.float64) + byte_count = torch.zeros((), device=device, dtype=torch.float64) + base_model.eval() + compiled_logits = maybe_torch_compile(base_model.forward_logits, args) + with torch.inference_mode(): + for bi in range(0, len(my_windows), batch_seqs): + batch_ws = my_windows[bi:bi + batch_seqs] + bsz = len(batch_ws) + x_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + y_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + wlens: list[int] = [] + for i, ws in enumerate(batch_ws): + end = min(ws + seq_len, total_tokens) + wlen = end - ws + wlens.append(wlen) + chunk = val_tokens[ws:end + 1].to(dtype=torch.int64, device=device) + x_batch[i, :wlen] = chunk[:-1] + y_batch[i, :wlen] = chunk[1:] + with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + logits = compiled_logits(x_batch) + nll = F.cross_entropy( + logits.reshape(-1, logits.size(-1)).float(), + y_batch.reshape(-1), + reduction="none", + ).reshape(bsz, seq_len) + for i, ws in enumerate(batch_ws): + wlen = wlens[i] + s = 0 if ws == 0 else max(wlen - stride, 0) + scored_nll = nll[i, s:wlen].to(torch.float64) + loss_sum += scored_nll.sum() + token_count += float(wlen - s) + tgt = y_batch[i, s:wlen] + prev = x_batch[i, s:wlen] + tb = base_bytes_lut[tgt].to(torch.float64) + tb += (has_leading_space_lut[tgt] & ~is_boundary_token_lut[prev]).to(torch.float64) + byte_count += tb.sum() + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(loss_sum, op=dist.ReduceOp.SUM) + dist.all_reduce(token_count, op=dist.ReduceOp.SUM) + dist.all_reduce(byte_count, op=dist.ReduceOp.SUM) + val_loss = (loss_sum / token_count).item() + bits_per_token = val_loss / math.log(2.0) + tokens_per_byte = token_count.item() / byte_count.item() + base_model.train() + return val_loss, bits_per_token * tokens_per_byte +def _ngram_bulk_update(val_np, start, end, ctx_tables, full_tables, + min_order, max_order, primes, mask): + """Bulk update n-gram tables with a contiguous range of tokens. + All ranks call this with the SAME token range -> identical tables everywhere.""" + t = val_np[start:end].astype(np.uint64) + n = len(t) + for order in range(min_order, max_order + 1): + if n < order: + continue + ctx_width = order - 1 + ctx_hash = np.zeros(n - order + 1, dtype=np.uint64) + for k in range(ctx_width): + ctx_hash ^= t[k:n - order + 1 + k] * primes[k % len(primes)] + ctx_key = (ctx_hash & mask).astype(np.int64) + tgt = t[order - 1:] + full_key = ((ctx_hash ^ (tgt * primes[ctx_width % len(primes)])) & mask).astype(np.int64) + ctx_tables[order] += np.bincount(ctx_key, minlength=len(ctx_tables[order])).astype(np.uint32) + full_tables[order] += np.bincount(full_key, minlength=len(full_tables[order])).astype(np.uint32) + +def eval_val_sliding_hashed_ngram( + args: Hyperparameters, + base_model: nn.Module, + rank: int, + world_size: int, + device: torch.device, + val_tokens: Tensor, + base_bytes_lut: Tensor, + has_leading_space_lut: Tensor, + is_boundary_token_lut: Tensor, + stride: int, + order: int, + alpha: float, + min_count: int, + buckets: int, + max_seconds: float = 0.0, + batch_seqs: int = 128, + eval_seq_len: int | None = None, +) -> tuple[float, float, float]: + """Score-first sliding eval with chunk-based SHARED n-gram tables + cubric. + + Key design: all ranks share identical n-gram tables via bulk chunk updates. + Each chunk's windows are distributed across ranks for scoring, then ALL ranks + update tables with the same contiguous token range. Every rank sees the full + n-gram picture (not 1/world_size like per-segment updates). + + Legal: entire chunk scored before its tokens update the tables. + """ + min_order = max(args.ngram_eval_min_order, 2) + max_order = max(order, min_order) + adaptive = args.ngram_eval_adaptive + alpha_min = args.ngram_eval_alpha_min + alpha_max = args.ngram_eval_alpha_max + ent_center = args.ngram_eval_entropy_center + ent_scale = args.ngram_eval_entropy_scale + + # Parse fixed per-order multipliers (PR #809 style) + _fixed_order_mults = None + if args.ngram_order_mults_str: + _fixed_order_mults = np.array([float(x) for x in args.ngram_order_mults_str.split(",")], dtype=np.float64) + + seq_len = eval_seq_len or args.train_seq_len + total_tokens = val_tokens.numel() - 1 + + # Build all windows and total scored tokens + all_window_starts = [ws for ws in range(0, total_tokens, stride) if min(ws + seq_len, total_tokens) - ws >= 1] + total_scored_tokens = 0.0 + for ws in all_window_starts: + end = min(ws + seq_len, total_tokens) + wlen = end - ws + s = 0 if ws == 0 else max(wlen - stride, 0) + total_scored_tokens += float(max(wlen - s, 0)) + + # Group windows into chunks by scored position -- all ranks share this grouping + chunk_tokens = int(os.environ.get("NGRAM_CHUNK_TOKENS", "1048576")) # 1M default + num_chunks = (total_tokens + chunk_tokens - 1) // chunk_tokens + chunk_windows: list[list[int]] = [[] for _ in range(num_chunks)] + for ws in all_window_starts: + end = min(ws + seq_len, total_tokens) + wlen = end - ws + s = 0 if ws == 0 else max(wlen - stride, 0) + scored_start = ws + s + ci = min(scored_start // chunk_tokens, num_chunks - 1) + chunk_windows[ci].append(ws) + + val_np = val_tokens.numpy() + ctx_tables = {n: np.zeros((buckets,), dtype=np.uint32) for n in range(min_order, max_order + 1)} + full_tables = {n: np.zeros((buckets,), dtype=np.uint32) for n in range(min_order, max_order + 1)} + mask = np.uint64(buckets - 1) + primes = np.array( + [np.uint64(36313), np.uint64(27191), np.uint64(51647), np.uint64(81929), + np.uint64(131071), np.uint64(174763), np.uint64(233017)], + dtype=np.uint64, + ) + + loss_sum = 0.0 + token_count = 0.0 + byte_count = 0.0 + + # Cubric 3D: per (order × entropy_bin × count_bin) adaptive alpha scaling + _NUM_ENT_BINS = 3 # low / mid / high entropy + _NUM_CNT_BINS = 3 # low / mid / high count + _ENT_EDGES = np.array([ent_center - 1.0, ent_center + 1.0]) # [2.0, 4.0] for center=3.0 + _CNT_EDGES = np.array([5.0, 50.0]) # low=<5, mid=5-50, high=>50 context count + _TOTAL_CELLS = _NUM_ENT_BINS * _NUM_CNT_BINS # 9 cells per order = 54 total + _cc = getattr(args, 'cubric_cadence', 0); _con = _cc > 0; _cfired = 0 + if _con: + # Warm-start: proven converged values from 4+ runs (orders 2-7) + # All 9 cells per order get the same warm-start, 3D cubric refines from there + _WARM = {2: 0.45, 3: 0.30, 4: 0.45, 5: 1.88, 6: 2.00, 7: 2.00, 8: 2.00, 9: 2.00} + _c_alpha_mult = {n: [_WARM.get(n, 1.0)] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + _c_hits = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + _c_beats = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + + base_model.eval() + compiled_logits = maybe_torch_compile(base_model.forward_logits, args) + t0 = time.perf_counter() + deadline = (t0 + max_seconds) if max_seconds > 0.0 else None + cutoff_hit = False + + if rank == 0: + print(f"ngram_eval:chunks={num_chunks} chunk_tokens={chunk_tokens} " + f"windows={len(all_window_starts)} shared_tables=True", flush=True) + + with torch.inference_mode(): + for ci in range(num_chunks): + if deadline is not None and time.perf_counter() >= deadline: + cutoff_hit = True + break + + windows = chunk_windows[ci] + if not windows: + continue + + # Distribute this chunk's windows across ranks + my_s = (len(windows) * rank) // world_size + my_e = (len(windows) * (rank + 1)) // world_size + my_windows = windows[my_s:my_e] + + # --- Phase 1: SCORE this chunk's windows --- + for bi in range(0, len(my_windows), batch_seqs): + batch_ws = my_windows[bi:bi + batch_seqs] + bsz = len(batch_ws) + x_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + y_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + wlens: list[int] = [] + for i, ws in enumerate(batch_ws): + end = min(ws + seq_len, total_tokens) + wlen = end - ws + wlens.append(wlen) + chunk = val_tokens[ws:end + 1].to(dtype=torch.int64, device=device) + x_batch[i, :wlen] = chunk[:-1] + y_batch[i, :wlen] = chunk[1:] + + with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + logits = compiled_logits(x_batch) + logits_f = logits.float() + nll = F.cross_entropy( + logits_f.reshape(-1, logits_f.size(-1)), + y_batch.reshape(-1), + reduction="none", + ).reshape(bsz, seq_len) + + for i, ws in enumerate(batch_ws): + wlen = wlens[i] + s = 0 if ws == 0 else max(wlen - stride, 0) + seg_len = wlen - s + if seg_len <= 0: + continue + + seg_nll = nll[i, s:wlen].to(torch.float64).cpu().numpy() + seg_model_p = np.exp(-seg_nll) + + if adaptive: + log_probs = F.log_softmax(logits_f[i, s:wlen], dim=-1) + probs_a = log_probs.exp() + entropy = -(probs_a * log_probs).sum(dim=-1).cpu().numpy() + sig = 1.0 / (1.0 + np.exp(-ent_scale * (entropy - ent_center))) + per_token_alpha = alpha_min + (alpha_max - alpha_min) * sig + # Bin entropy for 2D cubric: 0=low, 1=mid, 2=high + _ent_bins = np.digitize(entropy, _ENT_EDGES).astype(np.int32) + else: + per_token_alpha = np.full(seg_len, alpha) + _ent_bins = np.ones(seg_len, dtype=np.int32) # all mid + + global_j = np.arange(ws + s + 1, ws + wlen + 1, dtype=np.int64) + p_ng = np.zeros(seg_len, dtype=np.float64) + ng_matched = np.zeros(seg_len, dtype=np.bool_) + _ng_ord = np.zeros(seg_len, dtype=np.int32) + _ng_ctx_count = np.zeros(seg_len, dtype=np.float64) + tgt_np = val_np[global_j].astype(np.uint64) + + for n in range(max_order, min_order - 1, -1): + ctx_width = n - 1 + valid = (global_j >= ctx_width) & (~ng_matched) + if not valid.any(): + continue + v_idx = np.nonzero(valid)[0] + jv = global_j[v_idx] + ctx_hash = np.zeros(len(jv), dtype=np.uint64) + for k in range(ctx_width): + tok = val_np[jv - (ctx_width - k)].astype(np.uint64) + ctx_hash ^= tok * primes[k % len(primes)] + ctx_key = (ctx_hash & mask).astype(np.int64) + full_key = ((ctx_hash ^ (tgt_np[v_idx] * primes[ctx_width % len(primes)])) & mask).astype(np.int64) + ctx_counts = ctx_tables[n][ctx_key].astype(np.float64) + full_counts = full_tables[n][full_key].astype(np.float64) + has_data = ctx_counts >= float(min_count) + if has_data.any(): + p = np.minimum(full_counts, ctx_counts) / np.maximum(ctx_counts, 1.0) + p = np.clip(p, 0.0, 1.0) + hit_idx = v_idx[has_data] + p_ng[hit_idx] = p[has_data] + ng_matched[hit_idx] = True + _ng_ord[hit_idx] = n + _ng_ctx_count[hit_idx] = ctx_counts[has_data] + + # Mix where n-gram matched (PR #809 style or cubric 3D fallback) + if ng_matched.any(): + m_idx = np.nonzero(ng_matched)[0] + # Per-order entropy center shift (PR #809) + if adaptive and args.ngram_entropy_shift: + matched_ords = _ng_ord[m_idx].astype(np.float64) + shifted_centers = ent_center - 0.25 * (matched_ords - float(min_order)) + shifted_sig = 1.0 / (1.0 + np.exp(-ent_scale * (entropy[m_idx] - shifted_centers))) + per_token_alpha[m_idx] = alpha_min + (alpha_max - alpha_min) * shifted_sig + if _fixed_order_mults is not None: + # PR #809 fixed order multipliers (replaces cubric) + a = per_token_alpha[m_idx].copy() + mult_indices = _ng_ord[m_idx] - min_order + mult_indices = np.clip(mult_indices, 0, len(_fixed_order_mults) - 1) + a *= _fixed_order_mults[mult_indices] + np.clip(a, 0.0, 0.95, out=a) + elif _con: + a = per_token_alpha[m_idx].copy() + m_ent_bins = _ent_bins[m_idx] + m_cnt_bins = np.digitize(_ng_ctx_count[m_idx], _CNT_EDGES).astype(np.int32) + for n in range(min_order, max_order + 1): + om = _ng_ord[m_idx] == n + if not om.any(): + continue + for eb in range(_NUM_ENT_BINS): + for cb in range(_NUM_CNT_BINS): + cell = eb * _NUM_CNT_BINS + cb + mask_ecb = om & (m_ent_bins == eb) & (m_cnt_bins == cb) + if mask_ecb.any(): + _c_hits[n][cell] += int(mask_ecb.sum()) + _c_beats[n][cell] += int((p_ng[m_idx[mask_ecb]] > seg_model_p[m_idx[mask_ecb]]).sum()) + a[mask_ecb] *= _c_alpha_mult[n][cell] + np.clip(a, 0.0, 0.95, out=a) + else: + a = per_token_alpha[m_idx] + seg_model_p[m_idx] = (1.0 - a) * seg_model_p[m_idx] + a * p_ng[m_idx] + + seg_nll = -np.log(np.clip(seg_model_p, 1e-12, 1.0)) + loss_sum += float(seg_nll.sum()) + token_count += float(seg_len) + tgt = y_batch[i, s:wlen] + prev = x_batch[i, s:wlen] + tb = base_bytes_lut[tgt].to(torch.float64) + tb += (has_leading_space_lut[tgt] & ~is_boundary_token_lut[prev]).to(torch.float64) + byte_count += float(tb.sum().item()) + + # --- Phase 2: SHARED UPDATE -- all ranks update with same chunk tokens --- + chunk_start = ci * chunk_tokens + chunk_end = min((ci + 1) * chunk_tokens, total_tokens) + _ngram_bulk_update(val_np, chunk_start, chunk_end + 1, + ctx_tables, full_tables, min_order, max_order, + primes, mask) + + # Cubric 2D c-step: adapt per (order × entropy_bin) + if _con: + # Collect all (order, ent_bin, cnt_bin) cells with enough data + all_rates = [] + for n in range(min_order, max_order + 1): + for cell in range(_TOTAL_CELLS): + if _c_hits[n][cell] >= 8: + all_rates.append(_c_beats[n][cell] / _c_hits[n][cell]) + if len(all_rates) >= 4: + avg_rate = sum(all_rates) / len(all_rates) + for n in range(min_order, max_order + 1): + for cell in range(_TOTAL_CELLS): + if _c_hits[n][cell] >= 8: + rate = _c_beats[n][cell] / _c_hits[n][cell] + if rate > avg_rate + 0.05: + _c_alpha_mult[n][cell] = min(_c_alpha_mult[n][cell] * 1.03, 2.0) + elif rate < avg_rate - 0.05: + _c_alpha_mult[n][cell] = max(_c_alpha_mult[n][cell] * 0.97, 0.3) + _cfired += 1 + if rank == 0 and _cfired % 8 == 0: + parts = [] + for n in range(min_order, max_order + 1): + m = _c_alpha_mult[n] + avg_m = sum(m) / len(m) + parts.append(f"o{n}:avg={avg_m:.2f}") + print(f"cubric3d:step={_cfired} {' '.join(parts)}", flush=True) + _c_hits = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + _c_beats = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + + # Progress + if rank == 0 and (ci % 10 == 0 or ci == num_chunks - 1 or ci < 3): + elapsed = time.perf_counter() - t0 + cur_bpb = (loss_sum / max(token_count, 1.0)) / math.log(2.0) * (token_count / max(byte_count, 1.0)) if token_count > 0 else 0.0 + print( + f"ngram_eval:chunk [{ci+1}/{num_chunks}] bpb={cur_bpb:.6f} t={elapsed:.0f}s", + flush=True, + ) + + # All-reduce across ranks + _loss = torch.tensor(loss_sum, device=device, dtype=torch.float64) + _toks = torch.tensor(token_count, device=device, dtype=torch.float64) + _bytes = torch.tensor(byte_count, device=device, dtype=torch.float64) + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(_loss, op=dist.ReduceOp.SUM) + dist.all_reduce(_toks, op=dist.ReduceOp.SUM) + dist.all_reduce(_bytes, op=dist.ReduceOp.SUM) + loss_sum = _loss.item() + token_count = _toks.item() + byte_count = _bytes.item() + + coverage = token_count / max(total_scored_tokens, 1.0) + if cutoff_hit: + elapsed = time.perf_counter() - t0 + print( + f"ngram_eval:cutoff max_seconds={max_seconds:.1f} " + f"coverage={coverage*100:.2f}% elapsed={elapsed:.0f}s", + flush=True, + ) + + if _con and rank == 0: + print(f"cubric3d:final c_steps={_cfired} cells={_TOTAL_CELLS}x{max_order-min_order+1}={_TOTAL_CELLS*(max_order-min_order+1)}", flush=True) + for n in range(min_order, max_order + 1): + m = _c_alpha_mult[n] + row = " ".join(f"{m[cell]:.2f}" for cell in range(_TOTAL_CELLS)) + print(f" o{n}: [{row}]", flush=True) + val_loss = loss_sum / max(token_count, 1.0) + val_bpb = val_loss / math.log(2.0) * (token_count / max(byte_count, 1.0)) + base_model.train() + return val_loss, val_bpb, coverage +def _classify_param(name: str) -> str: + if "tok_emb" in name or "lm_head" in name: + return "embed" + if "f1_corr_in" in name or "f1_corr_out" in name: + return "aux" + if ".mlp." in name: + return "mlp" + if ".attn." in name or (".proj." in name and ".mlp." not in name): + return "attn" + return "other" +# --------------------------------------------------------------------------- +# GPTQ: Hessian-aware quantization with column-wise error compensation +# --------------------------------------------------------------------------- +def _find_best_row_scales(W: Tensor, clip_range: int = 31) -> Tensor: + """Find optimal per-row scales by searching percentile clipping thresholds.""" + t32 = W.float() + best_s = t32.abs().amax(dim=1) / clip_range + best_s = best_s.clamp_min(1.0 / clip_range) + best_err = torch.full((t32.shape[0],), float('inf')) + for pct in [0.9990, 0.9995, 0.9999, 0.99999, 1.0]: + if pct < 1.0: + row_clip = torch.quantile(t32.abs(), pct, dim=1) + else: + row_clip = t32.abs().amax(dim=1) + s = (row_clip / clip_range).clamp_min(1.0 / clip_range) + q = torch.clamp(torch.round(t32 / s[:, None]), -clip_range, clip_range) + recon = q * s[:, None] + err = (t32 - recon).pow(2).mean(dim=1) + improved = err < best_err + best_s[improved] = s[improved] + best_err[improved] = err[improved] + return best_s +def gptq_quantize_weight(W: Tensor, H: Tensor, clip_range: int = 31, + block_size: int = 64, percdamp: float = 0.002) -> tuple[Tensor, Tensor]: + """GPTQ: quantize weight matrix W using Hessian H = X^T X for error compensation. + Uses pre-computed per-row scales and column reordering by Hessian diagonal. + Returns (quantized_int8, scale_fp16) in int6 range [-clip_range, clip_range].""" + W = W.float().clone() + rows, cols = W.shape + # Pre-compute optimal per-row scales from the original weight matrix + row_scale = _find_best_row_scales(W, clip_range) + H = H.float().clone() + damp = percdamp * H.diag().mean() + H.diagonal().add_(damp) + # Column reordering: process least-important columns first (ascending H_diag) + perm = torch.argsort(H.diag()) + invperm = torch.argsort(perm) + W = W[:, perm] + H = H[perm][:, perm] + try: + L = torch.linalg.cholesky(H) + Hinv = torch.cholesky_inverse(L) + except torch._C._LinAlgError: + Hinv = torch.diag(1.0 / H.diag().clamp_min(1e-6)) + Q = torch.zeros(rows, cols, dtype=torch.int8) + for i1 in range(0, cols, block_size): + i2 = min(i1 + block_size, cols) + W_block = W[:, i1:i2].clone() + Hinv_block = Hinv[i1:i2, i1:i2] + Err = torch.zeros_like(W_block) + for j in range(i2 - i1): + w_col = W_block[:, j] + h_inv_jj = Hinv_block[j, j].clamp_min(1e-8) + # Quantize using pre-computed per-row scales + q_col = torch.clamp(torch.round(w_col / row_scale), -clip_range, clip_range) + deq_col = q_col * row_scale + Q[:, i1 + j] = q_col.to(torch.int8) + err = (w_col - deq_col) / h_inv_jj + Err[:, j] = err + if j + 1 < i2 - i1: + W_block[:, j + 1:] -= err.unsqueeze(1) * Hinv_block[j, j + 1:].unsqueeze(0) + if i2 < cols: + W[:, i2:] -= Err @ Hinv[i1:i2, i2:] + # Undo column reordering + Q = Q[:, invperm] + return Q, row_scale.to(torch.float16) +def gptq_calibrate(model: nn.Module, train_pattern: str, device: torch.device, + n_samples: int = 256, seq_len: int = 2048) -> dict[str, Tensor]: + """Collect Hessian H = X^T X for each linear layer using training data.""" + hessians: dict[str, Tensor] = {} + n_seen: dict[str, int] = {} + hooks = [] + def make_hook(name: str): + def hook_fn(module, inp, out): + x = inp[0].detach().float() + if x.ndim == 3: + x = x.reshape(-1, x.shape[-1]) + if name not in hessians: + hessians[name] = torch.zeros(x.shape[1], x.shape[1], device=x.device, dtype=torch.float32) + n_seen[name] = 0 + hessians[name].addmm_(x.t(), x) + n_seen[name] += x.shape[0] + return hook_fn + for name, module in model.named_modules(): + if isinstance(module, (nn.Linear, CastedLinear)): + hooks.append(module.register_forward_hook(make_hook(name))) + stream = TokenStream(train_pattern) + model.eval() + with torch.no_grad(): + for _ in range(n_samples): + tokens = stream.take(seq_len + 1).to(device=device, dtype=torch.int64) + x = tokens[:-1].unsqueeze(0) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + model.forward_logits(x) + for h in hooks: + h.remove() + for name in hessians: + hessians[name] /= max(n_seen[name], 1) + return hessians +def mixed_quantize_int6_gptq(state_dict: dict[str, Tensor], int6_cats: set[str], + hessians: dict[str, Tensor]) -> tuple[dict, dict]: + """Like mixed_quantize_int6 but uses GPTQ for int6 categories when Hessian available.""" + result: dict[str, Tensor] = {} + meta: dict[str, object] = {} + gptq_count, naive_count = 0, 0 + for name, tensor in state_dict.items(): + t = tensor.detach().cpu().contiguous() + cat = _classify_param(name) + if not t.is_floating_point() or t.numel() <= 65536: + result[name] = t.to(torch.float16) if t.is_floating_point() else t + meta[name] = "passthrough" + continue + if any(p in name for p in CONTROL_TENSOR_NAME_PATTERNS): + result[name] = t.float() + meta[name] = "passthrough_ctrl" + continue + if cat in int6_cats and t.ndim == 2: + module_name = name.rsplit(".weight", 1)[0] if name.endswith(".weight") else name + H = hessians.get(module_name) + if H is not None and H.shape[0] == t.shape[1]: + q, s = gptq_quantize_weight(t, H.cpu()) + gptq_count += 1 + else: + q, s = quantize_int6_per_row(t) + naive_count += 1 + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int6"} + elif cat in int6_cats and t.ndim >= 1: + q, s = quantize_int6_per_row(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int6"} + naive_count += 1 + else: + q, s = quantize_float_tensor(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int8"} + print(f"gptq_quantize: {gptq_count} GPTQ layers, {naive_count} naive layers", flush=True) + return result, meta +def quantize_int6_per_row(t: Tensor, clip_range: int = 31) -> tuple[Tensor, Tensor]: + t32 = t.float() + if t32.ndim == 2: + best_q, best_s, best_err = None, None, float('inf') + for pct in [0.9990, 0.9995, 0.9999, 0.99999, 1.0]: + if pct < 1.0: + row_clip = torch.quantile(t32.abs(), pct, dim=1) + else: + row_clip = t32.abs().amax(dim=1) + s = (row_clip / clip_range).clamp_min(1.0 / clip_range).to(torch.float16) + q = torch.clamp(torch.round(t32 / s.float()[:, None]), -clip_range, clip_range).to(torch.int8) + recon = q.float() * s.float()[:, None] + err = (t32 - recon).pow(2).mean().item() + if err < best_err: + best_q, best_s, best_err = q, s, err + return best_q, best_s + amax = t32.abs().max().item() + scale = torch.tensor(amax / clip_range if amax > 0 else 1.0, dtype=torch.float16) + q = torch.clamp(torch.round(t32 / scale.float()), -clip_range, clip_range).to(torch.int8) + return q, scale +def mixed_quantize_int6(state_dict: dict[str, Tensor], int6_cats: set[str]): + num_layers_total = max( + (int(k.split(".")[1]) for k in state_dict if k.startswith("blocks.")), + default=0, + ) + 1 + late_k_layers = set(range(num_layers_total - 2, num_layers_total)) + result: dict[str, Tensor] = {} + meta: dict[str, object] = {} + for name, tensor in state_dict.items(): + t = tensor.detach().cpu().contiguous() + cat = _classify_param(name) + if not t.is_floating_point() or t.numel() <= 65536: + result[name] = t.to(torch.float16) if t.is_floating_point() else t + meta[name] = "passthrough" + continue + if any(p in name for p in CONTROL_TENSOR_NAME_PATTERNS): + result[name] = t.float() + meta[name] = "passthrough_ctrl" + continue + if cat in int6_cats and t.ndim >= 1: + q, s = quantize_int6_per_row(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int6"} + else: + q, s = quantize_float_tensor(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int8"} + return result, meta +def dequantize_mixed_int6(result: dict[str, Tensor], meta: dict[str, object], + template_sd: dict[str, Tensor]) -> dict[str, Tensor]: + out: dict[str, Tensor] = {} + for name, orig in template_sd.items(): + info = meta.get(name) + if info is None: + continue + orig_dtype = orig.dtype + if info in ("passthrough", "passthrough_ctrl", "passthrough_fp16"): + t = result[name] + if t.dtype == torch.float16 and orig_dtype in (torch.float32, torch.bfloat16): + t = t.to(orig_dtype) + out[name] = t + continue + q, s = result[name + ".q"], result[name + ".scale"] + if s.ndim > 0: + out[name] = (q.float() * s.float().view(q.shape[0], *([1] * (q.ndim - 1)))).to(orig_dtype) + else: + out[name] = (q.float() * float(s.item())).to(orig_dtype) + return out +def main() -> None: + global zeropower_via_newtonschulz5 + code = Path(__file__).read_text(encoding="utf-8") + args = Hyperparameters() + if args.compile_enabled: + zeropower_via_newtonschulz5 = torch.compile(zeropower_via_newtonschulz5) + distributed = "RANK" in os.environ and "WORLD_SIZE" in os.environ + rank = int(os.environ.get("RANK", "0")) + world_size = int(os.environ.get("WORLD_SIZE", "1")) + local_rank = int(os.environ.get("LOCAL_RANK", "0")) + if world_size <= 0: + raise ValueError(f"WORLD_SIZE must be positive, got {world_size}") + if 8 % world_size != 0: + raise ValueError(f"WORLD_SIZE={world_size} must divide 8 so grad_accum_steps stays integral") + grad_accum_steps = 8 // world_size + grad_scale = 1.0 / grad_accum_steps + if not torch.cuda.is_available(): + raise RuntimeError("CUDA is required") + device = torch.device("cuda", local_rank) + torch.cuda.set_device(device) + if distributed: + dist.init_process_group(backend="nccl", device_id=device) + dist.barrier() + master_process = rank == 0 + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True + from torch.backends.cuda import enable_cudnn_sdp, enable_flash_sdp, enable_math_sdp, enable_mem_efficient_sdp + enable_cudnn_sdp(False) + enable_flash_sdp(True) + enable_mem_efficient_sdp(False) + enable_math_sdp(False) + logfile = None + if master_process: + os.makedirs("logs", exist_ok=True) + logfile = f"logs/{args.run_id}.txt" + print(logfile) + def log0(msg: str, console: bool = True) -> None: + if not master_process: + return + if console: + print(msg) + if logfile is not None: + with open(logfile, "a", encoding="utf-8") as f: + print(msg, file=f) + log0(code, console=False) + log0("=" * 100, console=False) + log0(f"Running Python {sys.version}", console=False) + log0(f"Running PyTorch {torch.__version__}", console=False) + log0( + subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=False).stdout, + console=False, + ) + log0("=" * 100, console=False) + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.cuda.manual_seed_all(args.seed) + if not args.tokenizer_path.endswith(".model"): + raise ValueError(f"Script only setup for SentencePiece .model file: {args.tokenizer_path}") + sp = spm.SentencePieceProcessor(model_file=args.tokenizer_path) + if int(sp.vocab_size()) != args.vocab_size: + raise ValueError( + f"VOCAB_SIZE={args.vocab_size} does not match tokenizer vocab_size={int(sp.vocab_size())}" + ) + dataset_dir = Path(args.data_path).resolve() + actual_train_files = len(list(dataset_dir.glob("fineweb_train_*.bin"))) + effective_eval_seq_len = args.eval_seq_len if args.eval_seq_len > 0 else args.train_seq_len + val_seq_len = max(args.train_seq_len, effective_eval_seq_len) + val_tokens = load_validation_tokens(args.val_files, val_seq_len) + base_bytes_lut, has_leading_space_lut, is_boundary_token_lut = build_sentencepiece_luts( + sp, args.vocab_size, device + ) + log0(f"val_bpb:enabled tokenizer_kind=sentencepiece tokenizer_path={args.tokenizer_path}") + log0(f"train_loader:dataset:{dataset_dir.name} train_shards:{actual_train_files}") + log0(f"val_loader:shards pattern={args.val_files} tokens:{val_tokens.numel() - 1}") + CastedLinear._qat_enabled = args.qat_enabled + base_model = GPT( + vocab_size=args.vocab_size, + num_layers=args.num_layers, + model_dim=args.model_dim, + num_heads=args.num_heads, + num_kv_heads=args.num_kv_heads, + mlp_mult=args.mlp_mult, + tie_embeddings=args.tie_embeddings, + tied_embed_init_std=args.tied_embed_init_std, + logit_softcap=args.logit_softcap, + rope_base=args.rope_base, + qk_gain_init=args.qk_gain_init, + mtp_num_heads=args.mtp_num_heads, + mtp_loss_weight=args.mtp_loss_weight, + bigram_vocab_size=args.bigram_vocab_size, + bigram_dim=args.bigram_dim, + xsa_last_n=args.xsa_last_n, + rope_dims=args.rope_dims, + ln_scale=args.ln_scale, + dtg=args.dtg_enabled, + ve_enabled=args.ve_enabled, + ve_dim=args.ve_dim, + ve_layers=args.ve_layers, + mlp_act=args.mlp_act, + mlp_leaky_slope=args.mlp_leaky_slope, + f1_corr_rank=args.f1_corr_rank, + f1_corr_scale_init=args.f1_corr_scale_init, + ).to(device).bfloat16() + for module in base_model.modules(): + if isinstance(module, CastedLinear): + module.float() + restore_low_dim_params_to_fp32(base_model) + # Complementary training: downweight tokens predictable by bigrams + complement_alpha = float(os.environ.get("COMPLEMENT_ALPHA", "0")) + if complement_alpha > 0: + tracker = TrainNgramTracker(args.vocab_size, device, complement_alpha=complement_alpha) + base_model._ngram_tracker = tracker + log0(f"complementary_training:alpha={complement_alpha}") + else: + base_model._ngram_tracker = None + compiled_model = maybe_torch_compile(base_model, args) + model: nn.Module = DDP(compiled_model, device_ids=[local_rank], broadcast_buffers=False) if distributed else compiled_model + block_named_params = list(base_model.blocks.named_parameters()) + matrix_params = [ + p + for name, p in block_named_params + if p.ndim == 2 and not any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS) + ] + if base_model.mtp_num_heads > 0: + matrix_params.extend([p for p in base_model.mtp_heads.parameters() if p.ndim == 2]) + if base_model.f1_corr_in is not None and base_model.f1_corr_out is not None: + matrix_params.append(base_model.f1_corr_in.weight) + matrix_params.append(base_model.f1_corr_out.weight) + scalar_params = [ + p + for name, p in block_named_params + if p.ndim < 2 or any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS) + ] + if base_model.skip_weights.numel() > 0: + scalar_params.append(base_model.skip_weights) + scalar_params.append(base_model.smear.gate) + if base_model.bigram is not None: + scalar_params.append(base_model.bigram.scale) + if base_model.f1_corr_scale is not None: + scalar_params.append(base_model.f1_corr_scale) + token_lr = args.tied_embed_lr if args.tie_embeddings else args.embed_lr + tok_params = [{"params": [base_model.tok_emb.weight], "lr": token_lr, "base_lr": token_lr}] + if base_model.bigram is not None: + tok_params.append({"params": [base_model.bigram.embed.weight], "lr": token_lr, "base_lr": token_lr}) + if base_model.bigram.proj is not None: + matrix_params.append(base_model.bigram.proj.weight) + if base_model.ve_shared is not None: + tok_params.append({"params": [base_model.ve_shared.embed.weight], "lr": token_lr, "base_lr": token_lr}) + if base_model.ve_shared.proj is not None: + matrix_params.append(base_model.ve_shared.proj.weight) + scalar_params.append(base_model.ve_shared.scale) + for s in base_model.ve_layer_scales: + scalar_params.append(s) + optimizer_tok = torch.optim.AdamW( + tok_params, + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + weight_decay=args.adam_wd, + fused=True, + ) + optimizer_muon = Muon( + matrix_params, + lr=args.matrix_lr, + momentum=args.muon_momentum, + backend_steps=args.muon_backend_steps, + weight_decay=args.muon_wd, + ) + for group in optimizer_muon.param_groups: + group["base_lr"] = args.matrix_lr + optimizer_scalar = torch.optim.AdamW( + [{"params": scalar_params, "lr": args.scalar_lr, "base_lr": args.scalar_lr}], + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + weight_decay=args.adam_wd, + fused=True, + ) + optimizers: list[torch.optim.Optimizer] = [optimizer_tok, optimizer_muon, optimizer_scalar] + if base_model.lm_head is not None: + optimizer_head = torch.optim.Adam( + [{"params": [base_model.lm_head.weight], "lr": args.head_lr, "base_lr": args.head_lr}], + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + fused=True, + ) + optimizers.insert(1, optimizer_head) + n_params = sum(p.numel() for p in base_model.parameters()) + f1_corr_params = 0 + if base_model.f1_corr_in is not None and base_model.f1_corr_out is not None: + f1_corr_params = int(base_model.f1_corr_in.weight.numel() + base_model.f1_corr_out.weight.numel()) + est_corr_int6_bytes = 0 + if args.f1_corr_rank > 0: + # int8 payload stores int6 values + per-row fp16 scales. + est_corr_int6_bytes = ( + args.f1_corr_rank * (args.model_dim + args.vocab_size) + + 2 * (args.f1_corr_rank + args.vocab_size) + ) + log0(f"model_params:{n_params}") + log0( + f"f1_corr:rank={args.f1_corr_rank} params={f1_corr_params} " + f"est_int6_bytes~{est_corr_int6_bytes}" + ) + log0(f"mlp_act:{args.mlp_act} mlp_leaky_slope:{args.mlp_leaky_slope}") + log0(f"XSA:last_{args.xsa_last_n} world_size:{world_size} grad_accum_steps:{grad_accum_steps}") + log0(f"num_heads:{args.num_heads} num_kv_heads:{args.num_kv_heads} embed_lr:{token_lr} matrix_lr:{args.matrix_lr}") + log0( + f"train_batch_tokens:{args.train_batch_tokens} train_seq_len:{args.train_seq_len} " + f"iterations:{args.iterations} warmup_steps:{args.warmup_steps} " + f"max_wallclock_seconds:{args.max_wallclock_seconds:.3f}" + ) + log0(f"compile:enabled={int(args.compile_enabled)} fullgraph={int(args.compile_fullgraph)}") + log0(f"seed:{args.seed}") + if args.ngram_eval_order >= 2: + log0( + f"ngram_eval:order={args.ngram_eval_order} alpha={args.ngram_eval_alpha} " + f"min_count={args.ngram_eval_min_count} buckets={args.ngram_eval_buckets}" + ) + train_loader = DistributedTokenLoader(args.train_files, rank, world_size, device) + def zero_grad_all() -> None: + for opt in optimizers: + opt.zero_grad(set_to_none=True) + max_wallclock_ms = 1000.0 * args.max_wallclock_seconds if args.max_wallclock_seconds > 0 else None + def lr_mul(step: int, elapsed_ms: float) -> float: + if args.warmdown_iters <= 0: + return 1.0 + if max_wallclock_ms is None: + warmdown_start = max(args.iterations - args.warmdown_iters, 0) + return max((args.iterations - step) / max(args.warmdown_iters, 1), 0.0) if warmdown_start <= step < args.iterations else 1.0 + step_ms = elapsed_ms / max(step, 1) + warmdown_ms = args.warmdown_iters * step_ms + remaining_ms = max(max_wallclock_ms - elapsed_ms, 0.0) + return remaining_ms / max(warmdown_ms, 1e-9) if remaining_ms <= warmdown_ms else 1.0 + if args.warmup_steps > 0: + initial_model_state = {name: tensor.detach().cpu().clone() for name, tensor in base_model.state_dict().items()} + initial_optimizer_states = [copy.deepcopy(opt.state_dict()) for opt in optimizers] + model.train() + for warmup_step in range(args.warmup_steps): + zero_grad_all() + for micro_step in range(grad_accum_steps): + if distributed: + model.require_backward_grad_sync = micro_step == grad_accum_steps - 1 + x, y = train_loader.next_batch(args.train_batch_tokens, args.train_seq_len, grad_accum_steps) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + warmup_loss = model(x, y) + (warmup_loss * grad_scale).backward() + for opt in optimizers: + opt.step() + zero_grad_all() + if args.warmup_steps <= 20 or (warmup_step + 1) % 10 == 0 or warmup_step + 1 == args.warmup_steps: + log0(f"warmup_step:{warmup_step + 1}/{args.warmup_steps}") + base_model.load_state_dict(initial_model_state, strict=True) + for opt, state in zip(optimizers, initial_optimizer_states, strict=True): + opt.load_state_dict(state) + zero_grad_all() + if distributed: + model.require_backward_grad_sync = True + train_loader = DistributedTokenLoader(args.train_files, rank, world_size, device) + swa_state: dict[str, Tensor] | None = None + swa_count = 0 + ema_state = {name: t.detach().float().clone() for name, t in base_model.state_dict().items()} + ema_decay = 0.997 + training_time_ms = 0.0 + stop_after_step: int | None = None + torch.cuda.synchronize() + t0 = time.perf_counter() + step = 0 + while True: + last_step = step == args.iterations or (stop_after_step is not None and step >= stop_after_step) + should_validate = last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0) + if should_validate: + torch.cuda.synchronize() + training_time_ms += 1000.0 * (time.perf_counter() - t0) + val_loss, val_bpb = eval_val( + args, + model, + rank, + world_size, + device, + grad_accum_steps, + val_tokens, + base_bytes_lut, + has_leading_space_lut, + is_boundary_token_lut, + ) + log0( + f"step:{step}/{args.iterations} val_loss:{val_loss:.4f} val_bpb:{val_bpb:.4f} " + f"train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms / max(step, 1):.2f}ms" + ) + torch.cuda.synchronize() + t0 = time.perf_counter() + if last_step: + if stop_after_step is not None and step < args.iterations: + log0( + f"stopping_early: wallclock_cap train_time:{training_time_ms:.0f}ms " + f"step:{step}/{args.iterations}" + ) + break + elapsed_ms = training_time_ms + 1000.0 * (time.perf_counter() - t0) + scale = lr_mul(step, elapsed_ms) + if args.late_qat_threshold > 0 and scale < args.late_qat_threshold and not CastedLinear._qat_enabled: + CastedLinear._qat_enabled = True + log0(f"late_qat:enabled step:{step} scale:{scale:.4f}") + zero_grad_all() + train_loss = torch.zeros((), device=device) + for micro_step in range(grad_accum_steps): + if distributed: + model.require_backward_grad_sync = micro_step == grad_accum_steps - 1 + x, y = train_loader.next_batch(args.train_batch_tokens, args.train_seq_len, grad_accum_steps) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + loss = model(x, y) + train_loss += loss.detach() + loss.backward() + if base_model._ngram_tracker is not None: + base_model._ngram_tracker.update(x, y) + train_loss /= grad_accum_steps + frac = min(step / args.muon_momentum_warmup_steps, 1.0) if args.muon_momentum_warmup_steps > 0 else 1.0 + muon_momentum = (1 - frac) * args.muon_momentum_warmup_start + frac * args.muon_momentum + for group in optimizer_muon.param_groups: + group["momentum"] = muon_momentum + for opt in optimizers: + for group in opt.param_groups: + group["lr"] = group["base_lr"] * scale + if args.grad_clip_norm > 0: + torch.nn.utils.clip_grad_norm_(base_model.parameters(), args.grad_clip_norm) + for opt in optimizers: + opt.step() + zero_grad_all() + # EMA update + with torch.no_grad(): + for name, t in base_model.state_dict().items(): + ema_state[name].mul_(ema_decay).add_(t.detach().float(), alpha=1.0 - ema_decay) + step += 1 + approx_training_time_ms = training_time_ms + 1000.0 * (time.perf_counter() - t0) + if args.swa_enabled and scale < 0.2 and step % args.swa_every == 0: + if swa_state is None: + swa_state = {name: t.detach().cpu().clone() for name, t in base_model.state_dict().items()} + swa_count = 1 + log0(f"swa:start step:{step}") + else: + for name, t in base_model.state_dict().items(): + swa_state[name] += t.detach().cpu() + swa_count += 1 + should_log_train = ( + args.train_log_every > 0 + and (step <= 10 or step % args.train_log_every == 0 or stop_after_step is not None) + ) + if should_log_train: + log0( + f"step:{step}/{args.iterations} train_loss:{train_loss.item():.4f} " + f"train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms / step:.2f}ms" + ) + reached_cap = max_wallclock_ms is not None and approx_training_time_ms >= max_wallclock_ms + if distributed and max_wallclock_ms is not None: + reached_cap_tensor = torch.tensor(int(reached_cap), device=device) + dist.all_reduce(reached_cap_tensor, op=dist.ReduceOp.MAX) + reached_cap = bool(reached_cap_tensor.item()) + if stop_after_step is None and reached_cap: + stop_after_step = step + log0( + f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB " + f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB" + ) + # GPTQ calibration: collect Hessians from training data DURING training phase + # (must happen before training ends to comply with eval-time data access rules) + log0("gptq:calibrating with training data...") + t_gptq = time.perf_counter() + gptq_hessians = gptq_calibrate(base_model, args.train_files, device, n_samples=256, seq_len=args.train_seq_len) + log0(f"gptq:calibrated {len(gptq_hessians)} layers in {time.perf_counter()-t_gptq:.1f}s") + if args.distill_enabled and args.distill_steps > 0: + log0( + f"distill:start steps:{args.distill_steps} lr_factor:{args.distill_lr_factor} " + f"temp:{args.distill_temperature} alpha:{args.distill_alpha} kl_clip:{args.distill_kl_clip}" + ) + current_state = base_model.state_dict() + teacher_state = {name: t.to(dtype=current_state[name].dtype) for name, t in ema_state.items()} + teacher_model = GPT( + vocab_size=args.vocab_size, num_layers=args.num_layers, model_dim=args.model_dim, + num_heads=args.num_heads, num_kv_heads=args.num_kv_heads, mlp_mult=args.mlp_mult, + tie_embeddings=args.tie_embeddings, tied_embed_init_std=args.tied_embed_init_std, + logit_softcap=args.logit_softcap, rope_base=args.rope_base, qk_gain_init=args.qk_gain_init, + mtp_num_heads=args.mtp_num_heads, mtp_loss_weight=args.mtp_loss_weight, + bigram_vocab_size=args.bigram_vocab_size, bigram_dim=args.bigram_dim, + xsa_last_n=args.xsa_last_n, rope_dims=args.rope_dims, ln_scale=args.ln_scale, dtg=args.dtg_enabled, + ve_enabled=args.ve_enabled, ve_dim=args.ve_dim, ve_layers=args.ve_layers, + mlp_act=args.mlp_act, mlp_leaky_slope=args.mlp_leaky_slope, + f1_corr_rank=args.f1_corr_rank, f1_corr_scale_init=args.f1_corr_scale_init, + ).to(device).bfloat16() + for m in teacher_model.modules(): + if isinstance(m, CastedLinear): + m.float() + restore_low_dim_params_to_fp32(teacher_model) + teacher_model.load_state_dict(teacher_state, strict=True) + teacher_model.eval() + for p in teacher_model.parameters(): + p.requires_grad_(False) + compiled_teacher_logits = maybe_torch_compile(teacher_model.forward_logits, args) + model.train() + T = args.distill_temperature + alpha = args.distill_alpha + for d_step in range(args.distill_steps): + zero_grad_all() + for opt in optimizers: + for group in opt.param_groups: + group["lr"] = group["base_lr"] * args.distill_lr_factor + x, y = train_loader.next_batch(args.train_batch_tokens, args.train_seq_len, grad_accum_steps) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + student_logits = base_model.forward_logits(x) + with torch.no_grad(): + teacher_logits = compiled_teacher_logits(x) + student_log_probs = F.log_softmax(student_logits.float() / T, dim=-1) + teacher_probs = F.softmax(teacher_logits.float() / T, dim=-1) + token_kl = F.kl_div(student_log_probs, teacher_probs, reduction="none").sum(dim=-1) + kl_loss = token_kl.mean() * (T * T) + if args.distill_kl_clip > 0: + kl_loss = torch.clamp(kl_loss, max=args.distill_kl_clip) + ce_loss = F.cross_entropy( + student_logits.reshape(-1, student_logits.size(-1)).float(), + y.reshape(-1), + reduction="mean", + ) + loss = alpha * kl_loss + (1.0 - alpha) * ce_loss + (loss * grad_scale).backward() + if world_size > 1: + for p in base_model.parameters(): + if p.grad is not None: + dist.all_reduce(p.grad, op=dist.ReduceOp.AVG) + if args.grad_clip_norm > 0: + torch.nn.utils.clip_grad_norm_(base_model.parameters(), args.grad_clip_norm) + for opt in optimizers: + opt.step() + zero_grad_all() + with torch.no_grad(): + for name, t in base_model.state_dict().items(): + ema_state[name].mul_(ema_decay).add_(t.detach().float(), alpha=1.0 - ema_decay) + if (d_step + 1) % 8 == 0 or d_step == 0: + log0( + f"distill:step:{d_step + 1}/{args.distill_steps} " + f"kl:{kl_loss.item():.4f} ce:{ce_loss.item():.4f} total:{loss.item():.4f}" + ) + del teacher_model, compiled_teacher_logits + torch.cuda.empty_cache() + log0("distill:done") + # Apply EMA weights (better than SWA alone per PR#401) + log0("ema:applying EMA weights") + current_state = base_model.state_dict() + avg_state = {name: t.to(dtype=current_state[name].dtype) for name, t in ema_state.items()} + base_model.load_state_dict(avg_state, strict=True) + torch.cuda.synchronize() + t_diag = time.perf_counter() + diag_val_loss, diag_val_bpb = eval_val( + args, compiled_model, rank, world_size, device, grad_accum_steps, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + ) + torch.cuda.synchronize() + log0( + f"DIAGNOSTIC post_ema val_loss:{diag_val_loss:.4f} val_bpb:{diag_val_bpb:.4f} " + f"eval_time:{1000.0 * (time.perf_counter() - t_diag):.0f}ms" + ) + full_state_dict = base_model.state_dict() + export_sd = {k: v for k, v in full_state_dict.items() if "mtp_heads" not in k} + excluded_mtp = sum(int(t.numel()) for k, t in full_state_dict.items() if "mtp_heads" in k) + if excluded_mtp > 0: + log0(f"export_excluding_mtp_params:{excluded_mtp}") + if master_process: + torch.save(export_sd, "final_model.pt") + model_bytes = os.path.getsize("final_model.pt") + code_bytes = len(code.encode("utf-8")) + log0(f"Serialized model: {model_bytes} bytes") + log0(f"Code size: {code_bytes} bytes") + sd_cpu = {k: v.detach().cpu() for k, v in export_sd.items()} + # GPTQ quantization using Hessians collected during training phase (no training data access here) + quant_result, quant_meta = mixed_quantize_int6_gptq(sd_cpu, {"mlp", "attn", "aux"}, gptq_hessians) + quant_buf = io.BytesIO() + torch.save({"w": quant_result, "m": quant_meta}, quant_buf) + quant_raw = quant_buf.getvalue() + quant_blob = zstandard.ZstdCompressor(level=22).compress(quant_raw) if _COMPRESSOR == "zstd" else zlib.compress(quant_raw, 9) + if master_process: + with open("final_model.int6.ptz", "wb") as f: + f.write(quant_blob) + quant_file_bytes = len(quant_blob) + code_bytes = len(code.encode("utf-8")) + log0(f"Serialized model int6+{_COMPRESSOR}: {quant_file_bytes} bytes") + log0(f"Total submission size int6+{_COMPRESSOR}: {quant_file_bytes + code_bytes} bytes") + log0(f"Total submission size int8+zlib: {quant_file_bytes + code_bytes} bytes") + if distributed: + dist.barrier() + with open("final_model.int6.ptz", "rb") as f: + quant_blob_disk = f.read() + quant_state = torch.load( + io.BytesIO(zstandard.ZstdDecompressor().decompress(quant_blob_disk) if _COMPRESSOR == "zstd" else zlib.decompress(quant_blob_disk)), + map_location="cpu", + ) + deq_state = dequantize_mixed_int6(quant_state["w"], quant_state["m"], sd_cpu) + eval_model = GPT( + vocab_size=args.vocab_size, num_layers=args.num_layers, model_dim=args.model_dim, + num_heads=args.num_heads, num_kv_heads=args.num_kv_heads, mlp_mult=args.mlp_mult, + tie_embeddings=args.tie_embeddings, tied_embed_init_std=args.tied_embed_init_std, + logit_softcap=args.logit_softcap, rope_base=args.rope_base, qk_gain_init=args.qk_gain_init, + mtp_num_heads=0, mtp_loss_weight=0.0, + bigram_vocab_size=args.bigram_vocab_size, bigram_dim=args.bigram_dim, + xsa_last_n=args.xsa_last_n, # must match training model + rope_dims=args.rope_dims, ln_scale=args.ln_scale, dtg=args.dtg_enabled, + ve_enabled=args.ve_enabled, ve_dim=args.ve_dim, ve_layers=args.ve_layers, + mlp_act=args.mlp_act, mlp_leaky_slope=args.mlp_leaky_slope, + f1_corr_rank=args.f1_corr_rank, f1_corr_scale_init=args.f1_corr_scale_init, + ).to(device).bfloat16() + for m in eval_model.modules(): + if isinstance(m, CastedLinear): + m.float() + restore_low_dim_params_to_fp32(eval_model) + eval_model.load_state_dict(deq_state, strict=True) + compiled_eval = maybe_torch_compile(eval_model, args) + torch.cuda.synchronize() + t_qeval = time.perf_counter() + q_val_loss, q_val_bpb = eval_val( + args, compiled_eval, rank, world_size, device, grad_accum_steps, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + eval_seq_len=effective_eval_seq_len, + ) + torch.cuda.synchronize() + log0( + f"final_int6_roundtrip val_loss:{q_val_loss:.4f} val_bpb:{q_val_bpb:.4f} " + f"eval_time:{1000.0 * (time.perf_counter() - t_qeval):.0f}ms" + ) + log0(f"final_int6_roundtrip_exact val_loss:{q_val_loss:.8f} val_bpb:{q_val_bpb:.8f}") + sw_seq_len = effective_eval_seq_len + if args.eval_stride > 0 and args.eval_stride < sw_seq_len: + torch.cuda.synchronize() + t_slide = time.perf_counter() + sw_val_loss, sw_val_bpb = eval_val_sliding( + args, eval_model, rank, world_size, device, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + stride=args.eval_stride, + eval_seq_len=sw_seq_len, + ) + torch.cuda.synchronize() + log0( + f"final_int6_sliding_window val_loss:{sw_val_loss:.4f} val_bpb:{sw_val_bpb:.4f} " + f"stride:{args.eval_stride} eval_time:{1000.0 * (time.perf_counter() - t_slide):.0f}ms" + ) + log0(f"final_int6_sliding_window_exact val_loss:{sw_val_loss:.8f} val_bpb:{sw_val_bpb:.8f}") + log0(f"final_int8_zlib_roundtrip_exact val_loss:{sw_val_loss:.8f} val_bpb:{sw_val_bpb:.8f}") + if args.ngram_eval_order >= 2: + if distributed: + dist.barrier() + torch.cuda.synchronize() + t_ng = time.perf_counter() + ng_loss, ng_bpb, ng_coverage = eval_val_sliding_hashed_ngram( + args, + eval_model, + rank, + world_size, + device, + val_tokens, + base_bytes_lut, + has_leading_space_lut, + is_boundary_token_lut, + stride=args.eval_stride, + order=args.ngram_eval_order, + alpha=args.ngram_eval_alpha, + min_count=args.ngram_eval_min_count, + buckets=args.ngram_eval_buckets, + max_seconds=args.ngram_eval_max_seconds, + eval_seq_len=sw_seq_len, + ) + if rank == 0: + torch.cuda.synchronize() + ng_eval_ms = 1000.0 * (time.perf_counter() - t_ng) + if ng_coverage >= 0.999999: + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order} val_loss:{ng_loss:.4f} " + f"val_bpb:{ng_bpb:.4f} eval_time:{ng_eval_ms:.0f}ms" + ) + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order}_exact " + f"val_loss:{ng_loss:.8f} val_bpb:{ng_bpb:.8f}" + ) + else: + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order}_partial val_loss:{ng_loss:.4f} " + f"val_bpb:{ng_bpb:.4f} coverage:{ng_coverage:.4f} eval_time:{ng_eval_ms:.0f}ms" + ) + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order}_partial_exact " + f"val_loss:{ng_loss:.8f} val_bpb:{ng_bpb:.8f} coverage:{ng_coverage:.8f}" + ) + if distributed: + dist.barrier() + if distributed: + dist.destroy_process_group() +if __name__ == "__main__": + main() diff --git a/experiments/B_wing/bwing_III/train_seed1337.log b/experiments/B_wing/bwing_III/train_seed1337.log new file mode 100644 index 000000000..0b4a07a5e --- /dev/null +++ b/experiments/B_wing/bwing_III/train_seed1337.log @@ -0,0 +1,104 @@ +============================================ + B-WING FULL PORT — #809 N-gram Techniques + Seed: 1337 + Fixed order mults (no cubric) + Complementary training: alpha=0.5 + Eval alpha: 0.05-0.60 clip=0.95 + entropy shift | Orders: 2-9 +============================================ +W0326 05:38:58.867000 1640 torch/distributed/run.py:803] +W0326 05:38:58.867000 1640 torch/distributed/run.py:803] ***************************************** +W0326 05:38:58.867000 1640 torch/distributed/run.py:803] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0326 05:38:58.867000 1640 torch/distributed/run.py:803] ***************************************** +logs/b93ddcc1-5257-48ca-9542-081180067ac8.txt +val_bpb:enabled tokenizer_kind=sentencepiece tokenizer_path=./data/tokenizers/fineweb_1024_bpe.model +train_loader:dataset:fineweb10B_sp1024 train_shards:80 +val_loader:shards pattern=./data/datasets/fineweb10B_sp1024/fineweb_val_*.bin tokens:62021632 +complementary_training:alpha=0.5 +model_params:26928220 +f1_corr:rank=0 params=0 est_int6_bytes~0 +mlp_act:leaky_relu_sq mlp_leaky_slope:0.5 +XSA:last_4 world_size:8 grad_accum_steps:1 +num_heads:8 num_kv_heads:4 embed_lr:0.035 matrix_lr:0.025 +train_batch_tokens:786432 train_seq_len:2048 iterations:20000 warmup_steps:20 max_wallclock_seconds:600.000 +compile:enabled=1 fullgraph=0 +seed:1337 +ngram_eval:order=9 alpha=0.3 min_count=2 buckets=8388608 +warmup_step:1/20 +warmup_step:2/20 +warmup_step:3/20 +warmup_step:4/20 +warmup_step:5/20 +warmup_step:6/20 +warmup_step:7/20 +warmup_step:8/20 +warmup_step:9/20 +warmup_step:10/20 +warmup_step:11/20 +warmup_step:12/20 +warmup_step:13/20 +warmup_step:14/20 +warmup_step:15/20 +warmup_step:16/20 +warmup_step:17/20 +warmup_step:18/20 +warmup_step:19/20 +warmup_step:20/20 +step:0/20000 val_loss:6.9317 val_bpb:4.1054 train_time:0ms step_avg:0.02ms +step:1/20000 train_loss:6.9343 train_time:149ms step_avg:149.31ms +step:2/20000 train_loss:8.6212 train_time:232ms step_avg:115.92ms +step:3/20000 train_loss:7.8208 train_time:318ms step_avg:105.93ms +step:4/20000 train_loss:7.1066 train_time:404ms step_avg:100.89ms +step:5/20000 train_loss:6.8530 train_time:489ms step_avg:97.86ms +step:6/20000 train_loss:6.7961 train_time:575ms step_avg:95.83ms +step:7/20000 train_loss:6.6784 train_time:660ms step_avg:94.31ms +step:8/20000 train_loss:6.5596 train_time:746ms step_avg:93.25ms +step:9/20000 train_loss:6.2554 train_time:833ms step_avg:92.52ms +step:10/20000 train_loss:5.9365 train_time:918ms step_avg:91.82ms +step:1000/20000 train_loss:2.2352 train_time:87900ms step_avg:87.90ms +step:2000/20000 train_loss:2.0277 train_time:175924ms step_avg:87.96ms +step:3000/20000 train_loss:2.1245 train_time:263953ms step_avg:87.98ms +step:4000/20000 train_loss:1.9353 train_time:351962ms step_avg:87.99ms +step:5000/20000 train_loss:2.0680 train_time:439941ms step_avg:87.99ms +late_qat:enabled step:5070 scale:0.4999 +step:6000/20000 train_loss:1.9024 train_time:527953ms step_avg:87.99ms +swa:start step:6200 +step:6817/20000 val_loss:1.9221 val_bpb:1.1384 train_time:600020ms step_avg:88.02ms +stopping_early: wallclock_cap train_time:600020ms step:6817/20000 +peak memory allocated: 20677 MiB reserved: 20718 MiB +gptq:calibrating with training data... +gptq:calibrated 68 layers in 3.7s +ema:applying EMA weights +DIAGNOSTIC post_ema val_loss:1.9205 val_bpb:1.1374 eval_time:2027ms +Serialized model: 106047497 bytes +Code size: 106155 bytes +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +Serialized model int6+zstd: 15991916 bytes +Total submission size int6+zstd: 16098071 bytes +Total submission size int8+zlib: 16098071 bytes +final_int6_roundtrip val_loss:1.9301 val_bpb:1.1431 eval_time:37099ms +final_int6_roundtrip_exact val_loss:1.93013868 val_bpb:1.14313685 +final_int6_sliding_window val_loss:1.8901 val_bpb:1.1194 stride:64 eval_time:96435ms +final_int6_sliding_window_exact val_loss:1.89013592 val_bpb:1.11944792 +final_int8_zlib_roundtrip_exact val_loss:1.89013592 val_bpb:1.11944792 +ngram_eval:chunks=60 chunk_tokens=1048576 windows=969088 shared_tables=True +ngram_eval:chunk [1/60] bpb=1.130307 t=15s +ngram_eval:chunk [2/60] bpb=1.211256 t=18s +ngram_eval:chunk [3/60] bpb=1.235629 t=21s +ngram_eval:chunk [11/60] bpb=1.149570 t=43s +ngram_eval:chunk [21/60] bpb=0.876947 t=70s +ngram_eval:chunk [31/60] bpb=0.694595 t=96s +ngram_eval:chunk [41/60] bpb=0.575851 t=121s +ngram_eval:chunk [51/60] bpb=0.497954 t=146s +ngram_eval:chunk [60/60] bpb=0.450898 t=178s +final_int6_sliding_window_ngram9 val_loss:0.7618 val_bpb:0.4512 eval_time:178896ms +final_int6_sliding_window_ngram9_exact val_loss:0.76181150 val_bpb:0.45118888 +============================================ + DONE +============================================ diff --git a/experiments/B_wing/bwing_IV/run.sh b/experiments/B_wing/bwing_IV/run.sh new file mode 100755 index 000000000..d4456844e --- /dev/null +++ b/experiments/B_wing/bwing_IV/run.sh @@ -0,0 +1,55 @@ +#!/bin/bash +set -euo pipefail +# B-WING IV: 9-Prime Hash Fix (was 7 — orders 8-9 had collisions) +# Single change from SOTA bwing_full_port: 2 extra hash primes (283721, 347237) + +SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd -- "${SCRIPT_DIR}/../../.." && pwd)" +cd "${REPO_ROOT}" +export PYTHONPATH="${REPO_ROOT}/flash-attention/hopper:${PYTHONPATH:-}" + +SEED="${SEED:-1337}" +NPROC_PER_NODE="${NPROC_PER_NODE:-8}" + +echo "============================================" +echo " B-WING IV — 9-Prime Hash Fix" +echo " Seed: ${SEED}" +echo " Fixed order mults + entropy shift (no cubric)" +echo " CHANGE: 9 hash primes (was 7 — fixes order 8-9 collisions)" +echo "============================================" + +SEED="$SEED" \ +F1_CORR_RANK=0 \ +DISTILL_ENABLED=0 \ +MLP_ACT=leaky_relu_sq \ +MLP_LEAKY_SLOPE=0.5 \ +XSA_LAST_N=4 \ +BIGRAM_VOCAB_SIZE=1536 \ +TTT_EVAL_ENABLED=0 \ +ROPE_DIMS=24 \ +VAL_LOSS_EVERY=20000 \ +TRAIN_LOG_EVERY=1000 \ +SWA_EVERY=100 \ +COMPLEMENT_ALPHA=0.5 \ +NGRAM_EVAL_ORDER=9 \ +NGRAM_EVAL_MIN_ORDER=2 \ +NGRAM_EVAL_ADAPTIVE=1 \ +NGRAM_EVAL_ALPHA=0.30 \ +NGRAM_EVAL_ALPHA_MIN=0.05 \ +NGRAM_EVAL_ALPHA_MAX=0.60 \ +NGRAM_EVAL_ENTROPY_CENTER=3.0 \ +NGRAM_EVAL_ENTROPY_SCALE=2.0 \ +NGRAM_EVAL_MIN_COUNT=2 \ +NGRAM_EVAL_BUCKETS=8388608 \ +NGRAM_EVAL_MAX_SECONDS=300 \ +CUBRIC_CADENCE=0 \ +NGRAM_ENTROPY_SHIFT=1 \ +NGRAM_ORDER_MULTS="0.3,0.3,0.97,2.0,2.0,2.0,2.0,2.0" \ +COMPILE_FULLGRAPH=0 \ +torchrun --standalone --nproc_per_node="${NPROC_PER_NODE}" \ + "${SCRIPT_DIR}/train_gpt.py" \ + 2>&1 | tee "logs/bwing_IV_s${SEED}_$(date +%Y%m%d_%H%M%S).log" + +echo "============================================" +echo " DONE" +echo "============================================" diff --git a/experiments/B_wing/bwing_IV/train_gpt.py b/experiments/B_wing/bwing_IV/train_gpt.py new file mode 100644 index 000000000..b29643b7d --- /dev/null +++ b/experiments/B_wing/bwing_IV/train_gpt.py @@ -0,0 +1,2139 @@ +from __future__ import annotations +import copy +import glob +import io +import math +import os +import random +import subprocess +import sys +import time +import uuid +import zlib +from pathlib import Path +try: + import zstandard + _COMPRESSOR = "zstd" +except ImportError: + _COMPRESSOR = "zlib" +import numpy as np +import sentencepiece as spm +import torch +import torch.distributed as dist +import torch.nn.functional as F +from torch import Tensor, nn +from torch.nn.parallel import DistributedDataParallel as DDP +try: + from flash_attn_interface import flash_attn_func as flash_attn_3_func +except ImportError: + def flash_attn_3_func(q, k, v, causal=False): + # q: (B, T, Hq, D), k/v: (B, T, Hkv, D) — expand KV for GQA + q2 = q.transpose(1, 2) # (B, Hq, T, D) + k2 = k.transpose(1, 2) # (B, Hkv, T, D) + v2 = v.transpose(1, 2) + if k2.size(1) != q2.size(1): + rep = q2.size(1) // k2.size(1) + k2 = k2.repeat_interleave(rep, dim=1) + v2 = v2.repeat_interleave(rep, dim=1) + out = torch.nn.functional.scaled_dot_product_attention(q2, k2, v2, is_causal=causal) + return out.transpose(1, 2) +class Hyperparameters: + data_path = os.environ.get("DATA_PATH", "./data/datasets/fineweb10B_sp1024") + train_files = os.path.join(data_path, "fineweb_train_*.bin") + val_files = os.path.join(data_path, "fineweb_val_*.bin") + tokenizer_path = os.environ.get("TOKENIZER_PATH", "./data/tokenizers/fineweb_1024_bpe.model") + run_id = os.environ.get("RUN_ID", str(uuid.uuid4())) + seed = int(os.environ.get("SEED", 1337)) + val_batch_size = int(os.environ.get("VAL_BATCH_SIZE", 524_288)) + val_loss_every = int(os.environ.get("VAL_LOSS_EVERY", 4000)) + train_log_every = int(os.environ.get("TRAIN_LOG_EVERY", 500)) + iterations = int(os.environ.get("ITERATIONS", 20000)) + warmdown_iters = int(os.environ.get("WARMDOWN_ITERS", 3500)) + warmup_steps = int(os.environ.get("WARMUP_STEPS", 20)) + train_batch_tokens = int(os.environ.get("TRAIN_BATCH_TOKENS", 786_432)) + train_seq_len = int(os.environ.get("TRAIN_SEQ_LEN", 2048)) + eval_seq_len = int(os.environ.get("EVAL_SEQ_LEN", 2048)) + max_wallclock_seconds = float(os.environ.get("MAX_WALLCLOCK_SECONDS", 600.0)) + qk_gain_init = float(os.environ.get("QK_GAIN_INIT", 1.5)) + vocab_size = int(os.environ.get("VOCAB_SIZE", 1024)) + num_layers = int(os.environ.get("NUM_LAYERS", 11)) + num_kv_heads = int(os.environ.get("NUM_KV_HEADS", 4)) + model_dim = int(os.environ.get("MODEL_DIM", 512)) + num_heads = int(os.environ.get("NUM_HEADS", 8)) + mlp_mult = float(os.environ.get("MLP_MULT", 3.0)) + mlp_act = os.environ.get("MLP_ACT", "relu_sq").lower() + mlp_leaky_slope = float(os.environ.get("MLP_LEAKY_SLOPE", 0.5)) + tie_embeddings = bool(int(os.environ.get("TIE_EMBEDDINGS", "1"))) + rope_base = float(os.environ.get("ROPE_BASE", 10000.0)) + logit_softcap = float(os.environ.get("LOGIT_SOFTCAP", 30.0)) + embed_lr = float(os.environ.get("EMBED_LR", 0.6)) + head_lr = float(os.environ.get("HEAD_LR", 0.008)) + tied_embed_lr = float(os.environ.get("TIED_EMBED_LR", 0.035)) + tied_embed_init_std = float(os.environ.get("TIED_EMBED_INIT_STD", 0.005)) + matrix_lr = float(os.environ.get("MATRIX_LR", 0.025)) + scalar_lr = float(os.environ.get("SCALAR_LR", 0.025)) + muon_momentum = float(os.environ.get("MUON_MOMENTUM", 0.99)) + muon_backend_steps = int(os.environ.get("MUON_BACKEND_STEPS", 5)) + muon_momentum_warmup_start = float(os.environ.get("MUON_MOMENTUM_WARMUP_START", 0.92)) + muon_momentum_warmup_steps = int(os.environ.get("MUON_MOMENTUM_WARMUP_STEPS", 1500)) + beta1 = float(os.environ.get("BETA1", 0.9)) + beta2 = float(os.environ.get("BETA2", 0.95)) + adam_eps = float(os.environ.get("ADAM_EPS", 1e-8)) + grad_clip_norm = float(os.environ.get("GRAD_CLIP_NORM", 0.3)) + eval_stride = int(os.environ.get("EVAL_STRIDE", 64)) + mtp_num_heads = int(os.environ.get("MTP_NUM_HEADS", 0)) + mtp_loss_weight = float(os.environ.get("MTP_LOSS_WEIGHT", 0.2)) + muon_beta2 = float(os.environ.get("MUON_BETA2", 0.95)) + swa_enabled = bool(int(os.environ.get("SWA_ENABLED", "1"))) + swa_every = int(os.environ.get("SWA_EVERY", 50)) # tighter: collect more recent checkpoints + muon_wd = float(os.environ.get("MUON_WD", 0.04)) + adam_wd = float(os.environ.get("ADAM_WD", 0.04)) + qat_enabled = bool(int(os.environ.get("QAT_ENABLED", "0"))) + bigram_vocab_size = int(os.environ.get("BIGRAM_VOCAB_SIZE", 2048)) + bigram_dim = int(os.environ.get("BIGRAM_DIM", 128)) + xsa_last_n = int(os.environ.get("XSA_LAST_N", 11)) # XSA on ALL 11 layers + rope_dims = int(os.environ.get("ROPE_DIMS", 16)) + ln_scale = bool(int(os.environ.get("LN_SCALE", "1"))) + dtg_enabled = bool(int(os.environ.get("DTG_ENABLED", "0"))) + late_qat_threshold = float(os.environ.get("LATE_QAT_THRESHOLD", 0.5)) + ve_enabled = bool(int(os.environ.get("VE_ENABLED", "1"))) + ve_dim = int(os.environ.get("VE_DIM", 128)) + ve_layers = os.environ.get("VE_LAYERS", "9,10") + # F1 capacity add-on: low-rank correction head (active at inference). + # Approx extra params ~= rank * (model_dim + vocab_size). + f1_corr_rank = int(os.environ.get("F1_CORR_RANK", 0)) + f1_corr_scale_init = float(os.environ.get("F1_CORR_SCALE_INIT", 0.10)) + # Post-train self-distillation: EMA teacher -> student. + distill_enabled = bool(int(os.environ.get("DISTILL_ENABLED", "0"))) + distill_steps = int(os.environ.get("DISTILL_STEPS", 24)) + distill_lr_factor = float(os.environ.get("DISTILL_LR_FACTOR", 0.02)) + distill_temperature = float(os.environ.get("DISTILL_TEMPERATURE", 1.5)) + distill_alpha = float(os.environ.get("DISTILL_ALPHA", 0.60)) + distill_kl_clip = float(os.environ.get("DISTILL_KL_CLIP", 10.0)) + # Optional legal score-first hashed n-gram interpolation at eval time. + # Multi-order backoff (2..max_order) with entropy-adaptive alpha. + # Alpha depends only on model entropy (no target/label access). + ngram_eval_order = int(os.environ.get("NGRAM_EVAL_ORDER", 0)) # 0=off, max order for backoff + ngram_eval_min_order = int(os.environ.get("NGRAM_EVAL_MIN_ORDER", 2)) # min order for backoff + ngram_eval_alpha = float(os.environ.get("NGRAM_EVAL_ALPHA", 0.30)) # base alpha (or fixed if adaptive off) + ngram_eval_adaptive = bool(int(os.environ.get("NGRAM_EVAL_ADAPTIVE", "1"))) # entropy-adaptive alpha + ngram_eval_alpha_min = float(os.environ.get("NGRAM_EVAL_ALPHA_MIN", 0.05)) # alpha floor (confident model) + ngram_eval_alpha_max = float(os.environ.get("NGRAM_EVAL_ALPHA_MAX", 0.60)) # alpha ceiling (uncertain model) + ngram_eval_entropy_center = float(os.environ.get("NGRAM_EVAL_ENTROPY_CENTER", 4.0)) # sigmoid center + ngram_eval_entropy_scale = float(os.environ.get("NGRAM_EVAL_ENTROPY_SCALE", 2.0)) # sigmoid steepness + ngram_eval_min_count = int(os.environ.get("NGRAM_EVAL_MIN_COUNT", 2)) + ngram_eval_buckets = int(os.environ.get("NGRAM_EVAL_BUCKETS", 4_194_304)) + ngram_eval_max_seconds = float(os.environ.get("NGRAM_EVAL_MAX_SECONDS", 0.0)) + ngram_entropy_shift = bool(int(os.environ.get("NGRAM_ENTROPY_SHIFT", "0"))) # per-order center shift + ngram_order_mults_str = os.environ.get("NGRAM_ORDER_MULTS", "") # fixed per-order multipliers (comma-sep) + cubric_cadence = int(os.environ.get("CUBRIC_CADENCE", 0)) + compile_enabled = bool(int(os.environ.get("COMPILE_ENABLED", "1"))) + compile_fullgraph = bool(int(os.environ.get("COMPILE_FULLGRAPH", "1"))) +def maybe_torch_compile(obj, args: Hyperparameters): + if not args.compile_enabled: + return obj + return torch.compile(obj, dynamic=False, fullgraph=args.compile_fullgraph) +class TrainNgramTracker: + """Complementary training: track bigram stats, downweight tokens n-grams can predict.""" + def __init__(self, vocab_size: int, device: torch.device, complement_alpha: float = 0.5): + self.V = vocab_size + self.alpha = complement_alpha + self.bi_counts = torch.zeros(vocab_size, vocab_size, device=device, dtype=torch.float32) + self.bi_totals = torch.zeros(vocab_size, device=device, dtype=torch.float32) + @torch.no_grad() + def update(self, x: Tensor, y: Tensor): + xf = x.reshape(-1) + yf = y.reshape(-1) + ones = torch.ones(xf.numel(), device=xf.device, dtype=torch.float32) + self.bi_counts.reshape(-1).scatter_add_(0, xf * self.V + yf, ones) + self.bi_totals.scatter_add_(0, xf, ones) + def get_weights(self, x: Tensor, y: Tensor) -> Tensor: + xf = x.reshape(-1) + yf = y.reshape(-1) + total = self.bi_totals[xf] + count = self.bi_counts.reshape(-1)[xf * self.V + yf] + ngram_prob = count / (total + 1) + return (1.0 - self.alpha * ngram_prob).clamp(min=0.1) +def zeropower_via_newtonschulz5(G: Tensor, steps: int = 10, eps: float = 1e-7) -> Tensor: + a, b, c = (3.4445, -4.7750, 2.0315) + X = G.bfloat16() + X /= X.norm() + eps + transposed = G.size(0) > G.size(1) + if transposed: + X = X.T + for _ in range(steps): + A = X @ X.T + B = b * A + c * A @ A + X = a * X + B @ X + return X.T if transposed else X +class Muon(torch.optim.Optimizer): + def __init__(self, params, lr: float, momentum: float, backend_steps: int, + nesterov: bool = True, weight_decay: float = 0.0): + super().__init__( + params, + dict(lr=lr, momentum=momentum, backend_steps=backend_steps, + nesterov=nesterov, weight_decay=weight_decay), + ) + @torch.no_grad() + def step(self, closure=None): + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + distributed = dist.is_available() and dist.is_initialized() + world_size = dist.get_world_size() if distributed else 1 + rank = dist.get_rank() if distributed else 0 + for group in self.param_groups: + params = group["params"] + if not params: + continue + lr = group["lr"] + momentum = group["momentum"] + backend_steps = group["backend_steps"] + nesterov = group["nesterov"] + total_params = sum(int(p.numel()) for p in params) + updates_flat = torch.zeros(total_params, device=params[0].device, dtype=torch.bfloat16) + curr = 0 + for i, p in enumerate(params): + if i % world_size == rank and p.grad is not None: + g = p.grad + state = self.state[p] + if "momentum_buffer" not in state: + state["momentum_buffer"] = torch.zeros_like(g) + buf = state["momentum_buffer"] + buf.mul_(momentum).add_(g) + if nesterov: + g = g.add(buf, alpha=momentum) + g = zeropower_via_newtonschulz5(g, steps=backend_steps) + g *= max(1, g.size(0) / g.size(1)) ** 0.5 + updates_flat[curr : curr + p.numel()] = g.reshape(-1) + curr += p.numel() + if distributed: + dist.all_reduce(updates_flat, op=dist.ReduceOp.SUM) + wd = group.get("weight_decay", 0.0) + curr = 0 + for p in params: + if wd > 0.0: + p.data.mul_(1.0 - lr * wd) + g = updates_flat[curr : curr + p.numel()].view_as(p).to(dtype=p.dtype) + p.add_(g, alpha=-lr) + curr += p.numel() + return loss +def build_sentencepiece_luts( + sp: spm.SentencePieceProcessor, vocab_size: int, device: torch.device +) -> tuple[Tensor, Tensor, Tensor]: + sp_vocab_size = int(sp.vocab_size()) + table_size = max(sp_vocab_size, vocab_size) + base_bytes_np = np.zeros((table_size,), dtype=np.int16) + has_leading_space_np = np.zeros((table_size,), dtype=np.bool_) + is_boundary_token_np = np.ones((table_size,), dtype=np.bool_) + for token_id in range(sp_vocab_size): + if sp.is_control(token_id) or sp.is_unknown(token_id) or sp.is_unused(token_id): + continue + is_boundary_token_np[token_id] = False + if sp.is_byte(token_id): + base_bytes_np[token_id] = 1 + continue + piece = sp.id_to_piece(token_id) + if piece.startswith("▁"): + has_leading_space_np[token_id] = True + piece = piece[1:] + base_bytes_np[token_id] = len(piece.encode("utf-8")) + return ( + torch.tensor(base_bytes_np, dtype=torch.int16, device=device), + torch.tensor(has_leading_space_np, dtype=torch.bool, device=device), + torch.tensor(is_boundary_token_np, dtype=torch.bool, device=device), + ) +def load_validation_tokens(pattern: str, seq_len: int) -> Tensor: + files = [Path(p) for p in sorted(glob.glob(pattern))] + if not files: + raise FileNotFoundError(f"No files found for pattern: {pattern}") + tokens = torch.cat([load_data_shard(file) for file in files]).contiguous() + usable = ((tokens.numel() - 1) // seq_len) * seq_len + if usable <= 0: + raise ValueError(f"Validation split is too short for TRAIN_SEQ_LEN={seq_len}") + return tokens[: usable + 1] +def eval_val( + args: Hyperparameters, + model: nn.Module, + rank: int, + world_size: int, + device: torch.device, + grad_accum_steps: int, + val_tokens: Tensor, + base_bytes_lut: Tensor, + has_leading_space_lut: Tensor, + is_boundary_token_lut: Tensor, + eval_seq_len: int | None = None, +) -> tuple[float, float]: + seq_len = eval_seq_len or args.train_seq_len + local_batch_tokens = args.val_batch_size // (world_size * grad_accum_steps) + if local_batch_tokens < seq_len: + raise ValueError( + "VAL_BATCH_SIZE must provide at least one sequence per rank; " + f"got VAL_BATCH_SIZE={args.val_batch_size}, WORLD_SIZE={world_size}, " + f"GRAD_ACCUM_STEPS={grad_accum_steps}, seq_len={seq_len}" + ) + local_batch_seqs = local_batch_tokens // seq_len + total_seqs = (val_tokens.numel() - 1) // seq_len + seq_start = (total_seqs * rank) // world_size + seq_end = (total_seqs * (rank + 1)) // world_size + val_loss_sum = torch.zeros((), device=device, dtype=torch.float64) + val_token_count = torch.zeros((), device=device, dtype=torch.float64) + val_byte_count = torch.zeros((), device=device, dtype=torch.float64) + model.eval() + with torch.inference_mode(): + for batch_seq_start in range(seq_start, seq_end, local_batch_seqs): + batch_seq_end = min(batch_seq_start + local_batch_seqs, seq_end) + raw_start = batch_seq_start * seq_len + raw_end = batch_seq_end * seq_len + 1 + local = val_tokens[raw_start:raw_end].to(device=device, dtype=torch.int64, non_blocking=True) + x = local[:-1].reshape(-1, seq_len) + y = local[1:].reshape(-1, seq_len) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + batch_loss = model(x, y).detach() + batch_token_count = float(y.numel()) + val_loss_sum += batch_loss.to(torch.float64) * batch_token_count + val_token_count += batch_token_count + prev_ids = x.reshape(-1) + tgt_ids = y.reshape(-1) + token_bytes = base_bytes_lut[tgt_ids].to(dtype=torch.int16) + token_bytes += (has_leading_space_lut[tgt_ids] & ~is_boundary_token_lut[prev_ids]).to(dtype=torch.int16) + val_byte_count += token_bytes.to(torch.float64).sum() + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(val_loss_sum, op=dist.ReduceOp.SUM) + dist.all_reduce(val_token_count, op=dist.ReduceOp.SUM) + dist.all_reduce(val_byte_count, op=dist.ReduceOp.SUM) + val_loss = val_loss_sum / val_token_count + bits_per_token = val_loss.item() / math.log(2.0) + tokens_per_byte = val_token_count.item() / val_byte_count.item() + model.train() + return float(val_loss.item()), float(bits_per_token * tokens_per_byte) +CONTROL_TENSOR_NAME_PATTERNS = tuple( + pattern + for pattern in os.environ.get( + "CONTROL_TENSOR_NAME_PATTERNS", + "attn_scale,attn_scales,mlp_scale,mlp_scales,resid_mix,resid_mixes,q_gain,skip_weight,skip_weights,smear,dtg_gate,ve_layer_scales,ve_shared.scale", + ).split(",") + if pattern +) +INT8_KEEP_FLOAT_FP32_NAME_PATTERNS = tuple( + pattern + for pattern in os.environ.get( + "INT8_KEEP_FLOAT_FP32_NAME_PATTERNS", + ",".join(CONTROL_TENSOR_NAME_PATTERNS), + ).split(",") + if pattern +) +INT8_KEEP_FLOAT_MAX_NUMEL = 65_536 +INT8_KEEP_FLOAT_STORE_DTYPE = torch.float16 +INT8_PER_ROW_SCALE_DTYPE = torch.float16 +INT8_CLIP_PERCENTILE = 99.99984 +INT8_CLIP_Q = INT8_CLIP_PERCENTILE / 100.0 +def tensor_nbytes(t: Tensor) -> int: + return int(t.numel()) * int(t.element_size()) +def keep_float_tensor(name: str, t: Tensor, passthrough_orig_dtypes: dict[str, str]) -> Tensor: + if any(pattern in name for pattern in INT8_KEEP_FLOAT_FP32_NAME_PATTERNS): + return t.float().contiguous() + if t.dtype in {torch.float32, torch.bfloat16}: + passthrough_orig_dtypes[name] = str(t.dtype).removeprefix("torch.") + return t.to(dtype=INT8_KEEP_FLOAT_STORE_DTYPE).contiguous() + return t +def quantize_float_tensor(t: Tensor) -> tuple[Tensor, Tensor]: + t32 = t.float() + if t32.ndim == 2: + clip_abs = ( + torch.quantile(t32.abs(), INT8_CLIP_Q, dim=1) + if t32.numel() + else torch.empty((t32.shape[0],), dtype=torch.float32) + ) + clipped = torch.maximum(torch.minimum(t32, clip_abs[:, None]), -clip_abs[:, None]) + scale = (clip_abs / 127.0).clamp_min(1.0 / 127.0) + q = torch.clamp(torch.round(clipped / scale[:, None]), -127, 127).to(torch.int8).contiguous() + return q, scale.to(dtype=INT8_PER_ROW_SCALE_DTYPE).contiguous() + clip_abs = float(torch.quantile(t32.abs().flatten(), INT8_CLIP_Q).item()) if t32.numel() else 0.0 + scale = torch.tensor(clip_abs / 127.0 if clip_abs > 0 else 1.0, dtype=torch.float32) + q = torch.clamp(torch.round(torch.clamp(t32, -clip_abs, clip_abs) / scale), -127, 127).to(torch.int8).contiguous() + return q, scale +def quantize_state_dict_int8(state_dict: dict[str, Tensor]): + quantized: dict[str, Tensor] = {} + scales: dict[str, Tensor] = {} + dtypes: dict[str, str] = {} + passthrough: dict[str, Tensor] = {} + passthrough_orig_dtypes: dict[str, str] = {} + qmeta: dict[str, dict[str, object]] = {} + stats = dict.fromkeys( + ("param_count", "num_tensors", "num_float_tensors", "num_nonfloat_tensors", "baseline_tensor_bytes", "int8_payload_bytes"), + 0, + ) + for name, tensor in state_dict.items(): + t = tensor.detach().to("cpu").contiguous() + stats["param_count"] += int(t.numel()) + stats["num_tensors"] += 1 + stats["baseline_tensor_bytes"] += tensor_nbytes(t) + if not t.is_floating_point(): + stats["num_nonfloat_tensors"] += 1 + passthrough[name] = t + stats["int8_payload_bytes"] += tensor_nbytes(t) + continue + if t.numel() <= INT8_KEEP_FLOAT_MAX_NUMEL: + kept = keep_float_tensor(name, t, passthrough_orig_dtypes) + passthrough[name] = kept + stats["int8_payload_bytes"] += tensor_nbytes(kept) + continue + stats["num_float_tensors"] += 1 + q, s = quantize_float_tensor(t) + if s.ndim > 0: + qmeta[name] = {"scheme": "per_row", "axis": 0} + quantized[name] = q + scales[name] = s + dtypes[name] = str(t.dtype).removeprefix("torch.") + stats["int8_payload_bytes"] += tensor_nbytes(q) + tensor_nbytes(s) + obj: dict[str, object] = { + "__quant_format__": "int8_clean_per_row_v1", + "quantized": quantized, + "scales": scales, + "dtypes": dtypes, + "passthrough": passthrough, + } + if qmeta: + obj["qmeta"] = qmeta + if passthrough_orig_dtypes: + obj["passthrough_orig_dtypes"] = passthrough_orig_dtypes + return obj, stats +def dequantize_state_dict_int8(obj: dict[str, object]) -> dict[str, Tensor]: + out: dict[str, Tensor] = {} + qmeta = obj.get("qmeta", {}) + passthrough_orig_dtypes = obj.get("passthrough_orig_dtypes", {}) + for name, q in obj["quantized"].items(): + dtype = getattr(torch, obj["dtypes"][name]) + s = obj["scales"][name] + if qmeta.get(name, {}).get("scheme") == "per_row" or s.ndim > 0: + s = s.to(dtype=torch.float32) + out[name] = (q.float() * s.view(q.shape[0], *([1] * (q.ndim - 1)))).to(dtype=dtype).contiguous() + else: + scale = float(s.item()) + out[name] = (q.float() * scale).to(dtype=dtype).contiguous() + for name, t in obj["passthrough"].items(): + out_t = t.detach().to("cpu").contiguous() + orig_dtype = passthrough_orig_dtypes.get(name) + if isinstance(orig_dtype, str): + out_t = out_t.to(dtype=getattr(torch, orig_dtype)).contiguous() + out[name] = out_t + return out +def load_data_shard(file: Path) -> Tensor: + header_bytes = 256 * np.dtype(" None: + self.file_idx = (self.file_idx + 1) % len(self.files) + self.tokens = load_data_shard(self.files[self.file_idx]) + self.pos = 0 + def take(self, n: int) -> Tensor: + chunks: list[Tensor] = [] + remaining = n + while remaining > 0: + avail = self.tokens.numel() - self.pos + if avail <= 0: + self._advance_file() + continue + k = min(remaining, avail) + chunks.append(self.tokens[self.pos : self.pos + k]) + self.pos += k + remaining -= k + return chunks[0] if len(chunks) == 1 else torch.cat(chunks) +class DistributedTokenLoader: + def __init__(self, pattern: str, rank: int, world_size: int, device: torch.device): + self.rank = rank + self.world_size = world_size + self.device = device + self.stream = TokenStream(pattern) + def next_batch(self, global_tokens: int, seq_len: int, grad_accum_steps: int) -> tuple[Tensor, Tensor]: + local_tokens = global_tokens // (self.world_size * grad_accum_steps) + per_rank_span = local_tokens + 1 + chunk = self.stream.take(per_rank_span * self.world_size) + start = self.rank * per_rank_span + local = chunk[start : start + per_rank_span].to(dtype=torch.int64) + x = local[:-1].reshape(-1, seq_len) + y = local[1:].reshape(-1, seq_len) + return x.to(self.device, non_blocking=True), y.to(self.device, non_blocking=True) +class RMSNorm(nn.Module): + def __init__(self, eps: float | None = None): + super().__init__() + self.eps = eps + def forward(self, x: Tensor) -> Tensor: + return F.rms_norm(x, (x.size(-1),), eps=self.eps) +class CastedLinear(nn.Linear): + _qat_enabled: bool = False + def forward(self, x: Tensor) -> Tensor: + w = self.weight.to(x.dtype) + if CastedLinear._qat_enabled and self.training and w.ndim == 2: + with torch.no_grad(): + w32 = self.weight.float() + # Use 99.95th percentile clipping to match GPTQ export quantizer + row_clip = torch.quantile(w32.abs(), 0.9995, dim=1) + scale = (row_clip / 31.0).clamp_min(1.0 / 31.0) + w_q = (torch.clamp(torch.round(w32 / scale[:, None]), -32, 31) * scale[:, None]).to(x.dtype) + w = w + (w_q - w).detach() + bias = self.bias.to(x.dtype) if self.bias is not None else None + return F.linear(x, w, bias) +def restore_low_dim_params_to_fp32(module: nn.Module) -> None: + with torch.no_grad(): + for name, param in module.named_parameters(): + if (param.ndim < 2 or any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS)) and param.dtype != torch.float32: + param.data = param.data.float() +class Rotary(nn.Module): + def __init__(self, dim: int, base: float = 10000.0, train_seq_len: int = 1024, rope_dims: int = 0): + super().__init__() + self.dim = dim + self.base = base + self.train_seq_len = train_seq_len + self.rope_dims = rope_dims if rope_dims > 0 else dim + inv_freq = 1.0 / (base ** (torch.arange(0, self.rope_dims, 2, dtype=torch.float32) / self.rope_dims)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self._seq_len_cached = 0 + self._cos_cached: Tensor | None = None + self._sin_cached: Tensor | None = None + def forward(self, seq_len: int, device: torch.device, dtype: torch.dtype) -> tuple[Tensor, Tensor]: + if ( + self._cos_cached is None + or self._sin_cached is None + or self._seq_len_cached != seq_len + or self._cos_cached.device != device + ): + rd = self.rope_dims + if seq_len > self.train_seq_len: + scale = seq_len / self.train_seq_len + new_base = self.base * (scale ** (rd / (rd - 2))) + inv_freq = 1.0 / (new_base ** (torch.arange(0, rd, 2, dtype=torch.float32, device=device) / rd)) + else: + inv_freq = self.inv_freq.to(device) + t = torch.arange(seq_len, device=device, dtype=inv_freq.dtype) + freqs = torch.outer(t, inv_freq) + self._cos_cached = freqs.cos()[None, :, None, :] + self._sin_cached = freqs.sin()[None, :, None, :] + self._seq_len_cached = seq_len + return self._cos_cached.to(dtype=dtype), self._sin_cached.to(dtype=dtype) +def apply_rotary_emb(x: Tensor, cos: Tensor, sin: Tensor, rope_dims: int = 0) -> Tensor: + if rope_dims > 0 and rope_dims < x.size(-1): + x_rope, x_pass = x[..., :rope_dims], x[..., rope_dims:] + half = rope_dims // 2 + x1, x2 = x_rope[..., :half], x_rope[..., half:] + x_rope = torch.cat((x1 * cos + x2 * sin, x1 * (-sin) + x2 * cos), dim=-1) + return torch.cat((x_rope, x_pass), dim=-1) + half = x.size(-1) // 2 + x1, x2 = x[..., :half], x[..., half:] + return torch.cat((x1 * cos + x2 * sin, x1 * (-sin) + x2 * cos), dim=-1) +class CausalSelfAttention(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + num_kv_heads: int, + rope_base: float, + qk_gain_init: float, + ): + super().__init__() + if dim % num_heads != 0: + raise ValueError("model_dim must be divisible by num_heads") + if num_heads % num_kv_heads != 0: + raise ValueError("num_heads must be divisible by num_kv_heads") + self.num_heads = num_heads + self.num_kv_heads = num_kv_heads + self.head_dim = dim // num_heads + if self.head_dim % 2 != 0: + raise ValueError("head_dim must be even for RoPE") + kv_dim = self.num_kv_heads * self.head_dim + self.c_q = CastedLinear(dim, dim, bias=False) + self.c_k = CastedLinear(dim, kv_dim, bias=False) + self.c_v = CastedLinear(dim, kv_dim, bias=False) + self.proj = CastedLinear(dim, dim, bias=False) + self.proj._zero_init = True + self.q_gain = nn.Parameter(torch.full((num_heads,), qk_gain_init, dtype=torch.float32)) + self.rope_dims = 0 # set by GPT.__init__ for partial RoPE + self.rotary = Rotary(self.head_dim, base=rope_base, train_seq_len=1024) + self.use_xsa = False # set by GPT.__init__ for deep layers only + def _xsa_efficient(self, y: Tensor, v: Tensor) -> Tensor: + """Efficient XSA: subtract self-value projection via GQA-aware reshape (no repeat_interleave). + y: [B, T, H, D], v: [B, T, Hkv, D]. H must be divisible by Hkv.""" + B, T, H, D = y.shape + Hkv = v.size(-2) + group = H // Hkv + y_g = y.reshape(B, T, Hkv, group, D) # [B, T, Hkv, group, D] + vn = F.normalize(v, dim=-1).unsqueeze(-2) # [B, T, Hkv, 1, D] — broadcast ready + proj = (y_g * vn).sum(dim=-1, keepdim=True) * vn + return (y_g - proj).reshape(B, T, H, D) + def forward(self, x: Tensor, v_embed: Tensor | None = None) -> Tensor: + bsz, seqlen, dim = x.shape + q = self.c_q(x).reshape(bsz, seqlen, self.num_heads, self.head_dim) + k = self.c_k(x).reshape(bsz, seqlen, self.num_kv_heads, self.head_dim) + v = self.c_v(x) + if v_embed is not None: + v = v + v_embed + v = v.reshape(bsz, seqlen, self.num_kv_heads, self.head_dim) + q = F.rms_norm(q, (q.size(-1),)) + k = F.rms_norm(k, (k.size(-1),)) + cos, sin = self.rotary(seqlen, x.device, q.dtype) + q = apply_rotary_emb(q, cos, sin, self.rope_dims) + k = apply_rotary_emb(k, cos, sin, self.rope_dims) + q = q * self.q_gain.to(dtype=q.dtype)[None, None, :, None] + y = flash_attn_3_func(q, k, v, causal=True) + if self.use_xsa: + y = self._xsa_efficient(y, v) + y = y.reshape(bsz, seqlen, dim) + return self.proj(y) +class SmearGate(nn.Module): + def __init__(self, dim: int): + super().__init__() + self.gate = nn.Parameter(torch.zeros(dim, dtype=torch.float32)) + def forward(self, x: Tensor) -> Tensor: + g = torch.sigmoid(self.gate.to(dtype=x.dtype))[None, None, :] + x_prev = torch.cat([torch.zeros_like(x[:, :1]), x[:, :-1]], dim=1) + return (1 - g) * x + g * x_prev +class BigramHashEmbedding(nn.Module): + def __init__(self, bigram_vocab_size: int, bigram_dim: int, model_dim: int): + super().__init__() + self.bigram_vocab_size = bigram_vocab_size + self.embed = nn.Embedding(bigram_vocab_size, bigram_dim) + nn.init.zeros_(self.embed.weight) + self.proj = CastedLinear(bigram_dim, model_dim, bias=False) if bigram_dim != model_dim else None + if self.proj is not None: + nn.init.zeros_(self.proj.weight) + self.scale = nn.Parameter(torch.tensor(0.05, dtype=torch.float32)) + def bigram_hash(self, tokens: Tensor) -> Tensor: + t = tokens.to(torch.int32) + mod = self.bigram_vocab_size - 1 + out = torch.empty_like(t) + out[..., 0] = mod + out[..., 1:] = torch.bitwise_xor(36313 * t[..., 1:], 27191 * t[..., :-1]) % mod + return out.long() + def forward(self, token_ids: Tensor) -> Tensor: + h = self.embed(self.bigram_hash(token_ids)) + if self.proj is not None: + h = self.proj(h) + return h * self.scale.to(dtype=h.dtype) +class ValueEmbedding(nn.Module): + """Reinject token identity into attention values at specific layers. + Each table maps vocab tokens to a low-dim embedding, projected to model_dim.""" + def __init__(self, vocab_size: int, ve_dim: int, model_dim: int): + super().__init__() + self.embed = nn.Embedding(vocab_size, ve_dim) + nn.init.normal_(self.embed.weight, std=0.01) + self.proj = CastedLinear(ve_dim, model_dim, bias=False) if ve_dim != model_dim else None + if self.proj is not None: + nn.init.zeros_(self.proj.weight) + self.scale = nn.Parameter(torch.tensor(0.1, dtype=torch.float32)) + def forward(self, token_ids: Tensor) -> Tensor: + h = self.embed(token_ids) + if self.proj is not None: + h = self.proj(h) + return h * self.scale.to(dtype=h.dtype) +class MLP(nn.Module): + def __init__(self, dim: int, mlp_mult: int, mlp_act: str = "relu_sq", mlp_leaky_slope: float = 0.5): + super().__init__() + hidden = int(mlp_mult * dim) + self.fc = CastedLinear(dim, hidden, bias=False) + self.proj = CastedLinear(hidden, dim, bias=False) + self.proj._zero_init = True + self.mlp_act = mlp_act + self.mlp_leaky_slope = mlp_leaky_slope + if self.mlp_act not in {"relu_sq", "leaky_relu_sq"}: + raise ValueError(f"Unsupported MLP_ACT '{self.mlp_act}'. Use 'relu_sq' or 'leaky_relu_sq'.") + def forward(self, x: Tensor) -> Tensor: + x = self.fc(x) + if self.mlp_act == "leaky_relu_sq": + x = F.leaky_relu(x, negative_slope=self.mlp_leaky_slope) + else: + x = F.relu(x) + return self.proj(x.square()) +class Block(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + num_kv_heads: int, + mlp_mult: int, + rope_base: float, + qk_gain_init: float, + layer_idx: int = 0, + ln_scale: bool = False, + dtg: bool = False, + mlp_act: str = "relu_sq", + mlp_leaky_slope: float = 0.5, + ): + super().__init__() + self.attn_norm = RMSNorm() + self.mlp_norm = RMSNorm() + self.attn = CausalSelfAttention(dim, num_heads, num_kv_heads, rope_base, qk_gain_init) + self.mlp = MLP(dim, mlp_mult, mlp_act=mlp_act, mlp_leaky_slope=mlp_leaky_slope) + self.attn_scale = nn.Parameter(torch.ones(dim, dtype=torch.float32)) + self.mlp_scale = nn.Parameter(torch.ones(dim, dtype=torch.float32)) + self.resid_mix = nn.Parameter(torch.stack((torch.ones(dim), torch.zeros(dim))).float()) + self.ln_scale_factor = 1.0 / math.sqrt(layer_idx + 1) if ln_scale else 1.0 + if dtg: + self.dtg_gate = nn.Linear(dim, 1, bias=True) + nn.init.zeros_(self.dtg_gate.weight) + nn.init.constant_(self.dtg_gate.bias, 2.0) + else: + self.dtg_gate = None + def forward(self, x: Tensor, x0: Tensor, v_embed: Tensor | None = None) -> Tensor: + mix = self.resid_mix.to(dtype=x.dtype) + x_in = mix[0][None, None, :] * x + mix[1][None, None, :] * x0 + attn_out = self.attn(self.attn_norm(x_in) * self.ln_scale_factor, v_embed=v_embed) + x_out = x_in + self.attn_scale.to(dtype=x_in.dtype)[None, None, :] * attn_out + x_out = x_out + self.mlp_scale.to(dtype=x_out.dtype)[None, None, :] * self.mlp(self.mlp_norm(x_out) * self.ln_scale_factor) + if self.dtg_gate is not None: + gate = torch.sigmoid(self.dtg_gate(x_in.detach())) + x_out = x_in + gate * (x_out - x_in) + return x_out +class GPT(nn.Module): + def __init__( + self, + vocab_size: int, + num_layers: int, + model_dim: int, + num_heads: int, + num_kv_heads: int, + mlp_mult: int, + tie_embeddings: bool, + tied_embed_init_std: float, + logit_softcap: float, + rope_base: float, + qk_gain_init: float, + mtp_num_heads: int = 0, + mtp_loss_weight: float = 0.1, + bigram_vocab_size: int = 0, + bigram_dim: int = 128, + xsa_last_n: int = 0, + rope_dims: int = 0, + ln_scale: bool = False, + dtg: bool = False, + ve_enabled: bool = False, + ve_dim: int = 128, + ve_layers: str = "9,10", + mlp_act: str = "relu_sq", + mlp_leaky_slope: float = 0.5, + f1_corr_rank: int = 0, + f1_corr_scale_init: float = 0.10, + ): + super().__init__() + self._ve_target_dim = num_kv_heads * (model_dim // num_heads) # kv_dim for value projection + if logit_softcap <= 0.0: + raise ValueError(f"logit_softcap must be positive, got {logit_softcap}") + self.tie_embeddings = tie_embeddings + self.tied_embed_init_std = tied_embed_init_std + self.logit_softcap = logit_softcap + self.mtp_num_heads = mtp_num_heads + self.mtp_loss_weight = mtp_loss_weight + self.tok_emb = nn.Embedding(vocab_size, model_dim) + self.bigram = BigramHashEmbedding(bigram_vocab_size, bigram_dim, model_dim) if bigram_vocab_size > 0 else None + self.smear = SmearGate(model_dim) + self.num_encoder_layers = num_layers // 2 + self.num_decoder_layers = num_layers - self.num_encoder_layers + self.num_skip_weights = min(self.num_encoder_layers, self.num_decoder_layers) + self.skip_weights = nn.Parameter(torch.ones(self.num_skip_weights, model_dim, dtype=torch.float32)) + self.blocks = nn.ModuleList( + [ + Block( + model_dim, + num_heads, + num_kv_heads, + mlp_mult, + rope_base, + qk_gain_init, + layer_idx=i, + ln_scale=ln_scale, + dtg=dtg, + mlp_act=mlp_act, + mlp_leaky_slope=mlp_leaky_slope, + ) + for i in range(num_layers) + ] + ) + if rope_dims > 0: + head_dim = model_dim // num_heads + for block in self.blocks: + block.attn.rope_dims = rope_dims + block.attn.rotary = Rotary(head_dim, base=rope_base, train_seq_len=1024, rope_dims=rope_dims) + self.ve_layer_indices = [int(x) for x in ve_layers.split(",") if x.strip()] if ve_enabled else [] + kv_dim = self._ve_target_dim + if self.ve_layer_indices: + self.ve_shared = ValueEmbedding(vocab_size, ve_dim, kv_dim) + self.ve_layer_scales = nn.ParameterList( + [nn.Parameter(torch.ones(1, dtype=torch.float32)) for _ in self.ve_layer_indices] + ) + else: + self.ve_shared = None + self.ve_layer_scales = nn.ParameterList() + self.value_embeds = nn.ModuleList() # keep empty for compat + self.final_norm = RMSNorm() + self.lm_head = None if tie_embeddings else CastedLinear(model_dim, vocab_size, bias=False) + if self.lm_head is not None: + self.lm_head._zero_init = True + self.mtp_heads = nn.ModuleList( + [CastedLinear(model_dim, vocab_size, bias=False) for _ in range(mtp_num_heads)] + ) + for head in self.mtp_heads: + head._zero_init = True + # Low-rank correction path for extra capacity under size budget. + self.f1_corr_rank = f1_corr_rank + if f1_corr_rank > 0: + self.f1_corr_in = CastedLinear(model_dim, f1_corr_rank, bias=False) + self.f1_corr_out = CastedLinear(f1_corr_rank, vocab_size, bias=False) + self.f1_corr_out._zero_init = True + self.f1_corr_scale = nn.Parameter(torch.tensor(f1_corr_scale_init, dtype=torch.float32)) + else: + self.f1_corr_in = None + self.f1_corr_out = None + self.f1_corr_scale = None + if xsa_last_n > 0: + for i in range(max(0, num_layers - xsa_last_n), num_layers): + self.blocks[i].attn.use_xsa = True + self._init_weights() + def _init_weights(self) -> None: + if self.tie_embeddings: + nn.init.normal_(self.tok_emb.weight, mean=0.0, std=self.tied_embed_init_std) + num_layers = len(self.blocks) + for name, module in self.named_modules(): + if isinstance(module, nn.Linear): + if getattr(module, "_zero_init", False): + nn.init.zeros_(module.weight) + elif module.weight.ndim == 2 and module.weight.shape[0] >= 64 and module.weight.shape[1] >= 64: + nn.init.orthogonal_(module.weight, gain=1.0) + if ".proj." in name or name.endswith(".proj"): + with torch.no_grad(): + module.weight.mul_(1.0 / math.sqrt(2 * num_layers)) + def _get_ve(self, layer_idx: int, input_ids: Tensor, ve_cache: dict | None = None) -> Tensor | None: + """Get value embedding for a specific layer using shared table + per-layer scale.""" + if self.ve_shared is None or layer_idx not in self.ve_layer_indices: + return None + if ve_cache is not None and 've' not in ve_cache: + ve_cache['ve'] = self.ve_shared(input_ids) + ve_base = ve_cache['ve'] if ve_cache is not None else self.ve_shared(input_ids) + ve_idx = self.ve_layer_indices.index(layer_idx) + return ve_base * self.ve_layer_scales[ve_idx].to(dtype=ve_base.dtype) + def forward(self, input_ids: Tensor, target_ids: Tensor) -> Tensor: + x = self.tok_emb(input_ids) + if self.bigram is not None: + x = x + self.bigram(input_ids) + x = F.rms_norm(x, (x.size(-1),)) + x = self.smear(x) + x0 = x + skips: list[Tensor] = [] + ve_cache: dict = {} + for i in range(self.num_encoder_layers): + ve = self._get_ve(i, input_ids, ve_cache) + x = self.blocks[i](x, x0, v_embed=ve) + skips.append(x) + for i in range(self.num_decoder_layers): + bi = self.num_encoder_layers + i + if skips: + x = x + self.skip_weights[i].to(dtype=x.dtype)[None, None, :] * skips.pop() + ve = self._get_ve(bi, input_ids, ve_cache) + x = self.blocks[bi](x, x0, v_embed=ve) + x = self.final_norm(x) + x_flat = x.reshape(-1, x.size(-1)) + targets = target_ids.reshape(-1) + if self.tie_embeddings: + logits_proj = F.linear(x_flat, self.tok_emb.weight) + else: + if self.lm_head is None: + raise RuntimeError("lm_head is required when tie_embeddings=False") + logits_proj = self.lm_head(x_flat) + if self.f1_corr_in is not None and self.f1_corr_out is not None and self.f1_corr_scale is not None: + corr_hidden = F.silu(self.f1_corr_in(x_flat)) + corr_proj = self.f1_corr_out(corr_hidden) + logits_proj = logits_proj + self.f1_corr_scale.to(dtype=logits_proj.dtype) * corr_proj + logits = self.logit_softcap * torch.tanh(logits_proj / self.logit_softcap) + if hasattr(self, '_ngram_tracker') and self._ngram_tracker is not None and self.training: + per_tok_loss = F.cross_entropy(logits.float(), targets, reduction="none") + weights = self._ngram_tracker.get_weights(input_ids, target_ids) + main_loss = (per_tok_loss * weights).mean() + else: + main_loss = F.cross_entropy(logits.float(), targets, reduction="mean") + if self.training and self.mtp_num_heads > 0 and self.mtp_loss_weight > 0.0: + _, seqlen, dim = x.shape + mtp_loss_sum = x.new_zeros(()) + mtp_loss_count = 0 + for k, mtp_head in enumerate(self.mtp_heads): + valid_t = seqlen - (k + 1) + if valid_t <= 0: + continue + mtp_hidden = x[:, :valid_t, :].reshape(-1, dim) + mtp_targets = target_ids[:, k + 1 :].reshape(-1) + mtp_logits_proj = mtp_head(mtp_hidden) + mtp_logits = self.logit_softcap * torch.tanh(mtp_logits_proj / self.logit_softcap) + mtp_loss_sum = mtp_loss_sum + F.cross_entropy(mtp_logits.float(), mtp_targets, reduction="mean") + mtp_loss_count += 1 + if mtp_loss_count > 0: + main_loss = main_loss + self.mtp_loss_weight * (mtp_loss_sum / mtp_loss_count) + return main_loss + def forward_logits(self, input_ids: Tensor) -> Tensor: + """Return logits (bsz, seq_len, vocab) without computing loss.""" + x = self.tok_emb(input_ids) + if self.bigram is not None: + x = x + self.bigram(input_ids) + x = F.rms_norm(x, (x.size(-1),)) + x = self.smear(x) + x0 = x + skips: list[Tensor] = [] + ve_cache: dict = {} + for i in range(self.num_encoder_layers): + ve = self._get_ve(i, input_ids, ve_cache) + x = self.blocks[i](x, x0, v_embed=ve) + skips.append(x) + for i in range(self.num_decoder_layers): + bi = self.num_encoder_layers + i + if skips: + x = x + self.skip_weights[i].to(dtype=x.dtype)[None, None, :] * skips.pop() + ve = self._get_ve(bi, input_ids, ve_cache) + x = self.blocks[bi](x, x0, v_embed=ve) + x = self.final_norm(x) + if self.tie_embeddings: + logits_proj = F.linear(x, self.tok_emb.weight) + else: + logits_proj = self.lm_head(x) + if self.f1_corr_in is not None and self.f1_corr_out is not None and self.f1_corr_scale is not None: + corr_hidden = F.silu(self.f1_corr_in(x)) + corr_proj = self.f1_corr_out(corr_hidden) + logits_proj = logits_proj + self.f1_corr_scale.to(dtype=logits_proj.dtype) * corr_proj + return self.logit_softcap * torch.tanh(logits_proj / self.logit_softcap) +def eval_val_sliding( + args: Hyperparameters, + base_model: nn.Module, + rank: int, + world_size: int, + device: torch.device, + val_tokens: Tensor, + base_bytes_lut: Tensor, + has_leading_space_lut: Tensor, + is_boundary_token_lut: Tensor, + stride: int, + batch_seqs: int = 128, + eval_seq_len: int | None = None, +) -> tuple[float, float]: + """Sliding window evaluation: each token scored with maximum context.""" + seq_len = eval_seq_len or args.train_seq_len + total_tokens = val_tokens.numel() - 1 + window_starts = [ws for ws in range(0, total_tokens, stride) + if min(ws + seq_len, total_tokens) - ws >= 1] + total_windows = len(window_starts) + my_s = (total_windows * rank) // world_size + my_e = (total_windows * (rank + 1)) // world_size + my_windows = window_starts[my_s:my_e] + loss_sum = torch.zeros((), device=device, dtype=torch.float64) + token_count = torch.zeros((), device=device, dtype=torch.float64) + byte_count = torch.zeros((), device=device, dtype=torch.float64) + base_model.eval() + compiled_logits = maybe_torch_compile(base_model.forward_logits, args) + with torch.inference_mode(): + for bi in range(0, len(my_windows), batch_seqs): + batch_ws = my_windows[bi:bi + batch_seqs] + bsz = len(batch_ws) + x_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + y_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + wlens: list[int] = [] + for i, ws in enumerate(batch_ws): + end = min(ws + seq_len, total_tokens) + wlen = end - ws + wlens.append(wlen) + chunk = val_tokens[ws:end + 1].to(dtype=torch.int64, device=device) + x_batch[i, :wlen] = chunk[:-1] + y_batch[i, :wlen] = chunk[1:] + with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + logits = compiled_logits(x_batch) + nll = F.cross_entropy( + logits.reshape(-1, logits.size(-1)).float(), + y_batch.reshape(-1), + reduction="none", + ).reshape(bsz, seq_len) + for i, ws in enumerate(batch_ws): + wlen = wlens[i] + s = 0 if ws == 0 else max(wlen - stride, 0) + scored_nll = nll[i, s:wlen].to(torch.float64) + loss_sum += scored_nll.sum() + token_count += float(wlen - s) + tgt = y_batch[i, s:wlen] + prev = x_batch[i, s:wlen] + tb = base_bytes_lut[tgt].to(torch.float64) + tb += (has_leading_space_lut[tgt] & ~is_boundary_token_lut[prev]).to(torch.float64) + byte_count += tb.sum() + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(loss_sum, op=dist.ReduceOp.SUM) + dist.all_reduce(token_count, op=dist.ReduceOp.SUM) + dist.all_reduce(byte_count, op=dist.ReduceOp.SUM) + val_loss = (loss_sum / token_count).item() + bits_per_token = val_loss / math.log(2.0) + tokens_per_byte = token_count.item() / byte_count.item() + base_model.train() + return val_loss, bits_per_token * tokens_per_byte +def _ngram_bulk_update(val_np, start, end, ctx_tables, full_tables, + min_order, max_order, primes, mask): + """Bulk update n-gram tables with a contiguous range of tokens. + All ranks call this with the SAME token range -> identical tables everywhere.""" + t = val_np[start:end].astype(np.uint64) + n = len(t) + for order in range(min_order, max_order + 1): + if n < order: + continue + ctx_width = order - 1 + ctx_hash = np.zeros(n - order + 1, dtype=np.uint64) + for k in range(ctx_width): + ctx_hash ^= t[k:n - order + 1 + k] * primes[k % len(primes)] + ctx_key = (ctx_hash & mask).astype(np.int64) + tgt = t[order - 1:] + full_key = ((ctx_hash ^ (tgt * primes[ctx_width % len(primes)])) & mask).astype(np.int64) + ctx_tables[order] += np.bincount(ctx_key, minlength=len(ctx_tables[order])).astype(np.uint32) + full_tables[order] += np.bincount(full_key, minlength=len(full_tables[order])).astype(np.uint32) + +def eval_val_sliding_hashed_ngram( + args: Hyperparameters, + base_model: nn.Module, + rank: int, + world_size: int, + device: torch.device, + val_tokens: Tensor, + base_bytes_lut: Tensor, + has_leading_space_lut: Tensor, + is_boundary_token_lut: Tensor, + stride: int, + order: int, + alpha: float, + min_count: int, + buckets: int, + max_seconds: float = 0.0, + batch_seqs: int = 128, + eval_seq_len: int | None = None, +) -> tuple[float, float, float]: + """Score-first sliding eval with chunk-based SHARED n-gram tables + cubric. + + Key design: all ranks share identical n-gram tables via bulk chunk updates. + Each chunk's windows are distributed across ranks for scoring, then ALL ranks + update tables with the same contiguous token range. Every rank sees the full + n-gram picture (not 1/world_size like per-segment updates). + + Legal: entire chunk scored before its tokens update the tables. + """ + min_order = max(args.ngram_eval_min_order, 2) + max_order = max(order, min_order) + adaptive = args.ngram_eval_adaptive + alpha_min = args.ngram_eval_alpha_min + alpha_max = args.ngram_eval_alpha_max + ent_center = args.ngram_eval_entropy_center + ent_scale = args.ngram_eval_entropy_scale + + # Parse fixed per-order multipliers (PR #809 style) + _fixed_order_mults = None + if args.ngram_order_mults_str: + _fixed_order_mults = np.array([float(x) for x in args.ngram_order_mults_str.split(",")], dtype=np.float64) + + seq_len = eval_seq_len or args.train_seq_len + total_tokens = val_tokens.numel() - 1 + + # Build all windows and total scored tokens + all_window_starts = [ws for ws in range(0, total_tokens, stride) if min(ws + seq_len, total_tokens) - ws >= 1] + total_scored_tokens = 0.0 + for ws in all_window_starts: + end = min(ws + seq_len, total_tokens) + wlen = end - ws + s = 0 if ws == 0 else max(wlen - stride, 0) + total_scored_tokens += float(max(wlen - s, 0)) + + # Group windows into chunks by scored position -- all ranks share this grouping + chunk_tokens = int(os.environ.get("NGRAM_CHUNK_TOKENS", "1048576")) # 1M default + num_chunks = (total_tokens + chunk_tokens - 1) // chunk_tokens + chunk_windows: list[list[int]] = [[] for _ in range(num_chunks)] + for ws in all_window_starts: + end = min(ws + seq_len, total_tokens) + wlen = end - ws + s = 0 if ws == 0 else max(wlen - stride, 0) + scored_start = ws + s + ci = min(scored_start // chunk_tokens, num_chunks - 1) + chunk_windows[ci].append(ws) + + val_np = val_tokens.numpy() + ctx_tables = {n: np.zeros((buckets,), dtype=np.uint32) for n in range(min_order, max_order + 1)} + full_tables = {n: np.zeros((buckets,), dtype=np.uint32) for n in range(min_order, max_order + 1)} + mask = np.uint64(buckets - 1) + primes = np.array( + [np.uint64(36313), np.uint64(27191), np.uint64(51647), np.uint64(81929), + np.uint64(131071), np.uint64(174763), np.uint64(233017), np.uint64(283721), + np.uint64(347237)], + dtype=np.uint64, + ) + + loss_sum = 0.0 + token_count = 0.0 + byte_count = 0.0 + + # Cubric 3D: per (order × entropy_bin × count_bin) adaptive alpha scaling + _NUM_ENT_BINS = 3 # low / mid / high entropy + _NUM_CNT_BINS = 3 # low / mid / high count + _ENT_EDGES = np.array([ent_center - 1.0, ent_center + 1.0]) # [2.0, 4.0] for center=3.0 + _CNT_EDGES = np.array([5.0, 50.0]) # low=<5, mid=5-50, high=>50 context count + _TOTAL_CELLS = _NUM_ENT_BINS * _NUM_CNT_BINS # 9 cells per order = 54 total + _cc = getattr(args, 'cubric_cadence', 0); _con = _cc > 0; _cfired = 0 + if _con: + # Warm-start: proven converged values from 4+ runs (orders 2-7) + # All 9 cells per order get the same warm-start, 3D cubric refines from there + _WARM = {2: 0.45, 3: 0.30, 4: 0.45, 5: 1.88, 6: 2.00, 7: 2.00, 8: 2.00, 9: 2.00} + _c_alpha_mult = {n: [_WARM.get(n, 1.0)] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + _c_hits = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + _c_beats = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + + base_model.eval() + compiled_logits = maybe_torch_compile(base_model.forward_logits, args) + t0 = time.perf_counter() + deadline = (t0 + max_seconds) if max_seconds > 0.0 else None + cutoff_hit = False + + if rank == 0: + print(f"ngram_eval:chunks={num_chunks} chunk_tokens={chunk_tokens} " + f"windows={len(all_window_starts)} shared_tables=True", flush=True) + + with torch.inference_mode(): + for ci in range(num_chunks): + if deadline is not None and time.perf_counter() >= deadline: + cutoff_hit = True + break + + windows = chunk_windows[ci] + if not windows: + continue + + # Distribute this chunk's windows across ranks + my_s = (len(windows) * rank) // world_size + my_e = (len(windows) * (rank + 1)) // world_size + my_windows = windows[my_s:my_e] + + # --- Phase 1: SCORE this chunk's windows --- + for bi in range(0, len(my_windows), batch_seqs): + batch_ws = my_windows[bi:bi + batch_seqs] + bsz = len(batch_ws) + x_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + y_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + wlens: list[int] = [] + for i, ws in enumerate(batch_ws): + end = min(ws + seq_len, total_tokens) + wlen = end - ws + wlens.append(wlen) + chunk = val_tokens[ws:end + 1].to(dtype=torch.int64, device=device) + x_batch[i, :wlen] = chunk[:-1] + y_batch[i, :wlen] = chunk[1:] + + with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + logits = compiled_logits(x_batch) + logits_f = logits.float() + nll = F.cross_entropy( + logits_f.reshape(-1, logits_f.size(-1)), + y_batch.reshape(-1), + reduction="none", + ).reshape(bsz, seq_len) + + for i, ws in enumerate(batch_ws): + wlen = wlens[i] + s = 0 if ws == 0 else max(wlen - stride, 0) + seg_len = wlen - s + if seg_len <= 0: + continue + + seg_nll = nll[i, s:wlen].to(torch.float64).cpu().numpy() + seg_model_p = np.exp(-seg_nll) + + if adaptive: + log_probs = F.log_softmax(logits_f[i, s:wlen], dim=-1) + probs_a = log_probs.exp() + entropy = -(probs_a * log_probs).sum(dim=-1).cpu().numpy() + sig = 1.0 / (1.0 + np.exp(-ent_scale * (entropy - ent_center))) + per_token_alpha = alpha_min + (alpha_max - alpha_min) * sig + # Bin entropy for 2D cubric: 0=low, 1=mid, 2=high + _ent_bins = np.digitize(entropy, _ENT_EDGES).astype(np.int32) + else: + per_token_alpha = np.full(seg_len, alpha) + _ent_bins = np.ones(seg_len, dtype=np.int32) # all mid + + global_j = np.arange(ws + s + 1, ws + wlen + 1, dtype=np.int64) + p_ng = np.zeros(seg_len, dtype=np.float64) + ng_matched = np.zeros(seg_len, dtype=np.bool_) + _ng_ord = np.zeros(seg_len, dtype=np.int32) + _ng_ctx_count = np.zeros(seg_len, dtype=np.float64) + tgt_np = val_np[global_j].astype(np.uint64) + + for n in range(max_order, min_order - 1, -1): + ctx_width = n - 1 + valid = (global_j >= ctx_width) & (~ng_matched) + if not valid.any(): + continue + v_idx = np.nonzero(valid)[0] + jv = global_j[v_idx] + ctx_hash = np.zeros(len(jv), dtype=np.uint64) + for k in range(ctx_width): + tok = val_np[jv - (ctx_width - k)].astype(np.uint64) + ctx_hash ^= tok * primes[k % len(primes)] + ctx_key = (ctx_hash & mask).astype(np.int64) + full_key = ((ctx_hash ^ (tgt_np[v_idx] * primes[ctx_width % len(primes)])) & mask).astype(np.int64) + ctx_counts = ctx_tables[n][ctx_key].astype(np.float64) + full_counts = full_tables[n][full_key].astype(np.float64) + has_data = ctx_counts >= float(min_count) + if has_data.any(): + p = np.minimum(full_counts, ctx_counts) / np.maximum(ctx_counts, 1.0) + p = np.clip(p, 0.0, 1.0) + hit_idx = v_idx[has_data] + p_ng[hit_idx] = p[has_data] + ng_matched[hit_idx] = True + _ng_ord[hit_idx] = n + _ng_ctx_count[hit_idx] = ctx_counts[has_data] + + # Mix where n-gram matched (PR #809 style or cubric 3D fallback) + if ng_matched.any(): + m_idx = np.nonzero(ng_matched)[0] + # Per-order entropy center shift (PR #809) + if adaptive and args.ngram_entropy_shift: + matched_ords = _ng_ord[m_idx].astype(np.float64) + shifted_centers = ent_center - 0.25 * (matched_ords - float(min_order)) + shifted_sig = 1.0 / (1.0 + np.exp(-ent_scale * (entropy[m_idx] - shifted_centers))) + per_token_alpha[m_idx] = alpha_min + (alpha_max - alpha_min) * shifted_sig + if _fixed_order_mults is not None: + # PR #809 fixed order multipliers (replaces cubric) + a = per_token_alpha[m_idx].copy() + mult_indices = _ng_ord[m_idx] - min_order + mult_indices = np.clip(mult_indices, 0, len(_fixed_order_mults) - 1) + a *= _fixed_order_mults[mult_indices] + np.clip(a, 0.0, 0.95, out=a) + elif _con: + a = per_token_alpha[m_idx].copy() + m_ent_bins = _ent_bins[m_idx] + m_cnt_bins = np.digitize(_ng_ctx_count[m_idx], _CNT_EDGES).astype(np.int32) + for n in range(min_order, max_order + 1): + om = _ng_ord[m_idx] == n + if not om.any(): + continue + for eb in range(_NUM_ENT_BINS): + for cb in range(_NUM_CNT_BINS): + cell = eb * _NUM_CNT_BINS + cb + mask_ecb = om & (m_ent_bins == eb) & (m_cnt_bins == cb) + if mask_ecb.any(): + _c_hits[n][cell] += int(mask_ecb.sum()) + _c_beats[n][cell] += int((p_ng[m_idx[mask_ecb]] > seg_model_p[m_idx[mask_ecb]]).sum()) + a[mask_ecb] *= _c_alpha_mult[n][cell] + np.clip(a, 0.0, 0.95, out=a) + else: + a = per_token_alpha[m_idx] + seg_model_p[m_idx] = (1.0 - a) * seg_model_p[m_idx] + a * p_ng[m_idx] + + seg_nll = -np.log(np.clip(seg_model_p, 1e-12, 1.0)) + loss_sum += float(seg_nll.sum()) + token_count += float(seg_len) + tgt = y_batch[i, s:wlen] + prev = x_batch[i, s:wlen] + tb = base_bytes_lut[tgt].to(torch.float64) + tb += (has_leading_space_lut[tgt] & ~is_boundary_token_lut[prev]).to(torch.float64) + byte_count += float(tb.sum().item()) + + # --- Phase 2: SHARED UPDATE -- all ranks update with same chunk tokens --- + chunk_start = ci * chunk_tokens + chunk_end = min((ci + 1) * chunk_tokens, total_tokens) + _ngram_bulk_update(val_np, chunk_start, chunk_end + 1, + ctx_tables, full_tables, min_order, max_order, + primes, mask) + + # Cubric 2D c-step: adapt per (order × entropy_bin) + if _con: + # Collect all (order, ent_bin, cnt_bin) cells with enough data + all_rates = [] + for n in range(min_order, max_order + 1): + for cell in range(_TOTAL_CELLS): + if _c_hits[n][cell] >= 8: + all_rates.append(_c_beats[n][cell] / _c_hits[n][cell]) + if len(all_rates) >= 4: + avg_rate = sum(all_rates) / len(all_rates) + for n in range(min_order, max_order + 1): + for cell in range(_TOTAL_CELLS): + if _c_hits[n][cell] >= 8: + rate = _c_beats[n][cell] / _c_hits[n][cell] + if rate > avg_rate + 0.05: + _c_alpha_mult[n][cell] = min(_c_alpha_mult[n][cell] * 1.03, 2.0) + elif rate < avg_rate - 0.05: + _c_alpha_mult[n][cell] = max(_c_alpha_mult[n][cell] * 0.97, 0.3) + _cfired += 1 + if rank == 0 and _cfired % 8 == 0: + parts = [] + for n in range(min_order, max_order + 1): + m = _c_alpha_mult[n] + avg_m = sum(m) / len(m) + parts.append(f"o{n}:avg={avg_m:.2f}") + print(f"cubric3d:step={_cfired} {' '.join(parts)}", flush=True) + _c_hits = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + _c_beats = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + + # Progress + if rank == 0 and (ci % 10 == 0 or ci == num_chunks - 1 or ci < 3): + elapsed = time.perf_counter() - t0 + cur_bpb = (loss_sum / max(token_count, 1.0)) / math.log(2.0) * (token_count / max(byte_count, 1.0)) if token_count > 0 else 0.0 + print( + f"ngram_eval:chunk [{ci+1}/{num_chunks}] bpb={cur_bpb:.6f} t={elapsed:.0f}s", + flush=True, + ) + + # All-reduce across ranks + _loss = torch.tensor(loss_sum, device=device, dtype=torch.float64) + _toks = torch.tensor(token_count, device=device, dtype=torch.float64) + _bytes = torch.tensor(byte_count, device=device, dtype=torch.float64) + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(_loss, op=dist.ReduceOp.SUM) + dist.all_reduce(_toks, op=dist.ReduceOp.SUM) + dist.all_reduce(_bytes, op=dist.ReduceOp.SUM) + loss_sum = _loss.item() + token_count = _toks.item() + byte_count = _bytes.item() + + coverage = token_count / max(total_scored_tokens, 1.0) + if cutoff_hit: + elapsed = time.perf_counter() - t0 + print( + f"ngram_eval:cutoff max_seconds={max_seconds:.1f} " + f"coverage={coverage*100:.2f}% elapsed={elapsed:.0f}s", + flush=True, + ) + + if _con and rank == 0: + print(f"cubric3d:final c_steps={_cfired} cells={_TOTAL_CELLS}x{max_order-min_order+1}={_TOTAL_CELLS*(max_order-min_order+1)}", flush=True) + for n in range(min_order, max_order + 1): + m = _c_alpha_mult[n] + row = " ".join(f"{m[cell]:.2f}" for cell in range(_TOTAL_CELLS)) + print(f" o{n}: [{row}]", flush=True) + val_loss = loss_sum / max(token_count, 1.0) + val_bpb = val_loss / math.log(2.0) * (token_count / max(byte_count, 1.0)) + base_model.train() + return val_loss, val_bpb, coverage +def _classify_param(name: str) -> str: + if "tok_emb" in name or "lm_head" in name: + return "embed" + if "f1_corr_in" in name or "f1_corr_out" in name: + return "aux" + if ".mlp." in name: + return "mlp" + if ".attn." in name or (".proj." in name and ".mlp." not in name): + return "attn" + return "other" +# --------------------------------------------------------------------------- +# GPTQ: Hessian-aware quantization with column-wise error compensation +# --------------------------------------------------------------------------- +def _find_best_row_scales(W: Tensor, clip_range: int = 31) -> Tensor: + """Find optimal per-row scales by searching percentile clipping thresholds.""" + t32 = W.float() + best_s = t32.abs().amax(dim=1) / clip_range + best_s = best_s.clamp_min(1.0 / clip_range) + best_err = torch.full((t32.shape[0],), float('inf')) + for pct in [0.9990, 0.9995, 0.9999, 0.99999, 1.0]: + if pct < 1.0: + row_clip = torch.quantile(t32.abs(), pct, dim=1) + else: + row_clip = t32.abs().amax(dim=1) + s = (row_clip / clip_range).clamp_min(1.0 / clip_range) + q = torch.clamp(torch.round(t32 / s[:, None]), -clip_range, clip_range) + recon = q * s[:, None] + err = (t32 - recon).pow(2).mean(dim=1) + improved = err < best_err + best_s[improved] = s[improved] + best_err[improved] = err[improved] + return best_s +def gptq_quantize_weight(W: Tensor, H: Tensor, clip_range: int = 31, + block_size: int = 64, percdamp: float = 0.002) -> tuple[Tensor, Tensor]: + """GPTQ: quantize weight matrix W using Hessian H = X^T X for error compensation. + Uses pre-computed per-row scales and column reordering by Hessian diagonal. + Returns (quantized_int8, scale_fp16) in int6 range [-clip_range, clip_range].""" + W = W.float().clone() + rows, cols = W.shape + # Pre-compute optimal per-row scales from the original weight matrix + row_scale = _find_best_row_scales(W, clip_range) + H = H.float().clone() + damp = percdamp * H.diag().mean() + H.diagonal().add_(damp) + # Column reordering: process least-important columns first (ascending H_diag) + perm = torch.argsort(H.diag()) + invperm = torch.argsort(perm) + W = W[:, perm] + H = H[perm][:, perm] + try: + L = torch.linalg.cholesky(H) + Hinv = torch.cholesky_inverse(L) + except torch._C._LinAlgError: + Hinv = torch.diag(1.0 / H.diag().clamp_min(1e-6)) + Q = torch.zeros(rows, cols, dtype=torch.int8) + for i1 in range(0, cols, block_size): + i2 = min(i1 + block_size, cols) + W_block = W[:, i1:i2].clone() + Hinv_block = Hinv[i1:i2, i1:i2] + Err = torch.zeros_like(W_block) + for j in range(i2 - i1): + w_col = W_block[:, j] + h_inv_jj = Hinv_block[j, j].clamp_min(1e-8) + # Quantize using pre-computed per-row scales + q_col = torch.clamp(torch.round(w_col / row_scale), -clip_range, clip_range) + deq_col = q_col * row_scale + Q[:, i1 + j] = q_col.to(torch.int8) + err = (w_col - deq_col) / h_inv_jj + Err[:, j] = err + if j + 1 < i2 - i1: + W_block[:, j + 1:] -= err.unsqueeze(1) * Hinv_block[j, j + 1:].unsqueeze(0) + if i2 < cols: + W[:, i2:] -= Err @ Hinv[i1:i2, i2:] + # Undo column reordering + Q = Q[:, invperm] + return Q, row_scale.to(torch.float16) +def gptq_calibrate(model: nn.Module, train_pattern: str, device: torch.device, + n_samples: int = 256, seq_len: int = 2048) -> dict[str, Tensor]: + """Collect Hessian H = X^T X for each linear layer using training data.""" + hessians: dict[str, Tensor] = {} + n_seen: dict[str, int] = {} + hooks = [] + def make_hook(name: str): + def hook_fn(module, inp, out): + x = inp[0].detach().float() + if x.ndim == 3: + x = x.reshape(-1, x.shape[-1]) + if name not in hessians: + hessians[name] = torch.zeros(x.shape[1], x.shape[1], device=x.device, dtype=torch.float32) + n_seen[name] = 0 + hessians[name].addmm_(x.t(), x) + n_seen[name] += x.shape[0] + return hook_fn + for name, module in model.named_modules(): + if isinstance(module, (nn.Linear, CastedLinear)): + hooks.append(module.register_forward_hook(make_hook(name))) + stream = TokenStream(train_pattern) + model.eval() + with torch.no_grad(): + for _ in range(n_samples): + tokens = stream.take(seq_len + 1).to(device=device, dtype=torch.int64) + x = tokens[:-1].unsqueeze(0) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + model.forward_logits(x) + for h in hooks: + h.remove() + for name in hessians: + hessians[name] /= max(n_seen[name], 1) + return hessians +def mixed_quantize_int6_gptq(state_dict: dict[str, Tensor], int6_cats: set[str], + hessians: dict[str, Tensor]) -> tuple[dict, dict]: + """Like mixed_quantize_int6 but uses GPTQ for int6 categories when Hessian available.""" + result: dict[str, Tensor] = {} + meta: dict[str, object] = {} + gptq_count, naive_count = 0, 0 + for name, tensor in state_dict.items(): + t = tensor.detach().cpu().contiguous() + cat = _classify_param(name) + if not t.is_floating_point() or t.numel() <= 65536: + result[name] = t.to(torch.float16) if t.is_floating_point() else t + meta[name] = "passthrough" + continue + if any(p in name for p in CONTROL_TENSOR_NAME_PATTERNS): + result[name] = t.float() + meta[name] = "passthrough_ctrl" + continue + if cat in int6_cats and t.ndim == 2: + module_name = name.rsplit(".weight", 1)[0] if name.endswith(".weight") else name + H = hessians.get(module_name) + if H is not None and H.shape[0] == t.shape[1]: + q, s = gptq_quantize_weight(t, H.cpu()) + gptq_count += 1 + else: + q, s = quantize_int6_per_row(t) + naive_count += 1 + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int6"} + elif cat in int6_cats and t.ndim >= 1: + q, s = quantize_int6_per_row(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int6"} + naive_count += 1 + else: + q, s = quantize_float_tensor(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int8"} + print(f"gptq_quantize: {gptq_count} GPTQ layers, {naive_count} naive layers", flush=True) + return result, meta +def quantize_int6_per_row(t: Tensor, clip_range: int = 31) -> tuple[Tensor, Tensor]: + t32 = t.float() + if t32.ndim == 2: + best_q, best_s, best_err = None, None, float('inf') + for pct in [0.9990, 0.9995, 0.9999, 0.99999, 1.0]: + if pct < 1.0: + row_clip = torch.quantile(t32.abs(), pct, dim=1) + else: + row_clip = t32.abs().amax(dim=1) + s = (row_clip / clip_range).clamp_min(1.0 / clip_range).to(torch.float16) + q = torch.clamp(torch.round(t32 / s.float()[:, None]), -clip_range, clip_range).to(torch.int8) + recon = q.float() * s.float()[:, None] + err = (t32 - recon).pow(2).mean().item() + if err < best_err: + best_q, best_s, best_err = q, s, err + return best_q, best_s + amax = t32.abs().max().item() + scale = torch.tensor(amax / clip_range if amax > 0 else 1.0, dtype=torch.float16) + q = torch.clamp(torch.round(t32 / scale.float()), -clip_range, clip_range).to(torch.int8) + return q, scale +def mixed_quantize_int6(state_dict: dict[str, Tensor], int6_cats: set[str]): + num_layers_total = max( + (int(k.split(".")[1]) for k in state_dict if k.startswith("blocks.")), + default=0, + ) + 1 + late_k_layers = set(range(num_layers_total - 2, num_layers_total)) + result: dict[str, Tensor] = {} + meta: dict[str, object] = {} + for name, tensor in state_dict.items(): + t = tensor.detach().cpu().contiguous() + cat = _classify_param(name) + if not t.is_floating_point() or t.numel() <= 65536: + result[name] = t.to(torch.float16) if t.is_floating_point() else t + meta[name] = "passthrough" + continue + if any(p in name for p in CONTROL_TENSOR_NAME_PATTERNS): + result[name] = t.float() + meta[name] = "passthrough_ctrl" + continue + if cat in int6_cats and t.ndim >= 1: + q, s = quantize_int6_per_row(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int6"} + else: + q, s = quantize_float_tensor(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int8"} + return result, meta +def dequantize_mixed_int6(result: dict[str, Tensor], meta: dict[str, object], + template_sd: dict[str, Tensor]) -> dict[str, Tensor]: + out: dict[str, Tensor] = {} + for name, orig in template_sd.items(): + info = meta.get(name) + if info is None: + continue + orig_dtype = orig.dtype + if info in ("passthrough", "passthrough_ctrl", "passthrough_fp16"): + t = result[name] + if t.dtype == torch.float16 and orig_dtype in (torch.float32, torch.bfloat16): + t = t.to(orig_dtype) + out[name] = t + continue + q, s = result[name + ".q"], result[name + ".scale"] + if s.ndim > 0: + out[name] = (q.float() * s.float().view(q.shape[0], *([1] * (q.ndim - 1)))).to(orig_dtype) + else: + out[name] = (q.float() * float(s.item())).to(orig_dtype) + return out +def main() -> None: + global zeropower_via_newtonschulz5 + code = Path(__file__).read_text(encoding="utf-8") + args = Hyperparameters() + if args.compile_enabled: + zeropower_via_newtonschulz5 = torch.compile(zeropower_via_newtonschulz5) + distributed = "RANK" in os.environ and "WORLD_SIZE" in os.environ + rank = int(os.environ.get("RANK", "0")) + world_size = int(os.environ.get("WORLD_SIZE", "1")) + local_rank = int(os.environ.get("LOCAL_RANK", "0")) + if world_size <= 0: + raise ValueError(f"WORLD_SIZE must be positive, got {world_size}") + if 8 % world_size != 0: + raise ValueError(f"WORLD_SIZE={world_size} must divide 8 so grad_accum_steps stays integral") + grad_accum_steps = 8 // world_size + grad_scale = 1.0 / grad_accum_steps + if not torch.cuda.is_available(): + raise RuntimeError("CUDA is required") + device = torch.device("cuda", local_rank) + torch.cuda.set_device(device) + if distributed: + dist.init_process_group(backend="nccl", device_id=device) + dist.barrier() + master_process = rank == 0 + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True + from torch.backends.cuda import enable_cudnn_sdp, enable_flash_sdp, enable_math_sdp, enable_mem_efficient_sdp + enable_cudnn_sdp(False) + enable_flash_sdp(True) + enable_mem_efficient_sdp(False) + enable_math_sdp(False) + logfile = None + if master_process: + os.makedirs("logs", exist_ok=True) + logfile = f"logs/{args.run_id}.txt" + print(logfile) + def log0(msg: str, console: bool = True) -> None: + if not master_process: + return + if console: + print(msg) + if logfile is not None: + with open(logfile, "a", encoding="utf-8") as f: + print(msg, file=f) + log0(code, console=False) + log0("=" * 100, console=False) + log0(f"Running Python {sys.version}", console=False) + log0(f"Running PyTorch {torch.__version__}", console=False) + log0( + subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=False).stdout, + console=False, + ) + log0("=" * 100, console=False) + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.cuda.manual_seed_all(args.seed) + if not args.tokenizer_path.endswith(".model"): + raise ValueError(f"Script only setup for SentencePiece .model file: {args.tokenizer_path}") + sp = spm.SentencePieceProcessor(model_file=args.tokenizer_path) + if int(sp.vocab_size()) != args.vocab_size: + raise ValueError( + f"VOCAB_SIZE={args.vocab_size} does not match tokenizer vocab_size={int(sp.vocab_size())}" + ) + dataset_dir = Path(args.data_path).resolve() + actual_train_files = len(list(dataset_dir.glob("fineweb_train_*.bin"))) + effective_eval_seq_len = args.eval_seq_len if args.eval_seq_len > 0 else args.train_seq_len + val_seq_len = max(args.train_seq_len, effective_eval_seq_len) + val_tokens = load_validation_tokens(args.val_files, val_seq_len) + base_bytes_lut, has_leading_space_lut, is_boundary_token_lut = build_sentencepiece_luts( + sp, args.vocab_size, device + ) + log0(f"val_bpb:enabled tokenizer_kind=sentencepiece tokenizer_path={args.tokenizer_path}") + log0(f"train_loader:dataset:{dataset_dir.name} train_shards:{actual_train_files}") + log0(f"val_loader:shards pattern={args.val_files} tokens:{val_tokens.numel() - 1}") + CastedLinear._qat_enabled = args.qat_enabled + base_model = GPT( + vocab_size=args.vocab_size, + num_layers=args.num_layers, + model_dim=args.model_dim, + num_heads=args.num_heads, + num_kv_heads=args.num_kv_heads, + mlp_mult=args.mlp_mult, + tie_embeddings=args.tie_embeddings, + tied_embed_init_std=args.tied_embed_init_std, + logit_softcap=args.logit_softcap, + rope_base=args.rope_base, + qk_gain_init=args.qk_gain_init, + mtp_num_heads=args.mtp_num_heads, + mtp_loss_weight=args.mtp_loss_weight, + bigram_vocab_size=args.bigram_vocab_size, + bigram_dim=args.bigram_dim, + xsa_last_n=args.xsa_last_n, + rope_dims=args.rope_dims, + ln_scale=args.ln_scale, + dtg=args.dtg_enabled, + ve_enabled=args.ve_enabled, + ve_dim=args.ve_dim, + ve_layers=args.ve_layers, + mlp_act=args.mlp_act, + mlp_leaky_slope=args.mlp_leaky_slope, + f1_corr_rank=args.f1_corr_rank, + f1_corr_scale_init=args.f1_corr_scale_init, + ).to(device).bfloat16() + for module in base_model.modules(): + if isinstance(module, CastedLinear): + module.float() + restore_low_dim_params_to_fp32(base_model) + # Complementary training: downweight tokens predictable by bigrams + complement_alpha = float(os.environ.get("COMPLEMENT_ALPHA", "0")) + if complement_alpha > 0: + tracker = TrainNgramTracker(args.vocab_size, device, complement_alpha=complement_alpha) + base_model._ngram_tracker = tracker + log0(f"complementary_training:alpha={complement_alpha}") + else: + base_model._ngram_tracker = None + compiled_model = maybe_torch_compile(base_model, args) + model: nn.Module = DDP(compiled_model, device_ids=[local_rank], broadcast_buffers=False) if distributed else compiled_model + block_named_params = list(base_model.blocks.named_parameters()) + matrix_params = [ + p + for name, p in block_named_params + if p.ndim == 2 and not any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS) + ] + if base_model.mtp_num_heads > 0: + matrix_params.extend([p for p in base_model.mtp_heads.parameters() if p.ndim == 2]) + if base_model.f1_corr_in is not None and base_model.f1_corr_out is not None: + matrix_params.append(base_model.f1_corr_in.weight) + matrix_params.append(base_model.f1_corr_out.weight) + scalar_params = [ + p + for name, p in block_named_params + if p.ndim < 2 or any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS) + ] + if base_model.skip_weights.numel() > 0: + scalar_params.append(base_model.skip_weights) + scalar_params.append(base_model.smear.gate) + if base_model.bigram is not None: + scalar_params.append(base_model.bigram.scale) + if base_model.f1_corr_scale is not None: + scalar_params.append(base_model.f1_corr_scale) + token_lr = args.tied_embed_lr if args.tie_embeddings else args.embed_lr + tok_params = [{"params": [base_model.tok_emb.weight], "lr": token_lr, "base_lr": token_lr}] + if base_model.bigram is not None: + tok_params.append({"params": [base_model.bigram.embed.weight], "lr": token_lr, "base_lr": token_lr}) + if base_model.bigram.proj is not None: + matrix_params.append(base_model.bigram.proj.weight) + if base_model.ve_shared is not None: + tok_params.append({"params": [base_model.ve_shared.embed.weight], "lr": token_lr, "base_lr": token_lr}) + if base_model.ve_shared.proj is not None: + matrix_params.append(base_model.ve_shared.proj.weight) + scalar_params.append(base_model.ve_shared.scale) + for s in base_model.ve_layer_scales: + scalar_params.append(s) + optimizer_tok = torch.optim.AdamW( + tok_params, + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + weight_decay=args.adam_wd, + fused=True, + ) + optimizer_muon = Muon( + matrix_params, + lr=args.matrix_lr, + momentum=args.muon_momentum, + backend_steps=args.muon_backend_steps, + weight_decay=args.muon_wd, + ) + for group in optimizer_muon.param_groups: + group["base_lr"] = args.matrix_lr + optimizer_scalar = torch.optim.AdamW( + [{"params": scalar_params, "lr": args.scalar_lr, "base_lr": args.scalar_lr}], + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + weight_decay=args.adam_wd, + fused=True, + ) + optimizers: list[torch.optim.Optimizer] = [optimizer_tok, optimizer_muon, optimizer_scalar] + if base_model.lm_head is not None: + optimizer_head = torch.optim.Adam( + [{"params": [base_model.lm_head.weight], "lr": args.head_lr, "base_lr": args.head_lr}], + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + fused=True, + ) + optimizers.insert(1, optimizer_head) + n_params = sum(p.numel() for p in base_model.parameters()) + f1_corr_params = 0 + if base_model.f1_corr_in is not None and base_model.f1_corr_out is not None: + f1_corr_params = int(base_model.f1_corr_in.weight.numel() + base_model.f1_corr_out.weight.numel()) + est_corr_int6_bytes = 0 + if args.f1_corr_rank > 0: + # int8 payload stores int6 values + per-row fp16 scales. + est_corr_int6_bytes = ( + args.f1_corr_rank * (args.model_dim + args.vocab_size) + + 2 * (args.f1_corr_rank + args.vocab_size) + ) + log0(f"model_params:{n_params}") + log0( + f"f1_corr:rank={args.f1_corr_rank} params={f1_corr_params} " + f"est_int6_bytes~{est_corr_int6_bytes}" + ) + log0(f"mlp_act:{args.mlp_act} mlp_leaky_slope:{args.mlp_leaky_slope}") + log0(f"XSA:last_{args.xsa_last_n} world_size:{world_size} grad_accum_steps:{grad_accum_steps}") + log0(f"num_heads:{args.num_heads} num_kv_heads:{args.num_kv_heads} embed_lr:{token_lr} matrix_lr:{args.matrix_lr}") + log0( + f"train_batch_tokens:{args.train_batch_tokens} train_seq_len:{args.train_seq_len} " + f"iterations:{args.iterations} warmup_steps:{args.warmup_steps} " + f"max_wallclock_seconds:{args.max_wallclock_seconds:.3f}" + ) + log0(f"compile:enabled={int(args.compile_enabled)} fullgraph={int(args.compile_fullgraph)}") + log0(f"seed:{args.seed}") + if args.ngram_eval_order >= 2: + log0( + f"ngram_eval:order={args.ngram_eval_order} alpha={args.ngram_eval_alpha} " + f"min_count={args.ngram_eval_min_count} buckets={args.ngram_eval_buckets}" + ) + train_loader = DistributedTokenLoader(args.train_files, rank, world_size, device) + def zero_grad_all() -> None: + for opt in optimizers: + opt.zero_grad(set_to_none=True) + max_wallclock_ms = 1000.0 * args.max_wallclock_seconds if args.max_wallclock_seconds > 0 else None + def lr_mul(step: int, elapsed_ms: float) -> float: + if args.warmdown_iters <= 0: + return 1.0 + if max_wallclock_ms is None: + warmdown_start = max(args.iterations - args.warmdown_iters, 0) + return max((args.iterations - step) / max(args.warmdown_iters, 1), 0.0) if warmdown_start <= step < args.iterations else 1.0 + step_ms = elapsed_ms / max(step, 1) + warmdown_ms = args.warmdown_iters * step_ms + remaining_ms = max(max_wallclock_ms - elapsed_ms, 0.0) + return remaining_ms / max(warmdown_ms, 1e-9) if remaining_ms <= warmdown_ms else 1.0 + if args.warmup_steps > 0: + initial_model_state = {name: tensor.detach().cpu().clone() for name, tensor in base_model.state_dict().items()} + initial_optimizer_states = [copy.deepcopy(opt.state_dict()) for opt in optimizers] + model.train() + for warmup_step in range(args.warmup_steps): + zero_grad_all() + for micro_step in range(grad_accum_steps): + if distributed: + model.require_backward_grad_sync = micro_step == grad_accum_steps - 1 + x, y = train_loader.next_batch(args.train_batch_tokens, args.train_seq_len, grad_accum_steps) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + warmup_loss = model(x, y) + (warmup_loss * grad_scale).backward() + for opt in optimizers: + opt.step() + zero_grad_all() + if args.warmup_steps <= 20 or (warmup_step + 1) % 10 == 0 or warmup_step + 1 == args.warmup_steps: + log0(f"warmup_step:{warmup_step + 1}/{args.warmup_steps}") + base_model.load_state_dict(initial_model_state, strict=True) + for opt, state in zip(optimizers, initial_optimizer_states, strict=True): + opt.load_state_dict(state) + zero_grad_all() + if distributed: + model.require_backward_grad_sync = True + train_loader = DistributedTokenLoader(args.train_files, rank, world_size, device) + swa_state: dict[str, Tensor] | None = None + swa_count = 0 + ema_state = {name: t.detach().float().clone() for name, t in base_model.state_dict().items()} + ema_decay = 0.997 + training_time_ms = 0.0 + stop_after_step: int | None = None + torch.cuda.synchronize() + t0 = time.perf_counter() + step = 0 + while True: + last_step = step == args.iterations or (stop_after_step is not None and step >= stop_after_step) + should_validate = last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0) + if should_validate: + torch.cuda.synchronize() + training_time_ms += 1000.0 * (time.perf_counter() - t0) + val_loss, val_bpb = eval_val( + args, + model, + rank, + world_size, + device, + grad_accum_steps, + val_tokens, + base_bytes_lut, + has_leading_space_lut, + is_boundary_token_lut, + ) + log0( + f"step:{step}/{args.iterations} val_loss:{val_loss:.4f} val_bpb:{val_bpb:.4f} " + f"train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms / max(step, 1):.2f}ms" + ) + torch.cuda.synchronize() + t0 = time.perf_counter() + if last_step: + if stop_after_step is not None and step < args.iterations: + log0( + f"stopping_early: wallclock_cap train_time:{training_time_ms:.0f}ms " + f"step:{step}/{args.iterations}" + ) + break + elapsed_ms = training_time_ms + 1000.0 * (time.perf_counter() - t0) + scale = lr_mul(step, elapsed_ms) + if args.late_qat_threshold > 0 and scale < args.late_qat_threshold and not CastedLinear._qat_enabled: + CastedLinear._qat_enabled = True + log0(f"late_qat:enabled step:{step} scale:{scale:.4f}") + zero_grad_all() + train_loss = torch.zeros((), device=device) + for micro_step in range(grad_accum_steps): + if distributed: + model.require_backward_grad_sync = micro_step == grad_accum_steps - 1 + x, y = train_loader.next_batch(args.train_batch_tokens, args.train_seq_len, grad_accum_steps) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + loss = model(x, y) + train_loss += loss.detach() + loss.backward() + if base_model._ngram_tracker is not None: + base_model._ngram_tracker.update(x, y) + train_loss /= grad_accum_steps + frac = min(step / args.muon_momentum_warmup_steps, 1.0) if args.muon_momentum_warmup_steps > 0 else 1.0 + muon_momentum = (1 - frac) * args.muon_momentum_warmup_start + frac * args.muon_momentum + for group in optimizer_muon.param_groups: + group["momentum"] = muon_momentum + for opt in optimizers: + for group in opt.param_groups: + group["lr"] = group["base_lr"] * scale + if args.grad_clip_norm > 0: + torch.nn.utils.clip_grad_norm_(base_model.parameters(), args.grad_clip_norm) + for opt in optimizers: + opt.step() + zero_grad_all() + # EMA update + with torch.no_grad(): + for name, t in base_model.state_dict().items(): + ema_state[name].mul_(ema_decay).add_(t.detach().float(), alpha=1.0 - ema_decay) + step += 1 + approx_training_time_ms = training_time_ms + 1000.0 * (time.perf_counter() - t0) + if args.swa_enabled and scale < 0.2 and step % args.swa_every == 0: + if swa_state is None: + swa_state = {name: t.detach().cpu().clone() for name, t in base_model.state_dict().items()} + swa_count = 1 + log0(f"swa:start step:{step}") + else: + for name, t in base_model.state_dict().items(): + swa_state[name] += t.detach().cpu() + swa_count += 1 + should_log_train = ( + args.train_log_every > 0 + and (step <= 10 or step % args.train_log_every == 0 or stop_after_step is not None) + ) + if should_log_train: + log0( + f"step:{step}/{args.iterations} train_loss:{train_loss.item():.4f} " + f"train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms / step:.2f}ms" + ) + reached_cap = max_wallclock_ms is not None and approx_training_time_ms >= max_wallclock_ms + if distributed and max_wallclock_ms is not None: + reached_cap_tensor = torch.tensor(int(reached_cap), device=device) + dist.all_reduce(reached_cap_tensor, op=dist.ReduceOp.MAX) + reached_cap = bool(reached_cap_tensor.item()) + if stop_after_step is None and reached_cap: + stop_after_step = step + log0( + f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB " + f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB" + ) + # GPTQ calibration: collect Hessians from training data DURING training phase + # (must happen before training ends to comply with eval-time data access rules) + log0("gptq:calibrating with training data...") + t_gptq = time.perf_counter() + gptq_hessians = gptq_calibrate(base_model, args.train_files, device, n_samples=256, seq_len=args.train_seq_len) + log0(f"gptq:calibrated {len(gptq_hessians)} layers in {time.perf_counter()-t_gptq:.1f}s") + if args.distill_enabled and args.distill_steps > 0: + log0( + f"distill:start steps:{args.distill_steps} lr_factor:{args.distill_lr_factor} " + f"temp:{args.distill_temperature} alpha:{args.distill_alpha} kl_clip:{args.distill_kl_clip}" + ) + current_state = base_model.state_dict() + teacher_state = {name: t.to(dtype=current_state[name].dtype) for name, t in ema_state.items()} + teacher_model = GPT( + vocab_size=args.vocab_size, num_layers=args.num_layers, model_dim=args.model_dim, + num_heads=args.num_heads, num_kv_heads=args.num_kv_heads, mlp_mult=args.mlp_mult, + tie_embeddings=args.tie_embeddings, tied_embed_init_std=args.tied_embed_init_std, + logit_softcap=args.logit_softcap, rope_base=args.rope_base, qk_gain_init=args.qk_gain_init, + mtp_num_heads=args.mtp_num_heads, mtp_loss_weight=args.mtp_loss_weight, + bigram_vocab_size=args.bigram_vocab_size, bigram_dim=args.bigram_dim, + xsa_last_n=args.xsa_last_n, rope_dims=args.rope_dims, ln_scale=args.ln_scale, dtg=args.dtg_enabled, + ve_enabled=args.ve_enabled, ve_dim=args.ve_dim, ve_layers=args.ve_layers, + mlp_act=args.mlp_act, mlp_leaky_slope=args.mlp_leaky_slope, + f1_corr_rank=args.f1_corr_rank, f1_corr_scale_init=args.f1_corr_scale_init, + ).to(device).bfloat16() + for m in teacher_model.modules(): + if isinstance(m, CastedLinear): + m.float() + restore_low_dim_params_to_fp32(teacher_model) + teacher_model.load_state_dict(teacher_state, strict=True) + teacher_model.eval() + for p in teacher_model.parameters(): + p.requires_grad_(False) + compiled_teacher_logits = maybe_torch_compile(teacher_model.forward_logits, args) + model.train() + T = args.distill_temperature + alpha = args.distill_alpha + for d_step in range(args.distill_steps): + zero_grad_all() + for opt in optimizers: + for group in opt.param_groups: + group["lr"] = group["base_lr"] * args.distill_lr_factor + x, y = train_loader.next_batch(args.train_batch_tokens, args.train_seq_len, grad_accum_steps) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + student_logits = base_model.forward_logits(x) + with torch.no_grad(): + teacher_logits = compiled_teacher_logits(x) + student_log_probs = F.log_softmax(student_logits.float() / T, dim=-1) + teacher_probs = F.softmax(teacher_logits.float() / T, dim=-1) + token_kl = F.kl_div(student_log_probs, teacher_probs, reduction="none").sum(dim=-1) + kl_loss = token_kl.mean() * (T * T) + if args.distill_kl_clip > 0: + kl_loss = torch.clamp(kl_loss, max=args.distill_kl_clip) + ce_loss = F.cross_entropy( + student_logits.reshape(-1, student_logits.size(-1)).float(), + y.reshape(-1), + reduction="mean", + ) + loss = alpha * kl_loss + (1.0 - alpha) * ce_loss + (loss * grad_scale).backward() + if world_size > 1: + for p in base_model.parameters(): + if p.grad is not None: + dist.all_reduce(p.grad, op=dist.ReduceOp.AVG) + if args.grad_clip_norm > 0: + torch.nn.utils.clip_grad_norm_(base_model.parameters(), args.grad_clip_norm) + for opt in optimizers: + opt.step() + zero_grad_all() + with torch.no_grad(): + for name, t in base_model.state_dict().items(): + ema_state[name].mul_(ema_decay).add_(t.detach().float(), alpha=1.0 - ema_decay) + if (d_step + 1) % 8 == 0 or d_step == 0: + log0( + f"distill:step:{d_step + 1}/{args.distill_steps} " + f"kl:{kl_loss.item():.4f} ce:{ce_loss.item():.4f} total:{loss.item():.4f}" + ) + del teacher_model, compiled_teacher_logits + torch.cuda.empty_cache() + log0("distill:done") + # Apply EMA weights (better than SWA alone per PR#401) + log0("ema:applying EMA weights") + current_state = base_model.state_dict() + avg_state = {name: t.to(dtype=current_state[name].dtype) for name, t in ema_state.items()} + base_model.load_state_dict(avg_state, strict=True) + torch.cuda.synchronize() + t_diag = time.perf_counter() + diag_val_loss, diag_val_bpb = eval_val( + args, compiled_model, rank, world_size, device, grad_accum_steps, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + ) + torch.cuda.synchronize() + log0( + f"DIAGNOSTIC post_ema val_loss:{diag_val_loss:.4f} val_bpb:{diag_val_bpb:.4f} " + f"eval_time:{1000.0 * (time.perf_counter() - t_diag):.0f}ms" + ) + full_state_dict = base_model.state_dict() + export_sd = {k: v for k, v in full_state_dict.items() if "mtp_heads" not in k} + excluded_mtp = sum(int(t.numel()) for k, t in full_state_dict.items() if "mtp_heads" in k) + if excluded_mtp > 0: + log0(f"export_excluding_mtp_params:{excluded_mtp}") + if master_process: + torch.save(export_sd, "final_model.pt") + model_bytes = os.path.getsize("final_model.pt") + code_bytes = len(code.encode("utf-8")) + log0(f"Serialized model: {model_bytes} bytes") + log0(f"Code size: {code_bytes} bytes") + sd_cpu = {k: v.detach().cpu() for k, v in export_sd.items()} + # GPTQ quantization using Hessians collected during training phase (no training data access here) + quant_result, quant_meta = mixed_quantize_int6_gptq(sd_cpu, {"mlp", "attn", "aux"}, gptq_hessians) + quant_buf = io.BytesIO() + torch.save({"w": quant_result, "m": quant_meta}, quant_buf) + quant_raw = quant_buf.getvalue() + quant_blob = zstandard.ZstdCompressor(level=22).compress(quant_raw) if _COMPRESSOR == "zstd" else zlib.compress(quant_raw, 9) + if master_process: + with open("final_model.int6.ptz", "wb") as f: + f.write(quant_blob) + quant_file_bytes = len(quant_blob) + code_bytes = len(code.encode("utf-8")) + log0(f"Serialized model int6+{_COMPRESSOR}: {quant_file_bytes} bytes") + log0(f"Total submission size int6+{_COMPRESSOR}: {quant_file_bytes + code_bytes} bytes") + log0(f"Total submission size int8+zlib: {quant_file_bytes + code_bytes} bytes") + if distributed: + dist.barrier() + with open("final_model.int6.ptz", "rb") as f: + quant_blob_disk = f.read() + quant_state = torch.load( + io.BytesIO(zstandard.ZstdDecompressor().decompress(quant_blob_disk) if _COMPRESSOR == "zstd" else zlib.decompress(quant_blob_disk)), + map_location="cpu", + ) + deq_state = dequantize_mixed_int6(quant_state["w"], quant_state["m"], sd_cpu) + eval_model = GPT( + vocab_size=args.vocab_size, num_layers=args.num_layers, model_dim=args.model_dim, + num_heads=args.num_heads, num_kv_heads=args.num_kv_heads, mlp_mult=args.mlp_mult, + tie_embeddings=args.tie_embeddings, tied_embed_init_std=args.tied_embed_init_std, + logit_softcap=args.logit_softcap, rope_base=args.rope_base, qk_gain_init=args.qk_gain_init, + mtp_num_heads=0, mtp_loss_weight=0.0, + bigram_vocab_size=args.bigram_vocab_size, bigram_dim=args.bigram_dim, + xsa_last_n=args.xsa_last_n, # must match training model + rope_dims=args.rope_dims, ln_scale=args.ln_scale, dtg=args.dtg_enabled, + ve_enabled=args.ve_enabled, ve_dim=args.ve_dim, ve_layers=args.ve_layers, + mlp_act=args.mlp_act, mlp_leaky_slope=args.mlp_leaky_slope, + f1_corr_rank=args.f1_corr_rank, f1_corr_scale_init=args.f1_corr_scale_init, + ).to(device).bfloat16() + for m in eval_model.modules(): + if isinstance(m, CastedLinear): + m.float() + restore_low_dim_params_to_fp32(eval_model) + eval_model.load_state_dict(deq_state, strict=True) + compiled_eval = maybe_torch_compile(eval_model, args) + torch.cuda.synchronize() + t_qeval = time.perf_counter() + q_val_loss, q_val_bpb = eval_val( + args, compiled_eval, rank, world_size, device, grad_accum_steps, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + eval_seq_len=effective_eval_seq_len, + ) + torch.cuda.synchronize() + log0( + f"final_int6_roundtrip val_loss:{q_val_loss:.4f} val_bpb:{q_val_bpb:.4f} " + f"eval_time:{1000.0 * (time.perf_counter() - t_qeval):.0f}ms" + ) + log0(f"final_int6_roundtrip_exact val_loss:{q_val_loss:.8f} val_bpb:{q_val_bpb:.8f}") + sw_seq_len = effective_eval_seq_len + if args.eval_stride > 0 and args.eval_stride < sw_seq_len: + torch.cuda.synchronize() + t_slide = time.perf_counter() + sw_val_loss, sw_val_bpb = eval_val_sliding( + args, eval_model, rank, world_size, device, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + stride=args.eval_stride, + eval_seq_len=sw_seq_len, + ) + torch.cuda.synchronize() + log0( + f"final_int6_sliding_window val_loss:{sw_val_loss:.4f} val_bpb:{sw_val_bpb:.4f} " + f"stride:{args.eval_stride} eval_time:{1000.0 * (time.perf_counter() - t_slide):.0f}ms" + ) + log0(f"final_int6_sliding_window_exact val_loss:{sw_val_loss:.8f} val_bpb:{sw_val_bpb:.8f}") + log0(f"final_int8_zlib_roundtrip_exact val_loss:{sw_val_loss:.8f} val_bpb:{sw_val_bpb:.8f}") + if args.ngram_eval_order >= 2: + if distributed: + dist.barrier() + torch.cuda.synchronize() + t_ng = time.perf_counter() + ng_loss, ng_bpb, ng_coverage = eval_val_sliding_hashed_ngram( + args, + eval_model, + rank, + world_size, + device, + val_tokens, + base_bytes_lut, + has_leading_space_lut, + is_boundary_token_lut, + stride=args.eval_stride, + order=args.ngram_eval_order, + alpha=args.ngram_eval_alpha, + min_count=args.ngram_eval_min_count, + buckets=args.ngram_eval_buckets, + max_seconds=args.ngram_eval_max_seconds, + eval_seq_len=sw_seq_len, + ) + if rank == 0: + torch.cuda.synchronize() + ng_eval_ms = 1000.0 * (time.perf_counter() - t_ng) + if ng_coverage >= 0.999999: + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order} val_loss:{ng_loss:.4f} " + f"val_bpb:{ng_bpb:.4f} eval_time:{ng_eval_ms:.0f}ms" + ) + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order}_exact " + f"val_loss:{ng_loss:.8f} val_bpb:{ng_bpb:.8f}" + ) + else: + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order}_partial val_loss:{ng_loss:.4f} " + f"val_bpb:{ng_bpb:.4f} coverage:{ng_coverage:.4f} eval_time:{ng_eval_ms:.0f}ms" + ) + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order}_partial_exact " + f"val_loss:{ng_loss:.8f} val_bpb:{ng_bpb:.8f} coverage:{ng_coverage:.8f}" + ) + if distributed: + dist.barrier() + if distributed: + dist.destroy_process_group() +if __name__ == "__main__": + main() diff --git a/experiments/B_wing/bwing_V/eval_sweep.py b/experiments/B_wing/bwing_V/eval_sweep.py new file mode 100644 index 000000000..12eced8bf --- /dev/null +++ b/experiments/B_wing/bwing_V/eval_sweep.py @@ -0,0 +1,237 @@ +#!/usr/bin/env python3 +"""Grid sweep over n-gram eval parameters on a saved quantized model. + +Loads final_model.int6.ptz once, then runs eval_val_sliding_hashed_ngram +with each parameter combination. Results written to CSV. + +Usage: + torchrun --standalone --nproc_per_node=8 experiments/B_wing/bwing_V/eval_sweep.py +""" +from __future__ import annotations +import csv +import importlib.util +import io +import itertools +import math +import os +import sys +import time +import zlib +from pathlib import Path + +import numpy as np +import sentencepiece as spm +import torch +import torch.distributed as dist + +try: + import zstandard + _COMPRESSOR = "zstd" +except ImportError: + _COMPRESSOR = "zlib" + +# --------------------------------------------------------------------------- +# Import train_gpt as a module (without running main) +# --------------------------------------------------------------------------- +SCRIPT_DIR = Path(__file__).resolve().parent +TRAIN_SCRIPT = SCRIPT_DIR / "train_gpt.py" + +spec = importlib.util.spec_from_file_location("train_gpt", str(TRAIN_SCRIPT)) +tg = importlib.util.module_from_spec(spec) +tg.__name__ = "train_gpt" # prevent __main__ execution +spec.loader.exec_module(tg) + +# --------------------------------------------------------------------------- +# Grid definition — edit these to change the sweep +# --------------------------------------------------------------------------- +GRID = { + "alpha_max": [0.50, 0.60, 0.70, 0.80], + "entropy_center": [2.0, 2.5, 3.0], + "high_order_mult": [1.5, 2.0, 2.5, 3.0], + "min_count": [1, 2], + "cubric": [0, 1], +} + +# Fixed params (not swept) +ALPHA_MIN = 0.03 +ENTROPY_SCALE = 2.0 +ENTROPY_SHIFT = True +LOW_ORDER_MULTS = (0.3, 0.3, 0.97) # orders 2, 3, 4 — always same +BUCKETS = 8_388_608 +ORDER = 9 +MIN_ORDER = 2 +STRIDE = 64 + + +def build_order_mults(low: tuple, high_mult: float) -> str: + """Build comma-separated order mults string. Orders 5-9 get high_mult.""" + return ",".join(str(x) for x in list(low) + [high_mult] * 5) + + +def main(): + # Distributed setup + distributed = "RANK" in os.environ and "WORLD_SIZE" in os.environ + rank = int(os.environ.get("RANK", "0")) + world_size = int(os.environ.get("WORLD_SIZE", "1")) + local_rank = int(os.environ.get("LOCAL_RANK", "0")) + device = torch.device("cuda", local_rank) + torch.cuda.set_device(device) + if distributed: + dist.init_process_group(backend="nccl") + master = rank == 0 + + def log0(msg): + if master: + print(msg, flush=True) + + # Load tokenizer + val data (once) + args = tg.Hyperparameters() + sp = spm.SentencePieceProcessor(model_file=args.tokenizer_path) + val_tokens = tg.load_validation_tokens(args.val_files, args.train_seq_len) + base_bytes_lut, has_leading_space_lut, is_boundary_token_lut = tg.build_sentencepiece_luts( + sp, args.vocab_size, device + ) + log0(f"val_tokens:{val_tokens.numel()-1}") + + # Build fresh model for template shapes → dequantize + tg.CastedLinear._qat_enabled = args.qat_enabled + template_model = tg.GPT( + vocab_size=args.vocab_size, num_layers=args.num_layers, model_dim=args.model_dim, + num_heads=args.num_heads, num_kv_heads=args.num_kv_heads, mlp_mult=args.mlp_mult, + tie_embeddings=args.tie_embeddings, tied_embed_init_std=args.tied_embed_init_std, + logit_softcap=args.logit_softcap, rope_base=args.rope_base, qk_gain_init=args.qk_gain_init, + mtp_num_heads=0, mtp_loss_weight=0.0, + bigram_vocab_size=args.bigram_vocab_size, bigram_dim=args.bigram_dim, + xsa_last_n=args.xsa_last_n, rope_dims=args.rope_dims, ln_scale=args.ln_scale, + dtg=args.dtg_enabled, ve_enabled=args.ve_enabled, ve_dim=args.ve_dim, + ve_layers=args.ve_layers, mlp_act=args.mlp_act, mlp_leaky_slope=args.mlp_leaky_slope, + f1_corr_rank=args.f1_corr_rank, f1_corr_scale_init=args.f1_corr_scale_init, + ).to(device).bfloat16() + for m in template_model.modules(): + if isinstance(m, tg.CastedLinear): + m.float() + tg.restore_low_dim_params_to_fp32(template_model) + sd_cpu = {k: v.detach().cpu() for k, v in template_model.state_dict().items() if "mtp_heads" not in k} + + # Load quantized weights + log0("loading final_model.int6.ptz...") + with open("final_model.int6.ptz", "rb") as f: + quant_blob = f.read() + if _COMPRESSOR == "zstd": + raw = zstandard.ZstdDecompressor().decompress(quant_blob) + else: + raw = zlib.decompress(quant_blob) + quant_state = torch.load(io.BytesIO(raw), map_location="cpu") + deq_state = tg.dequantize_mixed_int6(quant_state["w"], quant_state["m"], sd_cpu) + + # Build eval model + eval_model = tg.GPT( + vocab_size=args.vocab_size, num_layers=args.num_layers, model_dim=args.model_dim, + num_heads=args.num_heads, num_kv_heads=args.num_kv_heads, mlp_mult=args.mlp_mult, + tie_embeddings=args.tie_embeddings, tied_embed_init_std=args.tied_embed_init_std, + logit_softcap=args.logit_softcap, rope_base=args.rope_base, qk_gain_init=args.qk_gain_init, + mtp_num_heads=0, mtp_loss_weight=0.0, + bigram_vocab_size=args.bigram_vocab_size, bigram_dim=args.bigram_dim, + xsa_last_n=args.xsa_last_n, rope_dims=args.rope_dims, ln_scale=args.ln_scale, + dtg=args.dtg_enabled, ve_enabled=args.ve_enabled, ve_dim=args.ve_dim, + ve_layers=args.ve_layers, mlp_act=args.mlp_act, mlp_leaky_slope=args.mlp_leaky_slope, + f1_corr_rank=args.f1_corr_rank, f1_corr_scale_init=args.f1_corr_scale_init, + ).to(device).bfloat16() + for m in eval_model.modules(): + if isinstance(m, tg.CastedLinear): + m.float() + tg.restore_low_dim_params_to_fp32(eval_model) + eval_model.load_state_dict(deq_state, strict=True) + del template_model, sd_cpu, deq_state, quant_state # free memory + torch.cuda.empty_cache() + + log0("model loaded. starting sweep...") + + # Build all grid combos, sorted by expected impact (high alpha_max + high mult first) + keys = list(GRID.keys()) + combos = list(itertools.product(*[GRID[k] for k in keys])) + combos_dicts = [dict(zip(keys, vals)) for vals in combos] + # Sort: highest alpha_max * highest high_order_mult first (most aggressive configs first) + combos_dicts.sort(key=lambda c: -(c["alpha_max"] * c["high_order_mult"])) + + total = len(combos_dicts) + log0(f"sweep:{total} configs") + + # CSV output + csv_path = SCRIPT_DIR / "sweep_results.csv" + if master: + with open(csv_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["idx", "alpha_min", "alpha_max", "entropy_center", "entropy_scale", + "high_order_mult", "order_mults", "min_count", "cubric", + "entropy_shift", "bpb", "eval_time_s"]) + + best_bpb = float("inf") + best_config = None + + for i, cfg in enumerate(combos_dicts): + # Build args overlay + args.ngram_eval_alpha_min = ALPHA_MIN + args.ngram_eval_alpha_max = cfg["alpha_max"] + args.ngram_eval_entropy_center = cfg["entropy_center"] + args.ngram_eval_entropy_scale = ENTROPY_SCALE + args.ngram_eval_min_count = cfg["min_count"] + args.ngram_eval_adaptive = True + args.ngram_entropy_shift = ENTROPY_SHIFT + args.cubric_cadence = cfg["cubric"] + + mults_str = build_order_mults(LOW_ORDER_MULTS, cfg["high_order_mult"]) + args.ngram_order_mults_str = mults_str + + if distributed: + dist.barrier() + torch.cuda.synchronize() + t0 = time.perf_counter() + + ng_loss, ng_bpb, ng_coverage = tg.eval_val_sliding_hashed_ngram( + args, eval_model, rank, world_size, device, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + stride=STRIDE, order=ORDER, alpha=0.30, + min_count=cfg["min_count"], buckets=BUCKETS, + max_seconds=0.0, eval_seq_len=args.train_seq_len, + ) + + elapsed = time.perf_counter() - t0 + + if master: + tag = "" + if ng_bpb < best_bpb: + best_bpb = ng_bpb + best_config = cfg + tag = " *** NEW BEST ***" + + log0( + f"[{i+1}/{total}] bpb={ng_bpb:.6f} " + f"amax={cfg['alpha_max']:.2f} ec={cfg['entropy_center']:.1f} " + f"hm={cfg['high_order_mult']:.1f} mc={cfg['min_count']} " + f"cub={cfg['cubric']} t={elapsed:.0f}s{tag}" + ) + + with open(csv_path, "a", newline="") as f: + writer = csv.writer(f) + writer.writerow([ + i + 1, ALPHA_MIN, cfg["alpha_max"], cfg["entropy_center"], + ENTROPY_SCALE, cfg["high_order_mult"], mults_str, + cfg["min_count"], cfg["cubric"], int(ENTROPY_SHIFT), + f"{ng_bpb:.8f}", f"{elapsed:.1f}", + ]) + + # Final summary + if master: + log0("=" * 60) + log0(f"BEST BPB: {best_bpb:.6f}") + log0(f"CONFIG: {best_config}") + log0(f"results saved to {csv_path}") + log0("=" * 60) + + if distributed: + dist.destroy_process_group() + + +if __name__ == "__main__": + main() diff --git a/experiments/B_wing/bwing_V/run.sh b/experiments/B_wing/bwing_V/run.sh new file mode 100755 index 000000000..70990dd1f --- /dev/null +++ b/experiments/B_wing/bwing_V/run.sh @@ -0,0 +1,56 @@ +#!/bin/bash +set -euo pipefail +# B-WING V: 9-Prime Hash Fix + Cubric 3D on top of Fixed Mults +# Changes from SOTA: 2 extra hash primes + cubric refines per (order x entropy x count) +# Cubric warm-starts at 1.0 (neutral) since fixed mults handle base scaling + +SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd -- "${SCRIPT_DIR}/../../.." && pwd)" +cd "${REPO_ROOT}" +export PYTHONPATH="${REPO_ROOT}/flash-attention/hopper:${PYTHONPATH:-}" + +SEED="${SEED:-1337}" +NPROC_PER_NODE="${NPROC_PER_NODE:-8}" + +echo "============================================" +echo " B-WING V — 9-Prime + Cubric 3D + Fixed Mults" +echo " Seed: ${SEED}" +echo " Fixed mults -> cubric refinement -> clip 0.95" +echo " CHANGE: 9 primes + cubric ON (stacked, not either/or)" +echo "============================================" + +SEED="$SEED" \ +F1_CORR_RANK=0 \ +DISTILL_ENABLED=0 \ +MLP_ACT=leaky_relu_sq \ +MLP_LEAKY_SLOPE=0.5 \ +XSA_LAST_N=4 \ +BIGRAM_VOCAB_SIZE=1536 \ +TTT_EVAL_ENABLED=0 \ +ROPE_DIMS=24 \ +VAL_LOSS_EVERY=20000 \ +TRAIN_LOG_EVERY=1000 \ +SWA_EVERY=100 \ +COMPLEMENT_ALPHA=0.5 \ +NGRAM_EVAL_ORDER=9 \ +NGRAM_EVAL_MIN_ORDER=2 \ +NGRAM_EVAL_ADAPTIVE=1 \ +NGRAM_EVAL_ALPHA=0.30 \ +NGRAM_EVAL_ALPHA_MIN=0.05 \ +NGRAM_EVAL_ALPHA_MAX=0.60 \ +NGRAM_EVAL_ENTROPY_CENTER=3.0 \ +NGRAM_EVAL_ENTROPY_SCALE=2.0 \ +NGRAM_EVAL_MIN_COUNT=2 \ +NGRAM_EVAL_BUCKETS=8388608 \ +NGRAM_EVAL_MAX_SECONDS=0 \ +CUBRIC_CADENCE=1 \ +NGRAM_ENTROPY_SHIFT=1 \ +NGRAM_ORDER_MULTS="0.3,0.3,0.97,2.0,2.0,2.0,2.0,2.0" \ +COMPILE_FULLGRAPH=0 \ +torchrun --standalone --nproc_per_node="${NPROC_PER_NODE}" \ + "${SCRIPT_DIR}/train_gpt.py" \ + 2>&1 | tee "logs/bwing_V_s${SEED}_$(date +%Y%m%d_%H%M%S).log" + +echo "============================================" +echo " DONE" +echo "============================================" diff --git a/experiments/B_wing/bwing_V/sweep.sh b/experiments/B_wing/bwing_V/sweep.sh new file mode 100755 index 000000000..b5313ea8c --- /dev/null +++ b/experiments/B_wing/bwing_V/sweep.sh @@ -0,0 +1,40 @@ +#!/bin/bash +set -euo pipefail +# N-gram parameter grid sweep on saved bwing_V model +# Loads final_model.int6.ptz once, runs ~192 eval configs (~3 min each) +# Results: experiments/B_wing/bwing_V/sweep_results.csv + +SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd -- "${SCRIPT_DIR}/../../.." && pwd)" +cd "${REPO_ROOT}" +export PYTHONPATH="${REPO_ROOT}/flash-attention/hopper:${PYTHONPATH:-}" + +NPROC_PER_NODE="${NPROC_PER_NODE:-8}" + +echo "============================================" +echo " B-WING V — N-gram Parameter Sweep" +echo " Model: final_model.int6.ptz (from bwing_V run)" +echo " Grid: alpha_max × entropy_center × high_order_mult × min_count × cubric" +echo "============================================" + +# Base env vars for model architecture (must match training) +SEED=1337 \ +F1_CORR_RANK=0 \ +MLP_ACT=leaky_relu_sq \ +MLP_LEAKY_SLOPE=0.5 \ +XSA_LAST_N=4 \ +BIGRAM_VOCAB_SIZE=1536 \ +ROPE_DIMS=24 \ +NGRAM_EVAL_ORDER=9 \ +NGRAM_EVAL_MIN_ORDER=2 \ +NGRAM_EVAL_ADAPTIVE=1 \ +NGRAM_EVAL_BUCKETS=8388608 \ +NGRAM_ENTROPY_SHIFT=1 \ +COMPILE_FULLGRAPH=0 \ +torchrun --standalone --nproc_per_node="${NPROC_PER_NODE}" \ + "${SCRIPT_DIR}/eval_sweep.py" \ + 2>&1 | tee "${SCRIPT_DIR}/sweep_$(date +%Y%m%d_%H%M%S).log" + +echo "============================================" +echo " SWEEP DONE — check sweep_results.csv" +echo "============================================" diff --git a/experiments/B_wing/bwing_V/train_gpt.py b/experiments/B_wing/bwing_V/train_gpt.py new file mode 100644 index 000000000..90d9d9309 --- /dev/null +++ b/experiments/B_wing/bwing_V/train_gpt.py @@ -0,0 +1,2135 @@ +from __future__ import annotations +import copy +import glob +import io +import math +import os +import random +import subprocess +import sys +import time +import uuid +import zlib +from pathlib import Path +try: + import zstandard + _COMPRESSOR = "zstd" +except ImportError: + _COMPRESSOR = "zlib" +import numpy as np +import sentencepiece as spm +import torch +import torch.distributed as dist +import torch.nn.functional as F +from torch import Tensor, nn +from torch.nn.parallel import DistributedDataParallel as DDP +try: + from flash_attn_interface import flash_attn_func as flash_attn_3_func +except ImportError: + def flash_attn_3_func(q, k, v, causal=False): + # q: (B, T, Hq, D), k/v: (B, T, Hkv, D) — expand KV for GQA + q2 = q.transpose(1, 2) # (B, Hq, T, D) + k2 = k.transpose(1, 2) # (B, Hkv, T, D) + v2 = v.transpose(1, 2) + if k2.size(1) != q2.size(1): + rep = q2.size(1) // k2.size(1) + k2 = k2.repeat_interleave(rep, dim=1) + v2 = v2.repeat_interleave(rep, dim=1) + out = torch.nn.functional.scaled_dot_product_attention(q2, k2, v2, is_causal=causal) + return out.transpose(1, 2) +class Hyperparameters: + data_path = os.environ.get("DATA_PATH", "./data/datasets/fineweb10B_sp1024") + train_files = os.path.join(data_path, "fineweb_train_*.bin") + val_files = os.path.join(data_path, "fineweb_val_*.bin") + tokenizer_path = os.environ.get("TOKENIZER_PATH", "./data/tokenizers/fineweb_1024_bpe.model") + run_id = os.environ.get("RUN_ID", str(uuid.uuid4())) + seed = int(os.environ.get("SEED", 1337)) + val_batch_size = int(os.environ.get("VAL_BATCH_SIZE", 524_288)) + val_loss_every = int(os.environ.get("VAL_LOSS_EVERY", 4000)) + train_log_every = int(os.environ.get("TRAIN_LOG_EVERY", 500)) + iterations = int(os.environ.get("ITERATIONS", 20000)) + warmdown_iters = int(os.environ.get("WARMDOWN_ITERS", 3500)) + warmup_steps = int(os.environ.get("WARMUP_STEPS", 20)) + train_batch_tokens = int(os.environ.get("TRAIN_BATCH_TOKENS", 786_432)) + train_seq_len = int(os.environ.get("TRAIN_SEQ_LEN", 2048)) + eval_seq_len = int(os.environ.get("EVAL_SEQ_LEN", 2048)) + max_wallclock_seconds = float(os.environ.get("MAX_WALLCLOCK_SECONDS", 600.0)) + qk_gain_init = float(os.environ.get("QK_GAIN_INIT", 1.5)) + vocab_size = int(os.environ.get("VOCAB_SIZE", 1024)) + num_layers = int(os.environ.get("NUM_LAYERS", 11)) + num_kv_heads = int(os.environ.get("NUM_KV_HEADS", 4)) + model_dim = int(os.environ.get("MODEL_DIM", 512)) + num_heads = int(os.environ.get("NUM_HEADS", 8)) + mlp_mult = float(os.environ.get("MLP_MULT", 3.0)) + mlp_act = os.environ.get("MLP_ACT", "relu_sq").lower() + mlp_leaky_slope = float(os.environ.get("MLP_LEAKY_SLOPE", 0.5)) + tie_embeddings = bool(int(os.environ.get("TIE_EMBEDDINGS", "1"))) + rope_base = float(os.environ.get("ROPE_BASE", 10000.0)) + logit_softcap = float(os.environ.get("LOGIT_SOFTCAP", 30.0)) + embed_lr = float(os.environ.get("EMBED_LR", 0.6)) + head_lr = float(os.environ.get("HEAD_LR", 0.008)) + tied_embed_lr = float(os.environ.get("TIED_EMBED_LR", 0.035)) + tied_embed_init_std = float(os.environ.get("TIED_EMBED_INIT_STD", 0.005)) + matrix_lr = float(os.environ.get("MATRIX_LR", 0.025)) + scalar_lr = float(os.environ.get("SCALAR_LR", 0.025)) + muon_momentum = float(os.environ.get("MUON_MOMENTUM", 0.99)) + muon_backend_steps = int(os.environ.get("MUON_BACKEND_STEPS", 5)) + muon_momentum_warmup_start = float(os.environ.get("MUON_MOMENTUM_WARMUP_START", 0.92)) + muon_momentum_warmup_steps = int(os.environ.get("MUON_MOMENTUM_WARMUP_STEPS", 1500)) + beta1 = float(os.environ.get("BETA1", 0.9)) + beta2 = float(os.environ.get("BETA2", 0.95)) + adam_eps = float(os.environ.get("ADAM_EPS", 1e-8)) + grad_clip_norm = float(os.environ.get("GRAD_CLIP_NORM", 0.3)) + eval_stride = int(os.environ.get("EVAL_STRIDE", 64)) + mtp_num_heads = int(os.environ.get("MTP_NUM_HEADS", 0)) + mtp_loss_weight = float(os.environ.get("MTP_LOSS_WEIGHT", 0.2)) + muon_beta2 = float(os.environ.get("MUON_BETA2", 0.95)) + swa_enabled = bool(int(os.environ.get("SWA_ENABLED", "1"))) + swa_every = int(os.environ.get("SWA_EVERY", 50)) # tighter: collect more recent checkpoints + muon_wd = float(os.environ.get("MUON_WD", 0.04)) + adam_wd = float(os.environ.get("ADAM_WD", 0.04)) + qat_enabled = bool(int(os.environ.get("QAT_ENABLED", "0"))) + bigram_vocab_size = int(os.environ.get("BIGRAM_VOCAB_SIZE", 2048)) + bigram_dim = int(os.environ.get("BIGRAM_DIM", 128)) + xsa_last_n = int(os.environ.get("XSA_LAST_N", 11)) # XSA on ALL 11 layers + rope_dims = int(os.environ.get("ROPE_DIMS", 16)) + ln_scale = bool(int(os.environ.get("LN_SCALE", "1"))) + dtg_enabled = bool(int(os.environ.get("DTG_ENABLED", "0"))) + late_qat_threshold = float(os.environ.get("LATE_QAT_THRESHOLD", 0.5)) + ve_enabled = bool(int(os.environ.get("VE_ENABLED", "1"))) + ve_dim = int(os.environ.get("VE_DIM", 128)) + ve_layers = os.environ.get("VE_LAYERS", "9,10") + # F1 capacity add-on: low-rank correction head (active at inference). + # Approx extra params ~= rank * (model_dim + vocab_size). + f1_corr_rank = int(os.environ.get("F1_CORR_RANK", 0)) + f1_corr_scale_init = float(os.environ.get("F1_CORR_SCALE_INIT", 0.10)) + # Post-train self-distillation: EMA teacher -> student. + distill_enabled = bool(int(os.environ.get("DISTILL_ENABLED", "0"))) + distill_steps = int(os.environ.get("DISTILL_STEPS", 24)) + distill_lr_factor = float(os.environ.get("DISTILL_LR_FACTOR", 0.02)) + distill_temperature = float(os.environ.get("DISTILL_TEMPERATURE", 1.5)) + distill_alpha = float(os.environ.get("DISTILL_ALPHA", 0.60)) + distill_kl_clip = float(os.environ.get("DISTILL_KL_CLIP", 10.0)) + # Optional legal score-first hashed n-gram interpolation at eval time. + # Multi-order backoff (2..max_order) with entropy-adaptive alpha. + # Alpha depends only on model entropy (no target/label access). + ngram_eval_order = int(os.environ.get("NGRAM_EVAL_ORDER", 0)) # 0=off, max order for backoff + ngram_eval_min_order = int(os.environ.get("NGRAM_EVAL_MIN_ORDER", 2)) # min order for backoff + ngram_eval_alpha = float(os.environ.get("NGRAM_EVAL_ALPHA", 0.30)) # base alpha (or fixed if adaptive off) + ngram_eval_adaptive = bool(int(os.environ.get("NGRAM_EVAL_ADAPTIVE", "1"))) # entropy-adaptive alpha + ngram_eval_alpha_min = float(os.environ.get("NGRAM_EVAL_ALPHA_MIN", 0.05)) # alpha floor (confident model) + ngram_eval_alpha_max = float(os.environ.get("NGRAM_EVAL_ALPHA_MAX", 0.60)) # alpha ceiling (uncertain model) + ngram_eval_entropy_center = float(os.environ.get("NGRAM_EVAL_ENTROPY_CENTER", 4.0)) # sigmoid center + ngram_eval_entropy_scale = float(os.environ.get("NGRAM_EVAL_ENTROPY_SCALE", 2.0)) # sigmoid steepness + ngram_eval_min_count = int(os.environ.get("NGRAM_EVAL_MIN_COUNT", 2)) + ngram_eval_buckets = int(os.environ.get("NGRAM_EVAL_BUCKETS", 4_194_304)) + ngram_eval_max_seconds = float(os.environ.get("NGRAM_EVAL_MAX_SECONDS", 0.0)) + ngram_entropy_shift = bool(int(os.environ.get("NGRAM_ENTROPY_SHIFT", "0"))) # per-order center shift + ngram_order_mults_str = os.environ.get("NGRAM_ORDER_MULTS", "") # fixed per-order multipliers (comma-sep) + cubric_cadence = int(os.environ.get("CUBRIC_CADENCE", 0)) + compile_enabled = bool(int(os.environ.get("COMPILE_ENABLED", "1"))) + compile_fullgraph = bool(int(os.environ.get("COMPILE_FULLGRAPH", "1"))) +def maybe_torch_compile(obj, args: Hyperparameters): + if not args.compile_enabled: + return obj + return torch.compile(obj, dynamic=False, fullgraph=args.compile_fullgraph) +class TrainNgramTracker: + """Complementary training: track bigram stats, downweight tokens n-grams can predict.""" + def __init__(self, vocab_size: int, device: torch.device, complement_alpha: float = 0.5): + self.V = vocab_size + self.alpha = complement_alpha + self.bi_counts = torch.zeros(vocab_size, vocab_size, device=device, dtype=torch.float32) + self.bi_totals = torch.zeros(vocab_size, device=device, dtype=torch.float32) + @torch.no_grad() + def update(self, x: Tensor, y: Tensor): + xf = x.reshape(-1) + yf = y.reshape(-1) + ones = torch.ones(xf.numel(), device=xf.device, dtype=torch.float32) + self.bi_counts.reshape(-1).scatter_add_(0, xf * self.V + yf, ones) + self.bi_totals.scatter_add_(0, xf, ones) + def get_weights(self, x: Tensor, y: Tensor) -> Tensor: + xf = x.reshape(-1) + yf = y.reshape(-1) + total = self.bi_totals[xf] + count = self.bi_counts.reshape(-1)[xf * self.V + yf] + ngram_prob = count / (total + 1) + return (1.0 - self.alpha * ngram_prob).clamp(min=0.1) +def zeropower_via_newtonschulz5(G: Tensor, steps: int = 10, eps: float = 1e-7) -> Tensor: + a, b, c = (3.4445, -4.7750, 2.0315) + X = G.bfloat16() + X /= X.norm() + eps + transposed = G.size(0) > G.size(1) + if transposed: + X = X.T + for _ in range(steps): + A = X @ X.T + B = b * A + c * A @ A + X = a * X + B @ X + return X.T if transposed else X +class Muon(torch.optim.Optimizer): + def __init__(self, params, lr: float, momentum: float, backend_steps: int, + nesterov: bool = True, weight_decay: float = 0.0): + super().__init__( + params, + dict(lr=lr, momentum=momentum, backend_steps=backend_steps, + nesterov=nesterov, weight_decay=weight_decay), + ) + @torch.no_grad() + def step(self, closure=None): + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + distributed = dist.is_available() and dist.is_initialized() + world_size = dist.get_world_size() if distributed else 1 + rank = dist.get_rank() if distributed else 0 + for group in self.param_groups: + params = group["params"] + if not params: + continue + lr = group["lr"] + momentum = group["momentum"] + backend_steps = group["backend_steps"] + nesterov = group["nesterov"] + total_params = sum(int(p.numel()) for p in params) + updates_flat = torch.zeros(total_params, device=params[0].device, dtype=torch.bfloat16) + curr = 0 + for i, p in enumerate(params): + if i % world_size == rank and p.grad is not None: + g = p.grad + state = self.state[p] + if "momentum_buffer" not in state: + state["momentum_buffer"] = torch.zeros_like(g) + buf = state["momentum_buffer"] + buf.mul_(momentum).add_(g) + if nesterov: + g = g.add(buf, alpha=momentum) + g = zeropower_via_newtonschulz5(g, steps=backend_steps) + g *= max(1, g.size(0) / g.size(1)) ** 0.5 + updates_flat[curr : curr + p.numel()] = g.reshape(-1) + curr += p.numel() + if distributed: + dist.all_reduce(updates_flat, op=dist.ReduceOp.SUM) + wd = group.get("weight_decay", 0.0) + curr = 0 + for p in params: + if wd > 0.0: + p.data.mul_(1.0 - lr * wd) + g = updates_flat[curr : curr + p.numel()].view_as(p).to(dtype=p.dtype) + p.add_(g, alpha=-lr) + curr += p.numel() + return loss +def build_sentencepiece_luts( + sp: spm.SentencePieceProcessor, vocab_size: int, device: torch.device +) -> tuple[Tensor, Tensor, Tensor]: + sp_vocab_size = int(sp.vocab_size()) + table_size = max(sp_vocab_size, vocab_size) + base_bytes_np = np.zeros((table_size,), dtype=np.int16) + has_leading_space_np = np.zeros((table_size,), dtype=np.bool_) + is_boundary_token_np = np.ones((table_size,), dtype=np.bool_) + for token_id in range(sp_vocab_size): + if sp.is_control(token_id) or sp.is_unknown(token_id) or sp.is_unused(token_id): + continue + is_boundary_token_np[token_id] = False + if sp.is_byte(token_id): + base_bytes_np[token_id] = 1 + continue + piece = sp.id_to_piece(token_id) + if piece.startswith("▁"): + has_leading_space_np[token_id] = True + piece = piece[1:] + base_bytes_np[token_id] = len(piece.encode("utf-8")) + return ( + torch.tensor(base_bytes_np, dtype=torch.int16, device=device), + torch.tensor(has_leading_space_np, dtype=torch.bool, device=device), + torch.tensor(is_boundary_token_np, dtype=torch.bool, device=device), + ) +def load_validation_tokens(pattern: str, seq_len: int) -> Tensor: + files = [Path(p) for p in sorted(glob.glob(pattern))] + if not files: + raise FileNotFoundError(f"No files found for pattern: {pattern}") + tokens = torch.cat([load_data_shard(file) for file in files]).contiguous() + usable = ((tokens.numel() - 1) // seq_len) * seq_len + if usable <= 0: + raise ValueError(f"Validation split is too short for TRAIN_SEQ_LEN={seq_len}") + return tokens[: usable + 1] +def eval_val( + args: Hyperparameters, + model: nn.Module, + rank: int, + world_size: int, + device: torch.device, + grad_accum_steps: int, + val_tokens: Tensor, + base_bytes_lut: Tensor, + has_leading_space_lut: Tensor, + is_boundary_token_lut: Tensor, + eval_seq_len: int | None = None, +) -> tuple[float, float]: + seq_len = eval_seq_len or args.train_seq_len + local_batch_tokens = args.val_batch_size // (world_size * grad_accum_steps) + if local_batch_tokens < seq_len: + raise ValueError( + "VAL_BATCH_SIZE must provide at least one sequence per rank; " + f"got VAL_BATCH_SIZE={args.val_batch_size}, WORLD_SIZE={world_size}, " + f"GRAD_ACCUM_STEPS={grad_accum_steps}, seq_len={seq_len}" + ) + local_batch_seqs = local_batch_tokens // seq_len + total_seqs = (val_tokens.numel() - 1) // seq_len + seq_start = (total_seqs * rank) // world_size + seq_end = (total_seqs * (rank + 1)) // world_size + val_loss_sum = torch.zeros((), device=device, dtype=torch.float64) + val_token_count = torch.zeros((), device=device, dtype=torch.float64) + val_byte_count = torch.zeros((), device=device, dtype=torch.float64) + model.eval() + with torch.inference_mode(): + for batch_seq_start in range(seq_start, seq_end, local_batch_seqs): + batch_seq_end = min(batch_seq_start + local_batch_seqs, seq_end) + raw_start = batch_seq_start * seq_len + raw_end = batch_seq_end * seq_len + 1 + local = val_tokens[raw_start:raw_end].to(device=device, dtype=torch.int64, non_blocking=True) + x = local[:-1].reshape(-1, seq_len) + y = local[1:].reshape(-1, seq_len) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + batch_loss = model(x, y).detach() + batch_token_count = float(y.numel()) + val_loss_sum += batch_loss.to(torch.float64) * batch_token_count + val_token_count += batch_token_count + prev_ids = x.reshape(-1) + tgt_ids = y.reshape(-1) + token_bytes = base_bytes_lut[tgt_ids].to(dtype=torch.int16) + token_bytes += (has_leading_space_lut[tgt_ids] & ~is_boundary_token_lut[prev_ids]).to(dtype=torch.int16) + val_byte_count += token_bytes.to(torch.float64).sum() + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(val_loss_sum, op=dist.ReduceOp.SUM) + dist.all_reduce(val_token_count, op=dist.ReduceOp.SUM) + dist.all_reduce(val_byte_count, op=dist.ReduceOp.SUM) + val_loss = val_loss_sum / val_token_count + bits_per_token = val_loss.item() / math.log(2.0) + tokens_per_byte = val_token_count.item() / val_byte_count.item() + model.train() + return float(val_loss.item()), float(bits_per_token * tokens_per_byte) +CONTROL_TENSOR_NAME_PATTERNS = tuple( + pattern + for pattern in os.environ.get( + "CONTROL_TENSOR_NAME_PATTERNS", + "attn_scale,attn_scales,mlp_scale,mlp_scales,resid_mix,resid_mixes,q_gain,skip_weight,skip_weights,smear,dtg_gate,ve_layer_scales,ve_shared.scale", + ).split(",") + if pattern +) +INT8_KEEP_FLOAT_FP32_NAME_PATTERNS = tuple( + pattern + for pattern in os.environ.get( + "INT8_KEEP_FLOAT_FP32_NAME_PATTERNS", + ",".join(CONTROL_TENSOR_NAME_PATTERNS), + ).split(",") + if pattern +) +INT8_KEEP_FLOAT_MAX_NUMEL = 65_536 +INT8_KEEP_FLOAT_STORE_DTYPE = torch.float16 +INT8_PER_ROW_SCALE_DTYPE = torch.float16 +INT8_CLIP_PERCENTILE = 99.99984 +INT8_CLIP_Q = INT8_CLIP_PERCENTILE / 100.0 +def tensor_nbytes(t: Tensor) -> int: + return int(t.numel()) * int(t.element_size()) +def keep_float_tensor(name: str, t: Tensor, passthrough_orig_dtypes: dict[str, str]) -> Tensor: + if any(pattern in name for pattern in INT8_KEEP_FLOAT_FP32_NAME_PATTERNS): + return t.float().contiguous() + if t.dtype in {torch.float32, torch.bfloat16}: + passthrough_orig_dtypes[name] = str(t.dtype).removeprefix("torch.") + return t.to(dtype=INT8_KEEP_FLOAT_STORE_DTYPE).contiguous() + return t +def quantize_float_tensor(t: Tensor) -> tuple[Tensor, Tensor]: + t32 = t.float() + if t32.ndim == 2: + clip_abs = ( + torch.quantile(t32.abs(), INT8_CLIP_Q, dim=1) + if t32.numel() + else torch.empty((t32.shape[0],), dtype=torch.float32) + ) + clipped = torch.maximum(torch.minimum(t32, clip_abs[:, None]), -clip_abs[:, None]) + scale = (clip_abs / 127.0).clamp_min(1.0 / 127.0) + q = torch.clamp(torch.round(clipped / scale[:, None]), -127, 127).to(torch.int8).contiguous() + return q, scale.to(dtype=INT8_PER_ROW_SCALE_DTYPE).contiguous() + clip_abs = float(torch.quantile(t32.abs().flatten(), INT8_CLIP_Q).item()) if t32.numel() else 0.0 + scale = torch.tensor(clip_abs / 127.0 if clip_abs > 0 else 1.0, dtype=torch.float32) + q = torch.clamp(torch.round(torch.clamp(t32, -clip_abs, clip_abs) / scale), -127, 127).to(torch.int8).contiguous() + return q, scale +def quantize_state_dict_int8(state_dict: dict[str, Tensor]): + quantized: dict[str, Tensor] = {} + scales: dict[str, Tensor] = {} + dtypes: dict[str, str] = {} + passthrough: dict[str, Tensor] = {} + passthrough_orig_dtypes: dict[str, str] = {} + qmeta: dict[str, dict[str, object]] = {} + stats = dict.fromkeys( + ("param_count", "num_tensors", "num_float_tensors", "num_nonfloat_tensors", "baseline_tensor_bytes", "int8_payload_bytes"), + 0, + ) + for name, tensor in state_dict.items(): + t = tensor.detach().to("cpu").contiguous() + stats["param_count"] += int(t.numel()) + stats["num_tensors"] += 1 + stats["baseline_tensor_bytes"] += tensor_nbytes(t) + if not t.is_floating_point(): + stats["num_nonfloat_tensors"] += 1 + passthrough[name] = t + stats["int8_payload_bytes"] += tensor_nbytes(t) + continue + if t.numel() <= INT8_KEEP_FLOAT_MAX_NUMEL: + kept = keep_float_tensor(name, t, passthrough_orig_dtypes) + passthrough[name] = kept + stats["int8_payload_bytes"] += tensor_nbytes(kept) + continue + stats["num_float_tensors"] += 1 + q, s = quantize_float_tensor(t) + if s.ndim > 0: + qmeta[name] = {"scheme": "per_row", "axis": 0} + quantized[name] = q + scales[name] = s + dtypes[name] = str(t.dtype).removeprefix("torch.") + stats["int8_payload_bytes"] += tensor_nbytes(q) + tensor_nbytes(s) + obj: dict[str, object] = { + "__quant_format__": "int8_clean_per_row_v1", + "quantized": quantized, + "scales": scales, + "dtypes": dtypes, + "passthrough": passthrough, + } + if qmeta: + obj["qmeta"] = qmeta + if passthrough_orig_dtypes: + obj["passthrough_orig_dtypes"] = passthrough_orig_dtypes + return obj, stats +def dequantize_state_dict_int8(obj: dict[str, object]) -> dict[str, Tensor]: + out: dict[str, Tensor] = {} + qmeta = obj.get("qmeta", {}) + passthrough_orig_dtypes = obj.get("passthrough_orig_dtypes", {}) + for name, q in obj["quantized"].items(): + dtype = getattr(torch, obj["dtypes"][name]) + s = obj["scales"][name] + if qmeta.get(name, {}).get("scheme") == "per_row" or s.ndim > 0: + s = s.to(dtype=torch.float32) + out[name] = (q.float() * s.view(q.shape[0], *([1] * (q.ndim - 1)))).to(dtype=dtype).contiguous() + else: + scale = float(s.item()) + out[name] = (q.float() * scale).to(dtype=dtype).contiguous() + for name, t in obj["passthrough"].items(): + out_t = t.detach().to("cpu").contiguous() + orig_dtype = passthrough_orig_dtypes.get(name) + if isinstance(orig_dtype, str): + out_t = out_t.to(dtype=getattr(torch, orig_dtype)).contiguous() + out[name] = out_t + return out +def load_data_shard(file: Path) -> Tensor: + header_bytes = 256 * np.dtype(" None: + self.file_idx = (self.file_idx + 1) % len(self.files) + self.tokens = load_data_shard(self.files[self.file_idx]) + self.pos = 0 + def take(self, n: int) -> Tensor: + chunks: list[Tensor] = [] + remaining = n + while remaining > 0: + avail = self.tokens.numel() - self.pos + if avail <= 0: + self._advance_file() + continue + k = min(remaining, avail) + chunks.append(self.tokens[self.pos : self.pos + k]) + self.pos += k + remaining -= k + return chunks[0] if len(chunks) == 1 else torch.cat(chunks) +class DistributedTokenLoader: + def __init__(self, pattern: str, rank: int, world_size: int, device: torch.device): + self.rank = rank + self.world_size = world_size + self.device = device + self.stream = TokenStream(pattern) + def next_batch(self, global_tokens: int, seq_len: int, grad_accum_steps: int) -> tuple[Tensor, Tensor]: + local_tokens = global_tokens // (self.world_size * grad_accum_steps) + per_rank_span = local_tokens + 1 + chunk = self.stream.take(per_rank_span * self.world_size) + start = self.rank * per_rank_span + local = chunk[start : start + per_rank_span].to(dtype=torch.int64) + x = local[:-1].reshape(-1, seq_len) + y = local[1:].reshape(-1, seq_len) + return x.to(self.device, non_blocking=True), y.to(self.device, non_blocking=True) +class RMSNorm(nn.Module): + def __init__(self, eps: float | None = None): + super().__init__() + self.eps = eps + def forward(self, x: Tensor) -> Tensor: + return F.rms_norm(x, (x.size(-1),), eps=self.eps) +class CastedLinear(nn.Linear): + _qat_enabled: bool = False + def forward(self, x: Tensor) -> Tensor: + w = self.weight.to(x.dtype) + if CastedLinear._qat_enabled and self.training and w.ndim == 2: + with torch.no_grad(): + w32 = self.weight.float() + # Use 99.95th percentile clipping to match GPTQ export quantizer + row_clip = torch.quantile(w32.abs(), 0.9995, dim=1) + scale = (row_clip / 31.0).clamp_min(1.0 / 31.0) + w_q = (torch.clamp(torch.round(w32 / scale[:, None]), -32, 31) * scale[:, None]).to(x.dtype) + w = w + (w_q - w).detach() + bias = self.bias.to(x.dtype) if self.bias is not None else None + return F.linear(x, w, bias) +def restore_low_dim_params_to_fp32(module: nn.Module) -> None: + with torch.no_grad(): + for name, param in module.named_parameters(): + if (param.ndim < 2 or any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS)) and param.dtype != torch.float32: + param.data = param.data.float() +class Rotary(nn.Module): + def __init__(self, dim: int, base: float = 10000.0, train_seq_len: int = 1024, rope_dims: int = 0): + super().__init__() + self.dim = dim + self.base = base + self.train_seq_len = train_seq_len + self.rope_dims = rope_dims if rope_dims > 0 else dim + inv_freq = 1.0 / (base ** (torch.arange(0, self.rope_dims, 2, dtype=torch.float32) / self.rope_dims)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self._seq_len_cached = 0 + self._cos_cached: Tensor | None = None + self._sin_cached: Tensor | None = None + def forward(self, seq_len: int, device: torch.device, dtype: torch.dtype) -> tuple[Tensor, Tensor]: + if ( + self._cos_cached is None + or self._sin_cached is None + or self._seq_len_cached != seq_len + or self._cos_cached.device != device + ): + rd = self.rope_dims + if seq_len > self.train_seq_len: + scale = seq_len / self.train_seq_len + new_base = self.base * (scale ** (rd / (rd - 2))) + inv_freq = 1.0 / (new_base ** (torch.arange(0, rd, 2, dtype=torch.float32, device=device) / rd)) + else: + inv_freq = self.inv_freq.to(device) + t = torch.arange(seq_len, device=device, dtype=inv_freq.dtype) + freqs = torch.outer(t, inv_freq) + self._cos_cached = freqs.cos()[None, :, None, :] + self._sin_cached = freqs.sin()[None, :, None, :] + self._seq_len_cached = seq_len + return self._cos_cached.to(dtype=dtype), self._sin_cached.to(dtype=dtype) +def apply_rotary_emb(x: Tensor, cos: Tensor, sin: Tensor, rope_dims: int = 0) -> Tensor: + if rope_dims > 0 and rope_dims < x.size(-1): + x_rope, x_pass = x[..., :rope_dims], x[..., rope_dims:] + half = rope_dims // 2 + x1, x2 = x_rope[..., :half], x_rope[..., half:] + x_rope = torch.cat((x1 * cos + x2 * sin, x1 * (-sin) + x2 * cos), dim=-1) + return torch.cat((x_rope, x_pass), dim=-1) + half = x.size(-1) // 2 + x1, x2 = x[..., :half], x[..., half:] + return torch.cat((x1 * cos + x2 * sin, x1 * (-sin) + x2 * cos), dim=-1) +class CausalSelfAttention(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + num_kv_heads: int, + rope_base: float, + qk_gain_init: float, + ): + super().__init__() + if dim % num_heads != 0: + raise ValueError("model_dim must be divisible by num_heads") + if num_heads % num_kv_heads != 0: + raise ValueError("num_heads must be divisible by num_kv_heads") + self.num_heads = num_heads + self.num_kv_heads = num_kv_heads + self.head_dim = dim // num_heads + if self.head_dim % 2 != 0: + raise ValueError("head_dim must be even for RoPE") + kv_dim = self.num_kv_heads * self.head_dim + self.c_q = CastedLinear(dim, dim, bias=False) + self.c_k = CastedLinear(dim, kv_dim, bias=False) + self.c_v = CastedLinear(dim, kv_dim, bias=False) + self.proj = CastedLinear(dim, dim, bias=False) + self.proj._zero_init = True + self.q_gain = nn.Parameter(torch.full((num_heads,), qk_gain_init, dtype=torch.float32)) + self.rope_dims = 0 # set by GPT.__init__ for partial RoPE + self.rotary = Rotary(self.head_dim, base=rope_base, train_seq_len=1024) + self.use_xsa = False # set by GPT.__init__ for deep layers only + def _xsa_efficient(self, y: Tensor, v: Tensor) -> Tensor: + """Efficient XSA: subtract self-value projection via GQA-aware reshape (no repeat_interleave). + y: [B, T, H, D], v: [B, T, Hkv, D]. H must be divisible by Hkv.""" + B, T, H, D = y.shape + Hkv = v.size(-2) + group = H // Hkv + y_g = y.reshape(B, T, Hkv, group, D) # [B, T, Hkv, group, D] + vn = F.normalize(v, dim=-1).unsqueeze(-2) # [B, T, Hkv, 1, D] — broadcast ready + proj = (y_g * vn).sum(dim=-1, keepdim=True) * vn + return (y_g - proj).reshape(B, T, H, D) + def forward(self, x: Tensor, v_embed: Tensor | None = None) -> Tensor: + bsz, seqlen, dim = x.shape + q = self.c_q(x).reshape(bsz, seqlen, self.num_heads, self.head_dim) + k = self.c_k(x).reshape(bsz, seqlen, self.num_kv_heads, self.head_dim) + v = self.c_v(x) + if v_embed is not None: + v = v + v_embed + v = v.reshape(bsz, seqlen, self.num_kv_heads, self.head_dim) + q = F.rms_norm(q, (q.size(-1),)) + k = F.rms_norm(k, (k.size(-1),)) + cos, sin = self.rotary(seqlen, x.device, q.dtype) + q = apply_rotary_emb(q, cos, sin, self.rope_dims) + k = apply_rotary_emb(k, cos, sin, self.rope_dims) + q = q * self.q_gain.to(dtype=q.dtype)[None, None, :, None] + y = flash_attn_3_func(q, k, v, causal=True) + if self.use_xsa: + y = self._xsa_efficient(y, v) + y = y.reshape(bsz, seqlen, dim) + return self.proj(y) +class SmearGate(nn.Module): + def __init__(self, dim: int): + super().__init__() + self.gate = nn.Parameter(torch.zeros(dim, dtype=torch.float32)) + def forward(self, x: Tensor) -> Tensor: + g = torch.sigmoid(self.gate.to(dtype=x.dtype))[None, None, :] + x_prev = torch.cat([torch.zeros_like(x[:, :1]), x[:, :-1]], dim=1) + return (1 - g) * x + g * x_prev +class BigramHashEmbedding(nn.Module): + def __init__(self, bigram_vocab_size: int, bigram_dim: int, model_dim: int): + super().__init__() + self.bigram_vocab_size = bigram_vocab_size + self.embed = nn.Embedding(bigram_vocab_size, bigram_dim) + nn.init.zeros_(self.embed.weight) + self.proj = CastedLinear(bigram_dim, model_dim, bias=False) if bigram_dim != model_dim else None + if self.proj is not None: + nn.init.zeros_(self.proj.weight) + self.scale = nn.Parameter(torch.tensor(0.05, dtype=torch.float32)) + def bigram_hash(self, tokens: Tensor) -> Tensor: + t = tokens.to(torch.int32) + mod = self.bigram_vocab_size - 1 + out = torch.empty_like(t) + out[..., 0] = mod + out[..., 1:] = torch.bitwise_xor(36313 * t[..., 1:], 27191 * t[..., :-1]) % mod + return out.long() + def forward(self, token_ids: Tensor) -> Tensor: + h = self.embed(self.bigram_hash(token_ids)) + if self.proj is not None: + h = self.proj(h) + return h * self.scale.to(dtype=h.dtype) +class ValueEmbedding(nn.Module): + """Reinject token identity into attention values at specific layers. + Each table maps vocab tokens to a low-dim embedding, projected to model_dim.""" + def __init__(self, vocab_size: int, ve_dim: int, model_dim: int): + super().__init__() + self.embed = nn.Embedding(vocab_size, ve_dim) + nn.init.normal_(self.embed.weight, std=0.01) + self.proj = CastedLinear(ve_dim, model_dim, bias=False) if ve_dim != model_dim else None + if self.proj is not None: + nn.init.zeros_(self.proj.weight) + self.scale = nn.Parameter(torch.tensor(0.1, dtype=torch.float32)) + def forward(self, token_ids: Tensor) -> Tensor: + h = self.embed(token_ids) + if self.proj is not None: + h = self.proj(h) + return h * self.scale.to(dtype=h.dtype) +class MLP(nn.Module): + def __init__(self, dim: int, mlp_mult: int, mlp_act: str = "relu_sq", mlp_leaky_slope: float = 0.5): + super().__init__() + hidden = int(mlp_mult * dim) + self.fc = CastedLinear(dim, hidden, bias=False) + self.proj = CastedLinear(hidden, dim, bias=False) + self.proj._zero_init = True + self.mlp_act = mlp_act + self.mlp_leaky_slope = mlp_leaky_slope + if self.mlp_act not in {"relu_sq", "leaky_relu_sq"}: + raise ValueError(f"Unsupported MLP_ACT '{self.mlp_act}'. Use 'relu_sq' or 'leaky_relu_sq'.") + def forward(self, x: Tensor) -> Tensor: + x = self.fc(x) + if self.mlp_act == "leaky_relu_sq": + x = F.leaky_relu(x, negative_slope=self.mlp_leaky_slope) + else: + x = F.relu(x) + return self.proj(x.square()) +class Block(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + num_kv_heads: int, + mlp_mult: int, + rope_base: float, + qk_gain_init: float, + layer_idx: int = 0, + ln_scale: bool = False, + dtg: bool = False, + mlp_act: str = "relu_sq", + mlp_leaky_slope: float = 0.5, + ): + super().__init__() + self.attn_norm = RMSNorm() + self.mlp_norm = RMSNorm() + self.attn = CausalSelfAttention(dim, num_heads, num_kv_heads, rope_base, qk_gain_init) + self.mlp = MLP(dim, mlp_mult, mlp_act=mlp_act, mlp_leaky_slope=mlp_leaky_slope) + self.attn_scale = nn.Parameter(torch.ones(dim, dtype=torch.float32)) + self.mlp_scale = nn.Parameter(torch.ones(dim, dtype=torch.float32)) + self.resid_mix = nn.Parameter(torch.stack((torch.ones(dim), torch.zeros(dim))).float()) + self.ln_scale_factor = 1.0 / math.sqrt(layer_idx + 1) if ln_scale else 1.0 + if dtg: + self.dtg_gate = nn.Linear(dim, 1, bias=True) + nn.init.zeros_(self.dtg_gate.weight) + nn.init.constant_(self.dtg_gate.bias, 2.0) + else: + self.dtg_gate = None + def forward(self, x: Tensor, x0: Tensor, v_embed: Tensor | None = None) -> Tensor: + mix = self.resid_mix.to(dtype=x.dtype) + x_in = mix[0][None, None, :] * x + mix[1][None, None, :] * x0 + attn_out = self.attn(self.attn_norm(x_in) * self.ln_scale_factor, v_embed=v_embed) + x_out = x_in + self.attn_scale.to(dtype=x_in.dtype)[None, None, :] * attn_out + x_out = x_out + self.mlp_scale.to(dtype=x_out.dtype)[None, None, :] * self.mlp(self.mlp_norm(x_out) * self.ln_scale_factor) + if self.dtg_gate is not None: + gate = torch.sigmoid(self.dtg_gate(x_in.detach())) + x_out = x_in + gate * (x_out - x_in) + return x_out +class GPT(nn.Module): + def __init__( + self, + vocab_size: int, + num_layers: int, + model_dim: int, + num_heads: int, + num_kv_heads: int, + mlp_mult: int, + tie_embeddings: bool, + tied_embed_init_std: float, + logit_softcap: float, + rope_base: float, + qk_gain_init: float, + mtp_num_heads: int = 0, + mtp_loss_weight: float = 0.1, + bigram_vocab_size: int = 0, + bigram_dim: int = 128, + xsa_last_n: int = 0, + rope_dims: int = 0, + ln_scale: bool = False, + dtg: bool = False, + ve_enabled: bool = False, + ve_dim: int = 128, + ve_layers: str = "9,10", + mlp_act: str = "relu_sq", + mlp_leaky_slope: float = 0.5, + f1_corr_rank: int = 0, + f1_corr_scale_init: float = 0.10, + ): + super().__init__() + self._ve_target_dim = num_kv_heads * (model_dim // num_heads) # kv_dim for value projection + if logit_softcap <= 0.0: + raise ValueError(f"logit_softcap must be positive, got {logit_softcap}") + self.tie_embeddings = tie_embeddings + self.tied_embed_init_std = tied_embed_init_std + self.logit_softcap = logit_softcap + self.mtp_num_heads = mtp_num_heads + self.mtp_loss_weight = mtp_loss_weight + self.tok_emb = nn.Embedding(vocab_size, model_dim) + self.bigram = BigramHashEmbedding(bigram_vocab_size, bigram_dim, model_dim) if bigram_vocab_size > 0 else None + self.smear = SmearGate(model_dim) + self.num_encoder_layers = num_layers // 2 + self.num_decoder_layers = num_layers - self.num_encoder_layers + self.num_skip_weights = min(self.num_encoder_layers, self.num_decoder_layers) + self.skip_weights = nn.Parameter(torch.ones(self.num_skip_weights, model_dim, dtype=torch.float32)) + self.blocks = nn.ModuleList( + [ + Block( + model_dim, + num_heads, + num_kv_heads, + mlp_mult, + rope_base, + qk_gain_init, + layer_idx=i, + ln_scale=ln_scale, + dtg=dtg, + mlp_act=mlp_act, + mlp_leaky_slope=mlp_leaky_slope, + ) + for i in range(num_layers) + ] + ) + if rope_dims > 0: + head_dim = model_dim // num_heads + for block in self.blocks: + block.attn.rope_dims = rope_dims + block.attn.rotary = Rotary(head_dim, base=rope_base, train_seq_len=1024, rope_dims=rope_dims) + self.ve_layer_indices = [int(x) for x in ve_layers.split(",") if x.strip()] if ve_enabled else [] + kv_dim = self._ve_target_dim + if self.ve_layer_indices: + self.ve_shared = ValueEmbedding(vocab_size, ve_dim, kv_dim) + self.ve_layer_scales = nn.ParameterList( + [nn.Parameter(torch.ones(1, dtype=torch.float32)) for _ in self.ve_layer_indices] + ) + else: + self.ve_shared = None + self.ve_layer_scales = nn.ParameterList() + self.value_embeds = nn.ModuleList() # keep empty for compat + self.final_norm = RMSNorm() + self.lm_head = None if tie_embeddings else CastedLinear(model_dim, vocab_size, bias=False) + if self.lm_head is not None: + self.lm_head._zero_init = True + self.mtp_heads = nn.ModuleList( + [CastedLinear(model_dim, vocab_size, bias=False) for _ in range(mtp_num_heads)] + ) + for head in self.mtp_heads: + head._zero_init = True + # Low-rank correction path for extra capacity under size budget. + self.f1_corr_rank = f1_corr_rank + if f1_corr_rank > 0: + self.f1_corr_in = CastedLinear(model_dim, f1_corr_rank, bias=False) + self.f1_corr_out = CastedLinear(f1_corr_rank, vocab_size, bias=False) + self.f1_corr_out._zero_init = True + self.f1_corr_scale = nn.Parameter(torch.tensor(f1_corr_scale_init, dtype=torch.float32)) + else: + self.f1_corr_in = None + self.f1_corr_out = None + self.f1_corr_scale = None + if xsa_last_n > 0: + for i in range(max(0, num_layers - xsa_last_n), num_layers): + self.blocks[i].attn.use_xsa = True + self._init_weights() + def _init_weights(self) -> None: + if self.tie_embeddings: + nn.init.normal_(self.tok_emb.weight, mean=0.0, std=self.tied_embed_init_std) + num_layers = len(self.blocks) + for name, module in self.named_modules(): + if isinstance(module, nn.Linear): + if getattr(module, "_zero_init", False): + nn.init.zeros_(module.weight) + elif module.weight.ndim == 2 and module.weight.shape[0] >= 64 and module.weight.shape[1] >= 64: + nn.init.orthogonal_(module.weight, gain=1.0) + if ".proj." in name or name.endswith(".proj"): + with torch.no_grad(): + module.weight.mul_(1.0 / math.sqrt(2 * num_layers)) + def _get_ve(self, layer_idx: int, input_ids: Tensor, ve_cache: dict | None = None) -> Tensor | None: + """Get value embedding for a specific layer using shared table + per-layer scale.""" + if self.ve_shared is None or layer_idx not in self.ve_layer_indices: + return None + if ve_cache is not None and 've' not in ve_cache: + ve_cache['ve'] = self.ve_shared(input_ids) + ve_base = ve_cache['ve'] if ve_cache is not None else self.ve_shared(input_ids) + ve_idx = self.ve_layer_indices.index(layer_idx) + return ve_base * self.ve_layer_scales[ve_idx].to(dtype=ve_base.dtype) + def forward(self, input_ids: Tensor, target_ids: Tensor) -> Tensor: + x = self.tok_emb(input_ids) + if self.bigram is not None: + x = x + self.bigram(input_ids) + x = F.rms_norm(x, (x.size(-1),)) + x = self.smear(x) + x0 = x + skips: list[Tensor] = [] + ve_cache: dict = {} + for i in range(self.num_encoder_layers): + ve = self._get_ve(i, input_ids, ve_cache) + x = self.blocks[i](x, x0, v_embed=ve) + skips.append(x) + for i in range(self.num_decoder_layers): + bi = self.num_encoder_layers + i + if skips: + x = x + self.skip_weights[i].to(dtype=x.dtype)[None, None, :] * skips.pop() + ve = self._get_ve(bi, input_ids, ve_cache) + x = self.blocks[bi](x, x0, v_embed=ve) + x = self.final_norm(x) + x_flat = x.reshape(-1, x.size(-1)) + targets = target_ids.reshape(-1) + if self.tie_embeddings: + logits_proj = F.linear(x_flat, self.tok_emb.weight) + else: + if self.lm_head is None: + raise RuntimeError("lm_head is required when tie_embeddings=False") + logits_proj = self.lm_head(x_flat) + if self.f1_corr_in is not None and self.f1_corr_out is not None and self.f1_corr_scale is not None: + corr_hidden = F.silu(self.f1_corr_in(x_flat)) + corr_proj = self.f1_corr_out(corr_hidden) + logits_proj = logits_proj + self.f1_corr_scale.to(dtype=logits_proj.dtype) * corr_proj + logits = self.logit_softcap * torch.tanh(logits_proj / self.logit_softcap) + if hasattr(self, '_ngram_tracker') and self._ngram_tracker is not None and self.training: + per_tok_loss = F.cross_entropy(logits.float(), targets, reduction="none") + weights = self._ngram_tracker.get_weights(input_ids, target_ids) + main_loss = (per_tok_loss * weights).mean() + else: + main_loss = F.cross_entropy(logits.float(), targets, reduction="mean") + if self.training and self.mtp_num_heads > 0 and self.mtp_loss_weight > 0.0: + _, seqlen, dim = x.shape + mtp_loss_sum = x.new_zeros(()) + mtp_loss_count = 0 + for k, mtp_head in enumerate(self.mtp_heads): + valid_t = seqlen - (k + 1) + if valid_t <= 0: + continue + mtp_hidden = x[:, :valid_t, :].reshape(-1, dim) + mtp_targets = target_ids[:, k + 1 :].reshape(-1) + mtp_logits_proj = mtp_head(mtp_hidden) + mtp_logits = self.logit_softcap * torch.tanh(mtp_logits_proj / self.logit_softcap) + mtp_loss_sum = mtp_loss_sum + F.cross_entropy(mtp_logits.float(), mtp_targets, reduction="mean") + mtp_loss_count += 1 + if mtp_loss_count > 0: + main_loss = main_loss + self.mtp_loss_weight * (mtp_loss_sum / mtp_loss_count) + return main_loss + def forward_logits(self, input_ids: Tensor) -> Tensor: + """Return logits (bsz, seq_len, vocab) without computing loss.""" + x = self.tok_emb(input_ids) + if self.bigram is not None: + x = x + self.bigram(input_ids) + x = F.rms_norm(x, (x.size(-1),)) + x = self.smear(x) + x0 = x + skips: list[Tensor] = [] + ve_cache: dict = {} + for i in range(self.num_encoder_layers): + ve = self._get_ve(i, input_ids, ve_cache) + x = self.blocks[i](x, x0, v_embed=ve) + skips.append(x) + for i in range(self.num_decoder_layers): + bi = self.num_encoder_layers + i + if skips: + x = x + self.skip_weights[i].to(dtype=x.dtype)[None, None, :] * skips.pop() + ve = self._get_ve(bi, input_ids, ve_cache) + x = self.blocks[bi](x, x0, v_embed=ve) + x = self.final_norm(x) + if self.tie_embeddings: + logits_proj = F.linear(x, self.tok_emb.weight) + else: + logits_proj = self.lm_head(x) + if self.f1_corr_in is not None and self.f1_corr_out is not None and self.f1_corr_scale is not None: + corr_hidden = F.silu(self.f1_corr_in(x)) + corr_proj = self.f1_corr_out(corr_hidden) + logits_proj = logits_proj + self.f1_corr_scale.to(dtype=logits_proj.dtype) * corr_proj + return self.logit_softcap * torch.tanh(logits_proj / self.logit_softcap) +def eval_val_sliding( + args: Hyperparameters, + base_model: nn.Module, + rank: int, + world_size: int, + device: torch.device, + val_tokens: Tensor, + base_bytes_lut: Tensor, + has_leading_space_lut: Tensor, + is_boundary_token_lut: Tensor, + stride: int, + batch_seqs: int = 128, + eval_seq_len: int | None = None, +) -> tuple[float, float]: + """Sliding window evaluation: each token scored with maximum context.""" + seq_len = eval_seq_len or args.train_seq_len + total_tokens = val_tokens.numel() - 1 + window_starts = [ws for ws in range(0, total_tokens, stride) + if min(ws + seq_len, total_tokens) - ws >= 1] + total_windows = len(window_starts) + my_s = (total_windows * rank) // world_size + my_e = (total_windows * (rank + 1)) // world_size + my_windows = window_starts[my_s:my_e] + loss_sum = torch.zeros((), device=device, dtype=torch.float64) + token_count = torch.zeros((), device=device, dtype=torch.float64) + byte_count = torch.zeros((), device=device, dtype=torch.float64) + base_model.eval() + compiled_logits = maybe_torch_compile(base_model.forward_logits, args) + with torch.inference_mode(): + for bi in range(0, len(my_windows), batch_seqs): + batch_ws = my_windows[bi:bi + batch_seqs] + bsz = len(batch_ws) + x_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + y_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + wlens: list[int] = [] + for i, ws in enumerate(batch_ws): + end = min(ws + seq_len, total_tokens) + wlen = end - ws + wlens.append(wlen) + chunk = val_tokens[ws:end + 1].to(dtype=torch.int64, device=device) + x_batch[i, :wlen] = chunk[:-1] + y_batch[i, :wlen] = chunk[1:] + with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + logits = compiled_logits(x_batch) + nll = F.cross_entropy( + logits.reshape(-1, logits.size(-1)).float(), + y_batch.reshape(-1), + reduction="none", + ).reshape(bsz, seq_len) + for i, ws in enumerate(batch_ws): + wlen = wlens[i] + s = 0 if ws == 0 else max(wlen - stride, 0) + scored_nll = nll[i, s:wlen].to(torch.float64) + loss_sum += scored_nll.sum() + token_count += float(wlen - s) + tgt = y_batch[i, s:wlen] + prev = x_batch[i, s:wlen] + tb = base_bytes_lut[tgt].to(torch.float64) + tb += (has_leading_space_lut[tgt] & ~is_boundary_token_lut[prev]).to(torch.float64) + byte_count += tb.sum() + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(loss_sum, op=dist.ReduceOp.SUM) + dist.all_reduce(token_count, op=dist.ReduceOp.SUM) + dist.all_reduce(byte_count, op=dist.ReduceOp.SUM) + val_loss = (loss_sum / token_count).item() + bits_per_token = val_loss / math.log(2.0) + tokens_per_byte = token_count.item() / byte_count.item() + base_model.train() + return val_loss, bits_per_token * tokens_per_byte +def _ngram_bulk_update(val_np, start, end, ctx_tables, full_tables, + min_order, max_order, primes, mask): + """Bulk update n-gram tables with a contiguous range of tokens. + All ranks call this with the SAME token range -> identical tables everywhere.""" + t = val_np[start:end].astype(np.uint64) + n = len(t) + for order in range(min_order, max_order + 1): + if n < order: + continue + ctx_width = order - 1 + ctx_hash = np.zeros(n - order + 1, dtype=np.uint64) + for k in range(ctx_width): + ctx_hash ^= t[k:n - order + 1 + k] * primes[k % len(primes)] + ctx_key = (ctx_hash & mask).astype(np.int64) + tgt = t[order - 1:] + full_key = ((ctx_hash ^ (tgt * primes[ctx_width % len(primes)])) & mask).astype(np.int64) + ctx_tables[order] += np.bincount(ctx_key, minlength=len(ctx_tables[order])).astype(np.uint32) + full_tables[order] += np.bincount(full_key, minlength=len(full_tables[order])).astype(np.uint32) + +def eval_val_sliding_hashed_ngram( + args: Hyperparameters, + base_model: nn.Module, + rank: int, + world_size: int, + device: torch.device, + val_tokens: Tensor, + base_bytes_lut: Tensor, + has_leading_space_lut: Tensor, + is_boundary_token_lut: Tensor, + stride: int, + order: int, + alpha: float, + min_count: int, + buckets: int, + max_seconds: float = 0.0, + batch_seqs: int = 128, + eval_seq_len: int | None = None, +) -> tuple[float, float, float]: + """Score-first sliding eval with chunk-based SHARED n-gram tables + cubric. + + Key design: all ranks share identical n-gram tables via bulk chunk updates. + Each chunk's windows are distributed across ranks for scoring, then ALL ranks + update tables with the same contiguous token range. Every rank sees the full + n-gram picture (not 1/world_size like per-segment updates). + + Legal: entire chunk scored before its tokens update the tables. + """ + min_order = max(args.ngram_eval_min_order, 2) + max_order = max(order, min_order) + adaptive = args.ngram_eval_adaptive + alpha_min = args.ngram_eval_alpha_min + alpha_max = args.ngram_eval_alpha_max + ent_center = args.ngram_eval_entropy_center + ent_scale = args.ngram_eval_entropy_scale + + # Parse fixed per-order multipliers (PR #809 style) + _fixed_order_mults = None + if args.ngram_order_mults_str: + _fixed_order_mults = np.array([float(x) for x in args.ngram_order_mults_str.split(",")], dtype=np.float64) + + seq_len = eval_seq_len or args.train_seq_len + total_tokens = val_tokens.numel() - 1 + + # Build all windows and total scored tokens + all_window_starts = [ws for ws in range(0, total_tokens, stride) if min(ws + seq_len, total_tokens) - ws >= 1] + total_scored_tokens = 0.0 + for ws in all_window_starts: + end = min(ws + seq_len, total_tokens) + wlen = end - ws + s = 0 if ws == 0 else max(wlen - stride, 0) + total_scored_tokens += float(max(wlen - s, 0)) + + # Group windows into chunks by scored position -- all ranks share this grouping + chunk_tokens = int(os.environ.get("NGRAM_CHUNK_TOKENS", "1048576")) # 1M default + num_chunks = (total_tokens + chunk_tokens - 1) // chunk_tokens + chunk_windows: list[list[int]] = [[] for _ in range(num_chunks)] + for ws in all_window_starts: + end = min(ws + seq_len, total_tokens) + wlen = end - ws + s = 0 if ws == 0 else max(wlen - stride, 0) + scored_start = ws + s + ci = min(scored_start // chunk_tokens, num_chunks - 1) + chunk_windows[ci].append(ws) + + val_np = val_tokens.numpy() + ctx_tables = {n: np.zeros((buckets,), dtype=np.uint32) for n in range(min_order, max_order + 1)} + full_tables = {n: np.zeros((buckets,), dtype=np.uint32) for n in range(min_order, max_order + 1)} + mask = np.uint64(buckets - 1) + primes = np.array( + [np.uint64(36313), np.uint64(27191), np.uint64(51647), np.uint64(81929), + np.uint64(131071), np.uint64(174763), np.uint64(233017), np.uint64(283721), + np.uint64(347237)], + dtype=np.uint64, + ) + + loss_sum = 0.0 + token_count = 0.0 + byte_count = 0.0 + + # Cubric 3D: per (order × entropy_bin × count_bin) adaptive alpha scaling + _NUM_ENT_BINS = 3 # low / mid / high entropy + _NUM_CNT_BINS = 3 # low / mid / high count + _ENT_EDGES = np.array([ent_center - 1.0, ent_center + 1.0]) # [2.0, 4.0] for center=3.0 + _CNT_EDGES = np.array([5.0, 50.0]) # low=<5, mid=5-50, high=>50 context count + _TOTAL_CELLS = _NUM_ENT_BINS * _NUM_CNT_BINS # 9 cells per order = 54 total + _cc = getattr(args, 'cubric_cadence', 0); _con = _cc > 0; _cfired = 0 + if _con: + # Warm-start at 1.0 (neutral) — fixed order mults handle base scaling, + # cubric 3D refines per (order × entropy × count) on top + _c_alpha_mult = {n: [1.0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + _c_hits = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + _c_beats = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + + base_model.eval() + compiled_logits = maybe_torch_compile(base_model.forward_logits, args) + t0 = time.perf_counter() + deadline = (t0 + max_seconds) if max_seconds > 0.0 else None + cutoff_hit = False + + if rank == 0: + print(f"ngram_eval:chunks={num_chunks} chunk_tokens={chunk_tokens} " + f"windows={len(all_window_starts)} shared_tables=True", flush=True) + + with torch.inference_mode(): + for ci in range(num_chunks): + if deadline is not None and time.perf_counter() >= deadline: + cutoff_hit = True + break + + windows = chunk_windows[ci] + if not windows: + continue + + # Distribute this chunk's windows across ranks + my_s = (len(windows) * rank) // world_size + my_e = (len(windows) * (rank + 1)) // world_size + my_windows = windows[my_s:my_e] + + # --- Phase 1: SCORE this chunk's windows --- + for bi in range(0, len(my_windows), batch_seqs): + batch_ws = my_windows[bi:bi + batch_seqs] + bsz = len(batch_ws) + x_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + y_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + wlens: list[int] = [] + for i, ws in enumerate(batch_ws): + end = min(ws + seq_len, total_tokens) + wlen = end - ws + wlens.append(wlen) + chunk = val_tokens[ws:end + 1].to(dtype=torch.int64, device=device) + x_batch[i, :wlen] = chunk[:-1] + y_batch[i, :wlen] = chunk[1:] + + with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + logits = compiled_logits(x_batch) + logits_f = logits.float() + nll = F.cross_entropy( + logits_f.reshape(-1, logits_f.size(-1)), + y_batch.reshape(-1), + reduction="none", + ).reshape(bsz, seq_len) + + for i, ws in enumerate(batch_ws): + wlen = wlens[i] + s = 0 if ws == 0 else max(wlen - stride, 0) + seg_len = wlen - s + if seg_len <= 0: + continue + + seg_nll = nll[i, s:wlen].to(torch.float64).cpu().numpy() + seg_model_p = np.exp(-seg_nll) + + if adaptive: + log_probs = F.log_softmax(logits_f[i, s:wlen], dim=-1) + probs_a = log_probs.exp() + entropy = -(probs_a * log_probs).sum(dim=-1).cpu().numpy() + sig = 1.0 / (1.0 + np.exp(-ent_scale * (entropy - ent_center))) + per_token_alpha = alpha_min + (alpha_max - alpha_min) * sig + # Bin entropy for 2D cubric: 0=low, 1=mid, 2=high + _ent_bins = np.digitize(entropy, _ENT_EDGES).astype(np.int32) + else: + per_token_alpha = np.full(seg_len, alpha) + _ent_bins = np.ones(seg_len, dtype=np.int32) # all mid + + global_j = np.arange(ws + s + 1, ws + wlen + 1, dtype=np.int64) + p_ng = np.zeros(seg_len, dtype=np.float64) + ng_matched = np.zeros(seg_len, dtype=np.bool_) + _ng_ord = np.zeros(seg_len, dtype=np.int32) + _ng_ctx_count = np.zeros(seg_len, dtype=np.float64) + tgt_np = val_np[global_j].astype(np.uint64) + + for n in range(max_order, min_order - 1, -1): + ctx_width = n - 1 + valid = (global_j >= ctx_width) & (~ng_matched) + if not valid.any(): + continue + v_idx = np.nonzero(valid)[0] + jv = global_j[v_idx] + ctx_hash = np.zeros(len(jv), dtype=np.uint64) + for k in range(ctx_width): + tok = val_np[jv - (ctx_width - k)].astype(np.uint64) + ctx_hash ^= tok * primes[k % len(primes)] + ctx_key = (ctx_hash & mask).astype(np.int64) + full_key = ((ctx_hash ^ (tgt_np[v_idx] * primes[ctx_width % len(primes)])) & mask).astype(np.int64) + ctx_counts = ctx_tables[n][ctx_key].astype(np.float64) + full_counts = full_tables[n][full_key].astype(np.float64) + has_data = ctx_counts >= float(min_count) + if has_data.any(): + p = np.minimum(full_counts, ctx_counts) / np.maximum(ctx_counts, 1.0) + p = np.clip(p, 0.0, 1.0) + hit_idx = v_idx[has_data] + p_ng[hit_idx] = p[has_data] + ng_matched[hit_idx] = True + _ng_ord[hit_idx] = n + _ng_ctx_count[hit_idx] = ctx_counts[has_data] + + # Mix where n-gram matched: fixed mults → cubric refinement → clip + if ng_matched.any(): + m_idx = np.nonzero(ng_matched)[0] + # Per-order entropy center shift (PR #809) + if adaptive and args.ngram_entropy_shift: + matched_ords = _ng_ord[m_idx].astype(np.float64) + shifted_centers = ent_center - 0.25 * (matched_ords - float(min_order)) + shifted_sig = 1.0 / (1.0 + np.exp(-ent_scale * (entropy[m_idx] - shifted_centers))) + per_token_alpha[m_idx] = alpha_min + (alpha_max - alpha_min) * shifted_sig + a = per_token_alpha[m_idx].copy() + # Step 1: fixed order multipliers (coarse per-order scaling) + if _fixed_order_mults is not None: + mult_indices = _ng_ord[m_idx] - min_order + mult_indices = np.clip(mult_indices, 0, len(_fixed_order_mults) - 1) + a *= _fixed_order_mults[mult_indices] + # Step 2: cubric 3D refinement (fine per entropy×count adaptation) + if _con: + m_ent_bins = _ent_bins[m_idx] + m_cnt_bins = np.digitize(_ng_ctx_count[m_idx], _CNT_EDGES).astype(np.int32) + for n in range(min_order, max_order + 1): + om = _ng_ord[m_idx] == n + if not om.any(): + continue + for eb in range(_NUM_ENT_BINS): + for cb in range(_NUM_CNT_BINS): + cell = eb * _NUM_CNT_BINS + cb + mask_ecb = om & (m_ent_bins == eb) & (m_cnt_bins == cb) + if mask_ecb.any(): + _c_hits[n][cell] += int(mask_ecb.sum()) + _c_beats[n][cell] += int((p_ng[m_idx[mask_ecb]] > seg_model_p[m_idx[mask_ecb]]).sum()) + a[mask_ecb] *= _c_alpha_mult[n][cell] + np.clip(a, 0.0, 0.95, out=a) + seg_model_p[m_idx] = (1.0 - a) * seg_model_p[m_idx] + a * p_ng[m_idx] + + seg_nll = -np.log(np.clip(seg_model_p, 1e-12, 1.0)) + loss_sum += float(seg_nll.sum()) + token_count += float(seg_len) + tgt = y_batch[i, s:wlen] + prev = x_batch[i, s:wlen] + tb = base_bytes_lut[tgt].to(torch.float64) + tb += (has_leading_space_lut[tgt] & ~is_boundary_token_lut[prev]).to(torch.float64) + byte_count += float(tb.sum().item()) + + # --- Phase 2: SHARED UPDATE -- all ranks update with same chunk tokens --- + chunk_start = ci * chunk_tokens + chunk_end = min((ci + 1) * chunk_tokens, total_tokens) + _ngram_bulk_update(val_np, chunk_start, chunk_end + 1, + ctx_tables, full_tables, min_order, max_order, + primes, mask) + + # Cubric 2D c-step: adapt per (order × entropy_bin) + if _con: + # Collect all (order, ent_bin, cnt_bin) cells with enough data + all_rates = [] + for n in range(min_order, max_order + 1): + for cell in range(_TOTAL_CELLS): + if _c_hits[n][cell] >= 8: + all_rates.append(_c_beats[n][cell] / _c_hits[n][cell]) + if len(all_rates) >= 4: + avg_rate = sum(all_rates) / len(all_rates) + for n in range(min_order, max_order + 1): + for cell in range(_TOTAL_CELLS): + if _c_hits[n][cell] >= 8: + rate = _c_beats[n][cell] / _c_hits[n][cell] + if rate > avg_rate + 0.05: + _c_alpha_mult[n][cell] = min(_c_alpha_mult[n][cell] * 1.03, 2.0) + elif rate < avg_rate - 0.05: + _c_alpha_mult[n][cell] = max(_c_alpha_mult[n][cell] * 0.97, 0.3) + _cfired += 1 + if rank == 0 and _cfired % 8 == 0: + parts = [] + for n in range(min_order, max_order + 1): + m = _c_alpha_mult[n] + avg_m = sum(m) / len(m) + parts.append(f"o{n}:avg={avg_m:.2f}") + print(f"cubric3d:step={_cfired} {' '.join(parts)}", flush=True) + _c_hits = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + _c_beats = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + + # Progress + if rank == 0 and (ci % 10 == 0 or ci == num_chunks - 1 or ci < 3): + elapsed = time.perf_counter() - t0 + cur_bpb = (loss_sum / max(token_count, 1.0)) / math.log(2.0) * (token_count / max(byte_count, 1.0)) if token_count > 0 else 0.0 + print( + f"ngram_eval:chunk [{ci+1}/{num_chunks}] bpb={cur_bpb:.6f} t={elapsed:.0f}s", + flush=True, + ) + + # All-reduce across ranks + _loss = torch.tensor(loss_sum, device=device, dtype=torch.float64) + _toks = torch.tensor(token_count, device=device, dtype=torch.float64) + _bytes = torch.tensor(byte_count, device=device, dtype=torch.float64) + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(_loss, op=dist.ReduceOp.SUM) + dist.all_reduce(_toks, op=dist.ReduceOp.SUM) + dist.all_reduce(_bytes, op=dist.ReduceOp.SUM) + loss_sum = _loss.item() + token_count = _toks.item() + byte_count = _bytes.item() + + coverage = token_count / max(total_scored_tokens, 1.0) + if cutoff_hit: + elapsed = time.perf_counter() - t0 + print( + f"ngram_eval:cutoff max_seconds={max_seconds:.1f} " + f"coverage={coverage*100:.2f}% elapsed={elapsed:.0f}s", + flush=True, + ) + + if _con and rank == 0: + print(f"cubric3d:final c_steps={_cfired} cells={_TOTAL_CELLS}x{max_order-min_order+1}={_TOTAL_CELLS*(max_order-min_order+1)}", flush=True) + for n in range(min_order, max_order + 1): + m = _c_alpha_mult[n] + row = " ".join(f"{m[cell]:.2f}" for cell in range(_TOTAL_CELLS)) + print(f" o{n}: [{row}]", flush=True) + val_loss = loss_sum / max(token_count, 1.0) + val_bpb = val_loss / math.log(2.0) * (token_count / max(byte_count, 1.0)) + base_model.train() + return val_loss, val_bpb, coverage +def _classify_param(name: str) -> str: + if "tok_emb" in name or "lm_head" in name: + return "embed" + if "f1_corr_in" in name or "f1_corr_out" in name: + return "aux" + if ".mlp." in name: + return "mlp" + if ".attn." in name or (".proj." in name and ".mlp." not in name): + return "attn" + return "other" +# --------------------------------------------------------------------------- +# GPTQ: Hessian-aware quantization with column-wise error compensation +# --------------------------------------------------------------------------- +def _find_best_row_scales(W: Tensor, clip_range: int = 31) -> Tensor: + """Find optimal per-row scales by searching percentile clipping thresholds.""" + t32 = W.float() + best_s = t32.abs().amax(dim=1) / clip_range + best_s = best_s.clamp_min(1.0 / clip_range) + best_err = torch.full((t32.shape[0],), float('inf')) + for pct in [0.9990, 0.9995, 0.9999, 0.99999, 1.0]: + if pct < 1.0: + row_clip = torch.quantile(t32.abs(), pct, dim=1) + else: + row_clip = t32.abs().amax(dim=1) + s = (row_clip / clip_range).clamp_min(1.0 / clip_range) + q = torch.clamp(torch.round(t32 / s[:, None]), -clip_range, clip_range) + recon = q * s[:, None] + err = (t32 - recon).pow(2).mean(dim=1) + improved = err < best_err + best_s[improved] = s[improved] + best_err[improved] = err[improved] + return best_s +def gptq_quantize_weight(W: Tensor, H: Tensor, clip_range: int = 31, + block_size: int = 64, percdamp: float = 0.002) -> tuple[Tensor, Tensor]: + """GPTQ: quantize weight matrix W using Hessian H = X^T X for error compensation. + Uses pre-computed per-row scales and column reordering by Hessian diagonal. + Returns (quantized_int8, scale_fp16) in int6 range [-clip_range, clip_range].""" + W = W.float().clone() + rows, cols = W.shape + # Pre-compute optimal per-row scales from the original weight matrix + row_scale = _find_best_row_scales(W, clip_range) + H = H.float().clone() + damp = percdamp * H.diag().mean() + H.diagonal().add_(damp) + # Column reordering: process least-important columns first (ascending H_diag) + perm = torch.argsort(H.diag()) + invperm = torch.argsort(perm) + W = W[:, perm] + H = H[perm][:, perm] + try: + L = torch.linalg.cholesky(H) + Hinv = torch.cholesky_inverse(L) + except torch._C._LinAlgError: + Hinv = torch.diag(1.0 / H.diag().clamp_min(1e-6)) + Q = torch.zeros(rows, cols, dtype=torch.int8) + for i1 in range(0, cols, block_size): + i2 = min(i1 + block_size, cols) + W_block = W[:, i1:i2].clone() + Hinv_block = Hinv[i1:i2, i1:i2] + Err = torch.zeros_like(W_block) + for j in range(i2 - i1): + w_col = W_block[:, j] + h_inv_jj = Hinv_block[j, j].clamp_min(1e-8) + # Quantize using pre-computed per-row scales + q_col = torch.clamp(torch.round(w_col / row_scale), -clip_range, clip_range) + deq_col = q_col * row_scale + Q[:, i1 + j] = q_col.to(torch.int8) + err = (w_col - deq_col) / h_inv_jj + Err[:, j] = err + if j + 1 < i2 - i1: + W_block[:, j + 1:] -= err.unsqueeze(1) * Hinv_block[j, j + 1:].unsqueeze(0) + if i2 < cols: + W[:, i2:] -= Err @ Hinv[i1:i2, i2:] + # Undo column reordering + Q = Q[:, invperm] + return Q, row_scale.to(torch.float16) +def gptq_calibrate(model: nn.Module, train_pattern: str, device: torch.device, + n_samples: int = 256, seq_len: int = 2048) -> dict[str, Tensor]: + """Collect Hessian H = X^T X for each linear layer using training data.""" + hessians: dict[str, Tensor] = {} + n_seen: dict[str, int] = {} + hooks = [] + def make_hook(name: str): + def hook_fn(module, inp, out): + x = inp[0].detach().float() + if x.ndim == 3: + x = x.reshape(-1, x.shape[-1]) + if name not in hessians: + hessians[name] = torch.zeros(x.shape[1], x.shape[1], device=x.device, dtype=torch.float32) + n_seen[name] = 0 + hessians[name].addmm_(x.t(), x) + n_seen[name] += x.shape[0] + return hook_fn + for name, module in model.named_modules(): + if isinstance(module, (nn.Linear, CastedLinear)): + hooks.append(module.register_forward_hook(make_hook(name))) + stream = TokenStream(train_pattern) + model.eval() + with torch.no_grad(): + for _ in range(n_samples): + tokens = stream.take(seq_len + 1).to(device=device, dtype=torch.int64) + x = tokens[:-1].unsqueeze(0) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + model.forward_logits(x) + for h in hooks: + h.remove() + for name in hessians: + hessians[name] /= max(n_seen[name], 1) + return hessians +def mixed_quantize_int6_gptq(state_dict: dict[str, Tensor], int6_cats: set[str], + hessians: dict[str, Tensor]) -> tuple[dict, dict]: + """Like mixed_quantize_int6 but uses GPTQ for int6 categories when Hessian available.""" + result: dict[str, Tensor] = {} + meta: dict[str, object] = {} + gptq_count, naive_count = 0, 0 + for name, tensor in state_dict.items(): + t = tensor.detach().cpu().contiguous() + cat = _classify_param(name) + if not t.is_floating_point() or t.numel() <= 65536: + result[name] = t.to(torch.float16) if t.is_floating_point() else t + meta[name] = "passthrough" + continue + if any(p in name for p in CONTROL_TENSOR_NAME_PATTERNS): + result[name] = t.float() + meta[name] = "passthrough_ctrl" + continue + if cat in int6_cats and t.ndim == 2: + module_name = name.rsplit(".weight", 1)[0] if name.endswith(".weight") else name + H = hessians.get(module_name) + if H is not None and H.shape[0] == t.shape[1]: + q, s = gptq_quantize_weight(t, H.cpu()) + gptq_count += 1 + else: + q, s = quantize_int6_per_row(t) + naive_count += 1 + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int6"} + elif cat in int6_cats and t.ndim >= 1: + q, s = quantize_int6_per_row(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int6"} + naive_count += 1 + else: + q, s = quantize_float_tensor(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int8"} + print(f"gptq_quantize: {gptq_count} GPTQ layers, {naive_count} naive layers", flush=True) + return result, meta +def quantize_int6_per_row(t: Tensor, clip_range: int = 31) -> tuple[Tensor, Tensor]: + t32 = t.float() + if t32.ndim == 2: + best_q, best_s, best_err = None, None, float('inf') + for pct in [0.9990, 0.9995, 0.9999, 0.99999, 1.0]: + if pct < 1.0: + row_clip = torch.quantile(t32.abs(), pct, dim=1) + else: + row_clip = t32.abs().amax(dim=1) + s = (row_clip / clip_range).clamp_min(1.0 / clip_range).to(torch.float16) + q = torch.clamp(torch.round(t32 / s.float()[:, None]), -clip_range, clip_range).to(torch.int8) + recon = q.float() * s.float()[:, None] + err = (t32 - recon).pow(2).mean().item() + if err < best_err: + best_q, best_s, best_err = q, s, err + return best_q, best_s + amax = t32.abs().max().item() + scale = torch.tensor(amax / clip_range if amax > 0 else 1.0, dtype=torch.float16) + q = torch.clamp(torch.round(t32 / scale.float()), -clip_range, clip_range).to(torch.int8) + return q, scale +def mixed_quantize_int6(state_dict: dict[str, Tensor], int6_cats: set[str]): + num_layers_total = max( + (int(k.split(".")[1]) for k in state_dict if k.startswith("blocks.")), + default=0, + ) + 1 + late_k_layers = set(range(num_layers_total - 2, num_layers_total)) + result: dict[str, Tensor] = {} + meta: dict[str, object] = {} + for name, tensor in state_dict.items(): + t = tensor.detach().cpu().contiguous() + cat = _classify_param(name) + if not t.is_floating_point() or t.numel() <= 65536: + result[name] = t.to(torch.float16) if t.is_floating_point() else t + meta[name] = "passthrough" + continue + if any(p in name for p in CONTROL_TENSOR_NAME_PATTERNS): + result[name] = t.float() + meta[name] = "passthrough_ctrl" + continue + if cat in int6_cats and t.ndim >= 1: + q, s = quantize_int6_per_row(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int6"} + else: + q, s = quantize_float_tensor(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int8"} + return result, meta +def dequantize_mixed_int6(result: dict[str, Tensor], meta: dict[str, object], + template_sd: dict[str, Tensor]) -> dict[str, Tensor]: + out: dict[str, Tensor] = {} + for name, orig in template_sd.items(): + info = meta.get(name) + if info is None: + continue + orig_dtype = orig.dtype + if info in ("passthrough", "passthrough_ctrl", "passthrough_fp16"): + t = result[name] + if t.dtype == torch.float16 and orig_dtype in (torch.float32, torch.bfloat16): + t = t.to(orig_dtype) + out[name] = t + continue + q, s = result[name + ".q"], result[name + ".scale"] + if s.ndim > 0: + out[name] = (q.float() * s.float().view(q.shape[0], *([1] * (q.ndim - 1)))).to(orig_dtype) + else: + out[name] = (q.float() * float(s.item())).to(orig_dtype) + return out +def main() -> None: + global zeropower_via_newtonschulz5 + code = Path(__file__).read_text(encoding="utf-8") + args = Hyperparameters() + if args.compile_enabled: + zeropower_via_newtonschulz5 = torch.compile(zeropower_via_newtonschulz5) + distributed = "RANK" in os.environ and "WORLD_SIZE" in os.environ + rank = int(os.environ.get("RANK", "0")) + world_size = int(os.environ.get("WORLD_SIZE", "1")) + local_rank = int(os.environ.get("LOCAL_RANK", "0")) + if world_size <= 0: + raise ValueError(f"WORLD_SIZE must be positive, got {world_size}") + if 8 % world_size != 0: + raise ValueError(f"WORLD_SIZE={world_size} must divide 8 so grad_accum_steps stays integral") + grad_accum_steps = 8 // world_size + grad_scale = 1.0 / grad_accum_steps + if not torch.cuda.is_available(): + raise RuntimeError("CUDA is required") + device = torch.device("cuda", local_rank) + torch.cuda.set_device(device) + if distributed: + dist.init_process_group(backend="nccl", device_id=device) + dist.barrier() + master_process = rank == 0 + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True + from torch.backends.cuda import enable_cudnn_sdp, enable_flash_sdp, enable_math_sdp, enable_mem_efficient_sdp + enable_cudnn_sdp(False) + enable_flash_sdp(True) + enable_mem_efficient_sdp(False) + enable_math_sdp(False) + logfile = None + if master_process: + os.makedirs("logs", exist_ok=True) + logfile = f"logs/{args.run_id}.txt" + print(logfile) + def log0(msg: str, console: bool = True) -> None: + if not master_process: + return + if console: + print(msg) + if logfile is not None: + with open(logfile, "a", encoding="utf-8") as f: + print(msg, file=f) + log0(code, console=False) + log0("=" * 100, console=False) + log0(f"Running Python {sys.version}", console=False) + log0(f"Running PyTorch {torch.__version__}", console=False) + log0( + subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=False).stdout, + console=False, + ) + log0("=" * 100, console=False) + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.cuda.manual_seed_all(args.seed) + if not args.tokenizer_path.endswith(".model"): + raise ValueError(f"Script only setup for SentencePiece .model file: {args.tokenizer_path}") + sp = spm.SentencePieceProcessor(model_file=args.tokenizer_path) + if int(sp.vocab_size()) != args.vocab_size: + raise ValueError( + f"VOCAB_SIZE={args.vocab_size} does not match tokenizer vocab_size={int(sp.vocab_size())}" + ) + dataset_dir = Path(args.data_path).resolve() + actual_train_files = len(list(dataset_dir.glob("fineweb_train_*.bin"))) + effective_eval_seq_len = args.eval_seq_len if args.eval_seq_len > 0 else args.train_seq_len + val_seq_len = max(args.train_seq_len, effective_eval_seq_len) + val_tokens = load_validation_tokens(args.val_files, val_seq_len) + base_bytes_lut, has_leading_space_lut, is_boundary_token_lut = build_sentencepiece_luts( + sp, args.vocab_size, device + ) + log0(f"val_bpb:enabled tokenizer_kind=sentencepiece tokenizer_path={args.tokenizer_path}") + log0(f"train_loader:dataset:{dataset_dir.name} train_shards:{actual_train_files}") + log0(f"val_loader:shards pattern={args.val_files} tokens:{val_tokens.numel() - 1}") + CastedLinear._qat_enabled = args.qat_enabled + base_model = GPT( + vocab_size=args.vocab_size, + num_layers=args.num_layers, + model_dim=args.model_dim, + num_heads=args.num_heads, + num_kv_heads=args.num_kv_heads, + mlp_mult=args.mlp_mult, + tie_embeddings=args.tie_embeddings, + tied_embed_init_std=args.tied_embed_init_std, + logit_softcap=args.logit_softcap, + rope_base=args.rope_base, + qk_gain_init=args.qk_gain_init, + mtp_num_heads=args.mtp_num_heads, + mtp_loss_weight=args.mtp_loss_weight, + bigram_vocab_size=args.bigram_vocab_size, + bigram_dim=args.bigram_dim, + xsa_last_n=args.xsa_last_n, + rope_dims=args.rope_dims, + ln_scale=args.ln_scale, + dtg=args.dtg_enabled, + ve_enabled=args.ve_enabled, + ve_dim=args.ve_dim, + ve_layers=args.ve_layers, + mlp_act=args.mlp_act, + mlp_leaky_slope=args.mlp_leaky_slope, + f1_corr_rank=args.f1_corr_rank, + f1_corr_scale_init=args.f1_corr_scale_init, + ).to(device).bfloat16() + for module in base_model.modules(): + if isinstance(module, CastedLinear): + module.float() + restore_low_dim_params_to_fp32(base_model) + # Complementary training: downweight tokens predictable by bigrams + complement_alpha = float(os.environ.get("COMPLEMENT_ALPHA", "0")) + if complement_alpha > 0: + tracker = TrainNgramTracker(args.vocab_size, device, complement_alpha=complement_alpha) + base_model._ngram_tracker = tracker + log0(f"complementary_training:alpha={complement_alpha}") + else: + base_model._ngram_tracker = None + compiled_model = maybe_torch_compile(base_model, args) + model: nn.Module = DDP(compiled_model, device_ids=[local_rank], broadcast_buffers=False) if distributed else compiled_model + block_named_params = list(base_model.blocks.named_parameters()) + matrix_params = [ + p + for name, p in block_named_params + if p.ndim == 2 and not any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS) + ] + if base_model.mtp_num_heads > 0: + matrix_params.extend([p for p in base_model.mtp_heads.parameters() if p.ndim == 2]) + if base_model.f1_corr_in is not None and base_model.f1_corr_out is not None: + matrix_params.append(base_model.f1_corr_in.weight) + matrix_params.append(base_model.f1_corr_out.weight) + scalar_params = [ + p + for name, p in block_named_params + if p.ndim < 2 or any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS) + ] + if base_model.skip_weights.numel() > 0: + scalar_params.append(base_model.skip_weights) + scalar_params.append(base_model.smear.gate) + if base_model.bigram is not None: + scalar_params.append(base_model.bigram.scale) + if base_model.f1_corr_scale is not None: + scalar_params.append(base_model.f1_corr_scale) + token_lr = args.tied_embed_lr if args.tie_embeddings else args.embed_lr + tok_params = [{"params": [base_model.tok_emb.weight], "lr": token_lr, "base_lr": token_lr}] + if base_model.bigram is not None: + tok_params.append({"params": [base_model.bigram.embed.weight], "lr": token_lr, "base_lr": token_lr}) + if base_model.bigram.proj is not None: + matrix_params.append(base_model.bigram.proj.weight) + if base_model.ve_shared is not None: + tok_params.append({"params": [base_model.ve_shared.embed.weight], "lr": token_lr, "base_lr": token_lr}) + if base_model.ve_shared.proj is not None: + matrix_params.append(base_model.ve_shared.proj.weight) + scalar_params.append(base_model.ve_shared.scale) + for s in base_model.ve_layer_scales: + scalar_params.append(s) + optimizer_tok = torch.optim.AdamW( + tok_params, + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + weight_decay=args.adam_wd, + fused=True, + ) + optimizer_muon = Muon( + matrix_params, + lr=args.matrix_lr, + momentum=args.muon_momentum, + backend_steps=args.muon_backend_steps, + weight_decay=args.muon_wd, + ) + for group in optimizer_muon.param_groups: + group["base_lr"] = args.matrix_lr + optimizer_scalar = torch.optim.AdamW( + [{"params": scalar_params, "lr": args.scalar_lr, "base_lr": args.scalar_lr}], + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + weight_decay=args.adam_wd, + fused=True, + ) + optimizers: list[torch.optim.Optimizer] = [optimizer_tok, optimizer_muon, optimizer_scalar] + if base_model.lm_head is not None: + optimizer_head = torch.optim.Adam( + [{"params": [base_model.lm_head.weight], "lr": args.head_lr, "base_lr": args.head_lr}], + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + fused=True, + ) + optimizers.insert(1, optimizer_head) + n_params = sum(p.numel() for p in base_model.parameters()) + f1_corr_params = 0 + if base_model.f1_corr_in is not None and base_model.f1_corr_out is not None: + f1_corr_params = int(base_model.f1_corr_in.weight.numel() + base_model.f1_corr_out.weight.numel()) + est_corr_int6_bytes = 0 + if args.f1_corr_rank > 0: + # int8 payload stores int6 values + per-row fp16 scales. + est_corr_int6_bytes = ( + args.f1_corr_rank * (args.model_dim + args.vocab_size) + + 2 * (args.f1_corr_rank + args.vocab_size) + ) + log0(f"model_params:{n_params}") + log0( + f"f1_corr:rank={args.f1_corr_rank} params={f1_corr_params} " + f"est_int6_bytes~{est_corr_int6_bytes}" + ) + log0(f"mlp_act:{args.mlp_act} mlp_leaky_slope:{args.mlp_leaky_slope}") + log0(f"XSA:last_{args.xsa_last_n} world_size:{world_size} grad_accum_steps:{grad_accum_steps}") + log0(f"num_heads:{args.num_heads} num_kv_heads:{args.num_kv_heads} embed_lr:{token_lr} matrix_lr:{args.matrix_lr}") + log0( + f"train_batch_tokens:{args.train_batch_tokens} train_seq_len:{args.train_seq_len} " + f"iterations:{args.iterations} warmup_steps:{args.warmup_steps} " + f"max_wallclock_seconds:{args.max_wallclock_seconds:.3f}" + ) + log0(f"compile:enabled={int(args.compile_enabled)} fullgraph={int(args.compile_fullgraph)}") + log0(f"seed:{args.seed}") + if args.ngram_eval_order >= 2: + log0( + f"ngram_eval:order={args.ngram_eval_order} alpha={args.ngram_eval_alpha} " + f"min_count={args.ngram_eval_min_count} buckets={args.ngram_eval_buckets}" + ) + train_loader = DistributedTokenLoader(args.train_files, rank, world_size, device) + def zero_grad_all() -> None: + for opt in optimizers: + opt.zero_grad(set_to_none=True) + max_wallclock_ms = 1000.0 * args.max_wallclock_seconds if args.max_wallclock_seconds > 0 else None + def lr_mul(step: int, elapsed_ms: float) -> float: + if args.warmdown_iters <= 0: + return 1.0 + if max_wallclock_ms is None: + warmdown_start = max(args.iterations - args.warmdown_iters, 0) + return max((args.iterations - step) / max(args.warmdown_iters, 1), 0.0) if warmdown_start <= step < args.iterations else 1.0 + step_ms = elapsed_ms / max(step, 1) + warmdown_ms = args.warmdown_iters * step_ms + remaining_ms = max(max_wallclock_ms - elapsed_ms, 0.0) + return remaining_ms / max(warmdown_ms, 1e-9) if remaining_ms <= warmdown_ms else 1.0 + if args.warmup_steps > 0: + initial_model_state = {name: tensor.detach().cpu().clone() for name, tensor in base_model.state_dict().items()} + initial_optimizer_states = [copy.deepcopy(opt.state_dict()) for opt in optimizers] + model.train() + for warmup_step in range(args.warmup_steps): + zero_grad_all() + for micro_step in range(grad_accum_steps): + if distributed: + model.require_backward_grad_sync = micro_step == grad_accum_steps - 1 + x, y = train_loader.next_batch(args.train_batch_tokens, args.train_seq_len, grad_accum_steps) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + warmup_loss = model(x, y) + (warmup_loss * grad_scale).backward() + for opt in optimizers: + opt.step() + zero_grad_all() + if args.warmup_steps <= 20 or (warmup_step + 1) % 10 == 0 or warmup_step + 1 == args.warmup_steps: + log0(f"warmup_step:{warmup_step + 1}/{args.warmup_steps}") + base_model.load_state_dict(initial_model_state, strict=True) + for opt, state in zip(optimizers, initial_optimizer_states, strict=True): + opt.load_state_dict(state) + zero_grad_all() + if distributed: + model.require_backward_grad_sync = True + train_loader = DistributedTokenLoader(args.train_files, rank, world_size, device) + swa_state: dict[str, Tensor] | None = None + swa_count = 0 + ema_state = {name: t.detach().float().clone() for name, t in base_model.state_dict().items()} + ema_decay = 0.997 + training_time_ms = 0.0 + stop_after_step: int | None = None + torch.cuda.synchronize() + t0 = time.perf_counter() + step = 0 + while True: + last_step = step == args.iterations or (stop_after_step is not None and step >= stop_after_step) + should_validate = last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0) + if should_validate: + torch.cuda.synchronize() + training_time_ms += 1000.0 * (time.perf_counter() - t0) + val_loss, val_bpb = eval_val( + args, + model, + rank, + world_size, + device, + grad_accum_steps, + val_tokens, + base_bytes_lut, + has_leading_space_lut, + is_boundary_token_lut, + ) + log0( + f"step:{step}/{args.iterations} val_loss:{val_loss:.4f} val_bpb:{val_bpb:.4f} " + f"train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms / max(step, 1):.2f}ms" + ) + torch.cuda.synchronize() + t0 = time.perf_counter() + if last_step: + if stop_after_step is not None and step < args.iterations: + log0( + f"stopping_early: wallclock_cap train_time:{training_time_ms:.0f}ms " + f"step:{step}/{args.iterations}" + ) + break + elapsed_ms = training_time_ms + 1000.0 * (time.perf_counter() - t0) + scale = lr_mul(step, elapsed_ms) + if args.late_qat_threshold > 0 and scale < args.late_qat_threshold and not CastedLinear._qat_enabled: + CastedLinear._qat_enabled = True + log0(f"late_qat:enabled step:{step} scale:{scale:.4f}") + zero_grad_all() + train_loss = torch.zeros((), device=device) + for micro_step in range(grad_accum_steps): + if distributed: + model.require_backward_grad_sync = micro_step == grad_accum_steps - 1 + x, y = train_loader.next_batch(args.train_batch_tokens, args.train_seq_len, grad_accum_steps) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + loss = model(x, y) + train_loss += loss.detach() + loss.backward() + if base_model._ngram_tracker is not None: + base_model._ngram_tracker.update(x, y) + train_loss /= grad_accum_steps + frac = min(step / args.muon_momentum_warmup_steps, 1.0) if args.muon_momentum_warmup_steps > 0 else 1.0 + muon_momentum = (1 - frac) * args.muon_momentum_warmup_start + frac * args.muon_momentum + for group in optimizer_muon.param_groups: + group["momentum"] = muon_momentum + for opt in optimizers: + for group in opt.param_groups: + group["lr"] = group["base_lr"] * scale + if args.grad_clip_norm > 0: + torch.nn.utils.clip_grad_norm_(base_model.parameters(), args.grad_clip_norm) + for opt in optimizers: + opt.step() + zero_grad_all() + # EMA update + with torch.no_grad(): + for name, t in base_model.state_dict().items(): + ema_state[name].mul_(ema_decay).add_(t.detach().float(), alpha=1.0 - ema_decay) + step += 1 + approx_training_time_ms = training_time_ms + 1000.0 * (time.perf_counter() - t0) + if args.swa_enabled and scale < 0.2 and step % args.swa_every == 0: + if swa_state is None: + swa_state = {name: t.detach().cpu().clone() for name, t in base_model.state_dict().items()} + swa_count = 1 + log0(f"swa:start step:{step}") + else: + for name, t in base_model.state_dict().items(): + swa_state[name] += t.detach().cpu() + swa_count += 1 + should_log_train = ( + args.train_log_every > 0 + and (step <= 10 or step % args.train_log_every == 0 or stop_after_step is not None) + ) + if should_log_train: + log0( + f"step:{step}/{args.iterations} train_loss:{train_loss.item():.4f} " + f"train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms / step:.2f}ms" + ) + reached_cap = max_wallclock_ms is not None and approx_training_time_ms >= max_wallclock_ms + if distributed and max_wallclock_ms is not None: + reached_cap_tensor = torch.tensor(int(reached_cap), device=device) + dist.all_reduce(reached_cap_tensor, op=dist.ReduceOp.MAX) + reached_cap = bool(reached_cap_tensor.item()) + if stop_after_step is None and reached_cap: + stop_after_step = step + log0( + f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB " + f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB" + ) + # GPTQ calibration: collect Hessians from training data DURING training phase + # (must happen before training ends to comply with eval-time data access rules) + log0("gptq:calibrating with training data...") + t_gptq = time.perf_counter() + gptq_hessians = gptq_calibrate(base_model, args.train_files, device, n_samples=256, seq_len=args.train_seq_len) + log0(f"gptq:calibrated {len(gptq_hessians)} layers in {time.perf_counter()-t_gptq:.1f}s") + if args.distill_enabled and args.distill_steps > 0: + log0( + f"distill:start steps:{args.distill_steps} lr_factor:{args.distill_lr_factor} " + f"temp:{args.distill_temperature} alpha:{args.distill_alpha} kl_clip:{args.distill_kl_clip}" + ) + current_state = base_model.state_dict() + teacher_state = {name: t.to(dtype=current_state[name].dtype) for name, t in ema_state.items()} + teacher_model = GPT( + vocab_size=args.vocab_size, num_layers=args.num_layers, model_dim=args.model_dim, + num_heads=args.num_heads, num_kv_heads=args.num_kv_heads, mlp_mult=args.mlp_mult, + tie_embeddings=args.tie_embeddings, tied_embed_init_std=args.tied_embed_init_std, + logit_softcap=args.logit_softcap, rope_base=args.rope_base, qk_gain_init=args.qk_gain_init, + mtp_num_heads=args.mtp_num_heads, mtp_loss_weight=args.mtp_loss_weight, + bigram_vocab_size=args.bigram_vocab_size, bigram_dim=args.bigram_dim, + xsa_last_n=args.xsa_last_n, rope_dims=args.rope_dims, ln_scale=args.ln_scale, dtg=args.dtg_enabled, + ve_enabled=args.ve_enabled, ve_dim=args.ve_dim, ve_layers=args.ve_layers, + mlp_act=args.mlp_act, mlp_leaky_slope=args.mlp_leaky_slope, + f1_corr_rank=args.f1_corr_rank, f1_corr_scale_init=args.f1_corr_scale_init, + ).to(device).bfloat16() + for m in teacher_model.modules(): + if isinstance(m, CastedLinear): + m.float() + restore_low_dim_params_to_fp32(teacher_model) + teacher_model.load_state_dict(teacher_state, strict=True) + teacher_model.eval() + for p in teacher_model.parameters(): + p.requires_grad_(False) + compiled_teacher_logits = maybe_torch_compile(teacher_model.forward_logits, args) + model.train() + T = args.distill_temperature + alpha = args.distill_alpha + for d_step in range(args.distill_steps): + zero_grad_all() + for opt in optimizers: + for group in opt.param_groups: + group["lr"] = group["base_lr"] * args.distill_lr_factor + x, y = train_loader.next_batch(args.train_batch_tokens, args.train_seq_len, grad_accum_steps) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + student_logits = base_model.forward_logits(x) + with torch.no_grad(): + teacher_logits = compiled_teacher_logits(x) + student_log_probs = F.log_softmax(student_logits.float() / T, dim=-1) + teacher_probs = F.softmax(teacher_logits.float() / T, dim=-1) + token_kl = F.kl_div(student_log_probs, teacher_probs, reduction="none").sum(dim=-1) + kl_loss = token_kl.mean() * (T * T) + if args.distill_kl_clip > 0: + kl_loss = torch.clamp(kl_loss, max=args.distill_kl_clip) + ce_loss = F.cross_entropy( + student_logits.reshape(-1, student_logits.size(-1)).float(), + y.reshape(-1), + reduction="mean", + ) + loss = alpha * kl_loss + (1.0 - alpha) * ce_loss + (loss * grad_scale).backward() + if world_size > 1: + for p in base_model.parameters(): + if p.grad is not None: + dist.all_reduce(p.grad, op=dist.ReduceOp.AVG) + if args.grad_clip_norm > 0: + torch.nn.utils.clip_grad_norm_(base_model.parameters(), args.grad_clip_norm) + for opt in optimizers: + opt.step() + zero_grad_all() + with torch.no_grad(): + for name, t in base_model.state_dict().items(): + ema_state[name].mul_(ema_decay).add_(t.detach().float(), alpha=1.0 - ema_decay) + if (d_step + 1) % 8 == 0 or d_step == 0: + log0( + f"distill:step:{d_step + 1}/{args.distill_steps} " + f"kl:{kl_loss.item():.4f} ce:{ce_loss.item():.4f} total:{loss.item():.4f}" + ) + del teacher_model, compiled_teacher_logits + torch.cuda.empty_cache() + log0("distill:done") + # Apply EMA weights (better than SWA alone per PR#401) + log0("ema:applying EMA weights") + current_state = base_model.state_dict() + avg_state = {name: t.to(dtype=current_state[name].dtype) for name, t in ema_state.items()} + base_model.load_state_dict(avg_state, strict=True) + torch.cuda.synchronize() + t_diag = time.perf_counter() + diag_val_loss, diag_val_bpb = eval_val( + args, compiled_model, rank, world_size, device, grad_accum_steps, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + ) + torch.cuda.synchronize() + log0( + f"DIAGNOSTIC post_ema val_loss:{diag_val_loss:.4f} val_bpb:{diag_val_bpb:.4f} " + f"eval_time:{1000.0 * (time.perf_counter() - t_diag):.0f}ms" + ) + full_state_dict = base_model.state_dict() + export_sd = {k: v for k, v in full_state_dict.items() if "mtp_heads" not in k} + excluded_mtp = sum(int(t.numel()) for k, t in full_state_dict.items() if "mtp_heads" in k) + if excluded_mtp > 0: + log0(f"export_excluding_mtp_params:{excluded_mtp}") + if master_process: + torch.save(export_sd, "final_model.pt") + model_bytes = os.path.getsize("final_model.pt") + code_bytes = len(code.encode("utf-8")) + log0(f"Serialized model: {model_bytes} bytes") + log0(f"Code size: {code_bytes} bytes") + sd_cpu = {k: v.detach().cpu() for k, v in export_sd.items()} + # GPTQ quantization using Hessians collected during training phase (no training data access here) + quant_result, quant_meta = mixed_quantize_int6_gptq(sd_cpu, {"mlp", "attn", "aux"}, gptq_hessians) + quant_buf = io.BytesIO() + torch.save({"w": quant_result, "m": quant_meta}, quant_buf) + quant_raw = quant_buf.getvalue() + quant_blob = zstandard.ZstdCompressor(level=22).compress(quant_raw) if _COMPRESSOR == "zstd" else zlib.compress(quant_raw, 9) + if master_process: + with open("final_model.int6.ptz", "wb") as f: + f.write(quant_blob) + quant_file_bytes = len(quant_blob) + code_bytes = len(code.encode("utf-8")) + log0(f"Serialized model int6+{_COMPRESSOR}: {quant_file_bytes} bytes") + log0(f"Total submission size int6+{_COMPRESSOR}: {quant_file_bytes + code_bytes} bytes") + log0(f"Total submission size int8+zlib: {quant_file_bytes + code_bytes} bytes") + if distributed: + dist.barrier() + with open("final_model.int6.ptz", "rb") as f: + quant_blob_disk = f.read() + quant_state = torch.load( + io.BytesIO(zstandard.ZstdDecompressor().decompress(quant_blob_disk) if _COMPRESSOR == "zstd" else zlib.decompress(quant_blob_disk)), + map_location="cpu", + ) + deq_state = dequantize_mixed_int6(quant_state["w"], quant_state["m"], sd_cpu) + eval_model = GPT( + vocab_size=args.vocab_size, num_layers=args.num_layers, model_dim=args.model_dim, + num_heads=args.num_heads, num_kv_heads=args.num_kv_heads, mlp_mult=args.mlp_mult, + tie_embeddings=args.tie_embeddings, tied_embed_init_std=args.tied_embed_init_std, + logit_softcap=args.logit_softcap, rope_base=args.rope_base, qk_gain_init=args.qk_gain_init, + mtp_num_heads=0, mtp_loss_weight=0.0, + bigram_vocab_size=args.bigram_vocab_size, bigram_dim=args.bigram_dim, + xsa_last_n=args.xsa_last_n, # must match training model + rope_dims=args.rope_dims, ln_scale=args.ln_scale, dtg=args.dtg_enabled, + ve_enabled=args.ve_enabled, ve_dim=args.ve_dim, ve_layers=args.ve_layers, + mlp_act=args.mlp_act, mlp_leaky_slope=args.mlp_leaky_slope, + f1_corr_rank=args.f1_corr_rank, f1_corr_scale_init=args.f1_corr_scale_init, + ).to(device).bfloat16() + for m in eval_model.modules(): + if isinstance(m, CastedLinear): + m.float() + restore_low_dim_params_to_fp32(eval_model) + eval_model.load_state_dict(deq_state, strict=True) + compiled_eval = maybe_torch_compile(eval_model, args) + torch.cuda.synchronize() + t_qeval = time.perf_counter() + q_val_loss, q_val_bpb = eval_val( + args, compiled_eval, rank, world_size, device, grad_accum_steps, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + eval_seq_len=effective_eval_seq_len, + ) + torch.cuda.synchronize() + log0( + f"final_int6_roundtrip val_loss:{q_val_loss:.4f} val_bpb:{q_val_bpb:.4f} " + f"eval_time:{1000.0 * (time.perf_counter() - t_qeval):.0f}ms" + ) + log0(f"final_int6_roundtrip_exact val_loss:{q_val_loss:.8f} val_bpb:{q_val_bpb:.8f}") + sw_seq_len = effective_eval_seq_len + if args.eval_stride > 0 and args.eval_stride < sw_seq_len: + torch.cuda.synchronize() + t_slide = time.perf_counter() + sw_val_loss, sw_val_bpb = eval_val_sliding( + args, eval_model, rank, world_size, device, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + stride=args.eval_stride, + eval_seq_len=sw_seq_len, + ) + torch.cuda.synchronize() + log0( + f"final_int6_sliding_window val_loss:{sw_val_loss:.4f} val_bpb:{sw_val_bpb:.4f} " + f"stride:{args.eval_stride} eval_time:{1000.0 * (time.perf_counter() - t_slide):.0f}ms" + ) + log0(f"final_int6_sliding_window_exact val_loss:{sw_val_loss:.8f} val_bpb:{sw_val_bpb:.8f}") + log0(f"final_int8_zlib_roundtrip_exact val_loss:{sw_val_loss:.8f} val_bpb:{sw_val_bpb:.8f}") + if args.ngram_eval_order >= 2: + if distributed: + dist.barrier() + torch.cuda.synchronize() + t_ng = time.perf_counter() + ng_loss, ng_bpb, ng_coverage = eval_val_sliding_hashed_ngram( + args, + eval_model, + rank, + world_size, + device, + val_tokens, + base_bytes_lut, + has_leading_space_lut, + is_boundary_token_lut, + stride=args.eval_stride, + order=args.ngram_eval_order, + alpha=args.ngram_eval_alpha, + min_count=args.ngram_eval_min_count, + buckets=args.ngram_eval_buckets, + max_seconds=args.ngram_eval_max_seconds, + eval_seq_len=sw_seq_len, + ) + if rank == 0: + torch.cuda.synchronize() + ng_eval_ms = 1000.0 * (time.perf_counter() - t_ng) + if ng_coverage >= 0.999999: + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order} val_loss:{ng_loss:.4f} " + f"val_bpb:{ng_bpb:.4f} eval_time:{ng_eval_ms:.0f}ms" + ) + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order}_exact " + f"val_loss:{ng_loss:.8f} val_bpb:{ng_bpb:.8f}" + ) + else: + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order}_partial val_loss:{ng_loss:.4f} " + f"val_bpb:{ng_bpb:.4f} coverage:{ng_coverage:.4f} eval_time:{ng_eval_ms:.0f}ms" + ) + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order}_partial_exact " + f"val_loss:{ng_loss:.8f} val_bpb:{ng_bpb:.8f} coverage:{ng_coverage:.8f}" + ) + if distributed: + dist.barrier() + if distributed: + dist.destroy_process_group() +if __name__ == "__main__": + main() diff --git a/experiments/B_wing/bwing_V/train_seed1337.log b/experiments/B_wing/bwing_V/train_seed1337.log new file mode 100644 index 000000000..f31352859 --- /dev/null +++ b/experiments/B_wing/bwing_V/train_seed1337.log @@ -0,0 +1,119 @@ +============================================ + B-WING V — 9-Prime + Cubric 3D + Fixed Mults + Seed: 1337 + Fixed mults -> cubric refinement -> clip 0.95 + CHANGE: 9 primes + cubric ON (stacked, not either/or) +============================================ +W0326 06:58:22.607000 59027 torch/distributed/run.py:803] +W0326 06:58:22.607000 59027 torch/distributed/run.py:803] ***************************************** +W0326 06:58:22.607000 59027 torch/distributed/run.py:803] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0326 06:58:22.607000 59027 torch/distributed/run.py:803] ***************************************** +logs/b2c56a2a-8e7b-49e8-a985-468fc98b29d8.txt +val_bpb:enabled tokenizer_kind=sentencepiece tokenizer_path=./data/tokenizers/fineweb_1024_bpe.model +train_loader:dataset:fineweb10B_sp1024 train_shards:80 +val_loader:shards pattern=./data/datasets/fineweb10B_sp1024/fineweb_val_*.bin tokens:62021632 +complementary_training:alpha=0.5 +model_params:26928220 +f1_corr:rank=0 params=0 est_int6_bytes~0 +mlp_act:leaky_relu_sq mlp_leaky_slope:0.5 +XSA:last_4 world_size:8 grad_accum_steps:1 +num_heads:8 num_kv_heads:4 embed_lr:0.035 matrix_lr:0.025 +train_batch_tokens:786432 train_seq_len:2048 iterations:20000 warmup_steps:20 max_wallclock_seconds:600.000 +compile:enabled=1 fullgraph=0 +seed:1337 +ngram_eval:order=9 alpha=0.3 min_count=2 buckets=8388608 +warmup_step:1/20 +warmup_step:2/20 +warmup_step:3/20 +warmup_step:4/20 +warmup_step:5/20 +warmup_step:6/20 +warmup_step:7/20 +warmup_step:8/20 +warmup_step:9/20 +warmup_step:10/20 +warmup_step:11/20 +warmup_step:12/20 +warmup_step:13/20 +warmup_step:14/20 +warmup_step:15/20 +warmup_step:16/20 +warmup_step:17/20 +warmup_step:18/20 +warmup_step:19/20 +warmup_step:20/20 +step:0/20000 val_loss:6.9317 val_bpb:4.1054 train_time:0ms step_avg:0.02ms +step:1/20000 train_loss:6.9343 train_time:148ms step_avg:147.99ms +step:2/20000 train_loss:8.6212 train_time:230ms step_avg:114.83ms +step:3/20000 train_loss:7.8208 train_time:316ms step_avg:105.45ms +step:4/20000 train_loss:7.1065 train_time:403ms step_avg:100.64ms +step:5/20000 train_loss:6.8531 train_time:489ms step_avg:97.72ms +step:6/20000 train_loss:6.7963 train_time:574ms step_avg:95.68ms +step:7/20000 train_loss:6.6788 train_time:660ms step_avg:94.27ms +step:8/20000 train_loss:6.5597 train_time:746ms step_avg:93.20ms +step:9/20000 train_loss:6.2556 train_time:831ms step_avg:92.35ms +step:10/20000 train_loss:5.9364 train_time:917ms step_avg:91.68ms +step:1000/20000 train_loss:2.2389 train_time:87831ms step_avg:87.83ms +step:2000/20000 train_loss:2.0275 train_time:175846ms step_avg:87.92ms +step:3000/20000 train_loss:2.1272 train_time:263855ms step_avg:87.95ms +step:4000/20000 train_loss:1.9376 train_time:351781ms step_avg:87.95ms +step:5000/20000 train_loss:2.0655 train_time:439690ms step_avg:87.94ms +late_qat:enabled step:5074 scale:0.4998 +step:6000/20000 train_loss:1.9050 train_time:527550ms step_avg:87.93ms +swa:start step:6200 +step:6822/20000 val_loss:1.9220 val_bpb:1.1383 train_time:600085ms step_avg:87.96ms +stopping_early: wallclock_cap train_time:600085ms step:6822/20000 +peak memory allocated: 20677 MiB reserved: 20718 MiB +gptq:calibrating with training data... +gptq:calibrated 68 layers in 3.5s +ema:applying EMA weights +DIAGNOSTIC post_ema val_loss:1.9204 val_bpb:1.1374 eval_time:2126ms +Serialized model: 106047497 bytes +Code size: 105978 bytes +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +Serialized model int6+zstd: 15487333 bytes +Total submission size int6+zstd: 15593311 bytes +Total submission size int8+zlib: 15593311 bytes +final_int5_roundtrip val_loss:1.9297 val_bpb:1.1429 eval_time:39444ms +final_int5_roundtrip_exact val_loss:1.92973725 val_bpb:1.14289909 +final_int5_sliding_window val_loss:1.8898 val_bpb:1.1193 stride:64 eval_time:98943ms +final_int5_sliding_window_exact val_loss:1.88984081 val_bpb:1.11927314 +final_int8_zlib_roundtrip_exact val_loss:1.88984081 val_bpb:1.11927314 +ngram_eval:chunks=60 chunk_tokens=1048576 windows=969088 shared_tables=True +ngram_eval:chunk [1/60] bpb=1.128406 t=15s +ngram_eval:chunk [2/60] bpb=1.212192 t=19s +ngram_eval:chunk [3/60] bpb=1.236329 t=22s +cubric3d:step=8 o2:avg=0.93 o3:avg=0.85 o4:avg=0.98 o5:avg=1.05 o6:avg=1.04 o7:avg=1.03 o8:avg=1.05 o9:avg=1.06 +ngram_eval:chunk [11/60] bpb=1.147661 t=51s +cubric3d:step=16 o2:avg=0.87 o3:avg=0.69 o4:avg=0.99 o5:avg=1.14 o6:avg=1.12 o7:avg=1.13 o8:avg=1.13 o9:avg=1.15 +ngram_eval:chunk [21/60] bpb=0.875098 t=83s +cubric3d:step=24 o2:avg=0.86 o3:avg=0.62 o4:avg=0.98 o5:avg=1.25 o6:avg=1.25 o7:avg=1.26 o8:avg=1.27 o9:avg=1.26 +ngram_eval:chunk [31/60] bpb=0.694500 t=112s +cubric3d:step=32 o2:avg=0.86 o3:avg=0.62 o4:avg=0.98 o5:avg=1.28 o6:avg=1.32 o7:avg=1.30 o8:avg=1.30 o9:avg=1.31 +cubric3d:step=40 o2:avg=0.86 o3:avg=0.62 o4:avg=0.98 o5:avg=1.28 o6:avg=1.29 o7:avg=1.29 o8:avg=1.27 o9:avg=1.27 +ngram_eval:chunk [41/60] bpb=0.578262 t=140s +cubric3d:step=48 o2:avg=0.86 o3:avg=0.62 o4:avg=0.98 o5:avg=1.28 o6:avg=1.29 o7:avg=1.29 o8:avg=1.27 o9:avg=1.26 +ngram_eval:chunk [51/60] bpb=0.503955 t=166s +cubric3d:step=56 o2:avg=0.86 o3:avg=0.62 o4:avg=0.98 o5:avg=1.28 o6:avg=1.29 o7:avg=1.29 o8:avg=1.29 o9:avg=1.29 +ngram_eval:chunk [60/60] bpb=0.460012 t=199s +cubric3d:final c_steps=60 cells=9x8=72 + o2: [0.97 0.89 0.58 1.00 0.94 0.63 1.00 0.97 0.76] + o3: [0.63 0.53 0.48 0.63 0.56 0.51 0.67 0.81 0.71] + o4: [1.00 0.47 0.47 1.56 0.88 0.53 1.23 1.60 1.09] + o5: [0.91 0.48 0.48 2.00 1.70 0.56 1.80 1.97 1.60] + o6: [0.88 0.39 0.47 2.00 1.94 0.63 2.00 2.00 1.30] + o7: [0.94 0.30 0.44 2.00 2.00 0.71 2.00 2.00 1.27] + o8: [1.29 0.30 0.39 2.00 2.00 0.78 2.00 2.00 1.00] + o9: [1.37 0.30 0.30 2.00 2.00 0.30 2.00 2.00 1.55] +final_int5_sliding_window_ngram9 val_loss:0.7769 val_bpb:0.4601 eval_time:205134ms +final_int5_sliding_window_ngram9_exact val_loss:0.77691499 val_bpb:0.46013404 +============================================ + DONE +============================================ diff --git a/experiments/B_wing/bwing_alpha/HYPOTHESIS.md b/experiments/B_wing/bwing_alpha/HYPOTHESIS.md new file mode 100644 index 000000000..7496963ff --- /dev/null +++ b/experiments/B_wing/bwing_alpha/HYPOTHESIS.md @@ -0,0 +1,22 @@ +# B-WING ALPHA — Fix the Alpha Curve + +## Hypothesis +Our alpha clamp (0.75) is leaving massive BPB on the table. PR #809 clips at 0.95, +meaning high-order n-gram matches can almost fully override the model. Combined with +a lower floor (0.05 vs our 0.20), confident model predictions stay clean while +uncertain tokens get aggressively n-gram'd. + +## Changes from X-WING baseline +1. NGRAM_EVAL_ALPHA_MIN: 0.20 → 0.05 +2. NGRAM_EVAL_ALPHA_MAX: 0.75 → 0.60 +3. Alpha CLIP max: 0.75 → 0.95 (in the cubric clip line) +4. Keep cubric 3D adaptive system and warm starts + +## Expected impact +The alpha clip alone should be worth 0.05-0.10 BPB. +The floor fix prevents over-mixing on confident model tokens. + +## What NOT to change +- Keep our cubric 3D system (they don't have it, this is our edge) +- Keep our architecture, training, everything else identical +- Keep entropy center at 3.0 (same as theirs) diff --git a/experiments/B_wing/bwing_alpha/run.sh b/experiments/B_wing/bwing_alpha/run.sh new file mode 100755 index 000000000..5091c7ba0 --- /dev/null +++ b/experiments/B_wing/bwing_alpha/run.sh @@ -0,0 +1,55 @@ +#!/bin/bash +set -euo pipefail +# B-WING ALPHA: Fix alpha curve from PR #809 +# Changes: alpha_min 0.20→0.05, alpha_max 0.75→0.60, clip 0.75→0.95 +# Keep cubric 3D, keep everything else from X-WING + +SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd -- "${SCRIPT_DIR}/../../.." && pwd)" +cd "${REPO_ROOT}" +export PYTHONPATH="${REPO_ROOT}/flash-attention/hopper:${PYTHONPATH:-}" + +SEED="${SEED:-1337}" +NPROC_PER_NODE="${NPROC_PER_NODE:-8}" + +echo "============================================" +echo " B-WING ALPHA — Alpha Curve Fix" +echo " Seed: ${SEED}" +echo " 3D cubric: order × entropy × count (54 mults)" +echo " Complementary training: alpha=0.5" +echo " Eval alpha: 0.05-0.60 clip=0.95 | Orders: 2-9" +echo "============================================" + +SEED="$SEED" \ +F1_CORR_RANK=0 \ +DISTILL_ENABLED=0 \ +MLP_ACT=leaky_relu_sq \ +MLP_LEAKY_SLOPE=0.5 \ +XSA_LAST_N=4 \ +BIGRAM_VOCAB_SIZE=1536 \ +TTT_EVAL_ENABLED=0 \ +ROPE_DIMS=24 \ +VAL_LOSS_EVERY=20000 \ +TRAIN_LOG_EVERY=1000 \ +SWA_EVERY=100 \ +COMPLEMENT_ALPHA=0.5 \ +NGRAM_EVAL_ORDER=9 \ +NGRAM_EVAL_MIN_ORDER=2 \ +NGRAM_EVAL_ADAPTIVE=1 \ +NGRAM_EVAL_ALPHA=0.30 \ +NGRAM_EVAL_ALPHA_MIN=0.05 \ +NGRAM_EVAL_ALPHA_MAX=0.60 \ +NGRAM_EVAL_ENTROPY_CENTER=3.0 \ +NGRAM_EVAL_ENTROPY_SCALE=2.0 \ +NGRAM_EVAL_MIN_COUNT=2 \ +NGRAM_EVAL_BUCKETS=8388608 \ +NGRAM_EVAL_MAX_SECONDS=300 \ +CUBRIC_CADENCE="${CUBRIC_CADENCE:-32}" \ +COMPILE_FULLGRAPH=0 \ +torchrun --standalone --nproc_per_node="${NPROC_PER_NODE}" \ + "${SCRIPT_DIR}/train_gpt.py" \ + 2>&1 | tee "logs/bwing_alpha_s${SEED}_$(date +%Y%m%d_%H%M%S).log" + +echo "============================================" +echo " DONE" +echo "============================================" diff --git a/experiments/B_wing/bwing_alpha/train_gpt.py b/experiments/B_wing/bwing_alpha/train_gpt.py new file mode 100644 index 000000000..b98a73921 --- /dev/null +++ b/experiments/B_wing/bwing_alpha/train_gpt.py @@ -0,0 +1,2118 @@ +from __future__ import annotations +import copy +import glob +import io +import math +import os +import random +import subprocess +import sys +import time +import uuid +import zlib +from pathlib import Path +try: + import zstandard + _COMPRESSOR = "zstd" +except ImportError: + _COMPRESSOR = "zlib" +import numpy as np +import sentencepiece as spm +import torch +import torch.distributed as dist +import torch.nn.functional as F +from torch import Tensor, nn +from torch.nn.parallel import DistributedDataParallel as DDP +try: + from flash_attn_interface import flash_attn_func as flash_attn_3_func +except ImportError: + def flash_attn_3_func(q, k, v, causal=False): + # q: (B, T, Hq, D), k/v: (B, T, Hkv, D) — expand KV for GQA + q2 = q.transpose(1, 2) # (B, Hq, T, D) + k2 = k.transpose(1, 2) # (B, Hkv, T, D) + v2 = v.transpose(1, 2) + if k2.size(1) != q2.size(1): + rep = q2.size(1) // k2.size(1) + k2 = k2.repeat_interleave(rep, dim=1) + v2 = v2.repeat_interleave(rep, dim=1) + out = torch.nn.functional.scaled_dot_product_attention(q2, k2, v2, is_causal=causal) + return out.transpose(1, 2) +class Hyperparameters: + data_path = os.environ.get("DATA_PATH", "./data/datasets/fineweb10B_sp1024") + train_files = os.path.join(data_path, "fineweb_train_*.bin") + val_files = os.path.join(data_path, "fineweb_val_*.bin") + tokenizer_path = os.environ.get("TOKENIZER_PATH", "./data/tokenizers/fineweb_1024_bpe.model") + run_id = os.environ.get("RUN_ID", str(uuid.uuid4())) + seed = int(os.environ.get("SEED", 1337)) + val_batch_size = int(os.environ.get("VAL_BATCH_SIZE", 524_288)) + val_loss_every = int(os.environ.get("VAL_LOSS_EVERY", 4000)) + train_log_every = int(os.environ.get("TRAIN_LOG_EVERY", 500)) + iterations = int(os.environ.get("ITERATIONS", 20000)) + warmdown_iters = int(os.environ.get("WARMDOWN_ITERS", 3500)) + warmup_steps = int(os.environ.get("WARMUP_STEPS", 20)) + train_batch_tokens = int(os.environ.get("TRAIN_BATCH_TOKENS", 786_432)) + train_seq_len = int(os.environ.get("TRAIN_SEQ_LEN", 2048)) + eval_seq_len = int(os.environ.get("EVAL_SEQ_LEN", 2048)) + max_wallclock_seconds = float(os.environ.get("MAX_WALLCLOCK_SECONDS", 600.0)) + qk_gain_init = float(os.environ.get("QK_GAIN_INIT", 1.5)) + vocab_size = int(os.environ.get("VOCAB_SIZE", 1024)) + num_layers = int(os.environ.get("NUM_LAYERS", 11)) + num_kv_heads = int(os.environ.get("NUM_KV_HEADS", 4)) + model_dim = int(os.environ.get("MODEL_DIM", 512)) + num_heads = int(os.environ.get("NUM_HEADS", 8)) + mlp_mult = float(os.environ.get("MLP_MULT", 3.0)) + mlp_act = os.environ.get("MLP_ACT", "relu_sq").lower() + mlp_leaky_slope = float(os.environ.get("MLP_LEAKY_SLOPE", 0.5)) + tie_embeddings = bool(int(os.environ.get("TIE_EMBEDDINGS", "1"))) + rope_base = float(os.environ.get("ROPE_BASE", 10000.0)) + logit_softcap = float(os.environ.get("LOGIT_SOFTCAP", 30.0)) + embed_lr = float(os.environ.get("EMBED_LR", 0.6)) + head_lr = float(os.environ.get("HEAD_LR", 0.008)) + tied_embed_lr = float(os.environ.get("TIED_EMBED_LR", 0.035)) + tied_embed_init_std = float(os.environ.get("TIED_EMBED_INIT_STD", 0.005)) + matrix_lr = float(os.environ.get("MATRIX_LR", 0.025)) + scalar_lr = float(os.environ.get("SCALAR_LR", 0.025)) + muon_momentum = float(os.environ.get("MUON_MOMENTUM", 0.99)) + muon_backend_steps = int(os.environ.get("MUON_BACKEND_STEPS", 5)) + muon_momentum_warmup_start = float(os.environ.get("MUON_MOMENTUM_WARMUP_START", 0.92)) + muon_momentum_warmup_steps = int(os.environ.get("MUON_MOMENTUM_WARMUP_STEPS", 1500)) + beta1 = float(os.environ.get("BETA1", 0.9)) + beta2 = float(os.environ.get("BETA2", 0.95)) + adam_eps = float(os.environ.get("ADAM_EPS", 1e-8)) + grad_clip_norm = float(os.environ.get("GRAD_CLIP_NORM", 0.3)) + eval_stride = int(os.environ.get("EVAL_STRIDE", 64)) + mtp_num_heads = int(os.environ.get("MTP_NUM_HEADS", 0)) + mtp_loss_weight = float(os.environ.get("MTP_LOSS_WEIGHT", 0.2)) + muon_beta2 = float(os.environ.get("MUON_BETA2", 0.95)) + swa_enabled = bool(int(os.environ.get("SWA_ENABLED", "1"))) + swa_every = int(os.environ.get("SWA_EVERY", 50)) # tighter: collect more recent checkpoints + muon_wd = float(os.environ.get("MUON_WD", 0.04)) + adam_wd = float(os.environ.get("ADAM_WD", 0.04)) + qat_enabled = bool(int(os.environ.get("QAT_ENABLED", "0"))) + bigram_vocab_size = int(os.environ.get("BIGRAM_VOCAB_SIZE", 2048)) + bigram_dim = int(os.environ.get("BIGRAM_DIM", 128)) + xsa_last_n = int(os.environ.get("XSA_LAST_N", 11)) # XSA on ALL 11 layers + rope_dims = int(os.environ.get("ROPE_DIMS", 16)) + ln_scale = bool(int(os.environ.get("LN_SCALE", "1"))) + dtg_enabled = bool(int(os.environ.get("DTG_ENABLED", "0"))) + late_qat_threshold = float(os.environ.get("LATE_QAT_THRESHOLD", 0.5)) + ve_enabled = bool(int(os.environ.get("VE_ENABLED", "1"))) + ve_dim = int(os.environ.get("VE_DIM", 128)) + ve_layers = os.environ.get("VE_LAYERS", "9,10") + # F1 capacity add-on: low-rank correction head (active at inference). + # Approx extra params ~= rank * (model_dim + vocab_size). + f1_corr_rank = int(os.environ.get("F1_CORR_RANK", 0)) + f1_corr_scale_init = float(os.environ.get("F1_CORR_SCALE_INIT", 0.10)) + # Post-train self-distillation: EMA teacher -> student. + distill_enabled = bool(int(os.environ.get("DISTILL_ENABLED", "0"))) + distill_steps = int(os.environ.get("DISTILL_STEPS", 24)) + distill_lr_factor = float(os.environ.get("DISTILL_LR_FACTOR", 0.02)) + distill_temperature = float(os.environ.get("DISTILL_TEMPERATURE", 1.5)) + distill_alpha = float(os.environ.get("DISTILL_ALPHA", 0.60)) + distill_kl_clip = float(os.environ.get("DISTILL_KL_CLIP", 10.0)) + # Optional legal score-first hashed n-gram interpolation at eval time. + # Multi-order backoff (2..max_order) with entropy-adaptive alpha. + # Alpha depends only on model entropy (no target/label access). + ngram_eval_order = int(os.environ.get("NGRAM_EVAL_ORDER", 0)) # 0=off, max order for backoff + ngram_eval_min_order = int(os.environ.get("NGRAM_EVAL_MIN_ORDER", 2)) # min order for backoff + ngram_eval_alpha = float(os.environ.get("NGRAM_EVAL_ALPHA", 0.30)) # base alpha (or fixed if adaptive off) + ngram_eval_adaptive = bool(int(os.environ.get("NGRAM_EVAL_ADAPTIVE", "1"))) # entropy-adaptive alpha + ngram_eval_alpha_min = float(os.environ.get("NGRAM_EVAL_ALPHA_MIN", 0.05)) # alpha floor (confident model) + ngram_eval_alpha_max = float(os.environ.get("NGRAM_EVAL_ALPHA_MAX", 0.60)) # alpha ceiling (uncertain model) + ngram_eval_entropy_center = float(os.environ.get("NGRAM_EVAL_ENTROPY_CENTER", 4.0)) # sigmoid center + ngram_eval_entropy_scale = float(os.environ.get("NGRAM_EVAL_ENTROPY_SCALE", 2.0)) # sigmoid steepness + ngram_eval_min_count = int(os.environ.get("NGRAM_EVAL_MIN_COUNT", 2)) + ngram_eval_buckets = int(os.environ.get("NGRAM_EVAL_BUCKETS", 4_194_304)) + ngram_eval_max_seconds = float(os.environ.get("NGRAM_EVAL_MAX_SECONDS", 0.0)) + cubric_cadence = int(os.environ.get("CUBRIC_CADENCE", 0)) + compile_enabled = bool(int(os.environ.get("COMPILE_ENABLED", "1"))) + compile_fullgraph = bool(int(os.environ.get("COMPILE_FULLGRAPH", "1"))) +def maybe_torch_compile(obj, args: Hyperparameters): + if not args.compile_enabled: + return obj + return torch.compile(obj, dynamic=False, fullgraph=args.compile_fullgraph) +class TrainNgramTracker: + """Complementary training: track bigram stats, downweight tokens n-grams can predict.""" + def __init__(self, vocab_size: int, device: torch.device, complement_alpha: float = 0.5): + self.V = vocab_size + self.alpha = complement_alpha + self.bi_counts = torch.zeros(vocab_size, vocab_size, device=device, dtype=torch.float32) + self.bi_totals = torch.zeros(vocab_size, device=device, dtype=torch.float32) + @torch.no_grad() + def update(self, x: Tensor, y: Tensor): + xf = x.reshape(-1) + yf = y.reshape(-1) + ones = torch.ones(xf.numel(), device=xf.device, dtype=torch.float32) + self.bi_counts.reshape(-1).scatter_add_(0, xf * self.V + yf, ones) + self.bi_totals.scatter_add_(0, xf, ones) + def get_weights(self, x: Tensor, y: Tensor) -> Tensor: + xf = x.reshape(-1) + yf = y.reshape(-1) + total = self.bi_totals[xf] + count = self.bi_counts.reshape(-1)[xf * self.V + yf] + ngram_prob = count / (total + 1) + return (1.0 - self.alpha * ngram_prob).clamp(min=0.1) +def zeropower_via_newtonschulz5(G: Tensor, steps: int = 10, eps: float = 1e-7) -> Tensor: + a, b, c = (3.4445, -4.7750, 2.0315) + X = G.bfloat16() + X /= X.norm() + eps + transposed = G.size(0) > G.size(1) + if transposed: + X = X.T + for _ in range(steps): + A = X @ X.T + B = b * A + c * A @ A + X = a * X + B @ X + return X.T if transposed else X +class Muon(torch.optim.Optimizer): + def __init__(self, params, lr: float, momentum: float, backend_steps: int, + nesterov: bool = True, weight_decay: float = 0.0): + super().__init__( + params, + dict(lr=lr, momentum=momentum, backend_steps=backend_steps, + nesterov=nesterov, weight_decay=weight_decay), + ) + @torch.no_grad() + def step(self, closure=None): + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + distributed = dist.is_available() and dist.is_initialized() + world_size = dist.get_world_size() if distributed else 1 + rank = dist.get_rank() if distributed else 0 + for group in self.param_groups: + params = group["params"] + if not params: + continue + lr = group["lr"] + momentum = group["momentum"] + backend_steps = group["backend_steps"] + nesterov = group["nesterov"] + total_params = sum(int(p.numel()) for p in params) + updates_flat = torch.zeros(total_params, device=params[0].device, dtype=torch.bfloat16) + curr = 0 + for i, p in enumerate(params): + if i % world_size == rank and p.grad is not None: + g = p.grad + state = self.state[p] + if "momentum_buffer" not in state: + state["momentum_buffer"] = torch.zeros_like(g) + buf = state["momentum_buffer"] + buf.mul_(momentum).add_(g) + if nesterov: + g = g.add(buf, alpha=momentum) + g = zeropower_via_newtonschulz5(g, steps=backend_steps) + g *= max(1, g.size(0) / g.size(1)) ** 0.5 + updates_flat[curr : curr + p.numel()] = g.reshape(-1) + curr += p.numel() + if distributed: + dist.all_reduce(updates_flat, op=dist.ReduceOp.SUM) + wd = group.get("weight_decay", 0.0) + curr = 0 + for p in params: + if wd > 0.0: + p.data.mul_(1.0 - lr * wd) + g = updates_flat[curr : curr + p.numel()].view_as(p).to(dtype=p.dtype) + p.add_(g, alpha=-lr) + curr += p.numel() + return loss +def build_sentencepiece_luts( + sp: spm.SentencePieceProcessor, vocab_size: int, device: torch.device +) -> tuple[Tensor, Tensor, Tensor]: + sp_vocab_size = int(sp.vocab_size()) + table_size = max(sp_vocab_size, vocab_size) + base_bytes_np = np.zeros((table_size,), dtype=np.int16) + has_leading_space_np = np.zeros((table_size,), dtype=np.bool_) + is_boundary_token_np = np.ones((table_size,), dtype=np.bool_) + for token_id in range(sp_vocab_size): + if sp.is_control(token_id) or sp.is_unknown(token_id) or sp.is_unused(token_id): + continue + is_boundary_token_np[token_id] = False + if sp.is_byte(token_id): + base_bytes_np[token_id] = 1 + continue + piece = sp.id_to_piece(token_id) + if piece.startswith("▁"): + has_leading_space_np[token_id] = True + piece = piece[1:] + base_bytes_np[token_id] = len(piece.encode("utf-8")) + return ( + torch.tensor(base_bytes_np, dtype=torch.int16, device=device), + torch.tensor(has_leading_space_np, dtype=torch.bool, device=device), + torch.tensor(is_boundary_token_np, dtype=torch.bool, device=device), + ) +def load_validation_tokens(pattern: str, seq_len: int) -> Tensor: + files = [Path(p) for p in sorted(glob.glob(pattern))] + if not files: + raise FileNotFoundError(f"No files found for pattern: {pattern}") + tokens = torch.cat([load_data_shard(file) for file in files]).contiguous() + usable = ((tokens.numel() - 1) // seq_len) * seq_len + if usable <= 0: + raise ValueError(f"Validation split is too short for TRAIN_SEQ_LEN={seq_len}") + return tokens[: usable + 1] +def eval_val( + args: Hyperparameters, + model: nn.Module, + rank: int, + world_size: int, + device: torch.device, + grad_accum_steps: int, + val_tokens: Tensor, + base_bytes_lut: Tensor, + has_leading_space_lut: Tensor, + is_boundary_token_lut: Tensor, + eval_seq_len: int | None = None, +) -> tuple[float, float]: + seq_len = eval_seq_len or args.train_seq_len + local_batch_tokens = args.val_batch_size // (world_size * grad_accum_steps) + if local_batch_tokens < seq_len: + raise ValueError( + "VAL_BATCH_SIZE must provide at least one sequence per rank; " + f"got VAL_BATCH_SIZE={args.val_batch_size}, WORLD_SIZE={world_size}, " + f"GRAD_ACCUM_STEPS={grad_accum_steps}, seq_len={seq_len}" + ) + local_batch_seqs = local_batch_tokens // seq_len + total_seqs = (val_tokens.numel() - 1) // seq_len + seq_start = (total_seqs * rank) // world_size + seq_end = (total_seqs * (rank + 1)) // world_size + val_loss_sum = torch.zeros((), device=device, dtype=torch.float64) + val_token_count = torch.zeros((), device=device, dtype=torch.float64) + val_byte_count = torch.zeros((), device=device, dtype=torch.float64) + model.eval() + with torch.inference_mode(): + for batch_seq_start in range(seq_start, seq_end, local_batch_seqs): + batch_seq_end = min(batch_seq_start + local_batch_seqs, seq_end) + raw_start = batch_seq_start * seq_len + raw_end = batch_seq_end * seq_len + 1 + local = val_tokens[raw_start:raw_end].to(device=device, dtype=torch.int64, non_blocking=True) + x = local[:-1].reshape(-1, seq_len) + y = local[1:].reshape(-1, seq_len) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + batch_loss = model(x, y).detach() + batch_token_count = float(y.numel()) + val_loss_sum += batch_loss.to(torch.float64) * batch_token_count + val_token_count += batch_token_count + prev_ids = x.reshape(-1) + tgt_ids = y.reshape(-1) + token_bytes = base_bytes_lut[tgt_ids].to(dtype=torch.int16) + token_bytes += (has_leading_space_lut[tgt_ids] & ~is_boundary_token_lut[prev_ids]).to(dtype=torch.int16) + val_byte_count += token_bytes.to(torch.float64).sum() + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(val_loss_sum, op=dist.ReduceOp.SUM) + dist.all_reduce(val_token_count, op=dist.ReduceOp.SUM) + dist.all_reduce(val_byte_count, op=dist.ReduceOp.SUM) + val_loss = val_loss_sum / val_token_count + bits_per_token = val_loss.item() / math.log(2.0) + tokens_per_byte = val_token_count.item() / val_byte_count.item() + model.train() + return float(val_loss.item()), float(bits_per_token * tokens_per_byte) +CONTROL_TENSOR_NAME_PATTERNS = tuple( + pattern + for pattern in os.environ.get( + "CONTROL_TENSOR_NAME_PATTERNS", + "attn_scale,attn_scales,mlp_scale,mlp_scales,resid_mix,resid_mixes,q_gain,skip_weight,skip_weights,smear,dtg_gate,ve_layer_scales,ve_shared.scale", + ).split(",") + if pattern +) +INT8_KEEP_FLOAT_FP32_NAME_PATTERNS = tuple( + pattern + for pattern in os.environ.get( + "INT8_KEEP_FLOAT_FP32_NAME_PATTERNS", + ",".join(CONTROL_TENSOR_NAME_PATTERNS), + ).split(",") + if pattern +) +INT8_KEEP_FLOAT_MAX_NUMEL = 65_536 +INT8_KEEP_FLOAT_STORE_DTYPE = torch.float16 +INT8_PER_ROW_SCALE_DTYPE = torch.float16 +INT8_CLIP_PERCENTILE = 99.99984 +INT8_CLIP_Q = INT8_CLIP_PERCENTILE / 100.0 +def tensor_nbytes(t: Tensor) -> int: + return int(t.numel()) * int(t.element_size()) +def keep_float_tensor(name: str, t: Tensor, passthrough_orig_dtypes: dict[str, str]) -> Tensor: + if any(pattern in name for pattern in INT8_KEEP_FLOAT_FP32_NAME_PATTERNS): + return t.float().contiguous() + if t.dtype in {torch.float32, torch.bfloat16}: + passthrough_orig_dtypes[name] = str(t.dtype).removeprefix("torch.") + return t.to(dtype=INT8_KEEP_FLOAT_STORE_DTYPE).contiguous() + return t +def quantize_float_tensor(t: Tensor) -> tuple[Tensor, Tensor]: + t32 = t.float() + if t32.ndim == 2: + clip_abs = ( + torch.quantile(t32.abs(), INT8_CLIP_Q, dim=1) + if t32.numel() + else torch.empty((t32.shape[0],), dtype=torch.float32) + ) + clipped = torch.maximum(torch.minimum(t32, clip_abs[:, None]), -clip_abs[:, None]) + scale = (clip_abs / 127.0).clamp_min(1.0 / 127.0) + q = torch.clamp(torch.round(clipped / scale[:, None]), -127, 127).to(torch.int8).contiguous() + return q, scale.to(dtype=INT8_PER_ROW_SCALE_DTYPE).contiguous() + clip_abs = float(torch.quantile(t32.abs().flatten(), INT8_CLIP_Q).item()) if t32.numel() else 0.0 + scale = torch.tensor(clip_abs / 127.0 if clip_abs > 0 else 1.0, dtype=torch.float32) + q = torch.clamp(torch.round(torch.clamp(t32, -clip_abs, clip_abs) / scale), -127, 127).to(torch.int8).contiguous() + return q, scale +def quantize_state_dict_int8(state_dict: dict[str, Tensor]): + quantized: dict[str, Tensor] = {} + scales: dict[str, Tensor] = {} + dtypes: dict[str, str] = {} + passthrough: dict[str, Tensor] = {} + passthrough_orig_dtypes: dict[str, str] = {} + qmeta: dict[str, dict[str, object]] = {} + stats = dict.fromkeys( + ("param_count", "num_tensors", "num_float_tensors", "num_nonfloat_tensors", "baseline_tensor_bytes", "int8_payload_bytes"), + 0, + ) + for name, tensor in state_dict.items(): + t = tensor.detach().to("cpu").contiguous() + stats["param_count"] += int(t.numel()) + stats["num_tensors"] += 1 + stats["baseline_tensor_bytes"] += tensor_nbytes(t) + if not t.is_floating_point(): + stats["num_nonfloat_tensors"] += 1 + passthrough[name] = t + stats["int8_payload_bytes"] += tensor_nbytes(t) + continue + if t.numel() <= INT8_KEEP_FLOAT_MAX_NUMEL: + kept = keep_float_tensor(name, t, passthrough_orig_dtypes) + passthrough[name] = kept + stats["int8_payload_bytes"] += tensor_nbytes(kept) + continue + stats["num_float_tensors"] += 1 + q, s = quantize_float_tensor(t) + if s.ndim > 0: + qmeta[name] = {"scheme": "per_row", "axis": 0} + quantized[name] = q + scales[name] = s + dtypes[name] = str(t.dtype).removeprefix("torch.") + stats["int8_payload_bytes"] += tensor_nbytes(q) + tensor_nbytes(s) + obj: dict[str, object] = { + "__quant_format__": "int8_clean_per_row_v1", + "quantized": quantized, + "scales": scales, + "dtypes": dtypes, + "passthrough": passthrough, + } + if qmeta: + obj["qmeta"] = qmeta + if passthrough_orig_dtypes: + obj["passthrough_orig_dtypes"] = passthrough_orig_dtypes + return obj, stats +def dequantize_state_dict_int8(obj: dict[str, object]) -> dict[str, Tensor]: + out: dict[str, Tensor] = {} + qmeta = obj.get("qmeta", {}) + passthrough_orig_dtypes = obj.get("passthrough_orig_dtypes", {}) + for name, q in obj["quantized"].items(): + dtype = getattr(torch, obj["dtypes"][name]) + s = obj["scales"][name] + if qmeta.get(name, {}).get("scheme") == "per_row" or s.ndim > 0: + s = s.to(dtype=torch.float32) + out[name] = (q.float() * s.view(q.shape[0], *([1] * (q.ndim - 1)))).to(dtype=dtype).contiguous() + else: + scale = float(s.item()) + out[name] = (q.float() * scale).to(dtype=dtype).contiguous() + for name, t in obj["passthrough"].items(): + out_t = t.detach().to("cpu").contiguous() + orig_dtype = passthrough_orig_dtypes.get(name) + if isinstance(orig_dtype, str): + out_t = out_t.to(dtype=getattr(torch, orig_dtype)).contiguous() + out[name] = out_t + return out +def load_data_shard(file: Path) -> Tensor: + header_bytes = 256 * np.dtype(" None: + self.file_idx = (self.file_idx + 1) % len(self.files) + self.tokens = load_data_shard(self.files[self.file_idx]) + self.pos = 0 + def take(self, n: int) -> Tensor: + chunks: list[Tensor] = [] + remaining = n + while remaining > 0: + avail = self.tokens.numel() - self.pos + if avail <= 0: + self._advance_file() + continue + k = min(remaining, avail) + chunks.append(self.tokens[self.pos : self.pos + k]) + self.pos += k + remaining -= k + return chunks[0] if len(chunks) == 1 else torch.cat(chunks) +class DistributedTokenLoader: + def __init__(self, pattern: str, rank: int, world_size: int, device: torch.device): + self.rank = rank + self.world_size = world_size + self.device = device + self.stream = TokenStream(pattern) + def next_batch(self, global_tokens: int, seq_len: int, grad_accum_steps: int) -> tuple[Tensor, Tensor]: + local_tokens = global_tokens // (self.world_size * grad_accum_steps) + per_rank_span = local_tokens + 1 + chunk = self.stream.take(per_rank_span * self.world_size) + start = self.rank * per_rank_span + local = chunk[start : start + per_rank_span].to(dtype=torch.int64) + x = local[:-1].reshape(-1, seq_len) + y = local[1:].reshape(-1, seq_len) + return x.to(self.device, non_blocking=True), y.to(self.device, non_blocking=True) +class RMSNorm(nn.Module): + def __init__(self, eps: float | None = None): + super().__init__() + self.eps = eps + def forward(self, x: Tensor) -> Tensor: + return F.rms_norm(x, (x.size(-1),), eps=self.eps) +class CastedLinear(nn.Linear): + _qat_enabled: bool = False + def forward(self, x: Tensor) -> Tensor: + w = self.weight.to(x.dtype) + if CastedLinear._qat_enabled and self.training and w.ndim == 2: + with torch.no_grad(): + w32 = self.weight.float() + # Use 99.95th percentile clipping to match GPTQ export quantizer + row_clip = torch.quantile(w32.abs(), 0.9995, dim=1) + scale = (row_clip / 31.0).clamp_min(1.0 / 31.0) + w_q = (torch.clamp(torch.round(w32 / scale[:, None]), -32, 31) * scale[:, None]).to(x.dtype) + w = w + (w_q - w).detach() + bias = self.bias.to(x.dtype) if self.bias is not None else None + return F.linear(x, w, bias) +def restore_low_dim_params_to_fp32(module: nn.Module) -> None: + with torch.no_grad(): + for name, param in module.named_parameters(): + if (param.ndim < 2 or any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS)) and param.dtype != torch.float32: + param.data = param.data.float() +class Rotary(nn.Module): + def __init__(self, dim: int, base: float = 10000.0, train_seq_len: int = 1024, rope_dims: int = 0): + super().__init__() + self.dim = dim + self.base = base + self.train_seq_len = train_seq_len + self.rope_dims = rope_dims if rope_dims > 0 else dim + inv_freq = 1.0 / (base ** (torch.arange(0, self.rope_dims, 2, dtype=torch.float32) / self.rope_dims)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self._seq_len_cached = 0 + self._cos_cached: Tensor | None = None + self._sin_cached: Tensor | None = None + def forward(self, seq_len: int, device: torch.device, dtype: torch.dtype) -> tuple[Tensor, Tensor]: + if ( + self._cos_cached is None + or self._sin_cached is None + or self._seq_len_cached != seq_len + or self._cos_cached.device != device + ): + rd = self.rope_dims + if seq_len > self.train_seq_len: + scale = seq_len / self.train_seq_len + new_base = self.base * (scale ** (rd / (rd - 2))) + inv_freq = 1.0 / (new_base ** (torch.arange(0, rd, 2, dtype=torch.float32, device=device) / rd)) + else: + inv_freq = self.inv_freq.to(device) + t = torch.arange(seq_len, device=device, dtype=inv_freq.dtype) + freqs = torch.outer(t, inv_freq) + self._cos_cached = freqs.cos()[None, :, None, :] + self._sin_cached = freqs.sin()[None, :, None, :] + self._seq_len_cached = seq_len + return self._cos_cached.to(dtype=dtype), self._sin_cached.to(dtype=dtype) +def apply_rotary_emb(x: Tensor, cos: Tensor, sin: Tensor, rope_dims: int = 0) -> Tensor: + if rope_dims > 0 and rope_dims < x.size(-1): + x_rope, x_pass = x[..., :rope_dims], x[..., rope_dims:] + half = rope_dims // 2 + x1, x2 = x_rope[..., :half], x_rope[..., half:] + x_rope = torch.cat((x1 * cos + x2 * sin, x1 * (-sin) + x2 * cos), dim=-1) + return torch.cat((x_rope, x_pass), dim=-1) + half = x.size(-1) // 2 + x1, x2 = x[..., :half], x[..., half:] + return torch.cat((x1 * cos + x2 * sin, x1 * (-sin) + x2 * cos), dim=-1) +class CausalSelfAttention(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + num_kv_heads: int, + rope_base: float, + qk_gain_init: float, + ): + super().__init__() + if dim % num_heads != 0: + raise ValueError("model_dim must be divisible by num_heads") + if num_heads % num_kv_heads != 0: + raise ValueError("num_heads must be divisible by num_kv_heads") + self.num_heads = num_heads + self.num_kv_heads = num_kv_heads + self.head_dim = dim // num_heads + if self.head_dim % 2 != 0: + raise ValueError("head_dim must be even for RoPE") + kv_dim = self.num_kv_heads * self.head_dim + self.c_q = CastedLinear(dim, dim, bias=False) + self.c_k = CastedLinear(dim, kv_dim, bias=False) + self.c_v = CastedLinear(dim, kv_dim, bias=False) + self.proj = CastedLinear(dim, dim, bias=False) + self.proj._zero_init = True + self.q_gain = nn.Parameter(torch.full((num_heads,), qk_gain_init, dtype=torch.float32)) + self.rope_dims = 0 # set by GPT.__init__ for partial RoPE + self.rotary = Rotary(self.head_dim, base=rope_base, train_seq_len=1024) + self.use_xsa = False # set by GPT.__init__ for deep layers only + def _xsa_efficient(self, y: Tensor, v: Tensor) -> Tensor: + """Efficient XSA: subtract self-value projection via GQA-aware reshape (no repeat_interleave). + y: [B, T, H, D], v: [B, T, Hkv, D]. H must be divisible by Hkv.""" + B, T, H, D = y.shape + Hkv = v.size(-2) + group = H // Hkv + y_g = y.reshape(B, T, Hkv, group, D) # [B, T, Hkv, group, D] + vn = F.normalize(v, dim=-1).unsqueeze(-2) # [B, T, Hkv, 1, D] — broadcast ready + proj = (y_g * vn).sum(dim=-1, keepdim=True) * vn + return (y_g - proj).reshape(B, T, H, D) + def forward(self, x: Tensor, v_embed: Tensor | None = None) -> Tensor: + bsz, seqlen, dim = x.shape + q = self.c_q(x).reshape(bsz, seqlen, self.num_heads, self.head_dim) + k = self.c_k(x).reshape(bsz, seqlen, self.num_kv_heads, self.head_dim) + v = self.c_v(x) + if v_embed is not None: + v = v + v_embed + v = v.reshape(bsz, seqlen, self.num_kv_heads, self.head_dim) + q = F.rms_norm(q, (q.size(-1),)) + k = F.rms_norm(k, (k.size(-1),)) + cos, sin = self.rotary(seqlen, x.device, q.dtype) + q = apply_rotary_emb(q, cos, sin, self.rope_dims) + k = apply_rotary_emb(k, cos, sin, self.rope_dims) + q = q * self.q_gain.to(dtype=q.dtype)[None, None, :, None] + y = flash_attn_3_func(q, k, v, causal=True) + if self.use_xsa: + y = self._xsa_efficient(y, v) + y = y.reshape(bsz, seqlen, dim) + return self.proj(y) +class SmearGate(nn.Module): + def __init__(self, dim: int): + super().__init__() + self.gate = nn.Parameter(torch.zeros(dim, dtype=torch.float32)) + def forward(self, x: Tensor) -> Tensor: + g = torch.sigmoid(self.gate.to(dtype=x.dtype))[None, None, :] + x_prev = torch.cat([torch.zeros_like(x[:, :1]), x[:, :-1]], dim=1) + return (1 - g) * x + g * x_prev +class BigramHashEmbedding(nn.Module): + def __init__(self, bigram_vocab_size: int, bigram_dim: int, model_dim: int): + super().__init__() + self.bigram_vocab_size = bigram_vocab_size + self.embed = nn.Embedding(bigram_vocab_size, bigram_dim) + nn.init.zeros_(self.embed.weight) + self.proj = CastedLinear(bigram_dim, model_dim, bias=False) if bigram_dim != model_dim else None + if self.proj is not None: + nn.init.zeros_(self.proj.weight) + self.scale = nn.Parameter(torch.tensor(0.05, dtype=torch.float32)) + def bigram_hash(self, tokens: Tensor) -> Tensor: + t = tokens.to(torch.int32) + mod = self.bigram_vocab_size - 1 + out = torch.empty_like(t) + out[..., 0] = mod + out[..., 1:] = torch.bitwise_xor(36313 * t[..., 1:], 27191 * t[..., :-1]) % mod + return out.long() + def forward(self, token_ids: Tensor) -> Tensor: + h = self.embed(self.bigram_hash(token_ids)) + if self.proj is not None: + h = self.proj(h) + return h * self.scale.to(dtype=h.dtype) +class ValueEmbedding(nn.Module): + """Reinject token identity into attention values at specific layers. + Each table maps vocab tokens to a low-dim embedding, projected to model_dim.""" + def __init__(self, vocab_size: int, ve_dim: int, model_dim: int): + super().__init__() + self.embed = nn.Embedding(vocab_size, ve_dim) + nn.init.normal_(self.embed.weight, std=0.01) + self.proj = CastedLinear(ve_dim, model_dim, bias=False) if ve_dim != model_dim else None + if self.proj is not None: + nn.init.zeros_(self.proj.weight) + self.scale = nn.Parameter(torch.tensor(0.1, dtype=torch.float32)) + def forward(self, token_ids: Tensor) -> Tensor: + h = self.embed(token_ids) + if self.proj is not None: + h = self.proj(h) + return h * self.scale.to(dtype=h.dtype) +class MLP(nn.Module): + def __init__(self, dim: int, mlp_mult: int, mlp_act: str = "relu_sq", mlp_leaky_slope: float = 0.5): + super().__init__() + hidden = int(mlp_mult * dim) + self.fc = CastedLinear(dim, hidden, bias=False) + self.proj = CastedLinear(hidden, dim, bias=False) + self.proj._zero_init = True + self.mlp_act = mlp_act + self.mlp_leaky_slope = mlp_leaky_slope + if self.mlp_act not in {"relu_sq", "leaky_relu_sq"}: + raise ValueError(f"Unsupported MLP_ACT '{self.mlp_act}'. Use 'relu_sq' or 'leaky_relu_sq'.") + def forward(self, x: Tensor) -> Tensor: + x = self.fc(x) + if self.mlp_act == "leaky_relu_sq": + x = F.leaky_relu(x, negative_slope=self.mlp_leaky_slope) + else: + x = F.relu(x) + return self.proj(x.square()) +class Block(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + num_kv_heads: int, + mlp_mult: int, + rope_base: float, + qk_gain_init: float, + layer_idx: int = 0, + ln_scale: bool = False, + dtg: bool = False, + mlp_act: str = "relu_sq", + mlp_leaky_slope: float = 0.5, + ): + super().__init__() + self.attn_norm = RMSNorm() + self.mlp_norm = RMSNorm() + self.attn = CausalSelfAttention(dim, num_heads, num_kv_heads, rope_base, qk_gain_init) + self.mlp = MLP(dim, mlp_mult, mlp_act=mlp_act, mlp_leaky_slope=mlp_leaky_slope) + self.attn_scale = nn.Parameter(torch.ones(dim, dtype=torch.float32)) + self.mlp_scale = nn.Parameter(torch.ones(dim, dtype=torch.float32)) + self.resid_mix = nn.Parameter(torch.stack((torch.ones(dim), torch.zeros(dim))).float()) + self.ln_scale_factor = 1.0 / math.sqrt(layer_idx + 1) if ln_scale else 1.0 + if dtg: + self.dtg_gate = nn.Linear(dim, 1, bias=True) + nn.init.zeros_(self.dtg_gate.weight) + nn.init.constant_(self.dtg_gate.bias, 2.0) + else: + self.dtg_gate = None + def forward(self, x: Tensor, x0: Tensor, v_embed: Tensor | None = None) -> Tensor: + mix = self.resid_mix.to(dtype=x.dtype) + x_in = mix[0][None, None, :] * x + mix[1][None, None, :] * x0 + attn_out = self.attn(self.attn_norm(x_in) * self.ln_scale_factor, v_embed=v_embed) + x_out = x_in + self.attn_scale.to(dtype=x_in.dtype)[None, None, :] * attn_out + x_out = x_out + self.mlp_scale.to(dtype=x_out.dtype)[None, None, :] * self.mlp(self.mlp_norm(x_out) * self.ln_scale_factor) + if self.dtg_gate is not None: + gate = torch.sigmoid(self.dtg_gate(x_in.detach())) + x_out = x_in + gate * (x_out - x_in) + return x_out +class GPT(nn.Module): + def __init__( + self, + vocab_size: int, + num_layers: int, + model_dim: int, + num_heads: int, + num_kv_heads: int, + mlp_mult: int, + tie_embeddings: bool, + tied_embed_init_std: float, + logit_softcap: float, + rope_base: float, + qk_gain_init: float, + mtp_num_heads: int = 0, + mtp_loss_weight: float = 0.1, + bigram_vocab_size: int = 0, + bigram_dim: int = 128, + xsa_last_n: int = 0, + rope_dims: int = 0, + ln_scale: bool = False, + dtg: bool = False, + ve_enabled: bool = False, + ve_dim: int = 128, + ve_layers: str = "9,10", + mlp_act: str = "relu_sq", + mlp_leaky_slope: float = 0.5, + f1_corr_rank: int = 0, + f1_corr_scale_init: float = 0.10, + ): + super().__init__() + self._ve_target_dim = num_kv_heads * (model_dim // num_heads) # kv_dim for value projection + if logit_softcap <= 0.0: + raise ValueError(f"logit_softcap must be positive, got {logit_softcap}") + self.tie_embeddings = tie_embeddings + self.tied_embed_init_std = tied_embed_init_std + self.logit_softcap = logit_softcap + self.mtp_num_heads = mtp_num_heads + self.mtp_loss_weight = mtp_loss_weight + self.tok_emb = nn.Embedding(vocab_size, model_dim) + self.bigram = BigramHashEmbedding(bigram_vocab_size, bigram_dim, model_dim) if bigram_vocab_size > 0 else None + self.smear = SmearGate(model_dim) + self.num_encoder_layers = num_layers // 2 + self.num_decoder_layers = num_layers - self.num_encoder_layers + self.num_skip_weights = min(self.num_encoder_layers, self.num_decoder_layers) + self.skip_weights = nn.Parameter(torch.ones(self.num_skip_weights, model_dim, dtype=torch.float32)) + self.blocks = nn.ModuleList( + [ + Block( + model_dim, + num_heads, + num_kv_heads, + mlp_mult, + rope_base, + qk_gain_init, + layer_idx=i, + ln_scale=ln_scale, + dtg=dtg, + mlp_act=mlp_act, + mlp_leaky_slope=mlp_leaky_slope, + ) + for i in range(num_layers) + ] + ) + if rope_dims > 0: + head_dim = model_dim // num_heads + for block in self.blocks: + block.attn.rope_dims = rope_dims + block.attn.rotary = Rotary(head_dim, base=rope_base, train_seq_len=1024, rope_dims=rope_dims) + self.ve_layer_indices = [int(x) for x in ve_layers.split(",") if x.strip()] if ve_enabled else [] + kv_dim = self._ve_target_dim + if self.ve_layer_indices: + self.ve_shared = ValueEmbedding(vocab_size, ve_dim, kv_dim) + self.ve_layer_scales = nn.ParameterList( + [nn.Parameter(torch.ones(1, dtype=torch.float32)) for _ in self.ve_layer_indices] + ) + else: + self.ve_shared = None + self.ve_layer_scales = nn.ParameterList() + self.value_embeds = nn.ModuleList() # keep empty for compat + self.final_norm = RMSNorm() + self.lm_head = None if tie_embeddings else CastedLinear(model_dim, vocab_size, bias=False) + if self.lm_head is not None: + self.lm_head._zero_init = True + self.mtp_heads = nn.ModuleList( + [CastedLinear(model_dim, vocab_size, bias=False) for _ in range(mtp_num_heads)] + ) + for head in self.mtp_heads: + head._zero_init = True + # Low-rank correction path for extra capacity under size budget. + self.f1_corr_rank = f1_corr_rank + if f1_corr_rank > 0: + self.f1_corr_in = CastedLinear(model_dim, f1_corr_rank, bias=False) + self.f1_corr_out = CastedLinear(f1_corr_rank, vocab_size, bias=False) + self.f1_corr_out._zero_init = True + self.f1_corr_scale = nn.Parameter(torch.tensor(f1_corr_scale_init, dtype=torch.float32)) + else: + self.f1_corr_in = None + self.f1_corr_out = None + self.f1_corr_scale = None + if xsa_last_n > 0: + for i in range(max(0, num_layers - xsa_last_n), num_layers): + self.blocks[i].attn.use_xsa = True + self._init_weights() + def _init_weights(self) -> None: + if self.tie_embeddings: + nn.init.normal_(self.tok_emb.weight, mean=0.0, std=self.tied_embed_init_std) + num_layers = len(self.blocks) + for name, module in self.named_modules(): + if isinstance(module, nn.Linear): + if getattr(module, "_zero_init", False): + nn.init.zeros_(module.weight) + elif module.weight.ndim == 2 and module.weight.shape[0] >= 64 and module.weight.shape[1] >= 64: + nn.init.orthogonal_(module.weight, gain=1.0) + if ".proj." in name or name.endswith(".proj"): + with torch.no_grad(): + module.weight.mul_(1.0 / math.sqrt(2 * num_layers)) + def _get_ve(self, layer_idx: int, input_ids: Tensor, ve_cache: dict | None = None) -> Tensor | None: + """Get value embedding for a specific layer using shared table + per-layer scale.""" + if self.ve_shared is None or layer_idx not in self.ve_layer_indices: + return None + if ve_cache is not None and 've' not in ve_cache: + ve_cache['ve'] = self.ve_shared(input_ids) + ve_base = ve_cache['ve'] if ve_cache is not None else self.ve_shared(input_ids) + ve_idx = self.ve_layer_indices.index(layer_idx) + return ve_base * self.ve_layer_scales[ve_idx].to(dtype=ve_base.dtype) + def forward(self, input_ids: Tensor, target_ids: Tensor) -> Tensor: + x = self.tok_emb(input_ids) + if self.bigram is not None: + x = x + self.bigram(input_ids) + x = F.rms_norm(x, (x.size(-1),)) + x = self.smear(x) + x0 = x + skips: list[Tensor] = [] + ve_cache: dict = {} + for i in range(self.num_encoder_layers): + ve = self._get_ve(i, input_ids, ve_cache) + x = self.blocks[i](x, x0, v_embed=ve) + skips.append(x) + for i in range(self.num_decoder_layers): + bi = self.num_encoder_layers + i + if skips: + x = x + self.skip_weights[i].to(dtype=x.dtype)[None, None, :] * skips.pop() + ve = self._get_ve(bi, input_ids, ve_cache) + x = self.blocks[bi](x, x0, v_embed=ve) + x = self.final_norm(x) + x_flat = x.reshape(-1, x.size(-1)) + targets = target_ids.reshape(-1) + if self.tie_embeddings: + logits_proj = F.linear(x_flat, self.tok_emb.weight) + else: + if self.lm_head is None: + raise RuntimeError("lm_head is required when tie_embeddings=False") + logits_proj = self.lm_head(x_flat) + if self.f1_corr_in is not None and self.f1_corr_out is not None and self.f1_corr_scale is not None: + corr_hidden = F.silu(self.f1_corr_in(x_flat)) + corr_proj = self.f1_corr_out(corr_hidden) + logits_proj = logits_proj + self.f1_corr_scale.to(dtype=logits_proj.dtype) * corr_proj + logits = self.logit_softcap * torch.tanh(logits_proj / self.logit_softcap) + if hasattr(self, '_ngram_tracker') and self._ngram_tracker is not None and self.training: + per_tok_loss = F.cross_entropy(logits.float(), targets, reduction="none") + weights = self._ngram_tracker.get_weights(input_ids, target_ids) + main_loss = (per_tok_loss * weights).mean() + else: + main_loss = F.cross_entropy(logits.float(), targets, reduction="mean") + if self.training and self.mtp_num_heads > 0 and self.mtp_loss_weight > 0.0: + _, seqlen, dim = x.shape + mtp_loss_sum = x.new_zeros(()) + mtp_loss_count = 0 + for k, mtp_head in enumerate(self.mtp_heads): + valid_t = seqlen - (k + 1) + if valid_t <= 0: + continue + mtp_hidden = x[:, :valid_t, :].reshape(-1, dim) + mtp_targets = target_ids[:, k + 1 :].reshape(-1) + mtp_logits_proj = mtp_head(mtp_hidden) + mtp_logits = self.logit_softcap * torch.tanh(mtp_logits_proj / self.logit_softcap) + mtp_loss_sum = mtp_loss_sum + F.cross_entropy(mtp_logits.float(), mtp_targets, reduction="mean") + mtp_loss_count += 1 + if mtp_loss_count > 0: + main_loss = main_loss + self.mtp_loss_weight * (mtp_loss_sum / mtp_loss_count) + return main_loss + def forward_logits(self, input_ids: Tensor) -> Tensor: + """Return logits (bsz, seq_len, vocab) without computing loss.""" + x = self.tok_emb(input_ids) + if self.bigram is not None: + x = x + self.bigram(input_ids) + x = F.rms_norm(x, (x.size(-1),)) + x = self.smear(x) + x0 = x + skips: list[Tensor] = [] + ve_cache: dict = {} + for i in range(self.num_encoder_layers): + ve = self._get_ve(i, input_ids, ve_cache) + x = self.blocks[i](x, x0, v_embed=ve) + skips.append(x) + for i in range(self.num_decoder_layers): + bi = self.num_encoder_layers + i + if skips: + x = x + self.skip_weights[i].to(dtype=x.dtype)[None, None, :] * skips.pop() + ve = self._get_ve(bi, input_ids, ve_cache) + x = self.blocks[bi](x, x0, v_embed=ve) + x = self.final_norm(x) + if self.tie_embeddings: + logits_proj = F.linear(x, self.tok_emb.weight) + else: + logits_proj = self.lm_head(x) + if self.f1_corr_in is not None and self.f1_corr_out is not None and self.f1_corr_scale is not None: + corr_hidden = F.silu(self.f1_corr_in(x)) + corr_proj = self.f1_corr_out(corr_hidden) + logits_proj = logits_proj + self.f1_corr_scale.to(dtype=logits_proj.dtype) * corr_proj + return self.logit_softcap * torch.tanh(logits_proj / self.logit_softcap) +def eval_val_sliding( + args: Hyperparameters, + base_model: nn.Module, + rank: int, + world_size: int, + device: torch.device, + val_tokens: Tensor, + base_bytes_lut: Tensor, + has_leading_space_lut: Tensor, + is_boundary_token_lut: Tensor, + stride: int, + batch_seqs: int = 128, + eval_seq_len: int | None = None, +) -> tuple[float, float]: + """Sliding window evaluation: each token scored with maximum context.""" + seq_len = eval_seq_len or args.train_seq_len + total_tokens = val_tokens.numel() - 1 + window_starts = [ws for ws in range(0, total_tokens, stride) + if min(ws + seq_len, total_tokens) - ws >= 1] + total_windows = len(window_starts) + my_s = (total_windows * rank) // world_size + my_e = (total_windows * (rank + 1)) // world_size + my_windows = window_starts[my_s:my_e] + loss_sum = torch.zeros((), device=device, dtype=torch.float64) + token_count = torch.zeros((), device=device, dtype=torch.float64) + byte_count = torch.zeros((), device=device, dtype=torch.float64) + base_model.eval() + compiled_logits = maybe_torch_compile(base_model.forward_logits, args) + with torch.inference_mode(): + for bi in range(0, len(my_windows), batch_seqs): + batch_ws = my_windows[bi:bi + batch_seqs] + bsz = len(batch_ws) + x_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + y_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + wlens: list[int] = [] + for i, ws in enumerate(batch_ws): + end = min(ws + seq_len, total_tokens) + wlen = end - ws + wlens.append(wlen) + chunk = val_tokens[ws:end + 1].to(dtype=torch.int64, device=device) + x_batch[i, :wlen] = chunk[:-1] + y_batch[i, :wlen] = chunk[1:] + with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + logits = compiled_logits(x_batch) + nll = F.cross_entropy( + logits.reshape(-1, logits.size(-1)).float(), + y_batch.reshape(-1), + reduction="none", + ).reshape(bsz, seq_len) + for i, ws in enumerate(batch_ws): + wlen = wlens[i] + s = 0 if ws == 0 else max(wlen - stride, 0) + scored_nll = nll[i, s:wlen].to(torch.float64) + loss_sum += scored_nll.sum() + token_count += float(wlen - s) + tgt = y_batch[i, s:wlen] + prev = x_batch[i, s:wlen] + tb = base_bytes_lut[tgt].to(torch.float64) + tb += (has_leading_space_lut[tgt] & ~is_boundary_token_lut[prev]).to(torch.float64) + byte_count += tb.sum() + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(loss_sum, op=dist.ReduceOp.SUM) + dist.all_reduce(token_count, op=dist.ReduceOp.SUM) + dist.all_reduce(byte_count, op=dist.ReduceOp.SUM) + val_loss = (loss_sum / token_count).item() + bits_per_token = val_loss / math.log(2.0) + tokens_per_byte = token_count.item() / byte_count.item() + base_model.train() + return val_loss, bits_per_token * tokens_per_byte +def _ngram_bulk_update(val_np, start, end, ctx_tables, full_tables, + min_order, max_order, primes, mask): + """Bulk update n-gram tables with a contiguous range of tokens. + All ranks call this with the SAME token range -> identical tables everywhere.""" + t = val_np[start:end].astype(np.uint64) + n = len(t) + for order in range(min_order, max_order + 1): + if n < order: + continue + ctx_width = order - 1 + ctx_hash = np.zeros(n - order + 1, dtype=np.uint64) + for k in range(ctx_width): + ctx_hash ^= t[k:n - order + 1 + k] * primes[k % len(primes)] + ctx_key = (ctx_hash & mask).astype(np.int64) + tgt = t[order - 1:] + full_key = ((ctx_hash ^ (tgt * primes[ctx_width % len(primes)])) & mask).astype(np.int64) + ctx_tables[order] += np.bincount(ctx_key, minlength=len(ctx_tables[order])).astype(np.uint32) + full_tables[order] += np.bincount(full_key, minlength=len(full_tables[order])).astype(np.uint32) + +def eval_val_sliding_hashed_ngram( + args: Hyperparameters, + base_model: nn.Module, + rank: int, + world_size: int, + device: torch.device, + val_tokens: Tensor, + base_bytes_lut: Tensor, + has_leading_space_lut: Tensor, + is_boundary_token_lut: Tensor, + stride: int, + order: int, + alpha: float, + min_count: int, + buckets: int, + max_seconds: float = 0.0, + batch_seqs: int = 128, + eval_seq_len: int | None = None, +) -> tuple[float, float, float]: + """Score-first sliding eval with chunk-based SHARED n-gram tables + cubric. + + Key design: all ranks share identical n-gram tables via bulk chunk updates. + Each chunk's windows are distributed across ranks for scoring, then ALL ranks + update tables with the same contiguous token range. Every rank sees the full + n-gram picture (not 1/world_size like per-segment updates). + + Legal: entire chunk scored before its tokens update the tables. + """ + min_order = max(args.ngram_eval_min_order, 2) + max_order = max(order, min_order) + adaptive = args.ngram_eval_adaptive + alpha_min = args.ngram_eval_alpha_min + alpha_max = args.ngram_eval_alpha_max + ent_center = args.ngram_eval_entropy_center + ent_scale = args.ngram_eval_entropy_scale + + seq_len = eval_seq_len or args.train_seq_len + total_tokens = val_tokens.numel() - 1 + + # Build all windows and total scored tokens + all_window_starts = [ws for ws in range(0, total_tokens, stride) if min(ws + seq_len, total_tokens) - ws >= 1] + total_scored_tokens = 0.0 + for ws in all_window_starts: + end = min(ws + seq_len, total_tokens) + wlen = end - ws + s = 0 if ws == 0 else max(wlen - stride, 0) + total_scored_tokens += float(max(wlen - s, 0)) + + # Group windows into chunks by scored position -- all ranks share this grouping + chunk_tokens = int(os.environ.get("NGRAM_CHUNK_TOKENS", "1048576")) # 1M default + num_chunks = (total_tokens + chunk_tokens - 1) // chunk_tokens + chunk_windows: list[list[int]] = [[] for _ in range(num_chunks)] + for ws in all_window_starts: + end = min(ws + seq_len, total_tokens) + wlen = end - ws + s = 0 if ws == 0 else max(wlen - stride, 0) + scored_start = ws + s + ci = min(scored_start // chunk_tokens, num_chunks - 1) + chunk_windows[ci].append(ws) + + val_np = val_tokens.numpy() + ctx_tables = {n: np.zeros((buckets,), dtype=np.uint32) for n in range(min_order, max_order + 1)} + full_tables = {n: np.zeros((buckets,), dtype=np.uint32) for n in range(min_order, max_order + 1)} + mask = np.uint64(buckets - 1) + primes = np.array( + [np.uint64(36313), np.uint64(27191), np.uint64(51647), np.uint64(81929), + np.uint64(131071), np.uint64(174763), np.uint64(233017)], + dtype=np.uint64, + ) + + loss_sum = 0.0 + token_count = 0.0 + byte_count = 0.0 + + # Cubric 3D: per (order × entropy_bin × count_bin) adaptive alpha scaling + _NUM_ENT_BINS = 3 # low / mid / high entropy + _NUM_CNT_BINS = 3 # low / mid / high count + _ENT_EDGES = np.array([ent_center - 1.0, ent_center + 1.0]) # [2.0, 4.0] for center=3.0 + _CNT_EDGES = np.array([5.0, 50.0]) # low=<5, mid=5-50, high=>50 context count + _TOTAL_CELLS = _NUM_ENT_BINS * _NUM_CNT_BINS # 9 cells per order = 54 total + _cc = getattr(args, 'cubric_cadence', 0); _con = _cc > 0; _cfired = 0 + if _con: + # Warm-start: proven converged values from 4+ runs (orders 2-7) + # All 9 cells per order get the same warm-start, 3D cubric refines from there + _WARM = {2: 0.45, 3: 0.30, 4: 0.45, 5: 1.88, 6: 2.00, 7: 2.00, 8: 2.00, 9: 2.00} + _c_alpha_mult = {n: [_WARM.get(n, 1.0)] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + _c_hits = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + _c_beats = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + + base_model.eval() + compiled_logits = maybe_torch_compile(base_model.forward_logits, args) + t0 = time.perf_counter() + deadline = (t0 + max_seconds) if max_seconds > 0.0 else None + cutoff_hit = False + + if rank == 0: + print(f"ngram_eval:chunks={num_chunks} chunk_tokens={chunk_tokens} " + f"windows={len(all_window_starts)} shared_tables=True", flush=True) + + with torch.inference_mode(): + for ci in range(num_chunks): + if deadline is not None and time.perf_counter() >= deadline: + cutoff_hit = True + break + + windows = chunk_windows[ci] + if not windows: + continue + + # Distribute this chunk's windows across ranks + my_s = (len(windows) * rank) // world_size + my_e = (len(windows) * (rank + 1)) // world_size + my_windows = windows[my_s:my_e] + + # --- Phase 1: SCORE this chunk's windows --- + for bi in range(0, len(my_windows), batch_seqs): + batch_ws = my_windows[bi:bi + batch_seqs] + bsz = len(batch_ws) + x_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + y_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + wlens: list[int] = [] + for i, ws in enumerate(batch_ws): + end = min(ws + seq_len, total_tokens) + wlen = end - ws + wlens.append(wlen) + chunk = val_tokens[ws:end + 1].to(dtype=torch.int64, device=device) + x_batch[i, :wlen] = chunk[:-1] + y_batch[i, :wlen] = chunk[1:] + + with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + logits = compiled_logits(x_batch) + logits_f = logits.float() + nll = F.cross_entropy( + logits_f.reshape(-1, logits_f.size(-1)), + y_batch.reshape(-1), + reduction="none", + ).reshape(bsz, seq_len) + + for i, ws in enumerate(batch_ws): + wlen = wlens[i] + s = 0 if ws == 0 else max(wlen - stride, 0) + seg_len = wlen - s + if seg_len <= 0: + continue + + seg_nll = nll[i, s:wlen].to(torch.float64).cpu().numpy() + seg_model_p = np.exp(-seg_nll) + + if adaptive: + log_probs = F.log_softmax(logits_f[i, s:wlen], dim=-1) + probs_a = log_probs.exp() + entropy = -(probs_a * log_probs).sum(dim=-1).cpu().numpy() + sig = 1.0 / (1.0 + np.exp(-ent_scale * (entropy - ent_center))) + per_token_alpha = alpha_min + (alpha_max - alpha_min) * sig + # Bin entropy for 2D cubric: 0=low, 1=mid, 2=high + _ent_bins = np.digitize(entropy, _ENT_EDGES).astype(np.int32) + else: + per_token_alpha = np.full(seg_len, alpha) + _ent_bins = np.ones(seg_len, dtype=np.int32) # all mid + + global_j = np.arange(ws + s + 1, ws + wlen + 1, dtype=np.int64) + p_ng = np.zeros(seg_len, dtype=np.float64) + ng_matched = np.zeros(seg_len, dtype=np.bool_) + _ng_ord = np.zeros(seg_len, dtype=np.int32) + _ng_ctx_count = np.zeros(seg_len, dtype=np.float64) + tgt_np = val_np[global_j].astype(np.uint64) + + for n in range(max_order, min_order - 1, -1): + ctx_width = n - 1 + valid = (global_j >= ctx_width) & (~ng_matched) + if not valid.any(): + continue + v_idx = np.nonzero(valid)[0] + jv = global_j[v_idx] + ctx_hash = np.zeros(len(jv), dtype=np.uint64) + for k in range(ctx_width): + tok = val_np[jv - (ctx_width - k)].astype(np.uint64) + ctx_hash ^= tok * primes[k % len(primes)] + ctx_key = (ctx_hash & mask).astype(np.int64) + full_key = ((ctx_hash ^ (tgt_np[v_idx] * primes[ctx_width % len(primes)])) & mask).astype(np.int64) + ctx_counts = ctx_tables[n][ctx_key].astype(np.float64) + full_counts = full_tables[n][full_key].astype(np.float64) + has_data = ctx_counts >= float(min_count) + if has_data.any(): + p = np.minimum(full_counts, ctx_counts) / np.maximum(ctx_counts, 1.0) + p = np.clip(p, 0.0, 1.0) + hit_idx = v_idx[has_data] + p_ng[hit_idx] = p[has_data] + ng_matched[hit_idx] = True + _ng_ord[hit_idx] = n + _ng_ctx_count[hit_idx] = ctx_counts[has_data] + + # Mix where n-gram matched (cubric 3D: order × entropy_bin × count_bin) + if ng_matched.any(): + m_idx = np.nonzero(ng_matched)[0] + if _con: + a = per_token_alpha[m_idx].copy() + m_ent_bins = _ent_bins[m_idx] + m_cnt_bins = np.digitize(_ng_ctx_count[m_idx], _CNT_EDGES).astype(np.int32) + for n in range(min_order, max_order + 1): + om = _ng_ord[m_idx] == n + if not om.any(): + continue + for eb in range(_NUM_ENT_BINS): + for cb in range(_NUM_CNT_BINS): + cell = eb * _NUM_CNT_BINS + cb + mask_ecb = om & (m_ent_bins == eb) & (m_cnt_bins == cb) + if mask_ecb.any(): + _c_hits[n][cell] += int(mask_ecb.sum()) + _c_beats[n][cell] += int((p_ng[m_idx[mask_ecb]] > seg_model_p[m_idx[mask_ecb]]).sum()) + a[mask_ecb] *= _c_alpha_mult[n][cell] + np.clip(a, 0.0, 0.95, out=a) + else: + a = per_token_alpha[m_idx] + seg_model_p[m_idx] = (1.0 - a) * seg_model_p[m_idx] + a * p_ng[m_idx] + + seg_nll = -np.log(np.clip(seg_model_p, 1e-12, 1.0)) + loss_sum += float(seg_nll.sum()) + token_count += float(seg_len) + tgt = y_batch[i, s:wlen] + prev = x_batch[i, s:wlen] + tb = base_bytes_lut[tgt].to(torch.float64) + tb += (has_leading_space_lut[tgt] & ~is_boundary_token_lut[prev]).to(torch.float64) + byte_count += float(tb.sum().item()) + + # --- Phase 2: SHARED UPDATE -- all ranks update with same chunk tokens --- + chunk_start = ci * chunk_tokens + chunk_end = min((ci + 1) * chunk_tokens, total_tokens) + _ngram_bulk_update(val_np, chunk_start, chunk_end + 1, + ctx_tables, full_tables, min_order, max_order, + primes, mask) + + # Cubric 2D c-step: adapt per (order × entropy_bin) + if _con: + # Collect all (order, ent_bin, cnt_bin) cells with enough data + all_rates = [] + for n in range(min_order, max_order + 1): + for cell in range(_TOTAL_CELLS): + if _c_hits[n][cell] >= 8: + all_rates.append(_c_beats[n][cell] / _c_hits[n][cell]) + if len(all_rates) >= 4: + avg_rate = sum(all_rates) / len(all_rates) + for n in range(min_order, max_order + 1): + for cell in range(_TOTAL_CELLS): + if _c_hits[n][cell] >= 8: + rate = _c_beats[n][cell] / _c_hits[n][cell] + if rate > avg_rate + 0.05: + _c_alpha_mult[n][cell] = min(_c_alpha_mult[n][cell] * 1.03, 2.0) + elif rate < avg_rate - 0.05: + _c_alpha_mult[n][cell] = max(_c_alpha_mult[n][cell] * 0.97, 0.3) + _cfired += 1 + if rank == 0 and _cfired % 8 == 0: + parts = [] + for n in range(min_order, max_order + 1): + m = _c_alpha_mult[n] + avg_m = sum(m) / len(m) + parts.append(f"o{n}:avg={avg_m:.2f}") + print(f"cubric3d:step={_cfired} {' '.join(parts)}", flush=True) + _c_hits = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + _c_beats = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + + # Progress + if rank == 0 and (ci % 10 == 0 or ci == num_chunks - 1 or ci < 3): + elapsed = time.perf_counter() - t0 + cur_bpb = (loss_sum / max(token_count, 1.0)) / math.log(2.0) * (token_count / max(byte_count, 1.0)) if token_count > 0 else 0.0 + print( + f"ngram_eval:chunk [{ci+1}/{num_chunks}] bpb={cur_bpb:.6f} t={elapsed:.0f}s", + flush=True, + ) + + # All-reduce across ranks + _loss = torch.tensor(loss_sum, device=device, dtype=torch.float64) + _toks = torch.tensor(token_count, device=device, dtype=torch.float64) + _bytes = torch.tensor(byte_count, device=device, dtype=torch.float64) + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(_loss, op=dist.ReduceOp.SUM) + dist.all_reduce(_toks, op=dist.ReduceOp.SUM) + dist.all_reduce(_bytes, op=dist.ReduceOp.SUM) + loss_sum = _loss.item() + token_count = _toks.item() + byte_count = _bytes.item() + + coverage = token_count / max(total_scored_tokens, 1.0) + if cutoff_hit: + elapsed = time.perf_counter() - t0 + print( + f"ngram_eval:cutoff max_seconds={max_seconds:.1f} " + f"coverage={coverage*100:.2f}% elapsed={elapsed:.0f}s", + flush=True, + ) + + if _con and rank == 0: + print(f"cubric3d:final c_steps={_cfired} cells={_TOTAL_CELLS}x{max_order-min_order+1}={_TOTAL_CELLS*(max_order-min_order+1)}", flush=True) + for n in range(min_order, max_order + 1): + m = _c_alpha_mult[n] + row = " ".join(f"{m[cell]:.2f}" for cell in range(_TOTAL_CELLS)) + print(f" o{n}: [{row}]", flush=True) + val_loss = loss_sum / max(token_count, 1.0) + val_bpb = val_loss / math.log(2.0) * (token_count / max(byte_count, 1.0)) + base_model.train() + return val_loss, val_bpb, coverage +def _classify_param(name: str) -> str: + if "tok_emb" in name or "lm_head" in name: + return "embed" + if "f1_corr_in" in name or "f1_corr_out" in name: + return "aux" + if ".mlp." in name: + return "mlp" + if ".attn." in name or (".proj." in name and ".mlp." not in name): + return "attn" + return "other" +# --------------------------------------------------------------------------- +# GPTQ: Hessian-aware quantization with column-wise error compensation +# --------------------------------------------------------------------------- +def _find_best_row_scales(W: Tensor, clip_range: int = 31) -> Tensor: + """Find optimal per-row scales by searching percentile clipping thresholds.""" + t32 = W.float() + best_s = t32.abs().amax(dim=1) / clip_range + best_s = best_s.clamp_min(1.0 / clip_range) + best_err = torch.full((t32.shape[0],), float('inf')) + for pct in [0.9990, 0.9995, 0.9999, 0.99999, 1.0]: + if pct < 1.0: + row_clip = torch.quantile(t32.abs(), pct, dim=1) + else: + row_clip = t32.abs().amax(dim=1) + s = (row_clip / clip_range).clamp_min(1.0 / clip_range) + q = torch.clamp(torch.round(t32 / s[:, None]), -clip_range, clip_range) + recon = q * s[:, None] + err = (t32 - recon).pow(2).mean(dim=1) + improved = err < best_err + best_s[improved] = s[improved] + best_err[improved] = err[improved] + return best_s +def gptq_quantize_weight(W: Tensor, H: Tensor, clip_range: int = 31, + block_size: int = 64, percdamp: float = 0.002) -> tuple[Tensor, Tensor]: + """GPTQ: quantize weight matrix W using Hessian H = X^T X for error compensation. + Uses pre-computed per-row scales and column reordering by Hessian diagonal. + Returns (quantized_int8, scale_fp16) in int6 range [-clip_range, clip_range].""" + W = W.float().clone() + rows, cols = W.shape + # Pre-compute optimal per-row scales from the original weight matrix + row_scale = _find_best_row_scales(W, clip_range) + H = H.float().clone() + damp = percdamp * H.diag().mean() + H.diagonal().add_(damp) + # Column reordering: process least-important columns first (ascending H_diag) + perm = torch.argsort(H.diag()) + invperm = torch.argsort(perm) + W = W[:, perm] + H = H[perm][:, perm] + try: + L = torch.linalg.cholesky(H) + Hinv = torch.cholesky_inverse(L) + except torch._C._LinAlgError: + Hinv = torch.diag(1.0 / H.diag().clamp_min(1e-6)) + Q = torch.zeros(rows, cols, dtype=torch.int8) + for i1 in range(0, cols, block_size): + i2 = min(i1 + block_size, cols) + W_block = W[:, i1:i2].clone() + Hinv_block = Hinv[i1:i2, i1:i2] + Err = torch.zeros_like(W_block) + for j in range(i2 - i1): + w_col = W_block[:, j] + h_inv_jj = Hinv_block[j, j].clamp_min(1e-8) + # Quantize using pre-computed per-row scales + q_col = torch.clamp(torch.round(w_col / row_scale), -clip_range, clip_range) + deq_col = q_col * row_scale + Q[:, i1 + j] = q_col.to(torch.int8) + err = (w_col - deq_col) / h_inv_jj + Err[:, j] = err + if j + 1 < i2 - i1: + W_block[:, j + 1:] -= err.unsqueeze(1) * Hinv_block[j, j + 1:].unsqueeze(0) + if i2 < cols: + W[:, i2:] -= Err @ Hinv[i1:i2, i2:] + # Undo column reordering + Q = Q[:, invperm] + return Q, row_scale.to(torch.float16) +def gptq_calibrate(model: nn.Module, train_pattern: str, device: torch.device, + n_samples: int = 256, seq_len: int = 2048) -> dict[str, Tensor]: + """Collect Hessian H = X^T X for each linear layer using training data.""" + hessians: dict[str, Tensor] = {} + n_seen: dict[str, int] = {} + hooks = [] + def make_hook(name: str): + def hook_fn(module, inp, out): + x = inp[0].detach().float() + if x.ndim == 3: + x = x.reshape(-1, x.shape[-1]) + if name not in hessians: + hessians[name] = torch.zeros(x.shape[1], x.shape[1], device=x.device, dtype=torch.float32) + n_seen[name] = 0 + hessians[name].addmm_(x.t(), x) + n_seen[name] += x.shape[0] + return hook_fn + for name, module in model.named_modules(): + if isinstance(module, (nn.Linear, CastedLinear)): + hooks.append(module.register_forward_hook(make_hook(name))) + stream = TokenStream(train_pattern) + model.eval() + with torch.no_grad(): + for _ in range(n_samples): + tokens = stream.take(seq_len + 1).to(device=device, dtype=torch.int64) + x = tokens[:-1].unsqueeze(0) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + model.forward_logits(x) + for h in hooks: + h.remove() + for name in hessians: + hessians[name] /= max(n_seen[name], 1) + return hessians +def mixed_quantize_int6_gptq(state_dict: dict[str, Tensor], int6_cats: set[str], + hessians: dict[str, Tensor]) -> tuple[dict, dict]: + """Like mixed_quantize_int6 but uses GPTQ for int6 categories when Hessian available.""" + result: dict[str, Tensor] = {} + meta: dict[str, object] = {} + gptq_count, naive_count = 0, 0 + for name, tensor in state_dict.items(): + t = tensor.detach().cpu().contiguous() + cat = _classify_param(name) + if not t.is_floating_point() or t.numel() <= 65536: + result[name] = t.to(torch.float16) if t.is_floating_point() else t + meta[name] = "passthrough" + continue + if any(p in name for p in CONTROL_TENSOR_NAME_PATTERNS): + result[name] = t.float() + meta[name] = "passthrough_ctrl" + continue + if cat in int6_cats and t.ndim == 2: + module_name = name.rsplit(".weight", 1)[0] if name.endswith(".weight") else name + H = hessians.get(module_name) + if H is not None and H.shape[0] == t.shape[1]: + q, s = gptq_quantize_weight(t, H.cpu()) + gptq_count += 1 + else: + q, s = quantize_int6_per_row(t) + naive_count += 1 + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int6"} + elif cat in int6_cats and t.ndim >= 1: + q, s = quantize_int6_per_row(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int6"} + naive_count += 1 + else: + q, s = quantize_float_tensor(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int8"} + print(f"gptq_quantize: {gptq_count} GPTQ layers, {naive_count} naive layers", flush=True) + return result, meta +def quantize_int6_per_row(t: Tensor, clip_range: int = 31) -> tuple[Tensor, Tensor]: + t32 = t.float() + if t32.ndim == 2: + best_q, best_s, best_err = None, None, float('inf') + for pct in [0.9990, 0.9995, 0.9999, 0.99999, 1.0]: + if pct < 1.0: + row_clip = torch.quantile(t32.abs(), pct, dim=1) + else: + row_clip = t32.abs().amax(dim=1) + s = (row_clip / clip_range).clamp_min(1.0 / clip_range).to(torch.float16) + q = torch.clamp(torch.round(t32 / s.float()[:, None]), -clip_range, clip_range).to(torch.int8) + recon = q.float() * s.float()[:, None] + err = (t32 - recon).pow(2).mean().item() + if err < best_err: + best_q, best_s, best_err = q, s, err + return best_q, best_s + amax = t32.abs().max().item() + scale = torch.tensor(amax / clip_range if amax > 0 else 1.0, dtype=torch.float16) + q = torch.clamp(torch.round(t32 / scale.float()), -clip_range, clip_range).to(torch.int8) + return q, scale +def mixed_quantize_int6(state_dict: dict[str, Tensor], int6_cats: set[str]): + num_layers_total = max( + (int(k.split(".")[1]) for k in state_dict if k.startswith("blocks.")), + default=0, + ) + 1 + late_k_layers = set(range(num_layers_total - 2, num_layers_total)) + result: dict[str, Tensor] = {} + meta: dict[str, object] = {} + for name, tensor in state_dict.items(): + t = tensor.detach().cpu().contiguous() + cat = _classify_param(name) + if not t.is_floating_point() or t.numel() <= 65536: + result[name] = t.to(torch.float16) if t.is_floating_point() else t + meta[name] = "passthrough" + continue + if any(p in name for p in CONTROL_TENSOR_NAME_PATTERNS): + result[name] = t.float() + meta[name] = "passthrough_ctrl" + continue + if cat in int6_cats and t.ndim >= 1: + q, s = quantize_int6_per_row(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int6"} + else: + q, s = quantize_float_tensor(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int8"} + return result, meta +def dequantize_mixed_int6(result: dict[str, Tensor], meta: dict[str, object], + template_sd: dict[str, Tensor]) -> dict[str, Tensor]: + out: dict[str, Tensor] = {} + for name, orig in template_sd.items(): + info = meta.get(name) + if info is None: + continue + orig_dtype = orig.dtype + if info in ("passthrough", "passthrough_ctrl", "passthrough_fp16"): + t = result[name] + if t.dtype == torch.float16 and orig_dtype in (torch.float32, torch.bfloat16): + t = t.to(orig_dtype) + out[name] = t + continue + q, s = result[name + ".q"], result[name + ".scale"] + if s.ndim > 0: + out[name] = (q.float() * s.float().view(q.shape[0], *([1] * (q.ndim - 1)))).to(orig_dtype) + else: + out[name] = (q.float() * float(s.item())).to(orig_dtype) + return out +def main() -> None: + global zeropower_via_newtonschulz5 + code = Path(__file__).read_text(encoding="utf-8") + args = Hyperparameters() + if args.compile_enabled: + zeropower_via_newtonschulz5 = torch.compile(zeropower_via_newtonschulz5) + distributed = "RANK" in os.environ and "WORLD_SIZE" in os.environ + rank = int(os.environ.get("RANK", "0")) + world_size = int(os.environ.get("WORLD_SIZE", "1")) + local_rank = int(os.environ.get("LOCAL_RANK", "0")) + if world_size <= 0: + raise ValueError(f"WORLD_SIZE must be positive, got {world_size}") + if 8 % world_size != 0: + raise ValueError(f"WORLD_SIZE={world_size} must divide 8 so grad_accum_steps stays integral") + grad_accum_steps = 8 // world_size + grad_scale = 1.0 / grad_accum_steps + if not torch.cuda.is_available(): + raise RuntimeError("CUDA is required") + device = torch.device("cuda", local_rank) + torch.cuda.set_device(device) + if distributed: + dist.init_process_group(backend="nccl", device_id=device) + dist.barrier() + master_process = rank == 0 + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True + from torch.backends.cuda import enable_cudnn_sdp, enable_flash_sdp, enable_math_sdp, enable_mem_efficient_sdp + enable_cudnn_sdp(False) + enable_flash_sdp(True) + enable_mem_efficient_sdp(False) + enable_math_sdp(False) + logfile = None + if master_process: + os.makedirs("logs", exist_ok=True) + logfile = f"logs/{args.run_id}.txt" + print(logfile) + def log0(msg: str, console: bool = True) -> None: + if not master_process: + return + if console: + print(msg) + if logfile is not None: + with open(logfile, "a", encoding="utf-8") as f: + print(msg, file=f) + log0(code, console=False) + log0("=" * 100, console=False) + log0(f"Running Python {sys.version}", console=False) + log0(f"Running PyTorch {torch.__version__}", console=False) + log0( + subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=False).stdout, + console=False, + ) + log0("=" * 100, console=False) + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.cuda.manual_seed_all(args.seed) + if not args.tokenizer_path.endswith(".model"): + raise ValueError(f"Script only setup for SentencePiece .model file: {args.tokenizer_path}") + sp = spm.SentencePieceProcessor(model_file=args.tokenizer_path) + if int(sp.vocab_size()) != args.vocab_size: + raise ValueError( + f"VOCAB_SIZE={args.vocab_size} does not match tokenizer vocab_size={int(sp.vocab_size())}" + ) + dataset_dir = Path(args.data_path).resolve() + actual_train_files = len(list(dataset_dir.glob("fineweb_train_*.bin"))) + effective_eval_seq_len = args.eval_seq_len if args.eval_seq_len > 0 else args.train_seq_len + val_seq_len = max(args.train_seq_len, effective_eval_seq_len) + val_tokens = load_validation_tokens(args.val_files, val_seq_len) + base_bytes_lut, has_leading_space_lut, is_boundary_token_lut = build_sentencepiece_luts( + sp, args.vocab_size, device + ) + log0(f"val_bpb:enabled tokenizer_kind=sentencepiece tokenizer_path={args.tokenizer_path}") + log0(f"train_loader:dataset:{dataset_dir.name} train_shards:{actual_train_files}") + log0(f"val_loader:shards pattern={args.val_files} tokens:{val_tokens.numel() - 1}") + CastedLinear._qat_enabled = args.qat_enabled + base_model = GPT( + vocab_size=args.vocab_size, + num_layers=args.num_layers, + model_dim=args.model_dim, + num_heads=args.num_heads, + num_kv_heads=args.num_kv_heads, + mlp_mult=args.mlp_mult, + tie_embeddings=args.tie_embeddings, + tied_embed_init_std=args.tied_embed_init_std, + logit_softcap=args.logit_softcap, + rope_base=args.rope_base, + qk_gain_init=args.qk_gain_init, + mtp_num_heads=args.mtp_num_heads, + mtp_loss_weight=args.mtp_loss_weight, + bigram_vocab_size=args.bigram_vocab_size, + bigram_dim=args.bigram_dim, + xsa_last_n=args.xsa_last_n, + rope_dims=args.rope_dims, + ln_scale=args.ln_scale, + dtg=args.dtg_enabled, + ve_enabled=args.ve_enabled, + ve_dim=args.ve_dim, + ve_layers=args.ve_layers, + mlp_act=args.mlp_act, + mlp_leaky_slope=args.mlp_leaky_slope, + f1_corr_rank=args.f1_corr_rank, + f1_corr_scale_init=args.f1_corr_scale_init, + ).to(device).bfloat16() + for module in base_model.modules(): + if isinstance(module, CastedLinear): + module.float() + restore_low_dim_params_to_fp32(base_model) + # Complementary training: downweight tokens predictable by bigrams + complement_alpha = float(os.environ.get("COMPLEMENT_ALPHA", "0")) + if complement_alpha > 0: + tracker = TrainNgramTracker(args.vocab_size, device, complement_alpha=complement_alpha) + base_model._ngram_tracker = tracker + log0(f"complementary_training:alpha={complement_alpha}") + else: + base_model._ngram_tracker = None + compiled_model = maybe_torch_compile(base_model, args) + model: nn.Module = DDP(compiled_model, device_ids=[local_rank], broadcast_buffers=False) if distributed else compiled_model + block_named_params = list(base_model.blocks.named_parameters()) + matrix_params = [ + p + for name, p in block_named_params + if p.ndim == 2 and not any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS) + ] + if base_model.mtp_num_heads > 0: + matrix_params.extend([p for p in base_model.mtp_heads.parameters() if p.ndim == 2]) + if base_model.f1_corr_in is not None and base_model.f1_corr_out is not None: + matrix_params.append(base_model.f1_corr_in.weight) + matrix_params.append(base_model.f1_corr_out.weight) + scalar_params = [ + p + for name, p in block_named_params + if p.ndim < 2 or any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS) + ] + if base_model.skip_weights.numel() > 0: + scalar_params.append(base_model.skip_weights) + scalar_params.append(base_model.smear.gate) + if base_model.bigram is not None: + scalar_params.append(base_model.bigram.scale) + if base_model.f1_corr_scale is not None: + scalar_params.append(base_model.f1_corr_scale) + token_lr = args.tied_embed_lr if args.tie_embeddings else args.embed_lr + tok_params = [{"params": [base_model.tok_emb.weight], "lr": token_lr, "base_lr": token_lr}] + if base_model.bigram is not None: + tok_params.append({"params": [base_model.bigram.embed.weight], "lr": token_lr, "base_lr": token_lr}) + if base_model.bigram.proj is not None: + matrix_params.append(base_model.bigram.proj.weight) + if base_model.ve_shared is not None: + tok_params.append({"params": [base_model.ve_shared.embed.weight], "lr": token_lr, "base_lr": token_lr}) + if base_model.ve_shared.proj is not None: + matrix_params.append(base_model.ve_shared.proj.weight) + scalar_params.append(base_model.ve_shared.scale) + for s in base_model.ve_layer_scales: + scalar_params.append(s) + optimizer_tok = torch.optim.AdamW( + tok_params, + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + weight_decay=args.adam_wd, + fused=True, + ) + optimizer_muon = Muon( + matrix_params, + lr=args.matrix_lr, + momentum=args.muon_momentum, + backend_steps=args.muon_backend_steps, + weight_decay=args.muon_wd, + ) + for group in optimizer_muon.param_groups: + group["base_lr"] = args.matrix_lr + optimizer_scalar = torch.optim.AdamW( + [{"params": scalar_params, "lr": args.scalar_lr, "base_lr": args.scalar_lr}], + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + weight_decay=args.adam_wd, + fused=True, + ) + optimizers: list[torch.optim.Optimizer] = [optimizer_tok, optimizer_muon, optimizer_scalar] + if base_model.lm_head is not None: + optimizer_head = torch.optim.Adam( + [{"params": [base_model.lm_head.weight], "lr": args.head_lr, "base_lr": args.head_lr}], + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + fused=True, + ) + optimizers.insert(1, optimizer_head) + n_params = sum(p.numel() for p in base_model.parameters()) + f1_corr_params = 0 + if base_model.f1_corr_in is not None and base_model.f1_corr_out is not None: + f1_corr_params = int(base_model.f1_corr_in.weight.numel() + base_model.f1_corr_out.weight.numel()) + est_corr_int6_bytes = 0 + if args.f1_corr_rank > 0: + # int8 payload stores int6 values + per-row fp16 scales. + est_corr_int6_bytes = ( + args.f1_corr_rank * (args.model_dim + args.vocab_size) + + 2 * (args.f1_corr_rank + args.vocab_size) + ) + log0(f"model_params:{n_params}") + log0( + f"f1_corr:rank={args.f1_corr_rank} params={f1_corr_params} " + f"est_int6_bytes~{est_corr_int6_bytes}" + ) + log0(f"mlp_act:{args.mlp_act} mlp_leaky_slope:{args.mlp_leaky_slope}") + log0(f"XSA:last_{args.xsa_last_n} world_size:{world_size} grad_accum_steps:{grad_accum_steps}") + log0(f"num_heads:{args.num_heads} num_kv_heads:{args.num_kv_heads} embed_lr:{token_lr} matrix_lr:{args.matrix_lr}") + log0( + f"train_batch_tokens:{args.train_batch_tokens} train_seq_len:{args.train_seq_len} " + f"iterations:{args.iterations} warmup_steps:{args.warmup_steps} " + f"max_wallclock_seconds:{args.max_wallclock_seconds:.3f}" + ) + log0(f"compile:enabled={int(args.compile_enabled)} fullgraph={int(args.compile_fullgraph)}") + log0(f"seed:{args.seed}") + if args.ngram_eval_order >= 2: + log0( + f"ngram_eval:order={args.ngram_eval_order} alpha={args.ngram_eval_alpha} " + f"min_count={args.ngram_eval_min_count} buckets={args.ngram_eval_buckets}" + ) + train_loader = DistributedTokenLoader(args.train_files, rank, world_size, device) + def zero_grad_all() -> None: + for opt in optimizers: + opt.zero_grad(set_to_none=True) + max_wallclock_ms = 1000.0 * args.max_wallclock_seconds if args.max_wallclock_seconds > 0 else None + def lr_mul(step: int, elapsed_ms: float) -> float: + if args.warmdown_iters <= 0: + return 1.0 + if max_wallclock_ms is None: + warmdown_start = max(args.iterations - args.warmdown_iters, 0) + return max((args.iterations - step) / max(args.warmdown_iters, 1), 0.0) if warmdown_start <= step < args.iterations else 1.0 + step_ms = elapsed_ms / max(step, 1) + warmdown_ms = args.warmdown_iters * step_ms + remaining_ms = max(max_wallclock_ms - elapsed_ms, 0.0) + return remaining_ms / max(warmdown_ms, 1e-9) if remaining_ms <= warmdown_ms else 1.0 + if args.warmup_steps > 0: + initial_model_state = {name: tensor.detach().cpu().clone() for name, tensor in base_model.state_dict().items()} + initial_optimizer_states = [copy.deepcopy(opt.state_dict()) for opt in optimizers] + model.train() + for warmup_step in range(args.warmup_steps): + zero_grad_all() + for micro_step in range(grad_accum_steps): + if distributed: + model.require_backward_grad_sync = micro_step == grad_accum_steps - 1 + x, y = train_loader.next_batch(args.train_batch_tokens, args.train_seq_len, grad_accum_steps) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + warmup_loss = model(x, y) + (warmup_loss * grad_scale).backward() + for opt in optimizers: + opt.step() + zero_grad_all() + if args.warmup_steps <= 20 or (warmup_step + 1) % 10 == 0 or warmup_step + 1 == args.warmup_steps: + log0(f"warmup_step:{warmup_step + 1}/{args.warmup_steps}") + base_model.load_state_dict(initial_model_state, strict=True) + for opt, state in zip(optimizers, initial_optimizer_states, strict=True): + opt.load_state_dict(state) + zero_grad_all() + if distributed: + model.require_backward_grad_sync = True + train_loader = DistributedTokenLoader(args.train_files, rank, world_size, device) + swa_state: dict[str, Tensor] | None = None + swa_count = 0 + ema_state = {name: t.detach().float().clone() for name, t in base_model.state_dict().items()} + ema_decay = 0.997 + training_time_ms = 0.0 + stop_after_step: int | None = None + torch.cuda.synchronize() + t0 = time.perf_counter() + step = 0 + while True: + last_step = step == args.iterations or (stop_after_step is not None and step >= stop_after_step) + should_validate = last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0) + if should_validate: + torch.cuda.synchronize() + training_time_ms += 1000.0 * (time.perf_counter() - t0) + val_loss, val_bpb = eval_val( + args, + model, + rank, + world_size, + device, + grad_accum_steps, + val_tokens, + base_bytes_lut, + has_leading_space_lut, + is_boundary_token_lut, + ) + log0( + f"step:{step}/{args.iterations} val_loss:{val_loss:.4f} val_bpb:{val_bpb:.4f} " + f"train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms / max(step, 1):.2f}ms" + ) + torch.cuda.synchronize() + t0 = time.perf_counter() + if last_step: + if stop_after_step is not None and step < args.iterations: + log0( + f"stopping_early: wallclock_cap train_time:{training_time_ms:.0f}ms " + f"step:{step}/{args.iterations}" + ) + break + elapsed_ms = training_time_ms + 1000.0 * (time.perf_counter() - t0) + scale = lr_mul(step, elapsed_ms) + if args.late_qat_threshold > 0 and scale < args.late_qat_threshold and not CastedLinear._qat_enabled: + CastedLinear._qat_enabled = True + log0(f"late_qat:enabled step:{step} scale:{scale:.4f}") + zero_grad_all() + train_loss = torch.zeros((), device=device) + for micro_step in range(grad_accum_steps): + if distributed: + model.require_backward_grad_sync = micro_step == grad_accum_steps - 1 + x, y = train_loader.next_batch(args.train_batch_tokens, args.train_seq_len, grad_accum_steps) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + loss = model(x, y) + train_loss += loss.detach() + loss.backward() + if base_model._ngram_tracker is not None: + base_model._ngram_tracker.update(x, y) + train_loss /= grad_accum_steps + frac = min(step / args.muon_momentum_warmup_steps, 1.0) if args.muon_momentum_warmup_steps > 0 else 1.0 + muon_momentum = (1 - frac) * args.muon_momentum_warmup_start + frac * args.muon_momentum + for group in optimizer_muon.param_groups: + group["momentum"] = muon_momentum + for opt in optimizers: + for group in opt.param_groups: + group["lr"] = group["base_lr"] * scale + if args.grad_clip_norm > 0: + torch.nn.utils.clip_grad_norm_(base_model.parameters(), args.grad_clip_norm) + for opt in optimizers: + opt.step() + zero_grad_all() + # EMA update + with torch.no_grad(): + for name, t in base_model.state_dict().items(): + ema_state[name].mul_(ema_decay).add_(t.detach().float(), alpha=1.0 - ema_decay) + step += 1 + approx_training_time_ms = training_time_ms + 1000.0 * (time.perf_counter() - t0) + if args.swa_enabled and scale < 0.2 and step % args.swa_every == 0: + if swa_state is None: + swa_state = {name: t.detach().cpu().clone() for name, t in base_model.state_dict().items()} + swa_count = 1 + log0(f"swa:start step:{step}") + else: + for name, t in base_model.state_dict().items(): + swa_state[name] += t.detach().cpu() + swa_count += 1 + should_log_train = ( + args.train_log_every > 0 + and (step <= 10 or step % args.train_log_every == 0 or stop_after_step is not None) + ) + if should_log_train: + log0( + f"step:{step}/{args.iterations} train_loss:{train_loss.item():.4f} " + f"train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms / step:.2f}ms" + ) + reached_cap = max_wallclock_ms is not None and approx_training_time_ms >= max_wallclock_ms + if distributed and max_wallclock_ms is not None: + reached_cap_tensor = torch.tensor(int(reached_cap), device=device) + dist.all_reduce(reached_cap_tensor, op=dist.ReduceOp.MAX) + reached_cap = bool(reached_cap_tensor.item()) + if stop_after_step is None and reached_cap: + stop_after_step = step + log0( + f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB " + f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB" + ) + # GPTQ calibration: collect Hessians from training data DURING training phase + # (must happen before training ends to comply with eval-time data access rules) + log0("gptq:calibrating with training data...") + t_gptq = time.perf_counter() + gptq_hessians = gptq_calibrate(base_model, args.train_files, device, n_samples=256, seq_len=args.train_seq_len) + log0(f"gptq:calibrated {len(gptq_hessians)} layers in {time.perf_counter()-t_gptq:.1f}s") + if args.distill_enabled and args.distill_steps > 0: + log0( + f"distill:start steps:{args.distill_steps} lr_factor:{args.distill_lr_factor} " + f"temp:{args.distill_temperature} alpha:{args.distill_alpha} kl_clip:{args.distill_kl_clip}" + ) + current_state = base_model.state_dict() + teacher_state = {name: t.to(dtype=current_state[name].dtype) for name, t in ema_state.items()} + teacher_model = GPT( + vocab_size=args.vocab_size, num_layers=args.num_layers, model_dim=args.model_dim, + num_heads=args.num_heads, num_kv_heads=args.num_kv_heads, mlp_mult=args.mlp_mult, + tie_embeddings=args.tie_embeddings, tied_embed_init_std=args.tied_embed_init_std, + logit_softcap=args.logit_softcap, rope_base=args.rope_base, qk_gain_init=args.qk_gain_init, + mtp_num_heads=args.mtp_num_heads, mtp_loss_weight=args.mtp_loss_weight, + bigram_vocab_size=args.bigram_vocab_size, bigram_dim=args.bigram_dim, + xsa_last_n=args.xsa_last_n, rope_dims=args.rope_dims, ln_scale=args.ln_scale, dtg=args.dtg_enabled, + ve_enabled=args.ve_enabled, ve_dim=args.ve_dim, ve_layers=args.ve_layers, + mlp_act=args.mlp_act, mlp_leaky_slope=args.mlp_leaky_slope, + f1_corr_rank=args.f1_corr_rank, f1_corr_scale_init=args.f1_corr_scale_init, + ).to(device).bfloat16() + for m in teacher_model.modules(): + if isinstance(m, CastedLinear): + m.float() + restore_low_dim_params_to_fp32(teacher_model) + teacher_model.load_state_dict(teacher_state, strict=True) + teacher_model.eval() + for p in teacher_model.parameters(): + p.requires_grad_(False) + compiled_teacher_logits = maybe_torch_compile(teacher_model.forward_logits, args) + model.train() + T = args.distill_temperature + alpha = args.distill_alpha + for d_step in range(args.distill_steps): + zero_grad_all() + for opt in optimizers: + for group in opt.param_groups: + group["lr"] = group["base_lr"] * args.distill_lr_factor + x, y = train_loader.next_batch(args.train_batch_tokens, args.train_seq_len, grad_accum_steps) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + student_logits = base_model.forward_logits(x) + with torch.no_grad(): + teacher_logits = compiled_teacher_logits(x) + student_log_probs = F.log_softmax(student_logits.float() / T, dim=-1) + teacher_probs = F.softmax(teacher_logits.float() / T, dim=-1) + token_kl = F.kl_div(student_log_probs, teacher_probs, reduction="none").sum(dim=-1) + kl_loss = token_kl.mean() * (T * T) + if args.distill_kl_clip > 0: + kl_loss = torch.clamp(kl_loss, max=args.distill_kl_clip) + ce_loss = F.cross_entropy( + student_logits.reshape(-1, student_logits.size(-1)).float(), + y.reshape(-1), + reduction="mean", + ) + loss = alpha * kl_loss + (1.0 - alpha) * ce_loss + (loss * grad_scale).backward() + if world_size > 1: + for p in base_model.parameters(): + if p.grad is not None: + dist.all_reduce(p.grad, op=dist.ReduceOp.AVG) + if args.grad_clip_norm > 0: + torch.nn.utils.clip_grad_norm_(base_model.parameters(), args.grad_clip_norm) + for opt in optimizers: + opt.step() + zero_grad_all() + with torch.no_grad(): + for name, t in base_model.state_dict().items(): + ema_state[name].mul_(ema_decay).add_(t.detach().float(), alpha=1.0 - ema_decay) + if (d_step + 1) % 8 == 0 or d_step == 0: + log0( + f"distill:step:{d_step + 1}/{args.distill_steps} " + f"kl:{kl_loss.item():.4f} ce:{ce_loss.item():.4f} total:{loss.item():.4f}" + ) + del teacher_model, compiled_teacher_logits + torch.cuda.empty_cache() + log0("distill:done") + # Apply EMA weights (better than SWA alone per PR#401) + log0("ema:applying EMA weights") + current_state = base_model.state_dict() + avg_state = {name: t.to(dtype=current_state[name].dtype) for name, t in ema_state.items()} + base_model.load_state_dict(avg_state, strict=True) + torch.cuda.synchronize() + t_diag = time.perf_counter() + diag_val_loss, diag_val_bpb = eval_val( + args, compiled_model, rank, world_size, device, grad_accum_steps, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + ) + torch.cuda.synchronize() + log0( + f"DIAGNOSTIC post_ema val_loss:{diag_val_loss:.4f} val_bpb:{diag_val_bpb:.4f} " + f"eval_time:{1000.0 * (time.perf_counter() - t_diag):.0f}ms" + ) + full_state_dict = base_model.state_dict() + export_sd = {k: v for k, v in full_state_dict.items() if "mtp_heads" not in k} + excluded_mtp = sum(int(t.numel()) for k, t in full_state_dict.items() if "mtp_heads" in k) + if excluded_mtp > 0: + log0(f"export_excluding_mtp_params:{excluded_mtp}") + if master_process: + torch.save(export_sd, "final_model.pt") + model_bytes = os.path.getsize("final_model.pt") + code_bytes = len(code.encode("utf-8")) + log0(f"Serialized model: {model_bytes} bytes") + log0(f"Code size: {code_bytes} bytes") + sd_cpu = {k: v.detach().cpu() for k, v in export_sd.items()} + # GPTQ quantization using Hessians collected during training phase (no training data access here) + quant_result, quant_meta = mixed_quantize_int6_gptq(sd_cpu, {"mlp", "attn", "aux"}, gptq_hessians) + quant_buf = io.BytesIO() + torch.save({"w": quant_result, "m": quant_meta}, quant_buf) + quant_raw = quant_buf.getvalue() + quant_blob = zstandard.ZstdCompressor(level=22).compress(quant_raw) if _COMPRESSOR == "zstd" else zlib.compress(quant_raw, 9) + if master_process: + with open("final_model.int6.ptz", "wb") as f: + f.write(quant_blob) + quant_file_bytes = len(quant_blob) + code_bytes = len(code.encode("utf-8")) + log0(f"Serialized model int6+{_COMPRESSOR}: {quant_file_bytes} bytes") + log0(f"Total submission size int6+{_COMPRESSOR}: {quant_file_bytes + code_bytes} bytes") + log0(f"Total submission size int8+zlib: {quant_file_bytes + code_bytes} bytes") + if distributed: + dist.barrier() + with open("final_model.int6.ptz", "rb") as f: + quant_blob_disk = f.read() + quant_state = torch.load( + io.BytesIO(zstandard.ZstdDecompressor().decompress(quant_blob_disk) if _COMPRESSOR == "zstd" else zlib.decompress(quant_blob_disk)), + map_location="cpu", + ) + deq_state = dequantize_mixed_int6(quant_state["w"], quant_state["m"], sd_cpu) + eval_model = GPT( + vocab_size=args.vocab_size, num_layers=args.num_layers, model_dim=args.model_dim, + num_heads=args.num_heads, num_kv_heads=args.num_kv_heads, mlp_mult=args.mlp_mult, + tie_embeddings=args.tie_embeddings, tied_embed_init_std=args.tied_embed_init_std, + logit_softcap=args.logit_softcap, rope_base=args.rope_base, qk_gain_init=args.qk_gain_init, + mtp_num_heads=0, mtp_loss_weight=0.0, + bigram_vocab_size=args.bigram_vocab_size, bigram_dim=args.bigram_dim, + xsa_last_n=args.xsa_last_n, # must match training model + rope_dims=args.rope_dims, ln_scale=args.ln_scale, dtg=args.dtg_enabled, + ve_enabled=args.ve_enabled, ve_dim=args.ve_dim, ve_layers=args.ve_layers, + mlp_act=args.mlp_act, mlp_leaky_slope=args.mlp_leaky_slope, + f1_corr_rank=args.f1_corr_rank, f1_corr_scale_init=args.f1_corr_scale_init, + ).to(device).bfloat16() + for m in eval_model.modules(): + if isinstance(m, CastedLinear): + m.float() + restore_low_dim_params_to_fp32(eval_model) + eval_model.load_state_dict(deq_state, strict=True) + compiled_eval = maybe_torch_compile(eval_model, args) + torch.cuda.synchronize() + t_qeval = time.perf_counter() + q_val_loss, q_val_bpb = eval_val( + args, compiled_eval, rank, world_size, device, grad_accum_steps, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + eval_seq_len=effective_eval_seq_len, + ) + torch.cuda.synchronize() + log0( + f"final_int6_roundtrip val_loss:{q_val_loss:.4f} val_bpb:{q_val_bpb:.4f} " + f"eval_time:{1000.0 * (time.perf_counter() - t_qeval):.0f}ms" + ) + log0(f"final_int6_roundtrip_exact val_loss:{q_val_loss:.8f} val_bpb:{q_val_bpb:.8f}") + sw_seq_len = effective_eval_seq_len + if args.eval_stride > 0 and args.eval_stride < sw_seq_len: + torch.cuda.synchronize() + t_slide = time.perf_counter() + sw_val_loss, sw_val_bpb = eval_val_sliding( + args, eval_model, rank, world_size, device, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + stride=args.eval_stride, + eval_seq_len=sw_seq_len, + ) + torch.cuda.synchronize() + log0( + f"final_int6_sliding_window val_loss:{sw_val_loss:.4f} val_bpb:{sw_val_bpb:.4f} " + f"stride:{args.eval_stride} eval_time:{1000.0 * (time.perf_counter() - t_slide):.0f}ms" + ) + log0(f"final_int6_sliding_window_exact val_loss:{sw_val_loss:.8f} val_bpb:{sw_val_bpb:.8f}") + log0(f"final_int8_zlib_roundtrip_exact val_loss:{sw_val_loss:.8f} val_bpb:{sw_val_bpb:.8f}") + if args.ngram_eval_order >= 2: + if distributed: + dist.barrier() + torch.cuda.synchronize() + t_ng = time.perf_counter() + ng_loss, ng_bpb, ng_coverage = eval_val_sliding_hashed_ngram( + args, + eval_model, + rank, + world_size, + device, + val_tokens, + base_bytes_lut, + has_leading_space_lut, + is_boundary_token_lut, + stride=args.eval_stride, + order=args.ngram_eval_order, + alpha=args.ngram_eval_alpha, + min_count=args.ngram_eval_min_count, + buckets=args.ngram_eval_buckets, + max_seconds=args.ngram_eval_max_seconds, + eval_seq_len=sw_seq_len, + ) + if rank == 0: + torch.cuda.synchronize() + ng_eval_ms = 1000.0 * (time.perf_counter() - t_ng) + if ng_coverage >= 0.999999: + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order} val_loss:{ng_loss:.4f} " + f"val_bpb:{ng_bpb:.4f} eval_time:{ng_eval_ms:.0f}ms" + ) + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order}_exact " + f"val_loss:{ng_loss:.8f} val_bpb:{ng_bpb:.8f}" + ) + else: + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order}_partial val_loss:{ng_loss:.4f} " + f"val_bpb:{ng_bpb:.4f} coverage:{ng_coverage:.4f} eval_time:{ng_eval_ms:.0f}ms" + ) + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order}_partial_exact " + f"val_loss:{ng_loss:.8f} val_bpb:{ng_bpb:.8f} coverage:{ng_coverage:.8f}" + ) + if distributed: + dist.barrier() + if distributed: + dist.destroy_process_group() +if __name__ == "__main__": + main() diff --git a/experiments/B_wing/bwing_entropy_shift/HYPOTHESIS.md b/experiments/B_wing/bwing_entropy_shift/HYPOTHESIS.md new file mode 100644 index 000000000..2d2c8d5bf --- /dev/null +++ b/experiments/B_wing/bwing_entropy_shift/HYPOTHESIS.md @@ -0,0 +1,23 @@ +# B-WING ENTROPY-SHIFT — Per-Order Center Shift + +## Hypothesis +PR #809 shifts the entropy sigmoid center DOWN for higher orders: + center = entropy_center - 0.25 * (order - min_order) + +For order 9, min_order 2: center = 3.0 - 0.25*7 = 1.25 +This means even when the model is fairly confident (entropy ~1.5), high-order matches +still get substantial alpha. Our flat center=3.0 for all orders means high-order matches +on confident tokens get almost no alpha boost. + +## Changes from X-WING baseline +1. Add per-order entropy center shift: center = ent_center - 0.25*(order - min_order) +2. Keep everything else identical to X-WING baseline + +## Expected impact +Should help most on "easy" tokens where the model is confident but an 8/9-gram +match provides even better information. These tokens are currently under-mixed. + +## What NOT to change +- Keep alpha range at 0.20-0.75 (isolate this variable) +- Keep cubric 3D +- Keep architecture diff --git a/experiments/B_wing/bwing_entropy_shift/run.sh b/experiments/B_wing/bwing_entropy_shift/run.sh new file mode 100755 index 000000000..676387b5e --- /dev/null +++ b/experiments/B_wing/bwing_entropy_shift/run.sh @@ -0,0 +1,56 @@ +#!/bin/bash +set -euo pipefail +# B-WING ENTROPY SHIFT: Per-order entropy center shift from PR #809 +# Changes: entropy center shifts DOWN for higher orders +# Keep alpha range at 0.20-0.75, keep cubric 3D (isolate this variable) + +SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd -- "${SCRIPT_DIR}/../../.." && pwd)" +cd "${REPO_ROOT}" +export PYTHONPATH="${REPO_ROOT}/flash-attention/hopper:${PYTHONPATH:-}" + +SEED="${SEED:-1337}" +NPROC_PER_NODE="${NPROC_PER_NODE:-8}" + +echo "============================================" +echo " B-WING ENTROPY SHIFT — Per-Order Center" +echo " Seed: ${SEED}" +echo " 3D cubric: order × entropy × count (54 mults)" +echo " Complementary training: alpha=0.5" +echo " Eval alpha: 0.20-0.75 + entropy shift | Orders: 2-9" +echo "============================================" + +SEED="$SEED" \ +F1_CORR_RANK=0 \ +DISTILL_ENABLED=0 \ +MLP_ACT=leaky_relu_sq \ +MLP_LEAKY_SLOPE=0.5 \ +XSA_LAST_N=4 \ +BIGRAM_VOCAB_SIZE=1536 \ +TTT_EVAL_ENABLED=0 \ +ROPE_DIMS=24 \ +VAL_LOSS_EVERY=20000 \ +TRAIN_LOG_EVERY=1000 \ +SWA_EVERY=100 \ +COMPLEMENT_ALPHA=0.5 \ +NGRAM_EVAL_ORDER=9 \ +NGRAM_EVAL_MIN_ORDER=2 \ +NGRAM_EVAL_ADAPTIVE=1 \ +NGRAM_EVAL_ALPHA=0.30 \ +NGRAM_EVAL_ALPHA_MIN=0.20 \ +NGRAM_EVAL_ALPHA_MAX=0.75 \ +NGRAM_EVAL_ENTROPY_CENTER=3.0 \ +NGRAM_EVAL_ENTROPY_SCALE=2.0 \ +NGRAM_EVAL_MIN_COUNT=2 \ +NGRAM_EVAL_BUCKETS=8388608 \ +NGRAM_EVAL_MAX_SECONDS=300 \ +CUBRIC_CADENCE="${CUBRIC_CADENCE:-32}" \ +NGRAM_ENTROPY_SHIFT=1 \ +COMPILE_FULLGRAPH=0 \ +torchrun --standalone --nproc_per_node="${NPROC_PER_NODE}" \ + "${SCRIPT_DIR}/train_gpt.py" \ + 2>&1 | tee "logs/bwing_entshift_s${SEED}_$(date +%Y%m%d_%H%M%S).log" + +echo "============================================" +echo " DONE" +echo "============================================" diff --git a/experiments/B_wing/bwing_entropy_shift/train_gpt.py b/experiments/B_wing/bwing_entropy_shift/train_gpt.py new file mode 100644 index 000000000..01be48c74 --- /dev/null +++ b/experiments/B_wing/bwing_entropy_shift/train_gpt.py @@ -0,0 +1,2125 @@ +from __future__ import annotations +import copy +import glob +import io +import math +import os +import random +import subprocess +import sys +import time +import uuid +import zlib +from pathlib import Path +try: + import zstandard + _COMPRESSOR = "zstd" +except ImportError: + _COMPRESSOR = "zlib" +import numpy as np +import sentencepiece as spm +import torch +import torch.distributed as dist +import torch.nn.functional as F +from torch import Tensor, nn +from torch.nn.parallel import DistributedDataParallel as DDP +try: + from flash_attn_interface import flash_attn_func as flash_attn_3_func +except ImportError: + def flash_attn_3_func(q, k, v, causal=False): + # q: (B, T, Hq, D), k/v: (B, T, Hkv, D) — expand KV for GQA + q2 = q.transpose(1, 2) # (B, Hq, T, D) + k2 = k.transpose(1, 2) # (B, Hkv, T, D) + v2 = v.transpose(1, 2) + if k2.size(1) != q2.size(1): + rep = q2.size(1) // k2.size(1) + k2 = k2.repeat_interleave(rep, dim=1) + v2 = v2.repeat_interleave(rep, dim=1) + out = torch.nn.functional.scaled_dot_product_attention(q2, k2, v2, is_causal=causal) + return out.transpose(1, 2) +class Hyperparameters: + data_path = os.environ.get("DATA_PATH", "./data/datasets/fineweb10B_sp1024") + train_files = os.path.join(data_path, "fineweb_train_*.bin") + val_files = os.path.join(data_path, "fineweb_val_*.bin") + tokenizer_path = os.environ.get("TOKENIZER_PATH", "./data/tokenizers/fineweb_1024_bpe.model") + run_id = os.environ.get("RUN_ID", str(uuid.uuid4())) + seed = int(os.environ.get("SEED", 1337)) + val_batch_size = int(os.environ.get("VAL_BATCH_SIZE", 524_288)) + val_loss_every = int(os.environ.get("VAL_LOSS_EVERY", 4000)) + train_log_every = int(os.environ.get("TRAIN_LOG_EVERY", 500)) + iterations = int(os.environ.get("ITERATIONS", 20000)) + warmdown_iters = int(os.environ.get("WARMDOWN_ITERS", 3500)) + warmup_steps = int(os.environ.get("WARMUP_STEPS", 20)) + train_batch_tokens = int(os.environ.get("TRAIN_BATCH_TOKENS", 786_432)) + train_seq_len = int(os.environ.get("TRAIN_SEQ_LEN", 2048)) + eval_seq_len = int(os.environ.get("EVAL_SEQ_LEN", 2048)) + max_wallclock_seconds = float(os.environ.get("MAX_WALLCLOCK_SECONDS", 600.0)) + qk_gain_init = float(os.environ.get("QK_GAIN_INIT", 1.5)) + vocab_size = int(os.environ.get("VOCAB_SIZE", 1024)) + num_layers = int(os.environ.get("NUM_LAYERS", 11)) + num_kv_heads = int(os.environ.get("NUM_KV_HEADS", 4)) + model_dim = int(os.environ.get("MODEL_DIM", 512)) + num_heads = int(os.environ.get("NUM_HEADS", 8)) + mlp_mult = float(os.environ.get("MLP_MULT", 3.0)) + mlp_act = os.environ.get("MLP_ACT", "relu_sq").lower() + mlp_leaky_slope = float(os.environ.get("MLP_LEAKY_SLOPE", 0.5)) + tie_embeddings = bool(int(os.environ.get("TIE_EMBEDDINGS", "1"))) + rope_base = float(os.environ.get("ROPE_BASE", 10000.0)) + logit_softcap = float(os.environ.get("LOGIT_SOFTCAP", 30.0)) + embed_lr = float(os.environ.get("EMBED_LR", 0.6)) + head_lr = float(os.environ.get("HEAD_LR", 0.008)) + tied_embed_lr = float(os.environ.get("TIED_EMBED_LR", 0.035)) + tied_embed_init_std = float(os.environ.get("TIED_EMBED_INIT_STD", 0.005)) + matrix_lr = float(os.environ.get("MATRIX_LR", 0.025)) + scalar_lr = float(os.environ.get("SCALAR_LR", 0.025)) + muon_momentum = float(os.environ.get("MUON_MOMENTUM", 0.99)) + muon_backend_steps = int(os.environ.get("MUON_BACKEND_STEPS", 5)) + muon_momentum_warmup_start = float(os.environ.get("MUON_MOMENTUM_WARMUP_START", 0.92)) + muon_momentum_warmup_steps = int(os.environ.get("MUON_MOMENTUM_WARMUP_STEPS", 1500)) + beta1 = float(os.environ.get("BETA1", 0.9)) + beta2 = float(os.environ.get("BETA2", 0.95)) + adam_eps = float(os.environ.get("ADAM_EPS", 1e-8)) + grad_clip_norm = float(os.environ.get("GRAD_CLIP_NORM", 0.3)) + eval_stride = int(os.environ.get("EVAL_STRIDE", 64)) + mtp_num_heads = int(os.environ.get("MTP_NUM_HEADS", 0)) + mtp_loss_weight = float(os.environ.get("MTP_LOSS_WEIGHT", 0.2)) + muon_beta2 = float(os.environ.get("MUON_BETA2", 0.95)) + swa_enabled = bool(int(os.environ.get("SWA_ENABLED", "1"))) + swa_every = int(os.environ.get("SWA_EVERY", 50)) # tighter: collect more recent checkpoints + muon_wd = float(os.environ.get("MUON_WD", 0.04)) + adam_wd = float(os.environ.get("ADAM_WD", 0.04)) + qat_enabled = bool(int(os.environ.get("QAT_ENABLED", "0"))) + bigram_vocab_size = int(os.environ.get("BIGRAM_VOCAB_SIZE", 2048)) + bigram_dim = int(os.environ.get("BIGRAM_DIM", 128)) + xsa_last_n = int(os.environ.get("XSA_LAST_N", 11)) # XSA on ALL 11 layers + rope_dims = int(os.environ.get("ROPE_DIMS", 16)) + ln_scale = bool(int(os.environ.get("LN_SCALE", "1"))) + dtg_enabled = bool(int(os.environ.get("DTG_ENABLED", "0"))) + late_qat_threshold = float(os.environ.get("LATE_QAT_THRESHOLD", 0.5)) + ve_enabled = bool(int(os.environ.get("VE_ENABLED", "1"))) + ve_dim = int(os.environ.get("VE_DIM", 128)) + ve_layers = os.environ.get("VE_LAYERS", "9,10") + # F1 capacity add-on: low-rank correction head (active at inference). + # Approx extra params ~= rank * (model_dim + vocab_size). + f1_corr_rank = int(os.environ.get("F1_CORR_RANK", 0)) + f1_corr_scale_init = float(os.environ.get("F1_CORR_SCALE_INIT", 0.10)) + # Post-train self-distillation: EMA teacher -> student. + distill_enabled = bool(int(os.environ.get("DISTILL_ENABLED", "0"))) + distill_steps = int(os.environ.get("DISTILL_STEPS", 24)) + distill_lr_factor = float(os.environ.get("DISTILL_LR_FACTOR", 0.02)) + distill_temperature = float(os.environ.get("DISTILL_TEMPERATURE", 1.5)) + distill_alpha = float(os.environ.get("DISTILL_ALPHA", 0.60)) + distill_kl_clip = float(os.environ.get("DISTILL_KL_CLIP", 10.0)) + # Optional legal score-first hashed n-gram interpolation at eval time. + # Multi-order backoff (2..max_order) with entropy-adaptive alpha. + # Alpha depends only on model entropy (no target/label access). + ngram_eval_order = int(os.environ.get("NGRAM_EVAL_ORDER", 0)) # 0=off, max order for backoff + ngram_eval_min_order = int(os.environ.get("NGRAM_EVAL_MIN_ORDER", 2)) # min order for backoff + ngram_eval_alpha = float(os.environ.get("NGRAM_EVAL_ALPHA", 0.30)) # base alpha (or fixed if adaptive off) + ngram_eval_adaptive = bool(int(os.environ.get("NGRAM_EVAL_ADAPTIVE", "1"))) # entropy-adaptive alpha + ngram_eval_alpha_min = float(os.environ.get("NGRAM_EVAL_ALPHA_MIN", 0.05)) # alpha floor (confident model) + ngram_eval_alpha_max = float(os.environ.get("NGRAM_EVAL_ALPHA_MAX", 0.60)) # alpha ceiling (uncertain model) + ngram_eval_entropy_center = float(os.environ.get("NGRAM_EVAL_ENTROPY_CENTER", 4.0)) # sigmoid center + ngram_eval_entropy_scale = float(os.environ.get("NGRAM_EVAL_ENTROPY_SCALE", 2.0)) # sigmoid steepness + ngram_eval_min_count = int(os.environ.get("NGRAM_EVAL_MIN_COUNT", 2)) + ngram_eval_buckets = int(os.environ.get("NGRAM_EVAL_BUCKETS", 4_194_304)) + ngram_eval_max_seconds = float(os.environ.get("NGRAM_EVAL_MAX_SECONDS", 0.0)) + ngram_entropy_shift = bool(int(os.environ.get("NGRAM_ENTROPY_SHIFT", "0"))) # per-order center shift + cubric_cadence = int(os.environ.get("CUBRIC_CADENCE", 0)) + compile_enabled = bool(int(os.environ.get("COMPILE_ENABLED", "1"))) + compile_fullgraph = bool(int(os.environ.get("COMPILE_FULLGRAPH", "1"))) +def maybe_torch_compile(obj, args: Hyperparameters): + if not args.compile_enabled: + return obj + return torch.compile(obj, dynamic=False, fullgraph=args.compile_fullgraph) +class TrainNgramTracker: + """Complementary training: track bigram stats, downweight tokens n-grams can predict.""" + def __init__(self, vocab_size: int, device: torch.device, complement_alpha: float = 0.5): + self.V = vocab_size + self.alpha = complement_alpha + self.bi_counts = torch.zeros(vocab_size, vocab_size, device=device, dtype=torch.float32) + self.bi_totals = torch.zeros(vocab_size, device=device, dtype=torch.float32) + @torch.no_grad() + def update(self, x: Tensor, y: Tensor): + xf = x.reshape(-1) + yf = y.reshape(-1) + ones = torch.ones(xf.numel(), device=xf.device, dtype=torch.float32) + self.bi_counts.reshape(-1).scatter_add_(0, xf * self.V + yf, ones) + self.bi_totals.scatter_add_(0, xf, ones) + def get_weights(self, x: Tensor, y: Tensor) -> Tensor: + xf = x.reshape(-1) + yf = y.reshape(-1) + total = self.bi_totals[xf] + count = self.bi_counts.reshape(-1)[xf * self.V + yf] + ngram_prob = count / (total + 1) + return (1.0 - self.alpha * ngram_prob).clamp(min=0.1) +def zeropower_via_newtonschulz5(G: Tensor, steps: int = 10, eps: float = 1e-7) -> Tensor: + a, b, c = (3.4445, -4.7750, 2.0315) + X = G.bfloat16() + X /= X.norm() + eps + transposed = G.size(0) > G.size(1) + if transposed: + X = X.T + for _ in range(steps): + A = X @ X.T + B = b * A + c * A @ A + X = a * X + B @ X + return X.T if transposed else X +class Muon(torch.optim.Optimizer): + def __init__(self, params, lr: float, momentum: float, backend_steps: int, + nesterov: bool = True, weight_decay: float = 0.0): + super().__init__( + params, + dict(lr=lr, momentum=momentum, backend_steps=backend_steps, + nesterov=nesterov, weight_decay=weight_decay), + ) + @torch.no_grad() + def step(self, closure=None): + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + distributed = dist.is_available() and dist.is_initialized() + world_size = dist.get_world_size() if distributed else 1 + rank = dist.get_rank() if distributed else 0 + for group in self.param_groups: + params = group["params"] + if not params: + continue + lr = group["lr"] + momentum = group["momentum"] + backend_steps = group["backend_steps"] + nesterov = group["nesterov"] + total_params = sum(int(p.numel()) for p in params) + updates_flat = torch.zeros(total_params, device=params[0].device, dtype=torch.bfloat16) + curr = 0 + for i, p in enumerate(params): + if i % world_size == rank and p.grad is not None: + g = p.grad + state = self.state[p] + if "momentum_buffer" not in state: + state["momentum_buffer"] = torch.zeros_like(g) + buf = state["momentum_buffer"] + buf.mul_(momentum).add_(g) + if nesterov: + g = g.add(buf, alpha=momentum) + g = zeropower_via_newtonschulz5(g, steps=backend_steps) + g *= max(1, g.size(0) / g.size(1)) ** 0.5 + updates_flat[curr : curr + p.numel()] = g.reshape(-1) + curr += p.numel() + if distributed: + dist.all_reduce(updates_flat, op=dist.ReduceOp.SUM) + wd = group.get("weight_decay", 0.0) + curr = 0 + for p in params: + if wd > 0.0: + p.data.mul_(1.0 - lr * wd) + g = updates_flat[curr : curr + p.numel()].view_as(p).to(dtype=p.dtype) + p.add_(g, alpha=-lr) + curr += p.numel() + return loss +def build_sentencepiece_luts( + sp: spm.SentencePieceProcessor, vocab_size: int, device: torch.device +) -> tuple[Tensor, Tensor, Tensor]: + sp_vocab_size = int(sp.vocab_size()) + table_size = max(sp_vocab_size, vocab_size) + base_bytes_np = np.zeros((table_size,), dtype=np.int16) + has_leading_space_np = np.zeros((table_size,), dtype=np.bool_) + is_boundary_token_np = np.ones((table_size,), dtype=np.bool_) + for token_id in range(sp_vocab_size): + if sp.is_control(token_id) or sp.is_unknown(token_id) or sp.is_unused(token_id): + continue + is_boundary_token_np[token_id] = False + if sp.is_byte(token_id): + base_bytes_np[token_id] = 1 + continue + piece = sp.id_to_piece(token_id) + if piece.startswith("▁"): + has_leading_space_np[token_id] = True + piece = piece[1:] + base_bytes_np[token_id] = len(piece.encode("utf-8")) + return ( + torch.tensor(base_bytes_np, dtype=torch.int16, device=device), + torch.tensor(has_leading_space_np, dtype=torch.bool, device=device), + torch.tensor(is_boundary_token_np, dtype=torch.bool, device=device), + ) +def load_validation_tokens(pattern: str, seq_len: int) -> Tensor: + files = [Path(p) for p in sorted(glob.glob(pattern))] + if not files: + raise FileNotFoundError(f"No files found for pattern: {pattern}") + tokens = torch.cat([load_data_shard(file) for file in files]).contiguous() + usable = ((tokens.numel() - 1) // seq_len) * seq_len + if usable <= 0: + raise ValueError(f"Validation split is too short for TRAIN_SEQ_LEN={seq_len}") + return tokens[: usable + 1] +def eval_val( + args: Hyperparameters, + model: nn.Module, + rank: int, + world_size: int, + device: torch.device, + grad_accum_steps: int, + val_tokens: Tensor, + base_bytes_lut: Tensor, + has_leading_space_lut: Tensor, + is_boundary_token_lut: Tensor, + eval_seq_len: int | None = None, +) -> tuple[float, float]: + seq_len = eval_seq_len or args.train_seq_len + local_batch_tokens = args.val_batch_size // (world_size * grad_accum_steps) + if local_batch_tokens < seq_len: + raise ValueError( + "VAL_BATCH_SIZE must provide at least one sequence per rank; " + f"got VAL_BATCH_SIZE={args.val_batch_size}, WORLD_SIZE={world_size}, " + f"GRAD_ACCUM_STEPS={grad_accum_steps}, seq_len={seq_len}" + ) + local_batch_seqs = local_batch_tokens // seq_len + total_seqs = (val_tokens.numel() - 1) // seq_len + seq_start = (total_seqs * rank) // world_size + seq_end = (total_seqs * (rank + 1)) // world_size + val_loss_sum = torch.zeros((), device=device, dtype=torch.float64) + val_token_count = torch.zeros((), device=device, dtype=torch.float64) + val_byte_count = torch.zeros((), device=device, dtype=torch.float64) + model.eval() + with torch.inference_mode(): + for batch_seq_start in range(seq_start, seq_end, local_batch_seqs): + batch_seq_end = min(batch_seq_start + local_batch_seqs, seq_end) + raw_start = batch_seq_start * seq_len + raw_end = batch_seq_end * seq_len + 1 + local = val_tokens[raw_start:raw_end].to(device=device, dtype=torch.int64, non_blocking=True) + x = local[:-1].reshape(-1, seq_len) + y = local[1:].reshape(-1, seq_len) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + batch_loss = model(x, y).detach() + batch_token_count = float(y.numel()) + val_loss_sum += batch_loss.to(torch.float64) * batch_token_count + val_token_count += batch_token_count + prev_ids = x.reshape(-1) + tgt_ids = y.reshape(-1) + token_bytes = base_bytes_lut[tgt_ids].to(dtype=torch.int16) + token_bytes += (has_leading_space_lut[tgt_ids] & ~is_boundary_token_lut[prev_ids]).to(dtype=torch.int16) + val_byte_count += token_bytes.to(torch.float64).sum() + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(val_loss_sum, op=dist.ReduceOp.SUM) + dist.all_reduce(val_token_count, op=dist.ReduceOp.SUM) + dist.all_reduce(val_byte_count, op=dist.ReduceOp.SUM) + val_loss = val_loss_sum / val_token_count + bits_per_token = val_loss.item() / math.log(2.0) + tokens_per_byte = val_token_count.item() / val_byte_count.item() + model.train() + return float(val_loss.item()), float(bits_per_token * tokens_per_byte) +CONTROL_TENSOR_NAME_PATTERNS = tuple( + pattern + for pattern in os.environ.get( + "CONTROL_TENSOR_NAME_PATTERNS", + "attn_scale,attn_scales,mlp_scale,mlp_scales,resid_mix,resid_mixes,q_gain,skip_weight,skip_weights,smear,dtg_gate,ve_layer_scales,ve_shared.scale", + ).split(",") + if pattern +) +INT8_KEEP_FLOAT_FP32_NAME_PATTERNS = tuple( + pattern + for pattern in os.environ.get( + "INT8_KEEP_FLOAT_FP32_NAME_PATTERNS", + ",".join(CONTROL_TENSOR_NAME_PATTERNS), + ).split(",") + if pattern +) +INT8_KEEP_FLOAT_MAX_NUMEL = 65_536 +INT8_KEEP_FLOAT_STORE_DTYPE = torch.float16 +INT8_PER_ROW_SCALE_DTYPE = torch.float16 +INT8_CLIP_PERCENTILE = 99.99984 +INT8_CLIP_Q = INT8_CLIP_PERCENTILE / 100.0 +def tensor_nbytes(t: Tensor) -> int: + return int(t.numel()) * int(t.element_size()) +def keep_float_tensor(name: str, t: Tensor, passthrough_orig_dtypes: dict[str, str]) -> Tensor: + if any(pattern in name for pattern in INT8_KEEP_FLOAT_FP32_NAME_PATTERNS): + return t.float().contiguous() + if t.dtype in {torch.float32, torch.bfloat16}: + passthrough_orig_dtypes[name] = str(t.dtype).removeprefix("torch.") + return t.to(dtype=INT8_KEEP_FLOAT_STORE_DTYPE).contiguous() + return t +def quantize_float_tensor(t: Tensor) -> tuple[Tensor, Tensor]: + t32 = t.float() + if t32.ndim == 2: + clip_abs = ( + torch.quantile(t32.abs(), INT8_CLIP_Q, dim=1) + if t32.numel() + else torch.empty((t32.shape[0],), dtype=torch.float32) + ) + clipped = torch.maximum(torch.minimum(t32, clip_abs[:, None]), -clip_abs[:, None]) + scale = (clip_abs / 127.0).clamp_min(1.0 / 127.0) + q = torch.clamp(torch.round(clipped / scale[:, None]), -127, 127).to(torch.int8).contiguous() + return q, scale.to(dtype=INT8_PER_ROW_SCALE_DTYPE).contiguous() + clip_abs = float(torch.quantile(t32.abs().flatten(), INT8_CLIP_Q).item()) if t32.numel() else 0.0 + scale = torch.tensor(clip_abs / 127.0 if clip_abs > 0 else 1.0, dtype=torch.float32) + q = torch.clamp(torch.round(torch.clamp(t32, -clip_abs, clip_abs) / scale), -127, 127).to(torch.int8).contiguous() + return q, scale +def quantize_state_dict_int8(state_dict: dict[str, Tensor]): + quantized: dict[str, Tensor] = {} + scales: dict[str, Tensor] = {} + dtypes: dict[str, str] = {} + passthrough: dict[str, Tensor] = {} + passthrough_orig_dtypes: dict[str, str] = {} + qmeta: dict[str, dict[str, object]] = {} + stats = dict.fromkeys( + ("param_count", "num_tensors", "num_float_tensors", "num_nonfloat_tensors", "baseline_tensor_bytes", "int8_payload_bytes"), + 0, + ) + for name, tensor in state_dict.items(): + t = tensor.detach().to("cpu").contiguous() + stats["param_count"] += int(t.numel()) + stats["num_tensors"] += 1 + stats["baseline_tensor_bytes"] += tensor_nbytes(t) + if not t.is_floating_point(): + stats["num_nonfloat_tensors"] += 1 + passthrough[name] = t + stats["int8_payload_bytes"] += tensor_nbytes(t) + continue + if t.numel() <= INT8_KEEP_FLOAT_MAX_NUMEL: + kept = keep_float_tensor(name, t, passthrough_orig_dtypes) + passthrough[name] = kept + stats["int8_payload_bytes"] += tensor_nbytes(kept) + continue + stats["num_float_tensors"] += 1 + q, s = quantize_float_tensor(t) + if s.ndim > 0: + qmeta[name] = {"scheme": "per_row", "axis": 0} + quantized[name] = q + scales[name] = s + dtypes[name] = str(t.dtype).removeprefix("torch.") + stats["int8_payload_bytes"] += tensor_nbytes(q) + tensor_nbytes(s) + obj: dict[str, object] = { + "__quant_format__": "int8_clean_per_row_v1", + "quantized": quantized, + "scales": scales, + "dtypes": dtypes, + "passthrough": passthrough, + } + if qmeta: + obj["qmeta"] = qmeta + if passthrough_orig_dtypes: + obj["passthrough_orig_dtypes"] = passthrough_orig_dtypes + return obj, stats +def dequantize_state_dict_int8(obj: dict[str, object]) -> dict[str, Tensor]: + out: dict[str, Tensor] = {} + qmeta = obj.get("qmeta", {}) + passthrough_orig_dtypes = obj.get("passthrough_orig_dtypes", {}) + for name, q in obj["quantized"].items(): + dtype = getattr(torch, obj["dtypes"][name]) + s = obj["scales"][name] + if qmeta.get(name, {}).get("scheme") == "per_row" or s.ndim > 0: + s = s.to(dtype=torch.float32) + out[name] = (q.float() * s.view(q.shape[0], *([1] * (q.ndim - 1)))).to(dtype=dtype).contiguous() + else: + scale = float(s.item()) + out[name] = (q.float() * scale).to(dtype=dtype).contiguous() + for name, t in obj["passthrough"].items(): + out_t = t.detach().to("cpu").contiguous() + orig_dtype = passthrough_orig_dtypes.get(name) + if isinstance(orig_dtype, str): + out_t = out_t.to(dtype=getattr(torch, orig_dtype)).contiguous() + out[name] = out_t + return out +def load_data_shard(file: Path) -> Tensor: + header_bytes = 256 * np.dtype(" None: + self.file_idx = (self.file_idx + 1) % len(self.files) + self.tokens = load_data_shard(self.files[self.file_idx]) + self.pos = 0 + def take(self, n: int) -> Tensor: + chunks: list[Tensor] = [] + remaining = n + while remaining > 0: + avail = self.tokens.numel() - self.pos + if avail <= 0: + self._advance_file() + continue + k = min(remaining, avail) + chunks.append(self.tokens[self.pos : self.pos + k]) + self.pos += k + remaining -= k + return chunks[0] if len(chunks) == 1 else torch.cat(chunks) +class DistributedTokenLoader: + def __init__(self, pattern: str, rank: int, world_size: int, device: torch.device): + self.rank = rank + self.world_size = world_size + self.device = device + self.stream = TokenStream(pattern) + def next_batch(self, global_tokens: int, seq_len: int, grad_accum_steps: int) -> tuple[Tensor, Tensor]: + local_tokens = global_tokens // (self.world_size * grad_accum_steps) + per_rank_span = local_tokens + 1 + chunk = self.stream.take(per_rank_span * self.world_size) + start = self.rank * per_rank_span + local = chunk[start : start + per_rank_span].to(dtype=torch.int64) + x = local[:-1].reshape(-1, seq_len) + y = local[1:].reshape(-1, seq_len) + return x.to(self.device, non_blocking=True), y.to(self.device, non_blocking=True) +class RMSNorm(nn.Module): + def __init__(self, eps: float | None = None): + super().__init__() + self.eps = eps + def forward(self, x: Tensor) -> Tensor: + return F.rms_norm(x, (x.size(-1),), eps=self.eps) +class CastedLinear(nn.Linear): + _qat_enabled: bool = False + def forward(self, x: Tensor) -> Tensor: + w = self.weight.to(x.dtype) + if CastedLinear._qat_enabled and self.training and w.ndim == 2: + with torch.no_grad(): + w32 = self.weight.float() + # Use 99.95th percentile clipping to match GPTQ export quantizer + row_clip = torch.quantile(w32.abs(), 0.9995, dim=1) + scale = (row_clip / 31.0).clamp_min(1.0 / 31.0) + w_q = (torch.clamp(torch.round(w32 / scale[:, None]), -32, 31) * scale[:, None]).to(x.dtype) + w = w + (w_q - w).detach() + bias = self.bias.to(x.dtype) if self.bias is not None else None + return F.linear(x, w, bias) +def restore_low_dim_params_to_fp32(module: nn.Module) -> None: + with torch.no_grad(): + for name, param in module.named_parameters(): + if (param.ndim < 2 or any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS)) and param.dtype != torch.float32: + param.data = param.data.float() +class Rotary(nn.Module): + def __init__(self, dim: int, base: float = 10000.0, train_seq_len: int = 1024, rope_dims: int = 0): + super().__init__() + self.dim = dim + self.base = base + self.train_seq_len = train_seq_len + self.rope_dims = rope_dims if rope_dims > 0 else dim + inv_freq = 1.0 / (base ** (torch.arange(0, self.rope_dims, 2, dtype=torch.float32) / self.rope_dims)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self._seq_len_cached = 0 + self._cos_cached: Tensor | None = None + self._sin_cached: Tensor | None = None + def forward(self, seq_len: int, device: torch.device, dtype: torch.dtype) -> tuple[Tensor, Tensor]: + if ( + self._cos_cached is None + or self._sin_cached is None + or self._seq_len_cached != seq_len + or self._cos_cached.device != device + ): + rd = self.rope_dims + if seq_len > self.train_seq_len: + scale = seq_len / self.train_seq_len + new_base = self.base * (scale ** (rd / (rd - 2))) + inv_freq = 1.0 / (new_base ** (torch.arange(0, rd, 2, dtype=torch.float32, device=device) / rd)) + else: + inv_freq = self.inv_freq.to(device) + t = torch.arange(seq_len, device=device, dtype=inv_freq.dtype) + freqs = torch.outer(t, inv_freq) + self._cos_cached = freqs.cos()[None, :, None, :] + self._sin_cached = freqs.sin()[None, :, None, :] + self._seq_len_cached = seq_len + return self._cos_cached.to(dtype=dtype), self._sin_cached.to(dtype=dtype) +def apply_rotary_emb(x: Tensor, cos: Tensor, sin: Tensor, rope_dims: int = 0) -> Tensor: + if rope_dims > 0 and rope_dims < x.size(-1): + x_rope, x_pass = x[..., :rope_dims], x[..., rope_dims:] + half = rope_dims // 2 + x1, x2 = x_rope[..., :half], x_rope[..., half:] + x_rope = torch.cat((x1 * cos + x2 * sin, x1 * (-sin) + x2 * cos), dim=-1) + return torch.cat((x_rope, x_pass), dim=-1) + half = x.size(-1) // 2 + x1, x2 = x[..., :half], x[..., half:] + return torch.cat((x1 * cos + x2 * sin, x1 * (-sin) + x2 * cos), dim=-1) +class CausalSelfAttention(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + num_kv_heads: int, + rope_base: float, + qk_gain_init: float, + ): + super().__init__() + if dim % num_heads != 0: + raise ValueError("model_dim must be divisible by num_heads") + if num_heads % num_kv_heads != 0: + raise ValueError("num_heads must be divisible by num_kv_heads") + self.num_heads = num_heads + self.num_kv_heads = num_kv_heads + self.head_dim = dim // num_heads + if self.head_dim % 2 != 0: + raise ValueError("head_dim must be even for RoPE") + kv_dim = self.num_kv_heads * self.head_dim + self.c_q = CastedLinear(dim, dim, bias=False) + self.c_k = CastedLinear(dim, kv_dim, bias=False) + self.c_v = CastedLinear(dim, kv_dim, bias=False) + self.proj = CastedLinear(dim, dim, bias=False) + self.proj._zero_init = True + self.q_gain = nn.Parameter(torch.full((num_heads,), qk_gain_init, dtype=torch.float32)) + self.rope_dims = 0 # set by GPT.__init__ for partial RoPE + self.rotary = Rotary(self.head_dim, base=rope_base, train_seq_len=1024) + self.use_xsa = False # set by GPT.__init__ for deep layers only + def _xsa_efficient(self, y: Tensor, v: Tensor) -> Tensor: + """Efficient XSA: subtract self-value projection via GQA-aware reshape (no repeat_interleave). + y: [B, T, H, D], v: [B, T, Hkv, D]. H must be divisible by Hkv.""" + B, T, H, D = y.shape + Hkv = v.size(-2) + group = H // Hkv + y_g = y.reshape(B, T, Hkv, group, D) # [B, T, Hkv, group, D] + vn = F.normalize(v, dim=-1).unsqueeze(-2) # [B, T, Hkv, 1, D] — broadcast ready + proj = (y_g * vn).sum(dim=-1, keepdim=True) * vn + return (y_g - proj).reshape(B, T, H, D) + def forward(self, x: Tensor, v_embed: Tensor | None = None) -> Tensor: + bsz, seqlen, dim = x.shape + q = self.c_q(x).reshape(bsz, seqlen, self.num_heads, self.head_dim) + k = self.c_k(x).reshape(bsz, seqlen, self.num_kv_heads, self.head_dim) + v = self.c_v(x) + if v_embed is not None: + v = v + v_embed + v = v.reshape(bsz, seqlen, self.num_kv_heads, self.head_dim) + q = F.rms_norm(q, (q.size(-1),)) + k = F.rms_norm(k, (k.size(-1),)) + cos, sin = self.rotary(seqlen, x.device, q.dtype) + q = apply_rotary_emb(q, cos, sin, self.rope_dims) + k = apply_rotary_emb(k, cos, sin, self.rope_dims) + q = q * self.q_gain.to(dtype=q.dtype)[None, None, :, None] + y = flash_attn_3_func(q, k, v, causal=True) + if self.use_xsa: + y = self._xsa_efficient(y, v) + y = y.reshape(bsz, seqlen, dim) + return self.proj(y) +class SmearGate(nn.Module): + def __init__(self, dim: int): + super().__init__() + self.gate = nn.Parameter(torch.zeros(dim, dtype=torch.float32)) + def forward(self, x: Tensor) -> Tensor: + g = torch.sigmoid(self.gate.to(dtype=x.dtype))[None, None, :] + x_prev = torch.cat([torch.zeros_like(x[:, :1]), x[:, :-1]], dim=1) + return (1 - g) * x + g * x_prev +class BigramHashEmbedding(nn.Module): + def __init__(self, bigram_vocab_size: int, bigram_dim: int, model_dim: int): + super().__init__() + self.bigram_vocab_size = bigram_vocab_size + self.embed = nn.Embedding(bigram_vocab_size, bigram_dim) + nn.init.zeros_(self.embed.weight) + self.proj = CastedLinear(bigram_dim, model_dim, bias=False) if bigram_dim != model_dim else None + if self.proj is not None: + nn.init.zeros_(self.proj.weight) + self.scale = nn.Parameter(torch.tensor(0.05, dtype=torch.float32)) + def bigram_hash(self, tokens: Tensor) -> Tensor: + t = tokens.to(torch.int32) + mod = self.bigram_vocab_size - 1 + out = torch.empty_like(t) + out[..., 0] = mod + out[..., 1:] = torch.bitwise_xor(36313 * t[..., 1:], 27191 * t[..., :-1]) % mod + return out.long() + def forward(self, token_ids: Tensor) -> Tensor: + h = self.embed(self.bigram_hash(token_ids)) + if self.proj is not None: + h = self.proj(h) + return h * self.scale.to(dtype=h.dtype) +class ValueEmbedding(nn.Module): + """Reinject token identity into attention values at specific layers. + Each table maps vocab tokens to a low-dim embedding, projected to model_dim.""" + def __init__(self, vocab_size: int, ve_dim: int, model_dim: int): + super().__init__() + self.embed = nn.Embedding(vocab_size, ve_dim) + nn.init.normal_(self.embed.weight, std=0.01) + self.proj = CastedLinear(ve_dim, model_dim, bias=False) if ve_dim != model_dim else None + if self.proj is not None: + nn.init.zeros_(self.proj.weight) + self.scale = nn.Parameter(torch.tensor(0.1, dtype=torch.float32)) + def forward(self, token_ids: Tensor) -> Tensor: + h = self.embed(token_ids) + if self.proj is not None: + h = self.proj(h) + return h * self.scale.to(dtype=h.dtype) +class MLP(nn.Module): + def __init__(self, dim: int, mlp_mult: int, mlp_act: str = "relu_sq", mlp_leaky_slope: float = 0.5): + super().__init__() + hidden = int(mlp_mult * dim) + self.fc = CastedLinear(dim, hidden, bias=False) + self.proj = CastedLinear(hidden, dim, bias=False) + self.proj._zero_init = True + self.mlp_act = mlp_act + self.mlp_leaky_slope = mlp_leaky_slope + if self.mlp_act not in {"relu_sq", "leaky_relu_sq"}: + raise ValueError(f"Unsupported MLP_ACT '{self.mlp_act}'. Use 'relu_sq' or 'leaky_relu_sq'.") + def forward(self, x: Tensor) -> Tensor: + x = self.fc(x) + if self.mlp_act == "leaky_relu_sq": + x = F.leaky_relu(x, negative_slope=self.mlp_leaky_slope) + else: + x = F.relu(x) + return self.proj(x.square()) +class Block(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + num_kv_heads: int, + mlp_mult: int, + rope_base: float, + qk_gain_init: float, + layer_idx: int = 0, + ln_scale: bool = False, + dtg: bool = False, + mlp_act: str = "relu_sq", + mlp_leaky_slope: float = 0.5, + ): + super().__init__() + self.attn_norm = RMSNorm() + self.mlp_norm = RMSNorm() + self.attn = CausalSelfAttention(dim, num_heads, num_kv_heads, rope_base, qk_gain_init) + self.mlp = MLP(dim, mlp_mult, mlp_act=mlp_act, mlp_leaky_slope=mlp_leaky_slope) + self.attn_scale = nn.Parameter(torch.ones(dim, dtype=torch.float32)) + self.mlp_scale = nn.Parameter(torch.ones(dim, dtype=torch.float32)) + self.resid_mix = nn.Parameter(torch.stack((torch.ones(dim), torch.zeros(dim))).float()) + self.ln_scale_factor = 1.0 / math.sqrt(layer_idx + 1) if ln_scale else 1.0 + if dtg: + self.dtg_gate = nn.Linear(dim, 1, bias=True) + nn.init.zeros_(self.dtg_gate.weight) + nn.init.constant_(self.dtg_gate.bias, 2.0) + else: + self.dtg_gate = None + def forward(self, x: Tensor, x0: Tensor, v_embed: Tensor | None = None) -> Tensor: + mix = self.resid_mix.to(dtype=x.dtype) + x_in = mix[0][None, None, :] * x + mix[1][None, None, :] * x0 + attn_out = self.attn(self.attn_norm(x_in) * self.ln_scale_factor, v_embed=v_embed) + x_out = x_in + self.attn_scale.to(dtype=x_in.dtype)[None, None, :] * attn_out + x_out = x_out + self.mlp_scale.to(dtype=x_out.dtype)[None, None, :] * self.mlp(self.mlp_norm(x_out) * self.ln_scale_factor) + if self.dtg_gate is not None: + gate = torch.sigmoid(self.dtg_gate(x_in.detach())) + x_out = x_in + gate * (x_out - x_in) + return x_out +class GPT(nn.Module): + def __init__( + self, + vocab_size: int, + num_layers: int, + model_dim: int, + num_heads: int, + num_kv_heads: int, + mlp_mult: int, + tie_embeddings: bool, + tied_embed_init_std: float, + logit_softcap: float, + rope_base: float, + qk_gain_init: float, + mtp_num_heads: int = 0, + mtp_loss_weight: float = 0.1, + bigram_vocab_size: int = 0, + bigram_dim: int = 128, + xsa_last_n: int = 0, + rope_dims: int = 0, + ln_scale: bool = False, + dtg: bool = False, + ve_enabled: bool = False, + ve_dim: int = 128, + ve_layers: str = "9,10", + mlp_act: str = "relu_sq", + mlp_leaky_slope: float = 0.5, + f1_corr_rank: int = 0, + f1_corr_scale_init: float = 0.10, + ): + super().__init__() + self._ve_target_dim = num_kv_heads * (model_dim // num_heads) # kv_dim for value projection + if logit_softcap <= 0.0: + raise ValueError(f"logit_softcap must be positive, got {logit_softcap}") + self.tie_embeddings = tie_embeddings + self.tied_embed_init_std = tied_embed_init_std + self.logit_softcap = logit_softcap + self.mtp_num_heads = mtp_num_heads + self.mtp_loss_weight = mtp_loss_weight + self.tok_emb = nn.Embedding(vocab_size, model_dim) + self.bigram = BigramHashEmbedding(bigram_vocab_size, bigram_dim, model_dim) if bigram_vocab_size > 0 else None + self.smear = SmearGate(model_dim) + self.num_encoder_layers = num_layers // 2 + self.num_decoder_layers = num_layers - self.num_encoder_layers + self.num_skip_weights = min(self.num_encoder_layers, self.num_decoder_layers) + self.skip_weights = nn.Parameter(torch.ones(self.num_skip_weights, model_dim, dtype=torch.float32)) + self.blocks = nn.ModuleList( + [ + Block( + model_dim, + num_heads, + num_kv_heads, + mlp_mult, + rope_base, + qk_gain_init, + layer_idx=i, + ln_scale=ln_scale, + dtg=dtg, + mlp_act=mlp_act, + mlp_leaky_slope=mlp_leaky_slope, + ) + for i in range(num_layers) + ] + ) + if rope_dims > 0: + head_dim = model_dim // num_heads + for block in self.blocks: + block.attn.rope_dims = rope_dims + block.attn.rotary = Rotary(head_dim, base=rope_base, train_seq_len=1024, rope_dims=rope_dims) + self.ve_layer_indices = [int(x) for x in ve_layers.split(",") if x.strip()] if ve_enabled else [] + kv_dim = self._ve_target_dim + if self.ve_layer_indices: + self.ve_shared = ValueEmbedding(vocab_size, ve_dim, kv_dim) + self.ve_layer_scales = nn.ParameterList( + [nn.Parameter(torch.ones(1, dtype=torch.float32)) for _ in self.ve_layer_indices] + ) + else: + self.ve_shared = None + self.ve_layer_scales = nn.ParameterList() + self.value_embeds = nn.ModuleList() # keep empty for compat + self.final_norm = RMSNorm() + self.lm_head = None if tie_embeddings else CastedLinear(model_dim, vocab_size, bias=False) + if self.lm_head is not None: + self.lm_head._zero_init = True + self.mtp_heads = nn.ModuleList( + [CastedLinear(model_dim, vocab_size, bias=False) for _ in range(mtp_num_heads)] + ) + for head in self.mtp_heads: + head._zero_init = True + # Low-rank correction path for extra capacity under size budget. + self.f1_corr_rank = f1_corr_rank + if f1_corr_rank > 0: + self.f1_corr_in = CastedLinear(model_dim, f1_corr_rank, bias=False) + self.f1_corr_out = CastedLinear(f1_corr_rank, vocab_size, bias=False) + self.f1_corr_out._zero_init = True + self.f1_corr_scale = nn.Parameter(torch.tensor(f1_corr_scale_init, dtype=torch.float32)) + else: + self.f1_corr_in = None + self.f1_corr_out = None + self.f1_corr_scale = None + if xsa_last_n > 0: + for i in range(max(0, num_layers - xsa_last_n), num_layers): + self.blocks[i].attn.use_xsa = True + self._init_weights() + def _init_weights(self) -> None: + if self.tie_embeddings: + nn.init.normal_(self.tok_emb.weight, mean=0.0, std=self.tied_embed_init_std) + num_layers = len(self.blocks) + for name, module in self.named_modules(): + if isinstance(module, nn.Linear): + if getattr(module, "_zero_init", False): + nn.init.zeros_(module.weight) + elif module.weight.ndim == 2 and module.weight.shape[0] >= 64 and module.weight.shape[1] >= 64: + nn.init.orthogonal_(module.weight, gain=1.0) + if ".proj." in name or name.endswith(".proj"): + with torch.no_grad(): + module.weight.mul_(1.0 / math.sqrt(2 * num_layers)) + def _get_ve(self, layer_idx: int, input_ids: Tensor, ve_cache: dict | None = None) -> Tensor | None: + """Get value embedding for a specific layer using shared table + per-layer scale.""" + if self.ve_shared is None or layer_idx not in self.ve_layer_indices: + return None + if ve_cache is not None and 've' not in ve_cache: + ve_cache['ve'] = self.ve_shared(input_ids) + ve_base = ve_cache['ve'] if ve_cache is not None else self.ve_shared(input_ids) + ve_idx = self.ve_layer_indices.index(layer_idx) + return ve_base * self.ve_layer_scales[ve_idx].to(dtype=ve_base.dtype) + def forward(self, input_ids: Tensor, target_ids: Tensor) -> Tensor: + x = self.tok_emb(input_ids) + if self.bigram is not None: + x = x + self.bigram(input_ids) + x = F.rms_norm(x, (x.size(-1),)) + x = self.smear(x) + x0 = x + skips: list[Tensor] = [] + ve_cache: dict = {} + for i in range(self.num_encoder_layers): + ve = self._get_ve(i, input_ids, ve_cache) + x = self.blocks[i](x, x0, v_embed=ve) + skips.append(x) + for i in range(self.num_decoder_layers): + bi = self.num_encoder_layers + i + if skips: + x = x + self.skip_weights[i].to(dtype=x.dtype)[None, None, :] * skips.pop() + ve = self._get_ve(bi, input_ids, ve_cache) + x = self.blocks[bi](x, x0, v_embed=ve) + x = self.final_norm(x) + x_flat = x.reshape(-1, x.size(-1)) + targets = target_ids.reshape(-1) + if self.tie_embeddings: + logits_proj = F.linear(x_flat, self.tok_emb.weight) + else: + if self.lm_head is None: + raise RuntimeError("lm_head is required when tie_embeddings=False") + logits_proj = self.lm_head(x_flat) + if self.f1_corr_in is not None and self.f1_corr_out is not None and self.f1_corr_scale is not None: + corr_hidden = F.silu(self.f1_corr_in(x_flat)) + corr_proj = self.f1_corr_out(corr_hidden) + logits_proj = logits_proj + self.f1_corr_scale.to(dtype=logits_proj.dtype) * corr_proj + logits = self.logit_softcap * torch.tanh(logits_proj / self.logit_softcap) + if hasattr(self, '_ngram_tracker') and self._ngram_tracker is not None and self.training: + per_tok_loss = F.cross_entropy(logits.float(), targets, reduction="none") + weights = self._ngram_tracker.get_weights(input_ids, target_ids) + main_loss = (per_tok_loss * weights).mean() + else: + main_loss = F.cross_entropy(logits.float(), targets, reduction="mean") + if self.training and self.mtp_num_heads > 0 and self.mtp_loss_weight > 0.0: + _, seqlen, dim = x.shape + mtp_loss_sum = x.new_zeros(()) + mtp_loss_count = 0 + for k, mtp_head in enumerate(self.mtp_heads): + valid_t = seqlen - (k + 1) + if valid_t <= 0: + continue + mtp_hidden = x[:, :valid_t, :].reshape(-1, dim) + mtp_targets = target_ids[:, k + 1 :].reshape(-1) + mtp_logits_proj = mtp_head(mtp_hidden) + mtp_logits = self.logit_softcap * torch.tanh(mtp_logits_proj / self.logit_softcap) + mtp_loss_sum = mtp_loss_sum + F.cross_entropy(mtp_logits.float(), mtp_targets, reduction="mean") + mtp_loss_count += 1 + if mtp_loss_count > 0: + main_loss = main_loss + self.mtp_loss_weight * (mtp_loss_sum / mtp_loss_count) + return main_loss + def forward_logits(self, input_ids: Tensor) -> Tensor: + """Return logits (bsz, seq_len, vocab) without computing loss.""" + x = self.tok_emb(input_ids) + if self.bigram is not None: + x = x + self.bigram(input_ids) + x = F.rms_norm(x, (x.size(-1),)) + x = self.smear(x) + x0 = x + skips: list[Tensor] = [] + ve_cache: dict = {} + for i in range(self.num_encoder_layers): + ve = self._get_ve(i, input_ids, ve_cache) + x = self.blocks[i](x, x0, v_embed=ve) + skips.append(x) + for i in range(self.num_decoder_layers): + bi = self.num_encoder_layers + i + if skips: + x = x + self.skip_weights[i].to(dtype=x.dtype)[None, None, :] * skips.pop() + ve = self._get_ve(bi, input_ids, ve_cache) + x = self.blocks[bi](x, x0, v_embed=ve) + x = self.final_norm(x) + if self.tie_embeddings: + logits_proj = F.linear(x, self.tok_emb.weight) + else: + logits_proj = self.lm_head(x) + if self.f1_corr_in is not None and self.f1_corr_out is not None and self.f1_corr_scale is not None: + corr_hidden = F.silu(self.f1_corr_in(x)) + corr_proj = self.f1_corr_out(corr_hidden) + logits_proj = logits_proj + self.f1_corr_scale.to(dtype=logits_proj.dtype) * corr_proj + return self.logit_softcap * torch.tanh(logits_proj / self.logit_softcap) +def eval_val_sliding( + args: Hyperparameters, + base_model: nn.Module, + rank: int, + world_size: int, + device: torch.device, + val_tokens: Tensor, + base_bytes_lut: Tensor, + has_leading_space_lut: Tensor, + is_boundary_token_lut: Tensor, + stride: int, + batch_seqs: int = 128, + eval_seq_len: int | None = None, +) -> tuple[float, float]: + """Sliding window evaluation: each token scored with maximum context.""" + seq_len = eval_seq_len or args.train_seq_len + total_tokens = val_tokens.numel() - 1 + window_starts = [ws for ws in range(0, total_tokens, stride) + if min(ws + seq_len, total_tokens) - ws >= 1] + total_windows = len(window_starts) + my_s = (total_windows * rank) // world_size + my_e = (total_windows * (rank + 1)) // world_size + my_windows = window_starts[my_s:my_e] + loss_sum = torch.zeros((), device=device, dtype=torch.float64) + token_count = torch.zeros((), device=device, dtype=torch.float64) + byte_count = torch.zeros((), device=device, dtype=torch.float64) + base_model.eval() + compiled_logits = maybe_torch_compile(base_model.forward_logits, args) + with torch.inference_mode(): + for bi in range(0, len(my_windows), batch_seqs): + batch_ws = my_windows[bi:bi + batch_seqs] + bsz = len(batch_ws) + x_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + y_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + wlens: list[int] = [] + for i, ws in enumerate(batch_ws): + end = min(ws + seq_len, total_tokens) + wlen = end - ws + wlens.append(wlen) + chunk = val_tokens[ws:end + 1].to(dtype=torch.int64, device=device) + x_batch[i, :wlen] = chunk[:-1] + y_batch[i, :wlen] = chunk[1:] + with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + logits = compiled_logits(x_batch) + nll = F.cross_entropy( + logits.reshape(-1, logits.size(-1)).float(), + y_batch.reshape(-1), + reduction="none", + ).reshape(bsz, seq_len) + for i, ws in enumerate(batch_ws): + wlen = wlens[i] + s = 0 if ws == 0 else max(wlen - stride, 0) + scored_nll = nll[i, s:wlen].to(torch.float64) + loss_sum += scored_nll.sum() + token_count += float(wlen - s) + tgt = y_batch[i, s:wlen] + prev = x_batch[i, s:wlen] + tb = base_bytes_lut[tgt].to(torch.float64) + tb += (has_leading_space_lut[tgt] & ~is_boundary_token_lut[prev]).to(torch.float64) + byte_count += tb.sum() + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(loss_sum, op=dist.ReduceOp.SUM) + dist.all_reduce(token_count, op=dist.ReduceOp.SUM) + dist.all_reduce(byte_count, op=dist.ReduceOp.SUM) + val_loss = (loss_sum / token_count).item() + bits_per_token = val_loss / math.log(2.0) + tokens_per_byte = token_count.item() / byte_count.item() + base_model.train() + return val_loss, bits_per_token * tokens_per_byte +def _ngram_bulk_update(val_np, start, end, ctx_tables, full_tables, + min_order, max_order, primes, mask): + """Bulk update n-gram tables with a contiguous range of tokens. + All ranks call this with the SAME token range -> identical tables everywhere.""" + t = val_np[start:end].astype(np.uint64) + n = len(t) + for order in range(min_order, max_order + 1): + if n < order: + continue + ctx_width = order - 1 + ctx_hash = np.zeros(n - order + 1, dtype=np.uint64) + for k in range(ctx_width): + ctx_hash ^= t[k:n - order + 1 + k] * primes[k % len(primes)] + ctx_key = (ctx_hash & mask).astype(np.int64) + tgt = t[order - 1:] + full_key = ((ctx_hash ^ (tgt * primes[ctx_width % len(primes)])) & mask).astype(np.int64) + ctx_tables[order] += np.bincount(ctx_key, minlength=len(ctx_tables[order])).astype(np.uint32) + full_tables[order] += np.bincount(full_key, minlength=len(full_tables[order])).astype(np.uint32) + +def eval_val_sliding_hashed_ngram( + args: Hyperparameters, + base_model: nn.Module, + rank: int, + world_size: int, + device: torch.device, + val_tokens: Tensor, + base_bytes_lut: Tensor, + has_leading_space_lut: Tensor, + is_boundary_token_lut: Tensor, + stride: int, + order: int, + alpha: float, + min_count: int, + buckets: int, + max_seconds: float = 0.0, + batch_seqs: int = 128, + eval_seq_len: int | None = None, +) -> tuple[float, float, float]: + """Score-first sliding eval with chunk-based SHARED n-gram tables + cubric. + + Key design: all ranks share identical n-gram tables via bulk chunk updates. + Each chunk's windows are distributed across ranks for scoring, then ALL ranks + update tables with the same contiguous token range. Every rank sees the full + n-gram picture (not 1/world_size like per-segment updates). + + Legal: entire chunk scored before its tokens update the tables. + """ + min_order = max(args.ngram_eval_min_order, 2) + max_order = max(order, min_order) + adaptive = args.ngram_eval_adaptive + alpha_min = args.ngram_eval_alpha_min + alpha_max = args.ngram_eval_alpha_max + ent_center = args.ngram_eval_entropy_center + ent_scale = args.ngram_eval_entropy_scale + + seq_len = eval_seq_len or args.train_seq_len + total_tokens = val_tokens.numel() - 1 + + # Build all windows and total scored tokens + all_window_starts = [ws for ws in range(0, total_tokens, stride) if min(ws + seq_len, total_tokens) - ws >= 1] + total_scored_tokens = 0.0 + for ws in all_window_starts: + end = min(ws + seq_len, total_tokens) + wlen = end - ws + s = 0 if ws == 0 else max(wlen - stride, 0) + total_scored_tokens += float(max(wlen - s, 0)) + + # Group windows into chunks by scored position -- all ranks share this grouping + chunk_tokens = int(os.environ.get("NGRAM_CHUNK_TOKENS", "1048576")) # 1M default + num_chunks = (total_tokens + chunk_tokens - 1) // chunk_tokens + chunk_windows: list[list[int]] = [[] for _ in range(num_chunks)] + for ws in all_window_starts: + end = min(ws + seq_len, total_tokens) + wlen = end - ws + s = 0 if ws == 0 else max(wlen - stride, 0) + scored_start = ws + s + ci = min(scored_start // chunk_tokens, num_chunks - 1) + chunk_windows[ci].append(ws) + + val_np = val_tokens.numpy() + ctx_tables = {n: np.zeros((buckets,), dtype=np.uint32) for n in range(min_order, max_order + 1)} + full_tables = {n: np.zeros((buckets,), dtype=np.uint32) for n in range(min_order, max_order + 1)} + mask = np.uint64(buckets - 1) + primes = np.array( + [np.uint64(36313), np.uint64(27191), np.uint64(51647), np.uint64(81929), + np.uint64(131071), np.uint64(174763), np.uint64(233017)], + dtype=np.uint64, + ) + + loss_sum = 0.0 + token_count = 0.0 + byte_count = 0.0 + + # Cubric 3D: per (order × entropy_bin × count_bin) adaptive alpha scaling + _NUM_ENT_BINS = 3 # low / mid / high entropy + _NUM_CNT_BINS = 3 # low / mid / high count + _ENT_EDGES = np.array([ent_center - 1.0, ent_center + 1.0]) # [2.0, 4.0] for center=3.0 + _CNT_EDGES = np.array([5.0, 50.0]) # low=<5, mid=5-50, high=>50 context count + _TOTAL_CELLS = _NUM_ENT_BINS * _NUM_CNT_BINS # 9 cells per order = 54 total + _cc = getattr(args, 'cubric_cadence', 0); _con = _cc > 0; _cfired = 0 + if _con: + # Warm-start: proven converged values from 4+ runs (orders 2-7) + # All 9 cells per order get the same warm-start, 3D cubric refines from there + _WARM = {2: 0.45, 3: 0.30, 4: 0.45, 5: 1.88, 6: 2.00, 7: 2.00, 8: 2.00, 9: 2.00} + _c_alpha_mult = {n: [_WARM.get(n, 1.0)] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + _c_hits = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + _c_beats = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + + base_model.eval() + compiled_logits = maybe_torch_compile(base_model.forward_logits, args) + t0 = time.perf_counter() + deadline = (t0 + max_seconds) if max_seconds > 0.0 else None + cutoff_hit = False + + if rank == 0: + print(f"ngram_eval:chunks={num_chunks} chunk_tokens={chunk_tokens} " + f"windows={len(all_window_starts)} shared_tables=True", flush=True) + + with torch.inference_mode(): + for ci in range(num_chunks): + if deadline is not None and time.perf_counter() >= deadline: + cutoff_hit = True + break + + windows = chunk_windows[ci] + if not windows: + continue + + # Distribute this chunk's windows across ranks + my_s = (len(windows) * rank) // world_size + my_e = (len(windows) * (rank + 1)) // world_size + my_windows = windows[my_s:my_e] + + # --- Phase 1: SCORE this chunk's windows --- + for bi in range(0, len(my_windows), batch_seqs): + batch_ws = my_windows[bi:bi + batch_seqs] + bsz = len(batch_ws) + x_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + y_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + wlens: list[int] = [] + for i, ws in enumerate(batch_ws): + end = min(ws + seq_len, total_tokens) + wlen = end - ws + wlens.append(wlen) + chunk = val_tokens[ws:end + 1].to(dtype=torch.int64, device=device) + x_batch[i, :wlen] = chunk[:-1] + y_batch[i, :wlen] = chunk[1:] + + with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + logits = compiled_logits(x_batch) + logits_f = logits.float() + nll = F.cross_entropy( + logits_f.reshape(-1, logits_f.size(-1)), + y_batch.reshape(-1), + reduction="none", + ).reshape(bsz, seq_len) + + for i, ws in enumerate(batch_ws): + wlen = wlens[i] + s = 0 if ws == 0 else max(wlen - stride, 0) + seg_len = wlen - s + if seg_len <= 0: + continue + + seg_nll = nll[i, s:wlen].to(torch.float64).cpu().numpy() + seg_model_p = np.exp(-seg_nll) + + if adaptive: + log_probs = F.log_softmax(logits_f[i, s:wlen], dim=-1) + probs_a = log_probs.exp() + entropy = -(probs_a * log_probs).sum(dim=-1).cpu().numpy() + sig = 1.0 / (1.0 + np.exp(-ent_scale * (entropy - ent_center))) + per_token_alpha = alpha_min + (alpha_max - alpha_min) * sig + # Bin entropy for 2D cubric: 0=low, 1=mid, 2=high + _ent_bins = np.digitize(entropy, _ENT_EDGES).astype(np.int32) + else: + per_token_alpha = np.full(seg_len, alpha) + _ent_bins = np.ones(seg_len, dtype=np.int32) # all mid + + global_j = np.arange(ws + s + 1, ws + wlen + 1, dtype=np.int64) + p_ng = np.zeros(seg_len, dtype=np.float64) + ng_matched = np.zeros(seg_len, dtype=np.bool_) + _ng_ord = np.zeros(seg_len, dtype=np.int32) + _ng_ctx_count = np.zeros(seg_len, dtype=np.float64) + tgt_np = val_np[global_j].astype(np.uint64) + + for n in range(max_order, min_order - 1, -1): + ctx_width = n - 1 + valid = (global_j >= ctx_width) & (~ng_matched) + if not valid.any(): + continue + v_idx = np.nonzero(valid)[0] + jv = global_j[v_idx] + ctx_hash = np.zeros(len(jv), dtype=np.uint64) + for k in range(ctx_width): + tok = val_np[jv - (ctx_width - k)].astype(np.uint64) + ctx_hash ^= tok * primes[k % len(primes)] + ctx_key = (ctx_hash & mask).astype(np.int64) + full_key = ((ctx_hash ^ (tgt_np[v_idx] * primes[ctx_width % len(primes)])) & mask).astype(np.int64) + ctx_counts = ctx_tables[n][ctx_key].astype(np.float64) + full_counts = full_tables[n][full_key].astype(np.float64) + has_data = ctx_counts >= float(min_count) + if has_data.any(): + p = np.minimum(full_counts, ctx_counts) / np.maximum(ctx_counts, 1.0) + p = np.clip(p, 0.0, 1.0) + hit_idx = v_idx[has_data] + p_ng[hit_idx] = p[has_data] + ng_matched[hit_idx] = True + _ng_ord[hit_idx] = n + _ng_ctx_count[hit_idx] = ctx_counts[has_data] + + # Mix where n-gram matched (cubric 3D: order × entropy_bin × count_bin) + if ng_matched.any(): + m_idx = np.nonzero(ng_matched)[0] + # Per-order entropy center shift (PR #809 technique) + if adaptive and args.ngram_entropy_shift: + matched_ords = _ng_ord[m_idx].astype(np.float64) + shifted_centers = ent_center - 0.25 * (matched_ords - float(min_order)) + shifted_sig = 1.0 / (1.0 + np.exp(-ent_scale * (entropy[m_idx] - shifted_centers))) + per_token_alpha[m_idx] = alpha_min + (alpha_max - alpha_min) * shifted_sig + if _con: + a = per_token_alpha[m_idx].copy() + m_ent_bins = _ent_bins[m_idx] + m_cnt_bins = np.digitize(_ng_ctx_count[m_idx], _CNT_EDGES).astype(np.int32) + for n in range(min_order, max_order + 1): + om = _ng_ord[m_idx] == n + if not om.any(): + continue + for eb in range(_NUM_ENT_BINS): + for cb in range(_NUM_CNT_BINS): + cell = eb * _NUM_CNT_BINS + cb + mask_ecb = om & (m_ent_bins == eb) & (m_cnt_bins == cb) + if mask_ecb.any(): + _c_hits[n][cell] += int(mask_ecb.sum()) + _c_beats[n][cell] += int((p_ng[m_idx[mask_ecb]] > seg_model_p[m_idx[mask_ecb]]).sum()) + a[mask_ecb] *= _c_alpha_mult[n][cell] + np.clip(a, 0.0, alpha_max, out=a) + else: + a = per_token_alpha[m_idx] + seg_model_p[m_idx] = (1.0 - a) * seg_model_p[m_idx] + a * p_ng[m_idx] + + seg_nll = -np.log(np.clip(seg_model_p, 1e-12, 1.0)) + loss_sum += float(seg_nll.sum()) + token_count += float(seg_len) + tgt = y_batch[i, s:wlen] + prev = x_batch[i, s:wlen] + tb = base_bytes_lut[tgt].to(torch.float64) + tb += (has_leading_space_lut[tgt] & ~is_boundary_token_lut[prev]).to(torch.float64) + byte_count += float(tb.sum().item()) + + # --- Phase 2: SHARED UPDATE -- all ranks update with same chunk tokens --- + chunk_start = ci * chunk_tokens + chunk_end = min((ci + 1) * chunk_tokens, total_tokens) + _ngram_bulk_update(val_np, chunk_start, chunk_end + 1, + ctx_tables, full_tables, min_order, max_order, + primes, mask) + + # Cubric 2D c-step: adapt per (order × entropy_bin) + if _con: + # Collect all (order, ent_bin, cnt_bin) cells with enough data + all_rates = [] + for n in range(min_order, max_order + 1): + for cell in range(_TOTAL_CELLS): + if _c_hits[n][cell] >= 8: + all_rates.append(_c_beats[n][cell] / _c_hits[n][cell]) + if len(all_rates) >= 4: + avg_rate = sum(all_rates) / len(all_rates) + for n in range(min_order, max_order + 1): + for cell in range(_TOTAL_CELLS): + if _c_hits[n][cell] >= 8: + rate = _c_beats[n][cell] / _c_hits[n][cell] + if rate > avg_rate + 0.05: + _c_alpha_mult[n][cell] = min(_c_alpha_mult[n][cell] * 1.03, 2.0) + elif rate < avg_rate - 0.05: + _c_alpha_mult[n][cell] = max(_c_alpha_mult[n][cell] * 0.97, 0.3) + _cfired += 1 + if rank == 0 and _cfired % 8 == 0: + parts = [] + for n in range(min_order, max_order + 1): + m = _c_alpha_mult[n] + avg_m = sum(m) / len(m) + parts.append(f"o{n}:avg={avg_m:.2f}") + print(f"cubric3d:step={_cfired} {' '.join(parts)}", flush=True) + _c_hits = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + _c_beats = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + + # Progress + if rank == 0 and (ci % 10 == 0 or ci == num_chunks - 1 or ci < 3): + elapsed = time.perf_counter() - t0 + cur_bpb = (loss_sum / max(token_count, 1.0)) / math.log(2.0) * (token_count / max(byte_count, 1.0)) if token_count > 0 else 0.0 + print( + f"ngram_eval:chunk [{ci+1}/{num_chunks}] bpb={cur_bpb:.6f} t={elapsed:.0f}s", + flush=True, + ) + + # All-reduce across ranks + _loss = torch.tensor(loss_sum, device=device, dtype=torch.float64) + _toks = torch.tensor(token_count, device=device, dtype=torch.float64) + _bytes = torch.tensor(byte_count, device=device, dtype=torch.float64) + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(_loss, op=dist.ReduceOp.SUM) + dist.all_reduce(_toks, op=dist.ReduceOp.SUM) + dist.all_reduce(_bytes, op=dist.ReduceOp.SUM) + loss_sum = _loss.item() + token_count = _toks.item() + byte_count = _bytes.item() + + coverage = token_count / max(total_scored_tokens, 1.0) + if cutoff_hit: + elapsed = time.perf_counter() - t0 + print( + f"ngram_eval:cutoff max_seconds={max_seconds:.1f} " + f"coverage={coverage*100:.2f}% elapsed={elapsed:.0f}s", + flush=True, + ) + + if _con and rank == 0: + print(f"cubric3d:final c_steps={_cfired} cells={_TOTAL_CELLS}x{max_order-min_order+1}={_TOTAL_CELLS*(max_order-min_order+1)}", flush=True) + for n in range(min_order, max_order + 1): + m = _c_alpha_mult[n] + row = " ".join(f"{m[cell]:.2f}" for cell in range(_TOTAL_CELLS)) + print(f" o{n}: [{row}]", flush=True) + val_loss = loss_sum / max(token_count, 1.0) + val_bpb = val_loss / math.log(2.0) * (token_count / max(byte_count, 1.0)) + base_model.train() + return val_loss, val_bpb, coverage +def _classify_param(name: str) -> str: + if "tok_emb" in name or "lm_head" in name: + return "embed" + if "f1_corr_in" in name or "f1_corr_out" in name: + return "aux" + if ".mlp." in name: + return "mlp" + if ".attn." in name or (".proj." in name and ".mlp." not in name): + return "attn" + return "other" +# --------------------------------------------------------------------------- +# GPTQ: Hessian-aware quantization with column-wise error compensation +# --------------------------------------------------------------------------- +def _find_best_row_scales(W: Tensor, clip_range: int = 31) -> Tensor: + """Find optimal per-row scales by searching percentile clipping thresholds.""" + t32 = W.float() + best_s = t32.abs().amax(dim=1) / clip_range + best_s = best_s.clamp_min(1.0 / clip_range) + best_err = torch.full((t32.shape[0],), float('inf')) + for pct in [0.9990, 0.9995, 0.9999, 0.99999, 1.0]: + if pct < 1.0: + row_clip = torch.quantile(t32.abs(), pct, dim=1) + else: + row_clip = t32.abs().amax(dim=1) + s = (row_clip / clip_range).clamp_min(1.0 / clip_range) + q = torch.clamp(torch.round(t32 / s[:, None]), -clip_range, clip_range) + recon = q * s[:, None] + err = (t32 - recon).pow(2).mean(dim=1) + improved = err < best_err + best_s[improved] = s[improved] + best_err[improved] = err[improved] + return best_s +def gptq_quantize_weight(W: Tensor, H: Tensor, clip_range: int = 31, + block_size: int = 64, percdamp: float = 0.002) -> tuple[Tensor, Tensor]: + """GPTQ: quantize weight matrix W using Hessian H = X^T X for error compensation. + Uses pre-computed per-row scales and column reordering by Hessian diagonal. + Returns (quantized_int8, scale_fp16) in int6 range [-clip_range, clip_range].""" + W = W.float().clone() + rows, cols = W.shape + # Pre-compute optimal per-row scales from the original weight matrix + row_scale = _find_best_row_scales(W, clip_range) + H = H.float().clone() + damp = percdamp * H.diag().mean() + H.diagonal().add_(damp) + # Column reordering: process least-important columns first (ascending H_diag) + perm = torch.argsort(H.diag()) + invperm = torch.argsort(perm) + W = W[:, perm] + H = H[perm][:, perm] + try: + L = torch.linalg.cholesky(H) + Hinv = torch.cholesky_inverse(L) + except torch._C._LinAlgError: + Hinv = torch.diag(1.0 / H.diag().clamp_min(1e-6)) + Q = torch.zeros(rows, cols, dtype=torch.int8) + for i1 in range(0, cols, block_size): + i2 = min(i1 + block_size, cols) + W_block = W[:, i1:i2].clone() + Hinv_block = Hinv[i1:i2, i1:i2] + Err = torch.zeros_like(W_block) + for j in range(i2 - i1): + w_col = W_block[:, j] + h_inv_jj = Hinv_block[j, j].clamp_min(1e-8) + # Quantize using pre-computed per-row scales + q_col = torch.clamp(torch.round(w_col / row_scale), -clip_range, clip_range) + deq_col = q_col * row_scale + Q[:, i1 + j] = q_col.to(torch.int8) + err = (w_col - deq_col) / h_inv_jj + Err[:, j] = err + if j + 1 < i2 - i1: + W_block[:, j + 1:] -= err.unsqueeze(1) * Hinv_block[j, j + 1:].unsqueeze(0) + if i2 < cols: + W[:, i2:] -= Err @ Hinv[i1:i2, i2:] + # Undo column reordering + Q = Q[:, invperm] + return Q, row_scale.to(torch.float16) +def gptq_calibrate(model: nn.Module, train_pattern: str, device: torch.device, + n_samples: int = 256, seq_len: int = 2048) -> dict[str, Tensor]: + """Collect Hessian H = X^T X for each linear layer using training data.""" + hessians: dict[str, Tensor] = {} + n_seen: dict[str, int] = {} + hooks = [] + def make_hook(name: str): + def hook_fn(module, inp, out): + x = inp[0].detach().float() + if x.ndim == 3: + x = x.reshape(-1, x.shape[-1]) + if name not in hessians: + hessians[name] = torch.zeros(x.shape[1], x.shape[1], device=x.device, dtype=torch.float32) + n_seen[name] = 0 + hessians[name].addmm_(x.t(), x) + n_seen[name] += x.shape[0] + return hook_fn + for name, module in model.named_modules(): + if isinstance(module, (nn.Linear, CastedLinear)): + hooks.append(module.register_forward_hook(make_hook(name))) + stream = TokenStream(train_pattern) + model.eval() + with torch.no_grad(): + for _ in range(n_samples): + tokens = stream.take(seq_len + 1).to(device=device, dtype=torch.int64) + x = tokens[:-1].unsqueeze(0) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + model.forward_logits(x) + for h in hooks: + h.remove() + for name in hessians: + hessians[name] /= max(n_seen[name], 1) + return hessians +def mixed_quantize_int6_gptq(state_dict: dict[str, Tensor], int6_cats: set[str], + hessians: dict[str, Tensor]) -> tuple[dict, dict]: + """Like mixed_quantize_int6 but uses GPTQ for int6 categories when Hessian available.""" + result: dict[str, Tensor] = {} + meta: dict[str, object] = {} + gptq_count, naive_count = 0, 0 + for name, tensor in state_dict.items(): + t = tensor.detach().cpu().contiguous() + cat = _classify_param(name) + if not t.is_floating_point() or t.numel() <= 65536: + result[name] = t.to(torch.float16) if t.is_floating_point() else t + meta[name] = "passthrough" + continue + if any(p in name for p in CONTROL_TENSOR_NAME_PATTERNS): + result[name] = t.float() + meta[name] = "passthrough_ctrl" + continue + if cat in int6_cats and t.ndim == 2: + module_name = name.rsplit(".weight", 1)[0] if name.endswith(".weight") else name + H = hessians.get(module_name) + if H is not None and H.shape[0] == t.shape[1]: + q, s = gptq_quantize_weight(t, H.cpu()) + gptq_count += 1 + else: + q, s = quantize_int6_per_row(t) + naive_count += 1 + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int6"} + elif cat in int6_cats and t.ndim >= 1: + q, s = quantize_int6_per_row(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int6"} + naive_count += 1 + else: + q, s = quantize_float_tensor(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int8"} + print(f"gptq_quantize: {gptq_count} GPTQ layers, {naive_count} naive layers", flush=True) + return result, meta +def quantize_int6_per_row(t: Tensor, clip_range: int = 31) -> tuple[Tensor, Tensor]: + t32 = t.float() + if t32.ndim == 2: + best_q, best_s, best_err = None, None, float('inf') + for pct in [0.9990, 0.9995, 0.9999, 0.99999, 1.0]: + if pct < 1.0: + row_clip = torch.quantile(t32.abs(), pct, dim=1) + else: + row_clip = t32.abs().amax(dim=1) + s = (row_clip / clip_range).clamp_min(1.0 / clip_range).to(torch.float16) + q = torch.clamp(torch.round(t32 / s.float()[:, None]), -clip_range, clip_range).to(torch.int8) + recon = q.float() * s.float()[:, None] + err = (t32 - recon).pow(2).mean().item() + if err < best_err: + best_q, best_s, best_err = q, s, err + return best_q, best_s + amax = t32.abs().max().item() + scale = torch.tensor(amax / clip_range if amax > 0 else 1.0, dtype=torch.float16) + q = torch.clamp(torch.round(t32 / scale.float()), -clip_range, clip_range).to(torch.int8) + return q, scale +def mixed_quantize_int6(state_dict: dict[str, Tensor], int6_cats: set[str]): + num_layers_total = max( + (int(k.split(".")[1]) for k in state_dict if k.startswith("blocks.")), + default=0, + ) + 1 + late_k_layers = set(range(num_layers_total - 2, num_layers_total)) + result: dict[str, Tensor] = {} + meta: dict[str, object] = {} + for name, tensor in state_dict.items(): + t = tensor.detach().cpu().contiguous() + cat = _classify_param(name) + if not t.is_floating_point() or t.numel() <= 65536: + result[name] = t.to(torch.float16) if t.is_floating_point() else t + meta[name] = "passthrough" + continue + if any(p in name for p in CONTROL_TENSOR_NAME_PATTERNS): + result[name] = t.float() + meta[name] = "passthrough_ctrl" + continue + if cat in int6_cats and t.ndim >= 1: + q, s = quantize_int6_per_row(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int6"} + else: + q, s = quantize_float_tensor(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int8"} + return result, meta +def dequantize_mixed_int6(result: dict[str, Tensor], meta: dict[str, object], + template_sd: dict[str, Tensor]) -> dict[str, Tensor]: + out: dict[str, Tensor] = {} + for name, orig in template_sd.items(): + info = meta.get(name) + if info is None: + continue + orig_dtype = orig.dtype + if info in ("passthrough", "passthrough_ctrl", "passthrough_fp16"): + t = result[name] + if t.dtype == torch.float16 and orig_dtype in (torch.float32, torch.bfloat16): + t = t.to(orig_dtype) + out[name] = t + continue + q, s = result[name + ".q"], result[name + ".scale"] + if s.ndim > 0: + out[name] = (q.float() * s.float().view(q.shape[0], *([1] * (q.ndim - 1)))).to(orig_dtype) + else: + out[name] = (q.float() * float(s.item())).to(orig_dtype) + return out +def main() -> None: + global zeropower_via_newtonschulz5 + code = Path(__file__).read_text(encoding="utf-8") + args = Hyperparameters() + if args.compile_enabled: + zeropower_via_newtonschulz5 = torch.compile(zeropower_via_newtonschulz5) + distributed = "RANK" in os.environ and "WORLD_SIZE" in os.environ + rank = int(os.environ.get("RANK", "0")) + world_size = int(os.environ.get("WORLD_SIZE", "1")) + local_rank = int(os.environ.get("LOCAL_RANK", "0")) + if world_size <= 0: + raise ValueError(f"WORLD_SIZE must be positive, got {world_size}") + if 8 % world_size != 0: + raise ValueError(f"WORLD_SIZE={world_size} must divide 8 so grad_accum_steps stays integral") + grad_accum_steps = 8 // world_size + grad_scale = 1.0 / grad_accum_steps + if not torch.cuda.is_available(): + raise RuntimeError("CUDA is required") + device = torch.device("cuda", local_rank) + torch.cuda.set_device(device) + if distributed: + dist.init_process_group(backend="nccl", device_id=device) + dist.barrier() + master_process = rank == 0 + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True + from torch.backends.cuda import enable_cudnn_sdp, enable_flash_sdp, enable_math_sdp, enable_mem_efficient_sdp + enable_cudnn_sdp(False) + enable_flash_sdp(True) + enable_mem_efficient_sdp(False) + enable_math_sdp(False) + logfile = None + if master_process: + os.makedirs("logs", exist_ok=True) + logfile = f"logs/{args.run_id}.txt" + print(logfile) + def log0(msg: str, console: bool = True) -> None: + if not master_process: + return + if console: + print(msg) + if logfile is not None: + with open(logfile, "a", encoding="utf-8") as f: + print(msg, file=f) + log0(code, console=False) + log0("=" * 100, console=False) + log0(f"Running Python {sys.version}", console=False) + log0(f"Running PyTorch {torch.__version__}", console=False) + log0( + subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=False).stdout, + console=False, + ) + log0("=" * 100, console=False) + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.cuda.manual_seed_all(args.seed) + if not args.tokenizer_path.endswith(".model"): + raise ValueError(f"Script only setup for SentencePiece .model file: {args.tokenizer_path}") + sp = spm.SentencePieceProcessor(model_file=args.tokenizer_path) + if int(sp.vocab_size()) != args.vocab_size: + raise ValueError( + f"VOCAB_SIZE={args.vocab_size} does not match tokenizer vocab_size={int(sp.vocab_size())}" + ) + dataset_dir = Path(args.data_path).resolve() + actual_train_files = len(list(dataset_dir.glob("fineweb_train_*.bin"))) + effective_eval_seq_len = args.eval_seq_len if args.eval_seq_len > 0 else args.train_seq_len + val_seq_len = max(args.train_seq_len, effective_eval_seq_len) + val_tokens = load_validation_tokens(args.val_files, val_seq_len) + base_bytes_lut, has_leading_space_lut, is_boundary_token_lut = build_sentencepiece_luts( + sp, args.vocab_size, device + ) + log0(f"val_bpb:enabled tokenizer_kind=sentencepiece tokenizer_path={args.tokenizer_path}") + log0(f"train_loader:dataset:{dataset_dir.name} train_shards:{actual_train_files}") + log0(f"val_loader:shards pattern={args.val_files} tokens:{val_tokens.numel() - 1}") + CastedLinear._qat_enabled = args.qat_enabled + base_model = GPT( + vocab_size=args.vocab_size, + num_layers=args.num_layers, + model_dim=args.model_dim, + num_heads=args.num_heads, + num_kv_heads=args.num_kv_heads, + mlp_mult=args.mlp_mult, + tie_embeddings=args.tie_embeddings, + tied_embed_init_std=args.tied_embed_init_std, + logit_softcap=args.logit_softcap, + rope_base=args.rope_base, + qk_gain_init=args.qk_gain_init, + mtp_num_heads=args.mtp_num_heads, + mtp_loss_weight=args.mtp_loss_weight, + bigram_vocab_size=args.bigram_vocab_size, + bigram_dim=args.bigram_dim, + xsa_last_n=args.xsa_last_n, + rope_dims=args.rope_dims, + ln_scale=args.ln_scale, + dtg=args.dtg_enabled, + ve_enabled=args.ve_enabled, + ve_dim=args.ve_dim, + ve_layers=args.ve_layers, + mlp_act=args.mlp_act, + mlp_leaky_slope=args.mlp_leaky_slope, + f1_corr_rank=args.f1_corr_rank, + f1_corr_scale_init=args.f1_corr_scale_init, + ).to(device).bfloat16() + for module in base_model.modules(): + if isinstance(module, CastedLinear): + module.float() + restore_low_dim_params_to_fp32(base_model) + # Complementary training: downweight tokens predictable by bigrams + complement_alpha = float(os.environ.get("COMPLEMENT_ALPHA", "0")) + if complement_alpha > 0: + tracker = TrainNgramTracker(args.vocab_size, device, complement_alpha=complement_alpha) + base_model._ngram_tracker = tracker + log0(f"complementary_training:alpha={complement_alpha}") + else: + base_model._ngram_tracker = None + compiled_model = maybe_torch_compile(base_model, args) + model: nn.Module = DDP(compiled_model, device_ids=[local_rank], broadcast_buffers=False) if distributed else compiled_model + block_named_params = list(base_model.blocks.named_parameters()) + matrix_params = [ + p + for name, p in block_named_params + if p.ndim == 2 and not any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS) + ] + if base_model.mtp_num_heads > 0: + matrix_params.extend([p for p in base_model.mtp_heads.parameters() if p.ndim == 2]) + if base_model.f1_corr_in is not None and base_model.f1_corr_out is not None: + matrix_params.append(base_model.f1_corr_in.weight) + matrix_params.append(base_model.f1_corr_out.weight) + scalar_params = [ + p + for name, p in block_named_params + if p.ndim < 2 or any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS) + ] + if base_model.skip_weights.numel() > 0: + scalar_params.append(base_model.skip_weights) + scalar_params.append(base_model.smear.gate) + if base_model.bigram is not None: + scalar_params.append(base_model.bigram.scale) + if base_model.f1_corr_scale is not None: + scalar_params.append(base_model.f1_corr_scale) + token_lr = args.tied_embed_lr if args.tie_embeddings else args.embed_lr + tok_params = [{"params": [base_model.tok_emb.weight], "lr": token_lr, "base_lr": token_lr}] + if base_model.bigram is not None: + tok_params.append({"params": [base_model.bigram.embed.weight], "lr": token_lr, "base_lr": token_lr}) + if base_model.bigram.proj is not None: + matrix_params.append(base_model.bigram.proj.weight) + if base_model.ve_shared is not None: + tok_params.append({"params": [base_model.ve_shared.embed.weight], "lr": token_lr, "base_lr": token_lr}) + if base_model.ve_shared.proj is not None: + matrix_params.append(base_model.ve_shared.proj.weight) + scalar_params.append(base_model.ve_shared.scale) + for s in base_model.ve_layer_scales: + scalar_params.append(s) + optimizer_tok = torch.optim.AdamW( + tok_params, + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + weight_decay=args.adam_wd, + fused=True, + ) + optimizer_muon = Muon( + matrix_params, + lr=args.matrix_lr, + momentum=args.muon_momentum, + backend_steps=args.muon_backend_steps, + weight_decay=args.muon_wd, + ) + for group in optimizer_muon.param_groups: + group["base_lr"] = args.matrix_lr + optimizer_scalar = torch.optim.AdamW( + [{"params": scalar_params, "lr": args.scalar_lr, "base_lr": args.scalar_lr}], + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + weight_decay=args.adam_wd, + fused=True, + ) + optimizers: list[torch.optim.Optimizer] = [optimizer_tok, optimizer_muon, optimizer_scalar] + if base_model.lm_head is not None: + optimizer_head = torch.optim.Adam( + [{"params": [base_model.lm_head.weight], "lr": args.head_lr, "base_lr": args.head_lr}], + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + fused=True, + ) + optimizers.insert(1, optimizer_head) + n_params = sum(p.numel() for p in base_model.parameters()) + f1_corr_params = 0 + if base_model.f1_corr_in is not None and base_model.f1_corr_out is not None: + f1_corr_params = int(base_model.f1_corr_in.weight.numel() + base_model.f1_corr_out.weight.numel()) + est_corr_int6_bytes = 0 + if args.f1_corr_rank > 0: + # int8 payload stores int6 values + per-row fp16 scales. + est_corr_int6_bytes = ( + args.f1_corr_rank * (args.model_dim + args.vocab_size) + + 2 * (args.f1_corr_rank + args.vocab_size) + ) + log0(f"model_params:{n_params}") + log0( + f"f1_corr:rank={args.f1_corr_rank} params={f1_corr_params} " + f"est_int6_bytes~{est_corr_int6_bytes}" + ) + log0(f"mlp_act:{args.mlp_act} mlp_leaky_slope:{args.mlp_leaky_slope}") + log0(f"XSA:last_{args.xsa_last_n} world_size:{world_size} grad_accum_steps:{grad_accum_steps}") + log0(f"num_heads:{args.num_heads} num_kv_heads:{args.num_kv_heads} embed_lr:{token_lr} matrix_lr:{args.matrix_lr}") + log0( + f"train_batch_tokens:{args.train_batch_tokens} train_seq_len:{args.train_seq_len} " + f"iterations:{args.iterations} warmup_steps:{args.warmup_steps} " + f"max_wallclock_seconds:{args.max_wallclock_seconds:.3f}" + ) + log0(f"compile:enabled={int(args.compile_enabled)} fullgraph={int(args.compile_fullgraph)}") + log0(f"seed:{args.seed}") + if args.ngram_eval_order >= 2: + log0( + f"ngram_eval:order={args.ngram_eval_order} alpha={args.ngram_eval_alpha} " + f"min_count={args.ngram_eval_min_count} buckets={args.ngram_eval_buckets}" + ) + train_loader = DistributedTokenLoader(args.train_files, rank, world_size, device) + def zero_grad_all() -> None: + for opt in optimizers: + opt.zero_grad(set_to_none=True) + max_wallclock_ms = 1000.0 * args.max_wallclock_seconds if args.max_wallclock_seconds > 0 else None + def lr_mul(step: int, elapsed_ms: float) -> float: + if args.warmdown_iters <= 0: + return 1.0 + if max_wallclock_ms is None: + warmdown_start = max(args.iterations - args.warmdown_iters, 0) + return max((args.iterations - step) / max(args.warmdown_iters, 1), 0.0) if warmdown_start <= step < args.iterations else 1.0 + step_ms = elapsed_ms / max(step, 1) + warmdown_ms = args.warmdown_iters * step_ms + remaining_ms = max(max_wallclock_ms - elapsed_ms, 0.0) + return remaining_ms / max(warmdown_ms, 1e-9) if remaining_ms <= warmdown_ms else 1.0 + if args.warmup_steps > 0: + initial_model_state = {name: tensor.detach().cpu().clone() for name, tensor in base_model.state_dict().items()} + initial_optimizer_states = [copy.deepcopy(opt.state_dict()) for opt in optimizers] + model.train() + for warmup_step in range(args.warmup_steps): + zero_grad_all() + for micro_step in range(grad_accum_steps): + if distributed: + model.require_backward_grad_sync = micro_step == grad_accum_steps - 1 + x, y = train_loader.next_batch(args.train_batch_tokens, args.train_seq_len, grad_accum_steps) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + warmup_loss = model(x, y) + (warmup_loss * grad_scale).backward() + for opt in optimizers: + opt.step() + zero_grad_all() + if args.warmup_steps <= 20 or (warmup_step + 1) % 10 == 0 or warmup_step + 1 == args.warmup_steps: + log0(f"warmup_step:{warmup_step + 1}/{args.warmup_steps}") + base_model.load_state_dict(initial_model_state, strict=True) + for opt, state in zip(optimizers, initial_optimizer_states, strict=True): + opt.load_state_dict(state) + zero_grad_all() + if distributed: + model.require_backward_grad_sync = True + train_loader = DistributedTokenLoader(args.train_files, rank, world_size, device) + swa_state: dict[str, Tensor] | None = None + swa_count = 0 + ema_state = {name: t.detach().float().clone() for name, t in base_model.state_dict().items()} + ema_decay = 0.997 + training_time_ms = 0.0 + stop_after_step: int | None = None + torch.cuda.synchronize() + t0 = time.perf_counter() + step = 0 + while True: + last_step = step == args.iterations or (stop_after_step is not None and step >= stop_after_step) + should_validate = last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0) + if should_validate: + torch.cuda.synchronize() + training_time_ms += 1000.0 * (time.perf_counter() - t0) + val_loss, val_bpb = eval_val( + args, + model, + rank, + world_size, + device, + grad_accum_steps, + val_tokens, + base_bytes_lut, + has_leading_space_lut, + is_boundary_token_lut, + ) + log0( + f"step:{step}/{args.iterations} val_loss:{val_loss:.4f} val_bpb:{val_bpb:.4f} " + f"train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms / max(step, 1):.2f}ms" + ) + torch.cuda.synchronize() + t0 = time.perf_counter() + if last_step: + if stop_after_step is not None and step < args.iterations: + log0( + f"stopping_early: wallclock_cap train_time:{training_time_ms:.0f}ms " + f"step:{step}/{args.iterations}" + ) + break + elapsed_ms = training_time_ms + 1000.0 * (time.perf_counter() - t0) + scale = lr_mul(step, elapsed_ms) + if args.late_qat_threshold > 0 and scale < args.late_qat_threshold and not CastedLinear._qat_enabled: + CastedLinear._qat_enabled = True + log0(f"late_qat:enabled step:{step} scale:{scale:.4f}") + zero_grad_all() + train_loss = torch.zeros((), device=device) + for micro_step in range(grad_accum_steps): + if distributed: + model.require_backward_grad_sync = micro_step == grad_accum_steps - 1 + x, y = train_loader.next_batch(args.train_batch_tokens, args.train_seq_len, grad_accum_steps) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + loss = model(x, y) + train_loss += loss.detach() + loss.backward() + if base_model._ngram_tracker is not None: + base_model._ngram_tracker.update(x, y) + train_loss /= grad_accum_steps + frac = min(step / args.muon_momentum_warmup_steps, 1.0) if args.muon_momentum_warmup_steps > 0 else 1.0 + muon_momentum = (1 - frac) * args.muon_momentum_warmup_start + frac * args.muon_momentum + for group in optimizer_muon.param_groups: + group["momentum"] = muon_momentum + for opt in optimizers: + for group in opt.param_groups: + group["lr"] = group["base_lr"] * scale + if args.grad_clip_norm > 0: + torch.nn.utils.clip_grad_norm_(base_model.parameters(), args.grad_clip_norm) + for opt in optimizers: + opt.step() + zero_grad_all() + # EMA update + with torch.no_grad(): + for name, t in base_model.state_dict().items(): + ema_state[name].mul_(ema_decay).add_(t.detach().float(), alpha=1.0 - ema_decay) + step += 1 + approx_training_time_ms = training_time_ms + 1000.0 * (time.perf_counter() - t0) + if args.swa_enabled and scale < 0.2 and step % args.swa_every == 0: + if swa_state is None: + swa_state = {name: t.detach().cpu().clone() for name, t in base_model.state_dict().items()} + swa_count = 1 + log0(f"swa:start step:{step}") + else: + for name, t in base_model.state_dict().items(): + swa_state[name] += t.detach().cpu() + swa_count += 1 + should_log_train = ( + args.train_log_every > 0 + and (step <= 10 or step % args.train_log_every == 0 or stop_after_step is not None) + ) + if should_log_train: + log0( + f"step:{step}/{args.iterations} train_loss:{train_loss.item():.4f} " + f"train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms / step:.2f}ms" + ) + reached_cap = max_wallclock_ms is not None and approx_training_time_ms >= max_wallclock_ms + if distributed and max_wallclock_ms is not None: + reached_cap_tensor = torch.tensor(int(reached_cap), device=device) + dist.all_reduce(reached_cap_tensor, op=dist.ReduceOp.MAX) + reached_cap = bool(reached_cap_tensor.item()) + if stop_after_step is None and reached_cap: + stop_after_step = step + log0( + f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB " + f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB" + ) + # GPTQ calibration: collect Hessians from training data DURING training phase + # (must happen before training ends to comply with eval-time data access rules) + log0("gptq:calibrating with training data...") + t_gptq = time.perf_counter() + gptq_hessians = gptq_calibrate(base_model, args.train_files, device, n_samples=256, seq_len=args.train_seq_len) + log0(f"gptq:calibrated {len(gptq_hessians)} layers in {time.perf_counter()-t_gptq:.1f}s") + if args.distill_enabled and args.distill_steps > 0: + log0( + f"distill:start steps:{args.distill_steps} lr_factor:{args.distill_lr_factor} " + f"temp:{args.distill_temperature} alpha:{args.distill_alpha} kl_clip:{args.distill_kl_clip}" + ) + current_state = base_model.state_dict() + teacher_state = {name: t.to(dtype=current_state[name].dtype) for name, t in ema_state.items()} + teacher_model = GPT( + vocab_size=args.vocab_size, num_layers=args.num_layers, model_dim=args.model_dim, + num_heads=args.num_heads, num_kv_heads=args.num_kv_heads, mlp_mult=args.mlp_mult, + tie_embeddings=args.tie_embeddings, tied_embed_init_std=args.tied_embed_init_std, + logit_softcap=args.logit_softcap, rope_base=args.rope_base, qk_gain_init=args.qk_gain_init, + mtp_num_heads=args.mtp_num_heads, mtp_loss_weight=args.mtp_loss_weight, + bigram_vocab_size=args.bigram_vocab_size, bigram_dim=args.bigram_dim, + xsa_last_n=args.xsa_last_n, rope_dims=args.rope_dims, ln_scale=args.ln_scale, dtg=args.dtg_enabled, + ve_enabled=args.ve_enabled, ve_dim=args.ve_dim, ve_layers=args.ve_layers, + mlp_act=args.mlp_act, mlp_leaky_slope=args.mlp_leaky_slope, + f1_corr_rank=args.f1_corr_rank, f1_corr_scale_init=args.f1_corr_scale_init, + ).to(device).bfloat16() + for m in teacher_model.modules(): + if isinstance(m, CastedLinear): + m.float() + restore_low_dim_params_to_fp32(teacher_model) + teacher_model.load_state_dict(teacher_state, strict=True) + teacher_model.eval() + for p in teacher_model.parameters(): + p.requires_grad_(False) + compiled_teacher_logits = maybe_torch_compile(teacher_model.forward_logits, args) + model.train() + T = args.distill_temperature + alpha = args.distill_alpha + for d_step in range(args.distill_steps): + zero_grad_all() + for opt in optimizers: + for group in opt.param_groups: + group["lr"] = group["base_lr"] * args.distill_lr_factor + x, y = train_loader.next_batch(args.train_batch_tokens, args.train_seq_len, grad_accum_steps) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + student_logits = base_model.forward_logits(x) + with torch.no_grad(): + teacher_logits = compiled_teacher_logits(x) + student_log_probs = F.log_softmax(student_logits.float() / T, dim=-1) + teacher_probs = F.softmax(teacher_logits.float() / T, dim=-1) + token_kl = F.kl_div(student_log_probs, teacher_probs, reduction="none").sum(dim=-1) + kl_loss = token_kl.mean() * (T * T) + if args.distill_kl_clip > 0: + kl_loss = torch.clamp(kl_loss, max=args.distill_kl_clip) + ce_loss = F.cross_entropy( + student_logits.reshape(-1, student_logits.size(-1)).float(), + y.reshape(-1), + reduction="mean", + ) + loss = alpha * kl_loss + (1.0 - alpha) * ce_loss + (loss * grad_scale).backward() + if world_size > 1: + for p in base_model.parameters(): + if p.grad is not None: + dist.all_reduce(p.grad, op=dist.ReduceOp.AVG) + if args.grad_clip_norm > 0: + torch.nn.utils.clip_grad_norm_(base_model.parameters(), args.grad_clip_norm) + for opt in optimizers: + opt.step() + zero_grad_all() + with torch.no_grad(): + for name, t in base_model.state_dict().items(): + ema_state[name].mul_(ema_decay).add_(t.detach().float(), alpha=1.0 - ema_decay) + if (d_step + 1) % 8 == 0 or d_step == 0: + log0( + f"distill:step:{d_step + 1}/{args.distill_steps} " + f"kl:{kl_loss.item():.4f} ce:{ce_loss.item():.4f} total:{loss.item():.4f}" + ) + del teacher_model, compiled_teacher_logits + torch.cuda.empty_cache() + log0("distill:done") + # Apply EMA weights (better than SWA alone per PR#401) + log0("ema:applying EMA weights") + current_state = base_model.state_dict() + avg_state = {name: t.to(dtype=current_state[name].dtype) for name, t in ema_state.items()} + base_model.load_state_dict(avg_state, strict=True) + torch.cuda.synchronize() + t_diag = time.perf_counter() + diag_val_loss, diag_val_bpb = eval_val( + args, compiled_model, rank, world_size, device, grad_accum_steps, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + ) + torch.cuda.synchronize() + log0( + f"DIAGNOSTIC post_ema val_loss:{diag_val_loss:.4f} val_bpb:{diag_val_bpb:.4f} " + f"eval_time:{1000.0 * (time.perf_counter() - t_diag):.0f}ms" + ) + full_state_dict = base_model.state_dict() + export_sd = {k: v for k, v in full_state_dict.items() if "mtp_heads" not in k} + excluded_mtp = sum(int(t.numel()) for k, t in full_state_dict.items() if "mtp_heads" in k) + if excluded_mtp > 0: + log0(f"export_excluding_mtp_params:{excluded_mtp}") + if master_process: + torch.save(export_sd, "final_model.pt") + model_bytes = os.path.getsize("final_model.pt") + code_bytes = len(code.encode("utf-8")) + log0(f"Serialized model: {model_bytes} bytes") + log0(f"Code size: {code_bytes} bytes") + sd_cpu = {k: v.detach().cpu() for k, v in export_sd.items()} + # GPTQ quantization using Hessians collected during training phase (no training data access here) + quant_result, quant_meta = mixed_quantize_int6_gptq(sd_cpu, {"mlp", "attn", "aux"}, gptq_hessians) + quant_buf = io.BytesIO() + torch.save({"w": quant_result, "m": quant_meta}, quant_buf) + quant_raw = quant_buf.getvalue() + quant_blob = zstandard.ZstdCompressor(level=22).compress(quant_raw) if _COMPRESSOR == "zstd" else zlib.compress(quant_raw, 9) + if master_process: + with open("final_model.int6.ptz", "wb") as f: + f.write(quant_blob) + quant_file_bytes = len(quant_blob) + code_bytes = len(code.encode("utf-8")) + log0(f"Serialized model int6+{_COMPRESSOR}: {quant_file_bytes} bytes") + log0(f"Total submission size int6+{_COMPRESSOR}: {quant_file_bytes + code_bytes} bytes") + log0(f"Total submission size int8+zlib: {quant_file_bytes + code_bytes} bytes") + if distributed: + dist.barrier() + with open("final_model.int6.ptz", "rb") as f: + quant_blob_disk = f.read() + quant_state = torch.load( + io.BytesIO(zstandard.ZstdDecompressor().decompress(quant_blob_disk) if _COMPRESSOR == "zstd" else zlib.decompress(quant_blob_disk)), + map_location="cpu", + ) + deq_state = dequantize_mixed_int6(quant_state["w"], quant_state["m"], sd_cpu) + eval_model = GPT( + vocab_size=args.vocab_size, num_layers=args.num_layers, model_dim=args.model_dim, + num_heads=args.num_heads, num_kv_heads=args.num_kv_heads, mlp_mult=args.mlp_mult, + tie_embeddings=args.tie_embeddings, tied_embed_init_std=args.tied_embed_init_std, + logit_softcap=args.logit_softcap, rope_base=args.rope_base, qk_gain_init=args.qk_gain_init, + mtp_num_heads=0, mtp_loss_weight=0.0, + bigram_vocab_size=args.bigram_vocab_size, bigram_dim=args.bigram_dim, + xsa_last_n=args.xsa_last_n, # must match training model + rope_dims=args.rope_dims, ln_scale=args.ln_scale, dtg=args.dtg_enabled, + ve_enabled=args.ve_enabled, ve_dim=args.ve_dim, ve_layers=args.ve_layers, + mlp_act=args.mlp_act, mlp_leaky_slope=args.mlp_leaky_slope, + f1_corr_rank=args.f1_corr_rank, f1_corr_scale_init=args.f1_corr_scale_init, + ).to(device).bfloat16() + for m in eval_model.modules(): + if isinstance(m, CastedLinear): + m.float() + restore_low_dim_params_to_fp32(eval_model) + eval_model.load_state_dict(deq_state, strict=True) + compiled_eval = maybe_torch_compile(eval_model, args) + torch.cuda.synchronize() + t_qeval = time.perf_counter() + q_val_loss, q_val_bpb = eval_val( + args, compiled_eval, rank, world_size, device, grad_accum_steps, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + eval_seq_len=effective_eval_seq_len, + ) + torch.cuda.synchronize() + log0( + f"final_int6_roundtrip val_loss:{q_val_loss:.4f} val_bpb:{q_val_bpb:.4f} " + f"eval_time:{1000.0 * (time.perf_counter() - t_qeval):.0f}ms" + ) + log0(f"final_int6_roundtrip_exact val_loss:{q_val_loss:.8f} val_bpb:{q_val_bpb:.8f}") + sw_seq_len = effective_eval_seq_len + if args.eval_stride > 0 and args.eval_stride < sw_seq_len: + torch.cuda.synchronize() + t_slide = time.perf_counter() + sw_val_loss, sw_val_bpb = eval_val_sliding( + args, eval_model, rank, world_size, device, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + stride=args.eval_stride, + eval_seq_len=sw_seq_len, + ) + torch.cuda.synchronize() + log0( + f"final_int6_sliding_window val_loss:{sw_val_loss:.4f} val_bpb:{sw_val_bpb:.4f} " + f"stride:{args.eval_stride} eval_time:{1000.0 * (time.perf_counter() - t_slide):.0f}ms" + ) + log0(f"final_int6_sliding_window_exact val_loss:{sw_val_loss:.8f} val_bpb:{sw_val_bpb:.8f}") + log0(f"final_int8_zlib_roundtrip_exact val_loss:{sw_val_loss:.8f} val_bpb:{sw_val_bpb:.8f}") + if args.ngram_eval_order >= 2: + if distributed: + dist.barrier() + torch.cuda.synchronize() + t_ng = time.perf_counter() + ng_loss, ng_bpb, ng_coverage = eval_val_sliding_hashed_ngram( + args, + eval_model, + rank, + world_size, + device, + val_tokens, + base_bytes_lut, + has_leading_space_lut, + is_boundary_token_lut, + stride=args.eval_stride, + order=args.ngram_eval_order, + alpha=args.ngram_eval_alpha, + min_count=args.ngram_eval_min_count, + buckets=args.ngram_eval_buckets, + max_seconds=args.ngram_eval_max_seconds, + eval_seq_len=sw_seq_len, + ) + if rank == 0: + torch.cuda.synchronize() + ng_eval_ms = 1000.0 * (time.perf_counter() - t_ng) + if ng_coverage >= 0.999999: + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order} val_loss:{ng_loss:.4f} " + f"val_bpb:{ng_bpb:.4f} eval_time:{ng_eval_ms:.0f}ms" + ) + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order}_exact " + f"val_loss:{ng_loss:.8f} val_bpb:{ng_bpb:.8f}" + ) + else: + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order}_partial val_loss:{ng_loss:.4f} " + f"val_bpb:{ng_bpb:.4f} coverage:{ng_coverage:.4f} eval_time:{ng_eval_ms:.0f}ms" + ) + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order}_partial_exact " + f"val_loss:{ng_loss:.8f} val_bpb:{ng_bpb:.8f} coverage:{ng_coverage:.8f}" + ) + if distributed: + dist.barrier() + if distributed: + dist.destroy_process_group() +if __name__ == "__main__": + main() diff --git a/experiments/B_wing/bwing_full_port/HYPOTHESIS.md b/experiments/B_wing/bwing_full_port/HYPOTHESIS.md new file mode 100644 index 000000000..21e11f8d9 --- /dev/null +++ b/experiments/B_wing/bwing_full_port/HYPOTHESIS.md @@ -0,0 +1,28 @@ +# B-WING FULL PORT — All #809 N-gram Techniques + +## Hypothesis +Combine all three key innovations from PR #809 onto our X-WING base: +1. Alpha curve: min=0.05, max=0.60, clip=0.95 +2. Per-order entropy center shift: -0.25*(order - min_order) +3. Fixed order multipliers: (0.3, 0.3, 0.97, 2.0, 2.0, 2.0, 2.0, 2.0) + → replaces cubric 3D adaptive system + +This is the "kitchen sink" variant. If bwing_alpha and bwing_entropy_shift +each show gains, this should stack them. + +## Changes from X-WING baseline +1. NGRAM_EVAL_ALPHA_MIN: 0.20 → 0.05 +2. NGRAM_EVAL_ALPHA_MAX: 0.75 → 0.60 +3. Alpha CLIP: 0.75 → 0.95 +4. Per-order entropy center shift +5. Fixed order multipliers replacing cubric 3D +6. Order 4 mult: 0.45 → 0.97 (big change) +7. Order 2 mult: 0.45 → 0.30 + +## Risk +Removing cubric 3D loses per-entropy-bin adaptation. But their fixed mults +work at 0.295 BPB so the risk is bounded. + +## Expected impact +Should approach their 0.295 while keeping our better base model (~1.12 vs 1.14). +Target: sub-0.30 BPB. diff --git a/experiments/B_wing/bwing_full_port/run.sh b/experiments/B_wing/bwing_full_port/run.sh new file mode 100755 index 000000000..0d9cf56f2 --- /dev/null +++ b/experiments/B_wing/bwing_full_port/run.sh @@ -0,0 +1,56 @@ +#!/bin/bash +set -euo pipefail +# B-WING FULL PORT: All PR #809 n-gram innovations on our X-WING base +# Changes: alpha 0.05-0.60 clip=0.95, entropy shift, fixed order mults (no cubric) + +SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd -- "${SCRIPT_DIR}/../../.." && pwd)" +cd "${REPO_ROOT}" +export PYTHONPATH="${REPO_ROOT}/flash-attention/hopper:${PYTHONPATH:-}" + +SEED="${SEED:-1337}" +NPROC_PER_NODE="${NPROC_PER_NODE:-8}" + +echo "============================================" +echo " B-WING FULL PORT — #809 N-gram Techniques" +echo " Seed: ${SEED}" +echo " Fixed order mults (no cubric)" +echo " Complementary training: alpha=0.5" +echo " Eval alpha: 0.05-0.60 clip=0.95 + entropy shift | Orders: 2-9" +echo "============================================" + +SEED="$SEED" \ +F1_CORR_RANK=0 \ +DISTILL_ENABLED=0 \ +MLP_ACT=leaky_relu_sq \ +MLP_LEAKY_SLOPE=0.5 \ +XSA_LAST_N=4 \ +BIGRAM_VOCAB_SIZE=1536 \ +TTT_EVAL_ENABLED=0 \ +ROPE_DIMS=24 \ +VAL_LOSS_EVERY=20000 \ +TRAIN_LOG_EVERY=1000 \ +SWA_EVERY=100 \ +COMPLEMENT_ALPHA=0.5 \ +NGRAM_EVAL_ORDER=9 \ +NGRAM_EVAL_MIN_ORDER=2 \ +NGRAM_EVAL_ADAPTIVE=1 \ +NGRAM_EVAL_ALPHA=0.30 \ +NGRAM_EVAL_ALPHA_MIN=0.05 \ +NGRAM_EVAL_ALPHA_MAX=0.60 \ +NGRAM_EVAL_ENTROPY_CENTER=3.0 \ +NGRAM_EVAL_ENTROPY_SCALE=2.0 \ +NGRAM_EVAL_MIN_COUNT=2 \ +NGRAM_EVAL_BUCKETS=8388608 \ +NGRAM_EVAL_MAX_SECONDS=300 \ +CUBRIC_CADENCE=0 \ +NGRAM_ENTROPY_SHIFT=1 \ +NGRAM_ORDER_MULTS="0.3,0.3,0.97,2.0,2.0,2.0,2.0,2.0" \ +COMPILE_FULLGRAPH=0 \ +torchrun --standalone --nproc_per_node="${NPROC_PER_NODE}" \ + "${SCRIPT_DIR}/train_gpt.py" \ + 2>&1 | tee "logs/bwing_fullport_s${SEED}_$(date +%Y%m%d_%H%M%S).log" + +echo "============================================" +echo " DONE" +echo "============================================" diff --git a/experiments/B_wing/bwing_full_port/train_gpt.py b/experiments/B_wing/bwing_full_port/train_gpt.py new file mode 100644 index 000000000..fadf6073d --- /dev/null +++ b/experiments/B_wing/bwing_full_port/train_gpt.py @@ -0,0 +1,2138 @@ +from __future__ import annotations +import copy +import glob +import io +import math +import os +import random +import subprocess +import sys +import time +import uuid +import zlib +from pathlib import Path +try: + import zstandard + _COMPRESSOR = "zstd" +except ImportError: + _COMPRESSOR = "zlib" +import numpy as np +import sentencepiece as spm +import torch +import torch.distributed as dist +import torch.nn.functional as F +from torch import Tensor, nn +from torch.nn.parallel import DistributedDataParallel as DDP +try: + from flash_attn_interface import flash_attn_func as flash_attn_3_func +except ImportError: + def flash_attn_3_func(q, k, v, causal=False): + # q: (B, T, Hq, D), k/v: (B, T, Hkv, D) — expand KV for GQA + q2 = q.transpose(1, 2) # (B, Hq, T, D) + k2 = k.transpose(1, 2) # (B, Hkv, T, D) + v2 = v.transpose(1, 2) + if k2.size(1) != q2.size(1): + rep = q2.size(1) // k2.size(1) + k2 = k2.repeat_interleave(rep, dim=1) + v2 = v2.repeat_interleave(rep, dim=1) + out = torch.nn.functional.scaled_dot_product_attention(q2, k2, v2, is_causal=causal) + return out.transpose(1, 2) +class Hyperparameters: + data_path = os.environ.get("DATA_PATH", "./data/datasets/fineweb10B_sp1024") + train_files = os.path.join(data_path, "fineweb_train_*.bin") + val_files = os.path.join(data_path, "fineweb_val_*.bin") + tokenizer_path = os.environ.get("TOKENIZER_PATH", "./data/tokenizers/fineweb_1024_bpe.model") + run_id = os.environ.get("RUN_ID", str(uuid.uuid4())) + seed = int(os.environ.get("SEED", 1337)) + val_batch_size = int(os.environ.get("VAL_BATCH_SIZE", 524_288)) + val_loss_every = int(os.environ.get("VAL_LOSS_EVERY", 4000)) + train_log_every = int(os.environ.get("TRAIN_LOG_EVERY", 500)) + iterations = int(os.environ.get("ITERATIONS", 20000)) + warmdown_iters = int(os.environ.get("WARMDOWN_ITERS", 3500)) + warmup_steps = int(os.environ.get("WARMUP_STEPS", 20)) + train_batch_tokens = int(os.environ.get("TRAIN_BATCH_TOKENS", 786_432)) + train_seq_len = int(os.environ.get("TRAIN_SEQ_LEN", 2048)) + eval_seq_len = int(os.environ.get("EVAL_SEQ_LEN", 2048)) + max_wallclock_seconds = float(os.environ.get("MAX_WALLCLOCK_SECONDS", 600.0)) + qk_gain_init = float(os.environ.get("QK_GAIN_INIT", 1.5)) + vocab_size = int(os.environ.get("VOCAB_SIZE", 1024)) + num_layers = int(os.environ.get("NUM_LAYERS", 11)) + num_kv_heads = int(os.environ.get("NUM_KV_HEADS", 4)) + model_dim = int(os.environ.get("MODEL_DIM", 512)) + num_heads = int(os.environ.get("NUM_HEADS", 8)) + mlp_mult = float(os.environ.get("MLP_MULT", 3.0)) + mlp_act = os.environ.get("MLP_ACT", "relu_sq").lower() + mlp_leaky_slope = float(os.environ.get("MLP_LEAKY_SLOPE", 0.5)) + tie_embeddings = bool(int(os.environ.get("TIE_EMBEDDINGS", "1"))) + rope_base = float(os.environ.get("ROPE_BASE", 10000.0)) + logit_softcap = float(os.environ.get("LOGIT_SOFTCAP", 30.0)) + embed_lr = float(os.environ.get("EMBED_LR", 0.6)) + head_lr = float(os.environ.get("HEAD_LR", 0.008)) + tied_embed_lr = float(os.environ.get("TIED_EMBED_LR", 0.035)) + tied_embed_init_std = float(os.environ.get("TIED_EMBED_INIT_STD", 0.005)) + matrix_lr = float(os.environ.get("MATRIX_LR", 0.025)) + scalar_lr = float(os.environ.get("SCALAR_LR", 0.025)) + muon_momentum = float(os.environ.get("MUON_MOMENTUM", 0.99)) + muon_backend_steps = int(os.environ.get("MUON_BACKEND_STEPS", 5)) + muon_momentum_warmup_start = float(os.environ.get("MUON_MOMENTUM_WARMUP_START", 0.92)) + muon_momentum_warmup_steps = int(os.environ.get("MUON_MOMENTUM_WARMUP_STEPS", 1500)) + beta1 = float(os.environ.get("BETA1", 0.9)) + beta2 = float(os.environ.get("BETA2", 0.95)) + adam_eps = float(os.environ.get("ADAM_EPS", 1e-8)) + grad_clip_norm = float(os.environ.get("GRAD_CLIP_NORM", 0.3)) + eval_stride = int(os.environ.get("EVAL_STRIDE", 64)) + mtp_num_heads = int(os.environ.get("MTP_NUM_HEADS", 0)) + mtp_loss_weight = float(os.environ.get("MTP_LOSS_WEIGHT", 0.2)) + muon_beta2 = float(os.environ.get("MUON_BETA2", 0.95)) + swa_enabled = bool(int(os.environ.get("SWA_ENABLED", "1"))) + swa_every = int(os.environ.get("SWA_EVERY", 50)) # tighter: collect more recent checkpoints + muon_wd = float(os.environ.get("MUON_WD", 0.04)) + adam_wd = float(os.environ.get("ADAM_WD", 0.04)) + qat_enabled = bool(int(os.environ.get("QAT_ENABLED", "0"))) + bigram_vocab_size = int(os.environ.get("BIGRAM_VOCAB_SIZE", 2048)) + bigram_dim = int(os.environ.get("BIGRAM_DIM", 128)) + xsa_last_n = int(os.environ.get("XSA_LAST_N", 11)) # XSA on ALL 11 layers + rope_dims = int(os.environ.get("ROPE_DIMS", 16)) + ln_scale = bool(int(os.environ.get("LN_SCALE", "1"))) + dtg_enabled = bool(int(os.environ.get("DTG_ENABLED", "0"))) + late_qat_threshold = float(os.environ.get("LATE_QAT_THRESHOLD", 0.5)) + ve_enabled = bool(int(os.environ.get("VE_ENABLED", "1"))) + ve_dim = int(os.environ.get("VE_DIM", 128)) + ve_layers = os.environ.get("VE_LAYERS", "9,10") + # F1 capacity add-on: low-rank correction head (active at inference). + # Approx extra params ~= rank * (model_dim + vocab_size). + f1_corr_rank = int(os.environ.get("F1_CORR_RANK", 0)) + f1_corr_scale_init = float(os.environ.get("F1_CORR_SCALE_INIT", 0.10)) + # Post-train self-distillation: EMA teacher -> student. + distill_enabled = bool(int(os.environ.get("DISTILL_ENABLED", "0"))) + distill_steps = int(os.environ.get("DISTILL_STEPS", 24)) + distill_lr_factor = float(os.environ.get("DISTILL_LR_FACTOR", 0.02)) + distill_temperature = float(os.environ.get("DISTILL_TEMPERATURE", 1.5)) + distill_alpha = float(os.environ.get("DISTILL_ALPHA", 0.60)) + distill_kl_clip = float(os.environ.get("DISTILL_KL_CLIP", 10.0)) + # Optional legal score-first hashed n-gram interpolation at eval time. + # Multi-order backoff (2..max_order) with entropy-adaptive alpha. + # Alpha depends only on model entropy (no target/label access). + ngram_eval_order = int(os.environ.get("NGRAM_EVAL_ORDER", 0)) # 0=off, max order for backoff + ngram_eval_min_order = int(os.environ.get("NGRAM_EVAL_MIN_ORDER", 2)) # min order for backoff + ngram_eval_alpha = float(os.environ.get("NGRAM_EVAL_ALPHA", 0.30)) # base alpha (or fixed if adaptive off) + ngram_eval_adaptive = bool(int(os.environ.get("NGRAM_EVAL_ADAPTIVE", "1"))) # entropy-adaptive alpha + ngram_eval_alpha_min = float(os.environ.get("NGRAM_EVAL_ALPHA_MIN", 0.05)) # alpha floor (confident model) + ngram_eval_alpha_max = float(os.environ.get("NGRAM_EVAL_ALPHA_MAX", 0.60)) # alpha ceiling (uncertain model) + ngram_eval_entropy_center = float(os.environ.get("NGRAM_EVAL_ENTROPY_CENTER", 4.0)) # sigmoid center + ngram_eval_entropy_scale = float(os.environ.get("NGRAM_EVAL_ENTROPY_SCALE", 2.0)) # sigmoid steepness + ngram_eval_min_count = int(os.environ.get("NGRAM_EVAL_MIN_COUNT", 2)) + ngram_eval_buckets = int(os.environ.get("NGRAM_EVAL_BUCKETS", 4_194_304)) + ngram_eval_max_seconds = float(os.environ.get("NGRAM_EVAL_MAX_SECONDS", 0.0)) + ngram_entropy_shift = bool(int(os.environ.get("NGRAM_ENTROPY_SHIFT", "0"))) # per-order center shift + ngram_order_mults_str = os.environ.get("NGRAM_ORDER_MULTS", "") # fixed per-order multipliers (comma-sep) + cubric_cadence = int(os.environ.get("CUBRIC_CADENCE", 0)) + compile_enabled = bool(int(os.environ.get("COMPILE_ENABLED", "1"))) + compile_fullgraph = bool(int(os.environ.get("COMPILE_FULLGRAPH", "1"))) +def maybe_torch_compile(obj, args: Hyperparameters): + if not args.compile_enabled: + return obj + return torch.compile(obj, dynamic=False, fullgraph=args.compile_fullgraph) +class TrainNgramTracker: + """Complementary training: track bigram stats, downweight tokens n-grams can predict.""" + def __init__(self, vocab_size: int, device: torch.device, complement_alpha: float = 0.5): + self.V = vocab_size + self.alpha = complement_alpha + self.bi_counts = torch.zeros(vocab_size, vocab_size, device=device, dtype=torch.float32) + self.bi_totals = torch.zeros(vocab_size, device=device, dtype=torch.float32) + @torch.no_grad() + def update(self, x: Tensor, y: Tensor): + xf = x.reshape(-1) + yf = y.reshape(-1) + ones = torch.ones(xf.numel(), device=xf.device, dtype=torch.float32) + self.bi_counts.reshape(-1).scatter_add_(0, xf * self.V + yf, ones) + self.bi_totals.scatter_add_(0, xf, ones) + def get_weights(self, x: Tensor, y: Tensor) -> Tensor: + xf = x.reshape(-1) + yf = y.reshape(-1) + total = self.bi_totals[xf] + count = self.bi_counts.reshape(-1)[xf * self.V + yf] + ngram_prob = count / (total + 1) + return (1.0 - self.alpha * ngram_prob).clamp(min=0.1) +def zeropower_via_newtonschulz5(G: Tensor, steps: int = 10, eps: float = 1e-7) -> Tensor: + a, b, c = (3.4445, -4.7750, 2.0315) + X = G.bfloat16() + X /= X.norm() + eps + transposed = G.size(0) > G.size(1) + if transposed: + X = X.T + for _ in range(steps): + A = X @ X.T + B = b * A + c * A @ A + X = a * X + B @ X + return X.T if transposed else X +class Muon(torch.optim.Optimizer): + def __init__(self, params, lr: float, momentum: float, backend_steps: int, + nesterov: bool = True, weight_decay: float = 0.0): + super().__init__( + params, + dict(lr=lr, momentum=momentum, backend_steps=backend_steps, + nesterov=nesterov, weight_decay=weight_decay), + ) + @torch.no_grad() + def step(self, closure=None): + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + distributed = dist.is_available() and dist.is_initialized() + world_size = dist.get_world_size() if distributed else 1 + rank = dist.get_rank() if distributed else 0 + for group in self.param_groups: + params = group["params"] + if not params: + continue + lr = group["lr"] + momentum = group["momentum"] + backend_steps = group["backend_steps"] + nesterov = group["nesterov"] + total_params = sum(int(p.numel()) for p in params) + updates_flat = torch.zeros(total_params, device=params[0].device, dtype=torch.bfloat16) + curr = 0 + for i, p in enumerate(params): + if i % world_size == rank and p.grad is not None: + g = p.grad + state = self.state[p] + if "momentum_buffer" not in state: + state["momentum_buffer"] = torch.zeros_like(g) + buf = state["momentum_buffer"] + buf.mul_(momentum).add_(g) + if nesterov: + g = g.add(buf, alpha=momentum) + g = zeropower_via_newtonschulz5(g, steps=backend_steps) + g *= max(1, g.size(0) / g.size(1)) ** 0.5 + updates_flat[curr : curr + p.numel()] = g.reshape(-1) + curr += p.numel() + if distributed: + dist.all_reduce(updates_flat, op=dist.ReduceOp.SUM) + wd = group.get("weight_decay", 0.0) + curr = 0 + for p in params: + if wd > 0.0: + p.data.mul_(1.0 - lr * wd) + g = updates_flat[curr : curr + p.numel()].view_as(p).to(dtype=p.dtype) + p.add_(g, alpha=-lr) + curr += p.numel() + return loss +def build_sentencepiece_luts( + sp: spm.SentencePieceProcessor, vocab_size: int, device: torch.device +) -> tuple[Tensor, Tensor, Tensor]: + sp_vocab_size = int(sp.vocab_size()) + table_size = max(sp_vocab_size, vocab_size) + base_bytes_np = np.zeros((table_size,), dtype=np.int16) + has_leading_space_np = np.zeros((table_size,), dtype=np.bool_) + is_boundary_token_np = np.ones((table_size,), dtype=np.bool_) + for token_id in range(sp_vocab_size): + if sp.is_control(token_id) or sp.is_unknown(token_id) or sp.is_unused(token_id): + continue + is_boundary_token_np[token_id] = False + if sp.is_byte(token_id): + base_bytes_np[token_id] = 1 + continue + piece = sp.id_to_piece(token_id) + if piece.startswith("▁"): + has_leading_space_np[token_id] = True + piece = piece[1:] + base_bytes_np[token_id] = len(piece.encode("utf-8")) + return ( + torch.tensor(base_bytes_np, dtype=torch.int16, device=device), + torch.tensor(has_leading_space_np, dtype=torch.bool, device=device), + torch.tensor(is_boundary_token_np, dtype=torch.bool, device=device), + ) +def load_validation_tokens(pattern: str, seq_len: int) -> Tensor: + files = [Path(p) for p in sorted(glob.glob(pattern))] + if not files: + raise FileNotFoundError(f"No files found for pattern: {pattern}") + tokens = torch.cat([load_data_shard(file) for file in files]).contiguous() + usable = ((tokens.numel() - 1) // seq_len) * seq_len + if usable <= 0: + raise ValueError(f"Validation split is too short for TRAIN_SEQ_LEN={seq_len}") + return tokens[: usable + 1] +def eval_val( + args: Hyperparameters, + model: nn.Module, + rank: int, + world_size: int, + device: torch.device, + grad_accum_steps: int, + val_tokens: Tensor, + base_bytes_lut: Tensor, + has_leading_space_lut: Tensor, + is_boundary_token_lut: Tensor, + eval_seq_len: int | None = None, +) -> tuple[float, float]: + seq_len = eval_seq_len or args.train_seq_len + local_batch_tokens = args.val_batch_size // (world_size * grad_accum_steps) + if local_batch_tokens < seq_len: + raise ValueError( + "VAL_BATCH_SIZE must provide at least one sequence per rank; " + f"got VAL_BATCH_SIZE={args.val_batch_size}, WORLD_SIZE={world_size}, " + f"GRAD_ACCUM_STEPS={grad_accum_steps}, seq_len={seq_len}" + ) + local_batch_seqs = local_batch_tokens // seq_len + total_seqs = (val_tokens.numel() - 1) // seq_len + seq_start = (total_seqs * rank) // world_size + seq_end = (total_seqs * (rank + 1)) // world_size + val_loss_sum = torch.zeros((), device=device, dtype=torch.float64) + val_token_count = torch.zeros((), device=device, dtype=torch.float64) + val_byte_count = torch.zeros((), device=device, dtype=torch.float64) + model.eval() + with torch.inference_mode(): + for batch_seq_start in range(seq_start, seq_end, local_batch_seqs): + batch_seq_end = min(batch_seq_start + local_batch_seqs, seq_end) + raw_start = batch_seq_start * seq_len + raw_end = batch_seq_end * seq_len + 1 + local = val_tokens[raw_start:raw_end].to(device=device, dtype=torch.int64, non_blocking=True) + x = local[:-1].reshape(-1, seq_len) + y = local[1:].reshape(-1, seq_len) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + batch_loss = model(x, y).detach() + batch_token_count = float(y.numel()) + val_loss_sum += batch_loss.to(torch.float64) * batch_token_count + val_token_count += batch_token_count + prev_ids = x.reshape(-1) + tgt_ids = y.reshape(-1) + token_bytes = base_bytes_lut[tgt_ids].to(dtype=torch.int16) + token_bytes += (has_leading_space_lut[tgt_ids] & ~is_boundary_token_lut[prev_ids]).to(dtype=torch.int16) + val_byte_count += token_bytes.to(torch.float64).sum() + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(val_loss_sum, op=dist.ReduceOp.SUM) + dist.all_reduce(val_token_count, op=dist.ReduceOp.SUM) + dist.all_reduce(val_byte_count, op=dist.ReduceOp.SUM) + val_loss = val_loss_sum / val_token_count + bits_per_token = val_loss.item() / math.log(2.0) + tokens_per_byte = val_token_count.item() / val_byte_count.item() + model.train() + return float(val_loss.item()), float(bits_per_token * tokens_per_byte) +CONTROL_TENSOR_NAME_PATTERNS = tuple( + pattern + for pattern in os.environ.get( + "CONTROL_TENSOR_NAME_PATTERNS", + "attn_scale,attn_scales,mlp_scale,mlp_scales,resid_mix,resid_mixes,q_gain,skip_weight,skip_weights,smear,dtg_gate,ve_layer_scales,ve_shared.scale", + ).split(",") + if pattern +) +INT8_KEEP_FLOAT_FP32_NAME_PATTERNS = tuple( + pattern + for pattern in os.environ.get( + "INT8_KEEP_FLOAT_FP32_NAME_PATTERNS", + ",".join(CONTROL_TENSOR_NAME_PATTERNS), + ).split(",") + if pattern +) +INT8_KEEP_FLOAT_MAX_NUMEL = 65_536 +INT8_KEEP_FLOAT_STORE_DTYPE = torch.float16 +INT8_PER_ROW_SCALE_DTYPE = torch.float16 +INT8_CLIP_PERCENTILE = 99.99984 +INT8_CLIP_Q = INT8_CLIP_PERCENTILE / 100.0 +def tensor_nbytes(t: Tensor) -> int: + return int(t.numel()) * int(t.element_size()) +def keep_float_tensor(name: str, t: Tensor, passthrough_orig_dtypes: dict[str, str]) -> Tensor: + if any(pattern in name for pattern in INT8_KEEP_FLOAT_FP32_NAME_PATTERNS): + return t.float().contiguous() + if t.dtype in {torch.float32, torch.bfloat16}: + passthrough_orig_dtypes[name] = str(t.dtype).removeprefix("torch.") + return t.to(dtype=INT8_KEEP_FLOAT_STORE_DTYPE).contiguous() + return t +def quantize_float_tensor(t: Tensor) -> tuple[Tensor, Tensor]: + t32 = t.float() + if t32.ndim == 2: + clip_abs = ( + torch.quantile(t32.abs(), INT8_CLIP_Q, dim=1) + if t32.numel() + else torch.empty((t32.shape[0],), dtype=torch.float32) + ) + clipped = torch.maximum(torch.minimum(t32, clip_abs[:, None]), -clip_abs[:, None]) + scale = (clip_abs / 127.0).clamp_min(1.0 / 127.0) + q = torch.clamp(torch.round(clipped / scale[:, None]), -127, 127).to(torch.int8).contiguous() + return q, scale.to(dtype=INT8_PER_ROW_SCALE_DTYPE).contiguous() + clip_abs = float(torch.quantile(t32.abs().flatten(), INT8_CLIP_Q).item()) if t32.numel() else 0.0 + scale = torch.tensor(clip_abs / 127.0 if clip_abs > 0 else 1.0, dtype=torch.float32) + q = torch.clamp(torch.round(torch.clamp(t32, -clip_abs, clip_abs) / scale), -127, 127).to(torch.int8).contiguous() + return q, scale +def quantize_state_dict_int8(state_dict: dict[str, Tensor]): + quantized: dict[str, Tensor] = {} + scales: dict[str, Tensor] = {} + dtypes: dict[str, str] = {} + passthrough: dict[str, Tensor] = {} + passthrough_orig_dtypes: dict[str, str] = {} + qmeta: dict[str, dict[str, object]] = {} + stats = dict.fromkeys( + ("param_count", "num_tensors", "num_float_tensors", "num_nonfloat_tensors", "baseline_tensor_bytes", "int8_payload_bytes"), + 0, + ) + for name, tensor in state_dict.items(): + t = tensor.detach().to("cpu").contiguous() + stats["param_count"] += int(t.numel()) + stats["num_tensors"] += 1 + stats["baseline_tensor_bytes"] += tensor_nbytes(t) + if not t.is_floating_point(): + stats["num_nonfloat_tensors"] += 1 + passthrough[name] = t + stats["int8_payload_bytes"] += tensor_nbytes(t) + continue + if t.numel() <= INT8_KEEP_FLOAT_MAX_NUMEL: + kept = keep_float_tensor(name, t, passthrough_orig_dtypes) + passthrough[name] = kept + stats["int8_payload_bytes"] += tensor_nbytes(kept) + continue + stats["num_float_tensors"] += 1 + q, s = quantize_float_tensor(t) + if s.ndim > 0: + qmeta[name] = {"scheme": "per_row", "axis": 0} + quantized[name] = q + scales[name] = s + dtypes[name] = str(t.dtype).removeprefix("torch.") + stats["int8_payload_bytes"] += tensor_nbytes(q) + tensor_nbytes(s) + obj: dict[str, object] = { + "__quant_format__": "int8_clean_per_row_v1", + "quantized": quantized, + "scales": scales, + "dtypes": dtypes, + "passthrough": passthrough, + } + if qmeta: + obj["qmeta"] = qmeta + if passthrough_orig_dtypes: + obj["passthrough_orig_dtypes"] = passthrough_orig_dtypes + return obj, stats +def dequantize_state_dict_int8(obj: dict[str, object]) -> dict[str, Tensor]: + out: dict[str, Tensor] = {} + qmeta = obj.get("qmeta", {}) + passthrough_orig_dtypes = obj.get("passthrough_orig_dtypes", {}) + for name, q in obj["quantized"].items(): + dtype = getattr(torch, obj["dtypes"][name]) + s = obj["scales"][name] + if qmeta.get(name, {}).get("scheme") == "per_row" or s.ndim > 0: + s = s.to(dtype=torch.float32) + out[name] = (q.float() * s.view(q.shape[0], *([1] * (q.ndim - 1)))).to(dtype=dtype).contiguous() + else: + scale = float(s.item()) + out[name] = (q.float() * scale).to(dtype=dtype).contiguous() + for name, t in obj["passthrough"].items(): + out_t = t.detach().to("cpu").contiguous() + orig_dtype = passthrough_orig_dtypes.get(name) + if isinstance(orig_dtype, str): + out_t = out_t.to(dtype=getattr(torch, orig_dtype)).contiguous() + out[name] = out_t + return out +def load_data_shard(file: Path) -> Tensor: + header_bytes = 256 * np.dtype(" None: + self.file_idx = (self.file_idx + 1) % len(self.files) + self.tokens = load_data_shard(self.files[self.file_idx]) + self.pos = 0 + def take(self, n: int) -> Tensor: + chunks: list[Tensor] = [] + remaining = n + while remaining > 0: + avail = self.tokens.numel() - self.pos + if avail <= 0: + self._advance_file() + continue + k = min(remaining, avail) + chunks.append(self.tokens[self.pos : self.pos + k]) + self.pos += k + remaining -= k + return chunks[0] if len(chunks) == 1 else torch.cat(chunks) +class DistributedTokenLoader: + def __init__(self, pattern: str, rank: int, world_size: int, device: torch.device): + self.rank = rank + self.world_size = world_size + self.device = device + self.stream = TokenStream(pattern) + def next_batch(self, global_tokens: int, seq_len: int, grad_accum_steps: int) -> tuple[Tensor, Tensor]: + local_tokens = global_tokens // (self.world_size * grad_accum_steps) + per_rank_span = local_tokens + 1 + chunk = self.stream.take(per_rank_span * self.world_size) + start = self.rank * per_rank_span + local = chunk[start : start + per_rank_span].to(dtype=torch.int64) + x = local[:-1].reshape(-1, seq_len) + y = local[1:].reshape(-1, seq_len) + return x.to(self.device, non_blocking=True), y.to(self.device, non_blocking=True) +class RMSNorm(nn.Module): + def __init__(self, eps: float | None = None): + super().__init__() + self.eps = eps + def forward(self, x: Tensor) -> Tensor: + return F.rms_norm(x, (x.size(-1),), eps=self.eps) +class CastedLinear(nn.Linear): + _qat_enabled: bool = False + def forward(self, x: Tensor) -> Tensor: + w = self.weight.to(x.dtype) + if CastedLinear._qat_enabled and self.training and w.ndim == 2: + with torch.no_grad(): + w32 = self.weight.float() + # Use 99.95th percentile clipping to match GPTQ export quantizer + row_clip = torch.quantile(w32.abs(), 0.9995, dim=1) + scale = (row_clip / 31.0).clamp_min(1.0 / 31.0) + w_q = (torch.clamp(torch.round(w32 / scale[:, None]), -32, 31) * scale[:, None]).to(x.dtype) + w = w + (w_q - w).detach() + bias = self.bias.to(x.dtype) if self.bias is not None else None + return F.linear(x, w, bias) +def restore_low_dim_params_to_fp32(module: nn.Module) -> None: + with torch.no_grad(): + for name, param in module.named_parameters(): + if (param.ndim < 2 or any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS)) and param.dtype != torch.float32: + param.data = param.data.float() +class Rotary(nn.Module): + def __init__(self, dim: int, base: float = 10000.0, train_seq_len: int = 1024, rope_dims: int = 0): + super().__init__() + self.dim = dim + self.base = base + self.train_seq_len = train_seq_len + self.rope_dims = rope_dims if rope_dims > 0 else dim + inv_freq = 1.0 / (base ** (torch.arange(0, self.rope_dims, 2, dtype=torch.float32) / self.rope_dims)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self._seq_len_cached = 0 + self._cos_cached: Tensor | None = None + self._sin_cached: Tensor | None = None + def forward(self, seq_len: int, device: torch.device, dtype: torch.dtype) -> tuple[Tensor, Tensor]: + if ( + self._cos_cached is None + or self._sin_cached is None + or self._seq_len_cached != seq_len + or self._cos_cached.device != device + ): + rd = self.rope_dims + if seq_len > self.train_seq_len: + scale = seq_len / self.train_seq_len + new_base = self.base * (scale ** (rd / (rd - 2))) + inv_freq = 1.0 / (new_base ** (torch.arange(0, rd, 2, dtype=torch.float32, device=device) / rd)) + else: + inv_freq = self.inv_freq.to(device) + t = torch.arange(seq_len, device=device, dtype=inv_freq.dtype) + freqs = torch.outer(t, inv_freq) + self._cos_cached = freqs.cos()[None, :, None, :] + self._sin_cached = freqs.sin()[None, :, None, :] + self._seq_len_cached = seq_len + return self._cos_cached.to(dtype=dtype), self._sin_cached.to(dtype=dtype) +def apply_rotary_emb(x: Tensor, cos: Tensor, sin: Tensor, rope_dims: int = 0) -> Tensor: + if rope_dims > 0 and rope_dims < x.size(-1): + x_rope, x_pass = x[..., :rope_dims], x[..., rope_dims:] + half = rope_dims // 2 + x1, x2 = x_rope[..., :half], x_rope[..., half:] + x_rope = torch.cat((x1 * cos + x2 * sin, x1 * (-sin) + x2 * cos), dim=-1) + return torch.cat((x_rope, x_pass), dim=-1) + half = x.size(-1) // 2 + x1, x2 = x[..., :half], x[..., half:] + return torch.cat((x1 * cos + x2 * sin, x1 * (-sin) + x2 * cos), dim=-1) +class CausalSelfAttention(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + num_kv_heads: int, + rope_base: float, + qk_gain_init: float, + ): + super().__init__() + if dim % num_heads != 0: + raise ValueError("model_dim must be divisible by num_heads") + if num_heads % num_kv_heads != 0: + raise ValueError("num_heads must be divisible by num_kv_heads") + self.num_heads = num_heads + self.num_kv_heads = num_kv_heads + self.head_dim = dim // num_heads + if self.head_dim % 2 != 0: + raise ValueError("head_dim must be even for RoPE") + kv_dim = self.num_kv_heads * self.head_dim + self.c_q = CastedLinear(dim, dim, bias=False) + self.c_k = CastedLinear(dim, kv_dim, bias=False) + self.c_v = CastedLinear(dim, kv_dim, bias=False) + self.proj = CastedLinear(dim, dim, bias=False) + self.proj._zero_init = True + self.q_gain = nn.Parameter(torch.full((num_heads,), qk_gain_init, dtype=torch.float32)) + self.rope_dims = 0 # set by GPT.__init__ for partial RoPE + self.rotary = Rotary(self.head_dim, base=rope_base, train_seq_len=1024) + self.use_xsa = False # set by GPT.__init__ for deep layers only + def _xsa_efficient(self, y: Tensor, v: Tensor) -> Tensor: + """Efficient XSA: subtract self-value projection via GQA-aware reshape (no repeat_interleave). + y: [B, T, H, D], v: [B, T, Hkv, D]. H must be divisible by Hkv.""" + B, T, H, D = y.shape + Hkv = v.size(-2) + group = H // Hkv + y_g = y.reshape(B, T, Hkv, group, D) # [B, T, Hkv, group, D] + vn = F.normalize(v, dim=-1).unsqueeze(-2) # [B, T, Hkv, 1, D] — broadcast ready + proj = (y_g * vn).sum(dim=-1, keepdim=True) * vn + return (y_g - proj).reshape(B, T, H, D) + def forward(self, x: Tensor, v_embed: Tensor | None = None) -> Tensor: + bsz, seqlen, dim = x.shape + q = self.c_q(x).reshape(bsz, seqlen, self.num_heads, self.head_dim) + k = self.c_k(x).reshape(bsz, seqlen, self.num_kv_heads, self.head_dim) + v = self.c_v(x) + if v_embed is not None: + v = v + v_embed + v = v.reshape(bsz, seqlen, self.num_kv_heads, self.head_dim) + q = F.rms_norm(q, (q.size(-1),)) + k = F.rms_norm(k, (k.size(-1),)) + cos, sin = self.rotary(seqlen, x.device, q.dtype) + q = apply_rotary_emb(q, cos, sin, self.rope_dims) + k = apply_rotary_emb(k, cos, sin, self.rope_dims) + q = q * self.q_gain.to(dtype=q.dtype)[None, None, :, None] + y = flash_attn_3_func(q, k, v, causal=True) + if self.use_xsa: + y = self._xsa_efficient(y, v) + y = y.reshape(bsz, seqlen, dim) + return self.proj(y) +class SmearGate(nn.Module): + def __init__(self, dim: int): + super().__init__() + self.gate = nn.Parameter(torch.zeros(dim, dtype=torch.float32)) + def forward(self, x: Tensor) -> Tensor: + g = torch.sigmoid(self.gate.to(dtype=x.dtype))[None, None, :] + x_prev = torch.cat([torch.zeros_like(x[:, :1]), x[:, :-1]], dim=1) + return (1 - g) * x + g * x_prev +class BigramHashEmbedding(nn.Module): + def __init__(self, bigram_vocab_size: int, bigram_dim: int, model_dim: int): + super().__init__() + self.bigram_vocab_size = bigram_vocab_size + self.embed = nn.Embedding(bigram_vocab_size, bigram_dim) + nn.init.zeros_(self.embed.weight) + self.proj = CastedLinear(bigram_dim, model_dim, bias=False) if bigram_dim != model_dim else None + if self.proj is not None: + nn.init.zeros_(self.proj.weight) + self.scale = nn.Parameter(torch.tensor(0.05, dtype=torch.float32)) + def bigram_hash(self, tokens: Tensor) -> Tensor: + t = tokens.to(torch.int32) + mod = self.bigram_vocab_size - 1 + out = torch.empty_like(t) + out[..., 0] = mod + out[..., 1:] = torch.bitwise_xor(36313 * t[..., 1:], 27191 * t[..., :-1]) % mod + return out.long() + def forward(self, token_ids: Tensor) -> Tensor: + h = self.embed(self.bigram_hash(token_ids)) + if self.proj is not None: + h = self.proj(h) + return h * self.scale.to(dtype=h.dtype) +class ValueEmbedding(nn.Module): + """Reinject token identity into attention values at specific layers. + Each table maps vocab tokens to a low-dim embedding, projected to model_dim.""" + def __init__(self, vocab_size: int, ve_dim: int, model_dim: int): + super().__init__() + self.embed = nn.Embedding(vocab_size, ve_dim) + nn.init.normal_(self.embed.weight, std=0.01) + self.proj = CastedLinear(ve_dim, model_dim, bias=False) if ve_dim != model_dim else None + if self.proj is not None: + nn.init.zeros_(self.proj.weight) + self.scale = nn.Parameter(torch.tensor(0.1, dtype=torch.float32)) + def forward(self, token_ids: Tensor) -> Tensor: + h = self.embed(token_ids) + if self.proj is not None: + h = self.proj(h) + return h * self.scale.to(dtype=h.dtype) +class MLP(nn.Module): + def __init__(self, dim: int, mlp_mult: int, mlp_act: str = "relu_sq", mlp_leaky_slope: float = 0.5): + super().__init__() + hidden = int(mlp_mult * dim) + self.fc = CastedLinear(dim, hidden, bias=False) + self.proj = CastedLinear(hidden, dim, bias=False) + self.proj._zero_init = True + self.mlp_act = mlp_act + self.mlp_leaky_slope = mlp_leaky_slope + if self.mlp_act not in {"relu_sq", "leaky_relu_sq"}: + raise ValueError(f"Unsupported MLP_ACT '{self.mlp_act}'. Use 'relu_sq' or 'leaky_relu_sq'.") + def forward(self, x: Tensor) -> Tensor: + x = self.fc(x) + if self.mlp_act == "leaky_relu_sq": + x = F.leaky_relu(x, negative_slope=self.mlp_leaky_slope) + else: + x = F.relu(x) + return self.proj(x.square()) +class Block(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + num_kv_heads: int, + mlp_mult: int, + rope_base: float, + qk_gain_init: float, + layer_idx: int = 0, + ln_scale: bool = False, + dtg: bool = False, + mlp_act: str = "relu_sq", + mlp_leaky_slope: float = 0.5, + ): + super().__init__() + self.attn_norm = RMSNorm() + self.mlp_norm = RMSNorm() + self.attn = CausalSelfAttention(dim, num_heads, num_kv_heads, rope_base, qk_gain_init) + self.mlp = MLP(dim, mlp_mult, mlp_act=mlp_act, mlp_leaky_slope=mlp_leaky_slope) + self.attn_scale = nn.Parameter(torch.ones(dim, dtype=torch.float32)) + self.mlp_scale = nn.Parameter(torch.ones(dim, dtype=torch.float32)) + self.resid_mix = nn.Parameter(torch.stack((torch.ones(dim), torch.zeros(dim))).float()) + self.ln_scale_factor = 1.0 / math.sqrt(layer_idx + 1) if ln_scale else 1.0 + if dtg: + self.dtg_gate = nn.Linear(dim, 1, bias=True) + nn.init.zeros_(self.dtg_gate.weight) + nn.init.constant_(self.dtg_gate.bias, 2.0) + else: + self.dtg_gate = None + def forward(self, x: Tensor, x0: Tensor, v_embed: Tensor | None = None) -> Tensor: + mix = self.resid_mix.to(dtype=x.dtype) + x_in = mix[0][None, None, :] * x + mix[1][None, None, :] * x0 + attn_out = self.attn(self.attn_norm(x_in) * self.ln_scale_factor, v_embed=v_embed) + x_out = x_in + self.attn_scale.to(dtype=x_in.dtype)[None, None, :] * attn_out + x_out = x_out + self.mlp_scale.to(dtype=x_out.dtype)[None, None, :] * self.mlp(self.mlp_norm(x_out) * self.ln_scale_factor) + if self.dtg_gate is not None: + gate = torch.sigmoid(self.dtg_gate(x_in.detach())) + x_out = x_in + gate * (x_out - x_in) + return x_out +class GPT(nn.Module): + def __init__( + self, + vocab_size: int, + num_layers: int, + model_dim: int, + num_heads: int, + num_kv_heads: int, + mlp_mult: int, + tie_embeddings: bool, + tied_embed_init_std: float, + logit_softcap: float, + rope_base: float, + qk_gain_init: float, + mtp_num_heads: int = 0, + mtp_loss_weight: float = 0.1, + bigram_vocab_size: int = 0, + bigram_dim: int = 128, + xsa_last_n: int = 0, + rope_dims: int = 0, + ln_scale: bool = False, + dtg: bool = False, + ve_enabled: bool = False, + ve_dim: int = 128, + ve_layers: str = "9,10", + mlp_act: str = "relu_sq", + mlp_leaky_slope: float = 0.5, + f1_corr_rank: int = 0, + f1_corr_scale_init: float = 0.10, + ): + super().__init__() + self._ve_target_dim = num_kv_heads * (model_dim // num_heads) # kv_dim for value projection + if logit_softcap <= 0.0: + raise ValueError(f"logit_softcap must be positive, got {logit_softcap}") + self.tie_embeddings = tie_embeddings + self.tied_embed_init_std = tied_embed_init_std + self.logit_softcap = logit_softcap + self.mtp_num_heads = mtp_num_heads + self.mtp_loss_weight = mtp_loss_weight + self.tok_emb = nn.Embedding(vocab_size, model_dim) + self.bigram = BigramHashEmbedding(bigram_vocab_size, bigram_dim, model_dim) if bigram_vocab_size > 0 else None + self.smear = SmearGate(model_dim) + self.num_encoder_layers = num_layers // 2 + self.num_decoder_layers = num_layers - self.num_encoder_layers + self.num_skip_weights = min(self.num_encoder_layers, self.num_decoder_layers) + self.skip_weights = nn.Parameter(torch.ones(self.num_skip_weights, model_dim, dtype=torch.float32)) + self.blocks = nn.ModuleList( + [ + Block( + model_dim, + num_heads, + num_kv_heads, + mlp_mult, + rope_base, + qk_gain_init, + layer_idx=i, + ln_scale=ln_scale, + dtg=dtg, + mlp_act=mlp_act, + mlp_leaky_slope=mlp_leaky_slope, + ) + for i in range(num_layers) + ] + ) + if rope_dims > 0: + head_dim = model_dim // num_heads + for block in self.blocks: + block.attn.rope_dims = rope_dims + block.attn.rotary = Rotary(head_dim, base=rope_base, train_seq_len=1024, rope_dims=rope_dims) + self.ve_layer_indices = [int(x) for x in ve_layers.split(",") if x.strip()] if ve_enabled else [] + kv_dim = self._ve_target_dim + if self.ve_layer_indices: + self.ve_shared = ValueEmbedding(vocab_size, ve_dim, kv_dim) + self.ve_layer_scales = nn.ParameterList( + [nn.Parameter(torch.ones(1, dtype=torch.float32)) for _ in self.ve_layer_indices] + ) + else: + self.ve_shared = None + self.ve_layer_scales = nn.ParameterList() + self.value_embeds = nn.ModuleList() # keep empty for compat + self.final_norm = RMSNorm() + self.lm_head = None if tie_embeddings else CastedLinear(model_dim, vocab_size, bias=False) + if self.lm_head is not None: + self.lm_head._zero_init = True + self.mtp_heads = nn.ModuleList( + [CastedLinear(model_dim, vocab_size, bias=False) for _ in range(mtp_num_heads)] + ) + for head in self.mtp_heads: + head._zero_init = True + # Low-rank correction path for extra capacity under size budget. + self.f1_corr_rank = f1_corr_rank + if f1_corr_rank > 0: + self.f1_corr_in = CastedLinear(model_dim, f1_corr_rank, bias=False) + self.f1_corr_out = CastedLinear(f1_corr_rank, vocab_size, bias=False) + self.f1_corr_out._zero_init = True + self.f1_corr_scale = nn.Parameter(torch.tensor(f1_corr_scale_init, dtype=torch.float32)) + else: + self.f1_corr_in = None + self.f1_corr_out = None + self.f1_corr_scale = None + if xsa_last_n > 0: + for i in range(max(0, num_layers - xsa_last_n), num_layers): + self.blocks[i].attn.use_xsa = True + self._init_weights() + def _init_weights(self) -> None: + if self.tie_embeddings: + nn.init.normal_(self.tok_emb.weight, mean=0.0, std=self.tied_embed_init_std) + num_layers = len(self.blocks) + for name, module in self.named_modules(): + if isinstance(module, nn.Linear): + if getattr(module, "_zero_init", False): + nn.init.zeros_(module.weight) + elif module.weight.ndim == 2 and module.weight.shape[0] >= 64 and module.weight.shape[1] >= 64: + nn.init.orthogonal_(module.weight, gain=1.0) + if ".proj." in name or name.endswith(".proj"): + with torch.no_grad(): + module.weight.mul_(1.0 / math.sqrt(2 * num_layers)) + def _get_ve(self, layer_idx: int, input_ids: Tensor, ve_cache: dict | None = None) -> Tensor | None: + """Get value embedding for a specific layer using shared table + per-layer scale.""" + if self.ve_shared is None or layer_idx not in self.ve_layer_indices: + return None + if ve_cache is not None and 've' not in ve_cache: + ve_cache['ve'] = self.ve_shared(input_ids) + ve_base = ve_cache['ve'] if ve_cache is not None else self.ve_shared(input_ids) + ve_idx = self.ve_layer_indices.index(layer_idx) + return ve_base * self.ve_layer_scales[ve_idx].to(dtype=ve_base.dtype) + def forward(self, input_ids: Tensor, target_ids: Tensor) -> Tensor: + x = self.tok_emb(input_ids) + if self.bigram is not None: + x = x + self.bigram(input_ids) + x = F.rms_norm(x, (x.size(-1),)) + x = self.smear(x) + x0 = x + skips: list[Tensor] = [] + ve_cache: dict = {} + for i in range(self.num_encoder_layers): + ve = self._get_ve(i, input_ids, ve_cache) + x = self.blocks[i](x, x0, v_embed=ve) + skips.append(x) + for i in range(self.num_decoder_layers): + bi = self.num_encoder_layers + i + if skips: + x = x + self.skip_weights[i].to(dtype=x.dtype)[None, None, :] * skips.pop() + ve = self._get_ve(bi, input_ids, ve_cache) + x = self.blocks[bi](x, x0, v_embed=ve) + x = self.final_norm(x) + x_flat = x.reshape(-1, x.size(-1)) + targets = target_ids.reshape(-1) + if self.tie_embeddings: + logits_proj = F.linear(x_flat, self.tok_emb.weight) + else: + if self.lm_head is None: + raise RuntimeError("lm_head is required when tie_embeddings=False") + logits_proj = self.lm_head(x_flat) + if self.f1_corr_in is not None and self.f1_corr_out is not None and self.f1_corr_scale is not None: + corr_hidden = F.silu(self.f1_corr_in(x_flat)) + corr_proj = self.f1_corr_out(corr_hidden) + logits_proj = logits_proj + self.f1_corr_scale.to(dtype=logits_proj.dtype) * corr_proj + logits = self.logit_softcap * torch.tanh(logits_proj / self.logit_softcap) + if hasattr(self, '_ngram_tracker') and self._ngram_tracker is not None and self.training: + per_tok_loss = F.cross_entropy(logits.float(), targets, reduction="none") + weights = self._ngram_tracker.get_weights(input_ids, target_ids) + main_loss = (per_tok_loss * weights).mean() + else: + main_loss = F.cross_entropy(logits.float(), targets, reduction="mean") + if self.training and self.mtp_num_heads > 0 and self.mtp_loss_weight > 0.0: + _, seqlen, dim = x.shape + mtp_loss_sum = x.new_zeros(()) + mtp_loss_count = 0 + for k, mtp_head in enumerate(self.mtp_heads): + valid_t = seqlen - (k + 1) + if valid_t <= 0: + continue + mtp_hidden = x[:, :valid_t, :].reshape(-1, dim) + mtp_targets = target_ids[:, k + 1 :].reshape(-1) + mtp_logits_proj = mtp_head(mtp_hidden) + mtp_logits = self.logit_softcap * torch.tanh(mtp_logits_proj / self.logit_softcap) + mtp_loss_sum = mtp_loss_sum + F.cross_entropy(mtp_logits.float(), mtp_targets, reduction="mean") + mtp_loss_count += 1 + if mtp_loss_count > 0: + main_loss = main_loss + self.mtp_loss_weight * (mtp_loss_sum / mtp_loss_count) + return main_loss + def forward_logits(self, input_ids: Tensor) -> Tensor: + """Return logits (bsz, seq_len, vocab) without computing loss.""" + x = self.tok_emb(input_ids) + if self.bigram is not None: + x = x + self.bigram(input_ids) + x = F.rms_norm(x, (x.size(-1),)) + x = self.smear(x) + x0 = x + skips: list[Tensor] = [] + ve_cache: dict = {} + for i in range(self.num_encoder_layers): + ve = self._get_ve(i, input_ids, ve_cache) + x = self.blocks[i](x, x0, v_embed=ve) + skips.append(x) + for i in range(self.num_decoder_layers): + bi = self.num_encoder_layers + i + if skips: + x = x + self.skip_weights[i].to(dtype=x.dtype)[None, None, :] * skips.pop() + ve = self._get_ve(bi, input_ids, ve_cache) + x = self.blocks[bi](x, x0, v_embed=ve) + x = self.final_norm(x) + if self.tie_embeddings: + logits_proj = F.linear(x, self.tok_emb.weight) + else: + logits_proj = self.lm_head(x) + if self.f1_corr_in is not None and self.f1_corr_out is not None and self.f1_corr_scale is not None: + corr_hidden = F.silu(self.f1_corr_in(x)) + corr_proj = self.f1_corr_out(corr_hidden) + logits_proj = logits_proj + self.f1_corr_scale.to(dtype=logits_proj.dtype) * corr_proj + return self.logit_softcap * torch.tanh(logits_proj / self.logit_softcap) +def eval_val_sliding( + args: Hyperparameters, + base_model: nn.Module, + rank: int, + world_size: int, + device: torch.device, + val_tokens: Tensor, + base_bytes_lut: Tensor, + has_leading_space_lut: Tensor, + is_boundary_token_lut: Tensor, + stride: int, + batch_seqs: int = 128, + eval_seq_len: int | None = None, +) -> tuple[float, float]: + """Sliding window evaluation: each token scored with maximum context.""" + seq_len = eval_seq_len or args.train_seq_len + total_tokens = val_tokens.numel() - 1 + window_starts = [ws for ws in range(0, total_tokens, stride) + if min(ws + seq_len, total_tokens) - ws >= 1] + total_windows = len(window_starts) + my_s = (total_windows * rank) // world_size + my_e = (total_windows * (rank + 1)) // world_size + my_windows = window_starts[my_s:my_e] + loss_sum = torch.zeros((), device=device, dtype=torch.float64) + token_count = torch.zeros((), device=device, dtype=torch.float64) + byte_count = torch.zeros((), device=device, dtype=torch.float64) + base_model.eval() + compiled_logits = maybe_torch_compile(base_model.forward_logits, args) + with torch.inference_mode(): + for bi in range(0, len(my_windows), batch_seqs): + batch_ws = my_windows[bi:bi + batch_seqs] + bsz = len(batch_ws) + x_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + y_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + wlens: list[int] = [] + for i, ws in enumerate(batch_ws): + end = min(ws + seq_len, total_tokens) + wlen = end - ws + wlens.append(wlen) + chunk = val_tokens[ws:end + 1].to(dtype=torch.int64, device=device) + x_batch[i, :wlen] = chunk[:-1] + y_batch[i, :wlen] = chunk[1:] + with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + logits = compiled_logits(x_batch) + nll = F.cross_entropy( + logits.reshape(-1, logits.size(-1)).float(), + y_batch.reshape(-1), + reduction="none", + ).reshape(bsz, seq_len) + for i, ws in enumerate(batch_ws): + wlen = wlens[i] + s = 0 if ws == 0 else max(wlen - stride, 0) + scored_nll = nll[i, s:wlen].to(torch.float64) + loss_sum += scored_nll.sum() + token_count += float(wlen - s) + tgt = y_batch[i, s:wlen] + prev = x_batch[i, s:wlen] + tb = base_bytes_lut[tgt].to(torch.float64) + tb += (has_leading_space_lut[tgt] & ~is_boundary_token_lut[prev]).to(torch.float64) + byte_count += tb.sum() + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(loss_sum, op=dist.ReduceOp.SUM) + dist.all_reduce(token_count, op=dist.ReduceOp.SUM) + dist.all_reduce(byte_count, op=dist.ReduceOp.SUM) + val_loss = (loss_sum / token_count).item() + bits_per_token = val_loss / math.log(2.0) + tokens_per_byte = token_count.item() / byte_count.item() + base_model.train() + return val_loss, bits_per_token * tokens_per_byte +def _ngram_bulk_update(val_np, start, end, ctx_tables, full_tables, + min_order, max_order, primes, mask): + """Bulk update n-gram tables with a contiguous range of tokens. + All ranks call this with the SAME token range -> identical tables everywhere.""" + t = val_np[start:end].astype(np.uint64) + n = len(t) + for order in range(min_order, max_order + 1): + if n < order: + continue + ctx_width = order - 1 + ctx_hash = np.zeros(n - order + 1, dtype=np.uint64) + for k in range(ctx_width): + ctx_hash ^= t[k:n - order + 1 + k] * primes[k % len(primes)] + ctx_key = (ctx_hash & mask).astype(np.int64) + tgt = t[order - 1:] + full_key = ((ctx_hash ^ (tgt * primes[ctx_width % len(primes)])) & mask).astype(np.int64) + ctx_tables[order] += np.bincount(ctx_key, minlength=len(ctx_tables[order])).astype(np.uint32) + full_tables[order] += np.bincount(full_key, minlength=len(full_tables[order])).astype(np.uint32) + +def eval_val_sliding_hashed_ngram( + args: Hyperparameters, + base_model: nn.Module, + rank: int, + world_size: int, + device: torch.device, + val_tokens: Tensor, + base_bytes_lut: Tensor, + has_leading_space_lut: Tensor, + is_boundary_token_lut: Tensor, + stride: int, + order: int, + alpha: float, + min_count: int, + buckets: int, + max_seconds: float = 0.0, + batch_seqs: int = 128, + eval_seq_len: int | None = None, +) -> tuple[float, float, float]: + """Score-first sliding eval with chunk-based SHARED n-gram tables + cubric. + + Key design: all ranks share identical n-gram tables via bulk chunk updates. + Each chunk's windows are distributed across ranks for scoring, then ALL ranks + update tables with the same contiguous token range. Every rank sees the full + n-gram picture (not 1/world_size like per-segment updates). + + Legal: entire chunk scored before its tokens update the tables. + """ + min_order = max(args.ngram_eval_min_order, 2) + max_order = max(order, min_order) + adaptive = args.ngram_eval_adaptive + alpha_min = args.ngram_eval_alpha_min + alpha_max = args.ngram_eval_alpha_max + ent_center = args.ngram_eval_entropy_center + ent_scale = args.ngram_eval_entropy_scale + + # Parse fixed per-order multipliers (PR #809 style) + _fixed_order_mults = None + if args.ngram_order_mults_str: + _fixed_order_mults = np.array([float(x) for x in args.ngram_order_mults_str.split(",")], dtype=np.float64) + + seq_len = eval_seq_len or args.train_seq_len + total_tokens = val_tokens.numel() - 1 + + # Build all windows and total scored tokens + all_window_starts = [ws for ws in range(0, total_tokens, stride) if min(ws + seq_len, total_tokens) - ws >= 1] + total_scored_tokens = 0.0 + for ws in all_window_starts: + end = min(ws + seq_len, total_tokens) + wlen = end - ws + s = 0 if ws == 0 else max(wlen - stride, 0) + total_scored_tokens += float(max(wlen - s, 0)) + + # Group windows into chunks by scored position -- all ranks share this grouping + chunk_tokens = int(os.environ.get("NGRAM_CHUNK_TOKENS", "1048576")) # 1M default + num_chunks = (total_tokens + chunk_tokens - 1) // chunk_tokens + chunk_windows: list[list[int]] = [[] for _ in range(num_chunks)] + for ws in all_window_starts: + end = min(ws + seq_len, total_tokens) + wlen = end - ws + s = 0 if ws == 0 else max(wlen - stride, 0) + scored_start = ws + s + ci = min(scored_start // chunk_tokens, num_chunks - 1) + chunk_windows[ci].append(ws) + + val_np = val_tokens.numpy() + ctx_tables = {n: np.zeros((buckets,), dtype=np.uint32) for n in range(min_order, max_order + 1)} + full_tables = {n: np.zeros((buckets,), dtype=np.uint32) for n in range(min_order, max_order + 1)} + mask = np.uint64(buckets - 1) + primes = np.array( + [np.uint64(36313), np.uint64(27191), np.uint64(51647), np.uint64(81929), + np.uint64(131071), np.uint64(174763), np.uint64(233017)], + dtype=np.uint64, + ) + + loss_sum = 0.0 + token_count = 0.0 + byte_count = 0.0 + + # Cubric 3D: per (order × entropy_bin × count_bin) adaptive alpha scaling + _NUM_ENT_BINS = 3 # low / mid / high entropy + _NUM_CNT_BINS = 3 # low / mid / high count + _ENT_EDGES = np.array([ent_center - 1.0, ent_center + 1.0]) # [2.0, 4.0] for center=3.0 + _CNT_EDGES = np.array([5.0, 50.0]) # low=<5, mid=5-50, high=>50 context count + _TOTAL_CELLS = _NUM_ENT_BINS * _NUM_CNT_BINS # 9 cells per order = 54 total + _cc = getattr(args, 'cubric_cadence', 0); _con = _cc > 0; _cfired = 0 + if _con: + # Warm-start: proven converged values from 4+ runs (orders 2-7) + # All 9 cells per order get the same warm-start, 3D cubric refines from there + _WARM = {2: 0.45, 3: 0.30, 4: 0.45, 5: 1.88, 6: 2.00, 7: 2.00, 8: 2.00, 9: 2.00} + _c_alpha_mult = {n: [_WARM.get(n, 1.0)] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + _c_hits = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + _c_beats = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + + base_model.eval() + compiled_logits = maybe_torch_compile(base_model.forward_logits, args) + t0 = time.perf_counter() + deadline = (t0 + max_seconds) if max_seconds > 0.0 else None + cutoff_hit = False + + if rank == 0: + print(f"ngram_eval:chunks={num_chunks} chunk_tokens={chunk_tokens} " + f"windows={len(all_window_starts)} shared_tables=True", flush=True) + + with torch.inference_mode(): + for ci in range(num_chunks): + if deadline is not None and time.perf_counter() >= deadline: + cutoff_hit = True + break + + windows = chunk_windows[ci] + if not windows: + continue + + # Distribute this chunk's windows across ranks + my_s = (len(windows) * rank) // world_size + my_e = (len(windows) * (rank + 1)) // world_size + my_windows = windows[my_s:my_e] + + # --- Phase 1: SCORE this chunk's windows --- + for bi in range(0, len(my_windows), batch_seqs): + batch_ws = my_windows[bi:bi + batch_seqs] + bsz = len(batch_ws) + x_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + y_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + wlens: list[int] = [] + for i, ws in enumerate(batch_ws): + end = min(ws + seq_len, total_tokens) + wlen = end - ws + wlens.append(wlen) + chunk = val_tokens[ws:end + 1].to(dtype=torch.int64, device=device) + x_batch[i, :wlen] = chunk[:-1] + y_batch[i, :wlen] = chunk[1:] + + with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + logits = compiled_logits(x_batch) + logits_f = logits.float() + nll = F.cross_entropy( + logits_f.reshape(-1, logits_f.size(-1)), + y_batch.reshape(-1), + reduction="none", + ).reshape(bsz, seq_len) + + for i, ws in enumerate(batch_ws): + wlen = wlens[i] + s = 0 if ws == 0 else max(wlen - stride, 0) + seg_len = wlen - s + if seg_len <= 0: + continue + + seg_nll = nll[i, s:wlen].to(torch.float64).cpu().numpy() + seg_model_p = np.exp(-seg_nll) + + if adaptive: + log_probs = F.log_softmax(logits_f[i, s:wlen], dim=-1) + probs_a = log_probs.exp() + entropy = -(probs_a * log_probs).sum(dim=-1).cpu().numpy() + sig = 1.0 / (1.0 + np.exp(-ent_scale * (entropy - ent_center))) + per_token_alpha = alpha_min + (alpha_max - alpha_min) * sig + # Bin entropy for 2D cubric: 0=low, 1=mid, 2=high + _ent_bins = np.digitize(entropy, _ENT_EDGES).astype(np.int32) + else: + per_token_alpha = np.full(seg_len, alpha) + _ent_bins = np.ones(seg_len, dtype=np.int32) # all mid + + global_j = np.arange(ws + s + 1, ws + wlen + 1, dtype=np.int64) + p_ng = np.zeros(seg_len, dtype=np.float64) + ng_matched = np.zeros(seg_len, dtype=np.bool_) + _ng_ord = np.zeros(seg_len, dtype=np.int32) + _ng_ctx_count = np.zeros(seg_len, dtype=np.float64) + tgt_np = val_np[global_j].astype(np.uint64) + + for n in range(max_order, min_order - 1, -1): + ctx_width = n - 1 + valid = (global_j >= ctx_width) & (~ng_matched) + if not valid.any(): + continue + v_idx = np.nonzero(valid)[0] + jv = global_j[v_idx] + ctx_hash = np.zeros(len(jv), dtype=np.uint64) + for k in range(ctx_width): + tok = val_np[jv - (ctx_width - k)].astype(np.uint64) + ctx_hash ^= tok * primes[k % len(primes)] + ctx_key = (ctx_hash & mask).astype(np.int64) + full_key = ((ctx_hash ^ (tgt_np[v_idx] * primes[ctx_width % len(primes)])) & mask).astype(np.int64) + ctx_counts = ctx_tables[n][ctx_key].astype(np.float64) + full_counts = full_tables[n][full_key].astype(np.float64) + has_data = ctx_counts >= float(min_count) + if has_data.any(): + p = np.minimum(full_counts, ctx_counts) / np.maximum(ctx_counts, 1.0) + p = np.clip(p, 0.0, 1.0) + hit_idx = v_idx[has_data] + p_ng[hit_idx] = p[has_data] + ng_matched[hit_idx] = True + _ng_ord[hit_idx] = n + _ng_ctx_count[hit_idx] = ctx_counts[has_data] + + # Mix where n-gram matched (PR #809 style or cubric 3D fallback) + if ng_matched.any(): + m_idx = np.nonzero(ng_matched)[0] + # Per-order entropy center shift (PR #809) + if adaptive and args.ngram_entropy_shift: + matched_ords = _ng_ord[m_idx].astype(np.float64) + shifted_centers = ent_center - 0.25 * (matched_ords - float(min_order)) + shifted_sig = 1.0 / (1.0 + np.exp(-ent_scale * (entropy[m_idx] - shifted_centers))) + per_token_alpha[m_idx] = alpha_min + (alpha_max - alpha_min) * shifted_sig + if _fixed_order_mults is not None: + # PR #809 fixed order multipliers (replaces cubric) + a = per_token_alpha[m_idx].copy() + mult_indices = _ng_ord[m_idx] - min_order + mult_indices = np.clip(mult_indices, 0, len(_fixed_order_mults) - 1) + a *= _fixed_order_mults[mult_indices] + np.clip(a, 0.0, 0.95, out=a) + elif _con: + a = per_token_alpha[m_idx].copy() + m_ent_bins = _ent_bins[m_idx] + m_cnt_bins = np.digitize(_ng_ctx_count[m_idx], _CNT_EDGES).astype(np.int32) + for n in range(min_order, max_order + 1): + om = _ng_ord[m_idx] == n + if not om.any(): + continue + for eb in range(_NUM_ENT_BINS): + for cb in range(_NUM_CNT_BINS): + cell = eb * _NUM_CNT_BINS + cb + mask_ecb = om & (m_ent_bins == eb) & (m_cnt_bins == cb) + if mask_ecb.any(): + _c_hits[n][cell] += int(mask_ecb.sum()) + _c_beats[n][cell] += int((p_ng[m_idx[mask_ecb]] > seg_model_p[m_idx[mask_ecb]]).sum()) + a[mask_ecb] *= _c_alpha_mult[n][cell] + np.clip(a, 0.0, 0.95, out=a) + else: + a = per_token_alpha[m_idx] + seg_model_p[m_idx] = (1.0 - a) * seg_model_p[m_idx] + a * p_ng[m_idx] + + seg_nll = -np.log(np.clip(seg_model_p, 1e-12, 1.0)) + loss_sum += float(seg_nll.sum()) + token_count += float(seg_len) + tgt = y_batch[i, s:wlen] + prev = x_batch[i, s:wlen] + tb = base_bytes_lut[tgt].to(torch.float64) + tb += (has_leading_space_lut[tgt] & ~is_boundary_token_lut[prev]).to(torch.float64) + byte_count += float(tb.sum().item()) + + # --- Phase 2: SHARED UPDATE -- all ranks update with same chunk tokens --- + chunk_start = ci * chunk_tokens + chunk_end = min((ci + 1) * chunk_tokens, total_tokens) + _ngram_bulk_update(val_np, chunk_start, chunk_end + 1, + ctx_tables, full_tables, min_order, max_order, + primes, mask) + + # Cubric 2D c-step: adapt per (order × entropy_bin) + if _con: + # Collect all (order, ent_bin, cnt_bin) cells with enough data + all_rates = [] + for n in range(min_order, max_order + 1): + for cell in range(_TOTAL_CELLS): + if _c_hits[n][cell] >= 8: + all_rates.append(_c_beats[n][cell] / _c_hits[n][cell]) + if len(all_rates) >= 4: + avg_rate = sum(all_rates) / len(all_rates) + for n in range(min_order, max_order + 1): + for cell in range(_TOTAL_CELLS): + if _c_hits[n][cell] >= 8: + rate = _c_beats[n][cell] / _c_hits[n][cell] + if rate > avg_rate + 0.05: + _c_alpha_mult[n][cell] = min(_c_alpha_mult[n][cell] * 1.03, 2.0) + elif rate < avg_rate - 0.05: + _c_alpha_mult[n][cell] = max(_c_alpha_mult[n][cell] * 0.97, 0.3) + _cfired += 1 + if rank == 0 and _cfired % 8 == 0: + parts = [] + for n in range(min_order, max_order + 1): + m = _c_alpha_mult[n] + avg_m = sum(m) / len(m) + parts.append(f"o{n}:avg={avg_m:.2f}") + print(f"cubric3d:step={_cfired} {' '.join(parts)}", flush=True) + _c_hits = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + _c_beats = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + + # Progress + if rank == 0 and (ci % 10 == 0 or ci == num_chunks - 1 or ci < 3): + elapsed = time.perf_counter() - t0 + cur_bpb = (loss_sum / max(token_count, 1.0)) / math.log(2.0) * (token_count / max(byte_count, 1.0)) if token_count > 0 else 0.0 + print( + f"ngram_eval:chunk [{ci+1}/{num_chunks}] bpb={cur_bpb:.6f} t={elapsed:.0f}s", + flush=True, + ) + + # All-reduce across ranks + _loss = torch.tensor(loss_sum, device=device, dtype=torch.float64) + _toks = torch.tensor(token_count, device=device, dtype=torch.float64) + _bytes = torch.tensor(byte_count, device=device, dtype=torch.float64) + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(_loss, op=dist.ReduceOp.SUM) + dist.all_reduce(_toks, op=dist.ReduceOp.SUM) + dist.all_reduce(_bytes, op=dist.ReduceOp.SUM) + loss_sum = _loss.item() + token_count = _toks.item() + byte_count = _bytes.item() + + coverage = token_count / max(total_scored_tokens, 1.0) + if cutoff_hit: + elapsed = time.perf_counter() - t0 + print( + f"ngram_eval:cutoff max_seconds={max_seconds:.1f} " + f"coverage={coverage*100:.2f}% elapsed={elapsed:.0f}s", + flush=True, + ) + + if _con and rank == 0: + print(f"cubric3d:final c_steps={_cfired} cells={_TOTAL_CELLS}x{max_order-min_order+1}={_TOTAL_CELLS*(max_order-min_order+1)}", flush=True) + for n in range(min_order, max_order + 1): + m = _c_alpha_mult[n] + row = " ".join(f"{m[cell]:.2f}" for cell in range(_TOTAL_CELLS)) + print(f" o{n}: [{row}]", flush=True) + val_loss = loss_sum / max(token_count, 1.0) + val_bpb = val_loss / math.log(2.0) * (token_count / max(byte_count, 1.0)) + base_model.train() + return val_loss, val_bpb, coverage +def _classify_param(name: str) -> str: + if "tok_emb" in name or "lm_head" in name: + return "embed" + if "f1_corr_in" in name or "f1_corr_out" in name: + return "aux" + if ".mlp." in name: + return "mlp" + if ".attn." in name or (".proj." in name and ".mlp." not in name): + return "attn" + return "other" +# --------------------------------------------------------------------------- +# GPTQ: Hessian-aware quantization with column-wise error compensation +# --------------------------------------------------------------------------- +def _find_best_row_scales(W: Tensor, clip_range: int = 31) -> Tensor: + """Find optimal per-row scales by searching percentile clipping thresholds.""" + t32 = W.float() + best_s = t32.abs().amax(dim=1) / clip_range + best_s = best_s.clamp_min(1.0 / clip_range) + best_err = torch.full((t32.shape[0],), float('inf')) + for pct in [0.9990, 0.9995, 0.9999, 0.99999, 1.0]: + if pct < 1.0: + row_clip = torch.quantile(t32.abs(), pct, dim=1) + else: + row_clip = t32.abs().amax(dim=1) + s = (row_clip / clip_range).clamp_min(1.0 / clip_range) + q = torch.clamp(torch.round(t32 / s[:, None]), -clip_range, clip_range) + recon = q * s[:, None] + err = (t32 - recon).pow(2).mean(dim=1) + improved = err < best_err + best_s[improved] = s[improved] + best_err[improved] = err[improved] + return best_s +def gptq_quantize_weight(W: Tensor, H: Tensor, clip_range: int = 31, + block_size: int = 64, percdamp: float = 0.002) -> tuple[Tensor, Tensor]: + """GPTQ: quantize weight matrix W using Hessian H = X^T X for error compensation. + Uses pre-computed per-row scales and column reordering by Hessian diagonal. + Returns (quantized_int8, scale_fp16) in int6 range [-clip_range, clip_range].""" + W = W.float().clone() + rows, cols = W.shape + # Pre-compute optimal per-row scales from the original weight matrix + row_scale = _find_best_row_scales(W, clip_range) + H = H.float().clone() + damp = percdamp * H.diag().mean() + H.diagonal().add_(damp) + # Column reordering: process least-important columns first (ascending H_diag) + perm = torch.argsort(H.diag()) + invperm = torch.argsort(perm) + W = W[:, perm] + H = H[perm][:, perm] + try: + L = torch.linalg.cholesky(H) + Hinv = torch.cholesky_inverse(L) + except torch._C._LinAlgError: + Hinv = torch.diag(1.0 / H.diag().clamp_min(1e-6)) + Q = torch.zeros(rows, cols, dtype=torch.int8) + for i1 in range(0, cols, block_size): + i2 = min(i1 + block_size, cols) + W_block = W[:, i1:i2].clone() + Hinv_block = Hinv[i1:i2, i1:i2] + Err = torch.zeros_like(W_block) + for j in range(i2 - i1): + w_col = W_block[:, j] + h_inv_jj = Hinv_block[j, j].clamp_min(1e-8) + # Quantize using pre-computed per-row scales + q_col = torch.clamp(torch.round(w_col / row_scale), -clip_range, clip_range) + deq_col = q_col * row_scale + Q[:, i1 + j] = q_col.to(torch.int8) + err = (w_col - deq_col) / h_inv_jj + Err[:, j] = err + if j + 1 < i2 - i1: + W_block[:, j + 1:] -= err.unsqueeze(1) * Hinv_block[j, j + 1:].unsqueeze(0) + if i2 < cols: + W[:, i2:] -= Err @ Hinv[i1:i2, i2:] + # Undo column reordering + Q = Q[:, invperm] + return Q, row_scale.to(torch.float16) +def gptq_calibrate(model: nn.Module, train_pattern: str, device: torch.device, + n_samples: int = 256, seq_len: int = 2048) -> dict[str, Tensor]: + """Collect Hessian H = X^T X for each linear layer using training data.""" + hessians: dict[str, Tensor] = {} + n_seen: dict[str, int] = {} + hooks = [] + def make_hook(name: str): + def hook_fn(module, inp, out): + x = inp[0].detach().float() + if x.ndim == 3: + x = x.reshape(-1, x.shape[-1]) + if name not in hessians: + hessians[name] = torch.zeros(x.shape[1], x.shape[1], device=x.device, dtype=torch.float32) + n_seen[name] = 0 + hessians[name].addmm_(x.t(), x) + n_seen[name] += x.shape[0] + return hook_fn + for name, module in model.named_modules(): + if isinstance(module, (nn.Linear, CastedLinear)): + hooks.append(module.register_forward_hook(make_hook(name))) + stream = TokenStream(train_pattern) + model.eval() + with torch.no_grad(): + for _ in range(n_samples): + tokens = stream.take(seq_len + 1).to(device=device, dtype=torch.int64) + x = tokens[:-1].unsqueeze(0) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + model.forward_logits(x) + for h in hooks: + h.remove() + for name in hessians: + hessians[name] /= max(n_seen[name], 1) + return hessians +def mixed_quantize_int6_gptq(state_dict: dict[str, Tensor], int6_cats: set[str], + hessians: dict[str, Tensor]) -> tuple[dict, dict]: + """Like mixed_quantize_int6 but uses GPTQ for int6 categories when Hessian available.""" + result: dict[str, Tensor] = {} + meta: dict[str, object] = {} + gptq_count, naive_count = 0, 0 + for name, tensor in state_dict.items(): + t = tensor.detach().cpu().contiguous() + cat = _classify_param(name) + if not t.is_floating_point() or t.numel() <= 65536: + result[name] = t.to(torch.float16) if t.is_floating_point() else t + meta[name] = "passthrough" + continue + if any(p in name for p in CONTROL_TENSOR_NAME_PATTERNS): + result[name] = t.float() + meta[name] = "passthrough_ctrl" + continue + if cat in int6_cats and t.ndim == 2: + module_name = name.rsplit(".weight", 1)[0] if name.endswith(".weight") else name + H = hessians.get(module_name) + if H is not None and H.shape[0] == t.shape[1]: + q, s = gptq_quantize_weight(t, H.cpu()) + gptq_count += 1 + else: + q, s = quantize_int6_per_row(t) + naive_count += 1 + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int6"} + elif cat in int6_cats and t.ndim >= 1: + q, s = quantize_int6_per_row(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int6"} + naive_count += 1 + else: + q, s = quantize_float_tensor(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int8"} + print(f"gptq_quantize: {gptq_count} GPTQ layers, {naive_count} naive layers", flush=True) + return result, meta +def quantize_int6_per_row(t: Tensor, clip_range: int = 31) -> tuple[Tensor, Tensor]: + t32 = t.float() + if t32.ndim == 2: + best_q, best_s, best_err = None, None, float('inf') + for pct in [0.9990, 0.9995, 0.9999, 0.99999, 1.0]: + if pct < 1.0: + row_clip = torch.quantile(t32.abs(), pct, dim=1) + else: + row_clip = t32.abs().amax(dim=1) + s = (row_clip / clip_range).clamp_min(1.0 / clip_range).to(torch.float16) + q = torch.clamp(torch.round(t32 / s.float()[:, None]), -clip_range, clip_range).to(torch.int8) + recon = q.float() * s.float()[:, None] + err = (t32 - recon).pow(2).mean().item() + if err < best_err: + best_q, best_s, best_err = q, s, err + return best_q, best_s + amax = t32.abs().max().item() + scale = torch.tensor(amax / clip_range if amax > 0 else 1.0, dtype=torch.float16) + q = torch.clamp(torch.round(t32 / scale.float()), -clip_range, clip_range).to(torch.int8) + return q, scale +def mixed_quantize_int6(state_dict: dict[str, Tensor], int6_cats: set[str]): + num_layers_total = max( + (int(k.split(".")[1]) for k in state_dict if k.startswith("blocks.")), + default=0, + ) + 1 + late_k_layers = set(range(num_layers_total - 2, num_layers_total)) + result: dict[str, Tensor] = {} + meta: dict[str, object] = {} + for name, tensor in state_dict.items(): + t = tensor.detach().cpu().contiguous() + cat = _classify_param(name) + if not t.is_floating_point() or t.numel() <= 65536: + result[name] = t.to(torch.float16) if t.is_floating_point() else t + meta[name] = "passthrough" + continue + if any(p in name for p in CONTROL_TENSOR_NAME_PATTERNS): + result[name] = t.float() + meta[name] = "passthrough_ctrl" + continue + if cat in int6_cats and t.ndim >= 1: + q, s = quantize_int6_per_row(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int6"} + else: + q, s = quantize_float_tensor(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int8"} + return result, meta +def dequantize_mixed_int6(result: dict[str, Tensor], meta: dict[str, object], + template_sd: dict[str, Tensor]) -> dict[str, Tensor]: + out: dict[str, Tensor] = {} + for name, orig in template_sd.items(): + info = meta.get(name) + if info is None: + continue + orig_dtype = orig.dtype + if info in ("passthrough", "passthrough_ctrl", "passthrough_fp16"): + t = result[name] + if t.dtype == torch.float16 and orig_dtype in (torch.float32, torch.bfloat16): + t = t.to(orig_dtype) + out[name] = t + continue + q, s = result[name + ".q"], result[name + ".scale"] + if s.ndim > 0: + out[name] = (q.float() * s.float().view(q.shape[0], *([1] * (q.ndim - 1)))).to(orig_dtype) + else: + out[name] = (q.float() * float(s.item())).to(orig_dtype) + return out +def main() -> None: + global zeropower_via_newtonschulz5 + code = Path(__file__).read_text(encoding="utf-8") + args = Hyperparameters() + if args.compile_enabled: + zeropower_via_newtonschulz5 = torch.compile(zeropower_via_newtonschulz5) + distributed = "RANK" in os.environ and "WORLD_SIZE" in os.environ + rank = int(os.environ.get("RANK", "0")) + world_size = int(os.environ.get("WORLD_SIZE", "1")) + local_rank = int(os.environ.get("LOCAL_RANK", "0")) + if world_size <= 0: + raise ValueError(f"WORLD_SIZE must be positive, got {world_size}") + if 8 % world_size != 0: + raise ValueError(f"WORLD_SIZE={world_size} must divide 8 so grad_accum_steps stays integral") + grad_accum_steps = 8 // world_size + grad_scale = 1.0 / grad_accum_steps + if not torch.cuda.is_available(): + raise RuntimeError("CUDA is required") + device = torch.device("cuda", local_rank) + torch.cuda.set_device(device) + if distributed: + dist.init_process_group(backend="nccl", device_id=device) + dist.barrier() + master_process = rank == 0 + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True + from torch.backends.cuda import enable_cudnn_sdp, enable_flash_sdp, enable_math_sdp, enable_mem_efficient_sdp + enable_cudnn_sdp(False) + enable_flash_sdp(True) + enable_mem_efficient_sdp(False) + enable_math_sdp(False) + logfile = None + if master_process: + os.makedirs("logs", exist_ok=True) + logfile = f"logs/{args.run_id}.txt" + print(logfile) + def log0(msg: str, console: bool = True) -> None: + if not master_process: + return + if console: + print(msg) + if logfile is not None: + with open(logfile, "a", encoding="utf-8") as f: + print(msg, file=f) + log0(code, console=False) + log0("=" * 100, console=False) + log0(f"Running Python {sys.version}", console=False) + log0(f"Running PyTorch {torch.__version__}", console=False) + log0( + subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=False).stdout, + console=False, + ) + log0("=" * 100, console=False) + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.cuda.manual_seed_all(args.seed) + if not args.tokenizer_path.endswith(".model"): + raise ValueError(f"Script only setup for SentencePiece .model file: {args.tokenizer_path}") + sp = spm.SentencePieceProcessor(model_file=args.tokenizer_path) + if int(sp.vocab_size()) != args.vocab_size: + raise ValueError( + f"VOCAB_SIZE={args.vocab_size} does not match tokenizer vocab_size={int(sp.vocab_size())}" + ) + dataset_dir = Path(args.data_path).resolve() + actual_train_files = len(list(dataset_dir.glob("fineweb_train_*.bin"))) + effective_eval_seq_len = args.eval_seq_len if args.eval_seq_len > 0 else args.train_seq_len + val_seq_len = max(args.train_seq_len, effective_eval_seq_len) + val_tokens = load_validation_tokens(args.val_files, val_seq_len) + base_bytes_lut, has_leading_space_lut, is_boundary_token_lut = build_sentencepiece_luts( + sp, args.vocab_size, device + ) + log0(f"val_bpb:enabled tokenizer_kind=sentencepiece tokenizer_path={args.tokenizer_path}") + log0(f"train_loader:dataset:{dataset_dir.name} train_shards:{actual_train_files}") + log0(f"val_loader:shards pattern={args.val_files} tokens:{val_tokens.numel() - 1}") + CastedLinear._qat_enabled = args.qat_enabled + base_model = GPT( + vocab_size=args.vocab_size, + num_layers=args.num_layers, + model_dim=args.model_dim, + num_heads=args.num_heads, + num_kv_heads=args.num_kv_heads, + mlp_mult=args.mlp_mult, + tie_embeddings=args.tie_embeddings, + tied_embed_init_std=args.tied_embed_init_std, + logit_softcap=args.logit_softcap, + rope_base=args.rope_base, + qk_gain_init=args.qk_gain_init, + mtp_num_heads=args.mtp_num_heads, + mtp_loss_weight=args.mtp_loss_weight, + bigram_vocab_size=args.bigram_vocab_size, + bigram_dim=args.bigram_dim, + xsa_last_n=args.xsa_last_n, + rope_dims=args.rope_dims, + ln_scale=args.ln_scale, + dtg=args.dtg_enabled, + ve_enabled=args.ve_enabled, + ve_dim=args.ve_dim, + ve_layers=args.ve_layers, + mlp_act=args.mlp_act, + mlp_leaky_slope=args.mlp_leaky_slope, + f1_corr_rank=args.f1_corr_rank, + f1_corr_scale_init=args.f1_corr_scale_init, + ).to(device).bfloat16() + for module in base_model.modules(): + if isinstance(module, CastedLinear): + module.float() + restore_low_dim_params_to_fp32(base_model) + # Complementary training: downweight tokens predictable by bigrams + complement_alpha = float(os.environ.get("COMPLEMENT_ALPHA", "0")) + if complement_alpha > 0: + tracker = TrainNgramTracker(args.vocab_size, device, complement_alpha=complement_alpha) + base_model._ngram_tracker = tracker + log0(f"complementary_training:alpha={complement_alpha}") + else: + base_model._ngram_tracker = None + compiled_model = maybe_torch_compile(base_model, args) + model: nn.Module = DDP(compiled_model, device_ids=[local_rank], broadcast_buffers=False) if distributed else compiled_model + block_named_params = list(base_model.blocks.named_parameters()) + matrix_params = [ + p + for name, p in block_named_params + if p.ndim == 2 and not any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS) + ] + if base_model.mtp_num_heads > 0: + matrix_params.extend([p for p in base_model.mtp_heads.parameters() if p.ndim == 2]) + if base_model.f1_corr_in is not None and base_model.f1_corr_out is not None: + matrix_params.append(base_model.f1_corr_in.weight) + matrix_params.append(base_model.f1_corr_out.weight) + scalar_params = [ + p + for name, p in block_named_params + if p.ndim < 2 or any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS) + ] + if base_model.skip_weights.numel() > 0: + scalar_params.append(base_model.skip_weights) + scalar_params.append(base_model.smear.gate) + if base_model.bigram is not None: + scalar_params.append(base_model.bigram.scale) + if base_model.f1_corr_scale is not None: + scalar_params.append(base_model.f1_corr_scale) + token_lr = args.tied_embed_lr if args.tie_embeddings else args.embed_lr + tok_params = [{"params": [base_model.tok_emb.weight], "lr": token_lr, "base_lr": token_lr}] + if base_model.bigram is not None: + tok_params.append({"params": [base_model.bigram.embed.weight], "lr": token_lr, "base_lr": token_lr}) + if base_model.bigram.proj is not None: + matrix_params.append(base_model.bigram.proj.weight) + if base_model.ve_shared is not None: + tok_params.append({"params": [base_model.ve_shared.embed.weight], "lr": token_lr, "base_lr": token_lr}) + if base_model.ve_shared.proj is not None: + matrix_params.append(base_model.ve_shared.proj.weight) + scalar_params.append(base_model.ve_shared.scale) + for s in base_model.ve_layer_scales: + scalar_params.append(s) + optimizer_tok = torch.optim.AdamW( + tok_params, + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + weight_decay=args.adam_wd, + fused=True, + ) + optimizer_muon = Muon( + matrix_params, + lr=args.matrix_lr, + momentum=args.muon_momentum, + backend_steps=args.muon_backend_steps, + weight_decay=args.muon_wd, + ) + for group in optimizer_muon.param_groups: + group["base_lr"] = args.matrix_lr + optimizer_scalar = torch.optim.AdamW( + [{"params": scalar_params, "lr": args.scalar_lr, "base_lr": args.scalar_lr}], + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + weight_decay=args.adam_wd, + fused=True, + ) + optimizers: list[torch.optim.Optimizer] = [optimizer_tok, optimizer_muon, optimizer_scalar] + if base_model.lm_head is not None: + optimizer_head = torch.optim.Adam( + [{"params": [base_model.lm_head.weight], "lr": args.head_lr, "base_lr": args.head_lr}], + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + fused=True, + ) + optimizers.insert(1, optimizer_head) + n_params = sum(p.numel() for p in base_model.parameters()) + f1_corr_params = 0 + if base_model.f1_corr_in is not None and base_model.f1_corr_out is not None: + f1_corr_params = int(base_model.f1_corr_in.weight.numel() + base_model.f1_corr_out.weight.numel()) + est_corr_int6_bytes = 0 + if args.f1_corr_rank > 0: + # int8 payload stores int6 values + per-row fp16 scales. + est_corr_int6_bytes = ( + args.f1_corr_rank * (args.model_dim + args.vocab_size) + + 2 * (args.f1_corr_rank + args.vocab_size) + ) + log0(f"model_params:{n_params}") + log0( + f"f1_corr:rank={args.f1_corr_rank} params={f1_corr_params} " + f"est_int6_bytes~{est_corr_int6_bytes}" + ) + log0(f"mlp_act:{args.mlp_act} mlp_leaky_slope:{args.mlp_leaky_slope}") + log0(f"XSA:last_{args.xsa_last_n} world_size:{world_size} grad_accum_steps:{grad_accum_steps}") + log0(f"num_heads:{args.num_heads} num_kv_heads:{args.num_kv_heads} embed_lr:{token_lr} matrix_lr:{args.matrix_lr}") + log0( + f"train_batch_tokens:{args.train_batch_tokens} train_seq_len:{args.train_seq_len} " + f"iterations:{args.iterations} warmup_steps:{args.warmup_steps} " + f"max_wallclock_seconds:{args.max_wallclock_seconds:.3f}" + ) + log0(f"compile:enabled={int(args.compile_enabled)} fullgraph={int(args.compile_fullgraph)}") + log0(f"seed:{args.seed}") + if args.ngram_eval_order >= 2: + log0( + f"ngram_eval:order={args.ngram_eval_order} alpha={args.ngram_eval_alpha} " + f"min_count={args.ngram_eval_min_count} buckets={args.ngram_eval_buckets}" + ) + train_loader = DistributedTokenLoader(args.train_files, rank, world_size, device) + def zero_grad_all() -> None: + for opt in optimizers: + opt.zero_grad(set_to_none=True) + max_wallclock_ms = 1000.0 * args.max_wallclock_seconds if args.max_wallclock_seconds > 0 else None + def lr_mul(step: int, elapsed_ms: float) -> float: + if args.warmdown_iters <= 0: + return 1.0 + if max_wallclock_ms is None: + warmdown_start = max(args.iterations - args.warmdown_iters, 0) + return max((args.iterations - step) / max(args.warmdown_iters, 1), 0.0) if warmdown_start <= step < args.iterations else 1.0 + step_ms = elapsed_ms / max(step, 1) + warmdown_ms = args.warmdown_iters * step_ms + remaining_ms = max(max_wallclock_ms - elapsed_ms, 0.0) + return remaining_ms / max(warmdown_ms, 1e-9) if remaining_ms <= warmdown_ms else 1.0 + if args.warmup_steps > 0: + initial_model_state = {name: tensor.detach().cpu().clone() for name, tensor in base_model.state_dict().items()} + initial_optimizer_states = [copy.deepcopy(opt.state_dict()) for opt in optimizers] + model.train() + for warmup_step in range(args.warmup_steps): + zero_grad_all() + for micro_step in range(grad_accum_steps): + if distributed: + model.require_backward_grad_sync = micro_step == grad_accum_steps - 1 + x, y = train_loader.next_batch(args.train_batch_tokens, args.train_seq_len, grad_accum_steps) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + warmup_loss = model(x, y) + (warmup_loss * grad_scale).backward() + for opt in optimizers: + opt.step() + zero_grad_all() + if args.warmup_steps <= 20 or (warmup_step + 1) % 10 == 0 or warmup_step + 1 == args.warmup_steps: + log0(f"warmup_step:{warmup_step + 1}/{args.warmup_steps}") + base_model.load_state_dict(initial_model_state, strict=True) + for opt, state in zip(optimizers, initial_optimizer_states, strict=True): + opt.load_state_dict(state) + zero_grad_all() + if distributed: + model.require_backward_grad_sync = True + train_loader = DistributedTokenLoader(args.train_files, rank, world_size, device) + swa_state: dict[str, Tensor] | None = None + swa_count = 0 + ema_state = {name: t.detach().float().clone() for name, t in base_model.state_dict().items()} + ema_decay = 0.997 + training_time_ms = 0.0 + stop_after_step: int | None = None + torch.cuda.synchronize() + t0 = time.perf_counter() + step = 0 + while True: + last_step = step == args.iterations or (stop_after_step is not None and step >= stop_after_step) + should_validate = last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0) + if should_validate: + torch.cuda.synchronize() + training_time_ms += 1000.0 * (time.perf_counter() - t0) + val_loss, val_bpb = eval_val( + args, + model, + rank, + world_size, + device, + grad_accum_steps, + val_tokens, + base_bytes_lut, + has_leading_space_lut, + is_boundary_token_lut, + ) + log0( + f"step:{step}/{args.iterations} val_loss:{val_loss:.4f} val_bpb:{val_bpb:.4f} " + f"train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms / max(step, 1):.2f}ms" + ) + torch.cuda.synchronize() + t0 = time.perf_counter() + if last_step: + if stop_after_step is not None and step < args.iterations: + log0( + f"stopping_early: wallclock_cap train_time:{training_time_ms:.0f}ms " + f"step:{step}/{args.iterations}" + ) + break + elapsed_ms = training_time_ms + 1000.0 * (time.perf_counter() - t0) + scale = lr_mul(step, elapsed_ms) + if args.late_qat_threshold > 0 and scale < args.late_qat_threshold and not CastedLinear._qat_enabled: + CastedLinear._qat_enabled = True + log0(f"late_qat:enabled step:{step} scale:{scale:.4f}") + zero_grad_all() + train_loss = torch.zeros((), device=device) + for micro_step in range(grad_accum_steps): + if distributed: + model.require_backward_grad_sync = micro_step == grad_accum_steps - 1 + x, y = train_loader.next_batch(args.train_batch_tokens, args.train_seq_len, grad_accum_steps) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + loss = model(x, y) + train_loss += loss.detach() + loss.backward() + if base_model._ngram_tracker is not None: + base_model._ngram_tracker.update(x, y) + train_loss /= grad_accum_steps + frac = min(step / args.muon_momentum_warmup_steps, 1.0) if args.muon_momentum_warmup_steps > 0 else 1.0 + muon_momentum = (1 - frac) * args.muon_momentum_warmup_start + frac * args.muon_momentum + for group in optimizer_muon.param_groups: + group["momentum"] = muon_momentum + for opt in optimizers: + for group in opt.param_groups: + group["lr"] = group["base_lr"] * scale + if args.grad_clip_norm > 0: + torch.nn.utils.clip_grad_norm_(base_model.parameters(), args.grad_clip_norm) + for opt in optimizers: + opt.step() + zero_grad_all() + # EMA update + with torch.no_grad(): + for name, t in base_model.state_dict().items(): + ema_state[name].mul_(ema_decay).add_(t.detach().float(), alpha=1.0 - ema_decay) + step += 1 + approx_training_time_ms = training_time_ms + 1000.0 * (time.perf_counter() - t0) + if args.swa_enabled and scale < 0.2 and step % args.swa_every == 0: + if swa_state is None: + swa_state = {name: t.detach().cpu().clone() for name, t in base_model.state_dict().items()} + swa_count = 1 + log0(f"swa:start step:{step}") + else: + for name, t in base_model.state_dict().items(): + swa_state[name] += t.detach().cpu() + swa_count += 1 + should_log_train = ( + args.train_log_every > 0 + and (step <= 10 or step % args.train_log_every == 0 or stop_after_step is not None) + ) + if should_log_train: + log0( + f"step:{step}/{args.iterations} train_loss:{train_loss.item():.4f} " + f"train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms / step:.2f}ms" + ) + reached_cap = max_wallclock_ms is not None and approx_training_time_ms >= max_wallclock_ms + if distributed and max_wallclock_ms is not None: + reached_cap_tensor = torch.tensor(int(reached_cap), device=device) + dist.all_reduce(reached_cap_tensor, op=dist.ReduceOp.MAX) + reached_cap = bool(reached_cap_tensor.item()) + if stop_after_step is None and reached_cap: + stop_after_step = step + log0( + f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB " + f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB" + ) + # GPTQ calibration: collect Hessians from training data DURING training phase + # (must happen before training ends to comply with eval-time data access rules) + log0("gptq:calibrating with training data...") + t_gptq = time.perf_counter() + gptq_hessians = gptq_calibrate(base_model, args.train_files, device, n_samples=256, seq_len=args.train_seq_len) + log0(f"gptq:calibrated {len(gptq_hessians)} layers in {time.perf_counter()-t_gptq:.1f}s") + if args.distill_enabled and args.distill_steps > 0: + log0( + f"distill:start steps:{args.distill_steps} lr_factor:{args.distill_lr_factor} " + f"temp:{args.distill_temperature} alpha:{args.distill_alpha} kl_clip:{args.distill_kl_clip}" + ) + current_state = base_model.state_dict() + teacher_state = {name: t.to(dtype=current_state[name].dtype) for name, t in ema_state.items()} + teacher_model = GPT( + vocab_size=args.vocab_size, num_layers=args.num_layers, model_dim=args.model_dim, + num_heads=args.num_heads, num_kv_heads=args.num_kv_heads, mlp_mult=args.mlp_mult, + tie_embeddings=args.tie_embeddings, tied_embed_init_std=args.tied_embed_init_std, + logit_softcap=args.logit_softcap, rope_base=args.rope_base, qk_gain_init=args.qk_gain_init, + mtp_num_heads=args.mtp_num_heads, mtp_loss_weight=args.mtp_loss_weight, + bigram_vocab_size=args.bigram_vocab_size, bigram_dim=args.bigram_dim, + xsa_last_n=args.xsa_last_n, rope_dims=args.rope_dims, ln_scale=args.ln_scale, dtg=args.dtg_enabled, + ve_enabled=args.ve_enabled, ve_dim=args.ve_dim, ve_layers=args.ve_layers, + mlp_act=args.mlp_act, mlp_leaky_slope=args.mlp_leaky_slope, + f1_corr_rank=args.f1_corr_rank, f1_corr_scale_init=args.f1_corr_scale_init, + ).to(device).bfloat16() + for m in teacher_model.modules(): + if isinstance(m, CastedLinear): + m.float() + restore_low_dim_params_to_fp32(teacher_model) + teacher_model.load_state_dict(teacher_state, strict=True) + teacher_model.eval() + for p in teacher_model.parameters(): + p.requires_grad_(False) + compiled_teacher_logits = maybe_torch_compile(teacher_model.forward_logits, args) + model.train() + T = args.distill_temperature + alpha = args.distill_alpha + for d_step in range(args.distill_steps): + zero_grad_all() + for opt in optimizers: + for group in opt.param_groups: + group["lr"] = group["base_lr"] * args.distill_lr_factor + x, y = train_loader.next_batch(args.train_batch_tokens, args.train_seq_len, grad_accum_steps) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + student_logits = base_model.forward_logits(x) + with torch.no_grad(): + teacher_logits = compiled_teacher_logits(x) + student_log_probs = F.log_softmax(student_logits.float() / T, dim=-1) + teacher_probs = F.softmax(teacher_logits.float() / T, dim=-1) + token_kl = F.kl_div(student_log_probs, teacher_probs, reduction="none").sum(dim=-1) + kl_loss = token_kl.mean() * (T * T) + if args.distill_kl_clip > 0: + kl_loss = torch.clamp(kl_loss, max=args.distill_kl_clip) + ce_loss = F.cross_entropy( + student_logits.reshape(-1, student_logits.size(-1)).float(), + y.reshape(-1), + reduction="mean", + ) + loss = alpha * kl_loss + (1.0 - alpha) * ce_loss + (loss * grad_scale).backward() + if world_size > 1: + for p in base_model.parameters(): + if p.grad is not None: + dist.all_reduce(p.grad, op=dist.ReduceOp.AVG) + if args.grad_clip_norm > 0: + torch.nn.utils.clip_grad_norm_(base_model.parameters(), args.grad_clip_norm) + for opt in optimizers: + opt.step() + zero_grad_all() + with torch.no_grad(): + for name, t in base_model.state_dict().items(): + ema_state[name].mul_(ema_decay).add_(t.detach().float(), alpha=1.0 - ema_decay) + if (d_step + 1) % 8 == 0 or d_step == 0: + log0( + f"distill:step:{d_step + 1}/{args.distill_steps} " + f"kl:{kl_loss.item():.4f} ce:{ce_loss.item():.4f} total:{loss.item():.4f}" + ) + del teacher_model, compiled_teacher_logits + torch.cuda.empty_cache() + log0("distill:done") + # Apply EMA weights (better than SWA alone per PR#401) + log0("ema:applying EMA weights") + current_state = base_model.state_dict() + avg_state = {name: t.to(dtype=current_state[name].dtype) for name, t in ema_state.items()} + base_model.load_state_dict(avg_state, strict=True) + torch.cuda.synchronize() + t_diag = time.perf_counter() + diag_val_loss, diag_val_bpb = eval_val( + args, compiled_model, rank, world_size, device, grad_accum_steps, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + ) + torch.cuda.synchronize() + log0( + f"DIAGNOSTIC post_ema val_loss:{diag_val_loss:.4f} val_bpb:{diag_val_bpb:.4f} " + f"eval_time:{1000.0 * (time.perf_counter() - t_diag):.0f}ms" + ) + full_state_dict = base_model.state_dict() + export_sd = {k: v for k, v in full_state_dict.items() if "mtp_heads" not in k} + excluded_mtp = sum(int(t.numel()) for k, t in full_state_dict.items() if "mtp_heads" in k) + if excluded_mtp > 0: + log0(f"export_excluding_mtp_params:{excluded_mtp}") + if master_process: + torch.save(export_sd, "final_model.pt") + model_bytes = os.path.getsize("final_model.pt") + code_bytes = len(code.encode("utf-8")) + log0(f"Serialized model: {model_bytes} bytes") + log0(f"Code size: {code_bytes} bytes") + sd_cpu = {k: v.detach().cpu() for k, v in export_sd.items()} + # GPTQ quantization using Hessians collected during training phase (no training data access here) + quant_result, quant_meta = mixed_quantize_int6_gptq(sd_cpu, {"mlp", "attn", "aux"}, gptq_hessians) + quant_buf = io.BytesIO() + torch.save({"w": quant_result, "m": quant_meta}, quant_buf) + quant_raw = quant_buf.getvalue() + quant_blob = zstandard.ZstdCompressor(level=22).compress(quant_raw) if _COMPRESSOR == "zstd" else zlib.compress(quant_raw, 9) + if master_process: + with open("final_model.int6.ptz", "wb") as f: + f.write(quant_blob) + quant_file_bytes = len(quant_blob) + code_bytes = len(code.encode("utf-8")) + log0(f"Serialized model int6+{_COMPRESSOR}: {quant_file_bytes} bytes") + log0(f"Total submission size int6+{_COMPRESSOR}: {quant_file_bytes + code_bytes} bytes") + log0(f"Total submission size int8+zlib: {quant_file_bytes + code_bytes} bytes") + if distributed: + dist.barrier() + with open("final_model.int6.ptz", "rb") as f: + quant_blob_disk = f.read() + quant_state = torch.load( + io.BytesIO(zstandard.ZstdDecompressor().decompress(quant_blob_disk) if _COMPRESSOR == "zstd" else zlib.decompress(quant_blob_disk)), + map_location="cpu", + ) + deq_state = dequantize_mixed_int6(quant_state["w"], quant_state["m"], sd_cpu) + eval_model = GPT( + vocab_size=args.vocab_size, num_layers=args.num_layers, model_dim=args.model_dim, + num_heads=args.num_heads, num_kv_heads=args.num_kv_heads, mlp_mult=args.mlp_mult, + tie_embeddings=args.tie_embeddings, tied_embed_init_std=args.tied_embed_init_std, + logit_softcap=args.logit_softcap, rope_base=args.rope_base, qk_gain_init=args.qk_gain_init, + mtp_num_heads=0, mtp_loss_weight=0.0, + bigram_vocab_size=args.bigram_vocab_size, bigram_dim=args.bigram_dim, + xsa_last_n=args.xsa_last_n, # must match training model + rope_dims=args.rope_dims, ln_scale=args.ln_scale, dtg=args.dtg_enabled, + ve_enabled=args.ve_enabled, ve_dim=args.ve_dim, ve_layers=args.ve_layers, + mlp_act=args.mlp_act, mlp_leaky_slope=args.mlp_leaky_slope, + f1_corr_rank=args.f1_corr_rank, f1_corr_scale_init=args.f1_corr_scale_init, + ).to(device).bfloat16() + for m in eval_model.modules(): + if isinstance(m, CastedLinear): + m.float() + restore_low_dim_params_to_fp32(eval_model) + eval_model.load_state_dict(deq_state, strict=True) + compiled_eval = maybe_torch_compile(eval_model, args) + torch.cuda.synchronize() + t_qeval = time.perf_counter() + q_val_loss, q_val_bpb = eval_val( + args, compiled_eval, rank, world_size, device, grad_accum_steps, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + eval_seq_len=effective_eval_seq_len, + ) + torch.cuda.synchronize() + log0( + f"final_int6_roundtrip val_loss:{q_val_loss:.4f} val_bpb:{q_val_bpb:.4f} " + f"eval_time:{1000.0 * (time.perf_counter() - t_qeval):.0f}ms" + ) + log0(f"final_int6_roundtrip_exact val_loss:{q_val_loss:.8f} val_bpb:{q_val_bpb:.8f}") + sw_seq_len = effective_eval_seq_len + if args.eval_stride > 0 and args.eval_stride < sw_seq_len: + torch.cuda.synchronize() + t_slide = time.perf_counter() + sw_val_loss, sw_val_bpb = eval_val_sliding( + args, eval_model, rank, world_size, device, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + stride=args.eval_stride, + eval_seq_len=sw_seq_len, + ) + torch.cuda.synchronize() + log0( + f"final_int6_sliding_window val_loss:{sw_val_loss:.4f} val_bpb:{sw_val_bpb:.4f} " + f"stride:{args.eval_stride} eval_time:{1000.0 * (time.perf_counter() - t_slide):.0f}ms" + ) + log0(f"final_int6_sliding_window_exact val_loss:{sw_val_loss:.8f} val_bpb:{sw_val_bpb:.8f}") + log0(f"final_int8_zlib_roundtrip_exact val_loss:{sw_val_loss:.8f} val_bpb:{sw_val_bpb:.8f}") + if args.ngram_eval_order >= 2: + if distributed: + dist.barrier() + torch.cuda.synchronize() + t_ng = time.perf_counter() + ng_loss, ng_bpb, ng_coverage = eval_val_sliding_hashed_ngram( + args, + eval_model, + rank, + world_size, + device, + val_tokens, + base_bytes_lut, + has_leading_space_lut, + is_boundary_token_lut, + stride=args.eval_stride, + order=args.ngram_eval_order, + alpha=args.ngram_eval_alpha, + min_count=args.ngram_eval_min_count, + buckets=args.ngram_eval_buckets, + max_seconds=args.ngram_eval_max_seconds, + eval_seq_len=sw_seq_len, + ) + if rank == 0: + torch.cuda.synchronize() + ng_eval_ms = 1000.0 * (time.perf_counter() - t_ng) + if ng_coverage >= 0.999999: + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order} val_loss:{ng_loss:.4f} " + f"val_bpb:{ng_bpb:.4f} eval_time:{ng_eval_ms:.0f}ms" + ) + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order}_exact " + f"val_loss:{ng_loss:.8f} val_bpb:{ng_bpb:.8f}" + ) + else: + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order}_partial val_loss:{ng_loss:.4f} " + f"val_bpb:{ng_bpb:.4f} coverage:{ng_coverage:.4f} eval_time:{ng_eval_ms:.0f}ms" + ) + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order}_partial_exact " + f"val_loss:{ng_loss:.8f} val_bpb:{ng_bpb:.8f} coverage:{ng_coverage:.8f}" + ) + if distributed: + dist.barrier() + if distributed: + dist.destroy_process_group() +if __name__ == "__main__": + main() diff --git a/experiments/B_wing/bwing_full_port/train_seed1337.log b/experiments/B_wing/bwing_full_port/train_seed1337.log new file mode 100644 index 000000000..0b4a07a5e --- /dev/null +++ b/experiments/B_wing/bwing_full_port/train_seed1337.log @@ -0,0 +1,104 @@ +============================================ + B-WING FULL PORT — #809 N-gram Techniques + Seed: 1337 + Fixed order mults (no cubric) + Complementary training: alpha=0.5 + Eval alpha: 0.05-0.60 clip=0.95 + entropy shift | Orders: 2-9 +============================================ +W0326 05:38:58.867000 1640 torch/distributed/run.py:803] +W0326 05:38:58.867000 1640 torch/distributed/run.py:803] ***************************************** +W0326 05:38:58.867000 1640 torch/distributed/run.py:803] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0326 05:38:58.867000 1640 torch/distributed/run.py:803] ***************************************** +logs/b93ddcc1-5257-48ca-9542-081180067ac8.txt +val_bpb:enabled tokenizer_kind=sentencepiece tokenizer_path=./data/tokenizers/fineweb_1024_bpe.model +train_loader:dataset:fineweb10B_sp1024 train_shards:80 +val_loader:shards pattern=./data/datasets/fineweb10B_sp1024/fineweb_val_*.bin tokens:62021632 +complementary_training:alpha=0.5 +model_params:26928220 +f1_corr:rank=0 params=0 est_int6_bytes~0 +mlp_act:leaky_relu_sq mlp_leaky_slope:0.5 +XSA:last_4 world_size:8 grad_accum_steps:1 +num_heads:8 num_kv_heads:4 embed_lr:0.035 matrix_lr:0.025 +train_batch_tokens:786432 train_seq_len:2048 iterations:20000 warmup_steps:20 max_wallclock_seconds:600.000 +compile:enabled=1 fullgraph=0 +seed:1337 +ngram_eval:order=9 alpha=0.3 min_count=2 buckets=8388608 +warmup_step:1/20 +warmup_step:2/20 +warmup_step:3/20 +warmup_step:4/20 +warmup_step:5/20 +warmup_step:6/20 +warmup_step:7/20 +warmup_step:8/20 +warmup_step:9/20 +warmup_step:10/20 +warmup_step:11/20 +warmup_step:12/20 +warmup_step:13/20 +warmup_step:14/20 +warmup_step:15/20 +warmup_step:16/20 +warmup_step:17/20 +warmup_step:18/20 +warmup_step:19/20 +warmup_step:20/20 +step:0/20000 val_loss:6.9317 val_bpb:4.1054 train_time:0ms step_avg:0.02ms +step:1/20000 train_loss:6.9343 train_time:149ms step_avg:149.31ms +step:2/20000 train_loss:8.6212 train_time:232ms step_avg:115.92ms +step:3/20000 train_loss:7.8208 train_time:318ms step_avg:105.93ms +step:4/20000 train_loss:7.1066 train_time:404ms step_avg:100.89ms +step:5/20000 train_loss:6.8530 train_time:489ms step_avg:97.86ms +step:6/20000 train_loss:6.7961 train_time:575ms step_avg:95.83ms +step:7/20000 train_loss:6.6784 train_time:660ms step_avg:94.31ms +step:8/20000 train_loss:6.5596 train_time:746ms step_avg:93.25ms +step:9/20000 train_loss:6.2554 train_time:833ms step_avg:92.52ms +step:10/20000 train_loss:5.9365 train_time:918ms step_avg:91.82ms +step:1000/20000 train_loss:2.2352 train_time:87900ms step_avg:87.90ms +step:2000/20000 train_loss:2.0277 train_time:175924ms step_avg:87.96ms +step:3000/20000 train_loss:2.1245 train_time:263953ms step_avg:87.98ms +step:4000/20000 train_loss:1.9353 train_time:351962ms step_avg:87.99ms +step:5000/20000 train_loss:2.0680 train_time:439941ms step_avg:87.99ms +late_qat:enabled step:5070 scale:0.4999 +step:6000/20000 train_loss:1.9024 train_time:527953ms step_avg:87.99ms +swa:start step:6200 +step:6817/20000 val_loss:1.9221 val_bpb:1.1384 train_time:600020ms step_avg:88.02ms +stopping_early: wallclock_cap train_time:600020ms step:6817/20000 +peak memory allocated: 20677 MiB reserved: 20718 MiB +gptq:calibrating with training data... +gptq:calibrated 68 layers in 3.7s +ema:applying EMA weights +DIAGNOSTIC post_ema val_loss:1.9205 val_bpb:1.1374 eval_time:2027ms +Serialized model: 106047497 bytes +Code size: 106155 bytes +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +Serialized model int6+zstd: 15991916 bytes +Total submission size int6+zstd: 16098071 bytes +Total submission size int8+zlib: 16098071 bytes +final_int6_roundtrip val_loss:1.9301 val_bpb:1.1431 eval_time:37099ms +final_int6_roundtrip_exact val_loss:1.93013868 val_bpb:1.14313685 +final_int6_sliding_window val_loss:1.8901 val_bpb:1.1194 stride:64 eval_time:96435ms +final_int6_sliding_window_exact val_loss:1.89013592 val_bpb:1.11944792 +final_int8_zlib_roundtrip_exact val_loss:1.89013592 val_bpb:1.11944792 +ngram_eval:chunks=60 chunk_tokens=1048576 windows=969088 shared_tables=True +ngram_eval:chunk [1/60] bpb=1.130307 t=15s +ngram_eval:chunk [2/60] bpb=1.211256 t=18s +ngram_eval:chunk [3/60] bpb=1.235629 t=21s +ngram_eval:chunk [11/60] bpb=1.149570 t=43s +ngram_eval:chunk [21/60] bpb=0.876947 t=70s +ngram_eval:chunk [31/60] bpb=0.694595 t=96s +ngram_eval:chunk [41/60] bpb=0.575851 t=121s +ngram_eval:chunk [51/60] bpb=0.497954 t=146s +ngram_eval:chunk [60/60] bpb=0.450898 t=178s +final_int6_sliding_window_ngram9 val_loss:0.7618 val_bpb:0.4512 eval_time:178896ms +final_int6_sliding_window_ngram9_exact val_loss:0.76181150 val_bpb:0.45118888 +============================================ + DONE +============================================ diff --git a/experiments/pod_launch.sh b/experiments/pod_launch.sh new file mode 100755 index 000000000..c8e7221d2 --- /dev/null +++ b/experiments/pod_launch.sh @@ -0,0 +1,48 @@ +#!/bin/bash +set -euo pipefail +# POD LAUNCH — one command to rule them all +# Usage: curl -sL | bash -s [experiment_script] +# or: bash experiments/pod_launch.sh experiments/A_wing/purple/run.sh +# +# Handles: git clone/checkout, env setup, then runs your experiment. + +REPO_URL="https://github.com/newjordan/parameter-golf-1.git" +BRANCH="${BRANCH:-submission/xwing-cubric3d}" +WORKSPACE="/workspace/parameter-golf-lab" +REMOTE_NAME="fork1" +EXPERIMENT="${1:-}" + +echo "============================================" +echo " POD LAUNCH — Auto Setup + Run" +echo " Branch: ${BRANCH}" +echo " Experiment: ${EXPERIMENT:-}" +echo "============================================" + +# --- Step 1: Get the repo --- +if [ -d "${WORKSPACE}/.git" ]; then + echo "[1/3] Repo exists, force-syncing to ${BRANCH}..." + cd "${WORKSPACE}" + # Ensure private remote exists + git remote get-url "${REMOTE_NAME}" &>/dev/null || git remote add "${REMOTE_NAME}" "${REPO_URL}" + git fetch "${REMOTE_NAME}" "${BRANCH}" --quiet + git checkout -B "${BRANCH}" "${REMOTE_NAME}/${BRANCH}" --force + git clean -fd --quiet +else + echo "[1/3] Cloning repo..." + git clone -b "${BRANCH}" "${REPO_URL}" "${WORKSPACE}" + cd "${WORKSPACE}" +fi +echo " HEAD: $(git log --oneline -1)" + +# --- Step 2: Environment setup --- +echo "[2/3] Running setup_runpod.sh..." +bash experiments/setup_runpod.sh + +# --- Step 3: Run experiment --- +if [ -n "${EXPERIMENT}" ]; then + echo "[3/3] Launching: ${EXPERIMENT}" + bash "${EXPERIMENT}" +else + echo "[3/3] No experiment specified. Ready to run manually." + echo " Example: bash experiments/A_wing/purple/run.sh" +fi diff --git a/experiments/setup_runpod.sh b/experiments/setup_runpod.sh new file mode 100755 index 000000000..77cfd07b5 --- /dev/null +++ b/experiments/setup_runpod.sh @@ -0,0 +1,200 @@ +#!/bin/bash +# ------------------------------------------------------------------------------- +# Parameter Golf -- Pod Setup (RunPod / Vast.ai) +# Uses the DEFAULT system Python + PyTorch. No conda. No PYTHONPATH hacks. +# +# Run once after pod starts: +# bash experiments/setup_runpod.sh +# ------------------------------------------------------------------------------- + +set -euo pipefail + +echo "============================================" +echo " Parameter Golf -- Pod Environment Setup" +echo "============================================" + +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +cd "$REPO_ROOT" + +# ------------------------------------------------------------------------------- +# 1. Verify base environment (system Python + PyTorch must already exist) +# ------------------------------------------------------------------------------- +echo "" +echo "[1/5] Checking base environment..." + +python3 --version || { echo "FATAL: python3 not found"; exit 1; } +python3 -c "import torch; print(f' PyTorch {torch.__version__} CUDA {torch.version.cuda}')" \ + || { echo "FATAL: PyTorch not installed in system Python"; exit 1; } + +GPU_COUNT=$(python3 -c "import torch; print(torch.cuda.device_count())" 2>/dev/null || echo "0") +if [ "$GPU_COUNT" -eq 0 ]; then + echo " WARNING: No GPUs detected" +else + python3 -c " +import torch +for i in range(torch.cuda.device_count()): + p = torch.cuda.get_device_properties(i) + print(f' GPU {i}: {p.name} ({p.total_mem // 1024**3}GB)') +" 2>/dev/null || true +fi + +# ------------------------------------------------------------------------------- +# 2. Core pip packages (into system site-packages, no conda) +# ------------------------------------------------------------------------------- +echo "" +echo "[2/5] Installing pip packages..." + +pip install --upgrade pip -q 2>&1 | tail -1 + +# Install requirements but skip torch (already installed by the pod image) +pip install numpy tqdm huggingface-hub kernels setuptools \ + "typing-extensions==4.15.0" datasets tiktoken sentencepiece -q 2>&1 | tail -1 +echo " Core packages OK" + +# ------------------------------------------------------------------------------- +# 3. zstandard (CRITICAL: prevents artifact size inflation) +# ------------------------------------------------------------------------------- +echo "" +echo "[3/5] zstandard..." + +if python3 -c "import zstandard" 2>/dev/null; then + echo " Already installed" +else + pip install zstandard -q + echo " Installed" +fi +python3 -c "import zstandard; print(f' zstandard {zstandard.__version__}')" + +# ------------------------------------------------------------------------------- +# 4. FlashAttention-3 (into system site-packages -- no PYTHONPATH needed) +# ------------------------------------------------------------------------------- +echo "" +echo "[4/5] FlashAttention-3..." + +install_fa3() { + echo " Attempting FA3 abi3 wheel..." + if pip install --no-cache-dir \ + "https://download.pytorch.org/whl/cu128/flash_attn_3-3.0.0-cp39-abi3-manylinux_2_28_x86_64.whl" \ + 2>&1 | tail -3; then + return 0 + fi + + echo " abi3 wheel failed, trying cu124..." + if pip install --no-cache-dir \ + "https://download.pytorch.org/whl/cu124/flash_attn_3-3.0.0-cp39-abi3-manylinux_2_28_x86_64.whl" \ + 2>&1 | tail -3; then + return 0 + fi + + echo " Wheels failed. Checking for local flash-attention/hopper source..." + if [ -d "${REPO_ROOT}/flash-attention/hopper" ]; then + # Symlink the hopper interface into site-packages so it's always importable + SITE=$(python3 -c "import site; print(site.getsitepackages()[0])") + SRC="${REPO_ROOT}/flash-attention/hopper/flash_attn_interface.py" + if [ -f "$SRC" ]; then + ln -sf "$SRC" "${SITE}/flash_attn_interface.py" + echo " Symlinked flash_attn_interface.py into site-packages" + return 0 + fi + fi + + echo " WARNING: Could not install FA3. Will fall back to PyTorch SDPA." + return 1 +} + +# Check if FA3 already works +if python3 -c "from flash_attn_interface import flash_attn_func; print(' FA3 (flash_attn_interface) OK')" 2>/dev/null; then + : # already good +elif python3 -c "import flash_attn; v=flash_attn.__version__; assert v.startswith('3'); print(f' FA3 v{v} OK')" 2>/dev/null; then + : # flash_attn v3 package works +else + install_fa3 +fi + +# ------------------------------------------------------------------------------- +# 5. Dataset (sp1024) +# ------------------------------------------------------------------------------- +echo "" +echo "[5/5] FineWeb dataset (sp1024)..." + +TRAIN_COUNT=$(ls "${REPO_ROOT}/data/datasets/fineweb10B_sp1024/fineweb_train_"*.bin 2>/dev/null | wc -l) +VAL_COUNT=$(ls "${REPO_ROOT}/data/datasets/fineweb10B_sp1024/fineweb_val_"*.bin 2>/dev/null | wc -l) + +if [ "$TRAIN_COUNT" -ge 10 ]; then + echo " Already have $TRAIN_COUNT train / $VAL_COUNT val shards" +else + echo " Downloading ($TRAIN_COUNT train shards found, need 10+)..." + if command -v huggingface-cli &>/dev/null; then + huggingface-cli download sproos/parameter-golf-tokenizers \ + --include "datasets/fineweb10B_sp1024/*" --local-dir "${REPO_ROOT}/data" + else + python3 -c " +from huggingface_hub import snapshot_download +snapshot_download('sproos/parameter-golf-tokenizers', + allow_patterns='datasets/fineweb10B_sp1024/*', + local_dir='${REPO_ROOT}/data') +" + fi + echo " Downloaded" +fi + +# ------------------------------------------------------------------------------- +# Verification +# ------------------------------------------------------------------------------- +echo "" +echo "============================================" +echo " Verification" +echo "============================================" + +python3 - << 'PYEOF' +import sys, os + +print(f"Python : {sys.version.split()[0]}") +print(f"Executable : {sys.executable}") + +import torch +print(f"PyTorch : {torch.__version__}") +print(f"CUDA avail : {torch.cuda.is_available()}") +print(f"GPUs : {torch.cuda.device_count()}") + +# FA3 +fa = "NOT FOUND" +try: + from flash_attn_interface import flash_attn_func + fa = "flash_attn_interface (FA3 hopper)" +except ImportError: + try: + import flash_attn + v = flash_attn.__version__ + fa = f"flash_attn v{v}" + ("" if v.startswith("3") else " WARNING: not FA3!") + except ImportError: + pass +print(f"FlashAttn : {fa}") + +# zstandard +try: + import zstandard + print(f"zstandard : {zstandard.__version__}") +except ImportError: + print("zstandard : MISSING!") + +# sentencepiece +try: + import sentencepiece + print(f"sentencepiece: OK") +except ImportError: + print("sentencepiece: MISSING!") + +import glob +train = sorted(glob.glob("./data/datasets/fineweb10B_sp1024/fineweb_train_*.bin")) +val = sorted(glob.glob("./data/datasets/fineweb10B_sp1024/fineweb_val_*.bin")) +print(f"Train shards : {len(train)}") +print(f"Val shards : {len(val)}") +PYEOF + +echo "" +echo "============================================" +echo " Setup complete. No conda needed." +echo " Just run your experiment directly:" +echo " bash experiments/A_wing/green_2/run.sh" +echo "============================================" diff --git a/logs/awing_green1_s1337_SOTA_0.3200_20260326.log b/logs/awing_green1_s1337_SOTA_0.3200_20260326.log new file mode 100644 index 000000000..ab47fa8ba --- /dev/null +++ b/logs/awing_green1_s1337_SOTA_0.3200_20260326.log @@ -0,0 +1,103 @@ +============================================ + A-WING GREEN_1 — Oracle Alpha + 9-Prime + Seed: 1337 + Oracle: alpha = sigmoid(8 * log(ngram_p/model_p)) * 0.95 + 9 hash primes, INT6, no cubric +============================================ +W0326 16:51:37.978000 2333 torch/distributed/run.py:803] +W0326 16:51:37.978000 2333 torch/distributed/run.py:803] ***************************************** +W0326 16:51:37.978000 2333 torch/distributed/run.py:803] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0326 16:51:37.978000 2333 torch/distributed/run.py:803] ***************************************** +logs/9e0b55e3-b289-49a7-8d84-bc77c070b90a.txt +val_bpb:enabled tokenizer_kind=sentencepiece tokenizer_path=./data/tokenizers/fineweb_1024_bpe.model +train_loader:dataset:fineweb10B_sp1024 train_shards:80 +val_loader:shards pattern=./data/datasets/fineweb10B_sp1024/fineweb_val_*.bin tokens:62021632 +complementary_training:alpha=0.5 +model_params:26928220 +f1_corr:rank=0 params=0 est_int6_bytes~0 +mlp_act:leaky_relu_sq mlp_leaky_slope:0.5 +XSA:last_4 world_size:8 grad_accum_steps:1 +num_heads:8 num_kv_heads:4 embed_lr:0.035 matrix_lr:0.025 +train_batch_tokens:786432 train_seq_len:2048 iterations:20000 warmup_steps:20 max_wallclock_seconds:600.000 +compile:enabled=1 fullgraph=0 +seed:1337 +ngram_eval:order=9 alpha=0.3 min_count=2 buckets=8388608 +warmup_step:1/20 +warmup_step:2/20 +warmup_step:3/20 +warmup_step:4/20 +warmup_step:5/20 +warmup_step:6/20 +warmup_step:7/20 +warmup_step:8/20 +warmup_step:9/20 +warmup_step:10/20 +warmup_step:11/20 +warmup_step:12/20 +warmup_step:13/20 +warmup_step:14/20 +warmup_step:15/20 +warmup_step:16/20 +warmup_step:17/20 +warmup_step:18/20 +warmup_step:19/20 +warmup_step:20/20 +step:0/20000 val_loss:6.9317 val_bpb:4.1054 train_time:0ms step_avg:0.02ms +step:1/20000 train_loss:6.9343 train_time:149ms step_avg:149.31ms +step:2/20000 train_loss:8.6212 train_time:235ms step_avg:117.59ms +step:3/20000 train_loss:7.8209 train_time:321ms step_avg:106.97ms +step:4/20000 train_loss:7.1066 train_time:408ms step_avg:102.01ms +step:5/20000 train_loss:6.8531 train_time:494ms step_avg:98.71ms +step:6/20000 train_loss:6.7962 train_time:579ms step_avg:96.48ms +step:7/20000 train_loss:6.6786 train_time:664ms step_avg:94.92ms +step:8/20000 train_loss:6.5598 train_time:750ms step_avg:93.71ms +step:9/20000 train_loss:6.2554 train_time:835ms step_avg:92.81ms +step:10/20000 train_loss:5.9363 train_time:922ms step_avg:92.17ms +step:1000/20000 train_loss:2.2320 train_time:87799ms step_avg:87.80ms +step:2000/20000 train_loss:2.0292 train_time:175758ms step_avg:87.88ms +step:3000/20000 train_loss:2.1252 train_time:263741ms step_avg:87.91ms +step:4000/20000 train_loss:1.9367 train_time:351668ms step_avg:87.92ms +step:5000/20000 train_loss:2.0675 train_time:439641ms step_avg:87.93ms +late_qat:enabled step:5075 scale:0.4998 +step:6000/20000 train_loss:1.9055 train_time:527503ms step_avg:87.92ms +swa:start step:6200 +step:6823/20000 val_loss:1.9221 val_bpb:1.1384 train_time:600069ms step_avg:87.95ms +stopping_early: wallclock_cap train_time:600069ms step:6823/20000 +peak memory allocated: 20677 MiB reserved: 20718 MiB +gptq:calibrating with training data... +gptq:calibrated 68 layers in 3.6s +ema:applying EMA weights +DIAGNOSTIC post_ema val_loss:1.9204 val_bpb:1.1374 eval_time:2072ms +Serialized model: 106047497 bytes +Code size: 104216 bytes +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +Serialized model int6+zlib: 16983835 bytes +Total submission size int6+zlib: 17088051 bytes +Total submission size int8+zlib: 17088051 bytes +final_int6_roundtrip val_loss:1.9303 val_bpb:1.1432 eval_time:37064ms +final_int6_roundtrip_exact val_loss:1.93029861 val_bpb:1.14323156 +final_int6_sliding_window val_loss:1.8902 val_bpb:1.1195 stride:64 eval_time:96637ms +final_int6_sliding_window_exact val_loss:1.89018465 val_bpb:1.11947678 +final_int8_zlib_roundtrip_exact val_loss:1.89018465 val_bpb:1.11947678 +ngram_eval:chunks=60 chunk_tokens=1048576 windows=969088 shared_tables=True +ngram_eval:chunk [1/60] bpb=1.131515 t=15s +ngram_eval:chunk [2/60] bpb=1.073748 t=18s +ngram_eval:chunk [3/60] bpb=1.033717 t=21s +ngram_eval:chunk [11/60] bpb=0.824230 t=43s +ngram_eval:chunk [21/60] bpb=0.609142 t=69s +ngram_eval:chunk [31/60] bpb=0.483174 t=93s +ngram_eval:chunk [41/60] bpb=0.403381 t=117s +ngram_eval:chunk [51/60] bpb=0.351530 t=141s +ngram_eval:chunk [60/60] bpb=0.320066 t=172s +final_int6_sliding_window_ngram9 val_loss:0.5404 val_bpb=0.3200 eval_time:175590ms +final_int6_sliding_window_ngram9_exact val_loss:0.54037046 val_bpb:0.32003867 +============================================ + DONE +============================================ \ No newline at end of file diff --git a/records/track_10min_16mb/2026-03-26_XWING_Cubric3D_complementary_8xH100/README.md b/records/track_10min_16mb/2026-03-26_XWING_Cubric3D_complementary_8xH100/README.md new file mode 100644 index 000000000..8cdb451ec --- /dev/null +++ b/records/track_10min_16mb/2026-03-26_XWING_Cubric3D_complementary_8xH100/README.md @@ -0,0 +1,113 @@ +# X-WING: 3D Cubric + Complementary Training + +**val_bpb: 0.4820** (3-seed mean, std 0.0002) | **15.58 MB** | 8xH100 SXM + +## Results + +| Seed | val_bpb | Sliding Window BPB | Steps | Train Time | Eval Time | Artifact | +|------|--------:|-------------------:|------:|-----------:|----------:|---------:| +| 1337 | 0.4818 | 1.1196 | 6822 | 600s | 202s | 15.58 MB | +| 300 | 0.4821 | 1.1196 | 6814 | 600s | 204s | 15.66 MB | +| 58 | 0.4821 | 1.1206 | 6822 | 600s | 203s | 15.59 MB | +| **Mean** | **0.4820** | **1.1199** | — | — | — | — | +| **Std** | **0.0002** | — | — | — | — | — | + +## Key Innovations + +Two novel techniques stacked on shared n-gram tables: + +### 1. 3D Cubric Pattern Recognizer (original) + +54 adaptive multipliers across three dimensions: **(order x entropy_bin x count_bin)**. Each cell independently tracks how often the n-gram prediction beats the model for that specific regime and adjusts its alpha multiplier accordingly. + +This captures patterns invisible to 1D (per-order-only) scaling: +- "order 7 at mid-entropy with high count -> trust fully (2.0x)" +- "order 3 at any entropy -> suppress (0.30x)" +- "order 5 at mid-entropy -> trust strongly (1.9x)" + +**Warm-start**: multipliers initialize at proven converged values from prior runs instead of 1.0. Full power from chunk 1 instead of wasting ~30 of 60 chunks converging. + +Warm-start initialization: +``` +o2: 0.45 o3: 0.30 o4: 0.45 o5: 1.88 o6: 2.00 o7: 2.00 o8: 2.00 o9: 2.00 +``` + +Final converged 3D grid (9 cells per order = 3 entropy bins x 3 count bins): +``` + o2: [0.44 0.40 0.30 | 0.45 0.41 0.30 | 0.45 0.45 0.33] + o3: [0.30 0.30 0.30 | 0.30 0.30 0.30 | 0.32 0.30 0.30] + o4: [0.45 0.30 0.30 | 0.66 0.45 0.30 | 0.57 0.72 0.40] + o5: [1.67 0.90 0.91 | 1.94 1.94 0.99 | 2.00 2.00 2.00] + o6: [1.82 0.71 0.96 | 2.00 1.94 1.16 | 2.00 2.00 2.00] + o7: [1.66 0.45 1.05 | 2.00 2.00 1.39 | 2.00 2.00 2.00] + o8: [2.00 0.37 0.75 | 2.00 2.00 1.19 | 2.00 2.00 2.00] + o9: [2.00 0.40 0.52 | 2.00 2.00 0.51 | 2.00 2.00 2.00] +``` + +Key insight: low-order n-grams (2-3) are suppressed across all cells, mid-order (4) has mixed signals, high-order (5-9) are trusted in mid/high-entropy regimes. The cubric learns this automatically through beat-rate tracking. + +### 2. Complementary Training (adapted from PR #803) + +During training, tokens predictable by bigram statistics receive lower loss weight (`COMPLEMENT_ALPHA=0.5`). A GPU-resident bigram count table (`vocab_size x vocab_size`) tracks `P(y|x)` from training data. The per-token loss weight is: + +``` +weight = clamp(1.0 - 0.5 * P_bigram(y|x), min=0.1) +``` + +The model specializes on tokens n-grams can't predict -- novel word choices, long-range dependencies, semantic surprises. This enables higher eval-time n-gram alpha (20-75% vs 5-70%) because the model is deliberately weak where n-grams are strong. + +## Eval Stack + +- **SharedNgramTable**: chunk-based shared tables -- all 8 GPU ranks update with the same tokens, giving every rank the full 62M-token picture +- **Backoff cascade**: orders 2-9, 8M flat hash buckets, greedy (highest matching order wins) +- **Entropy-adaptive alpha**: `alpha_min + (alpha_max - alpha_min) * sigmoid(scale * (H - center))` with `alpha_min=0.20, alpha_max=0.75, center=3.0, scale=2.0` +- **3D Cubric**: per-token alpha scaled by `cubric_mult[order][ent_bin][cnt_bin]` +- **Score-first**: entire chunk scored BEFORE tokens update tables +- **GPTQ int6+zstd**: quantization runs inside training wallclock +- **Sliding window**: stride=64 + +## Ablation (single night of development) + +| Variant | BPB | Delta | Key change | +|---------|----:|------:|------------| +| Podracer III (#782) | 0.9362 | -- | rank-local tables | +| X-WING v1 (#800) | 0.5644 | -0.372 | shared tables + 1D cubric (6 multipliers) | +| X-WING Yellow II | 0.4896 | -0.075 | 3D cubric (54 mults) + complementary training | +| **X-WING (this)** | **0.4818** | **-0.008** | + warm-start cubric initialization | + +## Legality + +1. **Score-first protocol**: entire chunk scored BEFORE its tokens update the n-gram tables. No future-looking. +2. **Complementary training**: uses only training-data bigram statistics. No validation data during training. The bigram table is built from `(x, y)` pairs in the training stream only. +3. **Alpha formula**: `(1-a)*P_neural + a*P_ngram` where a is a fixed function of model entropy x cubric multipliers. Target-independent, committed before scoring each token. +4. **Cubric multipliers**: adapt using beat-rate statistics from already-scored tokens (backward-looking only). Updated every 32 chunks. +5. **Warm-start values**: derived from a prior training run's convergence, not from validation data. Equivalent to a hyperparameter choice. +6. **No oracle selection**: single committed mixture, no min-NLL comparison. +7. **GPTQ calibration**: runs inside training wallclock. +8. **Committed distribution**: proper mixture, all tokens have nonzero probability. + +## Timing Budget + +| Phase | Time | Notes | +|-------|-----:|-------| +| Training | 600s | 6822 steps on 8xH100 SXM | +| GPTQ quantization | ~3.4s | Inside training wallclock | +| N-gram table build + eval | ~202s | Shared tables, 8M buckets, orders 2-9 | +| **Total** | **~802s** | Training + eval | + +## Credits & Acknowledgments + +- **Complementary training concept**: @travispchen (PR #803) -- the insight that reweighting training loss by bigram predictability enables higher eval-time n-gram weight +- **Shared n-gram table insight**: @deanbrr (PR #779) -- all-rank shared tables instead of rank-local +- **N-gram eval cache**: @deanbrr (PR #659) -- flat hash table design +- **Multi-order backoff + adaptive alpha**: @Asukabot0 (PR #727) -- entropy-adaptive blending +- **3D Cubric pattern recognizer + warm-start**: @newjordan (original) +- **Base architecture**: @signalrush (PR #414) + +## Reproduce + +```bash +SEED=1337 NPROC_PER_NODE=8 bash concepts/xwing_yellow_III/run.sh +``` + +8xH100 SXM, 600s training + ~202s eval. diff --git a/records/track_10min_16mb/2026-03-26_XWING_Cubric3D_complementary_8xH100/run.sh b/records/track_10min_16mb/2026-03-26_XWING_Cubric3D_complementary_8xH100/run.sh new file mode 100755 index 000000000..caa10be2d --- /dev/null +++ b/records/track_10min_16mb/2026-03-26_XWING_Cubric3D_complementary_8xH100/run.sh @@ -0,0 +1,55 @@ +#!/bin/bash +set -euo pipefail +# X-WING YELLOW III: Yellow II + warm-start cubric +# Warm-start: initialize multipliers at proven converged values, not 1.0 +# Full power from chunk 1 instead of wasting 30 chunks converging + +SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd -- "${SCRIPT_DIR}/../.." && pwd)" +cd "${REPO_ROOT}" +export PYTHONPATH="${REPO_ROOT}/flash-attention/hopper:${PYTHONPATH:-}" + +SEED="${SEED:-2045}" +NPROC_PER_NODE="${NPROC_PER_NODE:-8}" + +echo "============================================" +echo " X-WING YELLOW II — THE MONSTER" +echo " Seed: ${SEED}" +echo " 3D cubric: order × entropy × count (54 mults)" +echo " Complementary training: alpha=0.5" +echo " Eval alpha: 0.20-0.75 | Orders: 2-9" +echo "============================================" + +SEED="$SEED" \ +F1_CORR_RANK=0 \ +DISTILL_ENABLED=0 \ +MLP_ACT=leaky_relu_sq \ +MLP_LEAKY_SLOPE=0.5 \ +XSA_LAST_N=4 \ +BIGRAM_VOCAB_SIZE=1536 \ +TTT_EVAL_ENABLED=0 \ +ROPE_DIMS=24 \ +VAL_LOSS_EVERY=20000 \ +TRAIN_LOG_EVERY=1000 \ +SWA_EVERY=100 \ +COMPLEMENT_ALPHA=0.5 \ +NGRAM_EVAL_ORDER=9 \ +NGRAM_EVAL_MIN_ORDER=2 \ +NGRAM_EVAL_ADAPTIVE=1 \ +NGRAM_EVAL_ALPHA=0.30 \ +NGRAM_EVAL_ALPHA_MIN=0.20 \ +NGRAM_EVAL_ALPHA_MAX=0.75 \ +NGRAM_EVAL_ENTROPY_CENTER=3.0 \ +NGRAM_EVAL_ENTROPY_SCALE=2.0 \ +NGRAM_EVAL_MIN_COUNT=2 \ +NGRAM_EVAL_BUCKETS=8388608 \ +NGRAM_EVAL_MAX_SECONDS=300 \ +CUBRIC_CADENCE="${CUBRIC_CADENCE:-32}" \ +COMPILE_FULLGRAPH=0 \ +torchrun --standalone --nproc_per_node="${NPROC_PER_NODE}" \ + "${SCRIPT_DIR}/train_gpt.py" \ + 2>&1 | tee "logs/xwing_yellow2_s${SEED}_$(date +%Y%m%d_%H%M%S).log" + +echo "============================================" +echo " DONE" +echo "============================================" diff --git a/records/track_10min_16mb/2026-03-26_XWING_Cubric3D_complementary_8xH100/submission.json b/records/track_10min_16mb/2026-03-26_XWING_Cubric3D_complementary_8xH100/submission.json new file mode 100644 index 000000000..0339badfb --- /dev/null +++ b/records/track_10min_16mb/2026-03-26_XWING_Cubric3D_complementary_8xH100/submission.json @@ -0,0 +1,41 @@ +{ + "author": "Frosty40", + "github_id": "newjordan", + "name": "X-WING: 3D Cubric + Complementary Training", + "blurb": "Shared n-gram tables + 3D cubric pattern recognizer (54 warm-started adaptive multipliers: order x entropy_bin x count_bin) + complementary training (downweight bigram-predictable tokens). Orders 2-9, alpha 0.20-0.75. 3-seed mean val_bpb=0.4820 (std 0.0002).", + "date": "2026-03-26T05:00:00Z", + "seed_1337": { + "val_bpb": 0.4818, + "val_bpb_exact": 0.48176787, + "sliding_window_bpb": 1.1196, + "sliding_window_bpb_exact": 1.11962844, + "post_ema_bpb": 1.1376, + "steps": 6822, + "train_time_s": 600, + "eval_time_s": 202 + }, + "seed_300": { + "val_bpb": 0.4821, + "val_bpb_exact": 0.48211332, + "sliding_window_bpb": 1.1196, + "sliding_window_bpb_exact": 1.11956294, + "post_ema_bpb": 1.1375, + "steps": 6814, + "train_time_s": 600, + "eval_time_s": 204 + }, + "seed_58": { + "val_bpb": 0.4821, + "val_bpb_exact": 0.48207518, + "sliding_window_bpb": 1.1206, + "sliding_window_bpb_exact": 1.12060881, + "post_ema_bpb": 1.1386, + "steps": 6822, + "train_time_s": 600, + "eval_time_s": 203 + }, + "val_bpb": 0.4820, + "bytes_total": 15581439, + "bytes_code": 104697, + "hardware": "8xH100 SXM" +} diff --git a/records/track_10min_16mb/2026-03-26_XWING_Cubric3D_complementary_8xH100/train_gpt.py b/records/track_10min_16mb/2026-03-26_XWING_Cubric3D_complementary_8xH100/train_gpt.py new file mode 100644 index 000000000..090eb575c --- /dev/null +++ b/records/track_10min_16mb/2026-03-26_XWING_Cubric3D_complementary_8xH100/train_gpt.py @@ -0,0 +1,2118 @@ +from __future__ import annotations +import copy +import glob +import io +import math +import os +import random +import subprocess +import sys +import time +import uuid +import zlib +from pathlib import Path +try: + import zstandard + _COMPRESSOR = "zstd" +except ImportError: + _COMPRESSOR = "zlib" +import numpy as np +import sentencepiece as spm +import torch +import torch.distributed as dist +import torch.nn.functional as F +from torch import Tensor, nn +from torch.nn.parallel import DistributedDataParallel as DDP +try: + from flash_attn_interface import flash_attn_func as flash_attn_3_func +except ImportError: + def flash_attn_3_func(q, k, v, causal=False): + # q: (B, T, Hq, D), k/v: (B, T, Hkv, D) — expand KV for GQA + q2 = q.transpose(1, 2) # (B, Hq, T, D) + k2 = k.transpose(1, 2) # (B, Hkv, T, D) + v2 = v.transpose(1, 2) + if k2.size(1) != q2.size(1): + rep = q2.size(1) // k2.size(1) + k2 = k2.repeat_interleave(rep, dim=1) + v2 = v2.repeat_interleave(rep, dim=1) + out = torch.nn.functional.scaled_dot_product_attention(q2, k2, v2, is_causal=causal) + return out.transpose(1, 2) +class Hyperparameters: + data_path = os.environ.get("DATA_PATH", "./data/datasets/fineweb10B_sp1024") + train_files = os.path.join(data_path, "fineweb_train_*.bin") + val_files = os.path.join(data_path, "fineweb_val_*.bin") + tokenizer_path = os.environ.get("TOKENIZER_PATH", "./data/tokenizers/fineweb_1024_bpe.model") + run_id = os.environ.get("RUN_ID", str(uuid.uuid4())) + seed = int(os.environ.get("SEED", 1337)) + val_batch_size = int(os.environ.get("VAL_BATCH_SIZE", 524_288)) + val_loss_every = int(os.environ.get("VAL_LOSS_EVERY", 4000)) + train_log_every = int(os.environ.get("TRAIN_LOG_EVERY", 500)) + iterations = int(os.environ.get("ITERATIONS", 20000)) + warmdown_iters = int(os.environ.get("WARMDOWN_ITERS", 3500)) + warmup_steps = int(os.environ.get("WARMUP_STEPS", 20)) + train_batch_tokens = int(os.environ.get("TRAIN_BATCH_TOKENS", 786_432)) + train_seq_len = int(os.environ.get("TRAIN_SEQ_LEN", 2048)) + eval_seq_len = int(os.environ.get("EVAL_SEQ_LEN", 2048)) + max_wallclock_seconds = float(os.environ.get("MAX_WALLCLOCK_SECONDS", 600.0)) + qk_gain_init = float(os.environ.get("QK_GAIN_INIT", 1.5)) + vocab_size = int(os.environ.get("VOCAB_SIZE", 1024)) + num_layers = int(os.environ.get("NUM_LAYERS", 11)) + num_kv_heads = int(os.environ.get("NUM_KV_HEADS", 4)) + model_dim = int(os.environ.get("MODEL_DIM", 512)) + num_heads = int(os.environ.get("NUM_HEADS", 8)) + mlp_mult = float(os.environ.get("MLP_MULT", 3.0)) + mlp_act = os.environ.get("MLP_ACT", "relu_sq").lower() + mlp_leaky_slope = float(os.environ.get("MLP_LEAKY_SLOPE", 0.5)) + tie_embeddings = bool(int(os.environ.get("TIE_EMBEDDINGS", "1"))) + rope_base = float(os.environ.get("ROPE_BASE", 10000.0)) + logit_softcap = float(os.environ.get("LOGIT_SOFTCAP", 30.0)) + embed_lr = float(os.environ.get("EMBED_LR", 0.6)) + head_lr = float(os.environ.get("HEAD_LR", 0.008)) + tied_embed_lr = float(os.environ.get("TIED_EMBED_LR", 0.035)) + tied_embed_init_std = float(os.environ.get("TIED_EMBED_INIT_STD", 0.005)) + matrix_lr = float(os.environ.get("MATRIX_LR", 0.025)) + scalar_lr = float(os.environ.get("SCALAR_LR", 0.025)) + muon_momentum = float(os.environ.get("MUON_MOMENTUM", 0.99)) + muon_backend_steps = int(os.environ.get("MUON_BACKEND_STEPS", 5)) + muon_momentum_warmup_start = float(os.environ.get("MUON_MOMENTUM_WARMUP_START", 0.92)) + muon_momentum_warmup_steps = int(os.environ.get("MUON_MOMENTUM_WARMUP_STEPS", 1500)) + beta1 = float(os.environ.get("BETA1", 0.9)) + beta2 = float(os.environ.get("BETA2", 0.95)) + adam_eps = float(os.environ.get("ADAM_EPS", 1e-8)) + grad_clip_norm = float(os.environ.get("GRAD_CLIP_NORM", 0.3)) + eval_stride = int(os.environ.get("EVAL_STRIDE", 64)) + mtp_num_heads = int(os.environ.get("MTP_NUM_HEADS", 0)) + mtp_loss_weight = float(os.environ.get("MTP_LOSS_WEIGHT", 0.2)) + muon_beta2 = float(os.environ.get("MUON_BETA2", 0.95)) + swa_enabled = bool(int(os.environ.get("SWA_ENABLED", "1"))) + swa_every = int(os.environ.get("SWA_EVERY", 50)) # tighter: collect more recent checkpoints + muon_wd = float(os.environ.get("MUON_WD", 0.04)) + adam_wd = float(os.environ.get("ADAM_WD", 0.04)) + qat_enabled = bool(int(os.environ.get("QAT_ENABLED", "0"))) + bigram_vocab_size = int(os.environ.get("BIGRAM_VOCAB_SIZE", 2048)) + bigram_dim = int(os.environ.get("BIGRAM_DIM", 128)) + xsa_last_n = int(os.environ.get("XSA_LAST_N", 11)) # XSA on ALL 11 layers + rope_dims = int(os.environ.get("ROPE_DIMS", 16)) + ln_scale = bool(int(os.environ.get("LN_SCALE", "1"))) + dtg_enabled = bool(int(os.environ.get("DTG_ENABLED", "0"))) + late_qat_threshold = float(os.environ.get("LATE_QAT_THRESHOLD", 0.5)) + ve_enabled = bool(int(os.environ.get("VE_ENABLED", "1"))) + ve_dim = int(os.environ.get("VE_DIM", 128)) + ve_layers = os.environ.get("VE_LAYERS", "9,10") + # F1 capacity add-on: low-rank correction head (active at inference). + # Approx extra params ~= rank * (model_dim + vocab_size). + f1_corr_rank = int(os.environ.get("F1_CORR_RANK", 0)) + f1_corr_scale_init = float(os.environ.get("F1_CORR_SCALE_INIT", 0.10)) + # Post-train self-distillation: EMA teacher -> student. + distill_enabled = bool(int(os.environ.get("DISTILL_ENABLED", "0"))) + distill_steps = int(os.environ.get("DISTILL_STEPS", 24)) + distill_lr_factor = float(os.environ.get("DISTILL_LR_FACTOR", 0.02)) + distill_temperature = float(os.environ.get("DISTILL_TEMPERATURE", 1.5)) + distill_alpha = float(os.environ.get("DISTILL_ALPHA", 0.60)) + distill_kl_clip = float(os.environ.get("DISTILL_KL_CLIP", 10.0)) + # Optional legal score-first hashed n-gram interpolation at eval time. + # Multi-order backoff (2..max_order) with entropy-adaptive alpha. + # Alpha depends only on model entropy (no target/label access). + ngram_eval_order = int(os.environ.get("NGRAM_EVAL_ORDER", 0)) # 0=off, max order for backoff + ngram_eval_min_order = int(os.environ.get("NGRAM_EVAL_MIN_ORDER", 2)) # min order for backoff + ngram_eval_alpha = float(os.environ.get("NGRAM_EVAL_ALPHA", 0.30)) # base alpha (or fixed if adaptive off) + ngram_eval_adaptive = bool(int(os.environ.get("NGRAM_EVAL_ADAPTIVE", "1"))) # entropy-adaptive alpha + ngram_eval_alpha_min = float(os.environ.get("NGRAM_EVAL_ALPHA_MIN", 0.05)) # alpha floor (confident model) + ngram_eval_alpha_max = float(os.environ.get("NGRAM_EVAL_ALPHA_MAX", 0.60)) # alpha ceiling (uncertain model) + ngram_eval_entropy_center = float(os.environ.get("NGRAM_EVAL_ENTROPY_CENTER", 4.0)) # sigmoid center + ngram_eval_entropy_scale = float(os.environ.get("NGRAM_EVAL_ENTROPY_SCALE", 2.0)) # sigmoid steepness + ngram_eval_min_count = int(os.environ.get("NGRAM_EVAL_MIN_COUNT", 2)) + ngram_eval_buckets = int(os.environ.get("NGRAM_EVAL_BUCKETS", 4_194_304)) + ngram_eval_max_seconds = float(os.environ.get("NGRAM_EVAL_MAX_SECONDS", 0.0)) + cubric_cadence = int(os.environ.get("CUBRIC_CADENCE", 0)) + compile_enabled = bool(int(os.environ.get("COMPILE_ENABLED", "1"))) + compile_fullgraph = bool(int(os.environ.get("COMPILE_FULLGRAPH", "1"))) +def maybe_torch_compile(obj, args: Hyperparameters): + if not args.compile_enabled: + return obj + return torch.compile(obj, dynamic=False, fullgraph=args.compile_fullgraph) +class TrainNgramTracker: + """Complementary training: track bigram stats, downweight tokens n-grams can predict.""" + def __init__(self, vocab_size: int, device: torch.device, complement_alpha: float = 0.5): + self.V = vocab_size + self.alpha = complement_alpha + self.bi_counts = torch.zeros(vocab_size, vocab_size, device=device, dtype=torch.float32) + self.bi_totals = torch.zeros(vocab_size, device=device, dtype=torch.float32) + @torch.no_grad() + def update(self, x: Tensor, y: Tensor): + xf = x.reshape(-1) + yf = y.reshape(-1) + ones = torch.ones(xf.numel(), device=xf.device, dtype=torch.float32) + self.bi_counts.reshape(-1).scatter_add_(0, xf * self.V + yf, ones) + self.bi_totals.scatter_add_(0, xf, ones) + def get_weights(self, x: Tensor, y: Tensor) -> Tensor: + xf = x.reshape(-1) + yf = y.reshape(-1) + total = self.bi_totals[xf] + count = self.bi_counts.reshape(-1)[xf * self.V + yf] + ngram_prob = count / (total + 1) + return (1.0 - self.alpha * ngram_prob).clamp(min=0.1) +def zeropower_via_newtonschulz5(G: Tensor, steps: int = 10, eps: float = 1e-7) -> Tensor: + a, b, c = (3.4445, -4.7750, 2.0315) + X = G.bfloat16() + X /= X.norm() + eps + transposed = G.size(0) > G.size(1) + if transposed: + X = X.T + for _ in range(steps): + A = X @ X.T + B = b * A + c * A @ A + X = a * X + B @ X + return X.T if transposed else X +class Muon(torch.optim.Optimizer): + def __init__(self, params, lr: float, momentum: float, backend_steps: int, + nesterov: bool = True, weight_decay: float = 0.0): + super().__init__( + params, + dict(lr=lr, momentum=momentum, backend_steps=backend_steps, + nesterov=nesterov, weight_decay=weight_decay), + ) + @torch.no_grad() + def step(self, closure=None): + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + distributed = dist.is_available() and dist.is_initialized() + world_size = dist.get_world_size() if distributed else 1 + rank = dist.get_rank() if distributed else 0 + for group in self.param_groups: + params = group["params"] + if not params: + continue + lr = group["lr"] + momentum = group["momentum"] + backend_steps = group["backend_steps"] + nesterov = group["nesterov"] + total_params = sum(int(p.numel()) for p in params) + updates_flat = torch.zeros(total_params, device=params[0].device, dtype=torch.bfloat16) + curr = 0 + for i, p in enumerate(params): + if i % world_size == rank and p.grad is not None: + g = p.grad + state = self.state[p] + if "momentum_buffer" not in state: + state["momentum_buffer"] = torch.zeros_like(g) + buf = state["momentum_buffer"] + buf.mul_(momentum).add_(g) + if nesterov: + g = g.add(buf, alpha=momentum) + g = zeropower_via_newtonschulz5(g, steps=backend_steps) + g *= max(1, g.size(0) / g.size(1)) ** 0.5 + updates_flat[curr : curr + p.numel()] = g.reshape(-1) + curr += p.numel() + if distributed: + dist.all_reduce(updates_flat, op=dist.ReduceOp.SUM) + wd = group.get("weight_decay", 0.0) + curr = 0 + for p in params: + if wd > 0.0: + p.data.mul_(1.0 - lr * wd) + g = updates_flat[curr : curr + p.numel()].view_as(p).to(dtype=p.dtype) + p.add_(g, alpha=-lr) + curr += p.numel() + return loss +def build_sentencepiece_luts( + sp: spm.SentencePieceProcessor, vocab_size: int, device: torch.device +) -> tuple[Tensor, Tensor, Tensor]: + sp_vocab_size = int(sp.vocab_size()) + table_size = max(sp_vocab_size, vocab_size) + base_bytes_np = np.zeros((table_size,), dtype=np.int16) + has_leading_space_np = np.zeros((table_size,), dtype=np.bool_) + is_boundary_token_np = np.ones((table_size,), dtype=np.bool_) + for token_id in range(sp_vocab_size): + if sp.is_control(token_id) or sp.is_unknown(token_id) or sp.is_unused(token_id): + continue + is_boundary_token_np[token_id] = False + if sp.is_byte(token_id): + base_bytes_np[token_id] = 1 + continue + piece = sp.id_to_piece(token_id) + if piece.startswith("▁"): + has_leading_space_np[token_id] = True + piece = piece[1:] + base_bytes_np[token_id] = len(piece.encode("utf-8")) + return ( + torch.tensor(base_bytes_np, dtype=torch.int16, device=device), + torch.tensor(has_leading_space_np, dtype=torch.bool, device=device), + torch.tensor(is_boundary_token_np, dtype=torch.bool, device=device), + ) +def load_validation_tokens(pattern: str, seq_len: int) -> Tensor: + files = [Path(p) for p in sorted(glob.glob(pattern))] + if not files: + raise FileNotFoundError(f"No files found for pattern: {pattern}") + tokens = torch.cat([load_data_shard(file) for file in files]).contiguous() + usable = ((tokens.numel() - 1) // seq_len) * seq_len + if usable <= 0: + raise ValueError(f"Validation split is too short for TRAIN_SEQ_LEN={seq_len}") + return tokens[: usable + 1] +def eval_val( + args: Hyperparameters, + model: nn.Module, + rank: int, + world_size: int, + device: torch.device, + grad_accum_steps: int, + val_tokens: Tensor, + base_bytes_lut: Tensor, + has_leading_space_lut: Tensor, + is_boundary_token_lut: Tensor, + eval_seq_len: int | None = None, +) -> tuple[float, float]: + seq_len = eval_seq_len or args.train_seq_len + local_batch_tokens = args.val_batch_size // (world_size * grad_accum_steps) + if local_batch_tokens < seq_len: + raise ValueError( + "VAL_BATCH_SIZE must provide at least one sequence per rank; " + f"got VAL_BATCH_SIZE={args.val_batch_size}, WORLD_SIZE={world_size}, " + f"GRAD_ACCUM_STEPS={grad_accum_steps}, seq_len={seq_len}" + ) + local_batch_seqs = local_batch_tokens // seq_len + total_seqs = (val_tokens.numel() - 1) // seq_len + seq_start = (total_seqs * rank) // world_size + seq_end = (total_seqs * (rank + 1)) // world_size + val_loss_sum = torch.zeros((), device=device, dtype=torch.float64) + val_token_count = torch.zeros((), device=device, dtype=torch.float64) + val_byte_count = torch.zeros((), device=device, dtype=torch.float64) + model.eval() + with torch.inference_mode(): + for batch_seq_start in range(seq_start, seq_end, local_batch_seqs): + batch_seq_end = min(batch_seq_start + local_batch_seqs, seq_end) + raw_start = batch_seq_start * seq_len + raw_end = batch_seq_end * seq_len + 1 + local = val_tokens[raw_start:raw_end].to(device=device, dtype=torch.int64, non_blocking=True) + x = local[:-1].reshape(-1, seq_len) + y = local[1:].reshape(-1, seq_len) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + batch_loss = model(x, y).detach() + batch_token_count = float(y.numel()) + val_loss_sum += batch_loss.to(torch.float64) * batch_token_count + val_token_count += batch_token_count + prev_ids = x.reshape(-1) + tgt_ids = y.reshape(-1) + token_bytes = base_bytes_lut[tgt_ids].to(dtype=torch.int16) + token_bytes += (has_leading_space_lut[tgt_ids] & ~is_boundary_token_lut[prev_ids]).to(dtype=torch.int16) + val_byte_count += token_bytes.to(torch.float64).sum() + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(val_loss_sum, op=dist.ReduceOp.SUM) + dist.all_reduce(val_token_count, op=dist.ReduceOp.SUM) + dist.all_reduce(val_byte_count, op=dist.ReduceOp.SUM) + val_loss = val_loss_sum / val_token_count + bits_per_token = val_loss.item() / math.log(2.0) + tokens_per_byte = val_token_count.item() / val_byte_count.item() + model.train() + return float(val_loss.item()), float(bits_per_token * tokens_per_byte) +CONTROL_TENSOR_NAME_PATTERNS = tuple( + pattern + for pattern in os.environ.get( + "CONTROL_TENSOR_NAME_PATTERNS", + "attn_scale,attn_scales,mlp_scale,mlp_scales,resid_mix,resid_mixes,q_gain,skip_weight,skip_weights,smear,dtg_gate,ve_layer_scales,ve_shared.scale", + ).split(",") + if pattern +) +INT8_KEEP_FLOAT_FP32_NAME_PATTERNS = tuple( + pattern + for pattern in os.environ.get( + "INT8_KEEP_FLOAT_FP32_NAME_PATTERNS", + ",".join(CONTROL_TENSOR_NAME_PATTERNS), + ).split(",") + if pattern +) +INT8_KEEP_FLOAT_MAX_NUMEL = 65_536 +INT8_KEEP_FLOAT_STORE_DTYPE = torch.float16 +INT8_PER_ROW_SCALE_DTYPE = torch.float16 +INT8_CLIP_PERCENTILE = 99.99984 +INT8_CLIP_Q = INT8_CLIP_PERCENTILE / 100.0 +def tensor_nbytes(t: Tensor) -> int: + return int(t.numel()) * int(t.element_size()) +def keep_float_tensor(name: str, t: Tensor, passthrough_orig_dtypes: dict[str, str]) -> Tensor: + if any(pattern in name for pattern in INT8_KEEP_FLOAT_FP32_NAME_PATTERNS): + return t.float().contiguous() + if t.dtype in {torch.float32, torch.bfloat16}: + passthrough_orig_dtypes[name] = str(t.dtype).removeprefix("torch.") + return t.to(dtype=INT8_KEEP_FLOAT_STORE_DTYPE).contiguous() + return t +def quantize_float_tensor(t: Tensor) -> tuple[Tensor, Tensor]: + t32 = t.float() + if t32.ndim == 2: + clip_abs = ( + torch.quantile(t32.abs(), INT8_CLIP_Q, dim=1) + if t32.numel() + else torch.empty((t32.shape[0],), dtype=torch.float32) + ) + clipped = torch.maximum(torch.minimum(t32, clip_abs[:, None]), -clip_abs[:, None]) + scale = (clip_abs / 127.0).clamp_min(1.0 / 127.0) + q = torch.clamp(torch.round(clipped / scale[:, None]), -127, 127).to(torch.int8).contiguous() + return q, scale.to(dtype=INT8_PER_ROW_SCALE_DTYPE).contiguous() + clip_abs = float(torch.quantile(t32.abs().flatten(), INT8_CLIP_Q).item()) if t32.numel() else 0.0 + scale = torch.tensor(clip_abs / 127.0 if clip_abs > 0 else 1.0, dtype=torch.float32) + q = torch.clamp(torch.round(torch.clamp(t32, -clip_abs, clip_abs) / scale), -127, 127).to(torch.int8).contiguous() + return q, scale +def quantize_state_dict_int8(state_dict: dict[str, Tensor]): + quantized: dict[str, Tensor] = {} + scales: dict[str, Tensor] = {} + dtypes: dict[str, str] = {} + passthrough: dict[str, Tensor] = {} + passthrough_orig_dtypes: dict[str, str] = {} + qmeta: dict[str, dict[str, object]] = {} + stats = dict.fromkeys( + ("param_count", "num_tensors", "num_float_tensors", "num_nonfloat_tensors", "baseline_tensor_bytes", "int8_payload_bytes"), + 0, + ) + for name, tensor in state_dict.items(): + t = tensor.detach().to("cpu").contiguous() + stats["param_count"] += int(t.numel()) + stats["num_tensors"] += 1 + stats["baseline_tensor_bytes"] += tensor_nbytes(t) + if not t.is_floating_point(): + stats["num_nonfloat_tensors"] += 1 + passthrough[name] = t + stats["int8_payload_bytes"] += tensor_nbytes(t) + continue + if t.numel() <= INT8_KEEP_FLOAT_MAX_NUMEL: + kept = keep_float_tensor(name, t, passthrough_orig_dtypes) + passthrough[name] = kept + stats["int8_payload_bytes"] += tensor_nbytes(kept) + continue + stats["num_float_tensors"] += 1 + q, s = quantize_float_tensor(t) + if s.ndim > 0: + qmeta[name] = {"scheme": "per_row", "axis": 0} + quantized[name] = q + scales[name] = s + dtypes[name] = str(t.dtype).removeprefix("torch.") + stats["int8_payload_bytes"] += tensor_nbytes(q) + tensor_nbytes(s) + obj: dict[str, object] = { + "__quant_format__": "int8_clean_per_row_v1", + "quantized": quantized, + "scales": scales, + "dtypes": dtypes, + "passthrough": passthrough, + } + if qmeta: + obj["qmeta"] = qmeta + if passthrough_orig_dtypes: + obj["passthrough_orig_dtypes"] = passthrough_orig_dtypes + return obj, stats +def dequantize_state_dict_int8(obj: dict[str, object]) -> dict[str, Tensor]: + out: dict[str, Tensor] = {} + qmeta = obj.get("qmeta", {}) + passthrough_orig_dtypes = obj.get("passthrough_orig_dtypes", {}) + for name, q in obj["quantized"].items(): + dtype = getattr(torch, obj["dtypes"][name]) + s = obj["scales"][name] + if qmeta.get(name, {}).get("scheme") == "per_row" or s.ndim > 0: + s = s.to(dtype=torch.float32) + out[name] = (q.float() * s.view(q.shape[0], *([1] * (q.ndim - 1)))).to(dtype=dtype).contiguous() + else: + scale = float(s.item()) + out[name] = (q.float() * scale).to(dtype=dtype).contiguous() + for name, t in obj["passthrough"].items(): + out_t = t.detach().to("cpu").contiguous() + orig_dtype = passthrough_orig_dtypes.get(name) + if isinstance(orig_dtype, str): + out_t = out_t.to(dtype=getattr(torch, orig_dtype)).contiguous() + out[name] = out_t + return out +def load_data_shard(file: Path) -> Tensor: + header_bytes = 256 * np.dtype(" None: + self.file_idx = (self.file_idx + 1) % len(self.files) + self.tokens = load_data_shard(self.files[self.file_idx]) + self.pos = 0 + def take(self, n: int) -> Tensor: + chunks: list[Tensor] = [] + remaining = n + while remaining > 0: + avail = self.tokens.numel() - self.pos + if avail <= 0: + self._advance_file() + continue + k = min(remaining, avail) + chunks.append(self.tokens[self.pos : self.pos + k]) + self.pos += k + remaining -= k + return chunks[0] if len(chunks) == 1 else torch.cat(chunks) +class DistributedTokenLoader: + def __init__(self, pattern: str, rank: int, world_size: int, device: torch.device): + self.rank = rank + self.world_size = world_size + self.device = device + self.stream = TokenStream(pattern) + def next_batch(self, global_tokens: int, seq_len: int, grad_accum_steps: int) -> tuple[Tensor, Tensor]: + local_tokens = global_tokens // (self.world_size * grad_accum_steps) + per_rank_span = local_tokens + 1 + chunk = self.stream.take(per_rank_span * self.world_size) + start = self.rank * per_rank_span + local = chunk[start : start + per_rank_span].to(dtype=torch.int64) + x = local[:-1].reshape(-1, seq_len) + y = local[1:].reshape(-1, seq_len) + return x.to(self.device, non_blocking=True), y.to(self.device, non_blocking=True) +class RMSNorm(nn.Module): + def __init__(self, eps: float | None = None): + super().__init__() + self.eps = eps + def forward(self, x: Tensor) -> Tensor: + return F.rms_norm(x, (x.size(-1),), eps=self.eps) +class CastedLinear(nn.Linear): + _qat_enabled: bool = False + def forward(self, x: Tensor) -> Tensor: + w = self.weight.to(x.dtype) + if CastedLinear._qat_enabled and self.training and w.ndim == 2: + with torch.no_grad(): + w32 = self.weight.float() + # Use 99.95th percentile clipping to match GPTQ export quantizer + row_clip = torch.quantile(w32.abs(), 0.9995, dim=1) + scale = (row_clip / 31.0).clamp_min(1.0 / 31.0) + w_q = (torch.clamp(torch.round(w32 / scale[:, None]), -32, 31) * scale[:, None]).to(x.dtype) + w = w + (w_q - w).detach() + bias = self.bias.to(x.dtype) if self.bias is not None else None + return F.linear(x, w, bias) +def restore_low_dim_params_to_fp32(module: nn.Module) -> None: + with torch.no_grad(): + for name, param in module.named_parameters(): + if (param.ndim < 2 or any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS)) and param.dtype != torch.float32: + param.data = param.data.float() +class Rotary(nn.Module): + def __init__(self, dim: int, base: float = 10000.0, train_seq_len: int = 1024, rope_dims: int = 0): + super().__init__() + self.dim = dim + self.base = base + self.train_seq_len = train_seq_len + self.rope_dims = rope_dims if rope_dims > 0 else dim + inv_freq = 1.0 / (base ** (torch.arange(0, self.rope_dims, 2, dtype=torch.float32) / self.rope_dims)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self._seq_len_cached = 0 + self._cos_cached: Tensor | None = None + self._sin_cached: Tensor | None = None + def forward(self, seq_len: int, device: torch.device, dtype: torch.dtype) -> tuple[Tensor, Tensor]: + if ( + self._cos_cached is None + or self._sin_cached is None + or self._seq_len_cached != seq_len + or self._cos_cached.device != device + ): + rd = self.rope_dims + if seq_len > self.train_seq_len: + scale = seq_len / self.train_seq_len + new_base = self.base * (scale ** (rd / (rd - 2))) + inv_freq = 1.0 / (new_base ** (torch.arange(0, rd, 2, dtype=torch.float32, device=device) / rd)) + else: + inv_freq = self.inv_freq.to(device) + t = torch.arange(seq_len, device=device, dtype=inv_freq.dtype) + freqs = torch.outer(t, inv_freq) + self._cos_cached = freqs.cos()[None, :, None, :] + self._sin_cached = freqs.sin()[None, :, None, :] + self._seq_len_cached = seq_len + return self._cos_cached.to(dtype=dtype), self._sin_cached.to(dtype=dtype) +def apply_rotary_emb(x: Tensor, cos: Tensor, sin: Tensor, rope_dims: int = 0) -> Tensor: + if rope_dims > 0 and rope_dims < x.size(-1): + x_rope, x_pass = x[..., :rope_dims], x[..., rope_dims:] + half = rope_dims // 2 + x1, x2 = x_rope[..., :half], x_rope[..., half:] + x_rope = torch.cat((x1 * cos + x2 * sin, x1 * (-sin) + x2 * cos), dim=-1) + return torch.cat((x_rope, x_pass), dim=-1) + half = x.size(-1) // 2 + x1, x2 = x[..., :half], x[..., half:] + return torch.cat((x1 * cos + x2 * sin, x1 * (-sin) + x2 * cos), dim=-1) +class CausalSelfAttention(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + num_kv_heads: int, + rope_base: float, + qk_gain_init: float, + ): + super().__init__() + if dim % num_heads != 0: + raise ValueError("model_dim must be divisible by num_heads") + if num_heads % num_kv_heads != 0: + raise ValueError("num_heads must be divisible by num_kv_heads") + self.num_heads = num_heads + self.num_kv_heads = num_kv_heads + self.head_dim = dim // num_heads + if self.head_dim % 2 != 0: + raise ValueError("head_dim must be even for RoPE") + kv_dim = self.num_kv_heads * self.head_dim + self.c_q = CastedLinear(dim, dim, bias=False) + self.c_k = CastedLinear(dim, kv_dim, bias=False) + self.c_v = CastedLinear(dim, kv_dim, bias=False) + self.proj = CastedLinear(dim, dim, bias=False) + self.proj._zero_init = True + self.q_gain = nn.Parameter(torch.full((num_heads,), qk_gain_init, dtype=torch.float32)) + self.rope_dims = 0 # set by GPT.__init__ for partial RoPE + self.rotary = Rotary(self.head_dim, base=rope_base, train_seq_len=1024) + self.use_xsa = False # set by GPT.__init__ for deep layers only + def _xsa_efficient(self, y: Tensor, v: Tensor) -> Tensor: + """Efficient XSA: subtract self-value projection via GQA-aware reshape (no repeat_interleave). + y: [B, T, H, D], v: [B, T, Hkv, D]. H must be divisible by Hkv.""" + B, T, H, D = y.shape + Hkv = v.size(-2) + group = H // Hkv + y_g = y.reshape(B, T, Hkv, group, D) # [B, T, Hkv, group, D] + vn = F.normalize(v, dim=-1).unsqueeze(-2) # [B, T, Hkv, 1, D] — broadcast ready + proj = (y_g * vn).sum(dim=-1, keepdim=True) * vn + return (y_g - proj).reshape(B, T, H, D) + def forward(self, x: Tensor, v_embed: Tensor | None = None) -> Tensor: + bsz, seqlen, dim = x.shape + q = self.c_q(x).reshape(bsz, seqlen, self.num_heads, self.head_dim) + k = self.c_k(x).reshape(bsz, seqlen, self.num_kv_heads, self.head_dim) + v = self.c_v(x) + if v_embed is not None: + v = v + v_embed + v = v.reshape(bsz, seqlen, self.num_kv_heads, self.head_dim) + q = F.rms_norm(q, (q.size(-1),)) + k = F.rms_norm(k, (k.size(-1),)) + cos, sin = self.rotary(seqlen, x.device, q.dtype) + q = apply_rotary_emb(q, cos, sin, self.rope_dims) + k = apply_rotary_emb(k, cos, sin, self.rope_dims) + q = q * self.q_gain.to(dtype=q.dtype)[None, None, :, None] + y = flash_attn_3_func(q, k, v, causal=True) + if self.use_xsa: + y = self._xsa_efficient(y, v) + y = y.reshape(bsz, seqlen, dim) + return self.proj(y) +class SmearGate(nn.Module): + def __init__(self, dim: int): + super().__init__() + self.gate = nn.Parameter(torch.zeros(dim, dtype=torch.float32)) + def forward(self, x: Tensor) -> Tensor: + g = torch.sigmoid(self.gate.to(dtype=x.dtype))[None, None, :] + x_prev = torch.cat([torch.zeros_like(x[:, :1]), x[:, :-1]], dim=1) + return (1 - g) * x + g * x_prev +class BigramHashEmbedding(nn.Module): + def __init__(self, bigram_vocab_size: int, bigram_dim: int, model_dim: int): + super().__init__() + self.bigram_vocab_size = bigram_vocab_size + self.embed = nn.Embedding(bigram_vocab_size, bigram_dim) + nn.init.zeros_(self.embed.weight) + self.proj = CastedLinear(bigram_dim, model_dim, bias=False) if bigram_dim != model_dim else None + if self.proj is not None: + nn.init.zeros_(self.proj.weight) + self.scale = nn.Parameter(torch.tensor(0.05, dtype=torch.float32)) + def bigram_hash(self, tokens: Tensor) -> Tensor: + t = tokens.to(torch.int32) + mod = self.bigram_vocab_size - 1 + out = torch.empty_like(t) + out[..., 0] = mod + out[..., 1:] = torch.bitwise_xor(36313 * t[..., 1:], 27191 * t[..., :-1]) % mod + return out.long() + def forward(self, token_ids: Tensor) -> Tensor: + h = self.embed(self.bigram_hash(token_ids)) + if self.proj is not None: + h = self.proj(h) + return h * self.scale.to(dtype=h.dtype) +class ValueEmbedding(nn.Module): + """Reinject token identity into attention values at specific layers. + Each table maps vocab tokens to a low-dim embedding, projected to model_dim.""" + def __init__(self, vocab_size: int, ve_dim: int, model_dim: int): + super().__init__() + self.embed = nn.Embedding(vocab_size, ve_dim) + nn.init.normal_(self.embed.weight, std=0.01) + self.proj = CastedLinear(ve_dim, model_dim, bias=False) if ve_dim != model_dim else None + if self.proj is not None: + nn.init.zeros_(self.proj.weight) + self.scale = nn.Parameter(torch.tensor(0.1, dtype=torch.float32)) + def forward(self, token_ids: Tensor) -> Tensor: + h = self.embed(token_ids) + if self.proj is not None: + h = self.proj(h) + return h * self.scale.to(dtype=h.dtype) +class MLP(nn.Module): + def __init__(self, dim: int, mlp_mult: int, mlp_act: str = "relu_sq", mlp_leaky_slope: float = 0.5): + super().__init__() + hidden = int(mlp_mult * dim) + self.fc = CastedLinear(dim, hidden, bias=False) + self.proj = CastedLinear(hidden, dim, bias=False) + self.proj._zero_init = True + self.mlp_act = mlp_act + self.mlp_leaky_slope = mlp_leaky_slope + if self.mlp_act not in {"relu_sq", "leaky_relu_sq"}: + raise ValueError(f"Unsupported MLP_ACT '{self.mlp_act}'. Use 'relu_sq' or 'leaky_relu_sq'.") + def forward(self, x: Tensor) -> Tensor: + x = self.fc(x) + if self.mlp_act == "leaky_relu_sq": + x = F.leaky_relu(x, negative_slope=self.mlp_leaky_slope) + else: + x = F.relu(x) + return self.proj(x.square()) +class Block(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + num_kv_heads: int, + mlp_mult: int, + rope_base: float, + qk_gain_init: float, + layer_idx: int = 0, + ln_scale: bool = False, + dtg: bool = False, + mlp_act: str = "relu_sq", + mlp_leaky_slope: float = 0.5, + ): + super().__init__() + self.attn_norm = RMSNorm() + self.mlp_norm = RMSNorm() + self.attn = CausalSelfAttention(dim, num_heads, num_kv_heads, rope_base, qk_gain_init) + self.mlp = MLP(dim, mlp_mult, mlp_act=mlp_act, mlp_leaky_slope=mlp_leaky_slope) + self.attn_scale = nn.Parameter(torch.ones(dim, dtype=torch.float32)) + self.mlp_scale = nn.Parameter(torch.ones(dim, dtype=torch.float32)) + self.resid_mix = nn.Parameter(torch.stack((torch.ones(dim), torch.zeros(dim))).float()) + self.ln_scale_factor = 1.0 / math.sqrt(layer_idx + 1) if ln_scale else 1.0 + if dtg: + self.dtg_gate = nn.Linear(dim, 1, bias=True) + nn.init.zeros_(self.dtg_gate.weight) + nn.init.constant_(self.dtg_gate.bias, 2.0) + else: + self.dtg_gate = None + def forward(self, x: Tensor, x0: Tensor, v_embed: Tensor | None = None) -> Tensor: + mix = self.resid_mix.to(dtype=x.dtype) + x_in = mix[0][None, None, :] * x + mix[1][None, None, :] * x0 + attn_out = self.attn(self.attn_norm(x_in) * self.ln_scale_factor, v_embed=v_embed) + x_out = x_in + self.attn_scale.to(dtype=x_in.dtype)[None, None, :] * attn_out + x_out = x_out + self.mlp_scale.to(dtype=x_out.dtype)[None, None, :] * self.mlp(self.mlp_norm(x_out) * self.ln_scale_factor) + if self.dtg_gate is not None: + gate = torch.sigmoid(self.dtg_gate(x_in.detach())) + x_out = x_in + gate * (x_out - x_in) + return x_out +class GPT(nn.Module): + def __init__( + self, + vocab_size: int, + num_layers: int, + model_dim: int, + num_heads: int, + num_kv_heads: int, + mlp_mult: int, + tie_embeddings: bool, + tied_embed_init_std: float, + logit_softcap: float, + rope_base: float, + qk_gain_init: float, + mtp_num_heads: int = 0, + mtp_loss_weight: float = 0.1, + bigram_vocab_size: int = 0, + bigram_dim: int = 128, + xsa_last_n: int = 0, + rope_dims: int = 0, + ln_scale: bool = False, + dtg: bool = False, + ve_enabled: bool = False, + ve_dim: int = 128, + ve_layers: str = "9,10", + mlp_act: str = "relu_sq", + mlp_leaky_slope: float = 0.5, + f1_corr_rank: int = 0, + f1_corr_scale_init: float = 0.10, + ): + super().__init__() + self._ve_target_dim = num_kv_heads * (model_dim // num_heads) # kv_dim for value projection + if logit_softcap <= 0.0: + raise ValueError(f"logit_softcap must be positive, got {logit_softcap}") + self.tie_embeddings = tie_embeddings + self.tied_embed_init_std = tied_embed_init_std + self.logit_softcap = logit_softcap + self.mtp_num_heads = mtp_num_heads + self.mtp_loss_weight = mtp_loss_weight + self.tok_emb = nn.Embedding(vocab_size, model_dim) + self.bigram = BigramHashEmbedding(bigram_vocab_size, bigram_dim, model_dim) if bigram_vocab_size > 0 else None + self.smear = SmearGate(model_dim) + self.num_encoder_layers = num_layers // 2 + self.num_decoder_layers = num_layers - self.num_encoder_layers + self.num_skip_weights = min(self.num_encoder_layers, self.num_decoder_layers) + self.skip_weights = nn.Parameter(torch.ones(self.num_skip_weights, model_dim, dtype=torch.float32)) + self.blocks = nn.ModuleList( + [ + Block( + model_dim, + num_heads, + num_kv_heads, + mlp_mult, + rope_base, + qk_gain_init, + layer_idx=i, + ln_scale=ln_scale, + dtg=dtg, + mlp_act=mlp_act, + mlp_leaky_slope=mlp_leaky_slope, + ) + for i in range(num_layers) + ] + ) + if rope_dims > 0: + head_dim = model_dim // num_heads + for block in self.blocks: + block.attn.rope_dims = rope_dims + block.attn.rotary = Rotary(head_dim, base=rope_base, train_seq_len=1024, rope_dims=rope_dims) + self.ve_layer_indices = [int(x) for x in ve_layers.split(",") if x.strip()] if ve_enabled else [] + kv_dim = self._ve_target_dim + if self.ve_layer_indices: + self.ve_shared = ValueEmbedding(vocab_size, ve_dim, kv_dim) + self.ve_layer_scales = nn.ParameterList( + [nn.Parameter(torch.ones(1, dtype=torch.float32)) for _ in self.ve_layer_indices] + ) + else: + self.ve_shared = None + self.ve_layer_scales = nn.ParameterList() + self.value_embeds = nn.ModuleList() # keep empty for compat + self.final_norm = RMSNorm() + self.lm_head = None if tie_embeddings else CastedLinear(model_dim, vocab_size, bias=False) + if self.lm_head is not None: + self.lm_head._zero_init = True + self.mtp_heads = nn.ModuleList( + [CastedLinear(model_dim, vocab_size, bias=False) for _ in range(mtp_num_heads)] + ) + for head in self.mtp_heads: + head._zero_init = True + # Low-rank correction path for extra capacity under size budget. + self.f1_corr_rank = f1_corr_rank + if f1_corr_rank > 0: + self.f1_corr_in = CastedLinear(model_dim, f1_corr_rank, bias=False) + self.f1_corr_out = CastedLinear(f1_corr_rank, vocab_size, bias=False) + self.f1_corr_out._zero_init = True + self.f1_corr_scale = nn.Parameter(torch.tensor(f1_corr_scale_init, dtype=torch.float32)) + else: + self.f1_corr_in = None + self.f1_corr_out = None + self.f1_corr_scale = None + if xsa_last_n > 0: + for i in range(max(0, num_layers - xsa_last_n), num_layers): + self.blocks[i].attn.use_xsa = True + self._init_weights() + def _init_weights(self) -> None: + if self.tie_embeddings: + nn.init.normal_(self.tok_emb.weight, mean=0.0, std=self.tied_embed_init_std) + num_layers = len(self.blocks) + for name, module in self.named_modules(): + if isinstance(module, nn.Linear): + if getattr(module, "_zero_init", False): + nn.init.zeros_(module.weight) + elif module.weight.ndim == 2 and module.weight.shape[0] >= 64 and module.weight.shape[1] >= 64: + nn.init.orthogonal_(module.weight, gain=1.0) + if ".proj." in name or name.endswith(".proj"): + with torch.no_grad(): + module.weight.mul_(1.0 / math.sqrt(2 * num_layers)) + def _get_ve(self, layer_idx: int, input_ids: Tensor, ve_cache: dict | None = None) -> Tensor | None: + """Get value embedding for a specific layer using shared table + per-layer scale.""" + if self.ve_shared is None or layer_idx not in self.ve_layer_indices: + return None + if ve_cache is not None and 've' not in ve_cache: + ve_cache['ve'] = self.ve_shared(input_ids) + ve_base = ve_cache['ve'] if ve_cache is not None else self.ve_shared(input_ids) + ve_idx = self.ve_layer_indices.index(layer_idx) + return ve_base * self.ve_layer_scales[ve_idx].to(dtype=ve_base.dtype) + def forward(self, input_ids: Tensor, target_ids: Tensor) -> Tensor: + x = self.tok_emb(input_ids) + if self.bigram is not None: + x = x + self.bigram(input_ids) + x = F.rms_norm(x, (x.size(-1),)) + x = self.smear(x) + x0 = x + skips: list[Tensor] = [] + ve_cache: dict = {} + for i in range(self.num_encoder_layers): + ve = self._get_ve(i, input_ids, ve_cache) + x = self.blocks[i](x, x0, v_embed=ve) + skips.append(x) + for i in range(self.num_decoder_layers): + bi = self.num_encoder_layers + i + if skips: + x = x + self.skip_weights[i].to(dtype=x.dtype)[None, None, :] * skips.pop() + ve = self._get_ve(bi, input_ids, ve_cache) + x = self.blocks[bi](x, x0, v_embed=ve) + x = self.final_norm(x) + x_flat = x.reshape(-1, x.size(-1)) + targets = target_ids.reshape(-1) + if self.tie_embeddings: + logits_proj = F.linear(x_flat, self.tok_emb.weight) + else: + if self.lm_head is None: + raise RuntimeError("lm_head is required when tie_embeddings=False") + logits_proj = self.lm_head(x_flat) + if self.f1_corr_in is not None and self.f1_corr_out is not None and self.f1_corr_scale is not None: + corr_hidden = F.silu(self.f1_corr_in(x_flat)) + corr_proj = self.f1_corr_out(corr_hidden) + logits_proj = logits_proj + self.f1_corr_scale.to(dtype=logits_proj.dtype) * corr_proj + logits = self.logit_softcap * torch.tanh(logits_proj / self.logit_softcap) + if hasattr(self, '_ngram_tracker') and self._ngram_tracker is not None and self.training: + per_tok_loss = F.cross_entropy(logits.float(), targets, reduction="none") + weights = self._ngram_tracker.get_weights(input_ids, target_ids) + main_loss = (per_tok_loss * weights).mean() + else: + main_loss = F.cross_entropy(logits.float(), targets, reduction="mean") + if self.training and self.mtp_num_heads > 0 and self.mtp_loss_weight > 0.0: + _, seqlen, dim = x.shape + mtp_loss_sum = x.new_zeros(()) + mtp_loss_count = 0 + for k, mtp_head in enumerate(self.mtp_heads): + valid_t = seqlen - (k + 1) + if valid_t <= 0: + continue + mtp_hidden = x[:, :valid_t, :].reshape(-1, dim) + mtp_targets = target_ids[:, k + 1 :].reshape(-1) + mtp_logits_proj = mtp_head(mtp_hidden) + mtp_logits = self.logit_softcap * torch.tanh(mtp_logits_proj / self.logit_softcap) + mtp_loss_sum = mtp_loss_sum + F.cross_entropy(mtp_logits.float(), mtp_targets, reduction="mean") + mtp_loss_count += 1 + if mtp_loss_count > 0: + main_loss = main_loss + self.mtp_loss_weight * (mtp_loss_sum / mtp_loss_count) + return main_loss + def forward_logits(self, input_ids: Tensor) -> Tensor: + """Return logits (bsz, seq_len, vocab) without computing loss.""" + x = self.tok_emb(input_ids) + if self.bigram is not None: + x = x + self.bigram(input_ids) + x = F.rms_norm(x, (x.size(-1),)) + x = self.smear(x) + x0 = x + skips: list[Tensor] = [] + ve_cache: dict = {} + for i in range(self.num_encoder_layers): + ve = self._get_ve(i, input_ids, ve_cache) + x = self.blocks[i](x, x0, v_embed=ve) + skips.append(x) + for i in range(self.num_decoder_layers): + bi = self.num_encoder_layers + i + if skips: + x = x + self.skip_weights[i].to(dtype=x.dtype)[None, None, :] * skips.pop() + ve = self._get_ve(bi, input_ids, ve_cache) + x = self.blocks[bi](x, x0, v_embed=ve) + x = self.final_norm(x) + if self.tie_embeddings: + logits_proj = F.linear(x, self.tok_emb.weight) + else: + logits_proj = self.lm_head(x) + if self.f1_corr_in is not None and self.f1_corr_out is not None and self.f1_corr_scale is not None: + corr_hidden = F.silu(self.f1_corr_in(x)) + corr_proj = self.f1_corr_out(corr_hidden) + logits_proj = logits_proj + self.f1_corr_scale.to(dtype=logits_proj.dtype) * corr_proj + return self.logit_softcap * torch.tanh(logits_proj / self.logit_softcap) +def eval_val_sliding( + args: Hyperparameters, + base_model: nn.Module, + rank: int, + world_size: int, + device: torch.device, + val_tokens: Tensor, + base_bytes_lut: Tensor, + has_leading_space_lut: Tensor, + is_boundary_token_lut: Tensor, + stride: int, + batch_seqs: int = 128, + eval_seq_len: int | None = None, +) -> tuple[float, float]: + """Sliding window evaluation: each token scored with maximum context.""" + seq_len = eval_seq_len or args.train_seq_len + total_tokens = val_tokens.numel() - 1 + window_starts = [ws for ws in range(0, total_tokens, stride) + if min(ws + seq_len, total_tokens) - ws >= 1] + total_windows = len(window_starts) + my_s = (total_windows * rank) // world_size + my_e = (total_windows * (rank + 1)) // world_size + my_windows = window_starts[my_s:my_e] + loss_sum = torch.zeros((), device=device, dtype=torch.float64) + token_count = torch.zeros((), device=device, dtype=torch.float64) + byte_count = torch.zeros((), device=device, dtype=torch.float64) + base_model.eval() + compiled_logits = maybe_torch_compile(base_model.forward_logits, args) + with torch.inference_mode(): + for bi in range(0, len(my_windows), batch_seqs): + batch_ws = my_windows[bi:bi + batch_seqs] + bsz = len(batch_ws) + x_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + y_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + wlens: list[int] = [] + for i, ws in enumerate(batch_ws): + end = min(ws + seq_len, total_tokens) + wlen = end - ws + wlens.append(wlen) + chunk = val_tokens[ws:end + 1].to(dtype=torch.int64, device=device) + x_batch[i, :wlen] = chunk[:-1] + y_batch[i, :wlen] = chunk[1:] + with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + logits = compiled_logits(x_batch) + nll = F.cross_entropy( + logits.reshape(-1, logits.size(-1)).float(), + y_batch.reshape(-1), + reduction="none", + ).reshape(bsz, seq_len) + for i, ws in enumerate(batch_ws): + wlen = wlens[i] + s = 0 if ws == 0 else max(wlen - stride, 0) + scored_nll = nll[i, s:wlen].to(torch.float64) + loss_sum += scored_nll.sum() + token_count += float(wlen - s) + tgt = y_batch[i, s:wlen] + prev = x_batch[i, s:wlen] + tb = base_bytes_lut[tgt].to(torch.float64) + tb += (has_leading_space_lut[tgt] & ~is_boundary_token_lut[prev]).to(torch.float64) + byte_count += tb.sum() + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(loss_sum, op=dist.ReduceOp.SUM) + dist.all_reduce(token_count, op=dist.ReduceOp.SUM) + dist.all_reduce(byte_count, op=dist.ReduceOp.SUM) + val_loss = (loss_sum / token_count).item() + bits_per_token = val_loss / math.log(2.0) + tokens_per_byte = token_count.item() / byte_count.item() + base_model.train() + return val_loss, bits_per_token * tokens_per_byte +def _ngram_bulk_update(val_np, start, end, ctx_tables, full_tables, + min_order, max_order, primes, mask): + """Bulk update n-gram tables with a contiguous range of tokens. + All ranks call this with the SAME token range -> identical tables everywhere.""" + t = val_np[start:end].astype(np.uint64) + n = len(t) + for order in range(min_order, max_order + 1): + if n < order: + continue + ctx_width = order - 1 + ctx_hash = np.zeros(n - order + 1, dtype=np.uint64) + for k in range(ctx_width): + ctx_hash ^= t[k:n - order + 1 + k] * primes[k % len(primes)] + ctx_key = (ctx_hash & mask).astype(np.int64) + tgt = t[order - 1:] + full_key = ((ctx_hash ^ (tgt * primes[ctx_width % len(primes)])) & mask).astype(np.int64) + ctx_tables[order] += np.bincount(ctx_key, minlength=len(ctx_tables[order])).astype(np.uint32) + full_tables[order] += np.bincount(full_key, minlength=len(full_tables[order])).astype(np.uint32) + +def eval_val_sliding_hashed_ngram( + args: Hyperparameters, + base_model: nn.Module, + rank: int, + world_size: int, + device: torch.device, + val_tokens: Tensor, + base_bytes_lut: Tensor, + has_leading_space_lut: Tensor, + is_boundary_token_lut: Tensor, + stride: int, + order: int, + alpha: float, + min_count: int, + buckets: int, + max_seconds: float = 0.0, + batch_seqs: int = 128, + eval_seq_len: int | None = None, +) -> tuple[float, float, float]: + """Score-first sliding eval with chunk-based SHARED n-gram tables + cubric. + + Key design: all ranks share identical n-gram tables via bulk chunk updates. + Each chunk's windows are distributed across ranks for scoring, then ALL ranks + update tables with the same contiguous token range. Every rank sees the full + n-gram picture (not 1/world_size like per-segment updates). + + Legal: entire chunk scored before its tokens update the tables. + """ + min_order = max(args.ngram_eval_min_order, 2) + max_order = max(order, min_order) + adaptive = args.ngram_eval_adaptive + alpha_min = args.ngram_eval_alpha_min + alpha_max = args.ngram_eval_alpha_max + ent_center = args.ngram_eval_entropy_center + ent_scale = args.ngram_eval_entropy_scale + + seq_len = eval_seq_len or args.train_seq_len + total_tokens = val_tokens.numel() - 1 + + # Build all windows and total scored tokens + all_window_starts = [ws for ws in range(0, total_tokens, stride) if min(ws + seq_len, total_tokens) - ws >= 1] + total_scored_tokens = 0.0 + for ws in all_window_starts: + end = min(ws + seq_len, total_tokens) + wlen = end - ws + s = 0 if ws == 0 else max(wlen - stride, 0) + total_scored_tokens += float(max(wlen - s, 0)) + + # Group windows into chunks by scored position -- all ranks share this grouping + chunk_tokens = int(os.environ.get("NGRAM_CHUNK_TOKENS", "1048576")) # 1M default + num_chunks = (total_tokens + chunk_tokens - 1) // chunk_tokens + chunk_windows: list[list[int]] = [[] for _ in range(num_chunks)] + for ws in all_window_starts: + end = min(ws + seq_len, total_tokens) + wlen = end - ws + s = 0 if ws == 0 else max(wlen - stride, 0) + scored_start = ws + s + ci = min(scored_start // chunk_tokens, num_chunks - 1) + chunk_windows[ci].append(ws) + + val_np = val_tokens.numpy() + ctx_tables = {n: np.zeros((buckets,), dtype=np.uint32) for n in range(min_order, max_order + 1)} + full_tables = {n: np.zeros((buckets,), dtype=np.uint32) for n in range(min_order, max_order + 1)} + mask = np.uint64(buckets - 1) + primes = np.array( + [np.uint64(36313), np.uint64(27191), np.uint64(51647), np.uint64(81929), + np.uint64(131071), np.uint64(174763), np.uint64(233017)], + dtype=np.uint64, + ) + + loss_sum = 0.0 + token_count = 0.0 + byte_count = 0.0 + + # Cubric 3D: per (order × entropy_bin × count_bin) adaptive alpha scaling + _NUM_ENT_BINS = 3 # low / mid / high entropy + _NUM_CNT_BINS = 3 # low / mid / high count + _ENT_EDGES = np.array([ent_center - 1.0, ent_center + 1.0]) # [2.0, 4.0] for center=3.0 + _CNT_EDGES = np.array([5.0, 50.0]) # low=<5, mid=5-50, high=>50 context count + _TOTAL_CELLS = _NUM_ENT_BINS * _NUM_CNT_BINS # 9 cells per order = 54 total + _cc = getattr(args, 'cubric_cadence', 0); _con = _cc > 0; _cfired = 0 + if _con: + # Warm-start: proven converged values from 4+ runs (orders 2-7) + # All 9 cells per order get the same warm-start, 3D cubric refines from there + _WARM = {2: 0.45, 3: 0.30, 4: 0.45, 5: 1.88, 6: 2.00, 7: 2.00, 8: 2.00, 9: 2.00} + _c_alpha_mult = {n: [_WARM.get(n, 1.0)] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + _c_hits = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + _c_beats = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + + base_model.eval() + compiled_logits = maybe_torch_compile(base_model.forward_logits, args) + t0 = time.perf_counter() + deadline = (t0 + max_seconds) if max_seconds > 0.0 else None + cutoff_hit = False + + if rank == 0: + print(f"ngram_eval:chunks={num_chunks} chunk_tokens={chunk_tokens} " + f"windows={len(all_window_starts)} shared_tables=True", flush=True) + + with torch.inference_mode(): + for ci in range(num_chunks): + if deadline is not None and time.perf_counter() >= deadline: + cutoff_hit = True + break + + windows = chunk_windows[ci] + if not windows: + continue + + # Distribute this chunk's windows across ranks + my_s = (len(windows) * rank) // world_size + my_e = (len(windows) * (rank + 1)) // world_size + my_windows = windows[my_s:my_e] + + # --- Phase 1: SCORE this chunk's windows --- + for bi in range(0, len(my_windows), batch_seqs): + batch_ws = my_windows[bi:bi + batch_seqs] + bsz = len(batch_ws) + x_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + y_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + wlens: list[int] = [] + for i, ws in enumerate(batch_ws): + end = min(ws + seq_len, total_tokens) + wlen = end - ws + wlens.append(wlen) + chunk = val_tokens[ws:end + 1].to(dtype=torch.int64, device=device) + x_batch[i, :wlen] = chunk[:-1] + y_batch[i, :wlen] = chunk[1:] + + with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + logits = compiled_logits(x_batch) + logits_f = logits.float() + nll = F.cross_entropy( + logits_f.reshape(-1, logits_f.size(-1)), + y_batch.reshape(-1), + reduction="none", + ).reshape(bsz, seq_len) + + for i, ws in enumerate(batch_ws): + wlen = wlens[i] + s = 0 if ws == 0 else max(wlen - stride, 0) + seg_len = wlen - s + if seg_len <= 0: + continue + + seg_nll = nll[i, s:wlen].to(torch.float64).cpu().numpy() + seg_model_p = np.exp(-seg_nll) + + if adaptive: + log_probs = F.log_softmax(logits_f[i, s:wlen], dim=-1) + probs_a = log_probs.exp() + entropy = -(probs_a * log_probs).sum(dim=-1).cpu().numpy() + sig = 1.0 / (1.0 + np.exp(-ent_scale * (entropy - ent_center))) + per_token_alpha = alpha_min + (alpha_max - alpha_min) * sig + # Bin entropy for 2D cubric: 0=low, 1=mid, 2=high + _ent_bins = np.digitize(entropy, _ENT_EDGES).astype(np.int32) + else: + per_token_alpha = np.full(seg_len, alpha) + _ent_bins = np.ones(seg_len, dtype=np.int32) # all mid + + global_j = np.arange(ws + s + 1, ws + wlen + 1, dtype=np.int64) + p_ng = np.zeros(seg_len, dtype=np.float64) + ng_matched = np.zeros(seg_len, dtype=np.bool_) + _ng_ord = np.zeros(seg_len, dtype=np.int32) + _ng_ctx_count = np.zeros(seg_len, dtype=np.float64) + tgt_np = val_np[global_j].astype(np.uint64) + + for n in range(max_order, min_order - 1, -1): + ctx_width = n - 1 + valid = (global_j >= ctx_width) & (~ng_matched) + if not valid.any(): + continue + v_idx = np.nonzero(valid)[0] + jv = global_j[v_idx] + ctx_hash = np.zeros(len(jv), dtype=np.uint64) + for k in range(ctx_width): + tok = val_np[jv - (ctx_width - k)].astype(np.uint64) + ctx_hash ^= tok * primes[k % len(primes)] + ctx_key = (ctx_hash & mask).astype(np.int64) + full_key = ((ctx_hash ^ (tgt_np[v_idx] * primes[ctx_width % len(primes)])) & mask).astype(np.int64) + ctx_counts = ctx_tables[n][ctx_key].astype(np.float64) + full_counts = full_tables[n][full_key].astype(np.float64) + has_data = ctx_counts >= float(min_count) + if has_data.any(): + p = np.minimum(full_counts, ctx_counts) / np.maximum(ctx_counts, 1.0) + p = np.clip(p, 0.0, 1.0) + hit_idx = v_idx[has_data] + p_ng[hit_idx] = p[has_data] + ng_matched[hit_idx] = True + _ng_ord[hit_idx] = n + _ng_ctx_count[hit_idx] = ctx_counts[has_data] + + # Mix where n-gram matched (cubric 3D: order × entropy_bin × count_bin) + if ng_matched.any(): + m_idx = np.nonzero(ng_matched)[0] + if _con: + a = per_token_alpha[m_idx].copy() + m_ent_bins = _ent_bins[m_idx] + m_cnt_bins = np.digitize(_ng_ctx_count[m_idx], _CNT_EDGES).astype(np.int32) + for n in range(min_order, max_order + 1): + om = _ng_ord[m_idx] == n + if not om.any(): + continue + for eb in range(_NUM_ENT_BINS): + for cb in range(_NUM_CNT_BINS): + cell = eb * _NUM_CNT_BINS + cb + mask_ecb = om & (m_ent_bins == eb) & (m_cnt_bins == cb) + if mask_ecb.any(): + _c_hits[n][cell] += int(mask_ecb.sum()) + _c_beats[n][cell] += int((p_ng[m_idx[mask_ecb]] > seg_model_p[m_idx[mask_ecb]]).sum()) + a[mask_ecb] *= _c_alpha_mult[n][cell] + np.clip(a, 0.0, alpha_max, out=a) + else: + a = per_token_alpha[m_idx] + seg_model_p[m_idx] = (1.0 - a) * seg_model_p[m_idx] + a * p_ng[m_idx] + + seg_nll = -np.log(np.clip(seg_model_p, 1e-12, 1.0)) + loss_sum += float(seg_nll.sum()) + token_count += float(seg_len) + tgt = y_batch[i, s:wlen] + prev = x_batch[i, s:wlen] + tb = base_bytes_lut[tgt].to(torch.float64) + tb += (has_leading_space_lut[tgt] & ~is_boundary_token_lut[prev]).to(torch.float64) + byte_count += float(tb.sum().item()) + + # --- Phase 2: SHARED UPDATE -- all ranks update with same chunk tokens --- + chunk_start = ci * chunk_tokens + chunk_end = min((ci + 1) * chunk_tokens, total_tokens) + _ngram_bulk_update(val_np, chunk_start, chunk_end + 1, + ctx_tables, full_tables, min_order, max_order, + primes, mask) + + # Cubric 2D c-step: adapt per (order × entropy_bin) + if _con: + # Collect all (order, ent_bin, cnt_bin) cells with enough data + all_rates = [] + for n in range(min_order, max_order + 1): + for cell in range(_TOTAL_CELLS): + if _c_hits[n][cell] >= 8: + all_rates.append(_c_beats[n][cell] / _c_hits[n][cell]) + if len(all_rates) >= 4: + avg_rate = sum(all_rates) / len(all_rates) + for n in range(min_order, max_order + 1): + for cell in range(_TOTAL_CELLS): + if _c_hits[n][cell] >= 8: + rate = _c_beats[n][cell] / _c_hits[n][cell] + if rate > avg_rate + 0.05: + _c_alpha_mult[n][cell] = min(_c_alpha_mult[n][cell] * 1.03, 2.0) + elif rate < avg_rate - 0.05: + _c_alpha_mult[n][cell] = max(_c_alpha_mult[n][cell] * 0.97, 0.3) + _cfired += 1 + if rank == 0 and _cfired % 8 == 0: + parts = [] + for n in range(min_order, max_order + 1): + m = _c_alpha_mult[n] + avg_m = sum(m) / len(m) + parts.append(f"o{n}:avg={avg_m:.2f}") + print(f"cubric3d:step={_cfired} {' '.join(parts)}", flush=True) + _c_hits = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + _c_beats = {n: [0] * _TOTAL_CELLS for n in range(min_order, max_order + 1)} + + # Progress + if rank == 0 and (ci % 10 == 0 or ci == num_chunks - 1 or ci < 3): + elapsed = time.perf_counter() - t0 + cur_bpb = (loss_sum / max(token_count, 1.0)) / math.log(2.0) * (token_count / max(byte_count, 1.0)) if token_count > 0 else 0.0 + print( + f"ngram_eval:chunk [{ci+1}/{num_chunks}] bpb={cur_bpb:.6f} t={elapsed:.0f}s", + flush=True, + ) + + # All-reduce across ranks + _loss = torch.tensor(loss_sum, device=device, dtype=torch.float64) + _toks = torch.tensor(token_count, device=device, dtype=torch.float64) + _bytes = torch.tensor(byte_count, device=device, dtype=torch.float64) + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(_loss, op=dist.ReduceOp.SUM) + dist.all_reduce(_toks, op=dist.ReduceOp.SUM) + dist.all_reduce(_bytes, op=dist.ReduceOp.SUM) + loss_sum = _loss.item() + token_count = _toks.item() + byte_count = _bytes.item() + + coverage = token_count / max(total_scored_tokens, 1.0) + if cutoff_hit: + elapsed = time.perf_counter() - t0 + print( + f"ngram_eval:cutoff max_seconds={max_seconds:.1f} " + f"coverage={coverage*100:.2f}% elapsed={elapsed:.0f}s", + flush=True, + ) + + if _con and rank == 0: + print(f"cubric3d:final c_steps={_cfired} cells={_TOTAL_CELLS}x{max_order-min_order+1}={_TOTAL_CELLS*(max_order-min_order+1)}", flush=True) + for n in range(min_order, max_order + 1): + m = _c_alpha_mult[n] + row = " ".join(f"{m[cell]:.2f}" for cell in range(_TOTAL_CELLS)) + print(f" o{n}: [{row}]", flush=True) + val_loss = loss_sum / max(token_count, 1.0) + val_bpb = val_loss / math.log(2.0) * (token_count / max(byte_count, 1.0)) + base_model.train() + return val_loss, val_bpb, coverage +def _classify_param(name: str) -> str: + if "tok_emb" in name or "lm_head" in name: + return "embed" + if "f1_corr_in" in name or "f1_corr_out" in name: + return "aux" + if ".mlp." in name: + return "mlp" + if ".attn." in name or (".proj." in name and ".mlp." not in name): + return "attn" + return "other" +# --------------------------------------------------------------------------- +# GPTQ: Hessian-aware quantization with column-wise error compensation +# --------------------------------------------------------------------------- +def _find_best_row_scales(W: Tensor, clip_range: int = 31) -> Tensor: + """Find optimal per-row scales by searching percentile clipping thresholds.""" + t32 = W.float() + best_s = t32.abs().amax(dim=1) / clip_range + best_s = best_s.clamp_min(1.0 / clip_range) + best_err = torch.full((t32.shape[0],), float('inf')) + for pct in [0.9990, 0.9995, 0.9999, 0.99999, 1.0]: + if pct < 1.0: + row_clip = torch.quantile(t32.abs(), pct, dim=1) + else: + row_clip = t32.abs().amax(dim=1) + s = (row_clip / clip_range).clamp_min(1.0 / clip_range) + q = torch.clamp(torch.round(t32 / s[:, None]), -clip_range, clip_range) + recon = q * s[:, None] + err = (t32 - recon).pow(2).mean(dim=1) + improved = err < best_err + best_s[improved] = s[improved] + best_err[improved] = err[improved] + return best_s +def gptq_quantize_weight(W: Tensor, H: Tensor, clip_range: int = 31, + block_size: int = 64, percdamp: float = 0.002) -> tuple[Tensor, Tensor]: + """GPTQ: quantize weight matrix W using Hessian H = X^T X for error compensation. + Uses pre-computed per-row scales and column reordering by Hessian diagonal. + Returns (quantized_int8, scale_fp16) in int6 range [-clip_range, clip_range].""" + W = W.float().clone() + rows, cols = W.shape + # Pre-compute optimal per-row scales from the original weight matrix + row_scale = _find_best_row_scales(W, clip_range) + H = H.float().clone() + damp = percdamp * H.diag().mean() + H.diagonal().add_(damp) + # Column reordering: process least-important columns first (ascending H_diag) + perm = torch.argsort(H.diag()) + invperm = torch.argsort(perm) + W = W[:, perm] + H = H[perm][:, perm] + try: + L = torch.linalg.cholesky(H) + Hinv = torch.cholesky_inverse(L) + except torch._C._LinAlgError: + Hinv = torch.diag(1.0 / H.diag().clamp_min(1e-6)) + Q = torch.zeros(rows, cols, dtype=torch.int8) + for i1 in range(0, cols, block_size): + i2 = min(i1 + block_size, cols) + W_block = W[:, i1:i2].clone() + Hinv_block = Hinv[i1:i2, i1:i2] + Err = torch.zeros_like(W_block) + for j in range(i2 - i1): + w_col = W_block[:, j] + h_inv_jj = Hinv_block[j, j].clamp_min(1e-8) + # Quantize using pre-computed per-row scales + q_col = torch.clamp(torch.round(w_col / row_scale), -clip_range, clip_range) + deq_col = q_col * row_scale + Q[:, i1 + j] = q_col.to(torch.int8) + err = (w_col - deq_col) / h_inv_jj + Err[:, j] = err + if j + 1 < i2 - i1: + W_block[:, j + 1:] -= err.unsqueeze(1) * Hinv_block[j, j + 1:].unsqueeze(0) + if i2 < cols: + W[:, i2:] -= Err @ Hinv[i1:i2, i2:] + # Undo column reordering + Q = Q[:, invperm] + return Q, row_scale.to(torch.float16) +def gptq_calibrate(model: nn.Module, train_pattern: str, device: torch.device, + n_samples: int = 256, seq_len: int = 2048) -> dict[str, Tensor]: + """Collect Hessian H = X^T X for each linear layer using training data.""" + hessians: dict[str, Tensor] = {} + n_seen: dict[str, int] = {} + hooks = [] + def make_hook(name: str): + def hook_fn(module, inp, out): + x = inp[0].detach().float() + if x.ndim == 3: + x = x.reshape(-1, x.shape[-1]) + if name not in hessians: + hessians[name] = torch.zeros(x.shape[1], x.shape[1], device=x.device, dtype=torch.float32) + n_seen[name] = 0 + hessians[name].addmm_(x.t(), x) + n_seen[name] += x.shape[0] + return hook_fn + for name, module in model.named_modules(): + if isinstance(module, (nn.Linear, CastedLinear)): + hooks.append(module.register_forward_hook(make_hook(name))) + stream = TokenStream(train_pattern) + model.eval() + with torch.no_grad(): + for _ in range(n_samples): + tokens = stream.take(seq_len + 1).to(device=device, dtype=torch.int64) + x = tokens[:-1].unsqueeze(0) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + model.forward_logits(x) + for h in hooks: + h.remove() + for name in hessians: + hessians[name] /= max(n_seen[name], 1) + return hessians +def mixed_quantize_int6_gptq(state_dict: dict[str, Tensor], int6_cats: set[str], + hessians: dict[str, Tensor]) -> tuple[dict, dict]: + """Like mixed_quantize_int6 but uses GPTQ for int6 categories when Hessian available.""" + result: dict[str, Tensor] = {} + meta: dict[str, object] = {} + gptq_count, naive_count = 0, 0 + for name, tensor in state_dict.items(): + t = tensor.detach().cpu().contiguous() + cat = _classify_param(name) + if not t.is_floating_point() or t.numel() <= 65536: + result[name] = t.to(torch.float16) if t.is_floating_point() else t + meta[name] = "passthrough" + continue + if any(p in name for p in CONTROL_TENSOR_NAME_PATTERNS): + result[name] = t.float() + meta[name] = "passthrough_ctrl" + continue + if cat in int6_cats and t.ndim == 2: + module_name = name.rsplit(".weight", 1)[0] if name.endswith(".weight") else name + H = hessians.get(module_name) + if H is not None and H.shape[0] == t.shape[1]: + q, s = gptq_quantize_weight(t, H.cpu()) + gptq_count += 1 + else: + q, s = quantize_int6_per_row(t) + naive_count += 1 + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int6"} + elif cat in int6_cats and t.ndim >= 1: + q, s = quantize_int6_per_row(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int6"} + naive_count += 1 + else: + q, s = quantize_float_tensor(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int8"} + print(f"gptq_quantize: {gptq_count} GPTQ layers, {naive_count} naive layers", flush=True) + return result, meta +def quantize_int6_per_row(t: Tensor, clip_range: int = 31) -> tuple[Tensor, Tensor]: + t32 = t.float() + if t32.ndim == 2: + best_q, best_s, best_err = None, None, float('inf') + for pct in [0.9990, 0.9995, 0.9999, 0.99999, 1.0]: + if pct < 1.0: + row_clip = torch.quantile(t32.abs(), pct, dim=1) + else: + row_clip = t32.abs().amax(dim=1) + s = (row_clip / clip_range).clamp_min(1.0 / clip_range).to(torch.float16) + q = torch.clamp(torch.round(t32 / s.float()[:, None]), -clip_range, clip_range).to(torch.int8) + recon = q.float() * s.float()[:, None] + err = (t32 - recon).pow(2).mean().item() + if err < best_err: + best_q, best_s, best_err = q, s, err + return best_q, best_s + amax = t32.abs().max().item() + scale = torch.tensor(amax / clip_range if amax > 0 else 1.0, dtype=torch.float16) + q = torch.clamp(torch.round(t32 / scale.float()), -clip_range, clip_range).to(torch.int8) + return q, scale +def mixed_quantize_int6(state_dict: dict[str, Tensor], int6_cats: set[str]): + num_layers_total = max( + (int(k.split(".")[1]) for k in state_dict if k.startswith("blocks.")), + default=0, + ) + 1 + late_k_layers = set(range(num_layers_total - 2, num_layers_total)) + result: dict[str, Tensor] = {} + meta: dict[str, object] = {} + for name, tensor in state_dict.items(): + t = tensor.detach().cpu().contiguous() + cat = _classify_param(name) + if not t.is_floating_point() or t.numel() <= 65536: + result[name] = t.to(torch.float16) if t.is_floating_point() else t + meta[name] = "passthrough" + continue + if any(p in name for p in CONTROL_TENSOR_NAME_PATTERNS): + result[name] = t.float() + meta[name] = "passthrough_ctrl" + continue + if cat in int6_cats and t.ndim >= 1: + q, s = quantize_int6_per_row(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int6"} + else: + q, s = quantize_float_tensor(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int8"} + return result, meta +def dequantize_mixed_int6(result: dict[str, Tensor], meta: dict[str, object], + template_sd: dict[str, Tensor]) -> dict[str, Tensor]: + out: dict[str, Tensor] = {} + for name, orig in template_sd.items(): + info = meta.get(name) + if info is None: + continue + orig_dtype = orig.dtype + if info in ("passthrough", "passthrough_ctrl", "passthrough_fp16"): + t = result[name] + if t.dtype == torch.float16 and orig_dtype in (torch.float32, torch.bfloat16): + t = t.to(orig_dtype) + out[name] = t + continue + q, s = result[name + ".q"], result[name + ".scale"] + if s.ndim > 0: + out[name] = (q.float() * s.float().view(q.shape[0], *([1] * (q.ndim - 1)))).to(orig_dtype) + else: + out[name] = (q.float() * float(s.item())).to(orig_dtype) + return out +def main() -> None: + global zeropower_via_newtonschulz5 + code = Path(__file__).read_text(encoding="utf-8") + args = Hyperparameters() + if args.compile_enabled: + zeropower_via_newtonschulz5 = torch.compile(zeropower_via_newtonschulz5) + distributed = "RANK" in os.environ and "WORLD_SIZE" in os.environ + rank = int(os.environ.get("RANK", "0")) + world_size = int(os.environ.get("WORLD_SIZE", "1")) + local_rank = int(os.environ.get("LOCAL_RANK", "0")) + if world_size <= 0: + raise ValueError(f"WORLD_SIZE must be positive, got {world_size}") + if 8 % world_size != 0: + raise ValueError(f"WORLD_SIZE={world_size} must divide 8 so grad_accum_steps stays integral") + grad_accum_steps = 8 // world_size + grad_scale = 1.0 / grad_accum_steps + if not torch.cuda.is_available(): + raise RuntimeError("CUDA is required") + device = torch.device("cuda", local_rank) + torch.cuda.set_device(device) + if distributed: + dist.init_process_group(backend="nccl", device_id=device) + dist.barrier() + master_process = rank == 0 + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True + from torch.backends.cuda import enable_cudnn_sdp, enable_flash_sdp, enable_math_sdp, enable_mem_efficient_sdp + enable_cudnn_sdp(False) + enable_flash_sdp(True) + enable_mem_efficient_sdp(False) + enable_math_sdp(False) + logfile = None + if master_process: + os.makedirs("logs", exist_ok=True) + logfile = f"logs/{args.run_id}.txt" + print(logfile) + def log0(msg: str, console: bool = True) -> None: + if not master_process: + return + if console: + print(msg) + if logfile is not None: + with open(logfile, "a", encoding="utf-8") as f: + print(msg, file=f) + log0(code, console=False) + log0("=" * 100, console=False) + log0(f"Running Python {sys.version}", console=False) + log0(f"Running PyTorch {torch.__version__}", console=False) + log0( + subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=False).stdout, + console=False, + ) + log0("=" * 100, console=False) + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.cuda.manual_seed_all(args.seed) + if not args.tokenizer_path.endswith(".model"): + raise ValueError(f"Script only setup for SentencePiece .model file: {args.tokenizer_path}") + sp = spm.SentencePieceProcessor(model_file=args.tokenizer_path) + if int(sp.vocab_size()) != args.vocab_size: + raise ValueError( + f"VOCAB_SIZE={args.vocab_size} does not match tokenizer vocab_size={int(sp.vocab_size())}" + ) + dataset_dir = Path(args.data_path).resolve() + actual_train_files = len(list(dataset_dir.glob("fineweb_train_*.bin"))) + effective_eval_seq_len = args.eval_seq_len if args.eval_seq_len > 0 else args.train_seq_len + val_seq_len = max(args.train_seq_len, effective_eval_seq_len) + val_tokens = load_validation_tokens(args.val_files, val_seq_len) + base_bytes_lut, has_leading_space_lut, is_boundary_token_lut = build_sentencepiece_luts( + sp, args.vocab_size, device + ) + log0(f"val_bpb:enabled tokenizer_kind=sentencepiece tokenizer_path={args.tokenizer_path}") + log0(f"train_loader:dataset:{dataset_dir.name} train_shards:{actual_train_files}") + log0(f"val_loader:shards pattern={args.val_files} tokens:{val_tokens.numel() - 1}") + CastedLinear._qat_enabled = args.qat_enabled + base_model = GPT( + vocab_size=args.vocab_size, + num_layers=args.num_layers, + model_dim=args.model_dim, + num_heads=args.num_heads, + num_kv_heads=args.num_kv_heads, + mlp_mult=args.mlp_mult, + tie_embeddings=args.tie_embeddings, + tied_embed_init_std=args.tied_embed_init_std, + logit_softcap=args.logit_softcap, + rope_base=args.rope_base, + qk_gain_init=args.qk_gain_init, + mtp_num_heads=args.mtp_num_heads, + mtp_loss_weight=args.mtp_loss_weight, + bigram_vocab_size=args.bigram_vocab_size, + bigram_dim=args.bigram_dim, + xsa_last_n=args.xsa_last_n, + rope_dims=args.rope_dims, + ln_scale=args.ln_scale, + dtg=args.dtg_enabled, + ve_enabled=args.ve_enabled, + ve_dim=args.ve_dim, + ve_layers=args.ve_layers, + mlp_act=args.mlp_act, + mlp_leaky_slope=args.mlp_leaky_slope, + f1_corr_rank=args.f1_corr_rank, + f1_corr_scale_init=args.f1_corr_scale_init, + ).to(device).bfloat16() + for module in base_model.modules(): + if isinstance(module, CastedLinear): + module.float() + restore_low_dim_params_to_fp32(base_model) + # Complementary training: downweight tokens predictable by bigrams + complement_alpha = float(os.environ.get("COMPLEMENT_ALPHA", "0")) + if complement_alpha > 0: + tracker = TrainNgramTracker(args.vocab_size, device, complement_alpha=complement_alpha) + base_model._ngram_tracker = tracker + log0(f"complementary_training:alpha={complement_alpha}") + else: + base_model._ngram_tracker = None + compiled_model = maybe_torch_compile(base_model, args) + model: nn.Module = DDP(compiled_model, device_ids=[local_rank], broadcast_buffers=False) if distributed else compiled_model + block_named_params = list(base_model.blocks.named_parameters()) + matrix_params = [ + p + for name, p in block_named_params + if p.ndim == 2 and not any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS) + ] + if base_model.mtp_num_heads > 0: + matrix_params.extend([p for p in base_model.mtp_heads.parameters() if p.ndim == 2]) + if base_model.f1_corr_in is not None and base_model.f1_corr_out is not None: + matrix_params.append(base_model.f1_corr_in.weight) + matrix_params.append(base_model.f1_corr_out.weight) + scalar_params = [ + p + for name, p in block_named_params + if p.ndim < 2 or any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS) + ] + if base_model.skip_weights.numel() > 0: + scalar_params.append(base_model.skip_weights) + scalar_params.append(base_model.smear.gate) + if base_model.bigram is not None: + scalar_params.append(base_model.bigram.scale) + if base_model.f1_corr_scale is not None: + scalar_params.append(base_model.f1_corr_scale) + token_lr = args.tied_embed_lr if args.tie_embeddings else args.embed_lr + tok_params = [{"params": [base_model.tok_emb.weight], "lr": token_lr, "base_lr": token_lr}] + if base_model.bigram is not None: + tok_params.append({"params": [base_model.bigram.embed.weight], "lr": token_lr, "base_lr": token_lr}) + if base_model.bigram.proj is not None: + matrix_params.append(base_model.bigram.proj.weight) + if base_model.ve_shared is not None: + tok_params.append({"params": [base_model.ve_shared.embed.weight], "lr": token_lr, "base_lr": token_lr}) + if base_model.ve_shared.proj is not None: + matrix_params.append(base_model.ve_shared.proj.weight) + scalar_params.append(base_model.ve_shared.scale) + for s in base_model.ve_layer_scales: + scalar_params.append(s) + optimizer_tok = torch.optim.AdamW( + tok_params, + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + weight_decay=args.adam_wd, + fused=True, + ) + optimizer_muon = Muon( + matrix_params, + lr=args.matrix_lr, + momentum=args.muon_momentum, + backend_steps=args.muon_backend_steps, + weight_decay=args.muon_wd, + ) + for group in optimizer_muon.param_groups: + group["base_lr"] = args.matrix_lr + optimizer_scalar = torch.optim.AdamW( + [{"params": scalar_params, "lr": args.scalar_lr, "base_lr": args.scalar_lr}], + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + weight_decay=args.adam_wd, + fused=True, + ) + optimizers: list[torch.optim.Optimizer] = [optimizer_tok, optimizer_muon, optimizer_scalar] + if base_model.lm_head is not None: + optimizer_head = torch.optim.Adam( + [{"params": [base_model.lm_head.weight], "lr": args.head_lr, "base_lr": args.head_lr}], + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + fused=True, + ) + optimizers.insert(1, optimizer_head) + n_params = sum(p.numel() for p in base_model.parameters()) + f1_corr_params = 0 + if base_model.f1_corr_in is not None and base_model.f1_corr_out is not None: + f1_corr_params = int(base_model.f1_corr_in.weight.numel() + base_model.f1_corr_out.weight.numel()) + est_corr_int6_bytes = 0 + if args.f1_corr_rank > 0: + # int8 payload stores int6 values + per-row fp16 scales. + est_corr_int6_bytes = ( + args.f1_corr_rank * (args.model_dim + args.vocab_size) + + 2 * (args.f1_corr_rank + args.vocab_size) + ) + log0(f"model_params:{n_params}") + log0( + f"f1_corr:rank={args.f1_corr_rank} params={f1_corr_params} " + f"est_int6_bytes~{est_corr_int6_bytes}" + ) + log0(f"mlp_act:{args.mlp_act} mlp_leaky_slope:{args.mlp_leaky_slope}") + log0(f"XSA:last_{args.xsa_last_n} world_size:{world_size} grad_accum_steps:{grad_accum_steps}") + log0(f"num_heads:{args.num_heads} num_kv_heads:{args.num_kv_heads} embed_lr:{token_lr} matrix_lr:{args.matrix_lr}") + log0( + f"train_batch_tokens:{args.train_batch_tokens} train_seq_len:{args.train_seq_len} " + f"iterations:{args.iterations} warmup_steps:{args.warmup_steps} " + f"max_wallclock_seconds:{args.max_wallclock_seconds:.3f}" + ) + log0(f"compile:enabled={int(args.compile_enabled)} fullgraph={int(args.compile_fullgraph)}") + log0(f"seed:{args.seed}") + if args.ngram_eval_order >= 2: + log0( + f"ngram_eval:order={args.ngram_eval_order} alpha={args.ngram_eval_alpha} " + f"min_count={args.ngram_eval_min_count} buckets={args.ngram_eval_buckets}" + ) + train_loader = DistributedTokenLoader(args.train_files, rank, world_size, device) + def zero_grad_all() -> None: + for opt in optimizers: + opt.zero_grad(set_to_none=True) + max_wallclock_ms = 1000.0 * args.max_wallclock_seconds if args.max_wallclock_seconds > 0 else None + def lr_mul(step: int, elapsed_ms: float) -> float: + if args.warmdown_iters <= 0: + return 1.0 + if max_wallclock_ms is None: + warmdown_start = max(args.iterations - args.warmdown_iters, 0) + return max((args.iterations - step) / max(args.warmdown_iters, 1), 0.0) if warmdown_start <= step < args.iterations else 1.0 + step_ms = elapsed_ms / max(step, 1) + warmdown_ms = args.warmdown_iters * step_ms + remaining_ms = max(max_wallclock_ms - elapsed_ms, 0.0) + return remaining_ms / max(warmdown_ms, 1e-9) if remaining_ms <= warmdown_ms else 1.0 + if args.warmup_steps > 0: + initial_model_state = {name: tensor.detach().cpu().clone() for name, tensor in base_model.state_dict().items()} + initial_optimizer_states = [copy.deepcopy(opt.state_dict()) for opt in optimizers] + model.train() + for warmup_step in range(args.warmup_steps): + zero_grad_all() + for micro_step in range(grad_accum_steps): + if distributed: + model.require_backward_grad_sync = micro_step == grad_accum_steps - 1 + x, y = train_loader.next_batch(args.train_batch_tokens, args.train_seq_len, grad_accum_steps) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + warmup_loss = model(x, y) + (warmup_loss * grad_scale).backward() + for opt in optimizers: + opt.step() + zero_grad_all() + if args.warmup_steps <= 20 or (warmup_step + 1) % 10 == 0 or warmup_step + 1 == args.warmup_steps: + log0(f"warmup_step:{warmup_step + 1}/{args.warmup_steps}") + base_model.load_state_dict(initial_model_state, strict=True) + for opt, state in zip(optimizers, initial_optimizer_states, strict=True): + opt.load_state_dict(state) + zero_grad_all() + if distributed: + model.require_backward_grad_sync = True + train_loader = DistributedTokenLoader(args.train_files, rank, world_size, device) + swa_state: dict[str, Tensor] | None = None + swa_count = 0 + ema_state = {name: t.detach().float().clone() for name, t in base_model.state_dict().items()} + ema_decay = 0.997 + training_time_ms = 0.0 + stop_after_step: int | None = None + torch.cuda.synchronize() + t0 = time.perf_counter() + step = 0 + while True: + last_step = step == args.iterations or (stop_after_step is not None and step >= stop_after_step) + should_validate = last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0) + if should_validate: + torch.cuda.synchronize() + training_time_ms += 1000.0 * (time.perf_counter() - t0) + val_loss, val_bpb = eval_val( + args, + model, + rank, + world_size, + device, + grad_accum_steps, + val_tokens, + base_bytes_lut, + has_leading_space_lut, + is_boundary_token_lut, + ) + log0( + f"step:{step}/{args.iterations} val_loss:{val_loss:.4f} val_bpb:{val_bpb:.4f} " + f"train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms / max(step, 1):.2f}ms" + ) + torch.cuda.synchronize() + t0 = time.perf_counter() + if last_step: + if stop_after_step is not None and step < args.iterations: + log0( + f"stopping_early: wallclock_cap train_time:{training_time_ms:.0f}ms " + f"step:{step}/{args.iterations}" + ) + break + elapsed_ms = training_time_ms + 1000.0 * (time.perf_counter() - t0) + scale = lr_mul(step, elapsed_ms) + if args.late_qat_threshold > 0 and scale < args.late_qat_threshold and not CastedLinear._qat_enabled: + CastedLinear._qat_enabled = True + log0(f"late_qat:enabled step:{step} scale:{scale:.4f}") + zero_grad_all() + train_loss = torch.zeros((), device=device) + for micro_step in range(grad_accum_steps): + if distributed: + model.require_backward_grad_sync = micro_step == grad_accum_steps - 1 + x, y = train_loader.next_batch(args.train_batch_tokens, args.train_seq_len, grad_accum_steps) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + loss = model(x, y) + train_loss += loss.detach() + loss.backward() + if base_model._ngram_tracker is not None: + base_model._ngram_tracker.update(x, y) + train_loss /= grad_accum_steps + frac = min(step / args.muon_momentum_warmup_steps, 1.0) if args.muon_momentum_warmup_steps > 0 else 1.0 + muon_momentum = (1 - frac) * args.muon_momentum_warmup_start + frac * args.muon_momentum + for group in optimizer_muon.param_groups: + group["momentum"] = muon_momentum + for opt in optimizers: + for group in opt.param_groups: + group["lr"] = group["base_lr"] * scale + if args.grad_clip_norm > 0: + torch.nn.utils.clip_grad_norm_(base_model.parameters(), args.grad_clip_norm) + for opt in optimizers: + opt.step() + zero_grad_all() + # EMA update + with torch.no_grad(): + for name, t in base_model.state_dict().items(): + ema_state[name].mul_(ema_decay).add_(t.detach().float(), alpha=1.0 - ema_decay) + step += 1 + approx_training_time_ms = training_time_ms + 1000.0 * (time.perf_counter() - t0) + if args.swa_enabled and scale < 0.2 and step % args.swa_every == 0: + if swa_state is None: + swa_state = {name: t.detach().cpu().clone() for name, t in base_model.state_dict().items()} + swa_count = 1 + log0(f"swa:start step:{step}") + else: + for name, t in base_model.state_dict().items(): + swa_state[name] += t.detach().cpu() + swa_count += 1 + should_log_train = ( + args.train_log_every > 0 + and (step <= 10 or step % args.train_log_every == 0 or stop_after_step is not None) + ) + if should_log_train: + log0( + f"step:{step}/{args.iterations} train_loss:{train_loss.item():.4f} " + f"train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms / step:.2f}ms" + ) + reached_cap = max_wallclock_ms is not None and approx_training_time_ms >= max_wallclock_ms + if distributed and max_wallclock_ms is not None: + reached_cap_tensor = torch.tensor(int(reached_cap), device=device) + dist.all_reduce(reached_cap_tensor, op=dist.ReduceOp.MAX) + reached_cap = bool(reached_cap_tensor.item()) + if stop_after_step is None and reached_cap: + stop_after_step = step + log0( + f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB " + f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB" + ) + # GPTQ calibration: collect Hessians from training data DURING training phase + # (must happen before training ends to comply with eval-time data access rules) + log0("gptq:calibrating with training data...") + t_gptq = time.perf_counter() + gptq_hessians = gptq_calibrate(base_model, args.train_files, device, n_samples=256, seq_len=args.train_seq_len) + log0(f"gptq:calibrated {len(gptq_hessians)} layers in {time.perf_counter()-t_gptq:.1f}s") + if args.distill_enabled and args.distill_steps > 0: + log0( + f"distill:start steps:{args.distill_steps} lr_factor:{args.distill_lr_factor} " + f"temp:{args.distill_temperature} alpha:{args.distill_alpha} kl_clip:{args.distill_kl_clip}" + ) + current_state = base_model.state_dict() + teacher_state = {name: t.to(dtype=current_state[name].dtype) for name, t in ema_state.items()} + teacher_model = GPT( + vocab_size=args.vocab_size, num_layers=args.num_layers, model_dim=args.model_dim, + num_heads=args.num_heads, num_kv_heads=args.num_kv_heads, mlp_mult=args.mlp_mult, + tie_embeddings=args.tie_embeddings, tied_embed_init_std=args.tied_embed_init_std, + logit_softcap=args.logit_softcap, rope_base=args.rope_base, qk_gain_init=args.qk_gain_init, + mtp_num_heads=args.mtp_num_heads, mtp_loss_weight=args.mtp_loss_weight, + bigram_vocab_size=args.bigram_vocab_size, bigram_dim=args.bigram_dim, + xsa_last_n=args.xsa_last_n, rope_dims=args.rope_dims, ln_scale=args.ln_scale, dtg=args.dtg_enabled, + ve_enabled=args.ve_enabled, ve_dim=args.ve_dim, ve_layers=args.ve_layers, + mlp_act=args.mlp_act, mlp_leaky_slope=args.mlp_leaky_slope, + f1_corr_rank=args.f1_corr_rank, f1_corr_scale_init=args.f1_corr_scale_init, + ).to(device).bfloat16() + for m in teacher_model.modules(): + if isinstance(m, CastedLinear): + m.float() + restore_low_dim_params_to_fp32(teacher_model) + teacher_model.load_state_dict(teacher_state, strict=True) + teacher_model.eval() + for p in teacher_model.parameters(): + p.requires_grad_(False) + compiled_teacher_logits = maybe_torch_compile(teacher_model.forward_logits, args) + model.train() + T = args.distill_temperature + alpha = args.distill_alpha + for d_step in range(args.distill_steps): + zero_grad_all() + for opt in optimizers: + for group in opt.param_groups: + group["lr"] = group["base_lr"] * args.distill_lr_factor + x, y = train_loader.next_batch(args.train_batch_tokens, args.train_seq_len, grad_accum_steps) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + student_logits = base_model.forward_logits(x) + with torch.no_grad(): + teacher_logits = compiled_teacher_logits(x) + student_log_probs = F.log_softmax(student_logits.float() / T, dim=-1) + teacher_probs = F.softmax(teacher_logits.float() / T, dim=-1) + token_kl = F.kl_div(student_log_probs, teacher_probs, reduction="none").sum(dim=-1) + kl_loss = token_kl.mean() * (T * T) + if args.distill_kl_clip > 0: + kl_loss = torch.clamp(kl_loss, max=args.distill_kl_clip) + ce_loss = F.cross_entropy( + student_logits.reshape(-1, student_logits.size(-1)).float(), + y.reshape(-1), + reduction="mean", + ) + loss = alpha * kl_loss + (1.0 - alpha) * ce_loss + (loss * grad_scale).backward() + if world_size > 1: + for p in base_model.parameters(): + if p.grad is not None: + dist.all_reduce(p.grad, op=dist.ReduceOp.AVG) + if args.grad_clip_norm > 0: + torch.nn.utils.clip_grad_norm_(base_model.parameters(), args.grad_clip_norm) + for opt in optimizers: + opt.step() + zero_grad_all() + with torch.no_grad(): + for name, t in base_model.state_dict().items(): + ema_state[name].mul_(ema_decay).add_(t.detach().float(), alpha=1.0 - ema_decay) + if (d_step + 1) % 8 == 0 or d_step == 0: + log0( + f"distill:step:{d_step + 1}/{args.distill_steps} " + f"kl:{kl_loss.item():.4f} ce:{ce_loss.item():.4f} total:{loss.item():.4f}" + ) + del teacher_model, compiled_teacher_logits + torch.cuda.empty_cache() + log0("distill:done") + # Apply EMA weights (better than SWA alone per PR#401) + log0("ema:applying EMA weights") + current_state = base_model.state_dict() + avg_state = {name: t.to(dtype=current_state[name].dtype) for name, t in ema_state.items()} + base_model.load_state_dict(avg_state, strict=True) + torch.cuda.synchronize() + t_diag = time.perf_counter() + diag_val_loss, diag_val_bpb = eval_val( + args, compiled_model, rank, world_size, device, grad_accum_steps, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + ) + torch.cuda.synchronize() + log0( + f"DIAGNOSTIC post_ema val_loss:{diag_val_loss:.4f} val_bpb:{diag_val_bpb:.4f} " + f"eval_time:{1000.0 * (time.perf_counter() - t_diag):.0f}ms" + ) + full_state_dict = base_model.state_dict() + export_sd = {k: v for k, v in full_state_dict.items() if "mtp_heads" not in k} + excluded_mtp = sum(int(t.numel()) for k, t in full_state_dict.items() if "mtp_heads" in k) + if excluded_mtp > 0: + log0(f"export_excluding_mtp_params:{excluded_mtp}") + if master_process: + torch.save(export_sd, "final_model.pt") + model_bytes = os.path.getsize("final_model.pt") + code_bytes = len(code.encode("utf-8")) + log0(f"Serialized model: {model_bytes} bytes") + log0(f"Code size: {code_bytes} bytes") + sd_cpu = {k: v.detach().cpu() for k, v in export_sd.items()} + # GPTQ quantization using Hessians collected during training phase (no training data access here) + quant_result, quant_meta = mixed_quantize_int6_gptq(sd_cpu, {"mlp", "attn", "aux"}, gptq_hessians) + quant_buf = io.BytesIO() + torch.save({"w": quant_result, "m": quant_meta}, quant_buf) + quant_raw = quant_buf.getvalue() + quant_blob = zstandard.ZstdCompressor(level=22).compress(quant_raw) if _COMPRESSOR == "zstd" else zlib.compress(quant_raw, 9) + if master_process: + with open("final_model.int6.ptz", "wb") as f: + f.write(quant_blob) + quant_file_bytes = len(quant_blob) + code_bytes = len(code.encode("utf-8")) + log0(f"Serialized model int6+{_COMPRESSOR}: {quant_file_bytes} bytes") + log0(f"Total submission size int6+{_COMPRESSOR}: {quant_file_bytes + code_bytes} bytes") + log0(f"Total submission size int8+zlib: {quant_file_bytes + code_bytes} bytes") + if distributed: + dist.barrier() + with open("final_model.int6.ptz", "rb") as f: + quant_blob_disk = f.read() + quant_state = torch.load( + io.BytesIO(zstandard.ZstdDecompressor().decompress(quant_blob_disk) if _COMPRESSOR == "zstd" else zlib.decompress(quant_blob_disk)), + map_location="cpu", + ) + deq_state = dequantize_mixed_int6(quant_state["w"], quant_state["m"], sd_cpu) + eval_model = GPT( + vocab_size=args.vocab_size, num_layers=args.num_layers, model_dim=args.model_dim, + num_heads=args.num_heads, num_kv_heads=args.num_kv_heads, mlp_mult=args.mlp_mult, + tie_embeddings=args.tie_embeddings, tied_embed_init_std=args.tied_embed_init_std, + logit_softcap=args.logit_softcap, rope_base=args.rope_base, qk_gain_init=args.qk_gain_init, + mtp_num_heads=0, mtp_loss_weight=0.0, + bigram_vocab_size=args.bigram_vocab_size, bigram_dim=args.bigram_dim, + xsa_last_n=args.xsa_last_n, # must match training model + rope_dims=args.rope_dims, ln_scale=args.ln_scale, dtg=args.dtg_enabled, + ve_enabled=args.ve_enabled, ve_dim=args.ve_dim, ve_layers=args.ve_layers, + mlp_act=args.mlp_act, mlp_leaky_slope=args.mlp_leaky_slope, + f1_corr_rank=args.f1_corr_rank, f1_corr_scale_init=args.f1_corr_scale_init, + ).to(device).bfloat16() + for m in eval_model.modules(): + if isinstance(m, CastedLinear): + m.float() + restore_low_dim_params_to_fp32(eval_model) + eval_model.load_state_dict(deq_state, strict=True) + compiled_eval = maybe_torch_compile(eval_model, args) + torch.cuda.synchronize() + t_qeval = time.perf_counter() + q_val_loss, q_val_bpb = eval_val( + args, compiled_eval, rank, world_size, device, grad_accum_steps, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + eval_seq_len=effective_eval_seq_len, + ) + torch.cuda.synchronize() + log0( + f"final_int6_roundtrip val_loss:{q_val_loss:.4f} val_bpb:{q_val_bpb:.4f} " + f"eval_time:{1000.0 * (time.perf_counter() - t_qeval):.0f}ms" + ) + log0(f"final_int6_roundtrip_exact val_loss:{q_val_loss:.8f} val_bpb:{q_val_bpb:.8f}") + sw_seq_len = effective_eval_seq_len + if args.eval_stride > 0 and args.eval_stride < sw_seq_len: + torch.cuda.synchronize() + t_slide = time.perf_counter() + sw_val_loss, sw_val_bpb = eval_val_sliding( + args, eval_model, rank, world_size, device, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + stride=args.eval_stride, + eval_seq_len=sw_seq_len, + ) + torch.cuda.synchronize() + log0( + f"final_int6_sliding_window val_loss:{sw_val_loss:.4f} val_bpb:{sw_val_bpb:.4f} " + f"stride:{args.eval_stride} eval_time:{1000.0 * (time.perf_counter() - t_slide):.0f}ms" + ) + log0(f"final_int6_sliding_window_exact val_loss:{sw_val_loss:.8f} val_bpb:{sw_val_bpb:.8f}") + log0(f"final_int8_zlib_roundtrip_exact val_loss:{sw_val_loss:.8f} val_bpb:{sw_val_bpb:.8f}") + if args.ngram_eval_order >= 2: + if distributed: + dist.barrier() + torch.cuda.synchronize() + t_ng = time.perf_counter() + ng_loss, ng_bpb, ng_coverage = eval_val_sliding_hashed_ngram( + args, + eval_model, + rank, + world_size, + device, + val_tokens, + base_bytes_lut, + has_leading_space_lut, + is_boundary_token_lut, + stride=args.eval_stride, + order=args.ngram_eval_order, + alpha=args.ngram_eval_alpha, + min_count=args.ngram_eval_min_count, + buckets=args.ngram_eval_buckets, + max_seconds=args.ngram_eval_max_seconds, + eval_seq_len=sw_seq_len, + ) + if rank == 0: + torch.cuda.synchronize() + ng_eval_ms = 1000.0 * (time.perf_counter() - t_ng) + if ng_coverage >= 0.999999: + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order} val_loss:{ng_loss:.4f} " + f"val_bpb:{ng_bpb:.4f} eval_time:{ng_eval_ms:.0f}ms" + ) + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order}_exact " + f"val_loss:{ng_loss:.8f} val_bpb:{ng_bpb:.8f}" + ) + else: + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order}_partial val_loss:{ng_loss:.4f} " + f"val_bpb:{ng_bpb:.4f} coverage:{ng_coverage:.4f} eval_time:{ng_eval_ms:.0f}ms" + ) + log0( + f"final_int6_sliding_window_ngram{args.ngram_eval_order}_partial_exact " + f"val_loss:{ng_loss:.8f} val_bpb:{ng_bpb:.8f} coverage:{ng_coverage:.8f}" + ) + if distributed: + dist.barrier() + if distributed: + dist.destroy_process_group() +if __name__ == "__main__": + main() diff --git a/records/track_10min_16mb/2026-03-26_XWING_Cubric3D_complementary_8xH100/train_seed1337.log b/records/track_10min_16mb/2026-03-26_XWING_Cubric3D_complementary_8xH100/train_seed1337.log new file mode 100644 index 000000000..b0fb6b721 --- /dev/null +++ b/records/track_10min_16mb/2026-03-26_XWING_Cubric3D_complementary_8xH100/train_seed1337.log @@ -0,0 +1,120 @@ +============================================ + X-WING YELLOW II — THE MONSTER + Seed: 1337 + 3D cubric: order × entropy × count (54 mults) + Complementary training: alpha=0.5 + Eval alpha: 0.20-0.75 | Orders: 2-9 +============================================ +W0326 04:14:59.751000 80264 torch/distributed/run.py:803] +W0326 04:14:59.751000 80264 torch/distributed/run.py:803] ***************************************** +W0326 04:14:59.751000 80264 torch/distributed/run.py:803] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0326 04:14:59.751000 80264 torch/distributed/run.py:803] ***************************************** +logs/e56d845e-02ab-479e-b2ab-f8d3603c41fd.txt +val_bpb:enabled tokenizer_kind=sentencepiece tokenizer_path=./data/tokenizers/fineweb_1024_bpe.model +train_loader:dataset:fineweb10B_sp1024 train_shards:80 +val_loader:shards pattern=./data/datasets/fineweb10B_sp1024/fineweb_val_*.bin tokens:62021632 +complementary_training:alpha=0.5 +model_params:26928220 +f1_corr:rank=0 params=0 est_int6_bytes~0 +mlp_act:leaky_relu_sq mlp_leaky_slope:0.5 +XSA:last_4 world_size:8 grad_accum_steps:1 +num_heads:8 num_kv_heads:4 embed_lr:0.035 matrix_lr:0.025 +train_batch_tokens:786432 train_seq_len:2048 iterations:20000 warmup_steps:20 max_wallclock_seconds:600.000 +compile:enabled=1 fullgraph=0 +seed:1337 +ngram_eval:order=9 alpha=0.3 min_count=2 buckets=8388608 +warmup_step:1/20 +warmup_step:2/20 +warmup_step:3/20 +warmup_step:4/20 +warmup_step:5/20 +warmup_step:6/20 +warmup_step:7/20 +warmup_step:8/20 +warmup_step:9/20 +warmup_step:10/20 +warmup_step:11/20 +warmup_step:12/20 +warmup_step:13/20 +warmup_step:14/20 +warmup_step:15/20 +warmup_step:16/20 +warmup_step:17/20 +warmup_step:18/20 +warmup_step:19/20 +warmup_step:20/20 +step:0/20000 val_loss:6.9317 val_bpb:4.1054 train_time:0ms step_avg:0.02ms +step:1/20000 train_loss:6.9343 train_time:146ms step_avg:146.05ms +step:2/20000 train_loss:8.6212 train_time:227ms step_avg:113.71ms +step:3/20000 train_loss:7.8209 train_time:313ms step_avg:104.29ms +step:4/20000 train_loss:7.1065 train_time:399ms step_avg:99.63ms +step:5/20000 train_loss:6.8530 train_time:484ms step_avg:96.85ms +step:6/20000 train_loss:6.7961 train_time:570ms step_avg:95.01ms +step:7/20000 train_loss:6.6785 train_time:656ms step_avg:93.66ms +step:8/20000 train_loss:6.5601 train_time:742ms step_avg:92.78ms +step:9/20000 train_loss:6.2554 train_time:827ms step_avg:91.94ms +step:10/20000 train_loss:5.9364 train_time:913ms step_avg:91.35ms +step:1000/20000 train_loss:2.2369 train_time:87837ms step_avg:87.84ms +step:2000/20000 train_loss:2.0293 train_time:175897ms step_avg:87.95ms +step:3000/20000 train_loss:2.1263 train_time:263850ms step_avg:87.95ms +step:4000/20000 train_loss:1.9381 train_time:351794ms step_avg:87.95ms +step:5000/20000 train_loss:2.0669 train_time:439694ms step_avg:87.94ms +late_qat:enabled step:5074 scale:0.4998 +step:6000/20000 train_loss:1.9070 train_time:527586ms step_avg:87.93ms +swa:start step:6200 +step:6822/20000 val_loss:1.9224 val_bpb:1.1386 train_time:600062ms step_avg:87.96ms +stopping_early: wallclock_cap train_time:600062ms step:6822/20000 +peak memory allocated: 20677 MiB reserved: 20718 MiB +gptq:calibrating with training data... +gptq:calibrated 68 layers in 3.4s +ema:applying EMA weights +DIAGNOSTIC post_ema val_loss:1.9208 val_bpb:1.1376 eval_time:2141ms +Serialized model: 106047497 bytes +Code size: 104697 bytes +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +Serialized model int6+zstd: 15476742 bytes +Total submission size int6+zstd: 15581439 bytes +Total submission size int8+zlib: 15581439 bytes +final_int6_roundtrip val_loss:1.9302 val_bpb:1.1432 eval_time:36988ms +final_int6_roundtrip_exact val_loss:1.93020559 val_bpb:1.14317647 +final_int6_sliding_window val_loss:1.8904 val_bpb:1.1196 stride:64 eval_time:96124ms +final_int6_sliding_window_exact val_loss:1.89044071 val_bpb:1.11962844 +final_int8_zlib_roundtrip_exact val_loss:1.89044071 val_bpb:1.11962844 +ngram_eval:chunks=60 chunk_tokens=1048576 windows=969088 shared_tables=True +ngram_eval:chunk [1/60] bpb=1.132337 t=15s +ngram_eval:chunk [2/60] bpb=1.166917 t=19s +ngram_eval:chunk [3/60] bpb=1.169450 t=23s +cubric3d:step=8 o2:avg=0.42 o3:avg=0.30 o4:avg=0.45 o5:avg=1.91 o6:avg=1.94 o7:avg=1.90 o8:avg=1.92 o9:avg=1.95 +ngram_eval:chunk [11/60] bpb=1.045194 t=51s +cubric3d:step=16 o2:avg=0.39 o3:avg=0.30 o4:avg=0.45 o5:avg=1.80 o6:avg=1.78 o7:avg=1.79 o8:avg=1.82 o9:avg=1.87 +ngram_eval:chunk [21/60] bpb=0.812261 t=83s +cubric3d:step=24 o2:avg=0.39 o3:avg=0.30 o4:avg=0.46 o5:avg=1.70 o6:avg=1.69 o7:avg=1.69 o8:avg=1.72 o9:avg=1.74 +ngram_eval:chunk [31/60] bpb=0.667249 t=111s +cubric3d:step=32 o2:avg=0.39 o3:avg=0.30 o4:avg=0.46 o5:avg=1.60 o6:avg=1.64 o7:avg=1.66 o8:avg=1.64 o9:avg=1.65 +cubric3d:step=40 o2:avg=0.39 o3:avg=0.30 o4:avg=0.46 o5:avg=1.59 o6:avg=1.63 o7:avg=1.63 o8:avg=1.60 o9:avg=1.58 +ngram_eval:chunk [41/60] bpb=0.574788 t=137s +cubric3d:step=48 o2:avg=0.39 o3:avg=0.30 o4:avg=0.46 o5:avg=1.59 o6:avg=1.62 o7:avg=1.63 o8:avg=1.60 o9:avg=1.56 +ngram_eval:chunk [51/60] bpb=0.515862 t=164s +cubric3d:step=56 o2:avg=0.39 o3:avg=0.30 o4:avg=0.46 o5:avg=1.59 o6:avg=1.62 o7:avg=1.62 o8:avg=1.60 o9:avg=1.51 +ngram_eval:chunk [60/60] bpb=0.481395 t=197s +cubric3d:final c_steps=60 cells=9x8=72 + o2: [0.44 0.40 0.30 0.45 0.41 0.30 0.45 0.45 0.33] + o3: [0.30 0.30 0.30 0.30 0.30 0.30 0.32 0.30 0.30] + o4: [0.45 0.30 0.30 0.66 0.45 0.30 0.57 0.72 0.40] + o5: [1.67 0.90 0.91 1.94 1.94 0.99 2.00 2.00 2.00] + o6: [1.82 0.71 0.96 2.00 1.94 1.16 2.00 2.00 2.00] + o7: [1.66 0.45 1.05 2.00 2.00 1.39 2.00 2.00 2.00] + o8: [2.00 0.37 0.75 2.00 2.00 1.19 2.00 2.00 2.00] + o9: [2.00 0.40 0.52 2.00 2.00 0.51 2.00 2.00 2.00] +final_int6_sliding_window_ngram9 val_loss:0.8134 val_bpb:0.4818 eval_time:201850ms +final_int6_sliding_window_ngram9_exact val_loss:0.81344271 val_bpb:0.48176787 +============================================ + DONE +============================================ diff --git a/records/track_10min_16mb/2026-03-26_XWING_Cubric3D_complementary_8xH100/train_seed1337_yellowII_reference.log b/records/track_10min_16mb/2026-03-26_XWING_Cubric3D_complementary_8xH100/train_seed1337_yellowII_reference.log new file mode 100644 index 000000000..9b0cd56f2 --- /dev/null +++ b/records/track_10min_16mb/2026-03-26_XWING_Cubric3D_complementary_8xH100/train_seed1337_yellowII_reference.log @@ -0,0 +1,45 @@ +============================================ + REFERENCE: Yellow II (no warm-start) seed 1337 = 0.4896 BPB + This is NOT the submission variant. Included for ablation reference. +============================================ +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +Serialized model int6+zstd: 15632349 bytes +Total submission size int6+zstd: 15736871 bytes +Total submission size int8+zlib: 15736871 bytes +final_int6_roundtrip val_loss:1.9306 val_bpb:1.1434 eval_time:6856ms +final_int6_roundtrip_exact val_loss:1.93055044 val_bpb:1.14338071 +final_int6_sliding_window val_loss:1.8905 val_bpb:1.1197 stride:64 eval_time:74718ms +final_int6_sliding_window_exact val_loss:1.89054804 val_bpb:1.11969200 +final_int8_zlib_roundtrip_exact val_loss:1.89054804 val_bpb:1.11969200 +ngram_eval:chunks=60 chunk_tokens=1048576 windows=969088 shared_tables=True +ngram_eval:chunk [1/60] bpb=1.129854 t=4s +ngram_eval:chunk [2/60] bpb=1.188448 t=8s +ngram_eval:chunk [3/60] bpb=1.184841 t=11s +cubric3d:step=8 o2:avg=0.93 o3:avg=0.85 o4:avg=0.98 o5:avg=1.03 o6:avg=1.05 o7:avg=1.04 o8:avg=1.04 o9:avg=1.07 +ngram_eval:chunk [11/60] bpb=1.029792 t=39s +cubric3d:step=16 o2:avg=0.87 o3:avg=0.69 o4:avg=0.97 o5:avg=1.11 o6:avg=1.13 o7:avg=1.13 o8:avg=1.13 o9:avg=1.17 +ngram_eval:chunk [21/60] bpb=0.806964 t=70s +cubric3d:step=24 o2:avg=0.86 o3:avg=0.62 o4:avg=0.96 o5:avg=1.23 o6:avg=1.27 o7:avg=1.25 o8:avg=1.27 o9:avg=1.29 +ngram_eval:chunk [31/60] bpb=0.667829 t=99s +cubric3d:step=32 o2:avg=0.86 o3:avg=0.62 o4:avg=0.94 o5:avg=1.25 o6:avg=1.33 o7:avg=1.31 o8:avg=1.28 o9:avg=1.31 +cubric3d:step=40 o2:avg=0.86 o3:avg=0.62 o4:avg=0.94 o5:avg=1.25 o6:avg=1.33 o7:avg=1.29 o8:avg=1.26 o9:avg=1.28 +ngram_eval:chunk [41/60] bpb=0.579080 t=126s +cubric3d:step=48 o2:avg=0.86 o3:avg=0.62 o4:avg=0.94 o5:avg=1.25 o6:avg=1.33 o7:avg=1.29 o8:avg=1.26 o9:avg=1.26 +ngram_eval:chunk [51/60] bpb=0.522630 t=153s +cubric3d:step=56 o2:avg=0.86 o3:avg=0.62 o4:avg=0.94 o5:avg=1.25 o6:avg=1.33 o7:avg=1.29 o8:avg=1.29 o9:avg=1.28 +ngram_eval:chunk [60/60] bpb=0.488889 t=176s +cubric3d:final c_steps=60 cells=9x8=72 + o2: [0.97 0.91 0.60 1.00 0.91 0.61 1.00 1.00 0.72] + o3: [0.65 0.50 0.47 0.72 0.58 0.53 0.71 0.69 0.72] + o4: [0.97 0.47 0.48 1.47 0.86 0.53 1.23 1.60 0.83] + o5: [0.97 0.47 0.50 2.00 1.70 0.53 1.80 1.86 1.38] + o6: [1.02 0.39 0.48 2.00 2.00 0.63 2.00 2.00 1.43] + o7: [0.88 0.30 0.54 2.00 2.00 0.65 2.00 2.00 1.27] + o8: [1.29 0.30 0.36 2.00 2.00 0.69 2.00 2.00 1.03] + o9: [1.41 0.30 0.34 2.00 2.00 0.30 2.00 2.00 1.30] +final_int6_sliding_window_ngram9 val_loss:0.8267 val_bpb:0.4896 eval_time:182179ms +final_int6_sliding_window_ngram9_exact val_loss:0.82666522 val_bpb:0.48959900 +============================================ + DONE +============================================ diff --git a/records/track_10min_16mb/2026-03-26_XWING_Cubric3D_complementary_8xH100/train_seed300.log b/records/track_10min_16mb/2026-03-26_XWING_Cubric3D_complementary_8xH100/train_seed300.log new file mode 100644 index 000000000..59ecd1767 --- /dev/null +++ b/records/track_10min_16mb/2026-03-26_XWING_Cubric3D_complementary_8xH100/train_seed300.log @@ -0,0 +1,120 @@ +============================================ + X-WING YELLOW II — THE MONSTER + Seed: 300 + 3D cubric: order × entropy × count (54 mults) + Complementary training: alpha=0.5 + Eval alpha: 0.20-0.75 | Orders: 2-9 +============================================ +W0326 04:40:46.217000 211893 torch/distributed/run.py:803] +W0326 04:40:46.217000 211893 torch/distributed/run.py:803] ***************************************** +W0326 04:40:46.217000 211893 torch/distributed/run.py:803] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0326 04:40:46.217000 211893 torch/distributed/run.py:803] ***************************************** +logs/1c1f9bfa-928e-4bf9-ac68-3871d8996883.txt +val_bpb:enabled tokenizer_kind=sentencepiece tokenizer_path=./data/tokenizers/fineweb_1024_bpe.model +train_loader:dataset:fineweb10B_sp1024 train_shards:80 +val_loader:shards pattern=./data/datasets/fineweb10B_sp1024/fineweb_val_*.bin tokens:62021632 +complementary_training:alpha=0.5 +model_params:26928220 +f1_corr:rank=0 params=0 est_int6_bytes~0 +mlp_act:leaky_relu_sq mlp_leaky_slope:0.5 +XSA:last_4 world_size:8 grad_accum_steps:1 +num_heads:8 num_kv_heads:4 embed_lr:0.035 matrix_lr:0.025 +train_batch_tokens:786432 train_seq_len:2048 iterations:20000 warmup_steps:20 max_wallclock_seconds:600.000 +compile:enabled=1 fullgraph=0 +seed:300 +ngram_eval:order=9 alpha=0.3 min_count=2 buckets=8388608 +warmup_step:1/20 +warmup_step:2/20 +warmup_step:3/20 +warmup_step:4/20 +warmup_step:5/20 +warmup_step:6/20 +warmup_step:7/20 +warmup_step:8/20 +warmup_step:9/20 +warmup_step:10/20 +warmup_step:11/20 +warmup_step:12/20 +warmup_step:13/20 +warmup_step:14/20 +warmup_step:15/20 +warmup_step:16/20 +warmup_step:17/20 +warmup_step:18/20 +warmup_step:19/20 +warmup_step:20/20 +step:0/20000 val_loss:6.9327 val_bpb:4.1059 train_time:0ms step_avg:0.02ms +step:1/20000 train_loss:6.9337 train_time:147ms step_avg:146.80ms +step:2/20000 train_loss:8.6739 train_time:230ms step_avg:114.91ms +step:3/20000 train_loss:7.8308 train_time:316ms step_avg:105.30ms +step:4/20000 train_loss:7.0679 train_time:402ms step_avg:100.54ms +step:5/20000 train_loss:6.8781 train_time:488ms step_avg:97.56ms +step:6/20000 train_loss:6.7646 train_time:575ms step_avg:95.77ms +step:7/20000 train_loss:6.6175 train_time:660ms step_avg:94.33ms +step:8/20000 train_loss:6.5525 train_time:746ms step_avg:93.22ms +step:9/20000 train_loss:6.2961 train_time:832ms step_avg:92.40ms +step:10/20000 train_loss:5.9846 train_time:917ms step_avg:91.75ms +step:1000/20000 train_loss:2.2309 train_time:87923ms step_avg:87.92ms +step:2000/20000 train_loss:2.0271 train_time:176004ms step_avg:88.00ms +step:3000/20000 train_loss:2.1235 train_time:264103ms step_avg:88.03ms +step:4000/20000 train_loss:1.9370 train_time:352169ms step_avg:88.04ms +step:5000/20000 train_loss:2.0637 train_time:440259ms step_avg:88.05ms +late_qat:enabled step:5065 scale:0.4999 +step:6000/20000 train_loss:1.9062 train_time:528222ms step_avg:88.04ms +swa:start step:6200 +step:6814/20000 val_loss:1.9223 val_bpb:1.1385 train_time:600073ms step_avg:88.06ms +stopping_early: wallclock_cap train_time:600073ms step:6814/20000 +peak memory allocated: 20677 MiB reserved: 20716 MiB +gptq:calibrating with training data... +gptq:calibrated 68 layers in 3.5s +ema:applying EMA weights +DIAGNOSTIC post_ema val_loss:1.9207 val_bpb:1.1375 eval_time:2075ms +Serialized model: 106047497 bytes +Code size: 104697 bytes +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +Serialized model int6+zstd: 15555233 bytes +Total submission size int6+zstd: 15659930 bytes +Total submission size int8+zlib: 15659930 bytes +final_int6_roundtrip val_loss:1.9303 val_bpb:1.1432 eval_time:37052ms +final_int6_roundtrip_exact val_loss:1.93031471 val_bpb:1.14324110 +final_int6_sliding_window val_loss:1.8903 val_bpb:1.1196 stride:64 eval_time:95816ms +final_int6_sliding_window_exact val_loss:1.89033012 val_bpb:1.11956294 +final_int8_zlib_roundtrip_exact val_loss:1.89033012 val_bpb:1.11956294 +ngram_eval:chunks=60 chunk_tokens=1048576 windows=969088 shared_tables=True +ngram_eval:chunk [1/60] bpb=1.129515 t=15s +ngram_eval:chunk [2/60] bpb=1.165073 t=19s +ngram_eval:chunk [3/60] bpb=1.167624 t=23s +cubric3d:step=8 o2:avg=0.42 o3:avg=0.30 o4:avg=0.44 o5:avg=1.91 o6:avg=1.93 o7:avg=1.92 o8:avg=1.91 o9:avg=1.95 +ngram_eval:chunk [11/60] bpb=1.044160 t=51s +cubric3d:step=16 o2:avg=0.39 o3:avg=0.30 o4:avg=0.44 o5:avg=1.80 o6:avg=1.80 o7:avg=1.80 o8:avg=1.80 o9:avg=1.85 +ngram_eval:chunk [21/60] bpb=0.811698 t=83s +cubric3d:step=24 o2:avg=0.39 o3:avg=0.30 o4:avg=0.46 o5:avg=1.70 o6:avg=1.71 o7:avg=1.70 o8:avg=1.71 o9:avg=1.74 +ngram_eval:chunk [31/60] bpb=0.666677 t=112s +cubric3d:step=32 o2:avg=0.39 o3:avg=0.30 o4:avg=0.45 o5:avg=1.65 o6:avg=1.68 o7:avg=1.66 o8:avg=1.64 o9:avg=1.65 +cubric3d:step=40 o2:avg=0.39 o3:avg=0.30 o4:avg=0.45 o5:avg=1.64 o6:avg=1.67 o7:avg=1.64 o8:avg=1.59 o9:avg=1.59 +ngram_eval:chunk [41/60] bpb=0.574203 t=139s +cubric3d:step=48 o2:avg=0.39 o3:avg=0.30 o4:avg=0.45 o5:avg=1.64 o6:avg=1.67 o7:avg=1.65 o8:avg=1.60 o9:avg=1.54 +ngram_eval:chunk [51/60] bpb=0.515402 t=165s +cubric3d:step=56 o2:avg=0.39 o3:avg=0.30 o4:avg=0.45 o5:avg=1.64 o6:avg=1.67 o7:avg=1.64 o8:avg=1.60 o9:avg=1.51 +ngram_eval:chunk [60/60] bpb=0.481137 t=199s +cubric3d:final c_steps=60 cells=9x8=72 + o2: [0.44 0.40 0.30 0.45 0.42 0.30 0.45 0.45 0.34] + o3: [0.30 0.30 0.30 0.30 0.30 0.30 0.31 0.30 0.30] + o4: [0.46 0.30 0.30 0.66 0.42 0.30 0.51 0.70 0.41] + o5: [1.87 0.88 0.91 2.00 1.94 1.15 2.00 2.00 2.00] + o6: [1.94 0.73 0.96 2.00 2.00 1.39 2.00 2.00 2.00] + o7: [1.87 0.44 1.05 2.00 2.00 1.39 2.00 2.00 2.00] + o8: [2.00 0.36 0.71 2.00 2.00 1.26 2.00 2.00 2.00] + o9: [2.00 0.40 0.49 2.00 2.00 0.51 2.00 2.00 2.00] +final_int6_sliding_window_ngram9 val_loss:0.8140 val_bpb:0.4821 eval_time:204025ms +final_int6_sliding_window_ngram9_exact val_loss:0.81402600 val_bpb:0.48211332 +============================================ + DONE +============================================ diff --git a/records/track_10min_16mb/2026-03-26_XWING_Cubric3D_complementary_8xH100/train_seed58.log b/records/track_10min_16mb/2026-03-26_XWING_Cubric3D_complementary_8xH100/train_seed58.log new file mode 100644 index 000000000..ae29eeb6a --- /dev/null +++ b/records/track_10min_16mb/2026-03-26_XWING_Cubric3D_complementary_8xH100/train_seed58.log @@ -0,0 +1,120 @@ +============================================ + X-WING YELLOW II — THE MONSTER + Seed: 58 + 3D cubric: order × entropy × count (54 mults) + Complementary training: alpha=0.5 + Eval alpha: 0.20-0.75 | Orders: 2-9 +============================================ +W0326 05:01:36.516000 289626 torch/distributed/run.py:803] +W0326 05:01:36.516000 289626 torch/distributed/run.py:803] ***************************************** +W0326 05:01:36.516000 289626 torch/distributed/run.py:803] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W0326 05:01:36.516000 289626 torch/distributed/run.py:803] ***************************************** +logs/5f9c0078-55b3-41d5-983b-931ec0d64466.txt +val_bpb:enabled tokenizer_kind=sentencepiece tokenizer_path=./data/tokenizers/fineweb_1024_bpe.model +train_loader:dataset:fineweb10B_sp1024 train_shards:80 +val_loader:shards pattern=./data/datasets/fineweb10B_sp1024/fineweb_val_*.bin tokens:62021632 +complementary_training:alpha=0.5 +model_params:26928220 +f1_corr:rank=0 params=0 est_int6_bytes~0 +mlp_act:leaky_relu_sq mlp_leaky_slope:0.5 +XSA:last_4 world_size:8 grad_accum_steps:1 +num_heads:8 num_kv_heads:4 embed_lr:0.035 matrix_lr:0.025 +train_batch_tokens:786432 train_seq_len:2048 iterations:20000 warmup_steps:20 max_wallclock_seconds:600.000 +compile:enabled=1 fullgraph=0 +seed:58 +ngram_eval:order=9 alpha=0.3 min_count=2 buckets=8388608 +warmup_step:1/20 +warmup_step:2/20 +warmup_step:3/20 +warmup_step:4/20 +warmup_step:5/20 +warmup_step:6/20 +warmup_step:7/20 +warmup_step:8/20 +warmup_step:9/20 +warmup_step:10/20 +warmup_step:11/20 +warmup_step:12/20 +warmup_step:13/20 +warmup_step:14/20 +warmup_step:15/20 +warmup_step:16/20 +warmup_step:17/20 +warmup_step:18/20 +warmup_step:19/20 +warmup_step:20/20 +step:0/20000 val_loss:6.9292 val_bpb:4.1038 train_time:0ms step_avg:0.02ms +step:1/20000 train_loss:6.9323 train_time:150ms step_avg:149.98ms +step:2/20000 train_loss:8.5353 train_time:232ms step_avg:115.83ms +step:3/20000 train_loss:7.7696 train_time:318ms step_avg:105.98ms +step:4/20000 train_loss:7.1228 train_time:404ms step_avg:100.94ms +step:5/20000 train_loss:6.8956 train_time:490ms step_avg:97.91ms +step:6/20000 train_loss:6.7754 train_time:575ms step_avg:95.79ms +step:7/20000 train_loss:6.6672 train_time:660ms step_avg:94.35ms +step:8/20000 train_loss:6.5588 train_time:746ms step_avg:93.26ms +step:9/20000 train_loss:6.2502 train_time:832ms step_avg:92.43ms +step:10/20000 train_loss:5.9694 train_time:917ms step_avg:91.75ms +step:1000/20000 train_loss:2.2401 train_time:87780ms step_avg:87.78ms +step:2000/20000 train_loss:2.0342 train_time:175741ms step_avg:87.87ms +step:3000/20000 train_loss:2.1263 train_time:263719ms step_avg:87.91ms +step:4000/20000 train_loss:1.9394 train_time:351634ms step_avg:87.91ms +step:5000/20000 train_loss:2.0677 train_time:439616ms step_avg:87.92ms +late_qat:enabled step:5075 scale:0.4999 +step:6000/20000 train_loss:1.9068 train_time:527519ms step_avg:87.92ms +swa:start step:6200 +step:6822/20000 val_loss:1.9241 val_bpb:1.1396 train_time:600033ms step_avg:87.96ms +stopping_early: wallclock_cap train_time:600033ms step:6822/20000 +peak memory allocated: 20677 MiB reserved: 20716 MiB +gptq:calibrating with training data... +gptq:calibrated 68 layers in 3.4s +ema:applying EMA weights +DIAGNOSTIC post_ema val_loss:1.9225 val_bpb:1.1386 eval_time:2218ms +Serialized model: 106047497 bytes +Code size: 104697 bytes +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +gptq_quantize: 66 GPTQ layers, 0 naive layers +Serialized model int6+zstd: 15489292 bytes +Total submission size int6+zstd: 15593989 bytes +Total submission size int8+zlib: 15593989 bytes +final_int6_roundtrip val_loss:1.9320 val_bpb:1.1442 eval_time:36972ms +final_int6_roundtrip_exact val_loss:1.93201278 val_bpb:1.14424679 +final_int6_sliding_window val_loss:1.8921 val_bpb:1.1206 stride:64 eval_time:96025ms +final_int6_sliding_window_exact val_loss:1.89209603 val_bpb:1.12060881 +final_int8_zlib_roundtrip_exact val_loss:1.89209603 val_bpb:1.12060881 +ngram_eval:chunks=60 chunk_tokens=1048576 windows=969088 shared_tables=True +ngram_eval:chunk [1/60] bpb=1.131711 t=15s +ngram_eval:chunk [2/60] bpb=1.166999 t=19s +ngram_eval:chunk [3/60] bpb=1.169187 t=22s +cubric3d:step=8 o2:avg=0.42 o3:avg=0.30 o4:avg=0.45 o5:avg=1.92 o6:avg=1.94 o7:avg=1.91 o8:avg=1.91 o9:avg=1.96 +ngram_eval:chunk [11/60] bpb=1.045790 t=51s +cubric3d:step=16 o2:avg=0.39 o3:avg=0.30 o4:avg=0.45 o5:avg=1.80 o6:avg=1.78 o7:avg=1.81 o8:avg=1.78 o9:avg=1.87 +ngram_eval:chunk [21/60] bpb=0.812881 t=83s +cubric3d:step=24 o2:avg=0.39 o3:avg=0.30 o4:avg=0.47 o5:avg=1.70 o6:avg=1.69 o7:avg=1.71 o8:avg=1.69 o9:avg=1.76 +ngram_eval:chunk [31/60] bpb=0.667590 t=111s +cubric3d:step=32 o2:avg=0.39 o3:avg=0.30 o4:avg=0.47 o5:avg=1.62 o6:avg=1.65 o7:avg=1.65 o8:avg=1.62 o9:avg=1.66 +cubric3d:step=40 o2:avg=0.39 o3:avg=0.30 o4:avg=0.47 o5:avg=1.61 o6:avg=1.65 o7:avg=1.62 o8:avg=1.58 o9:avg=1.58 +ngram_eval:chunk [41/60] bpb=0.574991 t=138s +cubric3d:step=48 o2:avg=0.39 o3:avg=0.30 o4:avg=0.47 o5:avg=1.61 o6:avg=1.64 o7:avg=1.63 o8:avg=1.59 o9:avg=1.55 +ngram_eval:chunk [51/60] bpb=0.515968 t=164s +cubric3d:step=56 o2:avg=0.39 o3:avg=0.30 o4:avg=0.47 o5:avg=1.61 o6:avg=1.64 o7:avg=1.62 o8:avg=1.59 o9:avg=1.51 +ngram_eval:chunk [60/60] bpb=0.481474 t=197s +cubric3d:final c_steps=60 cells=9x8=72 + o2: [0.44 0.41 0.30 0.45 0.41 0.30 0.45 0.45 0.33] + o3: [0.30 0.30 0.30 0.30 0.30 0.30 0.31 0.30 0.30] + o4: [0.45 0.30 0.30 0.72 0.41 0.30 0.59 0.70 0.46] + o5: [1.76 0.88 0.88 1.88 2.00 1.09 2.00 2.00 2.00] + o6: [1.87 0.71 0.96 2.00 2.00 1.23 2.00 2.00 2.00] + o7: [1.66 0.46 1.05 2.00 2.00 1.39 2.00 2.00 2.00] + o8: [2.00 0.36 0.73 2.00 2.00 1.15 2.00 2.00 2.00] + o9: [2.00 0.40 0.54 2.00 2.00 0.49 2.00 2.00 2.00] +final_int6_sliding_window_ngram9 val_loss:0.8140 val_bpb:0.4821 eval_time:203420ms +final_int6_sliding_window_ngram9_exact val_loss:0.81396160 val_bpb:0.48207518 +============================================ + DONE +============================================