Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
4ce0d59
X-WING 3D Cubric: 0.4820 BPB (3-seed mean, std 0.0002)
Mar 26, 2026
6c49da3
B-wing lab: port PR #809 n-gram techniques onto X-WING base
Mar 26, 2026
bee0716
B-wing II: cubric ON + entropy shift + fast TTT
Mar 26, 2026
d6d281a
B-wing III: LoRA TTT from #809 + cubric ON + all n-gram fixes
Mar 26, 2026
137432f
Record bwing_full_port seed 1337: 0.4512 BPB
Mar 26, 2026
94bb107
Replace bwing_III with copy of SOTA bwing_full_port (0.4512 BPB)
Mar 26, 2026
2c0c0ee
B-wing IV + V: fix 7→9 hash primes (order 8-9 collision bug)
Mar 26, 2026
3ebaf38
Add B-wing pod setup script (FA3 + zstandard + sp1024)
Mar 26, 2026
5a21365
Add n-gram parameter grid sweep for bwing_V
Mar 26, 2026
75dbe40
A-Wing Green: INT5 GPTQ (clip_range=15) + 9-prime hash fix
Mar 26, 2026
22eae2a
A-Wing Green: strip TTT, cubric, F1 correction, distillation
Mar 26, 2026
d6cb709
Record results: A-Wing Green 0.4576, bwing_V 0.4601
Mar 26, 2026
c37a8ab
A-Wing Green_1: Oracle Alpha — use model_p vs ngram_p directly
Mar 26, 2026
08d6b7c
Green_1: cap training at 570s to fit GPTQ in 600s budget
Mar 26, 2026
d8b6022
Green_1: add preflight checks (zstd, FA3) + zstd import warning
Mar 26, 2026
b1d45b8
A-Wing Green_2: Oracle Alpha + LoRA TTT + 9-Prime
Mar 26, 2026
88ec4ca
Fix pod setup: use system Python, no conda/PYTHONPATH hacks
Mar 26, 2026
5876cf5
NEW SOTA 0.3200 BPB: A-Wing Green_1 Oracle Alpha + 9-Prime
Mar 26, 2026
da832ba
A-Wing Purple: Learned Mixer Head for legal n-gram ceiling
Mar 26, 2026
2b38218
Add pod_launch.sh: one command for clone + setup + run
Mar 26, 2026
a37d7c3
Fix pod_launch.sh: pull from private repo (fork1), not public
Mar 26, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions experiments/A_wing/green/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/bin/bash
set -euo pipefail
# A-WING GREEN: INT5 GPTQ (clip_range=15 vs INT6 clip_range=31)
# Base: bwing_IV (9-prime fix + fixed mults + entropy shift)
# Theory: more quant noise → higher entropy → n-gram rescues harder (#809 uses INT5)
#
# Env overrides: SEED (default 1337), NPROC_PER_NODE (default 8).

SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd -- "${SCRIPT_DIR}/../../.." && pwd)"
cd "${REPO_ROOT}"
export PYTHONPATH="${REPO_ROOT}/flash-attention/hopper:${PYTHONPATH:-}"

SEED="${SEED:-1337}"
NPROC_PER_NODE="${NPROC_PER_NODE:-8}"

# Ensure the tee target directory exists: on a fresh checkout "logs/" is
# missing, tee fails, and under `set -o pipefail` that aborts the whole run.
LOG_DIR="${REPO_ROOT}/logs"
mkdir -p -- "${LOG_DIR}"
LOG_FILE="${LOG_DIR}/awing_green_s${SEED}_$(date +%Y%m%d_%H%M%S).log"

echo "============================================"
echo " A-WING GREEN — INT5 GPTQ + 9-Prime"
echo " Seed: ${SEED}"
echo " GPTQ INT5 (clip_range=15), 9 hash primes"
echo " Fixed mults + entropy shift, no cubric"
echo "============================================"

# All hyperparameters are passed as environment variables to train_gpt.py.
SEED="$SEED" \
F1_CORR_RANK=0 \
DISTILL_ENABLED=0 \
MLP_ACT=leaky_relu_sq \
MLP_LEAKY_SLOPE=0.5 \
XSA_LAST_N=4 \
BIGRAM_VOCAB_SIZE=1536 \
TTT_EVAL_ENABLED=0 \
ROPE_DIMS=24 \
VAL_LOSS_EVERY=20000 \
TRAIN_LOG_EVERY=1000 \
SWA_EVERY=100 \
COMPLEMENT_ALPHA=0.5 \
NGRAM_EVAL_ORDER=9 \
NGRAM_EVAL_MIN_ORDER=2 \
NGRAM_EVAL_ADAPTIVE=1 \
NGRAM_EVAL_ALPHA=0.30 \
NGRAM_EVAL_ALPHA_MIN=0.05 \
NGRAM_EVAL_ALPHA_MAX=0.60 \
NGRAM_EVAL_ENTROPY_CENTER=3.0 \
NGRAM_EVAL_ENTROPY_SCALE=2.0 \
NGRAM_EVAL_MIN_COUNT=2 \
NGRAM_EVAL_BUCKETS=8388608 \
NGRAM_EVAL_MAX_SECONDS=0 \
CUBRIC_CADENCE=0 \
NGRAM_ENTROPY_SHIFT=1 \
NGRAM_ORDER_MULTS="0.3,0.3,0.97,2.0,2.0,2.0,2.0,2.0" \
COMPILE_FULLGRAPH=0 \
torchrun --standalone --nproc_per_node="${NPROC_PER_NODE}" \
"${SCRIPT_DIR}/train_gpt.py" \
2>&1 | tee "${LOG_FILE}"

echo "============================================"
echo " DONE"
echo "============================================"
1,936 changes: 1,936 additions & 0 deletions experiments/A_wing/green/train_gpt.py

Large diffs are not rendered by default.

103 changes: 103 additions & 0 deletions experiments/A_wing/green/train_seed1337.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
============================================
A-WING GREEN — INT5 GPTQ + 9-Prime
Seed: 1337
GPTQ INT5 (clip_range=15), 9 hash primes
Fixed mults + entropy shift, no cubric
============================================
W0326 07:30:47.033000 2016 torch/distributed/run.py:803]
W0326 07:30:47.033000 2016 torch/distributed/run.py:803] *****************************************
W0326 07:30:47.033000 2016 torch/distributed/run.py:803] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
W0326 07:30:47.033000 2016 torch/distributed/run.py:803] *****************************************
logs/dff55565-90ac-4982-824c-0cb07ccacd65.txt
val_bpb:enabled tokenizer_kind=sentencepiece tokenizer_path=./data/tokenizers/fineweb_1024_bpe.model
train_loader:dataset:fineweb10B_sp1024 train_shards:80
val_loader:shards pattern=./data/datasets/fineweb10B_sp1024/fineweb_val_*.bin tokens:62021632
complementary_training:alpha=0.5
model_params:26928220
f1_corr:rank=0 params=0 est_int6_bytes~0
mlp_act:leaky_relu_sq mlp_leaky_slope:0.5
XSA:last_4 world_size:8 grad_accum_steps:1
num_heads:8 num_kv_heads:4 embed_lr:0.035 matrix_lr:0.025
train_batch_tokens:786432 train_seq_len:2048 iterations:20000 warmup_steps:20 max_wallclock_seconds:600.000
compile:enabled=1 fullgraph=0
seed:1337
ngram_eval:order=9 alpha=0.3 min_count=2 buckets=8388608
warmup_step:1/20
warmup_step:2/20
warmup_step:3/20
warmup_step:4/20
warmup_step:5/20
warmup_step:6/20
warmup_step:7/20
warmup_step:8/20
warmup_step:9/20
warmup_step:10/20
warmup_step:11/20
warmup_step:12/20
warmup_step:13/20
warmup_step:14/20
warmup_step:15/20
warmup_step:16/20
warmup_step:17/20
warmup_step:18/20
warmup_step:19/20
warmup_step:20/20
step:0/20000 val_loss:6.9317 val_bpb:4.1054 train_time:0ms step_avg:0.02ms
step:1/20000 train_loss:6.9343 train_time:143ms step_avg:143.41ms
step:2/20000 train_loss:8.6212 train_time:226ms step_avg:113.04ms
step:3/20000 train_loss:7.8209 train_time:312ms step_avg:104.12ms
step:4/20000 train_loss:7.1064 train_time:398ms step_avg:99.50ms
step:5/20000 train_loss:6.8529 train_time:485ms step_avg:96.90ms
step:6/20000 train_loss:6.7961 train_time:570ms step_avg:94.93ms
step:7/20000 train_loss:6.6784 train_time:656ms step_avg:93.68ms
step:8/20000 train_loss:6.5596 train_time:742ms step_avg:92.71ms
step:9/20000 train_loss:6.2552 train_time:827ms step_avg:91.94ms
step:10/20000 train_loss:5.9363 train_time:913ms step_avg:91.32ms
step:1000/20000 train_loss:2.2345 train_time:87847ms step_avg:87.85ms
step:2000/20000 train_loss:2.0285 train_time:175893ms step_avg:87.95ms
step:3000/20000 train_loss:2.1264 train_time:263985ms step_avg:87.99ms
step:4000/20000 train_loss:1.9367 train_time:352016ms step_avg:88.00ms
step:5000/20000 train_loss:2.0641 train_time:440120ms step_avg:88.02ms
late_qat:enabled step:5067 scale:0.4999
step:6000/20000 train_loss:1.9070 train_time:528137ms step_avg:88.02ms
swa:start step:6200
step:6814/20000 val_loss:1.9225 val_bpb:1.1386 train_time:600027ms step_avg:88.06ms
stopping_early: wallclock_cap train_time:600027ms step:6814/20000
peak memory allocated: 20677 MiB reserved: 20716 MiB
gptq:calibrating with training data...
gptq:calibrated 68 layers in 3.4s
ema:applying EMA weights
DIAGNOSTIC post_ema val_loss:1.9208 val_bpb:1.1376 eval_time:2240ms
Serialized model: 106047497 bytes
Code size: 106202 bytes
gptq_quantize: 66 GPTQ layers, 0 naive layers
gptq_quantize: 66 GPTQ layers, 0 naive layers
gptq_quantize: 66 GPTQ layers, 0 naive layers
gptq_quantize: 66 GPTQ layers, 0 naive layers
gptq_quantize: 66 GPTQ layers, 0 naive layers
gptq_quantize: 66 GPTQ layers, 0 naive layers
gptq_quantize: 66 GPTQ layers, 0 naive layers
gptq_quantize: 66 GPTQ layers, 0 naive layers
Serialized model int5+zlib: 13666914 bytes
Total submission size int5+zlib: 13773116 bytes
Total submission size int8+zlib: 13773116 bytes
final_int5_roundtrip val_loss:1.9689 val_bpb:1.1661 eval_time:37008ms
final_int5_roundtrip_exact val_loss:1.96888819 val_bpb:1.16608649
final_int5_sliding_window val_loss:1.9264 val_bpb:1.1410 stride:64 eval_time:96465ms
final_int5_sliding_window_exact val_loss:1.92644292 val_bpb:1.14095103
final_int8_zlib_roundtrip_exact val_loss:1.92644292 val_bpb:1.14095103
ngram_eval:chunks=60 chunk_tokens=1048576 windows=969088 shared_tables=True
ngram_eval:chunk [1/60] bpb=1.152801 t=15s
ngram_eval:chunk [2/60] bpb=1.232931 t=18s
ngram_eval:chunk [3/60] bpb=1.257240 t=21s
ngram_eval:chunk [11/60] bpb=1.168507 t=43s
ngram_eval:chunk [21/60] bpb=0.891224 t=69s
ngram_eval:chunk [31/60] bpb=0.705693 t=95s
ngram_eval:chunk [41/60] bpb=0.584660 t=119s
ngram_eval:chunk [51/60] bpb=0.505440 t=144s
ngram_eval:chunk [60/60] bpb=0.457581 t=176s
final_int5_sliding_window_ngram9 val_loss:0.7726 val_bpb:0.4576 eval_time:176713ms
final_int5_sliding_window_ngram9_exact val_loss:0.77264878 val_bpb:0.45760734
============================================
DONE
============================================
74 changes: 74 additions & 0 deletions experiments/A_wing/green_1/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#!/bin/bash
set -euo pipefail
# A-WING GREEN_1: Oracle Alpha + 9-Prime Hash Fix
# Instead of entropy-adaptive alpha, directly compare model_p vs ngram_p
# per token. Soft sigmoid on log-ratio (steepness=8), clip 0.95.
# Base: SOTA bwing_full_port (0.4512 BPB)
#
# Env overrides: SEED (default 1337), NPROC_PER_NODE (default 8).

SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd -- "${SCRIPT_DIR}/../../.." && pwd)"
cd "${REPO_ROOT}"
export PYTHONPATH="${REPO_ROOT}/flash-attention/hopper:${PYTHONPATH:-}"

SEED="${SEED:-1337}"
NPROC_PER_NODE="${NPROC_PER_NODE:-8}"

# --- Pre-flight checks ---
# zstandard is required for submission serialization; fail fast if missing.
echo "[preflight] checking zstandard..."
python3 -c "import zstandard; print(f' zstandard {zstandard.__version__} OK')" 2>/dev/null \
|| { echo " FATAL: zstandard not found. pip install zstandard"; exit 1; }

# Flash-attention is best-effort: warn (don't abort) if FA3 is not present.
echo "[preflight] checking flash_attn..."
python3 -c "
try:
import flash_attn_interface; print(' FA3 (hopper) OK')
except ImportError:
import flash_attn; v=flash_attn.__version__
if v.startswith('3'): print(f' FA3 v{v} OK')
else: print(f' WARNING: FA{v[0]} detected — want FA3')
" 2>/dev/null || echo " WARNING: no flash_attn found"

# Ensure the tee target directory exists: on a fresh checkout "logs/" is
# missing, tee fails, and under `set -o pipefail` that aborts the whole run.
LOG_DIR="${REPO_ROOT}/logs"
mkdir -p -- "${LOG_DIR}"
LOG_FILE="${LOG_DIR}/awing_green1_s${SEED}_$(date +%Y%m%d_%H%M%S).log"

echo "============================================"
echo " A-WING GREEN_1 — Oracle Alpha + 9-Prime"
echo " Seed: ${SEED}"
echo " Oracle: alpha = sigmoid(8 * log(ngram_p/model_p)) * 0.95"
echo " 9 hash primes, INT6, no cubric"
echo " Training cap: 570s (30s reserved for GPTQ)"
echo "============================================"

# All hyperparameters are passed as environment variables to train_gpt.py.
# MAX_WALLCLOCK_SECONDS=570 leaves 30s of the 600s budget for GPTQ.
SEED="$SEED" \
F1_CORR_RANK=0 \
DISTILL_ENABLED=0 \
MLP_ACT=leaky_relu_sq \
MLP_LEAKY_SLOPE=0.5 \
XSA_LAST_N=4 \
BIGRAM_VOCAB_SIZE=1536 \
TTT_EVAL_ENABLED=0 \
ROPE_DIMS=24 \
VAL_LOSS_EVERY=20000 \
TRAIN_LOG_EVERY=1000 \
SWA_EVERY=100 \
COMPLEMENT_ALPHA=0.5 \
NGRAM_EVAL_ORDER=9 \
NGRAM_EVAL_MIN_ORDER=2 \
NGRAM_EVAL_ADAPTIVE=1 \
NGRAM_EVAL_ALPHA=0.30 \
NGRAM_EVAL_ALPHA_MIN=0.05 \
NGRAM_EVAL_ALPHA_MAX=0.60 \
NGRAM_EVAL_ENTROPY_CENTER=3.0 \
NGRAM_EVAL_ENTROPY_SCALE=2.0 \
NGRAM_EVAL_MIN_COUNT=2 \
NGRAM_EVAL_BUCKETS=8388608 \
NGRAM_EVAL_MAX_SECONDS=0 \
CUBRIC_CADENCE=0 \
NGRAM_ENTROPY_SHIFT=1 \
NGRAM_ORDER_MULTS="0.3,0.3,0.97,2.0,2.0,2.0,2.0,2.0" \
MAX_WALLCLOCK_SECONDS=570 \
COMPILE_FULLGRAPH=0 \
torchrun --standalone --nproc_per_node="${NPROC_PER_NODE}" \
"${SCRIPT_DIR}/train_gpt.py" \
2>&1 | tee "${LOG_FILE}"

echo "============================================"
echo " DONE"
echo "============================================"
Loading