Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
4ce0d59
X-WING 3D Cubric: 0.4820 BPB (3-seed mean, std 0.0002)
Mar 26, 2026
6c49da3
B-wing lab: port PR #809 n-gram techniques onto X-WING base
Mar 26, 2026
bee0716
B-wing II: cubric ON + entropy shift + fast TTT
Mar 26, 2026
d6d281a
B-wing III: LoRA TTT from #809 + cubric ON + all n-gram fixes
Mar 26, 2026
137432f
Record bwing_full_port seed 1337: 0.4512 BPB
Mar 26, 2026
94bb107
Replace bwing_III with copy of SOTA bwing_full_port (0.4512 BPB)
Mar 26, 2026
2c0c0ee
B-wing IV + V: fix 7→9 hash primes (order 8-9 collision bug)
Mar 26, 2026
3ebaf38
Add B-wing pod setup script (FA3 + zstandard + sp1024)
Mar 26, 2026
5a21365
Add n-gram parameter grid sweep for bwing_V
Mar 26, 2026
75dbe40
A-Wing Green: INT5 GPTQ (clip_range=15) + 9-prime hash fix
Mar 26, 2026
22eae2a
A-Wing Green: strip TTT, cubric, F1 correction, distillation
Mar 26, 2026
d6cb709
Record results: A-Wing Green 0.4576, bwing_V 0.4601
Mar 26, 2026
c37a8ab
A-Wing Green_1: Oracle Alpha — use model_p vs ngram_p directly
Mar 26, 2026
08d6b7c
Green_1: cap training at 570s to fit GPTQ in 600s budget
Mar 26, 2026
d8b6022
Green_1: add preflight checks (zstd, FA3) + zstd import warning
Mar 26, 2026
b1d45b8
A-Wing Green_2: Oracle Alpha + LoRA TTT + 9-Prime
Mar 26, 2026
88ec4ca
Fix pod setup: use system Python, no conda/PYTHONPATH hacks
Mar 26, 2026
5876cf5
NEW SOTA 0.3200 BPB: A-Wing Green_1 Oracle Alpha + 9-Prime
Mar 26, 2026
da832ba
A-Wing Purple: Learned Mixer Head for legal n-gram ceiling
Mar 26, 2026
2b38218
Add pod_launch.sh: one command for clone + setup + run
Mar 26, 2026
a37d7c3
Fix pod_launch.sh: pull from private repo (fork1), not public
Mar 26, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions experiments/A_wing/green/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/bin/bash
set -euo pipefail
# A-WING GREEN: INT5 GPTQ (clip_range=15 vs INT6 clip_range=31)
# Base: bwing_IV (9-prime fix + fixed mults + entropy shift)
# Theory: more quant noise → higher entropy → n-gram rescues harder (#809 uses INT5)
#
# Env overrides: SEED (default 1337), NPROC_PER_NODE (default 8).

SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd -- "${SCRIPT_DIR}/../../.." && pwd)"
cd "${REPO_ROOT}"
export PYTHONPATH="${REPO_ROOT}/flash-attention/hopper:${PYTHONPATH:-}"

SEED="${SEED:-1337}"
NPROC_PER_NODE="${NPROC_PER_NODE:-8}"

# Ensure the tee target directory exists: on a fresh checkout "logs/" is
# missing, tee fails, and under `set -o pipefail` that aborts the whole run.
LOG_DIR="${REPO_ROOT}/logs"
mkdir -p -- "${LOG_DIR}"
LOG_FILE="${LOG_DIR}/awing_green_s${SEED}_$(date +%Y%m%d_%H%M%S).log"

echo "============================================"
echo " A-WING GREEN — INT5 GPTQ + 9-Prime"
echo " Seed: ${SEED}"
echo " GPTQ INT5 (clip_range=15), 9 hash primes"
echo " Fixed mults + entropy shift, no cubric"
echo "============================================"

# All hyperparameters are passed as environment variables to train_gpt.py.
SEED="$SEED" \
F1_CORR_RANK=0 \
DISTILL_ENABLED=0 \
MLP_ACT=leaky_relu_sq \
MLP_LEAKY_SLOPE=0.5 \
XSA_LAST_N=4 \
BIGRAM_VOCAB_SIZE=1536 \
TTT_EVAL_ENABLED=0 \
ROPE_DIMS=24 \
VAL_LOSS_EVERY=20000 \
TRAIN_LOG_EVERY=1000 \
SWA_EVERY=100 \
COMPLEMENT_ALPHA=0.5 \
NGRAM_EVAL_ORDER=9 \
NGRAM_EVAL_MIN_ORDER=2 \
NGRAM_EVAL_ADAPTIVE=1 \
NGRAM_EVAL_ALPHA=0.30 \
NGRAM_EVAL_ALPHA_MIN=0.05 \
NGRAM_EVAL_ALPHA_MAX=0.60 \
NGRAM_EVAL_ENTROPY_CENTER=3.0 \
NGRAM_EVAL_ENTROPY_SCALE=2.0 \
NGRAM_EVAL_MIN_COUNT=2 \
NGRAM_EVAL_BUCKETS=8388608 \
NGRAM_EVAL_MAX_SECONDS=0 \
CUBRIC_CADENCE=0 \
NGRAM_ENTROPY_SHIFT=1 \
NGRAM_ORDER_MULTS="0.3,0.3,0.97,2.0,2.0,2.0,2.0,2.0" \
COMPILE_FULLGRAPH=0 \
torchrun --standalone --nproc_per_node="${NPROC_PER_NODE}" \
"${SCRIPT_DIR}/train_gpt.py" \
2>&1 | tee "${LOG_FILE}"

echo "============================================"
echo " DONE"
echo "============================================"
1,936 changes: 1,936 additions & 0 deletions experiments/A_wing/green/train_gpt.py

Large diffs are not rendered by default.

103 changes: 103 additions & 0 deletions experiments/A_wing/green/train_seed1337.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
============================================
A-WING GREEN — INT5 GPTQ + 9-Prime
Seed: 1337
GPTQ INT5 (clip_range=15), 9 hash primes
Fixed mults + entropy shift, no cubric
============================================
W0326 07:30:47.033000 2016 torch/distributed/run.py:803]
W0326 07:30:47.033000 2016 torch/distributed/run.py:803] *****************************************
W0326 07:30:47.033000 2016 torch/distributed/run.py:803] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
W0326 07:30:47.033000 2016 torch/distributed/run.py:803] *****************************************
logs/dff55565-90ac-4982-824c-0cb07ccacd65.txt
val_bpb:enabled tokenizer_kind=sentencepiece tokenizer_path=./data/tokenizers/fineweb_1024_bpe.model
train_loader:dataset:fineweb10B_sp1024 train_shards:80
val_loader:shards pattern=./data/datasets/fineweb10B_sp1024/fineweb_val_*.bin tokens:62021632
complementary_training:alpha=0.5
model_params:26928220
f1_corr:rank=0 params=0 est_int6_bytes~0
mlp_act:leaky_relu_sq mlp_leaky_slope:0.5
XSA:last_4 world_size:8 grad_accum_steps:1
num_heads:8 num_kv_heads:4 embed_lr:0.035 matrix_lr:0.025
train_batch_tokens:786432 train_seq_len:2048 iterations:20000 warmup_steps:20 max_wallclock_seconds:600.000
compile:enabled=1 fullgraph=0
seed:1337
ngram_eval:order=9 alpha=0.3 min_count=2 buckets=8388608
warmup_step:1/20
warmup_step:2/20
warmup_step:3/20
warmup_step:4/20
warmup_step:5/20
warmup_step:6/20
warmup_step:7/20
warmup_step:8/20
warmup_step:9/20
warmup_step:10/20
warmup_step:11/20
warmup_step:12/20
warmup_step:13/20
warmup_step:14/20
warmup_step:15/20
warmup_step:16/20
warmup_step:17/20
warmup_step:18/20
warmup_step:19/20
warmup_step:20/20
step:0/20000 val_loss:6.9317 val_bpb:4.1054 train_time:0ms step_avg:0.02ms
step:1/20000 train_loss:6.9343 train_time:143ms step_avg:143.41ms
step:2/20000 train_loss:8.6212 train_time:226ms step_avg:113.04ms
step:3/20000 train_loss:7.8209 train_time:312ms step_avg:104.12ms
step:4/20000 train_loss:7.1064 train_time:398ms step_avg:99.50ms
step:5/20000 train_loss:6.8529 train_time:485ms step_avg:96.90ms
step:6/20000 train_loss:6.7961 train_time:570ms step_avg:94.93ms
step:7/20000 train_loss:6.6784 train_time:656ms step_avg:93.68ms
step:8/20000 train_loss:6.5596 train_time:742ms step_avg:92.71ms
step:9/20000 train_loss:6.2552 train_time:827ms step_avg:91.94ms
step:10/20000 train_loss:5.9363 train_time:913ms step_avg:91.32ms
step:1000/20000 train_loss:2.2345 train_time:87847ms step_avg:87.85ms
step:2000/20000 train_loss:2.0285 train_time:175893ms step_avg:87.95ms
step:3000/20000 train_loss:2.1264 train_time:263985ms step_avg:87.99ms
step:4000/20000 train_loss:1.9367 train_time:352016ms step_avg:88.00ms
step:5000/20000 train_loss:2.0641 train_time:440120ms step_avg:88.02ms
late_qat:enabled step:5067 scale:0.4999
step:6000/20000 train_loss:1.9070 train_time:528137ms step_avg:88.02ms
swa:start step:6200
step:6814/20000 val_loss:1.9225 val_bpb:1.1386 train_time:600027ms step_avg:88.06ms
stopping_early: wallclock_cap train_time:600027ms step:6814/20000
peak memory allocated: 20677 MiB reserved: 20716 MiB
gptq:calibrating with training data...
gptq:calibrated 68 layers in 3.4s
ema:applying EMA weights
DIAGNOSTIC post_ema val_loss:1.9208 val_bpb:1.1376 eval_time:2240ms
Serialized model: 106047497 bytes
Code size: 106202 bytes
gptq_quantize: 66 GPTQ layers, 0 naive layers
gptq_quantize: 66 GPTQ layers, 0 naive layers
gptq_quantize: 66 GPTQ layers, 0 naive layers
gptq_quantize: 66 GPTQ layers, 0 naive layers
gptq_quantize: 66 GPTQ layers, 0 naive layers
gptq_quantize: 66 GPTQ layers, 0 naive layers
gptq_quantize: 66 GPTQ layers, 0 naive layers
gptq_quantize: 66 GPTQ layers, 0 naive layers
Serialized model int5+zlib: 13666914 bytes
Total submission size int5+zlib: 13773116 bytes
Total submission size int8+zlib: 13773116 bytes
final_int5_roundtrip val_loss:1.9689 val_bpb:1.1661 eval_time:37008ms
final_int5_roundtrip_exact val_loss:1.96888819 val_bpb:1.16608649
final_int5_sliding_window val_loss:1.9264 val_bpb:1.1410 stride:64 eval_time:96465ms
final_int5_sliding_window_exact val_loss:1.92644292 val_bpb:1.14095103
final_int8_zlib_roundtrip_exact val_loss:1.92644292 val_bpb:1.14095103
ngram_eval:chunks=60 chunk_tokens=1048576 windows=969088 shared_tables=True
ngram_eval:chunk [1/60] bpb=1.152801 t=15s
ngram_eval:chunk [2/60] bpb=1.232931 t=18s
ngram_eval:chunk [3/60] bpb=1.257240 t=21s
ngram_eval:chunk [11/60] bpb=1.168507 t=43s
ngram_eval:chunk [21/60] bpb=0.891224 t=69s
ngram_eval:chunk [31/60] bpb=0.705693 t=95s
ngram_eval:chunk [41/60] bpb=0.584660 t=119s
ngram_eval:chunk [51/60] bpb=0.505440 t=144s
ngram_eval:chunk [60/60] bpb=0.457581 t=176s
final_int5_sliding_window_ngram9 val_loss:0.7726 val_bpb:0.4576 eval_time:176713ms
final_int5_sliding_window_ngram9_exact val_loss:0.77264878 val_bpb:0.45760734
============================================
DONE
============================================
74 changes: 74 additions & 0 deletions experiments/A_wing/green_1/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#!/bin/bash
set -euo pipefail
# A-WING GREEN_1: Oracle Alpha + 9-Prime Hash Fix
# Instead of entropy-adaptive alpha, directly compare model_p vs ngram_p
# per token. Soft sigmoid on log-ratio (steepness=8), clip 0.95.
# Base: SOTA bwing_full_port (0.4512 BPB)
#
# Env overrides: SEED (default 1337), NPROC_PER_NODE (default 8).

SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd -- "${SCRIPT_DIR}/../../.." && pwd)"
cd "${REPO_ROOT}"
export PYTHONPATH="${REPO_ROOT}/flash-attention/hopper:${PYTHONPATH:-}"

SEED="${SEED:-1337}"
NPROC_PER_NODE="${NPROC_PER_NODE:-8}"

# --- Pre-flight checks ---
# zstandard is required for submission serialization; fail fast if missing.
echo "[preflight] checking zstandard..."
python3 -c "import zstandard; print(f' zstandard {zstandard.__version__} OK')" 2>/dev/null \
|| { echo " FATAL: zstandard not found. pip install zstandard"; exit 1; }

# Flash-attention is best-effort: warn (don't abort) if FA3 is not present.
echo "[preflight] checking flash_attn..."
python3 -c "
try:
import flash_attn_interface; print(' FA3 (hopper) OK')
except ImportError:
import flash_attn; v=flash_attn.__version__
if v.startswith('3'): print(f' FA3 v{v} OK')
else: print(f' WARNING: FA{v[0]} detected — want FA3')
" 2>/dev/null || echo " WARNING: no flash_attn found"

# Ensure the tee target directory exists: on a fresh checkout "logs/" is
# missing, tee fails, and under `set -o pipefail` that aborts the whole run.
LOG_DIR="${REPO_ROOT}/logs"
mkdir -p -- "${LOG_DIR}"
LOG_FILE="${LOG_DIR}/awing_green1_s${SEED}_$(date +%Y%m%d_%H%M%S).log"

echo "============================================"
echo " A-WING GREEN_1 — Oracle Alpha + 9-Prime"
echo " Seed: ${SEED}"
echo " Oracle: alpha = sigmoid(8 * log(ngram_p/model_p)) * 0.95"
echo " 9 hash primes, INT6, no cubric"
echo " Training cap: 570s (30s reserved for GPTQ)"
echo "============================================"

# All hyperparameters are passed as environment variables to train_gpt.py.
# MAX_WALLCLOCK_SECONDS=570 leaves 30s of the 600s budget for GPTQ.
SEED="$SEED" \
F1_CORR_RANK=0 \
DISTILL_ENABLED=0 \
MLP_ACT=leaky_relu_sq \
MLP_LEAKY_SLOPE=0.5 \
XSA_LAST_N=4 \
BIGRAM_VOCAB_SIZE=1536 \
TTT_EVAL_ENABLED=0 \
ROPE_DIMS=24 \
VAL_LOSS_EVERY=20000 \
TRAIN_LOG_EVERY=1000 \
SWA_EVERY=100 \
COMPLEMENT_ALPHA=0.5 \
NGRAM_EVAL_ORDER=9 \
NGRAM_EVAL_MIN_ORDER=2 \
NGRAM_EVAL_ADAPTIVE=1 \
NGRAM_EVAL_ALPHA=0.30 \
NGRAM_EVAL_ALPHA_MIN=0.05 \
NGRAM_EVAL_ALPHA_MAX=0.60 \
NGRAM_EVAL_ENTROPY_CENTER=3.0 \
NGRAM_EVAL_ENTROPY_SCALE=2.0 \
NGRAM_EVAL_MIN_COUNT=2 \
NGRAM_EVAL_BUCKETS=8388608 \
NGRAM_EVAL_MAX_SECONDS=0 \
CUBRIC_CADENCE=0 \
NGRAM_ENTROPY_SHIFT=1 \
NGRAM_ORDER_MULTS="0.3,0.3,0.97,2.0,2.0,2.0,2.0,2.0" \
MAX_WALLCLOCK_SECONDS=570 \
COMPILE_FULLGRAPH=0 \
torchrun --standalone --nproc_per_node="${NPROC_PER_NODE}" \
"${SCRIPT_DIR}/train_gpt.py" \
2>&1 | tee "${LOG_FILE}"

echo "============================================"
echo " DONE"
echo "============================================"
Loading