
Commit 65c4e40

entrpn and coolkp authored
Cross self attention switch (#251)
* skip flash block sizes setting for cross attention.
* change sharding based on cross/self attention.
* update sharding rules for attn.
* lint.
* ring attention rules are added at front if not present to shard sequence on fsdp axis
* test fix
* Add dense padded attention kernel and use unsafe rng key for generation
* Update
* Ignore history
* remove file
* Flag for using segment ids and masking padding tokens in attention
* Tokamax splash attn
* Flag for using same sequence sharding for self and cross
* update requirements.txt
* Delete splash_attn_benchmark.py
* Delete padded_flash_attn.py
* Merge main
* Ruff format
* Address comments
* Fix pprint error, add description of attention configuration params

---------

Signed-off-by: Kunjan Patel <kunjanp@google.com>
Co-authored-by: Kunjan Patel <kunjan@ucla.edu>
Co-authored-by: Kunjan Patel <kunjanp@google.com>
1 parent d843dc0 commit 65c4e40

25 files changed (+497, -212 lines)

.github/workflows/UnitTests.yml

Lines changed: 1 addition & 1 deletion
@@ -58,7 +58,7 @@ jobs:
       pip show jax jaxlib flax transformers datasets tensorflow tensorflow_datasets
   - name: PyTest
     run: | #--deselect=src/maxdiffusion/tests/input_pipeline_interface_test.py
-      HF_HUB_CACHE=/mnt/disks/github-runner-disk/ HF_HOME=/mnt/disks/github-runner-disk/ TOKENIZERS_PARALLELISM=false python3 -m pytest --deselect=src/maxdiffusion/tests/ltx_transformer_step_test.py -x
+      HF_HUB_CACHE=/mnt/disks/github-runner-disk/ HF_HOME=/mnt/disks/github-runner-disk/ TOKENIZERS_PARALLELISM=false LIBTPU_INIT_ARGS="--xla_tpu_scoped_vmem_limit_kib=65472" python3 -m pytest --deselect=src/maxdiffusion/tests/ltx_transformer_step_test.py -x
 # add_pull_ready:
 #   if: github.ref != 'refs/heads/main'
 #   permissions:
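The added LIBTPU_INIT_ARGS flag raises the TPU scoped vmem limit, presumably so the splash-attention kernels exercised by the tests have more vmem to work with. A minimal sketch of reproducing this outside CI, assuming a TPU host; the value simply mirrors the workflow change above, and libtpu only reads the variable before JAX initializes the TPU backend:

# Hedged sketch: set the flag before importing jax so libtpu picks it up.
import os

os.environ["LIBTPU_INIT_ARGS"] = "--xla_tpu_scoped_vmem_limit_kib=65472"

import jax  # imported after the env var so the TPU backend sees the flag

print(jax.devices())  # should list TPU devices with the raised vmem limit in effect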

.gitignore

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,6 @@
 __pycache__/
 *.py[cod]
 *$py.class
-
 # C extensions
 *.so
 
@@ -98,6 +97,7 @@ celerybeat-schedule
 
 # Environments
 .env
+.history
 .venv
 env/
 venv/

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -13,6 +13,7 @@ ftfy
 tensorboard>=2.17.0
 tensorboardx>=2.6.2.2
 tensorboard-plugin-profile>=2.15.2
+tokamax
 Jinja2
 scikit-image
 parameterized

src/maxdiffusion/common_types.py

Lines changed: 33 additions & 1 deletion
@@ -33,7 +33,11 @@
 BlockSizes = splash_attention_kernel.BlockSizes
 
 AxisNames = tuple[str, ...]
-
+# Physical axis names for device meshes.
+DATA = "data"
+FSDP = "fsdp"
+TENSOR = "tensor"
+# Logical axis names for model parameters and activations.
 BATCH = "activation_batch"
 LENGTH = "activation_length"
 KV_LENGTH = "activation_kv_length"

@@ -44,4 +48,32 @@
 KEEP_2 = "activation_keep_2"
 CONV_OUT = "activation_conv_out_channels"
 
+# For setting self/cross attention independently in splash kernel
+SELF_ATTN_HEAD = "activation_self_attn_heads"
+SELF_ATTN_Q_LENGTH = "activation_self_attn_q_length"
+SELF_ATTN_KV_LENGTH = "activation_self_attn_kv_length"
+CROSS_ATTN_HEAD = "activation_cross_attn_heads"
+CROSS_ATTN_Q_LENGTH = "activation_cross_attn_q_length"
+CROSS_ATTN_KV_LENGTH = "activation_cross_attn_kv_length"
+
+
 WAN_MODEL = "Wan2.1"
+
+### Common axis rules for ring attention ###
+RING_ATTENTION_AXIS_RULES = [
+    [SELF_ATTN_HEAD, None],
+    [SELF_ATTN_Q_LENGTH, FSDP],
+    [SELF_ATTN_KV_LENGTH, FSDP],
+    [CROSS_ATTN_HEAD, None],
+    [CROSS_ATTN_Q_LENGTH, FSDP],
+    [CROSS_ATTN_KV_LENGTH, FSDP],
+]
+
+SEQUENCE_PARALLEL_AXIS_RULES = [
+    [SELF_ATTN_HEAD, None],
+    [SELF_ATTN_Q_LENGTH, FSDP],
+    [SELF_ATTN_KV_LENGTH, None],
+    [CROSS_ATTN_HEAD, None],
+    [CROSS_ATTN_Q_LENGTH, FSDP],
+    [CROSS_ATTN_KV_LENGTH, None],
+]
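These rule lists pair the new logical axis names with a mesh axis (FSDP) or None: the ring-attention rules shard both the query and key/value sequence lengths over fsdp, while the sequence-parallel rules shard only the query lengths. A minimal sketch, assuming Flax's logical partitioning helpers (how maxdiffusion actually consumes these lists is not shown in this diff), of how such rules turn a logical layout into a PartitionSpec:

# Minimal sketch, assuming Flax's logical partitioning helpers; not
# maxdiffusion's actual wiring.
import flax.linen as nn

FSDP = "fsdp"
SELF_ATTN_HEAD = "activation_self_attn_heads"
SELF_ATTN_Q_LENGTH = "activation_self_attn_q_length"

# Each rule maps a logical axis name to a mesh axis (or None to replicate).
rules = [
    (SELF_ATTN_HEAD, None),
    (SELF_ATTN_Q_LENGTH, FSDP),
]

with nn.logical_axis_rules(rules):
  # A (heads, q_length) logical layout: heads stay unsharded, the query
  # sequence is sharded over the fsdp mesh axis.
  spec = nn.logical_to_mesh_axes((SELF_ATTN_HEAD, SELF_ATTN_Q_LENGTH))

print(spec)  # PartitionSpec(None, 'fsdp')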

src/maxdiffusion/configs/base14.yml

Lines changed: 9 additions & 0 deletions
@@ -50,6 +50,15 @@ jit_initializers: True
 from_pt: False
 split_head_dim: True
 attention: 'dot_product' # Supported attention: dot_product, flash
+# If mask_padding_tokens is True, we pass segment ids to splash attention to avoid attending to padding tokens.
+# Otherwise we do not pass segment ids; on VPU-bound hardware such as Trillium this is faster.
+# However, when padding tokens are significant, skipping the mask hurts quality, so this should be set to True.
+mask_padding_tokens: True
+# Maxdiffusion has 2 attention sharding strategies:
+# 1. attention_sharding_uniform = True : the same sequence sharding rules are applied to q in both self and cross attention.
+# 2. attention_sharding_uniform = False : heads are sharded uniformly across devices for self attention, while the sequence
+#    is sharded for the cross attention q.
+attention_sharding_uniform: True
 flash_block_sizes: {}
 # GroupNorm groups
 norm_num_groups: 32
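A hedged sketch of what mask_padding_tokens implies for the attention call: segment ids derived from a padding mask are only handed to the splash kernel when the flag is on. The helper name and the padding-mask convention (1 = real token, 0 = padding) are assumptions, not maxdiffusion's actual code.

# Hedged sketch, not maxdiffusion's actual attention wrapper.
import jax.numpy as jnp


def segment_ids_for_splash(mask_padding_tokens: bool, q_valid: jnp.ndarray, kv_valid: jnp.ndarray):
  """q_valid/kv_valid: (batch, seq) masks with 1 for real tokens, 0 for padding."""
  if not mask_padding_tokens:
    # Skip segment ids entirely: faster on VPU-bound hardware such as Trillium,
    # but attention is also spent on padding tokens.
    return None
  # Giving padding a different segment id (0) than real tokens (1) makes the
  # splash kernel mask out query/key pairs that cross the padding boundary.
  return q_valid.astype(jnp.int32), kv_valid.astype(jnp.int32)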

src/maxdiffusion/configs/base21.yml

Lines changed: 10 additions & 0 deletions
@@ -49,6 +49,16 @@ jit_initializers: True
 from_pt: False
 split_head_dim: True
 attention: 'dot_product' # Supported attention: dot_product, flash
+# If mask_padding_tokens is True, we pass segment ids to splash attention to avoid attending to padding tokens.
+# Otherwise we do not pass segment ids; on VPU-bound hardware such as Trillium this is faster.
+# However, when padding tokens are significant, skipping the mask hurts quality, so this should be set to True.
+mask_padding_tokens: True
+# Maxdiffusion has 2 attention sharding strategies:
+# 1. attention_sharding_uniform = True : the same sequence sharding rules are applied to q in both self and cross attention.
+# 2. attention_sharding_uniform = False : heads are sharded uniformly across devices for self attention, while the sequence
+#    is sharded for the cross attention q.
+attention_sharding_uniform: True
+
 flash_block_sizes: {}
 # GroupNorm groups
 norm_num_groups: 32
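Purely illustrative PartitionSpecs for the two strategies described in the comments above; the (batch, heads, seq, head_dim) layout and the "data"/"fsdp" mesh axis names are assumptions, not the specs maxdiffusion actually emits.

# Illustrative only: how the two attention_sharding_uniform settings differ
# for the query tensor, assuming a (batch, heads, seq, head_dim) layout.
from jax.sharding import PartitionSpec as P


def q_spec(attention_sharding_uniform: bool, is_self_attention: bool) -> P:
  if attention_sharding_uniform or not is_self_attention:
    # Uniform strategy, and cross attention under either strategy:
    # shard the query sequence dimension.
    return P("data", None, "fsdp", None)
  # Non-uniform strategy, self attention: shard heads instead of the sequence.
  return P("data", "fsdp", None, None)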

src/maxdiffusion/configs/base_2_base.yml

Lines changed: 10 additions & 0 deletions
@@ -50,6 +50,16 @@ jit_initializers: True
 from_pt: True
 split_head_dim: True
 attention: 'flash' # Supported attention: dot_product, flash
+# If mask_padding_tokens is True, we pass segment ids to splash attention to avoid attending to padding tokens.
+# Otherwise we do not pass segment ids; on VPU-bound hardware such as Trillium this is faster.
+# However, when padding tokens are significant, skipping the mask hurts quality, so this should be set to True.
+mask_padding_tokens: True
+# Maxdiffusion has 2 attention sharding strategies:
+# 1. attention_sharding_uniform = True : the same sequence sharding rules are applied to q in both self and cross attention.
+# 2. attention_sharding_uniform = False : heads are sharded uniformly across devices for self attention, while the sequence
+#    is sharded for the cross attention q.
+attention_sharding_uniform: True
+
 flash_block_sizes: {}
 # to override default block sizes for flash attention
 # flash_block_sizes:

src/maxdiffusion/configs/base_flux_dev.yml

Lines changed: 9 additions & 0 deletions
@@ -63,6 +63,15 @@ jit_initializers: True
 from_pt: True
 split_head_dim: True
 attention: 'flash' # Supported attention: dot_product, flash, cudnn_flash_te
+# If mask_padding_tokens is True, we pass segment ids to splash attention to avoid attending to padding tokens.
+# Otherwise we do not pass segment ids; on VPU-bound hardware such as Trillium this is faster.
+# However, when padding tokens are significant, skipping the mask hurts quality, so this should be set to True.
+mask_padding_tokens: True
+# Maxdiffusion has 2 attention sharding strategies:
+# 1. attention_sharding_uniform = True : the same sequence sharding rules are applied to q in both self and cross attention.
+# 2. attention_sharding_uniform = False : heads are sharded uniformly across devices for self attention, while the sequence
+#    is sharded for the cross attention q.
+attention_sharding_uniform: True
 
 flash_block_sizes: {}
 # Use the following flash_block_sizes on v6e (Trillium) due to larger vmem.

src/maxdiffusion/configs/base_flux_dev_multi_res.yml

Lines changed: 9 additions & 0 deletions
@@ -63,6 +63,15 @@ jit_initializers: True
 from_pt: True
 split_head_dim: True
 attention: 'flash' # Supported attention: dot_product, flash, cudnn_flash_te
+# If mask_padding_tokens is True, we pass segment ids to splash attention to avoid attending to padding tokens.
+# Otherwise we do not pass segment ids; on VPU-bound hardware such as Trillium this is faster.
+# However, when padding tokens are significant, skipping the mask hurts quality, so this should be set to True.
+mask_padding_tokens: True
+# Maxdiffusion has 2 attention sharding strategies:
+# 1. attention_sharding_uniform = True : the same sequence sharding rules are applied to q in both self and cross attention.
+# 2. attention_sharding_uniform = False : heads are sharded uniformly across devices for self attention, while the sequence
+#    is sharded for the cross attention q.
+attention_sharding_uniform: True
 
 #flash_block_sizes: {}
 # Use the following flash_block_sizes on v6e (Trillium) due to larger vmem.

src/maxdiffusion/configs/base_flux_schnell.yml

Lines changed: 9 additions & 0 deletions
@@ -62,6 +62,15 @@ jit_initializers: True
 from_pt: True
 split_head_dim: True
 attention: 'flash' # Supported attention: dot_product, flash, cudnn_flash_te
+# If mask_padding_tokens is True, we pass segment ids to splash attention to avoid attending to padding tokens.
+# Otherwise we do not pass segment ids; on VPU-bound hardware such as Trillium this is faster.
+# However, when padding tokens are significant, skipping the mask hurts quality, so this should be set to True.
+mask_padding_tokens: True
+# Maxdiffusion has 2 attention sharding strategies:
+# 1. attention_sharding_uniform = True : the same sequence sharding rules are applied to q in both self and cross attention.
+# 2. attention_sharding_uniform = False : heads are sharded uniformly across devices for self attention, while the sequence
+#    is sharded for the cross attention q.
+attention_sharding_uniform: True
 flash_block_sizes: {
   "block_q" : 256,
   "block_kv_compute" : 256,
