@@ -1678,7 +1678,7 @@ def impl(
             cp_size = get_mesh_axis_size(config.cp_axis, mesh)
             assert q_heads % cp_size == 0, "q_heads must be divisible by cp_size"
             assert kv_heads % cp_size == 0, "kv_heads must be divisible by cp_size"
-
+
             # Load balanced causal attention is not yet supported for all-to-all strategy
             if config.context_parallel_load_balanced:
                 raise NotImplementedError(
@@ -1689,8 +1689,12 @@ def impl(
             q_ = helper.all_to_all(q, True, seq_dim=1, heads_dim=q_heads_dim)
             k_ = helper.all_to_all(k, True, seq_dim=1, heads_dim=k_heads_dim)
             # For KVPACKED layout, v is empty placeholder
-            v_ = v if v.shape[0] == 0 else helper.all_to_all(v, True, seq_dim=1, heads_dim=v_heads_dim)
-
+            v_ = (
+                v
+                if v.shape[0] == 0
+                else helper.all_to_all(v, True, seq_dim=1, heads_dim=v_heads_dim)
+            )
+
             output, softmax_aux, rng_state = FusedAttnFwdPrimitive.impl(
                 q_,
                 k_,
@@ -1707,7 +1711,7 @@ def impl(
                 _kv_segment_pos,
                 config=helper.get_step_config(),
             )
-
+
             # Apply all-to-all to transform from heads-sharded to seq-sharded (scatter in seq dimension)
             # output is always [b, s, h/cp, d] -> heads_dim=2
             output = helper.all_to_all(output, False, seq_dim=1, heads_dim=2)
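To make the resharding above concrete, the following single-process sketch mimics what `helper.all_to_all(..., before_attn=True, ...)` achieves across the context-parallel group: each device starts with one sequence chunk of all heads ([b, s/cp, h, d]) and ends with the full sequence for its own slice of the heads ([b, s, h/cp, d]). The toy shapes and the NumPy loop are illustrative assumptions only; the real exchange is a `lax.all_to_all` collective inside the helper, not this simulation.

```python
import numpy as np


def simulate_all_to_all_before_attn(shards, cp_size):
    """Single-process sketch of the seq-sharded -> heads-sharded exchange.

    shards[j] holds sequence chunk j with shape [b, s/cp, h, d]. Afterwards,
    "device" i holds the full sequence but only heads chunk i: [b, s, h/cp, d].
    """
    out = []
    for i in range(cp_size):
        # Device i receives heads-chunk i from every sequence chunk j and
        # concatenates the pieces along the sequence dimension.
        pieces = [np.split(shards[j], cp_size, axis=2)[i] for j in range(cp_size)]
        out.append(np.concatenate(pieces, axis=1))
    return out


# Toy sizes (made up for illustration): b=1, s=8, h=4, d=2, cp_size=2
b, s, h, d, cp = 1, 8, 4, 2, 2
full = np.arange(b * s * h * d).reshape(b, s, h, d)
shards = np.split(full, cp, axis=1)  # seq-sharded inputs, each [b, s/cp, h, d]
resharded = simulate_all_to_all_before_attn(shards, cp)
assert resharded[0].shape == (b, s, h // cp, d)  # full sequence, h/cp heads
```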
@@ -1742,23 +1746,24 @@ def partition(config, mesh, arg_infos, result_infos):
         dk_sharding = result_infos[1].sharding
         dv_sharding = result_infos[2].sharding
         dbias_sharding = result_infos[3].sharding
-
+
         # For AllToAll context parallel, output and doutput need to be seq-sharded
         # to match the forward output sharding (before they get transformed to heads-sharded)
         arg_shardings = list([arg_i.sharding for arg_i in arg_infos])
         # arg_infos: [q, k, v, bias, softmax_aux, rng_state, output, doutput, q_seqlen, ...]
         # output is at index 6, doutput is at index 7
         # They should have the same sharding as the forward output (seq-sharded on cp axis)
         output_seq_sharding = NamedSharding(mesh, PartitionSpec(None, config.cp_axis, None, None))
-        softmax_aux_seq_sharding = NamedSharding(mesh, PartitionSpec(None, None, config.cp_axis, None))
+        softmax_aux_seq_sharding = NamedSharding(
+            mesh, PartitionSpec(None, None, config.cp_axis, None)
+        )
         arg_shardings[4] = softmax_aux_seq_sharding  # softmax_aux [b, h, s/cp, 1]
         arg_shardings[6] = output_seq_sharding  # output [b, s/cp, h, d]
         arg_shardings[7] = output_seq_sharding  # doutput [b, s/cp, h, d]
         arg_shardings = tuple(arg_shardings)
-
+
         out_shardings = (dq_sharding, dk_sharding, dv_sharding, dbias_sharding)
-
-
+
         def impl(
             q,
             k,
@@ -1785,18 +1790,22 @@ def impl(
             cp_size = get_mesh_axis_size(config.cp_axis, mesh)
             assert q_heads % cp_size == 0, "q_heads must be divisible by cp_size"
             assert k_heads % cp_size == 0, "k_heads must be divisible by cp_size"
-
+
             # Load balanced causal attention is not yet supported for all-to-all strategy
             if config.context_parallel_load_balanced:
                 raise NotImplementedError(
                     "context_parallel_load_balanced is not supported with all-to-all strategy"
                 )
-
+
             # Apply all-to-all to transform from seq-sharded to heads-sharded (gather in seq dimension)
             q_ = helper.all_to_all(q, True, seq_dim=1, heads_dim=q_heads_dim)
             k_ = helper.all_to_all(k, True, seq_dim=1, heads_dim=k_heads_dim)
             # For KVPACKED layout, v is empty placeholder
-            v_ = v if v.shape[0] == 0 else helper.all_to_all(v, True, seq_dim=1, heads_dim=v_heads_dim)
+            v_ = (
+                v
+                if v.shape[0] == 0
+                else helper.all_to_all(v, True, seq_dim=1, heads_dim=v_heads_dim)
+            )
             # doutput is always separate [b, s, h, d], so heads_dim=2
             doutput_ = helper.all_to_all(doutput, True, seq_dim=1, heads_dim=2)
             # output has the same shape as doutput, needs the same transformation
@@ -1824,15 +1833,19 @@ def impl(
                 _kv_segment_pos,
                 config=helper.get_step_config(),
             )
-
+
             # Apply all-to-all to gradients to restore original sharding (scatter in seq dimension)
             # Gradients have the same shape as inputs, so use same heads_dim
             dq_heads_dim, dk_heads_dim, dv_heads_dim = helper.get_qkv_heads_dims(seq_dim=1)
-
+
             dq_ = helper.all_to_all(dq, False, seq_dim=1, heads_dim=dq_heads_dim)
             dk_ = helper.all_to_all(dk, False, seq_dim=1, heads_dim=dk_heads_dim)
             # For KVPACKED layout, dv is empty placeholder
-            dv_ = dv if dv.shape[0] == 0 else helper.all_to_all(dv, False, seq_dim=1, heads_dim=dv_heads_dim)
+            dv_ = (
+                dv
+                if dv.shape[0] == 0
+                else helper.all_to_all(dv, False, seq_dim=1, heads_dim=dv_heads_dim)
+            )
 
             return dq_, dk_, dv_, dbias
 
@@ -1870,19 +1883,19 @@ def check_supported(self):
     def get_qkv_heads_dims(self, seq_dim=1):
         """
         Determines the heads dimension indices for Q, K, V tensors based on QKV layout.
-
+
         The heads dimension position depends on the QKV packing format:
         - QKVPacked: All tensors packed together with dimension [qkv=3, heads, dim]
         - KVPacked: Q is separate, K and V are packed with dimension [kv=2, heads, dim]
         - Separate: All tensors are separate with dimension [heads, dim]
-
+
         Args:
             seq_dim: The sequence dimension position (default 1 for BSHD format)
-
+
         Returns:
             Tuple of (q_heads_dim, k_heads_dim, v_heads_dim) indicating the position
             of the heads dimension for each tensor.
-
+
         Examples for BSHD layout (seq_dim=1):
             QKVPacked: Q=[b, s, 3, h, d] -> returns (3, 3, 3)
             KVPacked: Q=[b, s, h, d], K=[b, s, 2, h, d], V=[b, s, 2, h, d] -> returns (2, 3, 3)
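A hypothetical reimplementation of the mapping documented above could look like the sketch below. The string-based layout dispatch and the function name are assumptions made for illustration; the real helper presumably reads the layout from its config rather than taking it as an argument.

```python
def get_qkv_heads_dims_sketch(qkv_layout: str, seq_dim: int = 1):
    """Hypothetical mapping from QKV layout to heads-dimension indices (BSHD)."""
    if qkv_layout == "QKVPacked":
        # Q/K/V packed as [b, s, 3, h, d]: heads sit two axes after seq
        return (seq_dim + 2, seq_dim + 2, seq_dim + 2)
    if qkv_layout == "KVPacked":
        # Q is [b, s, h, d]; K and V are packed as [b, s, 2, h, d]
        return (seq_dim + 1, seq_dim + 2, seq_dim + 2)
    # Separate: Q, K, V are each [b, s, h, d]
    return (seq_dim + 1, seq_dim + 1, seq_dim + 1)


assert get_qkv_heads_dims_sketch("QKVPacked") == (3, 3, 3)
assert get_qkv_heads_dims_sketch("KVPacked") == (2, 3, 3)
assert get_qkv_heads_dims_sketch("Separate") == (2, 2, 2)
```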
@@ -1943,21 +1956,26 @@ def all_to_all(self, x, before_attn=True, seq_dim=1, heads_dim=2):
         # Reshape: insert cp_size at split_axis
         assert shape[split_axis] % cp_size == 0
         x = x.reshape(
-            *shape[:split_axis], cp_size, shape[split_axis] // cp_size, *shape[split_axis + 1 :]
+            *shape[:split_axis], cp_size, shape[split_axis] // cp_size, *shape[split_axis + 1 :]
         )
 
         # Adjust concat_axis if needed (only for before_attn case)
         adjusted_concat_axis = concat_axis + 1 if needs_adjustment else concat_axis
 
         # Perform all-to-all
         x = lax_paral_op(
-            x, lax.all_to_all, self.config.cp_axis, mesh=self.mesh,
-            split_axis=split_axis, concat_axis=adjusted_concat_axis, tiled=True
+            x,
+            lax.all_to_all,
+            self.config.cp_axis,
+            mesh=self.mesh,
+            split_axis=split_axis,
+            concat_axis=adjusted_concat_axis,
+            tiled=True,
         )
 
         # Merge the two dimensions created by all-to-all at split_axis
         new_shape = list(x.shape)
-        new_shape[split_axis : split_axis + 2] = [x.shape[split_axis] * x.shape[split_axis + 1]]
+        new_shape[split_axis : split_axis + 2] = [x.shape[split_axis] * x.shape[split_axis + 1]]
         return x.reshape(new_shape)
 
     def get_step_config(self) -> _FusedAttnConfig:
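The net effect of the reshape / all-to-all / merge sequence on the per-device shape is that the `split_axis` dimension shrinks by `cp_size` while the `concat_axis` dimension grows by it. The trace below is a communication-free sketch of that bookkeeping; it writes the index adjustment as the general `concat_axis > split_axis` condition, whereas the method above derives it from `before_attn`, and all the concrete sizes are made up for illustration.

```python
def a2a_shape_trace(shape, cp_size, split_axis, concat_axis):
    """Sketch of the per-device shape through reshape -> tiled all-to-all -> merge."""
    assert shape[split_axis] % cp_size == 0
    # 1) Insert cp_size at split_axis: [..., n, ...] -> [..., cp, n/cp, ...]
    staged = shape[:split_axis] + (cp_size, shape[split_axis] // cp_size) + shape[split_axis + 1 :]
    # 2) Tiled all-to-all over the cp axis: the cp_size-sized dim is scattered down to 1
    #    and concat_axis is gathered cp_size-fold (its index shifts by 1 if it follows split_axis).
    adj = concat_axis + 1 if concat_axis > split_axis else concat_axis
    after = list(staged)
    after[split_axis] = 1
    after[adj] *= cp_size
    # 3) Merge the two dims created at split_axis back into one
    return tuple(after[:split_axis] + [after[split_axis] * after[split_axis + 1]] + after[split_axis + 2 :])


# Before attention: per-device [b, s/cp, h, d], split over heads, gather over seq
assert a2a_shape_trace((2, 512, 16, 64), cp_size=4, split_axis=2, concat_axis=1) == (2, 2048, 4, 64)
# After attention: per-device [b, s, h/cp, d], split over seq, gather over heads
assert a2a_shape_trace((2, 2048, 4, 64), cp_size=4, split_axis=1, concat_axis=2) == (2, 512, 16, 64)
```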