
Commit 014f555

Skip TE test on SM120+ as Float8BlockScaling is currently unsupported in thunder (#2475)
Co-authored-by: Riccardo Felluga <11768013+riccardofelluga@users.noreply.github.com>
1 parent: e00b1a2 · commit: 014f555

3 files changed: +48 −0 lines
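For orientation: SM120 and SM121 correspond to CUDA compute capability (12, 0) and (12, 1), which is exactly what the new helper in thunder/tests/utils.py checks. A minimal standalone sketch of that check (the function name here is illustrative; it mirrors is_sm120_orsm121() added in the diff below):

# Illustrative sketch only; mirrors is_sm120_orsm121() from thunder/tests/utils.py below.
import torch

def running_on_sm120_or_sm121() -> bool:
    # SM120/SM121 devices report CUDA compute capability (12, 0) / (12, 1).
    return torch.cuda.is_available() and torch.cuda.get_device_capability() in ((12, 0), (12, 1))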

thunder/tests/test_transformer_engine_executor.py
Lines changed: 15 additions & 0 deletions

@@ -5,6 +5,10 @@
 import thunder
 from thunder.tests.framework import requiresCUDA

+# NOTE: On SM120/121, TE defaults to using Float8BlockScaling,
+# which is currently unsupported in thunder, so we skip the tests for these SM architectures.
+from thunder.tests.utils import skip_on_sm120_and_sm121, is_sm120_orsm121
+
 pytest.importorskip("transformer_engine", reason="transformer_engine was not found, skipping the tests.")
 from thunder.executors.transformer_engineex import transformer_engine_ex
 from transformer_engine.common import recipe

@@ -32,6 +36,9 @@ def test_te_linear_forward_backward(fp8_recipe: recipe.Recipe):
     if fp8_recipe and not (fp8_recipe.delayed() or is_mxfp8_supported):
         pytest.skip(msg_mxfp8)

+    if is_sm120_orsm121() and fp8_recipe is None:
+        pytest.skip("On SM120/121, default recipe is Float8BlockScaling which is not supported")
+
     # Test Description:
     # Verify that `torch.nn.functional.linear` is replaced with `te_linear_*`
     # and the output as well as the gradients match for thunder compiled code.

@@ -89,6 +96,9 @@ def test_te_linear_forward_backward_multiple_iteration(fp8_recipe):
     if fp8_recipe and not (fp8_recipe.delayed() or is_mxfp8_supported):
         pytest.skip(msg_mxfp8)

+    if is_sm120_orsm121() and fp8_recipe is None:
+        pytest.skip("On SM120/121, default recipe is Float8BlockScaling which is not supported")
+
     # Test Description:
     # In this test, we verify whether a model using TransformerEngine Linear
     # and transformer_engine executor converge to same state.

@@ -161,6 +171,7 @@ def thunder_model(x):


 @requiresCUDA
+@skip_on_sm120_and_sm121
 def test_te_linear_invalid_inputs():
     def assert_not_transformed(x, w):
         def fn(x, w):

@@ -185,6 +196,7 @@ def fn(x, w):


 @requiresCUDA
+@skip_on_sm120_and_sm121
 def test_te_with_autocast():
     from thunder.transforms.autocast import autocast


@@ -215,6 +227,7 @@ def foo(x, w):
     reason="See https://github.com/Lightning-AI/lightning-thunder/issues/2221",
 )
 @requiresCUDA
+@skip_on_sm120_and_sm121
 def test_te_with_retain_graph():
     def foo(x, w):
         return thunder.torch.linear(x, w)

@@ -236,6 +249,7 @@ def foo(x, w):


 @requiresCUDA
+@skip_on_sm120_and_sm121
 def test_te_trace_metadata_propagation():
     # This test is to verify that we correctly propagate metadata `_include_te_fp8_autocast` on
     # trace using `from_trace`. `_include_te_fp8_autocast` is used to enable wrapping forward trace with `fp8_autocast`.

@@ -267,6 +281,7 @@ def transform_trace_post_optimization(self, computation_trace, **kwargs):
     assert any(bsym.sym.name.startswith("te_linear") for bsym in fwd_traces[-1].bound_symbols)


+@skip_on_sm120_and_sm121
 def test_te_grad_computation_with_intermediate():
     # Test for issue - https://github.com/Lightning-AI/lightning-thunder/issues/1966
     def fn(x, w):

thunder/tests/test_transformer_engine_v2_executor.py
Lines changed: 16 additions & 0 deletions

@@ -7,6 +7,10 @@
 import thunder
 from thunder.tests.framework import requiresCUDA

+# NOTE: On SM120/121, TE defaults to using Float8BlockScaling,
+# which is currently unsupported in thunder, so we skip the tests for these SM architectures.
+from thunder.tests.utils import skip_on_sm120_and_sm121, is_sm120_orsm121
+
 transformer_engine_module = pytest.importorskip(
     "transformer_engine", reason="transformer_engine was not found, skipping the tests."
 )

@@ -33,10 +37,14 @@

 @requiresCUDA
 @pytest.mark.parametrize("fp8_recipe", recipes, ids=recipe_ids)
+@skip_on_sm120_and_sm121
 def test_te_linear_forward_backward(fp8_recipe: recipe.Recipe):
     if fp8_recipe and not (fp8_recipe.delayed() or is_mxfp8_supported):
         pytest.skip(msg_mxfp8)

+    if is_sm120_orsm121() and fp8_recipe is None:
+        pytest.skip("On SM120/121, default recipe is Float8BlockScaling which is not supported")
+
     # Test Description:
     # Verify that `torch.nn.functional.linear` is replaced with `te_linear_*`
     # and the output as well as the gradients match for thunder compiled code.

@@ -96,6 +104,7 @@ def fn(x, w1, w2):

 @requiresCUDA
 @pytest.mark.parametrize("fp8_recipe", recipes, ids=recipe_ids)
+@skip_on_sm120_and_sm121
 def test_te_linear_forward_backward_multiple_iteration(fp8_recipe: recipe.Recipe):
     if not fp8_recipe:
         pytest.skip(

@@ -277,6 +286,7 @@ def fn(x, w):


 @requiresCUDA
+@skip_on_sm120_and_sm121
 def test_te_with_autocast():
     from thunder.transforms.autocast import autocast


@@ -303,6 +313,7 @@ def foo(x, w):
 # NOTE: strict=False as it passes on Blackwell.
 @pytest.mark.xfail(strict=False, raises=(RuntimeError, TypeError), reason="Retain graph is not supported by TE")
 @requiresCUDA
+@skip_on_sm120_and_sm121
 def test_te_with_retain_graph():
     def foo(x, w):
         return thunder.torch.linear(x, w)

@@ -325,6 +336,7 @@ def foo(x, w):


 @requiresCUDA
+@skip_on_sm120_and_sm121
 def test_te_trace_metadata_propagation():
     # This test is to verify that we correctly propagate metadata `_include_te_fp8_autocast` on
     # trace using `from_trace`. `_include_te_fp8_autocast` is used to enable wrapping forward trace with `fp8_autocast`.

@@ -357,6 +369,7 @@ def transform_trace_post_optimization(self, computation_trace, **kwargs):
     assert any(bsym.sym.name.startswith("te_functional_linear") for bsym in fwd_traces[-1].bound_symbols)


+@skip_on_sm120_and_sm121
 def test_te_grad_computation_with_intermediate():
     # Test for issue - https://github.com/Lightning-AI/lightning-thunder/issues/1966
     def fn(x, w):

@@ -381,6 +394,7 @@ def fn(x, w):

 @requiresCUDA
 @pytest.mark.parametrize("fp8_recipe", recipes, ids=recipe_ids)
+@skip_on_sm120_and_sm121
 def test_te_trace_correctness(fp8_recipe: recipe.Recipe):
     if fp8_recipe and not (fp8_recipe.delayed() or is_mxfp8_supported):
         pytest.skip(msg_mxfp8)

@@ -451,6 +465,7 @@ def foo(x, w):
 @requiresCUDA
 @pytest.mark.parametrize("fp8_recipe", recipes, ids=recipe_ids)
 @pytest.mark.parametrize("compile_path", ["jit", "ThunderFX"])
+@skip_on_sm120_and_sm121
 def test_te_activation_checkpointing_trace(fp8_recipe: recipe.Recipe, compile_path: str):
     if fp8_recipe and not (fp8_recipe.delayed() or is_mxfp8_supported):
         pytest.skip(msg_mxfp8)

@@ -505,6 +520,7 @@ def fn(x, w, w2):
 @pytest.mark.parametrize("fp8_recipe", recipes, ids=recipe_ids)
 @pytest.mark.parametrize("compile_path", ["jit", "ThunderFX"])
 @pytest.mark.filterwarnings("ignore::FutureWarning")  # Coming from TE v2.3
+@skip_on_sm120_and_sm121
 def test_te_activation_checkpointing_correctness(fp8_recipe: recipe.Recipe, compile_path: str):
     if not fp8_recipe:
         pytest.skip(

thunder/tests/utils.py
Lines changed: 17 additions & 0 deletions

@@ -1,4 +1,6 @@
 import torch
+import pytest
+import functools


 def is_output_differentiable(x):

@@ -36,3 +38,18 @@ def filter_differentiable_outputs(outputs):
         outputs = [outputs]

     return list(filter(is_output_differentiable, outputs))
+
+
+def is_sm120_orsm121():
+    return torch.cuda.get_device_capability() in ((12, 1), (12, 0))
+
+
+def skip_on_sm120_and_sm121(fn):
+    @functools.wraps(fn)
+    def wrapped_fn(*args, **kwargs):
+        if is_sm120_orsm121():
+            pytest.skip("Skipped on SM120/121")
+        else:
+            fn(*args, **kwargs)
+
+    return wrapped_fn
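As a usage sketch of the two new helpers (the tests below are hypothetical and not part of this commit; a CUDA device is assumed): the decorator skips a test unconditionally on SM120/121, while the predicate allows a conditional skip, e.g. only when the default (None) recipe would be used.

# Hypothetical usage example; assumes a CUDA device is available and that the helpers
# above are importable from thunder.tests.utils.
import pytest

from thunder.tests.utils import is_sm120_orsm121, skip_on_sm120_and_sm121


@skip_on_sm120_and_sm121  # unconditional skip on SM120/121
def test_unconditional_skip_example():
    assert True


@pytest.mark.parametrize("fp8_recipe", [None])
def test_conditional_skip_example(fp8_recipe):
    # Conditional skip, matching the inline checks added to the TE test files above:
    # only the default-recipe case is unsupported on SM120/121.
    if is_sm120_orsm121() and fp8_recipe is None:
        pytest.skip("On SM120/121, default recipe is Float8BlockScaling which is not supported")
    assert True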
