Bypass fused_ln instantiation on architectures below sm_70 (#76628)

A-nnonymous · web-flow · commit 85088198802f · 2025-11-27T10:55:16.000+08:00
diff --git a/paddle/phi/kernels/legacy/gpu/ln_bwd_semi_cuda_kernel.cu b/paddle/phi/kernels/legacy/gpu/ln_bwd_semi_cuda_kernel.cu
@@ -115,7 +115,7 @@ void launch_(LaunchParams<BwdParams> &launch_params,  // NOLINT
 // Create backward launch function and register. Macro signature:
 //  HIDDEN_SIZE, WTYPE, ITYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N,
 //  BYTES_PER_LDG, BYTES_PER_LDG_FINAL
-
+#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700)
 REGISTER_BWD_LAUNCHER(768, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4);
 REGISTER_BWD_LAUNCHER(768, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4);
 REGISTER_BWD_LAUNCHER(768, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4);
@@ -271,3 +271,4 @@ REGISTER_BWD_LAUNCHER(65536, fp16, fp16, fp16, fp32, 8, 1, 8, 16, 4);
 REGISTER_BWD_LAUNCHER(65536, fp16, fp32, fp16, fp32, 8, 1, 8, 16, 4);
 REGISTER_BWD_LAUNCHER(65536, bf16, bf16, bf16, fp32, 8, 1, 8, 16, 4);
 REGISTER_BWD_LAUNCHER(65536, bf16, fp32, bf16, fp32, 8, 1, 8, 16, 4);
+#endif  // (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700)
diff --git a/paddle/phi/kernels/legacy/gpu/ln_fwd_cuda_kernel.cu b/paddle/phi/kernels/legacy/gpu/ln_fwd_cuda_kernel.cu
@@ -100,7 +100,7 @@ void launch_(LaunchParams<FwdParams> &launch_params,  // NOLINT
 // Create forward launch function and register. Macro signature:
 //  HIDDEN_SIZE, WTYPE, ITYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N,
 //  BYTES_PER_LDG
-
+#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700)
 REGISTER_FWD_LAUNCHER(768, fp32, fp32, fp32, fp32, 1, 4, 1, 16);
 REGISTER_FWD_LAUNCHER(768, fp16, fp16, fp16, fp32, 1, 4, 1, 16);
 REGISTER_FWD_LAUNCHER(768, fp16, fp32, fp16, fp32, 1, 4, 1, 16);
@@ -256,3 +256,4 @@ REGISTER_FWD_LAUNCHER(65536, fp16, fp16, fp16, fp32, 8, 1, 4, 16);
 REGISTER_FWD_LAUNCHER(65536, fp16, fp32, fp16, fp32, 8, 1, 4, 16);
 REGISTER_FWD_LAUNCHER(65536, bf16, bf16, bf16, fp32, 8, 1, 4, 16);
 REGISTER_FWD_LAUNCHER(65536, bf16, fp32, bf16, fp32, 8, 1, 4, 16);
+#endif  // (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700)