Commit 814843e

Enable bitsandbytes quantization on AMD GPUs that use warp size 32 (#27307)
Signed-off-by: sstamenk <strahinja.stamenkovic@amd.com>
1 parent: 20852c8

2 files changed (+10 -4 lines)

tests/models/quantization/test_bitsandbytes.py

Lines changed: 7 additions & 4 deletions
@@ -14,10 +14,13 @@
 from ...utils import compare_two_settings, multi_gpu_test
 from ..utils import check_embeddings_close, check_logprobs_close
 
-pytestmark = pytest.mark.skipif(
-    current_platform.is_rocm(),
-    reason="bitsandbytes quantization not supported on ROCm (CUDA-only kernels)",
-)
+if current_platform.is_rocm():
+    from vllm.platforms.rocm import on_gfx9
+
+    pytestmark = pytest.mark.skipif(
+        on_gfx9(),
+        reason="bitsandbytes not supported on gfx9 (warp size 64 limitation)",
+    )
 
 models_4bit_to_test = [
     ("facebook/opt-125m", "quantize opt model inflight"),

vllm/platforms/rocm.py

Lines changed: 3 additions & 0 deletions
@@ -185,6 +185,9 @@ class RocmPlatform(Platform):
         "petit_nvfp4",
         "torchao",
     ]
+    # bitsandbytes not supported on gfx9 (warp size 64 limitation)
+    if not on_gfx9():
+        supported_quantization += ["bitsandbytes"]
 
     @classmethod
     def get_vit_attn_backend(
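
With the platform gate in place, warp-size-32 AMD GPUs (e.g. RDNA cards) accept "bitsandbytes" as a quantization method, while gfx9 still rejects it. A minimal usage sketch, assuming a vLLM build containing this commit and a bitsandbytes install with ROCm support:

from vllm import LLM

# In-flight 4-bit quantization of the same model the test above uses.
# Older vLLM versions may also require load_format="bitsandbytes".
llm = LLM(model="facebook/opt-125m", quantization="bitsandbytes")
out = llm.generate("Hello, my name is")
print(out[0].outputs[0].text)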
