
Commit 4b3cde4

greg-kwasniewski1 authored and lucaslie committed
Fixed quantized sharding
Signed-off-by: greg-kwasniewski1 <213329731+greg-kwasniewski1@users.noreply.github.com>
1 parent 3f3c4a1 commit 4b3cde4

File tree
1 file changed: +17 −24 lines


tensorrt_llm/_torch/auto_deploy/utils/sharding_utils.py

Lines changed: 17 additions & 24 deletions
@@ -341,6 +341,7 @@ def _insert_sharded_mamba(
         add_dist=False,
         min_local_shape=min_local_shape,
         fused_weight_dims=entry_fused_dims,
+        quantization_cb=quantization_cb,
     )
 
     # Get all weight nodes in the subgraph except for out_proj
@@ -573,6 +574,20 @@ class WeightShardingInfo(ShardingTransformInfo):
     # used for TP sharding of fused weights
     fused_weight_dims: Optional[list] = None
 
+    def quantization_cb(
+        self,
+        gm: GraphModule,
+        submod: nn.Module,
+        node: Node,
+        weight_key: str,
+        weight_new_shape: torch.Size,
+        dim: int,
+        rank: int,
+        world_size: int,
+    ) -> None:
+        """Quantization callback. Default does nothing for non-quantized models."""
+        return None
+
     @classmethod
     def from_node(cls, node: Node, **kwargs) -> "WeightShardingInfo":
         """
@@ -612,6 +627,7 @@ def apply(self, gm: GraphModule, node: Node) -> None:
                 fused_weight_dims=self.fused_weight_dims
                 if isinstance(self.fused_weight_dims, dict)
                 else None,
+                quantization_cb=self.quantization_cb,
             )
         else:
             _shard_parameter_node(
@@ -623,6 +639,7 @@ def apply(self, gm: GraphModule, node: Node) -> None:
                 add_dist=self.dist_op is not None,
                 min_local_shape=self.min_local_shape,
                 fused_weight_dims=self.fused_weight_dims,
+                quantization_cb=self.quantization_cb,
             )
 
 
@@ -741,18 +758,6 @@ def shard_load_hook(
     ) -> None:
         return
 
-    def apply(self, gm: GraphModule, node: Node) -> None:
-        _shard_parameter_node(
-            gm=gm,
-            node=node,
-            dim=self.split_dim.value,
-            rank=self.rank,
-            world_size=self.world_size,
-            add_dist=self.dist_op is not None,
-            min_local_shape=self.min_local_shape,
-            quantization_cb=self.quantization_cb,  # quant callback
-        )
-
 
 def _shard_fp4_weight_scale(weight_scale, sharded_uint8_weight_shape, dim, rank, world_size):
     assert weight_scale.dim() == 1
@@ -809,18 +814,6 @@ def shard_load_hook(
             state_dict[key], weight_shape, dim, rank, world_size
         )
 
-    def apply(self, gm: GraphModule, node: Node) -> None:
-        _shard_parameter_node(
-            gm=gm,
-            node=node,
-            dim=self.split_dim.value,
-            rank=self.rank,
-            world_size=self.world_size,
-            add_dist=self.dist_op is not None,
-            min_local_shape=self.min_local_shape,
-            quantization_cb=self.quantization_cb,  # quant callback
-        )
-
 
 TP_SHARDING_RULES = [
     (lambda n: is_op(n, torch.ops.auto_deploy.torch_fake_quant_fp8_linear), FP8TPShardingInfo),
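
For context on why the two apply overrides could be deleted: the shared apply shown in the hunks above now forwards self.quantization_cb into _shard_parameter_node, so subclasses only need to customize the hook. A self-contained toy analogue of that pattern, with illustrative names that are not from the library:

from typing import Callable, Optional

def shard_worker(name: str, quantization_cb: Optional[Callable[[str], None]] = None) -> None:
    # Stand-in for _shard_parameter_node: do the shared sharding work, then call the hook.
    print(f"sharding {name}")
    if quantization_cb is not None:
        quantization_cb(name)

class BaseInfo:
    def quantization_cb(self, name: str) -> None:
        # Default hook: no-op, mirroring the new WeightShardingInfo.quantization_cb.
        return None

    def apply(self, name: str) -> None:
        # One shared apply(); only the hook varies per subclass.
        shard_worker(name, quantization_cb=self.quantization_cb)

class Fp8Info(BaseInfo):
    def quantization_cb(self, name: str) -> None:
        print(f"resharding FP8 scales for {name}")

BaseInfo().apply("linear.weight")  # sharding only
Fp8Info().apply("linear.weight")   # sharding, then scale resharding via the hook
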

0 commit comments