Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions backends/vulkan/runtime/graph/ComputeGraph.h
Original file line number Diff line number Diff line change
Expand Up @@ -661,6 +661,11 @@ class ComputeGraph final {
inline bool device_is_adreno() {
  // True when the active Vulkan adapter reports a Qualcomm Adreno GPU.
  const auto dev_type = context_->adapter_ptr()->device_type();
  return dev_type == vkapi::DeviceType::ADRENO;
}

inline bool device_is_mali() {
  // True when the active Vulkan adapter reports an ARM Mali GPU.
  const auto dev_type = context_->adapter_ptr()->device_type();
  return dev_type == vkapi::DeviceType::MALI;
}

const std::string& device_name() {
  // Name string reported by the physical device; the reference is owned by
  // the adapter, so it stays valid for the adapter's lifetime.
  const auto adapter = context()->adapter_ptr();
  return adapter->device_name();
}
Expand Down
17 changes: 12 additions & 5 deletions backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d_pw.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -55,13 +55,14 @@ layout(push_constant) uniform restrict Block {
int input_zp;
float output_inv_scale;
int output_zp;
int K4_per_group;
int OC4_per_group;
};

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

${layout_declare_spec_const(C, "int", "apply_bias", "1")}
${layout_declare_spec_const(C, "int", "activation_type", "0")}
${layout_declare_spec_const(C, "int", "conv2d_params_K4_per_group", "1")}

// Layout specialization constants
${layout_declare_spec_const(C, "int", "outp_layout", "CONTIG_LAYOUT_INT")}
Expand Down Expand Up @@ -124,12 +125,18 @@ void main() {
}
}

// Compute initial input tile index
// Input has same spatial layout, channel dimension iterates from 0
int input_idx = oh * inp_h_stride + ow_block_idx * inp_w_stride;
// Compute group index from output channel block
const int group_idx = oc_block_idx / OC4_per_group;

// Compute initial input tile index with group offset
// For grouped im2col, each group's K range starts at group_idx * K4_per_group
// For non-grouped (groups=1), group_idx is always 0 so offset is 0
int input_idx = oh * inp_h_stride
+ ow_block_idx * inp_w_stride
+ group_idx * K4_per_group;

// Main accumulation loop over K dimension
for (int k4 = 0; k4 < conv2d_params_K4_per_group; k4++) {
for (int k4 = 0; k4 < K4_per_group; k4++) {
// Load packed int8 input tile (TILE_M4=1, TILE_K4=1)
// Each int contains 4 packed int8s (one per width position in the tile)
ivec4 int8_input_tile = t_packed_int8_input[input_idx];
Expand Down
21 changes: 15 additions & 6 deletions backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -430,12 +430,21 @@ void q8ta_conv2d(ComputeGraph& graph, const std::vector<ValueRef>& args) {
const int64_t W_out = graph.size_at<int64_t>(-1, output);
const int64_t spatial_out = H_out * W_out;

// Use im2col when the channel depth is sufficient for tiled GEMM to win, or
// when the output spatial area is small enough that the im2col buffer stays
// manageable. For large spatial outputs with few channels, the im2col buffer
// becomes too large and the general shader is more efficient.
const bool use_im2col = groups == 1 && in_channels_per_group % 4 == 0 &&
(in_channels_per_group >= 64 || spatial_out <= 4096);
// Im2col requires input channels per group to be a multiple of 4
const bool im2col_eligible = in_channels_per_group % 4 == 0;

bool use_im2col = false;
if (graph.device_is_mali()) {
// On Mali, im2col is faster than the general shader across the board.
use_im2col = im2col_eligible;
} else {
// Default: on Adreno and unknown GPU architectures, im2col is only
// beneficial for ungrouped convolutions with sufficient channel depth or
// small spatial output. For grouped convolutions, the general shader is
// more efficient (0.7-0.95x regression measured on Adreno).
use_im2col = im2col_eligible && groups == 1 &&
(in_channels_per_group >= 32 || spatial_out <= 4096);
}

if (use_im2col) {
q8ta_conv2d_im2col(graph, args);
Expand Down
22 changes: 21 additions & 1 deletion backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.h
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,27 @@ void add_q8ta_conv2d_pw_node(
const ValueRef bias_data,
const ValueRef packed_bias,
const uint32_t activation_type,
const ValueRef packed_int8_output);
const ValueRef packed_int8_output,
const int32_t groups = 1);

std::vector<int64_t> calculate_q8ta_im2col_sizes(
ComputeGraph* graph,
const ValueRef& input,
const ValueRef& output,
const ValueRef& kernel_size,
const ValueRef& groups);

void add_q8ta_im2col_node(
ComputeGraph& graph,
const ValueRef packed_int8_input,
const ValueRef kernel_size,
const ValueRef stride,
const ValueRef padding,
const ValueRef dilation,
const ValueRef groups,
const ValueRef packed_int8_output,
const ValueRef packed_int8_im2col,
const int32_t zp);

void q8ta_conv2d_im2col(ComputeGraph& graph, const std::vector<ValueRef>& args);

Expand Down
10 changes: 6 additions & 4 deletions backends/vulkan/runtime/graph/ops/impl/Q8taConv2dIm2Col.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -127,9 +127,8 @@ void add_q8ta_im2col_node(
dilation,
groups);

// At the moment, the im2col path only supports non-grouped convolutions
VK_CHECK_COND(conv_params.groups == 1);
// The implementation also requires that input channels is a multiple of 4
// The implementation requires that input channels per group is a multiple of
// 4
VK_CHECK_COND(conv_params.in_channels_per_group % 4 == 0);

std::string kernel_name = "q8ta_im2col";
Expand Down Expand Up @@ -257,6 +256,8 @@ void q8ta_conv2d_im2col(
zp);

// Step 2: Perform pointwise convolution on the im2col result
const int32_t groups_val = graph.extract_scalar<int32_t>(groups);

add_q8ta_conv2d_pw_node(
graph,
packed_int8_im2col,
Expand All @@ -270,7 +271,8 @@ void q8ta_conv2d_im2col(
bias_data,
packed_bias,
activation_type_val,
packed_int8_output);
packed_int8_output,
groups_val);
}

REGISTER_OPERATORS {
Expand Down
38 changes: 15 additions & 23 deletions backends/vulkan/runtime/graph/ops/impl/Q8taConv2dPW.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -200,39 +200,40 @@ void add_q8ta_conv2d_pw_node(
const ValueRef bias_data,
const ValueRef packed_bias,
const uint32_t activation_type,
const ValueRef packed_int8_output) {
// Validate packed dim info for input and output tensors
// To maximize performance, the input tensor must be in 4W4C layout
const ValueRef packed_int8_output,
const int32_t groups) {
VK_CHECK_COND(q8ta_conv2d_check_4w4c_packed_dim_info(
graph.packed_dim_info_of(packed_int8_input)));
// However, the requirements for output tensor layout is flexible
VK_CHECK_COND(q8ta_conv2d_check_packed_dim_info(
graph.packed_dim_info_of(packed_int8_output)));

// Validate dtype is kInt8x4
VK_CHECK_COND(graph.dtype_of(packed_int8_input) == vkapi::kInt8x4);
VK_CHECK_COND(graph.dtype_of(packed_int8_output) == vkapi::kInt8x4);

// Compute K4_per_group and OC4_per_group from tensor dimensions and groups
// Input K dim (dim -3) = K_per_group * groups for grouped im2col, or IC for
// non-grouped. Either way, K4_per_group = div_up_4(K_dim / groups).
const int32_t K_dim = graph.size_at<int32_t>(-3, packed_int8_input);
const int32_t OC = graph.size_at<int32_t>(-3, packed_int8_output);
const int32_t K4_per_group =
static_cast<int32_t>(utils::div_up_4(K_dim / groups));
const int32_t OC4_per_group =
static_cast<int32_t>(utils::div_up_4(OC / groups));

float input_scale_val = graph.extract_scalar<float>(input_scale);
int32_t input_zp_val = graph.extract_scalar<int32_t>(input_zp);

float output_inv_scale_val = 1.0f / graph.extract_scalar<float>(output_scale);
int32_t output_zp_val = graph.extract_scalar<int32_t>(output_zp);

uint32_t apply_bias = 1;
if (graph.val_is_none(bias_data)) {
apply_bias = 0;
}

// Get input channel count for K4_per_group
const uint32_t IC = graph.size_at<uint32_t>(-3, packed_int8_input);
const uint32_t K4_per_group = utils::div_up_4(IC);

uint32_t apply_bias = graph.val_is_none(bias_data) ? 0u : 1u;
std::vector<PushConstantDataInfo> push_constants = {
PushConstantDataInfo(&input_scale_val, sizeof(input_scale_val)),
PushConstantDataInfo(&input_zp_val, sizeof(input_zp_val)),
PushConstantDataInfo(&output_inv_scale_val, sizeof(output_inv_scale_val)),
PushConstantDataInfo(&output_zp_val, sizeof(output_zp_val)),
PushConstantDataInfo(&K4_per_group, sizeof(K4_per_group)),
PushConstantDataInfo(&OC4_per_group, sizeof(OC4_per_group)),
};

const bool use_hw_dot =
Expand All @@ -241,17 +242,13 @@ void add_q8ta_conv2d_pw_node(
use_hw_dot ? "q8ta_conv2d_pw" : "q8ta_conv2d_pw_fallback";
add_dtype_suffix(kernel_name, graph.dtype_of(packed_weight_scales));

// Pass metadata for both output and input tensors
vkapi::ParamsBindList param_buffers = {
graph.buffer_meta_ubo(packed_int8_output),
graph.buffer_meta_ubo(packed_int8_input)};

// Build spec constants: apply_bias, activation_type + layout constants
vkapi::SpecVarList spec_constants = {
apply_bias,
activation_type,
K4_per_group,
// Layout specialization constants
graph.hashed_layout_of(packed_int8_output),
graph.hashed_layout_of(packed_int8_input),
};
Expand All @@ -261,21 +258,16 @@ void add_q8ta_conv2d_pw_node(
VK_KERNEL_FROM_STR(kernel_name),
pick_q8ta_conv2d_pw_global_wg_size,
pick_q8ta_conv2d_pw_local_wg_size,
// Inputs and Outputs
{{packed_int8_output, vkapi::kWrite},
{{packed_int8_input,
packed_weight,
packed_weight_sums,
packed_weight_scales,
packed_bias},
vkapi::kRead}},
// Shader params buffers
param_buffers,
// Push Constants
push_constants,
// Specialization Constants
spec_constants,
// Resize args
{}));
}

Expand Down
4 changes: 3 additions & 1 deletion backends/vulkan/runtime/vk_api/Device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,9 @@ PhysicalDevice::PhysicalDevice(
device_type = DeviceType::SWIFTSHADER;
} else if (device_name.find("nvidia") != std::string::npos) {
device_type = DeviceType::NVIDIA;
} else if (device_name.find("mali") != std::string::npos) {
} else if (
device_name.find("mali") != std::string::npos ||
device_name.find("immortalis") != std::string::npos) {
device_type = DeviceType::MALI;
}
}
Expand Down
29 changes: 23 additions & 6 deletions backends/vulkan/test/custom_ops/test_q8ta_conv2d.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -237,9 +237,9 @@ std::vector<TestCase> generate_quantized_conv2d_easy_cases() {
test_cases.push_back(create_test_case_from_config(
config, vkapi::kFloat, fp_storage_type, int8_memory_layout));

// Test im2col implementation for non-grouped convolutions with input
// channels that are a multiple of 4 and stride_w == 1
if (config.groups == 1 && config.channels.in % 4 == 0) {
// Test im2col implementation when input channels per group is a
// multiple of 4
if ((config.channels.in / config.groups) % 4 == 0) {
test_cases.push_back(create_test_case_from_config(
config,
vkapi::kFloat,
Expand Down Expand Up @@ -379,6 +379,21 @@ static std::vector<TestCase> generate_quantized_conv2d_test_cases() {
Padding(2, 2),
Dilation(1, 1),
4},
// SceneX v9 grouped convolutions (large spatial)
{OutInChannels(128, 128),
InputSize2D(256, 256),
KernelSize(5, 5),
Stride(2, 2),
Padding(2, 2),
Dilation(1, 1),
4},
{OutInChannels(64, 64),
InputSize2D(256, 256),
KernelSize(3, 3),
Stride(1, 1),
Padding(1, 1),
Dilation(1, 1),
2},
// Deep channels + small spatial (ResNet50 stage 5 bottleneck)
{OutInChannels(512, 512),
InputSize2D(7, 7),
Expand Down Expand Up @@ -426,9 +441,11 @@ static std::vector<TestCase> generate_quantized_conv2d_test_cases() {
int8_memory_layout,
/*impl_selector=*/"general"));

// Test im2col implementation for non-grouped convolutions with input
// channels that are a multiple of 4 and stride_w == 1
if (config.groups == 1 && config.channels.in % 4 == 0) {
// Test im2col implementation when input channels per group is a
// multiple of 4
const int64_t in_channels_per_group =
config.channels.in / config.groups;
if (in_channels_per_group % 4 == 0) {
test_cases.push_back(create_test_case_from_config(
config,
vkapi::kFloat,
Expand Down
Loading