diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index a7c8cffffd1..5ce84dd705b 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -661,6 +661,11 @@ class ComputeGraph final { inline bool device_is_adreno() { return context_->adapter_ptr()->device_type() == vkapi::DeviceType::ADRENO; } + + inline bool device_is_mali() { + return context_->adapter_ptr()->device_type() == vkapi::DeviceType::MALI; + } + const std::string& device_name() { return context()->adapter_ptr()->device_name(); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d_pw.glsl b/backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d_pw.glsl index d408b7ca9b8..fc063579c45 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d_pw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d_pw.glsl @@ -55,13 +55,14 @@ layout(push_constant) uniform restrict Block { int input_zp; float output_inv_scale; int output_zp; + int K4_per_group; + int OC4_per_group; }; layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; ${layout_declare_spec_const(C, "int", "apply_bias", "1")} ${layout_declare_spec_const(C, "int", "activation_type", "0")} -${layout_declare_spec_const(C, "int", "conv2d_params_K4_per_group", "1")} // Layout specialization constants ${layout_declare_spec_const(C, "int", "outp_layout", "CONTIG_LAYOUT_INT")} @@ -124,12 +125,18 @@ void main() { } } - // Compute initial input tile index - // Input has same spatial layout, channel dimension iterates from 0 - int input_idx = oh * inp_h_stride + ow_block_idx * inp_w_stride; + // Compute group index from output channel block + const int group_idx = oc_block_idx / OC4_per_group; + + // Compute initial input tile index with group offset + // For grouped im2col, each group's K range starts at group_idx * K4_per_group + // For non-grouped (groups=1), group_idx is always 0 so offset is 0 + int input_idx = 
oh * inp_h_stride + + ow_block_idx * inp_w_stride + + group_idx * K4_per_group; // Main accumulation loop over K dimension - for (int k4 = 0; k4 < conv2d_params_K4_per_group; k4++) { + for (int k4 = 0; k4 < K4_per_group; k4++) { // Load packed int8 input tile (TILE_M4=1, TILE_K4=1) // Each int contains 4 packed int8s (one per width position in the tile) ivec4 int8_input_tile = t_packed_int8_input[input_idx]; diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp index d1a4840fbba..f6e89bef03d 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp @@ -430,12 +430,21 @@ void q8ta_conv2d(ComputeGraph& graph, const std::vector<ValueRef>& args) { const int64_t W_out = graph.size_at<int64_t>(-1, output); const int64_t spatial_out = H_out * W_out; - // Use im2col when the channel depth is sufficient for tiled GEMM to win, or - // when the output spatial area is small enough that the im2col buffer stays - // manageable. For large spatial outputs with few channels, the im2col buffer - // becomes too large and the general shader is more efficient. - const bool use_im2col = groups == 1 && in_channels_per_group % 4 == 0 && - (in_channels_per_group >= 64 || spatial_out <= 4096); + // Im2col requires input channels per group to be a multiple of 4 + const bool im2col_eligible = in_channels_per_group % 4 == 0; + + bool use_im2col = false; + if (graph.device_is_mali()) { + // On Mali, im2col is faster than the general shader across the board. + use_im2col = im2col_eligible; + } else { + // Default: on Adreno and unknown GPU architectures, im2col is only + // beneficial for ungrouped convolutions with sufficient channel depth or + // small spatial output. For grouped convolutions, the general shader is + // more efficient (0.7-0.95x regression measured on Adreno). 
+ use_im2col = im2col_eligible && groups == 1 && + (in_channels_per_group >= 32 || spatial_out <= 4096); + } if (use_im2col) { q8ta_conv2d_im2col(graph, args); diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.h b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.h index 2779a7445a8..6da98fbef74 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.h +++ b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.h @@ -121,7 +121,27 @@ void add_q8ta_conv2d_pw_node( const ValueRef bias_data, const ValueRef packed_bias, const uint32_t activation_type, - const ValueRef packed_int8_output); + const ValueRef packed_int8_output, + const int32_t groups = 1); + +std::vector<int64_t> calculate_q8ta_im2col_sizes( + ComputeGraph* graph, + const ValueRef& input, + const ValueRef& output, + const ValueRef& kernel_size, + const ValueRef& groups); + +void add_q8ta_im2col_node( + ComputeGraph& graph, + const ValueRef packed_int8_input, + const ValueRef kernel_size, + const ValueRef stride, + const ValueRef padding, + const ValueRef dilation, + const ValueRef groups, + const ValueRef packed_int8_output, + const ValueRef packed_int8_im2col, + const int32_t zp); void q8ta_conv2d_im2col(ComputeGraph& graph, const std::vector<ValueRef>& args); diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dIm2Col.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dIm2Col.cpp index 161b5e8fc24..b43fe9eacc6 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dIm2Col.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dIm2Col.cpp @@ -127,9 +127,8 @@ void add_q8ta_im2col_node( dilation, groups); - // At the moment, the im2col path only supports non-grouped convolutions - VK_CHECK_COND(conv_params.groups == 1); - // The implementation also requires that input channels is a multiple of 4 + // The implementation requires that input channels per group is a multiple of + // 4 VK_CHECK_COND(conv_params.in_channels_per_group % 4 == 0); std::string kernel_name = "q8ta_im2col"; @@ -257,6 
+256,8 @@ void q8ta_conv2d_im2col( zp); // Step 2: Perform pointwise convolution on the im2col result + const int32_t groups_val = graph.extract_scalar<int32_t>(groups); + add_q8ta_conv2d_pw_node( graph, packed_int8_im2col, @@ -270,7 +271,8 @@ void q8ta_conv2d_im2col( bias_data, packed_bias, activation_type_val, - packed_int8_output); + packed_int8_output, + groups_val); } REGISTER_OPERATORS { diff --git a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dPW.cpp b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dPW.cpp index 1872e8796de..e27e0699dac 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dPW.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Q8taConv2dPW.cpp @@ -200,39 +200,40 @@ void add_q8ta_conv2d_pw_node( const ValueRef bias_data, const ValueRef packed_bias, const uint32_t activation_type, - const ValueRef packed_int8_output) { - // Validate packed dim info for input and output tensors - // To maximize performance, the input tensor must be in 4W4C layout + const ValueRef packed_int8_output, + const int32_t groups) { VK_CHECK_COND(q8ta_conv2d_check_4w4c_packed_dim_info( graph.packed_dim_info_of(packed_int8_input))); - // However, the requirements for output tensor layout is flexible VK_CHECK_COND(q8ta_conv2d_check_packed_dim_info( graph.packed_dim_info_of(packed_int8_output))); - // Validate dtype is kInt8x4 VK_CHECK_COND(graph.dtype_of(packed_int8_input) == vkapi::kInt8x4); VK_CHECK_COND(graph.dtype_of(packed_int8_output) == vkapi::kInt8x4); + // Compute K4_per_group and OC4_per_group from tensor dimensions and groups + // Input K dim (dim -3) = K_per_group * groups for grouped im2col, or IC for + // non-grouped. Either way, K4_per_group = div_up_4(K_dim / groups). 
+ const int32_t K_dim = graph.size_at<int32_t>(-3, packed_int8_input); + const int32_t OC = graph.size_at<int32_t>(-3, packed_int8_output); + const int32_t K4_per_group = + static_cast<int32_t>(utils::div_up_4(K_dim / groups)); + const int32_t OC4_per_group = + static_cast<int32_t>(utils::div_up_4(OC / groups)); + float input_scale_val = graph.extract_scalar<float>(input_scale); int32_t input_zp_val = graph.extract_scalar<int32_t>(input_zp); float output_inv_scale_val = 1.0f / graph.extract_scalar<float>(output_scale); int32_t output_zp_val = graph.extract_scalar<int32_t>(output_zp); - uint32_t apply_bias = 1; - if (graph.val_is_none(bias_data)) { - apply_bias = 0; - } - - // Get input channel count for K4_per_group - const uint32_t IC = graph.size_at<uint32_t>(-3, packed_int8_input); - const uint32_t K4_per_group = utils::div_up_4(IC); - + uint32_t apply_bias = graph.val_is_none(bias_data) ? 0u : 1u; std::vector<PushConstantDataInfo> push_constants = { PushConstantDataInfo(&input_scale_val, sizeof(input_scale_val)), PushConstantDataInfo(&input_zp_val, sizeof(input_zp_val)), PushConstantDataInfo(&output_inv_scale_val, sizeof(output_inv_scale_val)), PushConstantDataInfo(&output_zp_val, sizeof(output_zp_val)), + PushConstantDataInfo(&K4_per_group, sizeof(K4_per_group)), + PushConstantDataInfo(&OC4_per_group, sizeof(OC4_per_group)), }; const bool use_hw_dot = 
"q8ta_conv2d_pw" : "q8ta_conv2d_pw_fallback"; add_dtype_suffix(kernel_name, graph.dtype_of(packed_weight_scales)); - // Pass metadata for both output and input tensors vkapi::ParamsBindList param_buffers = { graph.buffer_meta_ubo(packed_int8_output), graph.buffer_meta_ubo(packed_int8_input)}; - // Build spec constants: apply_bias, activation_type + layout constants vkapi::SpecVarList spec_constants = { apply_bias, activation_type, - K4_per_group, - // Layout specialization constants graph.hashed_layout_of(packed_int8_output), graph.hashed_layout_of(packed_int8_input), }; @@ -261,7 +258,6 @@ void add_q8ta_conv2d_pw_node( VK_KERNEL_FROM_STR(kernel_name), pick_q8ta_conv2d_pw_global_wg_size, pick_q8ta_conv2d_pw_local_wg_size, - // Inputs and Outputs {{packed_int8_output, vkapi::kWrite}, {{packed_int8_input, packed_weight, @@ -269,13 +265,9 @@ void add_q8ta_conv2d_pw_node( packed_weight_scales, packed_bias}, vkapi::kRead}}, - // Shader params buffers param_buffers, - // Push Constants push_constants, - // Specialization Constants spec_constants, - // Resize args {})); } diff --git a/backends/vulkan/runtime/vk_api/Device.cpp b/backends/vulkan/runtime/vk_api/Device.cpp index ef3d57be90d..dbe8a73651c 100644 --- a/backends/vulkan/runtime/vk_api/Device.cpp +++ b/backends/vulkan/runtime/vk_api/Device.cpp @@ -132,7 +132,9 @@ PhysicalDevice::PhysicalDevice( device_type = DeviceType::SWIFTSHADER; } else if (device_name.find("nvidia") != std::string::npos) { device_type = DeviceType::NVIDIA; - } else if (device_name.find("mali") != std::string::npos) { + } else if ( + device_name.find("mali") != std::string::npos || + device_name.find("immortalis") != std::string::npos) { device_type = DeviceType::MALI; } } diff --git a/backends/vulkan/test/custom_ops/test_q8ta_conv2d.cpp b/backends/vulkan/test/custom_ops/test_q8ta_conv2d.cpp index 41ddd389aa8..9f0273a5b83 100644 --- a/backends/vulkan/test/custom_ops/test_q8ta_conv2d.cpp +++ b/backends/vulkan/test/custom_ops/test_q8ta_conv2d.cpp 
@@ -237,9 +237,9 @@ std::vector generate_quantized_conv2d_easy_cases() { test_cases.push_back(create_test_case_from_config( config, vkapi::kFloat, fp_storage_type, int8_memory_layout)); - // Test im2col implementation for non-grouped convolutions with input - // channels that are a multiple of 4 and stride_w == 1 - if (config.groups == 1 && config.channels.in % 4 == 0) { + // Test im2col implementation when input channels per group is a + // multiple of 4 + if ((config.channels.in / config.groups) % 4 == 0) { test_cases.push_back(create_test_case_from_config( config, vkapi::kFloat, @@ -379,6 +379,21 @@ static std::vector generate_quantized_conv2d_test_cases() { Padding(2, 2), Dilation(1, 1), 4}, + // SceneX v9 grouped convolutions (large spatial) + {OutInChannels(128, 128), + InputSize2D(256, 256), + KernelSize(5, 5), + Stride(2, 2), + Padding(2, 2), + Dilation(1, 1), + 4}, + {OutInChannels(64, 64), + InputSize2D(256, 256), + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + 2}, // Deep channels + small spatial (ResNet50 stage 5 bottleneck) {OutInChannels(512, 512), InputSize2D(7, 7), @@ -426,9 +441,11 @@ static std::vector generate_quantized_conv2d_test_cases() { int8_memory_layout, /*impl_selector=*/"general")); - // Test im2col implementation for non-grouped convolutions with input - // channels that are a multiple of 4 and stride_w == 1 - if (config.groups == 1 && config.channels.in % 4 == 0) { + // Test im2col implementation when input channels per group is a + // multiple of 4 + const int64_t in_channels_per_group = + config.channels.in / config.groups; + if (in_channels_per_group % 4 == 0) { test_cases.push_back(create_test_case_from_config( config, vkapi::kFloat,