Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions backends/vulkan/runtime/graph/ComputeGraph.h
Original file line number Diff line number Diff line change
Expand Up @@ -661,6 +661,11 @@ class ComputeGraph final {
inline bool device_is_adreno() {
  // True when the active Vulkan adapter reports a Qualcomm Adreno GPU.
  const auto dev_type = context_->adapter_ptr()->device_type();
  return dev_type == vkapi::DeviceType::ADRENO;
}

inline bool device_is_mali() {
  // True when the active Vulkan adapter reports an ARM Mali GPU.
  const auto dev_type = context_->adapter_ptr()->device_type();
  return dev_type == vkapi::DeviceType::MALI;
}

const std::string& device_name() {
  // Name string reported by the physical device; the reference is owned by
  // the adapter, so it stays valid for the adapter's lifetime.
  const auto adapter = context()->adapter_ptr();
  return adapter->device_name();
}
Expand Down
17 changes: 12 additions & 5 deletions backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d_pw.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -55,13 +55,14 @@ layout(push_constant) uniform restrict Block {
int input_zp;
float output_inv_scale;
int output_zp;
int K4_per_group;
int OC4_per_group;
};

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

${layout_declare_spec_const(C, "int", "apply_bias", "1")}
${layout_declare_spec_const(C, "int", "activation_type", "0")}
${layout_declare_spec_const(C, "int", "conv2d_params_K4_per_group", "1")}

// Layout specialization constants
${layout_declare_spec_const(C, "int", "outp_layout", "CONTIG_LAYOUT_INT")}
Expand Down Expand Up @@ -124,12 +125,18 @@ void main() {
}
}

// Compute initial input tile index
// Input has same spatial layout, channel dimension iterates from 0
int input_idx = oh * inp_h_stride + ow_block_idx * inp_w_stride;
// Compute group index from output channel block
const int group_idx = oc_block_idx / OC4_per_group;

// Compute initial input tile index with group offset
// For grouped im2col, each group's K range starts at group_idx * K4_per_group
// For non-grouped (groups=1), group_idx is always 0 so offset is 0
int input_idx = oh * inp_h_stride
+ ow_block_idx * inp_w_stride
+ group_idx * K4_per_group;

// Main accumulation loop over K dimension
for (int k4 = 0; k4 < conv2d_params_K4_per_group; k4++) {
for (int k4 = 0; k4 < K4_per_group; k4++) {
// Load packed int8 input tile (TILE_M4=1, TILE_K4=1)
// Each int contains 4 packed int8s (one per width position in the tile)
ivec4 int8_input_tile = t_packed_int8_input[input_idx];
Expand Down
21 changes: 15 additions & 6 deletions backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -430,12 +430,21 @@ void q8ta_conv2d(ComputeGraph& graph, const std::vector<ValueRef>& args) {
const int64_t W_out = graph.size_at<int64_t>(-1, output);
const int64_t spatial_out = H_out * W_out;

// Use im2col when the channel depth is sufficient for tiled GEMM to win, or
// when the output spatial area is small enough that the im2col buffer stays
// manageable. For large spatial outputs with few channels, the im2col buffer
// becomes too large and the general shader is more efficient.
const bool use_im2col = groups == 1 && in_channels_per_group % 4 == 0 &&
(in_channels_per_group >= 64 || spatial_out <= 4096);
// Im2col requires input channels per group to be a multiple of 4
const bool im2col_eligible = in_channels_per_group % 4 == 0;

bool use_im2col = false;
if (graph.device_is_mali()) {
// On Mali, im2col is faster than the general shader across the board.
use_im2col = im2col_eligible;
} else {
// Default: on Adreno and unknown GPU architectures, im2col is only
// beneficial for ungrouped convolutions with sufficient channel depth or
// small spatial output. For grouped convolutions, the general shader is
// more efficient (0.7-0.95x regression measured on Adreno).
use_im2col = im2col_eligible && groups == 1 &&
(in_channels_per_group >= 32 || spatial_out <= 4096);
}

if (use_im2col) {
q8ta_conv2d_im2col(graph, args);
Expand Down
22 changes: 21 additions & 1 deletion backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.h
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,27 @@ void add_q8ta_conv2d_pw_node(
const ValueRef bias_data,
const ValueRef packed_bias,
const uint32_t activation_type,
const ValueRef packed_int8_output);
const ValueRef packed_int8_output,
const int32_t groups = 1);

std::vector<int64_t> calculate_q8ta_im2col_sizes(
ComputeGraph* graph,
const ValueRef& input,
const ValueRef& output,
const ValueRef& kernel_size,
const ValueRef& groups);

void add_q8ta_im2col_node(
ComputeGraph& graph,
const ValueRef packed_int8_input,
const ValueRef kernel_size,
const ValueRef stride,
const ValueRef padding,
const ValueRef dilation,
const ValueRef groups,
const ValueRef packed_int8_output,
const ValueRef packed_int8_im2col,
const int32_t zp);

void q8ta_conv2d_im2col(ComputeGraph& graph, const std::vector<ValueRef>& args);

Expand Down
10 changes: 6 additions & 4 deletions backends/vulkan/runtime/graph/ops/impl/Q8taConv2dIm2Col.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -127,9 +127,8 @@ void add_q8ta_im2col_node(
dilation,
groups);

// At the moment, the im2col path only supports non-grouped convolutions
VK_CHECK_COND(conv_params.groups == 1);
// The implementation also requires that input channels is a multiple of 4
// The implementation requires that input channels per group is a multiple of
// 4
VK_CHECK_COND(conv_params.in_channels_per_group % 4 == 0);

std::string kernel_name = "q8ta_im2col";
Expand Down Expand Up @@ -257,6 +256,8 @@ void q8ta_conv2d_im2col(
zp);

// Step 2: Perform pointwise convolution on the im2col result
const int32_t groups_val = graph.extract_scalar<int32_t>(groups);

add_q8ta_conv2d_pw_node(
graph,
packed_int8_im2col,
Expand All @@ -270,7 +271,8 @@ void q8ta_conv2d_im2col(
bias_data,
packed_bias,
activation_type_val,
packed_int8_output);
packed_int8_output,
groups_val);
}

REGISTER_OPERATORS {
Expand Down
38 changes: 15 additions & 23 deletions backends/vulkan/runtime/graph/ops/impl/Q8taConv2dPW.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -200,39 +200,40 @@ void add_q8ta_conv2d_pw_node(
const ValueRef bias_data,
const ValueRef packed_bias,
const uint32_t activation_type,
const ValueRef packed_int8_output) {
// Validate packed dim info for input and output tensors
// To maximize performance, the input tensor must be in 4W4C layout
const ValueRef packed_int8_output,
const int32_t groups) {
VK_CHECK_COND(q8ta_conv2d_check_4w4c_packed_dim_info(
graph.packed_dim_info_of(packed_int8_input)));
// However, the requirements for output tensor layout is flexible
VK_CHECK_COND(q8ta_conv2d_check_packed_dim_info(
graph.packed_dim_info_of(packed_int8_output)));

// Validate dtype is kInt8x4
VK_CHECK_COND(graph.dtype_of(packed_int8_input) == vkapi::kInt8x4);
VK_CHECK_COND(graph.dtype_of(packed_int8_output) == vkapi::kInt8x4);

// Compute K4_per_group and OC4_per_group from tensor dimensions and groups
// Input K dim (dim -3) = K_per_group * groups for grouped im2col, or IC for
// non-grouped. Either way, K4_per_group = div_up_4(K_dim / groups).
const int32_t K_dim = graph.size_at<int32_t>(-3, packed_int8_input);
const int32_t OC = graph.size_at<int32_t>(-3, packed_int8_output);
const int32_t K4_per_group =
static_cast<int32_t>(utils::div_up_4(K_dim / groups));
const int32_t OC4_per_group =
static_cast<int32_t>(utils::div_up_4(OC / groups));

float input_scale_val = graph.extract_scalar<float>(input_scale);
int32_t input_zp_val = graph.extract_scalar<int32_t>(input_zp);

float output_inv_scale_val = 1.0f / graph.extract_scalar<float>(output_scale);
int32_t output_zp_val = graph.extract_scalar<int32_t>(output_zp);

uint32_t apply_bias = 1;
if (graph.val_is_none(bias_data)) {
apply_bias = 0;
}

// Get input channel count for K4_per_group
const uint32_t IC = graph.size_at<uint32_t>(-3, packed_int8_input);
const uint32_t K4_per_group = utils::div_up_4(IC);

uint32_t apply_bias = graph.val_is_none(bias_data) ? 0u : 1u;
std::vector<PushConstantDataInfo> push_constants = {
PushConstantDataInfo(&input_scale_val, sizeof(input_scale_val)),
PushConstantDataInfo(&input_zp_val, sizeof(input_zp_val)),
PushConstantDataInfo(&output_inv_scale_val, sizeof(output_inv_scale_val)),
PushConstantDataInfo(&output_zp_val, sizeof(output_zp_val)),
PushConstantDataInfo(&K4_per_group, sizeof(K4_per_group)),
PushConstantDataInfo(&OC4_per_group, sizeof(OC4_per_group)),
};

const bool use_hw_dot =
Expand All @@ -241,17 +242,13 @@ void add_q8ta_conv2d_pw_node(
use_hw_dot ? "q8ta_conv2d_pw" : "q8ta_conv2d_pw_fallback";
add_dtype_suffix(kernel_name, graph.dtype_of(packed_weight_scales));

// Pass metadata for both output and input tensors
vkapi::ParamsBindList param_buffers = {
graph.buffer_meta_ubo(packed_int8_output),
graph.buffer_meta_ubo(packed_int8_input)};

// Build spec constants: apply_bias, activation_type + layout constants
vkapi::SpecVarList spec_constants = {
apply_bias,
activation_type,
K4_per_group,
// Layout specialization constants
graph.hashed_layout_of(packed_int8_output),
graph.hashed_layout_of(packed_int8_input),
};
Expand All @@ -261,21 +258,16 @@ void add_q8ta_conv2d_pw_node(
VK_KERNEL_FROM_STR(kernel_name),
pick_q8ta_conv2d_pw_global_wg_size,
pick_q8ta_conv2d_pw_local_wg_size,
// Inputs and Outputs
{{packed_int8_output, vkapi::kWrite},
{{packed_int8_input,
packed_weight,
packed_weight_sums,
packed_weight_scales,
packed_bias},
vkapi::kRead}},
// Shader params buffers
param_buffers,
// Push Constants
push_constants,
// Specialization Constants
spec_constants,
// Resize args
{}));
}

Expand Down
4 changes: 3 additions & 1 deletion backends/vulkan/runtime/vk_api/Device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,9 @@ PhysicalDevice::PhysicalDevice(
device_type = DeviceType::SWIFTSHADER;
} else if (device_name.find("nvidia") != std::string::npos) {
device_type = DeviceType::NVIDIA;
} else if (device_name.find("mali") != std::string::npos) {
} else if (
device_name.find("mali") != std::string::npos ||
device_name.find("immortalis") != std::string::npos) {
device_type = DeviceType::MALI;
}
}
Expand Down
29 changes: 23 additions & 6 deletions backends/vulkan/test/custom_ops/test_q8ta_conv2d.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -237,9 +237,9 @@ std::vector<TestCase> generate_quantized_conv2d_easy_cases() {
test_cases.push_back(create_test_case_from_config(
config, vkapi::kFloat, fp_storage_type, int8_memory_layout));

// Test im2col implementation for non-grouped convolutions with input
// channels that are a multiple of 4 and stride_w == 1
if (config.groups == 1 && config.channels.in % 4 == 0) {
// Test im2col implementation when input channels per group is a
// multiple of 4
if ((config.channels.in / config.groups) % 4 == 0) {
test_cases.push_back(create_test_case_from_config(
config,
vkapi::kFloat,
Expand Down Expand Up @@ -379,6 +379,21 @@ static std::vector<TestCase> generate_quantized_conv2d_test_cases() {
Padding(2, 2),
Dilation(1, 1),
4},
// SceneX v9 grouped convolutions (large spatial)
{OutInChannels(128, 128),
InputSize2D(256, 256),
KernelSize(5, 5),
Stride(2, 2),
Padding(2, 2),
Dilation(1, 1),
4},
{OutInChannels(64, 64),
InputSize2D(256, 256),
KernelSize(3, 3),
Stride(1, 1),
Padding(1, 1),
Dilation(1, 1),
2},
// Deep channels + small spatial (ResNet50 stage 5 bottleneck)
{OutInChannels(512, 512),
InputSize2D(7, 7),
Expand Down Expand Up @@ -426,9 +441,11 @@ static std::vector<TestCase> generate_quantized_conv2d_test_cases() {
int8_memory_layout,
/*impl_selector=*/"general"));

// Test im2col implementation for non-grouped convolutions with input
// channels that are a multiple of 4 and stride_w == 1
if (config.groups == 1 && config.channels.in % 4 == 0) {
// Test im2col implementation when input channels per group is a
// multiple of 4
const int64_t in_channels_per_group =
config.channels.in / config.groups;
if (in_channels_per_group % 4 == 0) {
test_cases.push_back(create_test_case_from_config(
config,
vkapi::kFloat,
Expand Down
Loading