Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
From 93646fb59c15da579acd91a3bfd2f4f4e21b9025 Mon Sep 17 00:00:00 2001
From: Tushar Darote <tdarote@qti.qualcomm.com>
Date: Wed, 31 Dec 2025 12:24:13 +0530
Subject: [PATCH 1/8] fix(gpu): adjust work-group size and remove
Adreno-specific alignment

- Updated `GetPossibleWorkGroups()` in `work_group_picking.cc`:
Use `std::min(kernel_info.max_work_group_size, gpu_info.GetMaxWorkGroupSizeForX())`
to ensure work-group size respects GPU X-dimension limits.

- Removed Adreno-specific pitch alignment logic in `inference_context.cc`:
Eliminated conditional division by `bytes_per_pixel` for width alignment
to simplify and standardize buffer size calculation.

Impact:
- Improves compatibility across GPUs.
- Prevents oversized work-groups and incorrect buffer alignment on Adreno devices.

Upstream-Status: Pending
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please submit upstream.


Change-Id: I3de58b529b95255e68ced7a9915a6dc49c7fd9e5
Signed-off-by: Tushar Darote <tdarote@qti.qualcomm.com>
---
tensorflow/lite/delegates/gpu/cl/inference_context.cc | 7 -------
.../lite/delegates/gpu/common/task/work_group_picking.cc | 2 +-
2 files changed, 1 insertion(+), 8 deletions(-)

diff --git a/tensorflow/lite/delegates/gpu/cl/inference_context.cc b/tensorflow/lite/delegates/gpu/cl/inference_context.cc
index fa28a4c6944..795dd3374b8 100644
--- a/tensorflow/lite/delegates/gpu/cl/inference_context.cc
+++ b/tensorflow/lite/delegates/gpu/cl/inference_context.cc
@@ -179,9 +179,6 @@ absl::Status GetBufferAssignment(
const size_t width = shape.b * shape.w;
const size_t height = shape.h * DivideRoundUp(shape.c, 4);
size_t width_pixel_alignment = gpu_info.opencl_info.image_pitch_alignment;
- if (gpu_info.IsAdreno() && width_pixel_alignment % bytes_per_pixel == 0) {
- width_pixel_alignment /= bytes_per_pixel;
- }
const size_t width_aligned = AlignByN(width, width_pixel_alignment);
buffer_size = width_aligned * bytes_per_pixel * height;
} else {
@@ -659,10 +656,6 @@ absl::Status InferenceContext::AllocateBufferBasedTensors(
: tensor_desc.GetBHWCShape().c);
size_t width_pixel_alignment =
gpu_info.opencl_info.image_pitch_alignment;
- if (gpu_info.IsAdreno() &&
- width_pixel_alignment % bytes_per_pixel == 0) {
- width_pixel_alignment /= bytes_per_pixel;
- }
RETURN_IF_ERROR(CreateTensorSharedImage2DBuffer(
*context, shared_buffers_[buffer_index].GetMemoryPtr(), tensor_desc,
width_pixel_alignment, &shared_buffer_tensors_[tensor_index]));
diff --git a/tensorflow/lite/delegates/gpu/common/task/work_group_picking.cc b/tensorflow/lite/delegates/gpu/common/task/work_group_picking.cc
index 0469891179e..a4946b37fcd 100644
--- a/tensorflow/lite/delegates/gpu/common/task/work_group_picking.cc
+++ b/tensorflow/lite/delegates/gpu/common/task/work_group_picking.cc
@@ -278,7 +278,7 @@ void GetPossibleWorkGroups(TuningType tuning_type, const GpuInfo& gpu_info,
switch (tuning_type) {
case TuningType::kFast:
work_groups->push_back(
- GetWorkGroup(grid, kernel_info.max_work_group_size));
+ GetWorkGroup(grid, std::min(kernel_info.max_work_group_size, gpu_info.GetMaxWorkGroupSizeForX())));
return;
case TuningType::kExhaustive: {
GetWorkGroupsAlignedToGrid(gpu_info, kernel_info, grid, work_groups);
--
2.34.1

Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
From ed9406ca5db327cc0db01f40f2be43a7a470cf3f Mon Sep 17 00:00:00 2001
From: Rob Clark <rob.clark@oss.qualcomm.com>
Date: Thu, 3 Jul 2025 10:44:44 +0200
Subject: [PATCH 2/8] softmax1x1: take reported max threads into account

Upstream-Status: Pending
---
tensorflow/lite/delegates/gpu/common/tasks/softmax1x1.cc | 3 +++
1 file changed, 3 insertions(+)

diff --git a/tensorflow/lite/delegates/gpu/common/tasks/softmax1x1.cc b/tensorflow/lite/delegates/gpu/common/tasks/softmax1x1.cc
index 54c0050781e..e3b0b6e5509 100644
--- a/tensorflow/lite/delegates/gpu/common/tasks/softmax1x1.cc
+++ b/tensorflow/lite/delegates/gpu/common/tasks/softmax1x1.cc
@@ -124,6 +124,9 @@ std::string Softmax1x1::GetSoftmaxKernelCode(const OperationDef& op_def) {
args_.AddFloat("mask_w");

std::string c;
+ c += "__attribute__((work_group_size_hint(" + std::to_string(work_group_size_.x) +
+ ", " + std::to_string(work_group_size_.y) +
+ ", " + std::to_string(work_group_size_.z) + ")))\n";
c += "MAIN_FUNCTION($0) {\n";
if (op_def.dst_tensors[0].HasAxis(Axis::BATCH)) {
c += " int linear_id = GROUP_ID_1;\n";
--
2.34.1

Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
From 241a7d975856b08913eb6f40cc4f9a2712b9f0d6 Mon Sep 17 00:00:00 2001
From: Rob Clark <rob.clark@oss.qualcomm.com>
Date: Mon, 14 Jul 2025 09:44:10 -0700
Subject: [PATCH 3/8] work_group_picking: max_z_size cannot exceed max wg size

If max_size is less than max_z_size, then dividing by wg_z gives zero,
leading to nonsense grid sizes like 0x0x64.
Upstream-Status: Pending
---
tensorflow/lite/delegates/gpu/common/task/work_group_picking.cc | 1 +
1 file changed, 1 insertion(+)

diff --git a/tensorflow/lite/delegates/gpu/common/task/work_group_picking.cc b/tensorflow/lite/delegates/gpu/common/task/work_group_picking.cc
index a4946b37fcd..915aee861d7 100644
--- a/tensorflow/lite/delegates/gpu/common/task/work_group_picking.cc
+++ b/tensorflow/lite/delegates/gpu/common/task/work_group_picking.cc
@@ -304,6 +304,7 @@ void GetPossibleWorkGroupsConv(TuningType tuning_type, const GpuInfo& gpu_info,
max_z_size = gpu_info.adreno_info.IsAdreno3xx() ? 16 : 64;
}
max_z_size = std::min(max_z_size, gpu_info.GetMaxWorkGroupSizeForZ());
+ max_z_size = std::min(max_z_size, kernel_info.max_work_group_size);
work_groups->push_back(
GetWorkGroupConv(grid, kernel_info.max_work_group_size, max_z_size));
return;
--
2.34.1

Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
From c7df7a3627ef250bf7a391e3bc9e247753837e07 Mon Sep 17 00:00:00 2001
From: Koen Kooi <koen.kooi@oss.qualcomm.com>
Date: Thu, 9 Oct 2025 18:11:16 +0200
Subject: [PATCH 4/8] cmake: lite/tools/benchmark: require protobuf through
find-package

The tool needs headers for building and the library for linking, so
have cmake find the things it needs.

Upstream-Status: Pending
Signed-off-by: Koen Kooi <koen.kooi@oss.qualcomm.com>
---
tensorflow/lite/tools/benchmark/CMakeLists.txt | 2 ++
1 file changed, 2 insertions(+)

diff --git a/tensorflow/lite/tools/benchmark/CMakeLists.txt b/tensorflow/lite/tools/benchmark/CMakeLists.txt
index 79986f6c0ec..b65ef61ccc7 100644
--- a/tensorflow/lite/tools/benchmark/CMakeLists.txt
+++ b/tensorflow/lite/tools/benchmark/CMakeLists.txt
@@ -16,6 +16,8 @@

# The benchmark tool for Tensorflow Lite.

+find_package(Protobuf REQUIRED)
+
populate_source_vars("${TFLITE_SOURCE_DIR}/tools/benchmark"
TFLITE_BENCHMARK_SRCS
FILTER "(_test|_plus_flex_main|_performance_options.*)\\.cc$"
--
2.34.1

Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
From d0d5843185b58b1fe20878a2725d6eb0352c3225 Mon Sep 17 00:00:00 2001
From: Koen Kooi <koen.kooi@oss.qualcomm.com>
Date: Thu, 9 Oct 2025 18:47:20 +0200
Subject: [PATCH 5/8] cmake: lite/examples/label_image: fix protobuf library
name

When wanting to link against 'libfoo.so', you need to specify '-lfoo' to
the linker, not '-llibfoo', so remove the redundant 'lib' prefix for
protobuf.

Fixes: "ld: cannot find -llibprotobuf: No such file or directory"

Signed-off-by: Koen Kooi <koen.kooi@oss.qualcomm.com>
Upstream-Status: Pending
---
tensorflow/lite/examples/label_image/CMakeLists.txt | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/lite/examples/label_image/CMakeLists.txt b/tensorflow/lite/examples/label_image/CMakeLists.txt
index 07ab2343ae5..2fcb09ce96e 100644
--- a/tensorflow/lite/examples/label_image/CMakeLists.txt
+++ b/tensorflow/lite/examples/label_image/CMakeLists.txt
@@ -84,5 +84,5 @@ target_compile_options(label_image
target_link_libraries(label_image
tensorflow-lite
profiling_info_proto
- libprotobuf
+ protobuf
)
--
2.34.1

Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
From 618e42da0ddeb6f1d27c24411a87628e202b139f Mon Sep 17 00:00:00 2001
From: Tushar Darote <quic_tdarote@quicinc.com>
Date: Thu, 9 Oct 2025 22:42:17 +0200
Subject: [PATCH 6/8] feat(tflite): Add dynamic OpenCL library loading support

This change adds support for dynamically loading the OpenCL library by:
1. Detecting OpenCL installation using pkg-config in CMakeLists.txt
2. Setting the appropriate library name based on OpenCL version
3. Passing the library name as a compile-time definition to the OpenCL wrapper
4. Modifying the OpenCL wrapper to use the configured library name instead of hardcoded defaults

The change allows TensorFlow Lite to properly locate and load OpenCL libraries on systems where the default library naming may differ from the hardcoded "libOpenCL.so" path.

Fixes potential issues with OpenCL library loading on various Linux distributions and systems with non-standard OpenCL installations.

Upstream-Status: Pending
Change-Id: I8987e4ce0206257b08e9720fee433770d7fefec6
Signed-off-by: Tushar Darote <quic_tdarote@quicinc.com>
---
tensorflow/lite/CMakeLists.txt | 14 ++++++++++++++
tensorflow/lite/delegates/gpu/cl/opencl_wrapper.cc | 4 ++++
2 files changed, 18 insertions(+)

diff --git a/tensorflow/lite/CMakeLists.txt b/tensorflow/lite/CMakeLists.txt
index 3198ec61e0b..591dca9efaa 100644
--- a/tensorflow/lite/CMakeLists.txt
+++ b/tensorflow/lite/CMakeLists.txt
@@ -745,6 +745,20 @@ if (NOT BUILD_SHARED_LIBS)
list(APPEND TFLITE_TARGET_PUBLIC_OPTIONS "-DTFL_STATIC_LIBRARY_BUILD")
endif()

+find_package(PkgConfig)
+pkg_check_modules(OPENCL opencl)
+
+if(OPENCL_FOUND)
+ string(REGEX MATCH "^[0-9]+" OPENCL_VERSION_MAJOR ${OPENCL_VERSION})
+ set(CL_LIB_NAME "libOpenCL.so.${OPENCL_VERSION_MAJOR}")
+else()
+ set(CL_LIB_NAME "libOpenCL.so")
+endif()
+
+target_compile_definitions(tensorflow-lite
+ PRIVATE CL_LIB_NAME=\"${CL_LIB_NAME}\"
+)
+
target_compile_options(tensorflow-lite
PUBLIC ${TFLITE_TARGET_PUBLIC_OPTIONS}
PRIVATE ${TFLITE_TARGET_PRIVATE_OPTIONS}
diff --git a/tensorflow/lite/delegates/gpu/cl/opencl_wrapper.cc b/tensorflow/lite/delegates/gpu/cl/opencl_wrapper.cc
index 2eb95df35ae..49551fd372a 100644
--- a/tensorflow/lite/delegates/gpu/cl/opencl_wrapper.cc
+++ b/tensorflow/lite/delegates/gpu/cl/opencl_wrapper.cc
@@ -119,7 +119,11 @@ absl::Status LoadOpenCLOnce() {
static const char* kClLibName =
"/System/Library/Frameworks/OpenCL.framework/OpenCL";
#else
+#ifndef CL_LIB_NAME
static const char* kClLibName = "libOpenCL.so";
+#else
+ static const char* kClLibName = CL_LIB_NAME;
+#endif
#endif
#ifdef __ANDROID__
libopencl = AndroidDlopenSphalLibrary(kClLibName, RTLD_NOW | RTLD_LOCAL);
--
2.34.1

Loading