Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
From 93646fb59c15da579acd91a3bfd2f4f4e21b9025 Mon Sep 17 00:00:00 2001
From: Tushar Darote <tdarote@qti.qualcomm.com>
Date: Wed, 31 Dec 2025 12:24:13 +0530
Subject: [PATCH 1/8] fix(gpu): adjust work-group size and remove
Adreno-specific alignment

- Updated `GetPossibleWorkGroups()` in `work_group_picking.cc`:
Use `std::min(kernel_info.max_work_group_size, gpu_info.GetMaxWorkGroupSizeForX())`
to ensure work-group size respects GPU X-dimension limits.

- Removed Adreno-specific pitch alignment logic in `inference_context.cc`:
Eliminated conditional division by `bytes_per_pixel` for width alignment
to simplify and standardize buffer size calculation.

Impact:
- Improves compatibility across GPUs.
- Prevents oversized work-groups and incorrect buffer alignment on Adreno devices.

Upstream-Status: Pending
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please submit upstream.


Change-Id: I3de58b529b95255e68ced7a9915a6dc49c7fd9e5
Signed-off-by: Tushar Darote <tdarote@qti.qualcomm.com>
---
tensorflow/lite/delegates/gpu/cl/inference_context.cc | 7 -------
.../lite/delegates/gpu/common/task/work_group_picking.cc | 2 +-
2 files changed, 1 insertion(+), 8 deletions(-)

diff --git a/tensorflow/lite/delegates/gpu/cl/inference_context.cc b/tensorflow/lite/delegates/gpu/cl/inference_context.cc
index fa28a4c6944..795dd3374b8 100644
--- a/tensorflow/lite/delegates/gpu/cl/inference_context.cc
+++ b/tensorflow/lite/delegates/gpu/cl/inference_context.cc
@@ -179,9 +179,6 @@ absl::Status GetBufferAssignment(
const size_t width = shape.b * shape.w;
const size_t height = shape.h * DivideRoundUp(shape.c, 4);
size_t width_pixel_alignment = gpu_info.opencl_info.image_pitch_alignment;
- if (gpu_info.IsAdreno() && width_pixel_alignment % bytes_per_pixel == 0) {
- width_pixel_alignment /= bytes_per_pixel;
- }
const size_t width_aligned = AlignByN(width, width_pixel_alignment);
buffer_size = width_aligned * bytes_per_pixel * height;
} else {
@@ -659,10 +656,6 @@ absl::Status InferenceContext::AllocateBufferBasedTensors(
: tensor_desc.GetBHWCShape().c);
size_t width_pixel_alignment =
gpu_info.opencl_info.image_pitch_alignment;
- if (gpu_info.IsAdreno() &&
- width_pixel_alignment % bytes_per_pixel == 0) {
- width_pixel_alignment /= bytes_per_pixel;
- }
RETURN_IF_ERROR(CreateTensorSharedImage2DBuffer(
*context, shared_buffers_[buffer_index].GetMemoryPtr(), tensor_desc,
width_pixel_alignment, &shared_buffer_tensors_[tensor_index]));
diff --git a/tensorflow/lite/delegates/gpu/common/task/work_group_picking.cc b/tensorflow/lite/delegates/gpu/common/task/work_group_picking.cc
index 0469891179e..a4946b37fcd 100644
--- a/tensorflow/lite/delegates/gpu/common/task/work_group_picking.cc
+++ b/tensorflow/lite/delegates/gpu/common/task/work_group_picking.cc
@@ -278,7 +278,7 @@ void GetPossibleWorkGroups(TuningType tuning_type, const GpuInfo& gpu_info,
switch (tuning_type) {
case TuningType::kFast:
work_groups->push_back(
- GetWorkGroup(grid, kernel_info.max_work_group_size));
+ GetWorkGroup(grid, std::min(kernel_info.max_work_group_size, gpu_info.GetMaxWorkGroupSizeForX())));
return;
case TuningType::kExhaustive: {
GetWorkGroupsAlignedToGrid(gpu_info, kernel_info, grid, work_groups);
--
2.34.1

Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
From ed9406ca5db327cc0db01f40f2be43a7a470cf3f Mon Sep 17 00:00:00 2001
From: Rob Clark <rob.clark@oss.qualcomm.com>
Date: Thu, 3 Jul 2025 10:44:44 +0200
Subject: [PATCH 2/8] softmax1x1: take reported max threads into account

Upstream-Status: Pending
---
tensorflow/lite/delegates/gpu/common/tasks/softmax1x1.cc | 3 +++
1 file changed, 3 insertions(+)

diff --git a/tensorflow/lite/delegates/gpu/common/tasks/softmax1x1.cc b/tensorflow/lite/delegates/gpu/common/tasks/softmax1x1.cc
index 54c0050781e..e3b0b6e5509 100644
--- a/tensorflow/lite/delegates/gpu/common/tasks/softmax1x1.cc
+++ b/tensorflow/lite/delegates/gpu/common/tasks/softmax1x1.cc
@@ -124,6 +124,9 @@ std::string Softmax1x1::GetSoftmaxKernelCode(const OperationDef& op_def) {
args_.AddFloat("mask_w");

std::string c;
+ c += "__attribute__((work_group_size_hint(" + std::to_string(work_group_size_.x) +
+ ", " + std::to_string(work_group_size_.y) +
+ ", " + std::to_string(work_group_size_.z) + ")))\n";
c += "MAIN_FUNCTION($0) {\n";
if (op_def.dst_tensors[0].HasAxis(Axis::BATCH)) {
c += " int linear_id = GROUP_ID_1;\n";
--
2.34.1

Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
From 241a7d975856b08913eb6f40cc4f9a2712b9f0d6 Mon Sep 17 00:00:00 2001
From: Rob Clark <rob.clark@oss.qualcomm.com>
Date: Mon, 14 Jul 2025 09:44:10 -0700
Subject: [PATCH 3/8] work_group_picking: max_z_size cannot exceed max wg size

If max_size is less than max_z_size, then dividing by wg_z gives zero,
leading to nonsense grid sizes like 0x0x64.
Upstream-Status: Pending
---
tensorflow/lite/delegates/gpu/common/task/work_group_picking.cc | 1 +
1 file changed, 1 insertion(+)

diff --git a/tensorflow/lite/delegates/gpu/common/task/work_group_picking.cc b/tensorflow/lite/delegates/gpu/common/task/work_group_picking.cc
index a4946b37fcd..915aee861d7 100644
--- a/tensorflow/lite/delegates/gpu/common/task/work_group_picking.cc
+++ b/tensorflow/lite/delegates/gpu/common/task/work_group_picking.cc
@@ -304,6 +304,7 @@ void GetPossibleWorkGroupsConv(TuningType tuning_type, const GpuInfo& gpu_info,
max_z_size = gpu_info.adreno_info.IsAdreno3xx() ? 16 : 64;
}
max_z_size = std::min(max_z_size, gpu_info.GetMaxWorkGroupSizeForZ());
+ max_z_size = std::min(max_z_size, kernel_info.max_work_group_size);
work_groups->push_back(
GetWorkGroupConv(grid, kernel_info.max_work_group_size, max_z_size));
return;
--
2.34.1

Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
From c7df7a3627ef250bf7a391e3bc9e247753837e07 Mon Sep 17 00:00:00 2001
From: Koen Kooi <koen.kooi@oss.qualcomm.com>
Date: Thu, 9 Oct 2025 18:11:16 +0200
Subject: [PATCH 4/8] cmake: lite/tools/benchmark: require protobuf through
find-package

The tool needs headers for building and the library for linking, so
have cmake find the things it needs.

Upstream-Status: Pending
Signed-off-by: Koen Kooi <koen.kooi@oss.qualcomm.com>
---
tensorflow/lite/tools/benchmark/CMakeLists.txt | 2 ++
1 file changed, 2 insertions(+)

diff --git a/tensorflow/lite/tools/benchmark/CMakeLists.txt b/tensorflow/lite/tools/benchmark/CMakeLists.txt
index 79986f6c0ec..b65ef61ccc7 100644
--- a/tensorflow/lite/tools/benchmark/CMakeLists.txt
+++ b/tensorflow/lite/tools/benchmark/CMakeLists.txt
@@ -16,6 +16,8 @@

# The benchmark tool for Tensorflow Lite.

+find_package(Protobuf REQUIRED)
+
populate_source_vars("${TFLITE_SOURCE_DIR}/tools/benchmark"
TFLITE_BENCHMARK_SRCS
FILTER "(_test|_plus_flex_main|_performance_options.*)\\.cc$"
--
2.34.1

Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
From d0d5843185b58b1fe20878a2725d6eb0352c3225 Mon Sep 17 00:00:00 2001
From: Koen Kooi <koen.kooi@oss.qualcomm.com>
Date: Thu, 9 Oct 2025 18:47:20 +0200
Subject: [PATCH 5/8] cmake: lite/examples/label_image: fix protobuf library
name

When wanting to link against 'libfoo.so', you need to specify '-lfoo' to
the linker, not '-llibfoo', so remove the redundant 'lib' prefix for
protobuf.

Fixes: "ld: cannot find -llibprotobuf: No such file or directory"

Signed-off-by: Koen Kooi <koen.kooi@oss.qualcomm.com>
Upstream-Status: Pending
---
tensorflow/lite/examples/label_image/CMakeLists.txt | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/lite/examples/label_image/CMakeLists.txt b/tensorflow/lite/examples/label_image/CMakeLists.txt
index 07ab2343ae5..2fcb09ce96e 100644
--- a/tensorflow/lite/examples/label_image/CMakeLists.txt
+++ b/tensorflow/lite/examples/label_image/CMakeLists.txt
@@ -84,5 +84,5 @@ target_compile_options(label_image
target_link_libraries(label_image
tensorflow-lite
profiling_info_proto
- libprotobuf
+ protobuf
)
--
2.34.1

Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
From 618e42da0ddeb6f1d27c24411a87628e202b139f Mon Sep 17 00:00:00 2001
From: Tushar Darote <quic_tdarote@quicinc.com>
Date: Thu, 9 Oct 2025 22:42:17 +0200
Subject: [PATCH 6/8] feat(tflite): Add dynamic OpenCL library loading support

This change adds support for dynamically loading the OpenCL library by:
1. Detecting OpenCL installation using pkg-config in CMakeLists.txt
2. Setting the appropriate library name based on OpenCL version
3. Passing the library name as a compile-time definition to the OpenCL wrapper
4. Modifying the OpenCL wrapper to use the configured library name instead of hardcoded defaults

The change allows TensorFlow Lite to properly locate and load OpenCL libraries on systems where the default library naming may differ from the hardcoded "libOpenCL.so" path.

Fixes potential issues with OpenCL library loading on various Linux distributions and systems with non-standard OpenCL installations.

Upstream-Status: Pending
Change-Id: I8987e4ce0206257b08e9720fee433770d7fefec6
Signed-off-by: Tushar Darote <quic_tdarote@quicinc.com>
---
tensorflow/lite/CMakeLists.txt | 14 ++++++++++++++
tensorflow/lite/delegates/gpu/cl/opencl_wrapper.cc | 4 ++++
2 files changed, 18 insertions(+)

diff --git a/tensorflow/lite/CMakeLists.txt b/tensorflow/lite/CMakeLists.txt
index 3198ec61e0b..591dca9efaa 100644
--- a/tensorflow/lite/CMakeLists.txt
+++ b/tensorflow/lite/CMakeLists.txt
@@ -745,6 +745,20 @@ if (NOT BUILD_SHARED_LIBS)
list(APPEND TFLITE_TARGET_PUBLIC_OPTIONS "-DTFL_STATIC_LIBRARY_BUILD")
endif()

+find_package(PkgConfig)
+pkg_check_modules(OPENCL opencl)
+
+if(OPENCL_FOUND)
+ string(REGEX MATCH "^[0-9]+" OPENCL_VERSION_MAJOR ${OPENCL_VERSION})
+ set(CL_LIB_NAME "libOpenCL.so.${OPENCL_VERSION_MAJOR}")
+else()
+ set(CL_LIB_NAME "libOpenCL.so")
+endif()
+
+target_compile_definitions(tensorflow-lite
+ PRIVATE CL_LIB_NAME=\"${CL_LIB_NAME}\"
+)
+
target_compile_options(tensorflow-lite
PUBLIC ${TFLITE_TARGET_PUBLIC_OPTIONS}
PRIVATE ${TFLITE_TARGET_PRIVATE_OPTIONS}
diff --git a/tensorflow/lite/delegates/gpu/cl/opencl_wrapper.cc b/tensorflow/lite/delegates/gpu/cl/opencl_wrapper.cc
index 2eb95df35ae..49551fd372a 100644
--- a/tensorflow/lite/delegates/gpu/cl/opencl_wrapper.cc
+++ b/tensorflow/lite/delegates/gpu/cl/opencl_wrapper.cc
@@ -119,7 +119,11 @@ absl::Status LoadOpenCLOnce() {
static const char* kClLibName =
"/System/Library/Frameworks/OpenCL.framework/OpenCL";
#else
+#ifndef CL_LIB_NAME
static const char* kClLibName = "libOpenCL.so";
+#else
+ static const char* kClLibName = CL_LIB_NAME;
+#endif
#endif
#ifdef __ANDROID__
libopencl = AndroidDlopenSphalLibrary(kClLibName, RTLD_NOW | RTLD_LOCAL);
--
2.34.1

Loading