From 5f7319f157be9a4894326b65a5043dad6f5a6669 Mon Sep 17 00:00:00 2001
From: "Vlad (Kuzmin) Erium" <libalias@gmail.com>
Date: Wed, 7 Jan 2026 18:25:23 +0900
Subject: [PATCH 01/70] Add Google Highway SIMD acceleration to ImageBufAlgo
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Optional SIMD optimizations for selected ImageBufAlgo operations using the Google Highway library:
• add/sub
• mul/div
• mad
• resample
Adds CMake and build system support, new implementation helpers, and developer documentation.

Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 CMakeLists.txt                             |  14 +
 docs/dev/Architecture.md                   |   4 +
 docs/dev/ImageBufAlgo_Highway.md           | 264 ++++++
 src/cmake/externalpackages.cmake           |   3 +
 src/doc/imagebufalgo.rst                   |  62 ++
 src/doc/imageioapi.rst                     |  32 +-
 src/include/OpenImageIO/platform.h         |  10 +-
 src/libOpenImageIO/CMakeLists.txt          |  11 +
 src/libOpenImageIO/imagebufalgo_addsub.cpp | 311 ++++++-
 src/libOpenImageIO/imagebufalgo_hwy_pvt.h  | 970 +++++++++++++++++++++
 src/libOpenImageIO/imagebufalgo_mad.cpp    | 136 +--
 src/libOpenImageIO/imagebufalgo_muldiv.cpp | 185 +++-
 src/libOpenImageIO/imagebufalgo_xform.cpp  | 342 +++++++-
 src/libOpenImageIO/imageio.cpp             |   9 +
 14 files changed, 2237 insertions(+), 116 deletions(-)
 create mode 100644 docs/dev/ImageBufAlgo_Highway.md
 create mode 100644 src/libOpenImageIO/imagebufalgo_hwy_pvt.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2ed1589cfc..729acdd316 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -112,6 +112,20 @@ else ()
 endif ()
 option (${PROJ_NAME}_BUILD_TOOLS "Build the command-line tools" ON)
 option (${PROJ_NAME}_BUILD_TESTS "Build the unit tests" ON)
+
+# Google Highway SIMD acceleration for selected ImageBufAlgo ops. This is an
+# optional optimization dependency: if enabled but not found, it will be
+# compiled out.
+#
+# Back-compat: a legacy -DUSE_HWY=<bool> seeds OIIO_USE_HWY, but only while
+# OIIO_USE_HWY itself is still undefined, so FORCE is never needed here.
+if (DEFINED USE_HWY AND NOT DEFINED OIIO_USE_HWY)
+    set (OIIO_USE_HWY "${USE_HWY}" CACHE BOOL
+         "Enable Google Highway SIMD optimizations (if Highway is available)")
+else ()
+    option (OIIO_USE_HWY "Enable Google Highway SIMD optimizations (if Highway is available)" ON)
+endif ()
+
 set (OIIO_LIBNAME_SUFFIX "" CACHE STRING
      "Optional name appended to ${PROJECT_NAME} libraries that are built")
 option (BUILD_OIIOUTIL_ONLY "If ON, will build *only* libOpenImageIO_Util" OFF)
diff --git a/docs/dev/Architecture.md b/docs/dev/Architecture.md
index 5e52bf4143..72f81d8907 100644
--- a/docs/dev/Architecture.md
+++ b/docs/dev/Architecture.md
@@ -117,6 +117,10 @@ objects. These algorithms include simple operations like copying, resizing,
 and compositing images, as well as more complex operations like color
 conversions, resizing, filtering, etc.
 
+Some performance-critical `ImageBufAlgo` implementations have SIMD-accelerated
+paths using Google Highway. For implementation details and guidance for adding
+new kernels, see `docs/dev/ImageBufAlgo_Highway.md`.
+
 ## Image caching: TextureSystem and ImageCache
 
 There are situations where ImageBuf is still not the right abstraction,
diff --git a/docs/dev/ImageBufAlgo_Highway.md b/docs/dev/ImageBufAlgo_Highway.md
new file mode 100644
index 0000000000..960766978c
--- /dev/null
+++ b/docs/dev/ImageBufAlgo_Highway.md
@@ -0,0 +1,264 @@
+ImageBufAlgo Highway (hwy) Implementation Guide
+==============================================
+
+This document explains how OpenImageIO uses Google Highway (hwy) to accelerate
+selected `ImageBufAlgo` operations, and how to add or modify kernels in a way
+that preserves OIIO semantics while keeping the code maintainable.
+
+This is a developer-facing document about the implementation structure in
+`src/libOpenImageIO/`. It does not describe the public API behavior of the
+algorithms.
+
+
+Goals and non-goals
+-------------------
+
+Goals:
+- Make the hwy-backed code paths easy to read and easy to extend.
+- Centralize repetitive boilerplate (type conversion, tails, ROI pointer math).
+- Preserve OIIO's numeric semantics (normalized integer model).
+- Keep scalar fallbacks as the source of truth for tricky layout cases.
+
+Non-goals:
+- Explain Highway itself. Refer to the upstream Highway documentation.
+- Guarantee that every ImageBufAlgo op has a hwy implementation.
+
+
+Where the code lives
+--------------------
+
+Core helpers:
+- `src/libOpenImageIO/imagebufalgo_hwy_pvt.h`
+
+Typical hwy call sites:
+- `src/libOpenImageIO/imagebufalgo_addsub.cpp`
+- `src/libOpenImageIO/imagebufalgo_muldiv.cpp`
+- `src/libOpenImageIO/imagebufalgo_mad.cpp`
+- `src/libOpenImageIO/imagebufalgo_pixelmath.cpp`
+- `src/libOpenImageIO/imagebufalgo_xform.cpp` (some ops are hwy-accelerated)
+
+
+Enabling and gating the hwy path
+-------------------------------
+
+The hwy path is only used in builds compiled with `OIIO_USE_HWY=1`, and when:
+- Highway usage is enabled at runtime (`OIIO::pvt::enable_hwy`).
+- The relevant `ImageBuf` objects have local pixel storage (`localpixels()` is
+  non-null), meaning the data is in process memory rather than accessed through
+  an `ImageCache` tile abstraction.
+- The operation can be safely expressed as contiguous streams of pixels/channels
+  for the hot path, or the code falls back to a scalar implementation for
+  strided/non-contiguous layouts.
+
+The common gating pattern looks like:
+- In a typed `*_impl` dispatcher: check `OIIO::pvt::enable_hwy` and `localpixels`
+  and then call a `*_impl_hwy` function; otherwise call `*_impl_scalar`.
+
+Important: the hwy path is an optimization. Correctness must not depend on hwy.
+
+
+OIIO numeric semantics: why we promote to float
+----------------------------------------------
+
+OIIO treats integer image pixels as normalized values:
+- Unsigned integers represent [0, 1].
+- Signed integers represent approximately [-1, 1] with clamping for INT_MIN.
+
+Therefore, most pixel math must be performed in float (or double) space, even
+when the stored data is integer. This is why the hwy layer uses the
+"LoadPromote/Operate/DemoteStore" pattern.
+
+For additional discussion (and pitfalls of saturating integer arithmetic), see:
+- `HIGHWAY_SATURATING_ANALYSIS.md`
+
+
+The core pattern: LoadPromote -> RunHwy* -> DemoteStore
+-------------------------------------------------------
+
+The helper header `imagebufalgo_hwy_pvt.h` defines the reusable building blocks:
+
+1) Computation type selection
+   - `SimdMathType<T>` selects `float` for most types, and `double` only when
+     the destination type is `double`.
+
+   Rationale:
+   - Float math is significantly faster on many targets.
+   - For OIIO, integer images are normalized to [0,1] (or ~[-1,1]), so float
+     precision is sufficient for typical image processing workloads.
+
+2) Load and promote (with normalization)
+   - `LoadPromote(d, ptr)` and `LoadPromoteN(d, ptr, count)` load values and
+     normalize integer ranges into the computation space.
+
+   Rationale:
+   - Consolidates all normalization and conversion logic in one place.
+   - Prevents subtle drift where each operation re-implements integer scaling.
+   - Ensures tail handling ("N" variants) is correct and consistent.
+
+3) Demote and store (with denormalization/clamp/round)
+   - `DemoteStore(d, ptr, v)` and `DemoteStoreN(d, ptr, v, count)` reverse the
+     normalization and store results in the destination pixel type.
+
+   Rationale:
+   - Centralizes rounding and clamping behavior for all destination types.
+   - Ensures output matches OIIO scalar semantics.
+
+4) Generic kernel runners (streaming arrays)
+   - `RunHwyUnaryCmd`, `RunHwyCmd` (binary), `RunHwyTernaryCmd`
+   - These are the primary entry points for most hwy kernels.
+
+   Rationale:
+   - Encapsulates lane iteration and tail processing once.
+   - The call sites only provide the per-lane math lambda, not the boilerplate.
+
+
+Native integer runners: when they are valid
+-------------------------------------------
+
+Some operations are "scale-invariant" under OIIO's normalized integer model.
+For example, for unsigned integer add:
+- `(a/max + b/max)` in float space, then clamped to [0,1], then scaled by max
+  matches saturated integer add `SaturatedAdd(a, b)` for the same bit depth.
+
+For those cases, `imagebufalgo_hwy_pvt.h` provides:
+- `RunHwyUnaryNativeInt<T>`
+- `RunHwyBinaryNativeInt<T>`
+
+These should only be used when all of the following are true:
+- The operation is known to be scale-invariant under the normalization model.
+- Input and output types are the same integral type.
+- The operation does not depend on mixed types or float-range behavior.
+
+Rationale:
+- Avoids promotion/demotion overhead and can be materially faster.
+- Must be opt-in and explicit, because many operations are NOT compatible with
+  raw integer arithmetic (e.g. multiplication, division, pow).
+
+
+Local pixel pointer helpers: reducing boilerplate safely
+-------------------------------------------------------
+
+Most hwy call sites need repeated pointer and stride computations:
+- Pixel size in bytes.
+- Scanline size in bytes.
+- Base pointer to local pixels.
+- Per-row pointer for a given ROI and scanline.
+- Per-pixel pointer for non-contiguous fallbacks.
+
+To centralize that, `imagebufalgo_hwy_pvt.h` defines:
+- `HwyPixels(ImageBuf&)` and `HwyPixels(const ImageBuf&)`
+  returning a small view (`HwyLocalPixelsView`) with:
+  - base pointer (`std::byte*` / `const std::byte*`)
+  - `pixel_bytes`, `scanline_bytes`
+  - `xbegin`, `ybegin`, `nchannels`
+- `RoiNChannels(roi)` for `roi.chend - roi.chbegin`
+- `ChannelsContiguous<T>(view, nchannels)`:
+  true only when the pixel stride exactly equals `nchannels * sizeof(T)`
+- `PixelBase(view, x, y)`, `ChannelPtr<T>(view, x, y, ch)`
+- `RoiRowPtr<T>(view, y, roi)` for the start of the ROI row at `roi.xbegin` and
+  `roi.chbegin`.
+
+Rationale:
+- Avoids duplicating fragile byte-offset math across many ops.
+- Makes it visually obvious what the code is doing: "get row pointer" vs
+  "compute offset by hand."
+- Makes non-contiguous fallback paths less error-prone by reusing the same
+  pointer computations.
+
+Important: these helpers are only valid for `ImageBuf` instances with local
+pixels (`localpixels()` non-null). The call sites must check that before using
+them.
+
+
+Contiguous fast path vs non-contiguous fallback
+-----------------------------------------------
+
+Most operations implement two paths:
+
+1) Contiguous fast path:
+   - Used when pixels are tightly packed for the ROI's channel range.
+   - The operation is executed as a 1D stream of length:
+     `roi.width() * (roi.chend - roi.chbegin)`
+   - Uses `RunHwy*Cmd` (or native-int runner) and benefits from:
+     - fewer branches
+     - fewer pointer computations
+     - auto tail handling
+
+2) Non-contiguous fallback:
+   - Used when pixels have padding, unusual strides, or channel subsets that do
+     not form a dense stream.
+   - Typically loops pixel-by-pixel and channel-by-channel.
+   - May still use the `ChannelPtr` helpers to compute correct addresses.
+
+Rationale:
+- The contiguous path is where SIMD delivers large gains.
+- Trying to SIMD-optimize arbitrary strided layouts often increases complexity
+  and risk for marginal benefit. Keeping a scalar fallback preserves
+  correctness and maintainability.
+
+
+How to add a new hwy kernel
+---------------------------
+
+Step 1: Choose the kernel shape
+- Unary: `R = f(A)` -> use `RunHwyUnaryCmd`
+- Binary: `R = f(A, B)` -> use `RunHwyCmd`
+- Ternary: `R = f(A, B, C)` -> use `RunHwyTernaryCmd`
+
+Step 2: Decide if a native-int fast path is valid
+- Only for scale-invariant ops and same-type integral inputs/outputs.
+- Use `RunHwyUnaryNativeInt` / `RunHwyBinaryNativeInt` when safe.
+- Otherwise, always use the promote/demote runners.
+
+Step 3: Implement the hwy body with a contig check
+Typical structure inside `*_impl_hwy`:
+- Acquire views once:
+  - `auto Rv = HwyPixels(R);`
+  - `auto Av = HwyPixels(A);` etc.
+- In the parallel callback:
+  - compute `nchannels = RoiNChannels(roi)`
+  - compute `contig = ChannelsContiguous<...>(...)` for each image
+  - for each scanline y:
+    - `Rtype* r_row = RoiRowPtr<Rtype>(Rv, y, roi);`
+    - `const Atype* a_row = RoiRowPtr<Atype>(Av, y, roi);` etc.
+    - if contig: call `RunHwy*` with `n = roi.width() * nchannels`
+    - else: fall back per pixel, per channel
+
+Step 4: Keep the scalar path as the reference
+- The scalar implementation should remain correct for all layouts and types.
+- The hwy path should match scalar results for supported cases.
+
+
+Design rationale summary
+------------------------
+
+This design intentionally separates concerns:
+- Type conversion and normalization are centralized (`LoadPromote`,
+  `DemoteStore`).
+- SIMD lane iteration and tail handling are centralized (`RunHwy*` runners).
+- Image address computations are centralized (`HwyPixels`, `RoiRowPtr`,
+  `ChannelPtr`).
+- Operation-specific code is reduced to short lambdas expressing the math.
+
+This makes the hwy layer:
+- Easier to maintain: fewer places to fix bugs when semantics change.
+- Easier to extend: adding an op mostly means writing the math lambda and the
+  dispatch glue.
+- Safer: correctness for unusual layouts remains in scalar fallbacks.
+
+
+Notes on `half`
+---------------
+
+The hwy conversion helpers handle `half` by converting through
+`hwy::float16_t`. This currently assumes the underlying `half` representation
+is compatible with how Highway loads/stores 16-bit floats.
+
+If this assumption is revisited in the future, it should be changed as a
+separate, explicit correctness/performance project.
+
+
+<!-- SPDX-License-Identifier: CC-BY-4.0 -->
+<!-- Copyright Contributors to the OpenImageIO Project. -->
+
+
diff --git a/src/cmake/externalpackages.cmake b/src/cmake/externalpackages.cmake
index 12467ae6b6..6d02f267fa 100644
--- a/src/cmake/externalpackages.cmake
+++ b/src/cmake/externalpackages.cmake
@@ -225,6 +225,9 @@ if (USE_QT AND OPENGL_FOUND)
 endif ()
 
 
+# Google Highway for SIMD (optional optimization)
+checked_find_package (hwy ENABLE ${OIIO_USE_HWY})
+
 # Tessil/robin-map
 checked_find_package (Robinmap REQUIRED
                       VERSION_MIN 1.2.0
diff --git a/src/doc/imagebufalgo.rst b/src/doc/imagebufalgo.rst
index b013ce0d20..200417accc 100644
--- a/src/doc/imagebufalgo.rst
+++ b/src/doc/imagebufalgo.rst
@@ -152,6 +152,68 @@ the computation without spawning additional threads, which might tend to
 crowd out the other application threads.
 
 
+SIMD Performance and Data Types
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+When OpenImageIO is built with the optional Google Highway library, selected
+ImageBufAlgo operations use SIMD (Single Instruction, Multiple Data) code
+paths for significant speedups, particularly for integer image formats.
+
+**Integer Type Optimizations:**
+
+OpenImageIO treats all integer images as normalized Standard Dynamic Range
+(SDR) data:
+
+* Unsigned integers (``uint8``, ``uint16``, ``uint32``, ``uint64``) are
+  normalized to the [0.0, 1.0] range: ``float_value = int_value / max_value``
+* Signed integers (``int8``, ``int16``, ``int32``, ``int64``) are normalized
+  to approximately the [-1.0, 1.0] range: ``float_value = int_value / max_value``
+
+Most ImageBufAlgo operations convert integer data to float, perform the
+operation, and convert back. Highway SIMD provides 3-5x speedup for these
+operations compared to scalar code.
+
+**Scale-Invariant Operations:**
+
+Certain operations are *scale-invariant*, meaning they produce identical
+results whether performed on raw integers or normalized floats. For these
+operations, OpenImageIO uses native integer SIMD paths that avoid float
+conversion entirely, achieving 6-12x speedup (2-3x faster than the float
+promotion path):
+
+* ``add``, ``sub`` (with saturation)
+* ``min``, ``max``
+* ``abs``, ``absdiff``
+
+These optimizations automatically activate when all input and output images
+have matching integer types (e.g., all ``uint8``). When types differ or when
+mixing integer and float images, the standard float promotion path is used.
+
+**Controlling SIMD Optimizations:**
+
+Highway SIMD is enabled by default. To disable it globally::
+
+    OIIO::attribute("enable_hwy", 0);
+
+Or via environment variable::
+
+    export OPENIMAGEIO_ENABLE_HWY=0
+
+This is primarily useful for debugging or performance comparison. In normal
+use, the optimizations should remain enabled for best performance.
+
+**Performance Expectations:**
+
+Typical speedups with Highway SIMD (compared to scalar code):
+
+* Float operations: 3-5x faster
+* Integer operations (with float conversion): 3-5x faster
+* Integer scale-invariant operations (native int): 6-12x faster
+* Half-float operations: 3-5x faster
+
+Actual performance depends on the specific operation, image size, data types,
+and hardware capabilities (AVX2, AVX-512, ARM NEON, etc.).
+
 
 .. _sec-iba-patterns:
 
diff --git a/src/doc/imageioapi.rst b/src/doc/imageioapi.rst
index d2d6b192b4..dca5a66da5 100644
--- a/src/doc/imageioapi.rst
+++ b/src/doc/imageioapi.rst
@@ -397,16 +397,36 @@ inside the source code.
     line, but not the full human-readable command line. (This was added in
     OpenImageIO 2.5.11.)
 
+.. cpp:var:: OPENIMAGEIO_ENABLE_HWY
+
+    Controls whether to use Google Highway SIMD library optimizations for
+    ImageBufAlgo operations (only in builds with Highway support). If set to
+    "1" (the default), Highway SIMD optimizations will be enabled for
+    supported operations, providing significant performance improvements
+    (typically 3-12x faster) on integer image types. If set to "0", these
+    optimizations will be disabled and fall back to scalar implementations.
+
+    This can also be controlled at runtime via::
+
+        OIIO::attribute("enable_hwy", 1);  // enable (default)
+        OIIO::attribute("enable_hwy", 0);  // disable
+
+    Note: Highway SIMD optimizations are particularly beneficial for integer
+    image formats (uint8, uint16, int8, int16, uint32, int32, etc.) and provide
+    additional speedup for scale-invariant operations (add, sub, min, max,
+    absdiff) that can operate directly on integer data without float conversion.
+    (This was added in OpenImageIO 3.1.)
+
 .. cpp:var:: OPENIMAGEIO_PYTHON_LOAD_DLLS_FROM_PATH
 
-    Windows only. Mimics the DLL-loading behavior of Python 3.7 and earlier. 
-    If set to "1", all directories under ``PATH`` will be added to the DLL load 
+    Windows only. Mimics the DLL-loading behavior of Python 3.7 and earlier.
+    If set to "1", all directories under ``PATH`` will be added to the DLL load
     path before attempting to import the OpenImageIO module. (This was added in
     OpenImageIO 3.0.3.0)
 
-    Note: This "opt-in-style" behavior replaces and inverts the "opt-out-style" 
-    Windows DLL-loading behavior governed by the now-defunct `OIIO_LOAD_DLLS_FROM_PATH` 
-    environment variable (added in OpenImageIO 2.4.0/2.3.18). 
+    Note: This "opt-in-style" behavior replaces and inverts the "opt-out-style"
+    Windows DLL-loading behavior governed by the now-defunct `OIIO_LOAD_DLLS_FROM_PATH`
+    environment variable (added in OpenImageIO 2.4.0/2.3.18).
 
-    In other words, to reproduce the default Python-module-loading behavior of 
+    In other words, to reproduce the default Python-module-loading behavior of
     earlier versions of OIIO, set ``OPENIMAGEIO_PYTHON_LOAD_DLLS_FROM_PATH=1``.
diff --git a/src/include/OpenImageIO/platform.h b/src/include/OpenImageIO/platform.h
index e4760e5545..6ee6d107e8 100644
--- a/src/include/OpenImageIO/platform.h
+++ b/src/include/OpenImageIO/platform.h
@@ -39,6 +39,7 @@
 #endif
 
 #ifdef _MSC_VER
+#    include <malloc.h>  // for alloca
 #    include <intrin.h>
 #endif
 
@@ -305,8 +306,15 @@
 /// enough to cause trouble). Consider using the OIIO_ALLOCATE_STACK_OR_HEAP
 /// idiom rather than a direct OIIO_ALLOCA if you aren't sure the item will
 /// be small.
-#if defined(__GNUC__)
+#if defined(__has_include)
+#    if __has_include(<alloca.h>)
+#        include <alloca.h>  // for alloca (when available)
+#    endif
+#endif
+#if defined(__GNUC__) || defined(__clang__)
 #    define OIIO_ALLOCA(type, size) (assert(size < (1<<20)), (size) != 0 ? ((type*)__builtin_alloca((size) * sizeof(type))) : nullptr)
+#elif defined(_MSC_VER)
+#    define OIIO_ALLOCA(type, size) (assert(size < (1<<20)), (size) != 0 ? ((type*)_alloca((size) * sizeof(type))) : nullptr)
 #else
 #    define OIIO_ALLOCA(type, size) (assert(size < (1<<20)), (size) != 0 ? ((type*)alloca((size) * sizeof(type))) : nullptr)
 #endif
diff --git a/src/libOpenImageIO/CMakeLists.txt b/src/libOpenImageIO/CMakeLists.txt
index f2459b2d32..d813606755 100644
--- a/src/libOpenImageIO/CMakeLists.txt
+++ b/src/libOpenImageIO/CMakeLists.txt
@@ -168,6 +168,17 @@ target_link_libraries (OpenImageIO
             ${CMAKE_DL_LIBS}
         )
 
+# Google Highway (hwy) is an optional optimization dependency.
+set (_oiio_use_hwy 0)
+if (OIIO_USE_HWY AND TARGET hwy::hwy)
+    set (_oiio_use_hwy 1)
+    target_link_libraries (OpenImageIO PRIVATE hwy::hwy)
+    if (TARGET hwy::hwy_contrib)
+        target_link_libraries (OpenImageIO PRIVATE hwy::hwy_contrib)
+    endif ()
+endif ()
+target_compile_definitions (OpenImageIO PRIVATE OIIO_USE_HWY=${_oiio_use_hwy})
+
 if (WIN32)
     target_link_libraries (OpenImageIO PRIVATE psapi)
 endif()
diff --git a/src/libOpenImageIO/imagebufalgo_addsub.cpp b/src/libOpenImageIO/imagebufalgo_addsub.cpp
index c7a4d83e9c..e8a25d86bf 100644
--- a/src/libOpenImageIO/imagebufalgo_addsub.cpp
+++ b/src/libOpenImageIO/imagebufalgo_addsub.cpp
@@ -10,6 +10,10 @@
 #include <iostream>
 #include <limits>
 
+#if defined(_WIN32)
+#    include <malloc.h>  // for alloca
+#endif
+
 #include <OpenImageIO/half.h>
 
 #include <OpenImageIO/dassert.h>
@@ -18,6 +22,10 @@
 #include <OpenImageIO/imagebufalgo.h>
 #include <OpenImageIO/imagebufalgo_util.h>
 
+#if defined(OIIO_USE_HWY) && OIIO_USE_HWY
+#    include "imagebufalgo_hwy_pvt.h"
+#endif
+
 #include "imageio_pvt.h"
 
 
@@ -26,8 +34,8 @@ OIIO_NAMESPACE_3_1_BEGIN
 
 template<class Rtype, class Atype, class Btype>
 static bool
-add_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
-         int nthreads)
+add_impl_scalar(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
+                int nthreads)
 {
     ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) {
         ImageBuf::Iterator<Rtype> r(R, roi);
@@ -44,7 +52,8 @@ add_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
 
 template<class Rtype, class Atype>
 static bool
-add_impl(ImageBuf& R, const ImageBuf& A, cspan<float> b, ROI roi, int nthreads)
+add_impl_scalar(ImageBuf& R, const ImageBuf& A, cspan<float> b, ROI roi,
+                int nthreads)
 {
     ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) {
         ImageBuf::Iterator<Rtype> r(R, roi);
@@ -58,6 +67,298 @@ add_impl(ImageBuf& R, const ImageBuf& A, cspan<float> b, ROI roi, int nthreads)
 
 
+#if defined(OIIO_USE_HWY) && OIIO_USE_HWY
+// Native integer add using SaturatedAdd (scale-invariant, no float conversion)
+template<class T>
+static bool
+add_impl_hwy_native_int(ImageBuf& R, const ImageBuf& A, const ImageBuf& B,
+                        ROI roi, int nthreads)
+{
+    auto Rv = HwyPixels(R);
+    auto Av = HwyPixels(A);
+    auto Bv = HwyPixels(B);
+    ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) {
+        const int nchannels = RoiNChannels(roi);
+        const bool contig   = ChannelsContiguous<T>(Rv, nchannels)
+                            && ChannelsContiguous<T>(Av, nchannels)
+                            && ChannelsContiguous<T>(Bv, nchannels);
+
+        for (int y = roi.ybegin; y < roi.yend; ++y) {
+            T* r_row       = RoiRowPtr<T>(Rv, y, roi);
+            const T* a_row = RoiRowPtr<T>(Av, y, roi);
+            const T* b_row = RoiRowPtr<T>(Bv, y, roi);
+
+            if (contig) {
+                // Contiguous row: one saturating integer stream, no float trip
+                size_t n = static_cast<size_t>(roi.width())
+                           * static_cast<size_t>(nchannels);
+                RunHwyBinaryNativeInt<T>(r_row, a_row, b_row, n,
+                                         [](auto d, auto a, auto b) {
+                                             return hn::SaturatedAdd(a, b);
+                                         });
+            } else {
+                // Strided layout: per-pixel scalar fallback, same saturation
+                for (int x = roi.xbegin; x < roi.xend; ++x) {
+                    T* r_ptr       = ChannelPtr<T>(Rv, x, y, roi.chbegin);
+                    const T* a_ptr = ChannelPtr<T>(Av, x, y, roi.chbegin);
+                    const T* b_ptr = ChannelPtr<T>(Bv, x, y, roi.chbegin);
+                    for (int c = 0; c < nchannels; ++c) {
+                        // Saturating add via range checks rather than a wider
+                        // intermediate, so it stays correct for 64-bit T; the
+                        // sum is only formed once it is known to fit in T.
+                        constexpr T hi = std::numeric_limits<T>::max();
+                        const T a = a_ptr[c], b = b_ptr[c];
+                        if (b > 0 && a > hi - b) {
+                            r_ptr[c] = hi;
+                        } else if constexpr (std::is_signed_v<T>) {
+                            constexpr T lo = std::numeric_limits<T>::min();
+                            r_ptr[c] = (b < 0 && a < lo - b) ? lo : T(a + b);
+                        } else {
+                            r_ptr[c] = T(a + b);
+                        }
+                    }
+                }
+            }
+        }
+    });
+    return true;
+}
+
+// Image + image add via Highway: contiguous rows stream through RunHwyCmd,
+// other layouts use a per-pixel fallback with OIIO's normalized conversions.
+template<class Rtype, class Atype, class Btype>
+static bool
+add_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
+             int nthreads)
+{
+    auto Rv = HwyPixels(R);
+    auto Av = HwyPixels(A);
+    auto Bv = HwyPixels(B);
+    ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) {
+        const int nchannels = RoiNChannels(roi);
+        const bool contig   = ChannelsContiguous<Rtype>(Rv, nchannels)
+                            && ChannelsContiguous<Atype>(Av, nchannels)
+                            && ChannelsContiguous<Btype>(Bv, nchannels);
+
+        for (int y = roi.ybegin; y < roi.yend; ++y) {
+            Rtype* r_row       = RoiRowPtr<Rtype>(Rv, y, roi);
+            const Atype* a_row = RoiRowPtr<Atype>(Av, y, roi);
+            const Btype* b_row = RoiRowPtr<Btype>(Bv, y, roi);
+
+            if (contig) {
+                // Process whole line as one vector stream
+                size_t n = static_cast<size_t>(roi.width())
+                           * static_cast<size_t>(nchannels);
+                RunHwyCmd<Rtype, Atype, Btype>(r_row, a_row, b_row, n,
+                                               [](auto d, auto a, auto b) {
+                                                   return hn::Add(a, b);
+                                               });
+            } else {
+                // Strided channels: scalar per-pixel fallback using convert_type
+                for (int x = roi.xbegin; x < roi.xend; ++x) {
+                    Rtype* rp       = ChannelPtr<Rtype>(Rv, x, y, roi.chbegin);
+                    const Atype* ap = ChannelPtr<Atype>(Av, x, y, roi.chbegin);
+                    const Btype* bp = ChannelPtr<Btype>(Bv, x, y, roi.chbegin);
+                    for (int c = 0; c < nchannels; ++c) {
+                        rp[c] = convert_type<float, Rtype>(
+                            convert_type<Atype, float>(ap[c])
+                            + convert_type<Btype, float>(bp[c]));
+                    }
+                }
+            }
+        }
+    });
+    return true;
+}
+
+template<class Rtype, class Atype>
+static bool
+add_impl_hwy(ImageBuf& R, const ImageBuf& A, cspan<float> b, ROI roi,
+             int nthreads)
+{
+    using SimdType
+        = std::conditional_t<std::is_same_v<Rtype, double>, double, float>;
+    auto Rv = HwyPixels(R);
+    auto Av = HwyPixels(A);
+    ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) {
+        for (int y = roi.ybegin; y < roi.yend; ++y) {
+            std::byte* r_px       = PixelBase(Rv, roi.xbegin, y);
+            const std::byte* a_px = PixelBase(Av, roi.xbegin, y);
+            for (int x = roi.xbegin; x < roi.xend; ++x) {
+                Rtype* rp       = reinterpret_cast<Rtype*>(r_px);
+                const Atype* ap = reinterpret_cast<const Atype*>(a_px);
+                // convert_type normalizes integer pixels like the scalar path
+                for (int c = roi.chbegin; c < roi.chend; ++c)
+                    rp[c] = convert_type<SimdType, Rtype>(
+                        convert_type<Atype, SimdType>(ap[c]) + SimdType(b[c]));
+                r_px += Rv.pixel_bytes;
+                a_px += Av.pixel_bytes;
+            }
+        }
+    });
+    return true;
+}
+#endif  // defined(OIIO_USE_HWY) && OIIO_USE_HWY
+
+template<class Rtype, class Atype, class Btype>
+static bool
+add_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
+         int nthreads)
+{
+#if defined(OIIO_USE_HWY) && OIIO_USE_HWY
+    if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels()
+        && B.localpixels()) {
+        // Use native integer path for scale-invariant add when all types match
+        // and are integer types (skips the float promote/demote round trip).
+        constexpr bool all_same = std::is_same_v<Rtype, Atype>
+                                  && std::is_same_v<Atype, Btype>;
+        constexpr bool is_integer = std::is_integral_v<Rtype>;
+        if constexpr (all_same && is_integer)
+            return add_impl_hwy_native_int<Rtype>(R, A, B, roi, nthreads);
+        else  // discarded (never instantiated) for same-integer types
+            return add_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
+    }
+#endif
+    return add_impl_scalar<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
+}
+
+template<class Rtype, class Atype>
+static bool
+add_impl(ImageBuf& R, const ImageBuf& A, cspan<float> b, ROI roi, int nthreads)
+{
+#if defined(OIIO_USE_HWY) && OIIO_USE_HWY
+    if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels())
+        return add_impl_hwy<Rtype, Atype>(R, A, b, roi, nthreads);
+#endif  // OIIO_USE_HWY; otherwise, or when gated off, use the scalar path
+    return add_impl_scalar<Rtype, Atype>(R, A, b, roi, nthreads);
+}
+
+#if defined(OIIO_USE_HWY) && OIIO_USE_HWY
+// Native integer sub using SaturatedSub (scale-invariant, no float conversion)
+template<class T>
+static bool
+sub_impl_hwy_native_int(ImageBuf& R, const ImageBuf& A, const ImageBuf& B,
+                        ROI roi, int nthreads)
+{
+    auto Rv = HwyPixels(R);
+    auto Av = HwyPixels(A);
+    auto Bv = HwyPixels(B);
+    ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) {
+        const int nchannels = RoiNChannels(roi);
+        const bool contig   = ChannelsContiguous<T>(Rv, nchannels)
+                            && ChannelsContiguous<T>(Av, nchannels)
+                            && ChannelsContiguous<T>(Bv, nchannels);
+
+        for (int y = roi.ybegin; y < roi.yend; ++y) {
+            T* r_row       = RoiRowPtr<T>(Rv, y, roi);
+            const T* a_row = RoiRowPtr<T>(Av, y, roi);
+            const T* b_row = RoiRowPtr<T>(Bv, y, roi);
+
+            if (contig) {
+                // Contiguous row: one saturating integer stream, no float trip
+                size_t n = static_cast<size_t>(roi.width())
+                           * static_cast<size_t>(nchannels);
+                RunHwyBinaryNativeInt<T>(r_row, a_row, b_row, n,
+                                         [](auto d, auto a, auto b) {
+                                             return hn::SaturatedSub(a, b);
+                                         });
+            } else {
+                // Strided layout: per-pixel scalar fallback, same saturation
+                for (int x = roi.xbegin; x < roi.xend; ++x) {
+                    T* r_ptr       = ChannelPtr<T>(Rv, x, y, roi.chbegin);
+                    const T* a_ptr = ChannelPtr<T>(Av, x, y, roi.chbegin);
+                    const T* b_ptr = ChannelPtr<T>(Bv, x, y, roi.chbegin);
+                    for (int c = 0; c < nchannels; ++c) {
+                        // Overflow-safe saturating sub (valid for 64-bit T)
+                        const T a = a_ptr[c], b = b_ptr[c];
+                        if constexpr (std::is_unsigned_v<T>) {
+                            r_ptr[c] = (a > b) ? T(a - b) : T(0);
+                        } else {
+                            constexpr T hi = std::numeric_limits<T>::max();
+                            constexpr T lo = std::numeric_limits<T>::min();
+                            if (b < 0 && a > hi + b)
+                                r_ptr[c] = hi;
+                            else if (b > 0 && a < lo + b)
+                                r_ptr[c] = lo;
+                            else
+                                r_ptr[c] = T(a - b);
+                        }
+                    }
+                }
+            }
+        }
+    });
+    return true;
+}
+
+template<class Rtype, class Atype, class Btype>
+static bool
+sub_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
+             int nthreads)
+{
+    auto Rv = HwyPixels(R);
+    auto Av = HwyPixels(A);
+    auto Bv = HwyPixels(B);
+    ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) {
+        const int nchannels = RoiNChannels(roi);
+        const bool contig   = ChannelsContiguous<Rtype>(Rv, nchannels)
+                            && ChannelsContiguous<Atype>(Av, nchannels)
+                            && ChannelsContiguous<Btype>(Bv, nchannels);
+        // Dense rows take the SIMD kernel; strided pixels are done one by one.
+        for (int y = roi.ybegin; y < roi.yend; ++y) {
+            Rtype* r_row       = RoiRowPtr<Rtype>(Rv, y, roi);
+            const Atype* a_row = RoiRowPtr<Atype>(Av, y, roi);
+            const Btype* b_row = RoiRowPtr<Btype>(Bv, y, roi);
+
+            if (contig) {
+                size_t n = static_cast<size_t>(roi.width())
+                           * static_cast<size_t>(nchannels);
+                RunHwyCmd<Rtype, Atype, Btype>(r_row, a_row, b_row, n,
+                                               [](auto d, auto a, auto b) {
+                                                   return hn::Sub(a, b);
+                                               });
+            } else {
+                for (int x = roi.xbegin; x < roi.xend; ++x) {
+                    auto* r_ptr = ChannelPtr<Rtype>(Rv, x, y, roi.chbegin);
+                    auto* a_ptr = ChannelPtr<Atype>(Av, x, y, roi.chbegin);
+                    auto* b_ptr = ChannelPtr<Btype>(Bv, x, y, roi.chbegin);
+                    for (int c = 0; c < nchannels; ++c) {
+                        // Normalizing int<->float conversion (round + clamp),
+                        // as in LoadPromote/DemoteStore; a raw cast skips it.
+                        r_ptr[c] = convert_type<float, Rtype>(
+                            convert_type<Atype, float>(a_ptr[c])
+                            - convert_type<Btype, float>(b_ptr[c]));
+                    }
+                }
+            }
+        }
+    });
+    return true;
+}
+#endif  // defined(OIIO_USE_HWY) && OIIO_USE_HWY
+
+template<class Rtype, class Atype, class Btype>
+static bool
+sub_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
+         int nthreads)
+{
+#if defined(OIIO_USE_HWY) && OIIO_USE_HWY
+    if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels()
+        && B.localpixels()) {
+        // Three identical integer types subtract lane-for-lane with
+        // saturation; any other combination uses the generic kernel.
+        constexpr bool native_int = std::is_integral_v<Rtype>
+                                    && std::is_same_v<Rtype, Atype>
+                                    && std::is_same_v<Atype, Btype>;
+        if constexpr (native_int)
+            return sub_impl_hwy_native_int<Rtype>(R, A, B, roi, nthreads);
+        else
+            return sub_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
+    }
+#endif
+    return sub_impl_scalar<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
+}
+
 static bool
 add_impl_deep(ImageBuf& R, const ImageBuf& A, cspan<float> b, ROI roi,
               int nthreads)
@@ -155,8 +456,8 @@ ImageBufAlgo::add(Image_or_Const A, Image_or_Const B, ROI roi, int nthreads)
 
 template<class Rtype, class Atype, class Btype>
 static bool
-sub_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
-         int nthreads)
+sub_impl_scalar(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
+                int nthreads)
 {
     ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) {
         ImageBuf::Iterator<Rtype> r(R, roi);
diff --git a/src/libOpenImageIO/imagebufalgo_hwy_pvt.h b/src/libOpenImageIO/imagebufalgo_hwy_pvt.h
new file mode 100644
index 0000000000..fe4c9b0d8a
--- /dev/null
+++ b/src/libOpenImageIO/imagebufalgo_hwy_pvt.h
@@ -0,0 +1,970 @@
+// Copyright Contributors to the OpenImageIO project.
+// SPDX-License-Identifier: Apache-2.0
+// https://github.com/AcademySoftwareFoundation/OpenImageIO
+
+#pragma once
+
+#include <OpenImageIO/half.h>
+#include <OpenImageIO/imagebuf.h>
+#include <OpenImageIO/imageio.h>
+#include <algorithm>
+#include <cstddef>
+#include <hwy/contrib/math/math-inl.h>
+#include <hwy/highway.h>
+#include <type_traits>
+
+OIIO_NAMESPACE_BEGIN
+
+// Alias for Highway's namespace for convenience
+namespace hn = hwy::HWY_NAMESPACE;
+
+// -----------------------------------------------------------------------
+// ImageBuf local pixel helpers (header-only)
+// -----------------------------------------------------------------------
+
+template<class ByteT> struct HwyLocalPixelsView {  // raw byte view of pixels
+    ByteT* base           = nullptr;  // address of pixel (xbegin, ybegin)
+    size_t pixel_bytes    = 0;        // byte step between adjacent pixels
+    size_t scanline_bytes = 0;        // byte step between adjacent rows
+    int xbegin            = 0;        // x coordinate that `base` maps to
+    int ybegin            = 0;        // y coordinate that `base` maps to
+    int nchannels         = 0;        // channel count copied from the spec
+};
+
+inline HwyLocalPixelsView<std::byte>
+HwyPixels(ImageBuf& img)
+{
+    const ImageSpec& spec = img.spec();  // only the channel count is needed
+    return { reinterpret_cast<std::byte*>(img.localpixels()),
+             static_cast<size_t>(img.pixel_stride()),     // real buffer step,
+             static_cast<size_t>(img.scanline_stride()),  // not the spec size
+             img.xbegin(),
+             img.ybegin(),
+             spec.nchannels };
+}
+
+inline HwyLocalPixelsView<const std::byte>
+HwyPixels(const ImageBuf& img)
+{
+    const ImageSpec& spec = img.spec();  // only the channel count is needed
+    return { reinterpret_cast<const std::byte*>(img.localpixels()),
+             static_cast<size_t>(img.pixel_stride()),     // real buffer step,
+             static_cast<size_t>(img.scanline_stride()),  // not the spec size
+             img.xbegin(),
+             img.ybegin(),
+             spec.nchannels };
+}
+
+inline int
+RoiNChannels(const ROI& roi) noexcept
+{
+    return roi.nchannels();  // ROI's own accessor for chend - chbegin
+}
+
+template<class T, class ByteT>
+inline bool
+ChannelsContiguous(const HwyLocalPixelsView<ByteT>& v, int nchannels) noexcept
+{  // true when nchannels values of type T fill one pixel stride exactly
+    return v.pixel_bytes == sizeof(T) * static_cast<size_t>(nchannels);
+}
+
+template<class ByteT>
+inline ByteT*
+PixelBase(const HwyLocalPixelsView<ByteT>& v, int x, int y) noexcept
+{
+    ByteT* row = v.base + size_t(y - v.ybegin) * v.scanline_bytes;
+    return row + size_t(x - v.xbegin) * v.pixel_bytes;  // step to column x
+}
+
+template<class T, class ByteT>
+inline std::conditional_t<std::is_const_v<ByteT>, const T*, T*>
+ChannelPtr(const HwyLocalPixelsView<ByteT>& v, int x, int y, int ch) noexcept
+{  // typed pointer to channel `ch` of pixel (x, y); constness follows ByteT
+    using Elem = std::conditional_t<std::is_const_v<ByteT>, const T, T>;
+    return reinterpret_cast<Elem*>(PixelBase(v, x, y) + sizeof(T) * size_t(ch));
+}
+
+template<class T, class ByteT>
+inline std::conditional_t<std::is_const_v<ByteT>, const T*, T*>
+RoiRowPtr(const HwyLocalPixelsView<ByteT>& v, int y, const ROI& roi) noexcept
+{  // pointer to pixel (roi.xbegin, y) at channel roi.chbegin
+    return ChannelPtr<T>(v, roi.xbegin, y, roi.chbegin);
+}
+
+// -----------------------------------------------------------------------
+// Type Traits
+// -----------------------------------------------------------------------
+
+/// Determine the appropriate SIMD math type for a given result type.
+/// Promotes smaller types to float, keeps double as double.
+/// Note: uint32_t uses float (not double) for image processing performance.
+/// In OIIO, uint32 images are normalized to 0-1 range like uint8/uint16,
+/// so float precision (24-bit mantissa) is sufficient and much faster than double.
+template<typename T> struct SimdMathType {
+    using type = float;  // primary template: every type computes in float
+};
+template<> struct SimdMathType<double> {
+    using type = double;  // the only specialization: double stays double
+};
+
+// -----------------------------------------------------------------------
+// Load and Promote
+// -----------------------------------------------------------------------
+
+/// Load one full vector of source data and promote it to the SIMD math type.
+/// Integer sources narrower than 64 bits are normalized (unsigned to [0, 1],
+/// signed to [-1, 1]); uint64_t/int64_t are converted without scaling, and
+/// any unlisted SrcT yields an all-zero vector.
+/// @param d Highway descriptor tag defining the target SIMD type
+/// @param ptr Pointer to source data (may be unaligned; read with LoadU)
+/// @return SIMD vector with promoted values
+template<class D, typename SrcT>
+inline auto
+LoadPromote(D d, const SrcT* ptr)
+{
+    using MathT = typename D::T;
+
+    if constexpr (std::is_same_v<SrcT, MathT>) {
+        return hn::LoadU(d, ptr);
+    } else if constexpr (std::is_same_v<SrcT, half>) {
+        using T16 = hwy::float16_t;
+        auto d16  = hn::Rebind<T16, D>();
+        auto v16  = hn::LoadU(d16, reinterpret_cast<const T16*>(ptr));
+        return hn::PromoteTo(d, v16);
+    } else if constexpr (std::is_same_v<SrcT, uint8_t>) {
+        auto d_u8       = hn::Rebind<uint8_t, D>();
+        auto v_u8       = hn::LoadU(d_u8, ptr);
+        auto v_promoted = hn::ConvertTo(
+            d, hn::PromoteTo(hn::Rebind<int32_t, D>(),
+                             hn::PromoteTo(hn::Rebind<int16_t, D>(), v_u8)));
+        // Normalize to 0-1 range for image operations
+        return hn::Mul(v_promoted, hn::Set(d, (MathT)(1.0 / 255.0)));
+    } else if constexpr (std::is_same_v<SrcT, int8_t>) {
+        auto d_i8       = hn::Rebind<int8_t, D>();
+        auto v_i8       = hn::LoadU(d_i8, ptr);
+        auto v_promoted = hn::ConvertTo(
+            d, hn::PromoteTo(hn::Rebind<int32_t, D>(),
+                             hn::PromoteTo(hn::Rebind<int16_t, D>(), v_i8)));
+        // Normalize: map [-128, 127] to approximately [-1.0, 1.0]
+        // Clamp INT_MIN so we never produce values < -1.0.
+        auto v_norm = hn::Mul(v_promoted, hn::Set(d, (MathT)(1.0 / 127.0)));
+        return hn::Max(v_norm, hn::Set(d, (MathT)-1.0));
+    } else if constexpr (std::is_same_v<SrcT, uint16_t>) {
+        auto d_u16 = hn::Rebind<uint16_t, D>();
+        auto v_u16 = hn::LoadU(d_u16, ptr);
+        auto v_promoted
+            = hn::ConvertTo(d, hn::PromoteTo(hn::Rebind<int32_t, D>(), v_u16));
+        // Normalize to 0-1 range for image operations
+        return hn::Mul(v_promoted, hn::Set(d, (MathT)(1.0 / 65535.0)));
+    } else if constexpr (std::is_same_v<SrcT, int16_t>) {
+        auto d_i16 = hn::Rebind<int16_t, D>();
+        auto v_i16 = hn::LoadU(d_i16, ptr);
+        auto v_promoted
+            = hn::ConvertTo(d, hn::PromoteTo(hn::Rebind<int32_t, D>(), v_i16));
+        // Normalize: map [-32768, 32767] to approximately [-1.0, 1.0]
+        // Clamp INT_MIN so we never produce values < -1.0.
+        auto v_norm = hn::Mul(v_promoted, hn::Set(d, (MathT)(1.0 / 32767.0)));
+        return hn::Max(v_norm, hn::Set(d, (MathT)-1.0));
+    } else if constexpr (std::is_same_v<SrcT, uint32_t>) {
+        // uint32 to float: Load, convert, and normalize to 0-1 range
+        auto d_u32      = hn::Rebind<uint32_t, D>();
+        auto v_u32      = hn::LoadU(d_u32, ptr);
+        auto v_promoted = hn::ConvertTo(d, v_u32);
+        // Normalize to 0-1 range for image operations
+        return hn::Mul(v_promoted, hn::Set(d, (MathT)(1.0 / 4294967295.0)));
+    } else if constexpr (std::is_same_v<SrcT, int32_t>) {
+        // int32 to float: Load, convert, and normalize to approximately [-1.0, 1.0]
+        auto d_i32      = hn::Rebind<int32_t, D>();
+        auto v_i32      = hn::LoadU(d_i32, ptr);
+        auto v_promoted = hn::ConvertTo(d, v_i32);
+        // Normalize: map [-2147483648, 2147483647] to approximately [-1.0, 1.0]
+        // Clamp INT_MIN so we never produce values < -1.0.
+        auto v_norm = hn::Mul(v_promoted,
+                              hn::Set(d, (MathT)(1.0 / 2147483647.0)));
+        return hn::Max(v_norm, hn::Set(d, (MathT)-1.0));
+    } else if constexpr (std::is_same_v<SrcT, uint64_t>) {
+        // uint64 to float: Load and demote to uint32, then convert
+        // Note: no 0-1 normalization here; float is lossy above 24 bits
+        auto d_u64 = hn::Rebind<uint64_t, D>();
+        auto v_u64 = hn::LoadU(d_u64, ptr);
+        auto d_u32 = hn::Rebind<uint32_t, D>();
+        auto v_u32 = hn::DemoteTo(d_u32, v_u64);
+        return hn::ConvertTo(d, v_u32);
+    } else if constexpr (std::is_same_v<SrcT, int64_t>) {
+        // int64 to float: Load and demote to int32, then convert (unscaled)
+        auto d_i64 = hn::Rebind<int64_t, D>();
+        auto v_i64 = hn::LoadU(d_i64, ptr);
+        auto d_i32 = hn::Rebind<int32_t, D>();
+        auto v_i32 = hn::DemoteTo(d_i32, v_i64);
+        return hn::ConvertTo(d, v_i32);
+    } else {
+        return hn::Zero(d);  // unsupported SrcT
+    }
+}
+
+/// Partial-vector variant of LoadPromote with the same per-type scaling.
+/// @param d Highway descriptor tag defining the target SIMD type
+/// @param ptr Pointer to source data (may be unaligned)
+/// @param count Number of elements to load (must be <= lane count)
+/// @return Promoted vector; lanes >= count are presumably zero, since
+///         hn::LoadN is documented to zero-fill them (TODO confirm).
+template<class D, typename SrcT>
+inline auto
+LoadPromoteN(D d, const SrcT* ptr, size_t count)
+{
+    using MathT = typename D::T;
+
+    if constexpr (std::is_same_v<SrcT, MathT>) {
+        return hn::LoadN(d, ptr, count);
+    } else if constexpr (std::is_same_v<SrcT, half>) {
+        using T16 = hwy::float16_t;
+        auto d16  = hn::Rebind<T16, D>();
+        auto v16  = hn::LoadN(d16, reinterpret_cast<const T16*>(ptr), count);
+        return hn::PromoteTo(d, v16);
+    } else if constexpr (std::is_same_v<SrcT, uint8_t>) {
+        auto d_u8       = hn::Rebind<uint8_t, D>();
+        auto v_u8       = hn::LoadN(d_u8, ptr, count);
+        auto v_promoted = hn::ConvertTo(
+            d, hn::PromoteTo(hn::Rebind<int32_t, D>(),
+                             hn::PromoteTo(hn::Rebind<int16_t, D>(), v_u8)));
+        // Normalize to 0-1 range for image operations
+        return hn::Mul(v_promoted, hn::Set(d, (MathT)(1.0 / 255.0)));
+    } else if constexpr (std::is_same_v<SrcT, int8_t>) {
+        auto d_i8       = hn::Rebind<int8_t, D>();
+        auto v_i8       = hn::LoadN(d_i8, ptr, count);
+        auto v_promoted = hn::ConvertTo(
+            d, hn::PromoteTo(hn::Rebind<int32_t, D>(),
+                             hn::PromoteTo(hn::Rebind<int16_t, D>(), v_i8)));
+        // Normalize: map [-128, 127] to approximately [-1.0, 1.0]
+        // Clamp INT_MIN so we never produce values < -1.0.
+        auto v_norm = hn::Mul(v_promoted, hn::Set(d, (MathT)(1.0 / 127.0)));
+        return hn::Max(v_norm, hn::Set(d, (MathT)-1.0));
+    } else if constexpr (std::is_same_v<SrcT, uint16_t>) {
+        auto d_u16 = hn::Rebind<uint16_t, D>();
+        auto v_u16 = hn::LoadN(d_u16, ptr, count);
+        auto v_promoted
+            = hn::ConvertTo(d, hn::PromoteTo(hn::Rebind<int32_t, D>(), v_u16));
+        // Normalize to 0-1 range for image operations
+        return hn::Mul(v_promoted, hn::Set(d, (MathT)(1.0 / 65535.0)));
+    } else if constexpr (std::is_same_v<SrcT, int16_t>) {
+        auto d_i16 = hn::Rebind<int16_t, D>();
+        auto v_i16 = hn::LoadN(d_i16, ptr, count);
+        auto v_promoted
+            = hn::ConvertTo(d, hn::PromoteTo(hn::Rebind<int32_t, D>(), v_i16));
+        // Normalize: map [-32768, 32767] to approximately [-1.0, 1.0]
+        // Clamp INT_MIN so we never produce values < -1.0.
+        auto v_norm = hn::Mul(v_promoted, hn::Set(d, (MathT)(1.0 / 32767.0)));
+        return hn::Max(v_norm, hn::Set(d, (MathT)-1.0));
+    } else if constexpr (std::is_same_v<SrcT, uint32_t>) {
+        // uint32 to float: Load, convert, and normalize to 0-1 range
+        auto d_u32      = hn::Rebind<uint32_t, D>();
+        auto v_u32      = hn::LoadN(d_u32, ptr, count);
+        auto v_promoted = hn::ConvertTo(d, v_u32);
+        // Normalize to 0-1 range for image operations
+        return hn::Mul(v_promoted, hn::Set(d, (MathT)(1.0 / 4294967295.0)));
+    } else if constexpr (std::is_same_v<SrcT, int32_t>) {
+        // int32 to float: Load, convert, and normalize to approximately [-1.0, 1.0]
+        auto d_i32      = hn::Rebind<int32_t, D>();
+        auto v_i32      = hn::LoadN(d_i32, ptr, count);
+        auto v_promoted = hn::ConvertTo(d, v_i32);
+        // Normalize: map [-2147483648, 2147483647] to approximately [-1.0, 1.0]
+        // Clamp INT_MIN so we never produce values < -1.0.
+        auto v_norm = hn::Mul(v_promoted,
+                              hn::Set(d, (MathT)(1.0 / 2147483647.0)));
+        return hn::Max(v_norm, hn::Set(d, (MathT)-1.0));
+    } else if constexpr (std::is_same_v<SrcT, uint64_t>) {
+        // uint64 to float: demote to uint32, then convert (no 0-1 scaling)
+        auto d_u64 = hn::Rebind<uint64_t, D>();
+        auto v_u64 = hn::LoadN(d_u64, ptr, count);
+        auto d_u32 = hn::Rebind<uint32_t, D>();
+        auto v_u32 = hn::DemoteTo(d_u32, v_u64);
+        return hn::ConvertTo(d, v_u32);
+    } else if constexpr (std::is_same_v<SrcT, int64_t>) {
+        // int64 to float: Load and demote to int32, then convert (unscaled)
+        auto d_i64 = hn::Rebind<int64_t, D>();
+        auto v_i64 = hn::LoadN(d_i64, ptr, count);
+        auto d_i32 = hn::Rebind<int32_t, D>();
+        auto v_i32 = hn::DemoteTo(d_i32, v_i64);
+        return hn::ConvertTo(d, v_i32);
+    } else {
+        return hn::Zero(d);  // unsupported SrcT
+    }
+}
+
+// -----------------------------------------------------------------------
+// Demote and Store
+// -----------------------------------------------------------------------
+
+/// Demote SIMD values and store to destination type.
+/// Handles type conversions from SIMD computation type (float/double) back to
+/// various destination formats with proper rounding and clamping for integer types.
+/// @param d Highway descriptor tag for the source SIMD type
+/// @param ptr Pointer to destination data (may be unaligned; uses StoreU)
+/// @param v SIMD vector to demote and store
+template<class D, typename DstT, typename VecT>
+inline void
+DemoteStore(D d, DstT* ptr, VecT v)
+{
+    using MathT = typename D::T;
+    using VecD  = hn::Vec<D>;
+
+    if constexpr (std::is_same_v<DstT, MathT>) {
+        hn::StoreU(v, d, ptr);
+    } else if constexpr (std::is_same_v<DstT, half>) {
+        auto d16 = hn::Rebind<hwy::float16_t, D>();
+        auto v16 = hn::DemoteTo(d16, v);
+        hn::StoreU(v16, d16, reinterpret_cast<hwy::float16_t*>(ptr));
+    } else if constexpr (std::is_same_v<DstT, uint8_t>) {
+        VecD v_val = (VecD)v;
+        // Denormalize from 0-1 range to 0-255 range
+        VecD v_denorm  = hn::Mul(v_val, hn::Set(d, (MathT)255.0));
+        VecD v_rounded = hn::Add(v_denorm, hn::Set(d, (MathT)0.5));
+        VecD v_zero    = hn::Zero(d);
+        VecD v_max     = hn::Set(d, (MathT)255.0);
+        VecD v_clamped = hn::Max(v_rounded, v_zero);
+        v_clamped      = hn::Min(v_clamped, v_max);
+
+        auto d32   = hn::Rebind<int32_t, D>();
+        auto vi32  = hn::ConvertTo(d32, v_clamped);
+        auto d_i16 = hn::Rebind<int16_t, D>();
+        auto v_i16 = hn::DemoteTo(d_i16, vi32);
+        auto d_u8  = hn::Rebind<uint8_t, D>();
+        auto v_u8  = hn::DemoteTo(d_u8, v_i16);
+        hn::StoreU(v_u8, d_u8, ptr);
+    } else if constexpr (std::is_same_v<DstT, int8_t>) {
+        VecD v_val = (VecD)v;
+        // Denormalize from approximately [-1.0, 1.0] range to -128-127 range
+        VecD v_denorm = hn::Mul(v_val, hn::Set(d, (MathT)127.0));
+        // Symmetric round-to-nearest for signed values (assumes ConvertTo truncates).
+        auto is_neg    = hn::Lt(v_denorm, hn::Zero(d));
+        auto v_bias    = hn::IfThenElse(is_neg, hn::Set(d, (MathT)-0.5),
+                                        hn::Set(d, (MathT)0.5));
+        VecD v_rounded = hn::Add(v_denorm, v_bias);
+        VecD v_min     = hn::Set(d, (MathT)-128.0);
+        VecD v_max     = hn::Set(d, (MathT)127.0);
+        VecD v_clamped = hn::Max(v_rounded, v_min);
+        v_clamped      = hn::Min(v_clamped, v_max);
+
+        auto d32   = hn::Rebind<int32_t, D>();
+        auto vi32  = hn::ConvertTo(d32, v_clamped);
+        auto d_i16 = hn::Rebind<int16_t, D>();
+        auto v_i16 = hn::DemoteTo(d_i16, vi32);
+        auto d_i8  = hn::Rebind<int8_t, D>();
+        auto v_i8  = hn::DemoteTo(d_i8, v_i16);
+        hn::StoreU(v_i8, d_i8, ptr);
+    } else if constexpr (std::is_same_v<DstT, uint16_t>) {
+        VecD v_val = (VecD)v;
+        // Denormalize from 0-1 range to 0-65535 range
+        VecD v_denorm  = hn::Mul(v_val, hn::Set(d, (MathT)65535.0));
+        VecD v_rounded = hn::Add(v_denorm, hn::Set(d, (MathT)0.5));
+        VecD v_zero    = hn::Zero(d);
+        VecD v_max     = hn::Set(d, (MathT)65535.0);
+        VecD v_clamped = hn::Max(v_rounded, v_zero);
+        v_clamped      = hn::Min(v_clamped, v_max);
+
+        auto d32   = hn::Rebind<int32_t, D>();
+        auto vi32  = hn::ConvertTo(d32, v_clamped);
+        auto d_u16 = hn::Rebind<uint16_t, D>();
+        auto v_u16 = hn::DemoteTo(d_u16, vi32);
+        hn::StoreU(v_u16, d_u16, ptr);
+    } else if constexpr (std::is_same_v<DstT, int16_t>) {
+        VecD v_val = (VecD)v;
+        // Denormalize from approximately [-1.0, 1.0] range to -32768-32767 range
+        VecD v_denorm = hn::Mul(v_val, hn::Set(d, (MathT)32767.0));
+        // Symmetric round-to-nearest for signed values (assumes ConvertTo truncates).
+        auto is_neg    = hn::Lt(v_denorm, hn::Zero(d));
+        auto v_bias    = hn::IfThenElse(is_neg, hn::Set(d, (MathT)-0.5),
+                                        hn::Set(d, (MathT)0.5));
+        VecD v_rounded = hn::Add(v_denorm, v_bias);
+        VecD v_min     = hn::Set(d, (MathT)-32768.0);
+        VecD v_max     = hn::Set(d, (MathT)32767.0);
+        VecD v_clamped = hn::Max(v_rounded, v_min);
+        v_clamped      = hn::Min(v_clamped, v_max);
+
+        auto d32   = hn::Rebind<int32_t, D>();
+        auto vi32  = hn::ConvertTo(d32, v_clamped);
+        auto d_i16 = hn::Rebind<int16_t, D>();
+        auto v_i16 = hn::DemoteTo(d_i16, vi32);
+        hn::StoreU(v_i16, d_i16, ptr);
+    } else if constexpr (std::is_same_v<DstT, uint32_t>) {
+        // float -> uint32: Denormalize from 0-1 to 0-4294967295, round and convert
+        VecD v_val = (VecD)v;
+        // Denormalize from 0-1 range to 0-4294967295 range
+        VecD v_denorm  = hn::Mul(v_val, hn::Set(d, (MathT)4294967295.0));
+        VecD v_rounded = hn::Add(v_denorm, hn::Set(d, (MathT)0.5));
+        VecD v_zero    = hn::Zero(d);
+        VecD v_max     = hn::Set(d, (MathT)4294967295.0);
+        VecD v_clamped = hn::Max(v_rounded, v_zero);
+        v_clamped      = hn::Min(v_clamped, v_max);
+
+        auto d_u32 = hn::Rebind<uint32_t, D>();
+        auto v_u32 = hn::ConvertTo(d_u32, v_clamped);
+        hn::StoreU(v_u32, d_u32, ptr);
+    } else if constexpr (std::is_same_v<DstT, int32_t>) {
+        // float -> int32: Denormalize from approximately [-1.0, 1.0] to int32 range
+        VecD v_val = (VecD)v;
+        // Denormalize from approximately [-1.0, 1.0] range to -2147483648-2147483647 range
+        VecD v_denorm = hn::Mul(v_val, hn::Set(d, (MathT)2147483647.0));
+        // Symmetric round-to-nearest for signed values (assumes ConvertTo truncates).
+        auto is_neg    = hn::Lt(v_denorm, hn::Zero(d));
+        auto v_bias    = hn::IfThenElse(is_neg, hn::Set(d, (MathT)-0.5),
+                                        hn::Set(d, (MathT)0.5));
+        VecD v_rounded = hn::Add(v_denorm, v_bias);
+        VecD v_min     = hn::Set(d, (MathT)-2147483648.0);
+        VecD v_max     = hn::Set(d, (MathT)2147483647.0);
+        VecD v_clamped = hn::Max(v_rounded, v_min);
+        v_clamped      = hn::Min(v_clamped, v_max);
+
+        auto d_i32 = hn::Rebind<int32_t, D>();
+        auto v_i32 = hn::ConvertTo(d_i32, v_clamped);
+        hn::StoreU(v_i32, d_i32, ptr);
+    } else if constexpr (std::is_same_v<DstT, uint64_t>) {
+        // float -> uint64: Promote via uint32
+        // Note: Precision loss expected (float has only 24-bit mantissa)
+        VecD v_val     = (VecD)v;
+        VecD v_rounded = hn::Add(v_val, hn::Set(d, (MathT)0.5));
+        VecD v_zero    = hn::Zero(d);
+        VecD v_clamped = hn::Max(v_rounded, v_zero);
+
+        auto d_u32 = hn::Rebind<uint32_t, D>();
+        auto v_u32 = hn::ConvertTo(d_u32, v_clamped);
+        auto d_u64 = hn::Rebind<uint64_t, D>();
+        auto v_u64 = hn::PromoteTo(d_u64, v_u32);
+        hn::StoreU(v_u64, d_u64, ptr);
+    } else if constexpr (std::is_same_v<DstT, int64_t>) {
+        // float -> int64 via int32; round half away from zero like int8/16/32
+        VecD v_val     = (VecD)v;
+        VecD v_half    = hn::CopySign(hn::Set(d, (MathT)0.5), v_val);
+        VecD v_rounded = hn::Add(v_val, v_half);
+        auto d_i32 = hn::Rebind<int32_t, D>();
+        auto v_i32 = hn::ConvertTo(d_i32, v_rounded);
+        auto d_i64 = hn::Rebind<int64_t, D>();
+        auto v_i64 = hn::PromoteTo(d_i64, v_i32);
+        hn::StoreU(v_i64, d_i64, ptr);
+    }
+}
+
+/// Demote and store partial SIMD values to destination type.
+/// Same as DemoteStore but handles partial vectors (< full lane count).
+/// @param d Highway descriptor tag for the source SIMD type
+/// @param ptr Pointer to destination data (may be unaligned)
+/// @param v SIMD vector to demote and store
+/// @param count Number of elements to store (must be <= lane count)
+template<class D, typename DstT, typename VecT>
+inline void
+DemoteStoreN(D d, DstT* ptr, VecT v, size_t count)
+{
+    using MathT = typename D::T;
+    using VecD  = hn::Vec<D>;
+
+    if constexpr (std::is_same_v<DstT, MathT>) {
+        hn::StoreN(v, d, ptr, count);
+    } else if constexpr (std::is_same_v<DstT, half>) {
+        auto d16 = hn::Rebind<hwy::float16_t, D>();
+        auto v16 = hn::DemoteTo(d16, v);
+        hn::StoreN(v16, d16, reinterpret_cast<hwy::float16_t*>(ptr), count);
+    } else if constexpr (std::is_same_v<DstT, uint8_t>) {
+        VecD v_val = (VecD)v;
+        // Denormalize from 0-1 range to 0-255 range
+        VecD v_denorm  = hn::Mul(v_val, hn::Set(d, (MathT)255.0));
+        VecD v_rounded = hn::Add(v_denorm, hn::Set(d, (MathT)0.5));
+        VecD v_zero    = hn::Zero(d);
+        VecD v_max     = hn::Set(d, (MathT)255.0);
+        VecD v_clamped = hn::Max(v_rounded, v_zero);
+        v_clamped      = hn::Min(v_clamped, v_max);
+
+        auto d32   = hn::Rebind<int32_t, D>();
+        auto vi32  = hn::ConvertTo(d32, v_clamped);
+        auto d_i16 = hn::Rebind<int16_t, D>();
+        auto v_i16 = hn::DemoteTo(d_i16, vi32);
+        auto d_u8  = hn::Rebind<uint8_t, D>();
+        auto v_u8  = hn::DemoteTo(d_u8, v_i16);
+        hn::StoreN(v_u8, d_u8, ptr, count);
+    } else if constexpr (std::is_same_v<DstT, int8_t>) {
+        VecD v_val = (VecD)v;
+        // Denormalize from approximately [-1.0, 1.0] range to [-128, 127] range
+        VecD v_denorm = hn::Mul(v_val, hn::Set(d, (MathT)127.0));
+        // Symmetric round-to-nearest for signed values (assumes ConvertTo truncates).
+        auto is_neg    = hn::Lt(v_denorm, hn::Zero(d));
+        auto v_bias    = hn::IfThenElse(is_neg, hn::Set(d, (MathT)-0.5),
+                                        hn::Set(d, (MathT)0.5));
+        VecD v_rounded = hn::Add(v_denorm, v_bias);
+        VecD v_min     = hn::Set(d, (MathT)-128.0);
+        VecD v_max     = hn::Set(d, (MathT)127.0);
+        VecD v_clamped = hn::Max(v_rounded, v_min);
+        v_clamped      = hn::Min(v_clamped, v_max);
+
+        auto d32   = hn::Rebind<int32_t, D>();
+        auto vi32  = hn::ConvertTo(d32, v_clamped);
+        auto d_i16 = hn::Rebind<int16_t, D>();
+        auto v_i16 = hn::DemoteTo(d_i16, vi32);
+        auto d_i8  = hn::Rebind<int8_t, D>();
+        auto v_i8  = hn::DemoteTo(d_i8, v_i16);
+        hn::StoreN(v_i8, d_i8, ptr, count);
+    } else if constexpr (std::is_same_v<DstT, uint16_t>) {
+        VecD v_val = (VecD)v;
+        // Denormalize from 0-1 range to 0-65535 range
+        VecD v_denorm  = hn::Mul(v_val, hn::Set(d, (MathT)65535.0));
+        VecD v_rounded = hn::Add(v_denorm, hn::Set(d, (MathT)0.5));
+        VecD v_zero    = hn::Zero(d);
+        VecD v_max     = hn::Set(d, (MathT)65535.0);
+        VecD v_clamped = hn::Max(v_rounded, v_zero);
+        v_clamped      = hn::Min(v_clamped, v_max);
+
+        auto d32   = hn::Rebind<int32_t, D>();
+        auto vi32  = hn::ConvertTo(d32, v_clamped);
+        auto d_u16 = hn::Rebind<uint16_t, D>();
+        auto v_u16 = hn::DemoteTo(d_u16, vi32);
+        hn::StoreN(v_u16, d_u16, ptr, count);
+    } else if constexpr (std::is_same_v<DstT, int16_t>) {
+        VecD v_val = (VecD)v;
+        // Denormalize from approximately [-1.0, 1.0] range to [-32768, 32767] range
+        VecD v_denorm = hn::Mul(v_val, hn::Set(d, (MathT)32767.0));
+        // Symmetric round-to-nearest for signed values (assumes ConvertTo truncates).
+        auto is_neg    = hn::Lt(v_denorm, hn::Zero(d));
+        auto v_bias    = hn::IfThenElse(is_neg, hn::Set(d, (MathT)-0.5),
+                                        hn::Set(d, (MathT)0.5));
+        VecD v_rounded = hn::Add(v_denorm, v_bias);
+        VecD v_min     = hn::Set(d, (MathT)-32768.0);
+        VecD v_max     = hn::Set(d, (MathT)32767.0);
+        VecD v_clamped = hn::Max(v_rounded, v_min);
+        v_clamped      = hn::Min(v_clamped, v_max);
+
+        auto d32   = hn::Rebind<int32_t, D>();
+        auto vi32  = hn::ConvertTo(d32, v_clamped);
+        auto d_i16 = hn::Rebind<int16_t, D>();
+        auto v_i16 = hn::DemoteTo(d_i16, vi32);
+        hn::StoreN(v_i16, d_i16, ptr, count);
+    } else if constexpr (std::is_same_v<DstT, uint32_t>) {
+        // float -> uint32: Denormalize from 0-1 to 0-4294967295, round and convert
+        VecD v_val = (VecD)v;
+        // Denormalize from 0-1 range to 0-4294967295 range
+        VecD v_denorm  = hn::Mul(v_val, hn::Set(d, (MathT)4294967295.0));
+        VecD v_rounded = hn::Add(v_denorm, hn::Set(d, (MathT)0.5));
+        VecD v_zero    = hn::Zero(d);
+        VecD v_max     = hn::Set(d, (MathT)4294967295.0);
+        VecD v_clamped = hn::Max(v_rounded, v_zero);
+        v_clamped      = hn::Min(v_clamped, v_max);
+
+        auto d_u32 = hn::Rebind<uint32_t, D>();
+        auto v_u32 = hn::ConvertTo(d_u32, v_clamped);
+        hn::StoreN(v_u32, d_u32, ptr, count);
+    } else if constexpr (std::is_same_v<DstT, int32_t>) {
+        // float -> int32: Denormalize from approximately [-1.0, 1.0] range to [-2147483648, 2147483647] range
+        VecD v_val    = (VecD)v;
+        VecD v_denorm = hn::Mul(v_val, hn::Set(d, (MathT)2147483647.0));
+        // Symmetric round-to-nearest for signed values (assumes ConvertTo truncates).
+        auto is_neg    = hn::Lt(v_denorm, hn::Zero(d));
+        auto v_bias    = hn::IfThenElse(is_neg, hn::Set(d, (MathT)-0.5),
+                                        hn::Set(d, (MathT)0.5));
+        VecD v_rounded = hn::Add(v_denorm, v_bias);
+        VecD v_min     = hn::Set(d, (MathT)-2147483648.0);
+        VecD v_max     = hn::Set(d, (MathT)2147483647.0);
+        VecD v_clamped = hn::Max(v_rounded, v_min);
+        v_clamped      = hn::Min(v_clamped, v_max);
+
+        auto d_i32 = hn::Rebind<int32_t, D>();
+        auto v_i32 = hn::ConvertTo(d_i32, v_clamped);
+        hn::StoreN(v_i32, d_i32, ptr, count);
+    } else if constexpr (std::is_same_v<DstT, uint64_t>) {
+        // float -> uint64: Promote via uint32
+        VecD v_val     = (VecD)v;
+        VecD v_rounded = hn::Add(v_val, hn::Set(d, (MathT)0.5));
+        VecD v_zero    = hn::Zero(d);
+        VecD v_clamped = hn::Max(v_rounded, v_zero);
+
+        auto d_u32 = hn::Rebind<uint32_t, D>();
+        auto v_u32 = hn::ConvertTo(d_u32, v_clamped);
+        auto d_u64 = hn::Rebind<uint64_t, D>();
+        auto v_u64 = hn::PromoteTo(d_u64, v_u32);
+        hn::StoreN(v_u64, d_u64, ptr, count);
+    } else if constexpr (std::is_same_v<DstT, int64_t>) {
+        // float -> int64 via int32; round half away from zero like int8/16/32
+        VecD v_val     = (VecD)v;
+        VecD v_half    = hn::CopySign(hn::Set(d, (MathT)0.5), v_val);
+        VecD v_rounded = hn::Add(v_val, v_half);
+        auto d_i32 = hn::Rebind<int32_t, D>();
+        auto v_i32 = hn::ConvertTo(d_i32, v_rounded);
+        auto d_i64 = hn::Rebind<int64_t, D>();
+        auto v_i64 = hn::PromoteTo(d_i64, v_i32);
+        hn::StoreN(v_i64, d_i64, ptr, count);
+    }
+}
+
+// -----------------------------------------------------------------------
+// Native Integer Kernel Runners (No Type Conversion)
+// -----------------------------------------------------------------------
+
+/// Apply a unary SIMD operation directly to native integer data, with no
+/// promotion to floating point. Intended for scale-invariant operations such
+/// as abs, where int_op(a) == denorm(float_op(norm(a))).
+/// @param r Destination array (same element type as the source)
+/// @param a Source array
+/// @param n Number of elements to process
+/// @param op Functor taking (descriptor, vector) and returning the result vector
+///           Example: [](auto d, auto va) { return hn::Abs(va); }
+template<typename T, typename OpFunc>
+inline void
+RunHwyUnaryNativeInt(T* r, const T* a, size_t n, OpFunc op)
+{
+    const hn::ScalableTag<T> d;
+    const size_t lanes = hn::Lanes(d);
+    size_t x           = 0;
+    // Nothing guarantees that r or a is aligned to the vector size, so use
+    // the unaligned LoadU/StoreU; aligned Load/Store may fault otherwise.
+    for (; x + lanes <= n; x += lanes) {
+        auto va = hn::LoadU(d, a + x);
+        hn::StoreU(op(d, va), d, r + x);
+    }
+    const size_t remaining = n - x;
+    if (remaining > 0) {
+        auto va = hn::LoadN(d, a + x, remaining);
+        hn::StoreN(op(d, va), d, r + x, remaining);
+    }
+}
+
+/// Apply a binary SIMD operation directly to native integer data, with no
+/// promotion to floating point. Intended for scale-invariant operations such
+/// as saturated add, min and max, where
+/// int_op(a, b) == denorm(float_op(norm(a), norm(b))).
+/// @param r Destination array (same element type as the sources)
+/// @param a First source array
+/// @param b Second source array
+/// @param n Number of elements to process
+/// @param op Functor taking (descriptor, vector_a, vector_b) and returning result
+///           Example: [](auto d, auto va, auto vb) { return hn::SaturatedAdd(va, vb); }
+template<typename T, typename OpFunc>
+inline void
+RunHwyBinaryNativeInt(T* r, const T* a, const T* b, size_t n, OpFunc op)
+{
+    const hn::ScalableTag<T> d;
+    const size_t lanes = hn::Lanes(d);
+    size_t x           = 0;
+    // Nothing guarantees that r, a or b is aligned to the vector size, so use
+    // the unaligned LoadU/StoreU; aligned Load/Store may fault otherwise.
+    for (; x + lanes <= n; x += lanes) {
+        auto va = hn::LoadU(d, a + x);
+        auto vb = hn::LoadU(d, b + x);
+        hn::StoreU(op(d, va, vb), d, r + x);
+    }
+    const size_t remaining = n - x;
+    if (remaining > 0) {
+        auto va = hn::LoadN(d, a + x, remaining);
+        auto vb = hn::LoadN(d, b + x, remaining);
+        hn::StoreN(op(d, va, vb), d, r + x, remaining);
+    }
+}
+
+// -----------------------------------------------------------------------
+// Generic Kernel Runners (With Type Conversion)
+// -----------------------------------------------------------------------
+
+/// Run a unary SIMD operation over an array, converting between the storage
+/// types and the SIMD math type chosen by SimdMathType<Rtype>.
+/// Whole vectors use LoadPromote/DemoteStore; a final partial vector, if any,
+/// uses the counted LoadPromoteN/DemoteStoreN variants.
+/// @param r Destination array
+/// @param a Source array
+/// @param n Number of elements to process
+/// @param op Functor taking (descriptor, vector) and returning the result vector
+template<typename Rtype, typename Atype, typename OpFunc>
+inline void
+RunHwyUnaryCmd(Rtype* r, const Atype* a, size_t n, OpFunc op)
+{
+    using MathT = typename SimdMathType<Rtype>::type;
+    const hn::ScalableTag<MathT> d;
+    const size_t step = hn::Lanes(d);
+    const size_t full = n - n % step;  // elements covered by whole vectors
+    for (size_t pos = 0; pos < full; pos += step) {
+        auto src = LoadPromote(d, a + pos);
+        auto out = op(d, src);
+        DemoteStore(d, r + pos, out);
+    }
+    // Leftover elements that do not fill a whole vector.
+    if (const size_t tail = n - full; tail != 0) {
+        auto src = LoadPromoteN(d, a + full, tail);
+        auto out = op(d, src);
+        DemoteStoreN(d, r + full, out, tail);
+    }
+}
+
+/// Run a binary SIMD operation over two arrays, converting between the
+/// storage types and the SIMD math type chosen by SimdMathType<Rtype>.
+/// Whole vectors use LoadPromote/DemoteStore; a final partial vector, if any,
+/// uses the counted LoadPromoteN/DemoteStoreN variants.
+/// @param r Destination array
+/// @param a First source array
+/// @param b Second source array
+/// @param n Number of elements to process
+/// @param op Functor taking (descriptor, vector_a, vector_b) and returning result
+template<typename Rtype, typename Atype, typename Btype, typename OpFunc>
+inline void
+RunHwyCmd(Rtype* r, const Atype* a, const Btype* b, size_t n, OpFunc op)
+{
+    using MathT = typename SimdMathType<Rtype>::type;
+    const hn::ScalableTag<MathT> d;
+    const size_t step = hn::Lanes(d);
+    const size_t full = n - n % step;  // elements covered by whole vectors
+    for (size_t pos = 0; pos < full; pos += step) {
+        auto lhs = LoadPromote(d, a + pos);
+        auto rhs = LoadPromote(d, b + pos);
+        auto out = op(d, lhs, rhs);
+        DemoteStore(d, r + pos, out);
+    }
+    // Leftover elements that do not fill a whole vector.
+    if (const size_t tail = n - full; tail != 0) {
+        auto lhs = LoadPromoteN(d, a + full, tail);
+        auto rhs = LoadPromoteN(d, b + full, tail);
+        auto out = op(d, lhs, rhs);
+        DemoteStoreN(d, r + full, out, tail);
+    }
+}
+
+/// Run a ternary SIMD operation over three arrays, converting between the
+/// storage types and the SIMD math type chosen by SimdMathType<Rtype>.
+/// Whole vectors use LoadPromote/DemoteStore; a final partial vector, if any,
+/// uses the counted LoadPromoteN/DemoteStoreN variants.
+/// @param r Destination array
+/// @param a First source array
+/// @param b Second source array
+/// @param c Third source array
+/// @param n Number of elements to process
+/// @param op Functor taking (descriptor, vector_a, vector_b, vector_c) and returning result
+template<typename Rtype, typename ABCtype, typename OpFunc>
+inline void
+RunHwyTernaryCmd(Rtype* r, const ABCtype* a, const ABCtype* b, const ABCtype* c,
+                 size_t n, OpFunc op)
+{
+    using MathT = typename SimdMathType<Rtype>::type;
+    const hn::ScalableTag<MathT> d;
+    const size_t step = hn::Lanes(d);
+    const size_t full = n - n % step;  // elements covered by whole vectors
+    for (size_t pos = 0; pos < full; pos += step) {
+        auto in0 = LoadPromote(d, a + pos);
+        auto in1 = LoadPromote(d, b + pos);
+        auto in2 = LoadPromote(d, c + pos);
+        auto out = op(d, in0, in1, in2);
+        DemoteStore(d, r + pos, out);
+    }
+    // Leftover elements that do not fill a whole vector.
+    if (const size_t tail = n - full; tail != 0) {
+        auto in0 = LoadPromoteN(d, a + full, tail);
+        auto in1 = LoadPromoteN(d, b + full, tail);
+        auto in2 = LoadPromoteN(d, c + full, tail);
+        auto out = op(d, in0, in1, in2);
+        DemoteStoreN(d, r + full, out, tail);
+    }
+}
+
+// -----------------------------------------------------------------------
+// Interleaved Channel Load/Store Helpers
+// -----------------------------------------------------------------------
+
+/// Load Lanes(d) interleaved RGBA pixels and return one vector per channel in
+/// the math type of `d`: natively for same-type data, via hwy::float16_t for
+/// half, and through scalar deinterleaving plus LoadPromote for anything else.
+/// @param d Highway descriptor tag for the result vectors
+/// @param ptr Pointer to interleaved RGBA data (R0,G0,B0,A0,R1,G1,B1,A1,...)
+/// @return Tuple of (R, G, B, A) SIMD vectors
+template<class D, typename SrcT>
+inline auto
+LoadInterleaved4Promote(D d, const SrcT* ptr)
+{
+    using MathT = typename D::T;
+    using Vec   = hn::Vec<D>;
+
+    if constexpr (std::is_same_v<SrcT, MathT>) {
+        // Source already has the math type: deinterleave natively.
+        Vec red, green, blue, alpha;
+        hn::LoadInterleaved4(d, ptr, red, green, blue, alpha);
+        return std::make_tuple(red, green, blue, alpha);
+    } else if constexpr (std::is_same_v<SrcT, half>) {
+        // half data is read through hwy::float16_t by reinterpreting the
+        // pointer, then widened to the math type.
+        using T16 = hwy::float16_t;
+        using D16 = hn::Rebind<T16, D>;
+        const D16 d16;
+        const T16* src16 = reinterpret_cast<const T16*>(ptr);
+
+        hn::Vec<D16> red16, green16, blue16, alpha16;
+        hn::LoadInterleaved4(d16, src16, red16, green16, blue16, alpha16);
+
+        Vec red   = hn::PromoteTo(d, red16);
+        Vec green = hn::PromoteTo(d, green16);
+        Vec blue  = hn::PromoteTo(d, blue16);
+        Vec alpha = hn::PromoteTo(d, alpha16);
+        return std::make_tuple(red, green, blue, alpha);
+    } else {
+        // Any other source type: gather each channel into its own
+        // contiguous scratch array, then convert channel by channel.
+        const size_t lanes = hn::Lanes(d);
+        SrcT red[hn::MaxLanes(d)];
+        SrcT green[hn::MaxLanes(d)];
+        SrcT blue[hn::MaxLanes(d)];
+        SrcT alpha[hn::MaxLanes(d)];
+
+        const SrcT* px = ptr;
+        for (size_t lane = 0; lane < lanes; ++lane, px += 4) {
+            red[lane]   = px[0];
+            green[lane] = px[1];
+            blue[lane]  = px[2];
+            alpha[lane] = px[3];
+        }
+
+        // LoadPromote converts (and, for integer types, normalizes) a channel.
+        auto red_vec   = LoadPromote(d, red);
+        auto green_vec = LoadPromote(d, green);
+        auto blue_vec  = LoadPromote(d, blue);
+        auto alpha_vec = LoadPromote(d, alpha);
+        return std::make_tuple(red_vec, green_vec, blue_vec, alpha_vec);
+    }
+}
+
+/// Store four channel vectors as Lanes(d) interleaved RGBA pixels of type DstT:
+/// natively for same-type data, via hwy::float16_t for half, and through
+/// DemoteStoreN plus scalar interleaving for anything else.
+/// @param d Highway descriptor tag of the source SIMD vectors
+/// @param ptr Pointer to destination interleaved RGBA data
+/// @param r Red channel SIMD vector
+/// @param g Green channel SIMD vector
+/// @param b Blue channel SIMD vector
+/// @param a Alpha channel SIMD vector
+template<class D, typename DstT, typename VecT>
+inline void
+StoreInterleaved4Demote(D d, DstT* ptr, VecT r, VecT g, VecT b, VecT a)
+{
+    using MathT = typename D::T;
+
+    if constexpr (std::is_same_v<DstT, MathT>) {
+        // Destination already has the math type: interleave natively.
+        hn::StoreInterleaved4(r, g, b, a, d, ptr);
+    } else if constexpr (std::is_same_v<DstT, half>) {
+        // half data is written through hwy::float16_t by reinterpreting
+        // the pointer, after narrowing each channel.
+        using T16 = hwy::float16_t;
+        const hn::Rebind<T16, D> d16;
+        T16* dst16 = reinterpret_cast<T16*>(ptr);
+
+        auto red16   = hn::DemoteTo(d16, r);
+        auto green16 = hn::DemoteTo(d16, g);
+        auto blue16  = hn::DemoteTo(d16, b);
+        auto alpha16 = hn::DemoteTo(d16, a);
+
+        hn::StoreInterleaved4(red16, green16, blue16, alpha16, d16, dst16);
+    } else {
+        // Any other destination type: convert each channel into its own
+        // contiguous scratch array, then interleave in scalar code.
+        const size_t lanes = hn::Lanes(d);
+        DstT red[hn::MaxLanes(d)];
+        DstT green[hn::MaxLanes(d)];
+        DstT blue[hn::MaxLanes(d)];
+        DstT alpha[hn::MaxLanes(d)];
+
+        // DemoteStoreN converts (and, for integer types, denormalizes)
+        // a channel.
+        DemoteStoreN(d, red, r, lanes);
+        DemoteStoreN(d, green, g, lanes);
+        DemoteStoreN(d, blue, b, lanes);
+        DemoteStoreN(d, alpha, a, lanes);
+
+        // Interleave the converted channels.
+        DstT* px = ptr;
+        for (size_t lane = 0; lane < lanes; ++lane, px += 4) {
+            px[0] = red[lane];
+            px[1] = green[lane];
+            px[2] = blue[lane];
+            px[3] = alpha[lane];
+        }
+    }
+}
+
+// -----------------------------------------------------------------------
+// Rangecompress/Rangeexpand SIMD Kernels
+// -----------------------------------------------------------------------
+
+/// Range-compress a SIMD vector (formula courtesy Sony Pictures Imageworks):
+///   |x| <= 0.18  ->  x
+///   otherwise    ->  copysign(a + b * log(c * |x| + 1), x)
+/// with a = -0.545768857, b = 0.183516696, c = 284.357788.
+/// Both candidates are computed for all lanes and selected with a mask.
+/// @param d Highway descriptor tag
+/// @param x Input SIMD vector
+/// @return Compressed SIMD vector
+template<class D, typename VecT>
+inline auto
+rangecompress_simd(D d, VecT x)
+{
+    using T = typename D::T;
+
+    // Curve parameters: inputs with |x| <= knee pass through unchanged; above
+    // it the result is offset + scale * log(gain * |x| + 1).
+    constexpr T knee   = static_cast<T>(0.18);
+    constexpr T offset = static_cast<T>(-0.54576885700225830078);
+    constexpr T scale  = static_cast<T>(0.18351669609546661377);
+    constexpr T gain   = static_cast<T>(284.3577880859375);
+
+    const auto magnitude = hn::Abs(x);
+    const auto in_linear = hn::Le(magnitude, hn::Set(d, knee));
+
+    // Logarithmic segment, evaluated for every lane and selected below.
+    const auto log_arg = hn::MulAdd(hn::Set(d, gain), magnitude,
+                                    hn::Set(d, static_cast<T>(1.0)));
+    const auto log_val = hn::Log(d, log_arg);
+    const auto curved  = hn::MulAdd(hn::Set(d, scale), log_val,
+                                    hn::Set(d, offset));
+
+    // Give the curved value the sign of the input.
+    const auto signed_curved = hn::CopySign(curved, x);
+
+    // Lanes in the linear region keep x itself; all others take the curved
+    // value.
+    return hn::IfThenElse(in_linear, x, signed_curved);
+}
+
+/// Range-expand a SIMD vector (inverse of rangecompress_simd):
+///   |y| <= 0.18  ->  y
+///   otherwise    ->  e = exp((|y| - a) / b);  x = (e - 1) / c
+///                    if x < 0.18 then x = (-e - 1) / c
+///                    result is copysign(x, y)
+/// Both candidates are computed for all lanes and selected with masks.
+/// @param d Highway descriptor tag
+/// @param y Input SIMD vector (compressed values)
+/// @return Expanded SIMD vector
+template<class D, typename VecT>
+inline auto
+rangeexpand_simd(D d, VecT y)
+{
+    using T = typename D::T;
+
+    // Curve parameters (identical to those of rangecompress_simd).
+    constexpr T knee   = static_cast<T>(0.18);
+    constexpr T offset = static_cast<T>(-0.54576885700225830078);
+    constexpr T scale  = static_cast<T>(0.18351669609546661377);
+    constexpr T gain   = static_cast<T>(284.3577880859375);
+
+    const auto v_knee = hn::Set(d, knee);
+    const auto v_gain = hn::Set(d, gain);
+    const auto v_one  = hn::Set(d, static_cast<T>(1.0));
+
+    const auto magnitude = hn::Abs(y);
+    const auto in_linear = hn::Le(magnitude, v_knee);
+
+    // e = exp((|y| - offset) / scale)
+    const auto exponent = hn::Div(hn::Sub(magnitude, hn::Set(d, offset)),
+                                  hn::Set(d, scale));
+    const auto e        = hn::Exp(d, exponent);
+
+    // Primary root (e - 1) / gain; lanes where it falls below the knee take
+    // the other root (-e - 1) / gain instead.
+    const auto primary   = hn::Div(hn::Sub(e, v_one), v_gain);
+    const auto secondary = hn::Div(hn::Sub(hn::Neg(e), v_one), v_gain);
+    const auto use_other = hn::Lt(primary, v_knee);
+    const auto expanded  = hn::IfThenElse(use_other, secondary, primary);
+
+    // Give the result the sign of the input; linear lanes return y untouched.
+    const auto signed_expanded = hn::CopySign(expanded, y);
+
+    return hn::IfThenElse(in_linear, y, signed_expanded);
+}
+
+OIIO_NAMESPACE_END
diff --git a/src/libOpenImageIO/imagebufalgo_mad.cpp b/src/libOpenImageIO/imagebufalgo_mad.cpp
index 5707fcd6ac..d8038a74a7 100644
--- a/src/libOpenImageIO/imagebufalgo_mad.cpp
+++ b/src/libOpenImageIO/imagebufalgo_mad.cpp
@@ -6,12 +6,19 @@
 #include <iostream>
 #include <limits>
 
+#if defined(_WIN32)
+#    include <malloc.h>  // for alloca
+#endif
+
 #include <OpenImageIO/dassert.h>
 #include <OpenImageIO/half.h>
 #include <OpenImageIO/imagebuf.h>
 #include <OpenImageIO/imagebufalgo.h>
 #include <OpenImageIO/imagebufalgo_util.h>
 
+#if defined(OIIO_USE_HWY) && OIIO_USE_HWY
+#    include "imagebufalgo_hwy_pvt.h"
+#endif
 #include "imageio_pvt.h"
 
 
@@ -21,66 +28,91 @@ OIIO_NAMESPACE_3_1_BEGIN
 
 template<class Rtype, class ABCtype>
 static bool
-mad_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, const ImageBuf& C,
-         ROI roi, int nthreads)
+mad_impl_scalar(ImageBuf& R, const ImageBuf& A, const ImageBuf& B,
+                const ImageBuf& C, ROI roi, int nthreads)
+{
+    // Iterator-based reference path: r = a * b + c for every pixel/channel.
+    ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI subroi) {
+        ImageBuf::Iterator<Rtype> dst(R, subroi);
+        ImageBuf::ConstIterator<ABCtype> src_a(A, subroi);
+        ImageBuf::ConstIterator<ABCtype> src_b(B, subroi);
+        ImageBuf::ConstIterator<ABCtype> src_c(C, subroi);
+        for (; !dst.done(); ++dst, ++src_a, ++src_b, ++src_c)
+            for (int ch = subroi.chbegin; ch < subroi.chend; ++ch)
+                dst[ch] = src_a[ch] * src_b[ch] + src_c[ch];
+    });
+    return true;
+}
+
+
+
+#if defined(OIIO_USE_HWY) && OIIO_USE_HWY
+/// Raw-pointer Highway implementation of R = A * B + C over `roi`.
+template<class Rtype, class ABCtype>
+static bool
+mad_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B,
+             const ImageBuf& C, ROI roi, int nthreads)
 {
+    // No z loop or bounds handling below: leave those ROIs to the iterators.
+    if (roi.depth() > 1 || !A.contains_roi(roi) || !B.contains_roi(roi)
+        || !C.contains_roi(roi))
+        return mad_impl_scalar<Rtype, ABCtype>(R, A, B, C, roi, nthreads);
+    auto Rv = HwyPixels(R);
+    auto Av = HwyPixels(A);
+    auto Bv = HwyPixels(B);
+    auto Cv = HwyPixels(C);
     ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) {
-        if ((std::is_same<Rtype, float>::value
-             || std::is_same<Rtype, half>::value)
-            && (std::is_same<ABCtype, float>::value
-                || std::is_same<ABCtype, half>::value)
-            // && R.localpixels() // has to be, because it's writable
-            && A.localpixels() && B.localpixels()
-            && C.localpixels()
-            // && R.contains_roi(roi)  // has to be, because IBAPrep
-            && A.contains_roi(roi) && B.contains_roi(roi) && C.contains_roi(roi)
-            && roi.chbegin == 0 && roi.chend == R.nchannels()
-            && roi.chend == A.nchannels() && roi.chend == B.nchannels()
-            && roi.chend == C.nchannels()) {
-            // Special case when all inputs are either float or half, with in-
-            // memory contiguous data and we're operating on the full channel
-            // range: skip iterators: For these circumstances, we can operate on
-            // the raw memory very efficiently. Otherwise, we will need the
-            // magic of the the Iterators (and pay the price).
-            int nxvalues = roi.width() * R.nchannels();
-            for (int z = roi.zbegin; z < roi.zend; ++z)
-                for (int y = roi.ybegin; y < roi.yend; ++y) {
-                    Rtype* rraw = (Rtype*)R.pixeladdr(roi.xbegin, y, z);
-                    const ABCtype* araw
-                        = (const ABCtype*)A.pixeladdr(roi.xbegin, y, z);
-                    const ABCtype* braw
-                        = (const ABCtype*)B.pixeladdr(roi.xbegin, y, z);
-                    const ABCtype* craw
-                        = (const ABCtype*)C.pixeladdr(roi.xbegin, y, z);
-                    OIIO_DASSERT(araw && braw && craw);
-                    // The straightforward loop auto-vectorizes very well,
-                    // there's no benefit to using explicit SIMD here.
-                    for (int x = 0; x < nxvalues; ++x)
-                        rraw[x] = araw[x] * braw[x] + craw[x];
-                    // But if you did want to explicitly vectorize, this is
-                    // how it would look:
-                    // int simdend = nxvalues & (~3); // how many float4's?
-                    // for (int x = 0; x < simdend; x += 4) {
-                    //     simd::float4 a_simd(araw+x), b_simd(braw+x), c_simd(craw+x);
-                    //     simd::float4 r_simd = a_simd * b_simd + c_simd;
-                    //     r_simd.store (rraw+x);
-                    // }
-                    // for (int x = simdend; x < nxvalues; ++x)
-                    //     rraw[x] = araw[x] * braw[x] + craw[x];
+        const int nchannels = RoiNChannels(roi);
+        const bool contig   = ChannelsContiguous<Rtype>(Rv, nchannels)
+                            && ChannelsContiguous<ABCtype>(Av, nchannels)
+                            && ChannelsContiguous<ABCtype>(Bv, nchannels)
+                            && ChannelsContiguous<ABCtype>(Cv, nchannels);
+        for (int y = roi.ybegin; y < roi.yend; ++y) {
+            Rtype* r_row         = RoiRowPtr<Rtype>(Rv, y, roi);
+            const ABCtype* a_row = RoiRowPtr<ABCtype>(Av, y, roi);
+            const ABCtype* b_row = RoiRowPtr<ABCtype>(Bv, y, roi);
+            const ABCtype* c_row = RoiRowPtr<ABCtype>(Cv, y, roi);
+            if (contig) {
+                const size_t n = size_t(roi.width()) * size_t(nchannels);
+                RunHwyTernaryCmd<Rtype, ABCtype>(r_row, a_row, b_row, c_row, n,
+                                                 [](auto d, auto a, auto b,
+                                                    auto c) {
+                                                     return hn::MulAdd(a, b, c);
+                                                 });
+            } else {
+                // convert_type normalizes integer pixels and rounds/clamps on
+                // store, as LoadPromote/DemoteStore do; a plain cast does not.
+                const int c0 = roi.chbegin;
+                for (int x = roi.xbegin; x < roi.xend; ++x) {
+                    Rtype* r_ptr = ChannelPtr<Rtype>(Rv, x, y, c0);
+                    const ABCtype* a_ptr = ChannelPtr<ABCtype>(Av, x, y, c0);
+                    const ABCtype* b_ptr = ChannelPtr<ABCtype>(Bv, x, y, c0);
+                    const ABCtype* c_ptr = ChannelPtr<ABCtype>(Cv, x, y, c0);
+                    for (int ch = 0; ch < nchannels; ++ch)
+                        r_ptr[ch] = convert_type<float, Rtype>(
+                            convert_type<ABCtype, float>(a_ptr[ch])
+                                * convert_type<ABCtype, float>(b_ptr[ch])
+                            + convert_type<ABCtype, float>(c_ptr[ch]));
                 }
-        } else {
-            ImageBuf::Iterator<Rtype> r(R, roi);
-            ImageBuf::ConstIterator<ABCtype> a(A, roi);
-            ImageBuf::ConstIterator<ABCtype> b(B, roi);
-            ImageBuf::ConstIterator<ABCtype> c(C, roi);
-            for (; !r.done(); ++r, ++a, ++b, ++c) {
-                for (int ch = roi.chbegin; ch < roi.chend; ++ch)
-                    r[ch] = a[ch] * b[ch] + c[ch];
             }
         }
     });
     return true;
 }
+#endif  // defined(OIIO_USE_HWY) && OIIO_USE_HWY
+
+template<class Rtype, class ABCtype>
+static bool
+mad_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, const ImageBuf& C,
+         ROI roi, int nthreads)
+{
+#if defined(OIIO_USE_HWY) && OIIO_USE_HWY
+    if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels()
+        && B.localpixels() && C.localpixels())  // hwy path needs raw pointers
+        return mad_impl_hwy<Rtype, ABCtype>(R, A, B, C, roi, nthreads);
+#endif  // defined(OIIO_USE_HWY) && OIIO_USE_HWY
+    return mad_impl_scalar<Rtype, ABCtype>(R, A, B, C, roi, nthreads);
+}
 
 
diff --git a/src/libOpenImageIO/imagebufalgo_muldiv.cpp b/src/libOpenImageIO/imagebufalgo_muldiv.cpp
index 4fa1a6cba0..45c166907a 100644
--- a/src/libOpenImageIO/imagebufalgo_muldiv.cpp
+++ b/src/libOpenImageIO/imagebufalgo_muldiv.cpp
@@ -10,8 +10,15 @@
 #include <iostream>
 #include <limits>
 
+#if defined(_WIN32)
+#    include <malloc.h>  // for alloca
+#endif
+
 #include <OpenImageIO/half.h>
 
+#if defined(OIIO_USE_HWY) && OIIO_USE_HWY
+#    include "imagebufalgo_hwy_pvt.h"
+#endif
 #include <OpenImageIO/dassert.h>
 #include <OpenImageIO/deepdata.h>
 #include <OpenImageIO/imagebuf.h>
@@ -86,8 +93,8 @@ ImageBufAlgo::scale(const ImageBuf& A, const ImageBuf& B, KWArgs options,
 
 template<class Rtype, class Atype, class Btype>
 static bool
-mul_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
-         int nthreads)
+mul_impl_scalar(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
+                int nthreads)
 {
     ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) {
         ImageBuf::Iterator<Rtype> r(R, roi);
@@ -104,7 +111,8 @@ mul_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
 
 template<class Rtype, class Atype>
 static bool
-mul_impl(ImageBuf& R, const ImageBuf& A, cspan<float> b, ROI roi, int nthreads)
+mul_impl_scalar(ImageBuf& R, const ImageBuf& A, cspan<float> b, ROI roi,
+                int nthreads)
 {
     ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) {
         ImageBuf::ConstIterator<Atype> a(A, roi);
@@ -117,6 +125,106 @@ mul_impl(ImageBuf& R, const ImageBuf& A, cspan<float> b, ROI roi, int nthreads)
 
 
+#if defined(OIIO_USE_HWY) && OIIO_USE_HWY
+template<class Rtype, class Atype, class Btype>
+static bool
+mul_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
+             int nthreads)
+{
+    // No z loop or bounds handling below: leave those ROIs to the iterators.
+    if (roi.depth() > 1 || !A.contains_roi(roi) || !B.contains_roi(roi))
+        return mul_impl_scalar<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
+    auto Rv = HwyPixels(R);
+    auto Av = HwyPixels(A);
+    auto Bv = HwyPixels(B);
+    ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) {
+        const int nchannels = RoiNChannels(roi);
+        const bool contig   = ChannelsContiguous<Rtype>(Rv, nchannels)
+                            && ChannelsContiguous<Atype>(Av, nchannels)
+                            && ChannelsContiguous<Btype>(Bv, nchannels);
+        for (int y = roi.ybegin; y < roi.yend; ++y) {
+            Rtype* r_row       = RoiRowPtr<Rtype>(Rv, y, roi);
+            const Atype* a_row = RoiRowPtr<Atype>(Av, y, roi);
+            const Btype* b_row = RoiRowPtr<Btype>(Bv, y, roi);
+            if (contig) {
+                const size_t n = size_t(roi.width()) * size_t(nchannels);
+                RunHwyCmd<Rtype, Atype, Btype>(r_row, a_row, b_row, n,
+                                               [](auto d, auto a, auto b) {
+                                                   return hn::Mul(a, b);
+                                               });
+            } else {
+                // convert_type normalizes integer pixels and rounds/clamps on
+                // store, as LoadPromote/DemoteStore do; a plain cast does not.
+                const int c0 = roi.chbegin;
+                for (int x = roi.xbegin; x < roi.xend; ++x) {
+                    Rtype* r_ptr       = ChannelPtr<Rtype>(Rv, x, y, c0);
+                    const Atype* a_ptr = ChannelPtr<Atype>(Av, x, y, c0);
+                    const Btype* b_ptr = ChannelPtr<Btype>(Bv, x, y, c0);
+                    for (int c = 0; c < nchannels; ++c)
+                        r_ptr[c] = convert_type<float, Rtype>(
+                            convert_type<Atype, float>(a_ptr[c])
+                            * convert_type<Btype, float>(b_ptr[c]));
+                }
+            }
+        }
+    });
+    return true;
+}
+
+template<class Rtype, class Atype>
+static bool
+mul_impl_hwy(ImageBuf& R, const ImageBuf& A, cspan<float> b, ROI roi,
+             int nthreads)
+{
+    using SimdType
+        = std::conditional_t<std::is_same_v<Rtype, double>, double, float>;
+    // Raw loop: no z or bounds handling; convert_type scales integer pixels.
+    if (roi.depth() > 1 || !A.contains_roi(roi))
+        return mul_impl_scalar<Rtype, Atype>(R, A, b, roi, nthreads);
+    auto Rv = HwyPixels(R);
+    auto Av = HwyPixels(A);
+    ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) {
+        for (int y = roi.ybegin; y < roi.yend; ++y) {
+            std::byte* r_px       = PixelBase(Rv, roi.xbegin, y);
+            const std::byte* a_px = PixelBase(Av, roi.xbegin, y);
+            for (int x = roi.xbegin; x < roi.xend;
+                 ++x, r_px += Rv.pixel_bytes, a_px += Av.pixel_bytes) {
+                auto* r_ptr = reinterpret_cast<Rtype*>(r_px);
+                auto* a_ptr = reinterpret_cast<const Atype*>(a_px);
+                for (int c = roi.chbegin; c < roi.chend; ++c)
+                    r_ptr[c] = convert_type<SimdType, Rtype>(
+                        convert_type<Atype, SimdType>(a_ptr[c]) * b[c]);
+            }
+        }
+    });
+    return true;
+}
+#endif  // defined(OIIO_USE_HWY) && OIIO_USE_HWY
+
+template<class Rtype, class Atype, class Btype>
+static bool
+mul_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
+         int nthreads)
+{
+#if defined(OIIO_USE_HWY) && OIIO_USE_HWY
+    if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels()
+        && B.localpixels())  // hwy path needs raw pointers
+        return mul_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
+#endif  // defined(OIIO_USE_HWY) && OIIO_USE_HWY
+    return mul_impl_scalar<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
+}
+
+template<class Rtype, class Atype>
+static bool
+mul_impl(ImageBuf& R, const ImageBuf& A, cspan<float> b, ROI roi, int nthreads)
+{
+#if defined(OIIO_USE_HWY) && OIIO_USE_HWY  // raw-pointer path, if enabled
+    if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels())
+        return mul_impl_hwy<Rtype, Atype>(R, A, b, roi, nthreads);
+#endif  // defined(OIIO_USE_HWY) && OIIO_USE_HWY
+    return mul_impl_scalar<Rtype, Atype>(R, A, b, roi, nthreads);
+}
+
 static bool
 mul_impl_deep(ImageBuf& R, const ImageBuf& A, cspan<float> b, ROI roi,
               int nthreads)
@@ -198,8 +306,8 @@ ImageBufAlgo::mul(Image_or_Const A, Image_or_Const B, ROI roi, int nthreads)
 
 template<class Rtype, class Atype, class Btype>
 static bool
-div_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
-         int nthreads)
+div_impl_scalar(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
+                int nthreads)
 {
     ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) {
         ImageBuf::Iterator<Rtype> r(R, roi);
@@ -216,6 +324,73 @@ div_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
 
 
+#if defined(OIIO_USE_HWY) && OIIO_USE_HWY
+template<class Rtype, class Atype, class Btype>
+static bool
+div_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
+             int nthreads)
+{
+    auto Rv = HwyPixels(R);
+    auto Av = HwyPixels(A);
+    auto Bv = HwyPixels(B);
+    ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) {
+        const int nchannels = RoiNChannels(roi);
+        const bool contig   = ChannelsContiguous<Rtype>(Rv, nchannels)
+                            && ChannelsContiguous<Atype>(Av, nchannels)
+                            && ChannelsContiguous<Btype>(Bv, nchannels);
+
+        for (int y = roi.ybegin; y < roi.yend; ++y) {
+            Rtype* r_row       = RoiRowPtr<Rtype>(Rv, y, roi);
+            const Atype* a_row = RoiRowPtr<Atype>(Av, y, roi);
+            const Btype* b_row = RoiRowPtr<Btype>(Bv, y, roi);
+
+            if (contig) {
+                size_t n = static_cast<size_t>(roi.width())
+                           * static_cast<size_t>(nchannels);
+                RunHwyCmd<Rtype, Atype, Btype>(
+                    r_row, a_row, b_row, n, [](auto d, auto a, auto b) {
+                        // Check for zero division: if b == 0, return 0
+                        auto zero = hn::Zero(d);
+                        auto mask = hn::Eq(b, zero);
+                        return hn::IfThenElse(mask, zero, hn::Div(a, b));
+                    });
+            } else {
+                for (int x = roi.xbegin; x < roi.xend; ++x) {
+                    Rtype* r_ptr = ChannelPtr<Rtype>(Rv, x, y, roi.chbegin);
+                    const Atype* a_ptr = ChannelPtr<Atype>(Av, x, y,
+                                                           roi.chbegin);
+                    const Btype* b_ptr = ChannelPtr<Btype>(Bv, x, y,
+                                                           roi.chbegin);
+                    for (int c = 0; c < nchannels; ++c) {
+                        float v  = static_cast<float>(b_ptr[c]);
+                        r_ptr[c] = (v == 0.0f)
+                                       ? static_cast<Rtype>(0.0f)
+                                       : static_cast<Rtype>(
+                                             static_cast<float>(a_ptr[c]) / v);
+                    }
+                }
+            }
+        }
+    });
+    return true;
+}
+#endif  // defined(OIIO_USE_HWY) && OIIO_USE_HWY
+
+template<class Rtype, class Atype, class Btype>
+static bool
+div_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
+         int nthreads)
+{
+#if defined(OIIO_USE_HWY) && OIIO_USE_HWY
+    if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels()
+        && B.localpixels())
+        return div_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
+#endif
+    return div_impl_scalar<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
+}
+
+
+
 bool
 ImageBufAlgo::div(ImageBuf& dst, Image_or_Const A_, Image_or_Const B_, ROI roi,
                   int nthreads)
diff --git a/src/libOpenImageIO/imagebufalgo_xform.cpp b/src/libOpenImageIO/imagebufalgo_xform.cpp
index 0abbb1ace8..ca3fa3e960 100644
--- a/src/libOpenImageIO/imagebufalgo_xform.cpp
+++ b/src/libOpenImageIO/imagebufalgo_xform.cpp
@@ -21,6 +21,10 @@
 
 #include <Imath/ImathBox.h>
 
+#if defined(OIIO_USE_HWY) && OIIO_USE_HWY
+#    include "imagebufalgo_hwy_pvt.h"
+#endif
+
 OIIO_NAMESPACE_3_1_BEGIN
 
 
@@ -932,11 +936,7 @@ ImageBufAlgo::fit(ImageBuf& dst, const ImageBuf& src, KWArgs options, ROI roi,
     OIIO::pvt::LoggedTimer logtime("IBA::fit");
 
     static const ustring recognized[] = {
-        filtername_us,
-        filterwidth_us,
-        filterptr_us,
-        fillmode_us,
-        exact_us,
+        filtername_us, filterwidth_us, filterptr_us, fillmode_us, exact_us,
 #if 0 /* Not currently recognized */
         wrap_us,
         edgeclamp_us,
@@ -1072,15 +1072,39 @@ ImageBufAlgo::fit(const ImageBuf& src, KWArgs options, ROI roi, int nthreads)
 
 template<typename DSTTYPE, typename SRCTYPE>
 static bool
-resample_(ImageBuf& dst, const ImageBuf& src, bool interpolate, ROI roi,
-          int nthreads)
+resample_scalar(ImageBuf& dst, const ImageBuf& src, bool interpolate, ROI roi,
+                int nthreads)
 {
+    // This operates just like the internals of ImageBuf::interppixel(), but
+    // reuses the provided iterator to avoid the overhead of constructing a new
+    // one each time. This speeds it up by 20x! The iterator `it` must already
+    // be associated with `img`, but it need not be positioned correctly.
+    auto interppixel =
+        [](const ImageBuf& img, ImageBuf::ConstIterator<SRCTYPE>& it, float x,
+           float y, span<float> pixel, ImageBuf::WrapMode wrap) -> bool {
+        int n             = std::min(int(pixel.size()), img.spec().nchannels);
+        float* localpixel = OIIO_ALLOCA(float, n * 4);
+        float* p[4]       = { localpixel, localpixel + n, localpixel + 2 * n,
+                              localpixel + 3 * n };
+        x -= 0.5f;
+        y -= 0.5f;
+        int xtexel, ytexel;
+        float xfrac, yfrac;
+        xfrac = floorfrac(x, &xtexel);
+        yfrac = floorfrac(y, &ytexel);
+        it.rerange(xtexel, xtexel + 2, ytexel, ytexel + 2, 0, 1, wrap);
+        for (int i = 0; i < 4; ++i, ++it)
+            for (int c = 0; c < n; ++c)
+                p[i][c] = it[c];  //NOSONAR
+        bilerp(p[0], p[1], p[2], p[3], xfrac, yfrac, n, pixel.data());
+        return true;
+    };
+
     OIIO_ASSERT(src.deep() == dst.deep());
     ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) {
         const ImageSpec& srcspec(src.spec());
         const ImageSpec& dstspec(dst.spec());
         int nchannels = src.nchannels();
-        bool deep     = src.deep();
 
         // Local copies of the source image window, converted to float
         float srcfx = srcspec.full_x;
@@ -1109,25 +1133,10 @@ resample_(ImageBuf& dst, const ImageBuf& src, bool interpolate, ROI roi,
                 float s      = (x - dstfx + 0.5f) * dstpixelwidth;
                 float src_xf = srcfx + s * srcfw;
                 int src_x    = ifloor(src_xf);
-                if (deep) {
-                    srcpel.pos(src_x, src_y, 0);
-                    int nsamps = srcpel.deep_samples();
-                    OIIO_DASSERT(nsamps == out.deep_samples());
-                    if (!nsamps || nsamps != out.deep_samples())
-                        continue;
-                    for (int c = 0; c < nchannels; ++c) {
-                        if (dstspec.channelformat(c) == TypeDesc::UINT32)
-                            for (int samp = 0; samp < nsamps; ++samp)
-                                out.set_deep_value(
-                                    c, samp, srcpel.deep_value_uint(c, samp));
-                        else
-                            for (int samp = 0; samp < nsamps; ++samp)
-                                out.set_deep_value(c, samp,
-                                                   srcpel.deep_value(c, samp));
-                    }
-                } else if (interpolate) {
+                if (interpolate) {
                     // Non-deep image, bilinearly interpolate
-                    src.interppixel(src_xf, src_yf, pel, ImageBuf::WrapClamp);
+                    interppixel(src, srcpel, src_xf, src_yf, pel,
+                                ImageBuf::WrapClamp);
                     for (int c = roi.chbegin; c < roi.chend; ++c)
                         out[c] = pel[c];
                 } else {
@@ -1142,6 +1151,265 @@ resample_(ImageBuf& dst, const ImageBuf& src, bool interpolate, ROI roi,
     return true;
 }
 
+static bool
+resample_deep(ImageBuf& dst, const ImageBuf& src, bool interpolate, ROI roi,
+              int nthreads)
+{
+    // If it's deep, figure out the sample allocations first, because
+    // it's not thread-safe to do that simultaneously with copying the
+    // values.
+    const ImageSpec& srcspec(src.spec());
+    const ImageSpec& dstspec(dst.spec());
+    float srcfx          = srcspec.full_x;
+    float srcfy          = srcspec.full_y;
+    float srcfw          = srcspec.full_width;
+    float srcfh          = srcspec.full_height;
+    float dstpixelwidth  = 1.0f / dstspec.full_width;
+    float dstpixelheight = 1.0f / dstspec.full_height;
+    ImageBuf::ConstIterator<float> srcpel(src, roi);
+    ImageBuf::Iterator<float> dstpel(dst, roi);
+    for (; !dstpel.done(); ++dstpel, ++srcpel) {
+        float s   = (dstpel.x() - dstspec.full_x + 0.5f) * dstpixelwidth;
+        float t   = (dstpel.y() - dstspec.full_y + 0.5f) * dstpixelheight;
+        int src_y = ifloor(srcfy + t * srcfh);
+        int src_x = ifloor(srcfx + s * srcfw);
+        srcpel.pos(src_x, src_y, 0);
+        dstpel.set_deep_samples(srcpel.deep_samples());
+    }
+
+    OIIO_ASSERT(src.deep() == dst.deep());
+    ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) {
+        const ImageSpec& srcspec(src.spec());
+        const ImageSpec& dstspec(dst.spec());
+        int nchannels = src.nchannels();
+
+        // Local copies of the source image window, converted to float
+        float srcfx = srcspec.full_x;
+        float srcfy = srcspec.full_y;
+        float srcfw = srcspec.full_width;
+        float srcfh = srcspec.full_height;
+
+        float dstfx          = dstspec.full_x;
+        float dstfy          = dstspec.full_y;
+        float dstfw          = dstspec.full_width;
+        float dstfh          = dstspec.full_height;
+        float dstpixelwidth  = 1.0f / dstfw;
+        float dstpixelheight = 1.0f / dstfh;
+
+        ImageBuf::Iterator<float> out(dst, roi);
+        ImageBuf::ConstIterator<float> srcpel(src);
+        for (int y = roi.ybegin; y < roi.yend; ++y) {
+            // s,t are NDC space
+            float t = (y - dstfy + 0.5f) * dstpixelheight;
+            // src_xf, src_yf are image space float coordinates
+            float src_yf = srcfy + t * srcfh;
+            // src_x, src_y are image space integer coordinates of the floor
+            int src_y = ifloor(src_yf);
+            for (int x = roi.xbegin; x < roi.xend; ++x, ++out) {
+                float s      = (x - dstfx + 0.5f) * dstpixelwidth;
+                float src_xf = srcfx + s * srcfw;
+                int src_x    = ifloor(src_xf);
+                srcpel.pos(src_x, src_y, 0);
+                int nsamps = srcpel.deep_samples();
+                OIIO_DASSERT(nsamps == out.deep_samples());
+                if (!nsamps || nsamps != out.deep_samples())
+                    continue;
+                for (int c = 0; c < nchannels; ++c) {
+                    if (dstspec.channelformat(c) == TypeDesc::UINT32)
+                        for (int samp = 0; samp < nsamps; ++samp)
+                            out.set_deep_value(c, samp,
+                                               srcpel.deep_value_uint(c, samp));
+                    else
+                        for (int samp = 0; samp < nsamps; ++samp)
+                            out.set_deep_value(c, samp,
+                                               srcpel.deep_value(c, samp));
+                }
+            }
+        }
+    });
+
+    return true;
+}
+
+
+
+#if defined(OIIO_USE_HWY) && OIIO_USE_HWY
+// Highway bilinear resample for ImageBufs whose pixels are in local memory.
+// Each destination pixel center in `roi` is mapped into source image space;
+// the 2x2 source footprint is clamped to the source data window and blended.
+template<typename DSTTYPE, typename SRCTYPE>
+static bool
+resample_hwy(ImageBuf& dst, const ImageBuf& src, bool interpolate, ROI roi,
+             int nthreads)
+{
+    // Bilinear only: closest-pixel requests use the scalar implementation.
+    if (!interpolate)
+        return resample_scalar<DSTTYPE, SRCTYPE>(dst, src, interpolate, roi,
+                                                 nthreads);
+
+    // Cap lanes so the fixed-size staging arrays below cannot be overrun.
+    constexpr size_t kMaxLanes = 16;
+    using SimdType
+        = std::conditional_t<std::is_same_v<DSTTYPE, double>, double, float>;
+    using D      = hn::CappedTag<SimdType, kMaxLanes>;
+    using Rebind = hn::Rebind<int32_t, D>;
+
+    ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) {
+        const ImageSpec& srcspec(src.spec());
+        const ImageSpec& dstspec(dst.spec());
+
+        // Local copies of the source and destination full windows, as floats
+        float srcfx = srcspec.full_x;
+        float srcfy = srcspec.full_y;
+        float srcfw = srcspec.full_width;
+        float srcfh = srcspec.full_height;
+
+        float dstfx          = dstspec.full_x;
+        float dstfy          = dstspec.full_y;
+        float dstfw          = dstspec.full_width;
+        float dstfh          = dstspec.full_height;
+        float dstpixelwidth  = 1.0f / dstfw;
+        float dstpixelheight = 1.0f / dstfh;
+
+        const size_t src_scanline_bytes = srcspec.scanline_bytes();
+        const size_t dst_scanline_bytes = dstspec.scanline_bytes();
+        const size_t src_pixel_bytes    = srcspec.pixel_bytes();
+        const size_t dst_pixel_bytes    = dstspec.pixel_bytes();
+
+        const uint8_t* src_base = (const uint8_t*)src.localpixels();
+        uint8_t* dst_base       = (uint8_t*)dst.localpixels();
+
+        D d;
+        Rebind d_i32;
+        int N = hn::Lanes(d);
+
+        // Staging for the per-lane gather/scatter. Zeroed once so that lanes
+        // beyond a short final run are never indeterminate when loaded.
+        SimdType v00_arr[kMaxLanes] = {}, v01_arr[kMaxLanes] = {};
+        SimdType v10_arr[kMaxLanes] = {}, v11_arr[kMaxLanes] = {};
+        SimdType res_arr[kMaxLanes];
+        int32_t x0_arr[kMaxLanes], x1_arr[kMaxLanes];
+        // Fetch one source value using OIIO's normalized type conversion
+        auto load_val = [](const uint8_t* ptr) -> SimdType {
+            return convert_type<SRCTYPE, SimdType>(*(const SRCTYPE*)ptr);
+        };
+        const auto one = hn::Set(d, (SimdType)1.0f);
+
+        for (int y = roi.ybegin; y < roi.yend; ++y) {
+            float t      = (y - dstfy + 0.5f) * dstpixelheight;
+            float src_yf = srcfy + t * srcfh;
+            // Pixel-center convention: subtract 0.5 before interpolation
+            src_yf -= 0.5f;
+            int src_y   = ifloor(src_yf);
+            SimdType fy = (SimdType)(src_yf - src_y);
+
+            // Clamp Y to valid range
+            int src_y_clamped = clamp(src_y, src.ybegin(), src.yend() - 1);
+            // Neighbor Y (for bilinear)
+            int src_y_next_clamped = clamp(src_y + 1, src.ybegin(),
+                                           src.yend() - 1);
+
+            // Pre-calculate row pointers
+            const uint8_t* row0 = src_base
+                                  + (src_y_clamped - src.ybegin())
+                                        * src_scanline_bytes;
+            const uint8_t* row1 = src_base
+                                  + (src_y_next_clamped - src.ybegin())
+                                        * src_scanline_bytes;
+
+            uint8_t* dst_row = dst_base
+                               + (y - dst.ybegin()) * dst_scanline_bytes;
+
+            // Vertical weights are shared by every pixel of the row
+            const auto wy1 = hn::Set(d, fy);
+            const auto wy0 = hn::Sub(one, wy1);
+
+            for (int x = roi.xbegin; x < roi.xend; x += N) {
+                // Handle remaining pixels if less than N
+                int n = std::min(N, roi.xend - x);
+
+                // Compute src_xf for N pixels
+                auto idx_i32 = hn::Iota(d_i32, x);
+
+                auto x_simd     = hn::ConvertTo(d, idx_i32);
+                auto s          = hn::Mul(hn::Sub(hn::Add(x_simd,
+                                                          hn::Set(d, (SimdType)0.5f)),
+                                                  hn::Set(d, (SimdType)dstfx)),
+                                          hn::Set(d, (SimdType)dstpixelwidth));
+                auto src_xf_vec = hn::MulAdd(s, hn::Set(d, (SimdType)srcfw),
+                                             hn::Set(d, (SimdType)srcfx));
+                // Pixel-center convention: subtract 0.5 before interpolation
+                src_xf_vec = hn::Sub(src_xf_vec, hn::Set(d, (SimdType)0.5f));
+
+                auto src_x_vec = hn::Floor(src_xf_vec);
+                auto fx        = hn::Sub(src_xf_vec, src_x_vec);
+                auto ix        = hn::ConvertTo(d_i32, src_x_vec);
+
+                // Clamp X
+                auto min_x = hn::Set(d_i32, src.xbegin());
+                auto max_x = hn::Set(d_i32, src.xend() - 1);
+                auto ix0   = hn::Min(hn::Max(ix, min_x), max_x);
+                auto ix1
+                    = hn::Min(hn::Max(hn::Add(ix, hn::Set(d_i32, 1)), min_x),
+                              max_x);
+
+                // 0-based pixel offsets into the source row, spilled once per run
+                hn::StoreU(hn::Sub(ix0, min_x), d_i32, x0_arr);
+                hn::StoreU(hn::Sub(ix1, min_x), d_i32, x1_arr);
+                // Bilinear weights, shared by all channels
+                auto fx0 = hn::Sub(one, fx);
+                auto w00 = hn::Mul(fx0, wy0);
+                auto w01 = hn::Mul(fx, wy0);
+                auto w10 = hn::Mul(fx0, wy1);
+                auto w11 = hn::Mul(fx, wy1);
+
+                for (int c = roi.chbegin; c < roi.chend; ++c) {
+                    const uint8_t* ch0 = row0 + (size_t)c * sizeof(SRCTYPE);
+                    const uint8_t* ch1 = row1 + (size_t)c * sizeof(SRCTYPE);
+                    for (int i = 0; i < n; ++i) {
+                        size_t off0 = (size_t)x0_arr[i] * src_pixel_bytes;
+                        size_t off1 = (size_t)x1_arr[i] * src_pixel_bytes;
+                        v00_arr[i]  = load_val(ch0 + off0);
+                        v01_arr[i]  = load_val(ch0 + off1);
+                        v10_arr[i]  = load_val(ch1 + off0);
+                        v11_arr[i]  = load_val(ch1 + off1);
+                    }
+                    // res = v00*w00 + v01*w01 + v10*w10 + v11*w11 (FMA chain)
+                    auto res = hn::Mul(hn::LoadU(d, v00_arr), w00);
+                    res      = hn::MulAdd(hn::LoadU(d, v01_arr), w01, res);
+                    res      = hn::MulAdd(hn::LoadU(d, v10_arr), w10, res);
+                    res      = hn::MulAdd(hn::LoadU(d, v11_arr), w11, res);
+                    // Write the first n lanes; x is relative to dst's data window
+                    hn::StoreU(res, d, res_arr);
+                    for (int i = 0; i < n; ++i) {
+                        DSTTYPE* dptr = (DSTTYPE*)(
+                            dst_row
+                            + (size_t)(x - dst.xbegin() + i) * dst_pixel_bytes
+                            + (size_t)c * sizeof(DSTTYPE));
+                        *dptr = convert_type<SimdType, DSTTYPE>(res_arr[i]);
+                    }
+                }
+            }
+        }
+    });
+    return true;
+}
+#endif  // defined(OIIO_USE_HWY) && OIIO_USE_HWY
+
+template<typename DSTTYPE, typename SRCTYPE>
+static bool
+resample_(ImageBuf& dst, const ImageBuf& src, bool interpolate, ROI roi,
+          int nthreads)
+{
+#if defined(OIIO_USE_HWY) && OIIO_USE_HWY
+    if (OIIO::pvt::enable_hwy && dst.localpixels() && src.localpixels())
+        return resample_hwy<DSTTYPE, SRCTYPE>(dst, src, interpolate, roi,
+                                              nthreads);
+#endif
+
+    return resample_scalar<DSTTYPE, SRCTYPE>(dst, src, interpolate, roi,
+                                             nthreads);
+}
 
 
 bool
@@ -1155,27 +1423,7 @@ ImageBufAlgo::resample(ImageBuf& dst, const ImageBuf& src, bool interpolate,
         return false;
 
     if (dst.deep()) {
-        // If it's deep, figure out the sample allocations first, because
-        // it's not thread-safe to do that simultaneously with copying the
-        // values.
-        const ImageSpec& srcspec(src.spec());
-        const ImageSpec& dstspec(dst.spec());
-        float srcfx          = srcspec.full_x;
-        float srcfy          = srcspec.full_y;
-        float srcfw          = srcspec.full_width;
-        float srcfh          = srcspec.full_height;
-        float dstpixelwidth  = 1.0f / dstspec.full_width;
-        float dstpixelheight = 1.0f / dstspec.full_height;
-        ImageBuf::ConstIterator<float> srcpel(src, roi);
-        ImageBuf::Iterator<float> dstpel(dst, roi);
-        for (; !dstpel.done(); ++dstpel, ++srcpel) {
-            float s   = (dstpel.x() - dstspec.full_x + 0.5f) * dstpixelwidth;
-            float t   = (dstpel.y() - dstspec.full_y + 0.5f) * dstpixelheight;
-            int src_y = ifloor(srcfy + t * srcfh);
-            int src_x = ifloor(srcfx + s * srcfw);
-            srcpel.pos(src_x, src_y, 0);
-            dstpel.set_deep_samples(srcpel.deep_samples());
-        }
+        return resample_deep(dst, src, interpolate, roi, nthreads);
     }
 
     bool ok;
diff --git a/src/libOpenImageIO/imageio.cpp b/src/libOpenImageIO/imageio.cpp
index 909f8529d4..aa8babf9b4 100644
--- a/src/libOpenImageIO/imageio.cpp
+++ b/src/libOpenImageIO/imageio.cpp
@@ -53,6 +53,7 @@ int png_linear_premult(0);
 int tiff_half(0);
 int tiff_multithread(1);
 int dds_bc5normal(0);
+int enable_hwy(1);  // Enable Google Highway SIMD optimizations by default
 int limit_channels(1024);
 int limit_imagesize_MB(std::min(32 * 1024,
                                 int(Sysutil::physical_memory() >> 20)));
@@ -406,6 +407,10 @@ attribute(string_view name, TypeDesc type, const void* val)
         dds_bc5normal = *(const int*)val;
         return true;
     }
+    if (name == "enable_hwy" && type == TypeInt) {
+        enable_hwy = *(const int*)val;
+        return true;
+    }
     if (name == "limits:channels" && type == TypeInt) {
         limit_channels = *(const int*)val;
         return true;
@@ -612,6 +617,10 @@ getattribute(string_view name, TypeDesc type, void* val)
         *(int*)val = dds_bc5normal;
         return true;
     }
+    if (name == "enable_hwy" && type == TypeInt) {
+        *(int*)val = enable_hwy;
+        return true;
+    }
     if (name == "oiio:print_uncaught_errors" && type == TypeInt) {
         *(int*)val = oiio_print_uncaught_errors;
         return true;

From e9d6924decf94a7a528b32ef33c037b9dcd4ac23 Mon Sep 17 00:00:00 2001
From: "Vlad (Kuzmin) Erium" <libalias@gmail.com>
Date: Tue, 13 Jan 2026 10:47:55 +0900
Subject: [PATCH 02/70] Simplify CMake hwy option

Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 .claude/settings.local.json |  7 +++++++
 CMakeLists.txt              | 14 ++------------
 2 files changed, 9 insertions(+), 12 deletions(-)
 create mode 100644 .claude/settings.local.json

diff --git a/.claude/settings.local.json b/.claude/settings.local.json
new file mode 100644
index 0000000000..136586c288
--- /dev/null
+++ b/.claude/settings.local.json
@@ -0,0 +1,7 @@
+{
+  "permissions": {
+    "allow": [
+      "Bash(find:*)"
+    ]
+  }
+}
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 729acdd316..1d60d4377c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -113,18 +113,8 @@ endif ()
 option (${PROJ_NAME}_BUILD_TOOLS "Build the command-line tools" ON)
 option (${PROJ_NAME}_BUILD_TESTS "Build the unit tests" ON)
 
-# Google Highway SIMD acceleration for selected ImageBufAlgo ops. This is an
-# optional optimization dependency: if enabled but not found, it will be
-# compiled out.
-#
-# Back-compat: honor -DUSE_HWY=OFF by mapping it to OIIO_USE_HWY if the latter
-# was not explicitly provided.
-if (DEFINED USE_HWY AND NOT DEFINED OIIO_USE_HWY)
-    set (OIIO_USE_HWY ${USE_HWY} CACHE BOOL
-         "Enable Google Highway SIMD optimizations (if Highway is available)" FORCE)
-else ()
-    option (OIIO_USE_HWY "Enable Google Highway SIMD optimizations (if Highway is available)" ON)
-endif ()
+# Google Highway SIMD acceleration for selected ImageBufAlgo ops. (optimization)
+option (OIIO_USE_HWY "Enable experimental Google Highway SIMD optimizations (if Highway is available)" OFF)
 
 set (OIIO_LIBNAME_SUFFIX "" CACHE STRING
      "Optional name appended to ${PROJECT_NAME} libraries that are built")

From 3704de58fefb3c717ba9f8c26bbe06f2b07fd81c Mon Sep 17 00:00:00 2001
From: "Vlad (Kuzmin) Erium" <libalias@gmail.com>
Date: Tue, 13 Jan 2026 10:51:07 +0900
Subject: [PATCH 03/70] Revert "Simplify CMake hwy option"

This reverts commit 4d3b1f3f1fec407a5811eae8be37b6f044c4ac80.

Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 .claude/settings.local.json |  7 -------
 CMakeLists.txt              | 14 ++++++++++++--
 2 files changed, 12 insertions(+), 9 deletions(-)
 delete mode 100644 .claude/settings.local.json

diff --git a/.claude/settings.local.json b/.claude/settings.local.json
deleted file mode 100644
index 136586c288..0000000000
--- a/.claude/settings.local.json
+++ /dev/null
@@ -1,7 +0,0 @@
-{
-  "permissions": {
-    "allow": [
-      "Bash(find:*)"
-    ]
-  }
-}
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1d60d4377c..729acdd316 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -113,8 +113,18 @@ endif ()
 option (${PROJ_NAME}_BUILD_TOOLS "Build the command-line tools" ON)
 option (${PROJ_NAME}_BUILD_TESTS "Build the unit tests" ON)
 
-# Google Highway SIMD acceleration for selected ImageBufAlgo ops. (optimization)
-option (OIIO_USE_HWY "Enable experimental Google Highway SIMD optimizations (if Highway is available)" OFF)
+# Google Highway SIMD acceleration for selected ImageBufAlgo ops. This is an
+# optional optimization dependency: if enabled but not found, it will be
+# compiled out.
+#
+# Back-compat: honor -DUSE_HWY=OFF by mapping it to OIIO_USE_HWY if the latter
+# was not explicitly provided.
+if (DEFINED USE_HWY AND NOT DEFINED OIIO_USE_HWY)
+    set (OIIO_USE_HWY ${USE_HWY} CACHE BOOL
+         "Enable Google Highway SIMD optimizations (if Highway is available)" FORCE)
+else ()
+    option (OIIO_USE_HWY "Enable Google Highway SIMD optimizations (if Highway is available)" ON)
+endif ()
 
 set (OIIO_LIBNAME_SUFFIX "" CACHE STRING
      "Optional name appended to ${PROJECT_NAME} libraries that are built")

From 9e8f530f6e44e58198967a0891aed4b4152ee598 Mon Sep 17 00:00:00 2001
From: Vlad Erium <shaamaan@gmail.com>
Date: Tue, 13 Jan 2026 10:56:09 +0900
Subject: [PATCH 04/70] Update CMakeLists.txt

Co-authored-by: Larry Gritz <lg@larrygritz.com>
Signed-off-by: Vlad Erium <shaamaan@gmail.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 CMakeLists.txt | 15 +--------------
 1 file changed, 1 insertion(+), 14 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 729acdd316..62229859ce 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -112,20 +112,7 @@ else ()
 endif ()
 option (${PROJ_NAME}_BUILD_TOOLS "Build the command-line tools" ON)
 option (${PROJ_NAME}_BUILD_TESTS "Build the unit tests" ON)
-
-# Google Highway SIMD acceleration for selected ImageBufAlgo ops. This is an
-# optional optimization dependency: if enabled but not found, it will be
-# compiled out.
-#
-# Back-compat: honor -DUSE_HWY=OFF by mapping it to OIIO_USE_HWY if the latter
-# was not explicitly provided.
-if (DEFINED USE_HWY AND NOT DEFINED OIIO_USE_HWY)
-    set (OIIO_USE_HWY ${USE_HWY} CACHE BOOL
-         "Enable Google Highway SIMD optimizations (if Highway is available)" FORCE)
-else ()
-    option (OIIO_USE_HWY "Enable Google Highway SIMD optimizations (if Highway is available)" ON)
-endif ()
-
+option (OIIO_USE_HWY "Enable experimental Google Highway SIMD optimizations (if Highway is available)" OFF)
 set (OIIO_LIBNAME_SUFFIX "" CACHE STRING
      "Optional name appended to ${PROJECT_NAME} libraries that are built")
 option (BUILD_OIIOUTIL_ONLY "If ON, will build *only* libOpenImageIO_Util" OFF)

From 99ac995174d5efbc94c19e6684f3e8891af0ba59 Mon Sep 17 00:00:00 2001
From: Vlad Erium <shaamaan@gmail.com>
Date: Tue, 13 Jan 2026 10:56:37 +0900
Subject: [PATCH 05/70] Update src/cmake/externalpackages.cmake

Co-authored-by: Larry Gritz <lg@larrygritz.com>
Signed-off-by: Vlad Erium <shaamaan@gmail.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 src/cmake/externalpackages.cmake | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/cmake/externalpackages.cmake b/src/cmake/externalpackages.cmake
index 6d02f267fa..55874fdd90 100644
--- a/src/cmake/externalpackages.cmake
+++ b/src/cmake/externalpackages.cmake
@@ -226,7 +226,9 @@ endif ()
 
 
 # Google Highway for SIMD (optional optimization)
-checked_find_package (hwy ENABLE ${OIIO_USE_HWY})
+if (OIIO_USE_HWY)
+    checked_find_package (hwy)
+endif ()
 
 # Tessil/robin-map
 checked_find_package (Robinmap REQUIRED

From d52fe81986d940c4a41ff2e62dc4d4a122a03474 Mon Sep 17 00:00:00 2001
From: Vlad Erium <shaamaan@gmail.com>
Date: Tue, 13 Jan 2026 10:57:30 +0900
Subject: [PATCH 06/70] Update src/libOpenImageIO/imagebufalgo_addsub.cpp

Co-authored-by: Larry Gritz <lg@larrygritz.com>
Signed-off-by: Vlad Erium <shaamaan@gmail.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 src/libOpenImageIO/imagebufalgo_addsub.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/libOpenImageIO/imagebufalgo_addsub.cpp b/src/libOpenImageIO/imagebufalgo_addsub.cpp
index e8a25d86bf..20304f74be 100644
--- a/src/libOpenImageIO/imagebufalgo_addsub.cpp
+++ b/src/libOpenImageIO/imagebufalgo_addsub.cpp
@@ -67,7 +67,7 @@ add_impl_scalar(ImageBuf& R, const ImageBuf& A, cspan<float> b, ROI roi,
 
 
-#if defined(OIIO_USE_HWY) && OIIO_USE_HWY
+#if OIIO_USE_HWY
 // Native integer add using SaturatedAdd (scale-invariant, no float conversion)
 template<class T>
 static bool

From 1e15603f17a43c9569e1c3528b598b30e72251f4 Mon Sep 17 00:00:00 2001
From: "Vlad (Kuzmin) Erium" <libalias@gmail.com>
Date: Tue, 27 Jan 2026 10:59:12 +0900
Subject: [PATCH 07/70] Update platform.h

Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 src/include/OpenImageIO/platform.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/include/OpenImageIO/platform.h b/src/include/OpenImageIO/platform.h
index 6ee6d107e8..a5b7b1f738 100644
--- a/src/include/OpenImageIO/platform.h
+++ b/src/include/OpenImageIO/platform.h
@@ -39,7 +39,6 @@
 #endif
 
 #ifdef _MSC_VER
-#    include <malloc.h>  // for alloca
 #    include <intrin.h>
 #endif
 

From f650c54f6077beebbf1b836bd9435bcea1d643df Mon Sep 17 00:00:00 2001
From: "Vlad (Kuzmin) Erium" <libalias@gmail.com>
Date: Tue, 27 Jan 2026 11:01:16 +0900
Subject: [PATCH 08/70] Update imagebufalgo_addsub.cpp

Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 src/libOpenImageIO/imagebufalgo_addsub.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/libOpenImageIO/imagebufalgo_addsub.cpp b/src/libOpenImageIO/imagebufalgo_addsub.cpp
index 20304f74be..127cedd858 100644
--- a/src/libOpenImageIO/imagebufalgo_addsub.cpp
+++ b/src/libOpenImageIO/imagebufalgo_addsub.cpp
@@ -10,10 +10,6 @@
 #include <iostream>
 #include <limits>
 
-#if defined(_WIN32)
-#    include <malloc.h>  // for alloca
-#endif
-
 #include <OpenImageIO/half.h>
 
 #include <OpenImageIO/dassert.h>

From fe507c0d07129ce9b102ccaf26c628475b4109fe Mon Sep 17 00:00:00 2001
From: "Vlad (Kuzmin) Erium" <libalias@gmail.com>
Date: Tue, 27 Jan 2026 11:04:30 +0900
Subject: [PATCH 09/70] Update platform.h

Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 src/include/OpenImageIO/platform.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/include/OpenImageIO/platform.h b/src/include/OpenImageIO/platform.h
index a5b7b1f738..aeba989947 100644
--- a/src/include/OpenImageIO/platform.h
+++ b/src/include/OpenImageIO/platform.h
@@ -312,8 +312,6 @@
 #endif
 #if defined(__GNUC__) || defined(__clang__)
 #    define OIIO_ALLOCA(type, size) (assert(size < (1<<20)), (size) != 0 ? ((type*)__builtin_alloca((size) * sizeof(type))) : nullptr)
-#elif defined(_MSC_VER)
-#    define OIIO_ALLOCA(type, size) (assert(size < (1<<20)), (size) != 0 ? ((type*)_alloca((size) * sizeof(type))) : nullptr)
 #else
 #    define OIIO_ALLOCA(type, size) (assert(size < (1<<20)), (size) != 0 ? ((type*)alloca((size) * sizeof(type))) : nullptr)
 #endif

From 82d073536184d6bbdf312a0d0a516f33e79087ed Mon Sep 17 00:00:00 2001
From: "Vlad (Kuzmin) Erium" <libalias@gmail.com>
Date: Sun, 1 Feb 2026 19:21:31 +0900
Subject: [PATCH 10/70] Refactor HWY per-pixel ops and add strided ROI fallback
 tests

Adds generic per-pixel HWY operation helpers for binary and ternary ops, refactors the add/sub/mul/div/mad HWY implementations to use these helpers, and ensures HWY SIMD is only used for contiguous channel ranges. Also adds a new test to verify correct fallback to scalar code for strided (non-contiguous) ROI channel ranges.

Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 src/include/imageio_pvt.h                  |   1 +
 src/libOpenImageIO/imagebufalgo_addsub.cpp | 242 +++++----------------
 src/libOpenImageIO/imagebufalgo_hwy_pvt.h  |  85 ++++++++
 src/libOpenImageIO/imagebufalgo_mad.cpp    |  66 ++----
 src/libOpenImageIO/imagebufalgo_muldiv.cpp | 119 +++-------
 src/libOpenImageIO/imagebufalgo_test.cpp   |  71 ++++++
 6 files changed, 259 insertions(+), 325 deletions(-)

diff --git a/src/include/imageio_pvt.h b/src/include/imageio_pvt.h
index 273375cd77..13d5c06140 100644
--- a/src/include/imageio_pvt.h
+++ b/src/include/imageio_pvt.h
@@ -43,6 +43,7 @@ extern int oiio_log_times;
 extern int openexr_core;
 extern int jpeg_com_attributes;
 extern int png_linear_premult;
+extern int enable_hwy;
 extern int limit_channels;
 extern int limit_imagesize_MB;
 extern int imagebuf_print_uncaught_errors;
diff --git a/src/libOpenImageIO/imagebufalgo_addsub.cpp b/src/libOpenImageIO/imagebufalgo_addsub.cpp
index 127cedd858..6f1b679213 100644
--- a/src/libOpenImageIO/imagebufalgo_addsub.cpp
+++ b/src/libOpenImageIO/imagebufalgo_addsub.cpp
@@ -64,60 +64,17 @@ add_impl_scalar(ImageBuf& R, const ImageBuf& A, cspan<float> b, ROI roi,
 
 
 #if OIIO_USE_HWY
+
 // Native integer add using SaturatedAdd (scale-invariant, no float conversion)
 template<class T>
 static bool
 add_impl_hwy_native_int(ImageBuf& R, const ImageBuf& A, const ImageBuf& B,
                         ROI roi, int nthreads)
 {
-    auto Rv = HwyPixels(R);
-    auto Av = HwyPixels(A);
-    auto Bv = HwyPixels(B);
-    ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) {
-        const int nchannels = RoiNChannels(roi);
-        const bool contig   = ChannelsContiguous<T>(Rv, nchannels)
-                            && ChannelsContiguous<T>(Av, nchannels)
-                            && ChannelsContiguous<T>(Bv, nchannels);
-
-        for (int y = roi.ybegin; y < roi.yend; ++y) {
-            T* r_row       = RoiRowPtr<T>(Rv, y, roi);
-            const T* a_row = RoiRowPtr<T>(Av, y, roi);
-            const T* b_row = RoiRowPtr<T>(Bv, y, roi);
-
-            if (contig) {
-                // Native integer saturated add - much faster than float conversion!
-                size_t n = static_cast<size_t>(roi.width())
-                           * static_cast<size_t>(nchannels);
-                RunHwyBinaryNativeInt<T>(r_row, a_row, b_row, n,
-                                         [](auto d, auto a, auto b) {
-                                             return hn::SaturatedAdd(a, b);
-                                         });
-            } else {
-                // Scalar fallback
-                for (int x = roi.xbegin; x < roi.xend; ++x) {
-                    T* r_ptr       = ChannelPtr<T>(Rv, x, y, roi.chbegin);
-                    const T* a_ptr = ChannelPtr<T>(Av, x, y, roi.chbegin);
-                    const T* b_ptr = ChannelPtr<T>(Bv, x, y, roi.chbegin);
-                    for (int c = 0; c < nchannels; ++c) {
-                        // Saturating add in scalar
-                        int64_t sum = (int64_t)a_ptr[c] + (int64_t)b_ptr[c];
-                        if constexpr (std::is_unsigned_v<T>) {
-                            r_ptr[c] = (sum > std::numeric_limits<T>::max())
-                                           ? std::numeric_limits<T>::max()
-                                           : (T)sum;
-                        } else {
-                            r_ptr[c] = (sum > std::numeric_limits<T>::max())
-                                           ? std::numeric_limits<T>::max()
-                                       : (sum < std::numeric_limits<T>::min())
-                                           ? std::numeric_limits<T>::min()
-                                           : (T)sum;
-                        }
-                    }
-                }
-            }
-        }
-    });
-    return true;
+    return hwy_binary_native_int_perpixel_op<T>(R, A, B, roi, nthreads,
+                                               [](auto /*d*/, auto a, auto b) {
+                                                   return hn::SaturatedAdd(a, b);
+                                               });
 }
 
 template<class Rtype, class Atype, class Btype>
@@ -125,46 +82,10 @@ static bool
 add_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
              int nthreads)
 {
-    auto Rv = HwyPixels(R);
-    auto Av = HwyPixels(A);
-    auto Bv = HwyPixels(B);
-    ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) {
-        const int nchannels = RoiNChannels(roi);
-        const bool contig   = ChannelsContiguous<Rtype>(Rv, nchannels)
-                            && ChannelsContiguous<Atype>(Av, nchannels)
-                            && ChannelsContiguous<Btype>(Bv, nchannels);
-
-        for (int y = roi.ybegin; y < roi.yend; ++y) {
-            Rtype* r_row       = RoiRowPtr<Rtype>(Rv, y, roi);
-            const Atype* a_row = RoiRowPtr<Atype>(Av, y, roi);
-            const Btype* b_row = RoiRowPtr<Btype>(Bv, y, roi);
-
-            if (contig) {
-                // Process whole line as one vector stream
-                size_t n = static_cast<size_t>(roi.width())
-                           * static_cast<size_t>(nchannels);
-                RunHwyCmd<Rtype, Atype, Btype>(r_row, a_row, b_row, n,
-                                               [](auto d, auto a, auto b) {
-                                                   return hn::Add(a, b);
-                                               });
-            } else {
-                // Process pixel by pixel (scalar fallback for strided channels)
-                for (int x = roi.xbegin; x < roi.xend; ++x) {
-                    Rtype* r_ptr = ChannelPtr<Rtype>(Rv, x, y, roi.chbegin);
-                    const Atype* a_ptr = ChannelPtr<Atype>(Av, x, y,
-                                                           roi.chbegin);
-                    const Btype* b_ptr = ChannelPtr<Btype>(Bv, x, y,
-                                                           roi.chbegin);
-                    for (int c = 0; c < nchannels; ++c) {
-                        r_ptr[c] = static_cast<Rtype>(
-                            static_cast<float>(a_ptr[c])
-                            + static_cast<float>(b_ptr[c]));
-                    }
-                }
-            }
-        }
-    });
-    return true;
+    return hwy_binary_perpixel_op<Rtype, Atype, Btype>(R, A, B, roi, nthreads,
+                                                      [](auto /*d*/, auto a, auto b) {
+                                                          return hn::Add(a, b);
+                                                      });
 }
 
 template<class Rtype, class Atype>
@@ -204,15 +125,24 @@ add_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
 #if defined(OIIO_USE_HWY) && OIIO_USE_HWY
     if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels()
         && B.localpixels()) {
-        // Use native integer path for scale-invariant add when all types match
-        // and are integer types (much faster: 6-12x vs 3-5x with float conversion)
-        constexpr bool all_same = std::is_same_v<Rtype, Atype>
-                                  && std::is_same_v<Atype, Btype>;
-        constexpr bool is_integer = std::is_integral_v<Rtype>;
-        if constexpr (all_same && is_integer) {
-            return add_impl_hwy_native_int<Rtype>(R, A, B, roi, nthreads);
+        auto Rv = HwyPixels(R);
+        auto Av = HwyPixels(A);
+        auto Bv = HwyPixels(B);
+        const int nchannels = RoiNChannels(roi);
+        const bool contig   = ChannelsContiguous<Rtype>(Rv, nchannels)
+                            && ChannelsContiguous<Atype>(Av, nchannels)
+                            && ChannelsContiguous<Btype>(Bv, nchannels);
+        if (contig) {
+            // Use native integer path for scale-invariant add when all types
+            // match and are integer types (much faster: 6-12x vs 3-5x with
+            // float conversion).
+            constexpr bool all_same = std::is_same_v<Rtype, Atype>
+                                      && std::is_same_v<Atype, Btype>;
+            constexpr bool is_integer = std::is_integral_v<Rtype>;
+            if constexpr (all_same && is_integer)
+                return add_impl_hwy_native_int<Rtype>(R, A, B, roi, nthreads);
+            return add_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
         }
-        return add_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
     }
 #endif
     return add_impl_scalar<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
@@ -236,55 +166,10 @@ static bool
 sub_impl_hwy_native_int(ImageBuf& R, const ImageBuf& A, const ImageBuf& B,
                         ROI roi, int nthreads)
 {
-    auto Rv = HwyPixels(R);
-    auto Av = HwyPixels(A);
-    auto Bv = HwyPixels(B);
-    ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) {
-        const int nchannels = RoiNChannels(roi);
-        const bool contig   = ChannelsContiguous<T>(Rv, nchannels)
-                            && ChannelsContiguous<T>(Av, nchannels)
-                            && ChannelsContiguous<T>(Bv, nchannels);
-
-        for (int y = roi.ybegin; y < roi.yend; ++y) {
-            T* r_row       = RoiRowPtr<T>(Rv, y, roi);
-            const T* a_row = RoiRowPtr<T>(Av, y, roi);
-            const T* b_row = RoiRowPtr<T>(Bv, y, roi);
-
-            if (contig) {
-                // Native integer saturated sub - much faster than float conversion!
-                size_t n = static_cast<size_t>(roi.width())
-                           * static_cast<size_t>(nchannels);
-                RunHwyBinaryNativeInt<T>(r_row, a_row, b_row, n,
-                                         [](auto d, auto a, auto b) {
-                                             return hn::SaturatedSub(a, b);
-                                         });
-            } else {
-                // Scalar fallback
-                for (int x = roi.xbegin; x < roi.xend; ++x) {
-                    T* r_ptr       = ChannelPtr<T>(Rv, x, y, roi.chbegin);
-                    const T* a_ptr = ChannelPtr<T>(Av, x, y, roi.chbegin);
-                    const T* b_ptr = ChannelPtr<T>(Bv, x, y, roi.chbegin);
-                    for (int c = 0; c < nchannels; ++c) {
-                        // Saturating sub in scalar
-                        if constexpr (std::is_unsigned_v<T>) {
-                            r_ptr[c] = (a_ptr[c] > b_ptr[c])
-                                           ? (a_ptr[c] - b_ptr[c])
-                                           : T(0);
-                        } else {
-                            int64_t diff = (int64_t)a_ptr[c]
-                                           - (int64_t)b_ptr[c];
-                            r_ptr[c] = (diff > std::numeric_limits<T>::max())
-                                           ? std::numeric_limits<T>::max()
-                                       : (diff < std::numeric_limits<T>::min())
-                                           ? std::numeric_limits<T>::min()
-                                           : (T)diff;
-                        }
-                    }
-                }
-            }
-        }
-    });
-    return true;
+    return hwy_binary_native_int_perpixel_op<T>(R, A, B, roi, nthreads,
+                                               [](auto /*d*/, auto a, auto b) {
+                                                   return hn::SaturatedSub(a, b);
+                                               });
 }
 
 template<class Rtype, class Atype, class Btype>
@@ -292,44 +177,10 @@ static bool
 sub_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
              int nthreads)
 {
-    auto Rv = HwyPixels(R);
-    auto Av = HwyPixels(A);
-    auto Bv = HwyPixels(B);
-    ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) {
-        const int nchannels = RoiNChannels(roi);
-        const bool contig   = ChannelsContiguous<Rtype>(Rv, nchannels)
-                            && ChannelsContiguous<Atype>(Av, nchannels)
-                            && ChannelsContiguous<Btype>(Bv, nchannels);
-
-        for (int y = roi.ybegin; y < roi.yend; ++y) {
-            Rtype* r_row       = RoiRowPtr<Rtype>(Rv, y, roi);
-            const Atype* a_row = RoiRowPtr<Atype>(Av, y, roi);
-            const Btype* b_row = RoiRowPtr<Btype>(Bv, y, roi);
-
-            if (contig) {
-                size_t n = static_cast<size_t>(roi.width())
-                           * static_cast<size_t>(nchannels);
-                RunHwyCmd<Rtype, Atype, Btype>(r_row, a_row, b_row, n,
-                                               [](auto d, auto a, auto b) {
-                                                   return hn::Sub(a, b);
-                                               });
-            } else {
-                for (int x = roi.xbegin; x < roi.xend; ++x) {
-                    Rtype* r_ptr = ChannelPtr<Rtype>(Rv, x, y, roi.chbegin);
-                    const Atype* a_ptr = ChannelPtr<Atype>(Av, x, y,
-                                                           roi.chbegin);
-                    const Btype* b_ptr = ChannelPtr<Btype>(Bv, x, y,
-                                                           roi.chbegin);
-                    for (int c = 0; c < nchannels; ++c) {
-                        r_ptr[c] = static_cast<Rtype>(
-                            static_cast<float>(a_ptr[c])
-                            - static_cast<float>(b_ptr[c]));
-                    }
-                }
-            }
-        }
-    });
-    return true;
+    return hwy_binary_perpixel_op<Rtype, Atype, Btype>(R, A, B, roi, nthreads,
+                                                      [](auto /*d*/, auto a, auto b) {
+                                                          return hn::Sub(a, b);
+                                                      });
 }
 #endif  // defined(OIIO_USE_HWY) && OIIO_USE_HWY
 
@@ -341,15 +192,24 @@ sub_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
 #if defined(OIIO_USE_HWY) && OIIO_USE_HWY
     if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels()
         && B.localpixels()) {
-        // Use native integer path for scale-invariant sub when all types match
-        // and are integer types (much faster: 6-12x vs 3-5x with float conversion)
-        constexpr bool all_same = std::is_same_v<Rtype, Atype>
-                                  && std::is_same_v<Atype, Btype>;
-        constexpr bool is_integer = std::is_integral_v<Rtype>;
-        if constexpr (all_same && is_integer) {
-            return sub_impl_hwy_native_int<Rtype>(R, A, B, roi, nthreads);
+        auto Rv = HwyPixels(R);
+        auto Av = HwyPixels(A);
+        auto Bv = HwyPixels(B);
+        const int nchannels = RoiNChannels(roi);
+        const bool contig   = ChannelsContiguous<Rtype>(Rv, nchannels)
+                            && ChannelsContiguous<Atype>(Av, nchannels)
+                            && ChannelsContiguous<Btype>(Bv, nchannels);
+        if (contig) {
+            // Use native integer path for scale-invariant sub when all types
+            // match and are integer types (much faster: 6-12x vs 3-5x with
+            // float conversion).
+            constexpr bool all_same = std::is_same_v<Rtype, Atype>
+                                      && std::is_same_v<Atype, Btype>;
+            constexpr bool is_integer = std::is_integral_v<Rtype>;
+            if constexpr (all_same && is_integer)
+                return sub_impl_hwy_native_int<Rtype>(R, A, B, roi, nthreads);
+            return sub_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
         }
-        return sub_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
     }
 #endif
     return sub_impl_scalar<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
diff --git a/src/libOpenImageIO/imagebufalgo_hwy_pvt.h b/src/libOpenImageIO/imagebufalgo_hwy_pvt.h
index fe4c9b0d8a..32069b2882 100644
--- a/src/libOpenImageIO/imagebufalgo_hwy_pvt.h
+++ b/src/libOpenImageIO/imagebufalgo_hwy_pvt.h
@@ -6,6 +6,7 @@
 
 #include <OpenImageIO/half.h>
 #include <OpenImageIO/imagebuf.h>
+#include <OpenImageIO/imagebufalgo_util.h>
 #include <OpenImageIO/imageio.h>
 #include <algorithm>
 #include <cstddef>
@@ -756,6 +757,90 @@ RunHwyTernaryCmd(Rtype* r, const ABCtype* a, const ABCtype* b, const ABCtype* c,
     }
 }
 
+// -----------------------------------------------------------------------
+// Per-pixel Ops (ImageBufAlgo, contiguous interleaved channels)
+// -----------------------------------------------------------------------
+
+/// Execute a binary per-pixel HWY operation for interleaved, contiguous
+/// channels. The caller is responsible for ensuring that the channel range is
+/// contiguous for R/A/B (i.e. no per-pixel padding, and the ROI channel range
+/// covers the full pixel).
+template<typename Rtype, typename Atype, typename Btype, typename OpFunc>
+inline bool
+hwy_binary_perpixel_op(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
+                       int nthreads, OpFunc op)
+{
+    auto Rv = HwyPixels(R);
+    auto Av = HwyPixels(A);
+    auto Bv = HwyPixels(B);
+    ImageBufAlgo::parallel_image(roi, nthreads, [&, op](ROI roi) {
+        const int nchannels = RoiNChannels(roi);
+        const size_t n = static_cast<size_t>(roi.width())
+                         * static_cast<size_t>(nchannels);
+        for (int y = roi.ybegin; y < roi.yend; ++y) {
+            Rtype* r_row       = RoiRowPtr<Rtype>(Rv, y, roi);
+            const Atype* a_row = RoiRowPtr<Atype>(Av, y, roi);
+            const Btype* b_row = RoiRowPtr<Btype>(Bv, y, roi);
+            RunHwyCmd<Rtype, Atype, Btype>(r_row, a_row, b_row, n, op);
+        }
+    });
+    return true;
+}
+
+/// Execute a ternary per-pixel HWY operation for interleaved, contiguous
+/// channels. The caller is responsible for ensuring that the channel range is
+/// contiguous for R/A/B/C (i.e. no per-pixel padding, and the ROI channel range
+/// covers the full pixel).
+template<typename Rtype, typename ABCtype, typename OpFunc>
+inline bool
+hwy_ternary_perpixel_op(ImageBuf& R, const ImageBuf& A, const ImageBuf& B,
+                        const ImageBuf& C, ROI roi, int nthreads, OpFunc op)
+{
+    auto Rv = HwyPixels(R);
+    auto Av = HwyPixels(A);
+    auto Bv = HwyPixels(B);
+    auto Cv = HwyPixels(C);
+    ImageBufAlgo::parallel_image(roi, nthreads, [&, op](ROI roi) {
+        const int nchannels = RoiNChannels(roi);
+        const size_t n = static_cast<size_t>(roi.width())
+                         * static_cast<size_t>(nchannels);
+        for (int y = roi.ybegin; y < roi.yend; ++y) {
+            Rtype* r_row         = RoiRowPtr<Rtype>(Rv, y, roi);
+            const ABCtype* a_row = RoiRowPtr<ABCtype>(Av, y, roi);
+            const ABCtype* b_row = RoiRowPtr<ABCtype>(Bv, y, roi);
+            const ABCtype* c_row = RoiRowPtr<ABCtype>(Cv, y, roi);
+            RunHwyTernaryCmd<Rtype, ABCtype>(r_row, a_row, b_row, c_row, n, op);
+        }
+    });
+    return true;
+}
+
+/// Execute a binary per-pixel HWY operation on native integer arrays (no type
+/// promotion/normalization). The caller is responsible for ensuring that the
+/// channel range is contiguous for R/A/B.
+template<typename T, typename OpFunc>
+inline bool
+hwy_binary_native_int_perpixel_op(ImageBuf& R, const ImageBuf& A,
+                                  const ImageBuf& B, ROI roi, int nthreads,
+                                  OpFunc op)
+{
+    auto Rv = HwyPixels(R);
+    auto Av = HwyPixels(A);
+    auto Bv = HwyPixels(B);
+    ImageBufAlgo::parallel_image(roi, nthreads, [&, op](ROI roi) {
+        const int nchannels = RoiNChannels(roi);
+        const size_t n = static_cast<size_t>(roi.width())
+                         * static_cast<size_t>(nchannels);
+        for (int y = roi.ybegin; y < roi.yend; ++y) {
+            T* r_row       = RoiRowPtr<T>(Rv, y, roi);
+            const T* a_row = RoiRowPtr<T>(Av, y, roi);
+            const T* b_row = RoiRowPtr<T>(Bv, y, roi);
+            RunHwyBinaryNativeInt<T>(r_row, a_row, b_row, n, op);
+        }
+    });
+    return true;
+}
+
 // -----------------------------------------------------------------------
 // Interleaved Channel Load/Store Helpers
 // -----------------------------------------------------------------------
diff --git a/src/libOpenImageIO/imagebufalgo_mad.cpp b/src/libOpenImageIO/imagebufalgo_mad.cpp
index d8038a74a7..04955471e0 100644
--- a/src/libOpenImageIO/imagebufalgo_mad.cpp
+++ b/src/libOpenImageIO/imagebufalgo_mad.cpp
@@ -52,52 +52,11 @@ static bool
 mad_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B,
              const ImageBuf& C, ROI roi, int nthreads)
 {
-    auto Rv = HwyPixels(R);
-    auto Av = HwyPixels(A);
-    auto Bv = HwyPixels(B);
-    auto Cv = HwyPixels(C);
-    ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) {
-        const int nchannels = RoiNChannels(roi);
-        const bool contig   = ChannelsContiguous<Rtype>(Rv, nchannels)
-                            && ChannelsContiguous<ABCtype>(Av, nchannels)
-                            && ChannelsContiguous<ABCtype>(Bv, nchannels)
-                            && ChannelsContiguous<ABCtype>(Cv, nchannels);
-
-        for (int y = roi.ybegin; y < roi.yend; ++y) {
-            Rtype* r_row         = RoiRowPtr<Rtype>(Rv, y, roi);
-            const ABCtype* a_row = RoiRowPtr<ABCtype>(Av, y, roi);
-            const ABCtype* b_row = RoiRowPtr<ABCtype>(Bv, y, roi);
-            const ABCtype* c_row = RoiRowPtr<ABCtype>(Cv, y, roi);
-
-            if (contig) {
-                size_t n = static_cast<size_t>(roi.width())
-                           * static_cast<size_t>(nchannels);
-                // Use Highway SIMD for a*b+c (fused multiply-add)
-                RunHwyTernaryCmd<Rtype, ABCtype>(r_row, a_row, b_row, c_row, n,
-                                                 [](auto d, auto a, auto b,
-                                                    auto c) {
-                                                     return hn::MulAdd(a, b, c);
-                                                 });
-            } else {
-                for (int x = roi.xbegin; x < roi.xend; ++x) {
-                    Rtype* r_ptr = ChannelPtr<Rtype>(Rv, x, y, roi.chbegin);
-                    const ABCtype* a_ptr = ChannelPtr<ABCtype>(Av, x, y,
-                                                               roi.chbegin);
-                    const ABCtype* b_ptr = ChannelPtr<ABCtype>(Bv, x, y,
-                                                               roi.chbegin);
-                    const ABCtype* c_ptr = ChannelPtr<ABCtype>(Cv, x, y,
-                                                               roi.chbegin);
-                    for (int ch = 0; ch < nchannels; ++ch) {
-                        r_ptr[ch] = static_cast<Rtype>(
-                            static_cast<float>(a_ptr[ch])
-                                * static_cast<float>(b_ptr[ch])
-                            + static_cast<float>(c_ptr[ch]));
-                    }
-                }
-            }
-        }
-    });
-    return true;
+    return hwy_ternary_perpixel_op<Rtype, ABCtype>(R, A, B, C, roi, nthreads,
+                                                   [](auto /*d*/, auto a, auto b,
+                                                      auto c) {
+                                                       return hn::MulAdd(a, b, c);
+                                                   });
 }
 #endif  // defined(OIIO_USE_HWY) && OIIO_USE_HWY
 
@@ -108,8 +67,19 @@ mad_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, const ImageBuf& C,
 {
 #if defined(OIIO_USE_HWY) && OIIO_USE_HWY
     if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels()
-        && B.localpixels() && C.localpixels())
-        return mad_impl_hwy<Rtype, ABCtype>(R, A, B, C, roi, nthreads);
+        && B.localpixels() && C.localpixels()) {
+        auto Rv = HwyPixels(R);
+        auto Av = HwyPixels(A);
+        auto Bv = HwyPixels(B);
+        auto Cv = HwyPixels(C);
+        const int nchannels = RoiNChannels(roi);
+        const bool contig   = ChannelsContiguous<Rtype>(Rv, nchannels)
+                            && ChannelsContiguous<ABCtype>(Av, nchannels)
+                            && ChannelsContiguous<ABCtype>(Bv, nchannels)
+                            && ChannelsContiguous<ABCtype>(Cv, nchannels);
+        if (contig)
+            return mad_impl_hwy<Rtype, ABCtype>(R, A, B, C, roi, nthreads);
+    }
 #endif
     return mad_impl_scalar<Rtype, ABCtype>(R, A, B, C, roi, nthreads);
 }
diff --git a/src/libOpenImageIO/imagebufalgo_muldiv.cpp b/src/libOpenImageIO/imagebufalgo_muldiv.cpp
index 45c166907a..bb27a05af5 100644
--- a/src/libOpenImageIO/imagebufalgo_muldiv.cpp
+++ b/src/libOpenImageIO/imagebufalgo_muldiv.cpp
@@ -131,44 +131,10 @@ static bool
 mul_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
              int nthreads)
 {
-    auto Rv = HwyPixels(R);
-    auto Av = HwyPixels(A);
-    auto Bv = HwyPixels(B);
-    ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) {
-        const int nchannels = RoiNChannels(roi);
-        const bool contig   = ChannelsContiguous<Rtype>(Rv, nchannels)
-                            && ChannelsContiguous<Atype>(Av, nchannels)
-                            && ChannelsContiguous<Btype>(Bv, nchannels);
-
-        for (int y = roi.ybegin; y < roi.yend; ++y) {
-            Rtype* r_row       = RoiRowPtr<Rtype>(Rv, y, roi);
-            const Atype* a_row = RoiRowPtr<Atype>(Av, y, roi);
-            const Btype* b_row = RoiRowPtr<Btype>(Bv, y, roi);
-
-            if (contig) {
-                size_t n = static_cast<size_t>(roi.width())
-                           * static_cast<size_t>(nchannels);
-                RunHwyCmd<Rtype, Atype, Btype>(r_row, a_row, b_row, n,
-                                               [](auto d, auto a, auto b) {
-                                                   return hn::Mul(a, b);
-                                               });
-            } else {
-                for (int x = roi.xbegin; x < roi.xend; ++x) {
-                    Rtype* r_ptr = ChannelPtr<Rtype>(Rv, x, y, roi.chbegin);
-                    const Atype* a_ptr = ChannelPtr<Atype>(Av, x, y,
-                                                           roi.chbegin);
-                    const Btype* b_ptr = ChannelPtr<Btype>(Bv, x, y,
-                                                           roi.chbegin);
-                    for (int c = 0; c < nchannels; ++c) {
-                        r_ptr[c] = static_cast<Rtype>(
-                            static_cast<float>(a_ptr[c])
-                            * static_cast<float>(b_ptr[c]));
-                    }
-                }
-            }
-        }
-    });
-    return true;
+    return hwy_binary_perpixel_op<Rtype, Atype, Btype>(R, A, B, roi, nthreads,
+                                                      [](auto /*d*/, auto a, auto b) {
+                                                          return hn::Mul(a, b);
+                                                      });
 }
 
 template<class Rtype, class Atype>
@@ -190,7 +156,6 @@ mul_impl_hwy(ImageBuf& R, const ImageBuf& A, cspan<float> b, ROI roi,
                     r_row + xoff * Rv.pixel_bytes);
                 const Atype* a_ptr = reinterpret_cast<const Atype*>(
                     a_row + xoff * Av.pixel_bytes);
-
                 for (int c = roi.chbegin; c < roi.chend; ++c) {
                     r_ptr[c] = (Rtype)((SimdType)a_ptr[c] * (SimdType)b[c]);
                 }
@@ -208,8 +173,17 @@ mul_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
 {
 #if defined(OIIO_USE_HWY) && OIIO_USE_HWY
     if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels()
-        && B.localpixels())
-        return mul_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
+        && B.localpixels()) {
+        auto Rv = HwyPixels(R);
+        auto Av = HwyPixels(A);
+        auto Bv = HwyPixels(B);
+        const int nchannels = RoiNChannels(roi);
+        const bool contig   = ChannelsContiguous<Rtype>(Rv, nchannels)
+                            && ChannelsContiguous<Atype>(Av, nchannels)
+                            && ChannelsContiguous<Btype>(Bv, nchannels);
+        if (contig)
+            return mul_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
+    }
 #endif
     return mul_impl_scalar<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
 }
@@ -330,49 +304,13 @@ static bool
 div_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
              int nthreads)
 {
-    auto Rv = HwyPixels(R);
-    auto Av = HwyPixels(A);
-    auto Bv = HwyPixels(B);
-    ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) {
-        const int nchannels = RoiNChannels(roi);
-        const bool contig   = ChannelsContiguous<Rtype>(Rv, nchannels)
-                            && ChannelsContiguous<Atype>(Av, nchannels)
-                            && ChannelsContiguous<Btype>(Bv, nchannels);
-
-        for (int y = roi.ybegin; y < roi.yend; ++y) {
-            Rtype* r_row       = RoiRowPtr<Rtype>(Rv, y, roi);
-            const Atype* a_row = RoiRowPtr<Atype>(Av, y, roi);
-            const Btype* b_row = RoiRowPtr<Btype>(Bv, y, roi);
-
-            if (contig) {
-                size_t n = static_cast<size_t>(roi.width())
-                           * static_cast<size_t>(nchannels);
-                RunHwyCmd<Rtype, Atype, Btype>(
-                    r_row, a_row, b_row, n, [](auto d, auto a, auto b) {
-                        // Check for zero division: if b == 0, return 0
-                        auto zero = hn::Zero(d);
-                        auto mask = hn::Eq(b, zero);
-                        return hn::IfThenElse(mask, zero, hn::Div(a, b));
-                    });
-            } else {
-                for (int x = roi.xbegin; x < roi.xend; ++x) {
-                    Rtype* r_ptr = ChannelPtr<Rtype>(Rv, x, y, roi.chbegin);
-                    const Atype* a_ptr = ChannelPtr<Atype>(Av, x, y,
-                                                           roi.chbegin);
-                    const Btype* b_ptr = ChannelPtr<Btype>(Bv, x, y,
-                                                           roi.chbegin);
-                    for (int c = 0; c < nchannels; ++c) {
-                        float v  = static_cast<float>(b_ptr[c]);
-                        r_ptr[c] = (v == 0.0f)
-                                       ? static_cast<Rtype>(0.0f)
-                                       : static_cast<Rtype>(
-                                             static_cast<float>(a_ptr[c]) / v);
-                    }
-                }
-            }
-        }
-    });
-    return true;
+    return hwy_binary_perpixel_op<Rtype, Atype, Btype>(R, A, B, roi, nthreads,
+                                                      [](auto d, auto a, auto b) {
+                                                          auto zero = hn::Zero(d);
+                                                          auto mask = hn::Eq(b, zero);
+                                                          return hn::IfThenElse(mask, zero,
+                                                                                hn::Div(a, b));
+                                                      });
 }
 #endif  // defined(OIIO_USE_HWY) && OIIO_USE_HWY
 
@@ -383,8 +321,17 @@ div_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
 {
 #if defined(OIIO_USE_HWY) && OIIO_USE_HWY
     if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels()
-        && B.localpixels())
-        return div_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
+        && B.localpixels()) {
+        auto Rv = HwyPixels(R);
+        auto Av = HwyPixels(A);
+        auto Bv = HwyPixels(B);
+        const int nchannels = RoiNChannels(roi);
+        const bool contig   = ChannelsContiguous<Rtype>(Rv, nchannels)
+                            && ChannelsContiguous<Atype>(Av, nchannels)
+                            && ChannelsContiguous<Btype>(Bv, nchannels);
+        if (contig)
+            return div_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
+    }
 #endif
     return div_impl_scalar<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
 }
diff --git a/src/libOpenImageIO/imagebufalgo_test.cpp b/src/libOpenImageIO/imagebufalgo_test.cpp
index 940e2a8ff5..03900983c8 100644
--- a/src/libOpenImageIO/imagebufalgo_test.cpp
+++ b/src/libOpenImageIO/imagebufalgo_test.cpp
@@ -511,6 +511,76 @@ test_mad()
 
 
+void
+test_hwy_strided_roi_fallback()
+{
+#if defined(OIIO_USE_HWY) && OIIO_USE_HWY
+    std::cout << "test hwy strided roi fallback\n";
+
+    int prev_enable_hwy = 0;
+    OIIO::getattribute("enable_hwy", prev_enable_hwy);
+
+    ImageSpec spec(64, 64, 4, TypeDesc::UINT8);
+    ImageBuf A(spec), B(spec), C(spec);
+    ImageBufAlgo::fill(A, { 0.2f, 0.4f, 0.6f, 0.8f });
+    ImageBufAlgo::fill(B, { 0.1f, 0.3f, 0.5f, 0.7f });
+    ImageBufAlgo::fill(C, { 0.05f, 0.05f, 0.05f, 0.05f });
+
+    ROI roi = get_roi(A.spec());
+    roi.chbegin = 0;
+    roi.chend   = 3;  // RGB only => non-contiguous for RGBA interleaving
+
+    {
+        ImageBuf R0(spec), R1(spec);
+        OIIO::attribute("enable_hwy", 0);
+        ImageBufAlgo::add(R0, A, B, roi);
+        OIIO::attribute("enable_hwy", 1);
+        ImageBufAlgo::add(R1, A, B, roi);
+        auto comp = ImageBufAlgo::compare(R0, R1, 0.0f, 0.0f, roi);
+        OIIO_CHECK_EQUAL(comp.maxerror, 0.0f);
+    }
+    {
+        ImageBuf R0(spec), R1(spec);
+        OIIO::attribute("enable_hwy", 0);
+        ImageBufAlgo::sub(R0, A, B, roi);
+        OIIO::attribute("enable_hwy", 1);
+        ImageBufAlgo::sub(R1, A, B, roi);
+        auto comp = ImageBufAlgo::compare(R0, R1, 0.0f, 0.0f, roi);
+        OIIO_CHECK_EQUAL(comp.maxerror, 0.0f);
+    }
+    {
+        ImageBuf R0(spec), R1(spec);
+        OIIO::attribute("enable_hwy", 0);
+        ImageBufAlgo::mul(R0, A, B, roi);
+        OIIO::attribute("enable_hwy", 1);
+        ImageBufAlgo::mul(R1, A, B, roi);
+        auto comp = ImageBufAlgo::compare(R0, R1, 0.0f, 0.0f, roi);
+        OIIO_CHECK_EQUAL(comp.maxerror, 0.0f);
+    }
+    {
+        ImageBuf R0(spec), R1(spec);
+        OIIO::attribute("enable_hwy", 0);
+        ImageBufAlgo::div(R0, A, B, roi);
+        OIIO::attribute("enable_hwy", 1);
+        ImageBufAlgo::div(R1, A, B, roi);
+        auto comp = ImageBufAlgo::compare(R0, R1, 0.0f, 0.0f, roi);
+        OIIO_CHECK_EQUAL(comp.maxerror, 0.0f);
+    }
+    {
+        ImageBuf R0(spec), R1(spec);
+        OIIO::attribute("enable_hwy", 0);
+        ImageBufAlgo::mad(R0, A, B, C, roi);
+        OIIO::attribute("enable_hwy", 1);
+        ImageBufAlgo::mad(R1, A, B, C, roi);
+        auto comp = ImageBufAlgo::compare(R0, R1, 0.0f, 0.0f, roi);
+        OIIO_CHECK_EQUAL(comp.maxerror, 0.0f);
+    }
+
+    OIIO::attribute("enable_hwy", prev_enable_hwy);
+#endif
+}
+
+
 // Tests ImageBufAlgo::min
 void
 test_min()
@@ -1576,6 +1646,7 @@ main(int argc, char** argv)
     test_sub();
     test_mul();
     test_mad();
+    test_hwy_strided_roi_fallback();
     test_min();
     test_max();
     test_over(TypeFloat);

From b10eb4d35e0260b50d782546b29f8115931ef96b Mon Sep 17 00:00:00 2001
From: "Vlad (Kuzmin) Erium" <libalias@gmail.com>
Date: Mon, 23 Feb 2026 13:59:18 +0900
Subject: [PATCH 11/70] Optimize SIMD for RGB ROI on RGBA images

Add specialized HWY fast paths for add/sub/mul/div/mad that handle the
common case where the ROI selects the RGB channels of a 4-channel (RGBA)
image. These paths process the full 4-channel interleaved data and write
the destination's alpha back unchanged (bitwise).

Each operator gets a small op lambda. The fast path is taken only for
same-type float/half/double buffers whose 4 channels are contiguous; it
uses promote/demote for half and keeps the zero-divisor guard for div.

Also update the tests to pre-fill the destination buffers and to compare
the results without an ROI (the ROI argument was removed from the compare
call) to validate the strided-ROI fallback behavior.

Affects imagebufalgo_addsub.cpp, imagebufalgo_mad.cpp,
imagebufalgo_muldiv.cpp and imagebufalgo_test.cpp.

Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 src/libOpenImageIO/imagebufalgo_addsub.cpp | 294 ++++++++++++++++++-
 src/libOpenImageIO/imagebufalgo_mad.cpp    | 169 ++++++++++-
 src/libOpenImageIO/imagebufalgo_muldiv.cpp | 314 ++++++++++++++++++++-
 src/libOpenImageIO/imagebufalgo_test.cpp   |  20 +-
 4 files changed, 773 insertions(+), 24 deletions(-)

diff --git a/src/libOpenImageIO/imagebufalgo_addsub.cpp b/src/libOpenImageIO/imagebufalgo_addsub.cpp
index 6f1b679213..bea1d8259b 100644
--- a/src/libOpenImageIO/imagebufalgo_addsub.cpp
+++ b/src/libOpenImageIO/imagebufalgo_addsub.cpp
@@ -82,10 +82,132 @@ static bool
 add_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
              int nthreads)
 {
+    auto op = [](auto /*d*/, auto a, auto b) {
+        return hn::Add(a, b);
+    };
+
+    // Special-case: RGBA images but ROI is RGB (strided channel subset). We
+    // still can SIMD the RGB channels by processing full RGBA and preserving
+    // alpha exactly (bitwise) from the destination.
+    if (roi.chbegin == 0 && roi.chend == 3) {
+        // Only support same-type float/half/double in this fast path.
+        constexpr bool floaty = (std::is_same_v<Rtype, float>
+                                 || std::is_same_v<Rtype, double>
+                                 || std::is_same_v<Rtype, half>)
+                                && std::is_same_v<Rtype, Atype>
+                                && std::is_same_v<Rtype, Btype>;
+        if constexpr (floaty) {
+            auto Rv = HwyPixels(R);
+            auto Av = HwyPixels(A);
+            auto Bv = HwyPixels(B);
+            if (Rv.nchannels >= 4 && Av.nchannels >= 4 && Bv.nchannels >= 4
+                && ChannelsContiguous<Rtype>(Rv, 4)
+                && ChannelsContiguous<Atype>(Av, 4)
+                && ChannelsContiguous<Btype>(Bv, 4)) {
+                ROI roi4     = roi;
+                roi4.chbegin = 0;
+                roi4.chend   = 4;
+                using MathT  = typename SimdMathType<Rtype>::type;
+                const hn::ScalableTag<MathT> d;
+                const size_t lanes = hn::Lanes(d);
+                ImageBufAlgo::parallel_image(roi4, nthreads, [&](ROI roi4) {
+                    for (int y = roi4.ybegin; y < roi4.yend; ++y) {
+                        Rtype* r_row       = RoiRowPtr<Rtype>(Rv, y, roi4);
+                        const Atype* a_row = RoiRowPtr<Atype>(Av, y, roi4);
+                        const Btype* b_row = RoiRowPtr<Btype>(Bv, y, roi4);
+                        const size_t npixels = static_cast<size_t>(roi4.width());
+
+                        size_t x = 0;
+                        for (; x + lanes <= npixels; x += lanes) {
+                            const size_t off = x * 4;
+                            if constexpr (std::is_same_v<Rtype, half>) {
+                                using T16  = hwy::float16_t;
+                                auto d16   = hn::Rebind<T16, decltype(d)>();
+                                const T16* a16
+                                    = reinterpret_cast<const T16*>(a_row + off);
+                                const T16* b16
+                                    = reinterpret_cast<const T16*>(b_row + off);
+                                T16* r16 = reinterpret_cast<T16*>(r_row + off);
+
+                                hn::Vec<decltype(d16)> ar16, ag16, ab16, aa16;
+                                hn::Vec<decltype(d16)> br16, bg16, bb16, ba16;
+                                hn::Vec<decltype(d16)> dr16, dg16, db16, da16;
+                                hn::LoadInterleaved4(d16, a16, ar16, ag16, ab16,
+                                                     aa16);
+                                hn::LoadInterleaved4(d16, b16, br16, bg16, bb16,
+                                                     ba16);
+                                hn::LoadInterleaved4(d16, r16, dr16, dg16, db16,
+                                                     da16);
+                                (void)aa16;
+                                (void)ba16;
+                                (void)dr16;
+                                (void)dg16;
+                                (void)db16;
+
+                                auto rr = op(d, hn::PromoteTo(d, ar16),
+                                             hn::PromoteTo(d, br16));
+                                auto rg = op(d, hn::PromoteTo(d, ag16),
+                                             hn::PromoteTo(d, bg16));
+                                auto rb = op(d, hn::PromoteTo(d, ab16),
+                                             hn::PromoteTo(d, bb16));
+
+                                auto rr16 = hn::DemoteTo(d16, rr);
+                                auto rg16 = hn::DemoteTo(d16, rg);
+                                auto rb16 = hn::DemoteTo(d16, rb);
+                                hn::StoreInterleaved4(rr16, rg16, rb16, da16, d16,
+                                                      r16);
+                            } else {
+                                hn::Vec<decltype(d)> ar, ag, ab, aa;
+                                hn::Vec<decltype(d)> br, bg, bb, ba;
+                                hn::Vec<decltype(d)> dr, dg, db, da;
+                                hn::LoadInterleaved4(d, a_row + off, ar, ag, ab,
+                                                     aa);
+                                hn::LoadInterleaved4(d, b_row + off, br, bg, bb,
+                                                     ba);
+                                hn::LoadInterleaved4(d, r_row + off, dr, dg, db,
+                                                     da);
+                                (void)aa;
+                                (void)ba;
+                                (void)dr;
+                                (void)dg;
+                                (void)db;
+
+                                auto rr = op(d, ar, br);
+                                auto rg = op(d, ag, bg);
+                                auto rb = op(d, ab, bb);
+                                hn::StoreInterleaved4(rr, rg, rb, da, d,
+                                                      r_row + off);
+                            }
+                        }
+
+                        for (; x < npixels; ++x) {
+                            const size_t off = x * 4;
+                            if constexpr (std::is_same_v<Rtype, half>) {
+                                r_row[off + 0]
+                                    = half((float)a_row[off + 0]
+                                           + (float)b_row[off + 0]);
+                                r_row[off + 1]
+                                    = half((float)a_row[off + 1]
+                                           + (float)b_row[off + 1]);
+                                r_row[off + 2]
+                                    = half((float)a_row[off + 2]
+                                           + (float)b_row[off + 2]);
+                            } else {
+                                r_row[off + 0] = a_row[off + 0] + b_row[off + 0];
+                                r_row[off + 1] = a_row[off + 1] + b_row[off + 1];
+                                r_row[off + 2] = a_row[off + 2] + b_row[off + 2];
+                            }
+                            // Preserve alpha (off+3).
+                        }
+                    }
+                });
+                return true;
+            }
+        }
+    }
+
     return hwy_binary_perpixel_op<Rtype, Atype, Btype>(R, A, B, roi, nthreads,
-                                                      [](auto /*d*/, auto a, auto b) {
-                                                          return hn::Add(a, b);
-                                                      });
+                                                       op);
 }
 
 template<class Rtype, class Atype>
@@ -143,6 +265,25 @@ add_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
                 return add_impl_hwy_native_int<Rtype>(R, A, B, roi, nthreads);
             return add_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
         }
+
+        // Handle the common RGBA + RGB ROI strided case (preserving alpha).
+        constexpr bool floaty_strided = (std::is_same_v<Rtype, float>
+                                         || std::is_same_v<Rtype, double>
+                                         || std::is_same_v<Rtype, half>)
+                                        && std::is_same_v<Rtype, Atype>
+                                        && std::is_same_v<Rtype, Btype>;
+        if constexpr (floaty_strided) {
+            if (roi.chbegin == 0 && roi.chend == 3) {
+                const bool contig4 = (Rv.nchannels >= 4 && Av.nchannels >= 4
+                                      && Bv.nchannels >= 4)
+                                     && ChannelsContiguous<Rtype>(Rv, 4)
+                                     && ChannelsContiguous<Atype>(Av, 4)
+                                     && ChannelsContiguous<Btype>(Bv, 4);
+                if (contig4)
+                    return add_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi,
+                                                             nthreads);
+            }
+        }
     }
 #endif
     return add_impl_scalar<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
@@ -177,10 +318,132 @@ static bool
 sub_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
              int nthreads)
 {
+    auto op = [](auto /*d*/, auto a, auto b) {
+        return hn::Sub(a, b);
+    };
+
+    // Special-case: RGBA images but ROI is RGB (strided channel subset). We
+    // still can SIMD the RGB channels by processing full RGBA and preserving
+    // alpha exactly (bitwise) from the destination.
+    if (roi.chbegin == 0 && roi.chend == 3) {
+        // Only support same-type float/half/double in this fast path.
+        constexpr bool floaty = (std::is_same_v<Rtype, float>
+                                 || std::is_same_v<Rtype, double>
+                                 || std::is_same_v<Rtype, half>)
+                                && std::is_same_v<Rtype, Atype>
+                                && std::is_same_v<Rtype, Btype>;
+        if constexpr (floaty) {
+            auto Rv = HwyPixels(R);
+            auto Av = HwyPixels(A);
+            auto Bv = HwyPixels(B);
+            if (Rv.nchannels >= 4 && Av.nchannels >= 4 && Bv.nchannels >= 4
+                && ChannelsContiguous<Rtype>(Rv, 4)
+                && ChannelsContiguous<Atype>(Av, 4)
+                && ChannelsContiguous<Btype>(Bv, 4)) {
+                ROI roi4     = roi;
+                roi4.chbegin = 0;
+                roi4.chend   = 4;
+                using MathT  = typename SimdMathType<Rtype>::type;
+                const hn::ScalableTag<MathT> d;
+                const size_t lanes = hn::Lanes(d);
+                ImageBufAlgo::parallel_image(roi4, nthreads, [&](ROI roi4) {
+                    for (int y = roi4.ybegin; y < roi4.yend; ++y) {
+                        Rtype* r_row       = RoiRowPtr<Rtype>(Rv, y, roi4);
+                        const Atype* a_row = RoiRowPtr<Atype>(Av, y, roi4);
+                        const Btype* b_row = RoiRowPtr<Btype>(Bv, y, roi4);
+                        const size_t npixels = static_cast<size_t>(roi4.width());
+
+                        size_t x = 0;
+                        for (; x + lanes <= npixels; x += lanes) {
+                            const size_t off = x * 4;
+                            if constexpr (std::is_same_v<Rtype, half>) {
+                                using T16  = hwy::float16_t;
+                                auto d16   = hn::Rebind<T16, decltype(d)>();
+                                const T16* a16
+                                    = reinterpret_cast<const T16*>(a_row + off);
+                                const T16* b16
+                                    = reinterpret_cast<const T16*>(b_row + off);
+                                T16* r16 = reinterpret_cast<T16*>(r_row + off);
+
+                                hn::Vec<decltype(d16)> ar16, ag16, ab16, aa16;
+                                hn::Vec<decltype(d16)> br16, bg16, bb16, ba16;
+                                hn::Vec<decltype(d16)> dr16, dg16, db16, da16;
+                                hn::LoadInterleaved4(d16, a16, ar16, ag16, ab16,
+                                                     aa16);
+                                hn::LoadInterleaved4(d16, b16, br16, bg16, bb16,
+                                                     ba16);
+                                hn::LoadInterleaved4(d16, r16, dr16, dg16, db16,
+                                                     da16);
+                                (void)aa16;
+                                (void)ba16;
+                                (void)dr16;
+                                (void)dg16;
+                                (void)db16;
+
+                                auto rr = op(d, hn::PromoteTo(d, ar16),
+                                             hn::PromoteTo(d, br16));
+                                auto rg = op(d, hn::PromoteTo(d, ag16),
+                                             hn::PromoteTo(d, bg16));
+                                auto rb = op(d, hn::PromoteTo(d, ab16),
+                                             hn::PromoteTo(d, bb16));
+
+                                auto rr16 = hn::DemoteTo(d16, rr);
+                                auto rg16 = hn::DemoteTo(d16, rg);
+                                auto rb16 = hn::DemoteTo(d16, rb);
+                                hn::StoreInterleaved4(rr16, rg16, rb16, da16, d16,
+                                                      r16);
+                            } else {
+                                hn::Vec<decltype(d)> ar, ag, ab, aa;
+                                hn::Vec<decltype(d)> br, bg, bb, ba;
+                                hn::Vec<decltype(d)> dr, dg, db, da;
+                                hn::LoadInterleaved4(d, a_row + off, ar, ag, ab,
+                                                     aa);
+                                hn::LoadInterleaved4(d, b_row + off, br, bg, bb,
+                                                     ba);
+                                hn::LoadInterleaved4(d, r_row + off, dr, dg, db,
+                                                     da);
+                                (void)aa;
+                                (void)ba;
+                                (void)dr;
+                                (void)dg;
+                                (void)db;
+
+                                auto rr = op(d, ar, br);
+                                auto rg = op(d, ag, bg);
+                                auto rb = op(d, ab, bb);
+                                hn::StoreInterleaved4(rr, rg, rb, da, d,
+                                                      r_row + off);
+                            }
+                        }
+
+                        for (; x < npixels; ++x) {
+                            const size_t off = x * 4;
+                            if constexpr (std::is_same_v<Rtype, half>) {
+                                r_row[off + 0]
+                                    = half((float)a_row[off + 0]
+                                           - (float)b_row[off + 0]);
+                                r_row[off + 1]
+                                    = half((float)a_row[off + 1]
+                                           - (float)b_row[off + 1]);
+                                r_row[off + 2]
+                                    = half((float)a_row[off + 2]
+                                           - (float)b_row[off + 2]);
+                            } else {
+                                r_row[off + 0] = a_row[off + 0] - b_row[off + 0];
+                                r_row[off + 1] = a_row[off + 1] - b_row[off + 1];
+                                r_row[off + 2] = a_row[off + 2] - b_row[off + 2];
+                            }
+                            // Preserve alpha (off+3).
+                        }
+                    }
+                });
+                return true;
+            }
+        }
+    }
+
     return hwy_binary_perpixel_op<Rtype, Atype, Btype>(R, A, B, roi, nthreads,
-                                                      [](auto /*d*/, auto a, auto b) {
-                                                          return hn::Sub(a, b);
-                                                      });
+                                                       op);
 }
 #endif  // defined(OIIO_USE_HWY) && OIIO_USE_HWY
 
@@ -210,6 +473,25 @@ sub_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
                 return sub_impl_hwy_native_int<Rtype>(R, A, B, roi, nthreads);
             return sub_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
         }
+
+        // Handle the common RGBA + RGB ROI strided case (preserving alpha).
+        constexpr bool floaty_strided = (std::is_same_v<Rtype, float>
+                                         || std::is_same_v<Rtype, double>
+                                         || std::is_same_v<Rtype, half>)
+                                        && std::is_same_v<Rtype, Atype>
+                                        && std::is_same_v<Rtype, Btype>;
+        if constexpr (floaty_strided) {
+            if (roi.chbegin == 0 && roi.chend == 3) {
+                const bool contig4 = (Rv.nchannels >= 4 && Av.nchannels >= 4
+                                      && Bv.nchannels >= 4)
+                                     && ChannelsContiguous<Rtype>(Rv, 4)
+                                     && ChannelsContiguous<Atype>(Av, 4)
+                                     && ChannelsContiguous<Btype>(Bv, 4);
+                if (contig4)
+                    return sub_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi,
+                                                             nthreads);
+            }
+        }
     }
 #endif
     return sub_impl_scalar<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
diff --git a/src/libOpenImageIO/imagebufalgo_mad.cpp b/src/libOpenImageIO/imagebufalgo_mad.cpp
index 04955471e0..f8f3e19ddf 100644
--- a/src/libOpenImageIO/imagebufalgo_mad.cpp
+++ b/src/libOpenImageIO/imagebufalgo_mad.cpp
@@ -52,11 +52,153 @@ static bool
 mad_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B,
              const ImageBuf& C, ROI roi, int nthreads)
 {
+    auto op = [](auto /*d*/, auto a, auto b, auto c) {
+        return hn::MulAdd(a, b, c);
+    };
+
+    // Special-case: RGBA images but ROI is RGB (strided channel subset). We
+    // still can SIMD the RGB channels by processing full RGBA and preserving
+    // alpha exactly (bitwise) from the destination.
+    if (roi.chbegin == 0 && roi.chend == 3) {
+        // Only support same-type float/half/double in this fast path.
+        constexpr bool floaty = (std::is_same_v<Rtype, float>
+                                 || std::is_same_v<Rtype, double>
+                                 || std::is_same_v<Rtype, half>)
+                                && std::is_same_v<Rtype, ABCtype>;
+        if constexpr (floaty) {
+            auto Rv = HwyPixels(R);
+            auto Av = HwyPixels(A);
+            auto Bv = HwyPixels(B);
+            auto Cv = HwyPixels(C);
+            if (Rv.nchannels >= 4 && Av.nchannels >= 4 && Bv.nchannels >= 4
+                && Cv.nchannels >= 4 && ChannelsContiguous<Rtype>(Rv, 4)
+                && ChannelsContiguous<ABCtype>(Av, 4)
+                && ChannelsContiguous<ABCtype>(Bv, 4)
+                && ChannelsContiguous<ABCtype>(Cv, 4)) {
+                ROI roi4     = roi;
+                roi4.chbegin = 0;
+                roi4.chend   = 4;
+                using MathT  = typename SimdMathType<Rtype>::type;
+                const hn::ScalableTag<MathT> d;
+                const size_t lanes = hn::Lanes(d);
+                ImageBufAlgo::parallel_image(roi4, nthreads, [&](ROI roi4) {
+                    for (int y = roi4.ybegin; y < roi4.yend; ++y) {
+                        Rtype* r_row         = RoiRowPtr<Rtype>(Rv, y, roi4);
+                        const ABCtype* a_row = RoiRowPtr<ABCtype>(Av, y, roi4);
+                        const ABCtype* b_row = RoiRowPtr<ABCtype>(Bv, y, roi4);
+                        const ABCtype* c_row = RoiRowPtr<ABCtype>(Cv, y, roi4);
+                        const size_t npixels = static_cast<size_t>(roi4.width());
+
+                        size_t x = 0;
+                        for (; x + lanes <= npixels; x += lanes) {
+                            const size_t off = x * 4;
+                            if constexpr (std::is_same_v<Rtype, half>) {
+                                using T16  = hwy::float16_t;
+                                auto d16   = hn::Rebind<T16, decltype(d)>();
+                                const T16* a16
+                                    = reinterpret_cast<const T16*>(a_row + off);
+                                const T16* b16
+                                    = reinterpret_cast<const T16*>(b_row + off);
+                                const T16* c16
+                                    = reinterpret_cast<const T16*>(c_row + off);
+                                T16* r16 = reinterpret_cast<T16*>(r_row + off);
+
+                                hn::Vec<decltype(d16)> ar16, ag16, ab16, aa16;
+                                hn::Vec<decltype(d16)> br16, bg16, bb16, ba16;
+                                hn::Vec<decltype(d16)> cr16, cg16, cb16, ca16;
+                                hn::Vec<decltype(d16)> dr16, dg16, db16, da16;
+                                hn::LoadInterleaved4(d16, a16, ar16, ag16, ab16,
+                                                     aa16);
+                                hn::LoadInterleaved4(d16, b16, br16, bg16, bb16,
+                                                     ba16);
+                                hn::LoadInterleaved4(d16, c16, cr16, cg16, cb16,
+                                                     ca16);
+                                hn::LoadInterleaved4(d16, r16, dr16, dg16, db16,
+                                                     da16);
+                                (void)aa16;
+                                (void)ba16;
+                                (void)ca16;
+                                (void)dr16;
+                                (void)dg16;
+                                (void)db16;
+
+                                auto rr = op(d, hn::PromoteTo(d, ar16),
+                                             hn::PromoteTo(d, br16),
+                                             hn::PromoteTo(d, cr16));
+                                auto rg = op(d, hn::PromoteTo(d, ag16),
+                                             hn::PromoteTo(d, bg16),
+                                             hn::PromoteTo(d, cg16));
+                                auto rb = op(d, hn::PromoteTo(d, ab16),
+                                             hn::PromoteTo(d, bb16),
+                                             hn::PromoteTo(d, cb16));
+
+                                auto rr16 = hn::DemoteTo(d16, rr);
+                                auto rg16 = hn::DemoteTo(d16, rg);
+                                auto rb16 = hn::DemoteTo(d16, rb);
+                                hn::StoreInterleaved4(rr16, rg16, rb16, da16, d16,
+                                                      r16);
+                            } else {
+                                hn::Vec<decltype(d)> ar, ag, ab, aa;
+                                hn::Vec<decltype(d)> br, bg, bb, ba;
+                                hn::Vec<decltype(d)> cr, cg, cb, ca;
+                                hn::Vec<decltype(d)> dr, dg, db, da;
+                                hn::LoadInterleaved4(d, a_row + off, ar, ag, ab,
+                                                     aa);
+                                hn::LoadInterleaved4(d, b_row + off, br, bg, bb,
+                                                     ba);
+                                hn::LoadInterleaved4(d, c_row + off, cr, cg, cb,
+                                                     ca);
+                                hn::LoadInterleaved4(d, r_row + off, dr, dg, db,
+                                                     da);
+                                (void)aa;
+                                (void)ba;
+                                (void)ca;
+                                (void)dr;
+                                (void)dg;
+                                (void)db;
+
+                                auto rr = op(d, ar, br, cr);
+                                auto rg = op(d, ag, bg, cg);
+                                auto rb = op(d, ab, bb, cb);
+                                hn::StoreInterleaved4(rr, rg, rb, da, d,
+                                                      r_row + off);
+                            }
+                        }
+
+                        for (; x < npixels; ++x) {
+                            const size_t off = x * 4;
+                            if constexpr (std::is_same_v<Rtype, half>) {
+                                r_row[off + 0]
+                                    = half((float)a_row[off + 0]
+                                           * (float)b_row[off + 0]
+                                           + (float)c_row[off + 0]);
+                                r_row[off + 1]
+                                    = half((float)a_row[off + 1]
+                                           * (float)b_row[off + 1]
+                                           + (float)c_row[off + 1]);
+                                r_row[off + 2]
+                                    = half((float)a_row[off + 2]
+                                           * (float)b_row[off + 2]
+                                           + (float)c_row[off + 2]);
+                            } else {
+                                r_row[off + 0] = a_row[off + 0] * b_row[off + 0]
+                                                 + c_row[off + 0];
+                                r_row[off + 1] = a_row[off + 1] * b_row[off + 1]
+                                                 + c_row[off + 1];
+                                r_row[off + 2] = a_row[off + 2] * b_row[off + 2]
+                                                 + c_row[off + 2];
+                            }
+                            // Preserve alpha (off+3).
+                        }
+                    }
+                });
+                return true;
+            }
+        }
+    }
+
     return hwy_ternary_perpixel_op<Rtype, ABCtype>(R, A, B, C, roi, nthreads,
-                                                   [](auto /*d*/, auto a, auto b,
-                                                      auto c) {
-                                                       return hn::MulAdd(a, b, c);
-                                                   });
+                                                   op);
 }
 #endif  // defined(OIIO_USE_HWY) && OIIO_USE_HWY
 
@@ -79,6 +221,25 @@ mad_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, const ImageBuf& C,
                             && ChannelsContiguous<ABCtype>(Cv, nchannels);
         if (contig)
             return mad_impl_hwy<Rtype, ABCtype>(R, A, B, C, roi, nthreads);
+
+        // Handle the common RGBA + RGB ROI strided case (preserving alpha).
+        constexpr bool floaty_strided = (std::is_same_v<Rtype, float>
+                                         || std::is_same_v<Rtype, double>
+                                         || std::is_same_v<Rtype, half>)
+                                        && std::is_same_v<Rtype, ABCtype>;
+        if constexpr (floaty_strided) {
+            if (roi.chbegin == 0 && roi.chend == 3) {
+                const bool contig4 = (Rv.nchannels >= 4 && Av.nchannels >= 4
+                                      && Bv.nchannels >= 4 && Cv.nchannels >= 4)
+                                     && ChannelsContiguous<Rtype>(Rv, 4)
+                                     && ChannelsContiguous<ABCtype>(Av, 4)
+                                     && ChannelsContiguous<ABCtype>(Bv, 4)
+                                     && ChannelsContiguous<ABCtype>(Cv, 4);
+                if (contig4)
+                    return mad_impl_hwy<Rtype, ABCtype>(R, A, B, C, roi,
+                                                        nthreads);
+            }
+        }
     }
 #endif
     return mad_impl_scalar<Rtype, ABCtype>(R, A, B, C, roi, nthreads);
diff --git a/src/libOpenImageIO/imagebufalgo_muldiv.cpp b/src/libOpenImageIO/imagebufalgo_muldiv.cpp
index bb27a05af5..3d355cf620 100644
--- a/src/libOpenImageIO/imagebufalgo_muldiv.cpp
+++ b/src/libOpenImageIO/imagebufalgo_muldiv.cpp
@@ -131,10 +131,132 @@ static bool
 mul_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
              int nthreads)
 {
+    auto op = [](auto /*d*/, auto a, auto b) {
+        return hn::Mul(a, b);
+    };
+
+    // Special-case: RGBA images but ROI is RGB (strided channel subset). We
+    // still can SIMD the RGB channels by processing full RGBA and preserving
+    // alpha exactly (bitwise) from the destination.
+    if (roi.chbegin == 0 && roi.chend == 3) {
+        // Only support same-type float/half/double in this fast path.
+        constexpr bool floaty = (std::is_same_v<Rtype, float>
+                                 || std::is_same_v<Rtype, double>
+                                 || std::is_same_v<Rtype, half>)
+                                && std::is_same_v<Rtype, Atype>
+                                && std::is_same_v<Rtype, Btype>;
+        if constexpr (floaty) {
+            auto Rv = HwyPixels(R);
+            auto Av = HwyPixels(A);
+            auto Bv = HwyPixels(B);
+            if (Rv.nchannels >= 4 && Av.nchannels >= 4 && Bv.nchannels >= 4
+                && ChannelsContiguous<Rtype>(Rv, 4)
+                && ChannelsContiguous<Atype>(Av, 4)
+                && ChannelsContiguous<Btype>(Bv, 4)) {
+                ROI roi4     = roi;
+                roi4.chbegin = 0;
+                roi4.chend   = 4;
+                using MathT  = typename SimdMathType<Rtype>::type;
+                const hn::ScalableTag<MathT> d;
+                const size_t lanes = hn::Lanes(d);
+                ImageBufAlgo::parallel_image(roi4, nthreads, [&](ROI roi4) {
+                    for (int y = roi4.ybegin; y < roi4.yend; ++y) {
+                        Rtype* r_row       = RoiRowPtr<Rtype>(Rv, y, roi4);
+                        const Atype* a_row = RoiRowPtr<Atype>(Av, y, roi4);
+                        const Btype* b_row = RoiRowPtr<Btype>(Bv, y, roi4);
+                        const size_t npixels = static_cast<size_t>(roi4.width());
+
+                        size_t x = 0;
+                        for (; x + lanes <= npixels; x += lanes) {
+                            const size_t off = x * 4;
+                            if constexpr (std::is_same_v<Rtype, half>) {
+                                using T16  = hwy::float16_t;
+                                auto d16   = hn::Rebind<T16, decltype(d)>();
+                                const T16* a16
+                                    = reinterpret_cast<const T16*>(a_row + off);
+                                const T16* b16
+                                    = reinterpret_cast<const T16*>(b_row + off);
+                                T16* r16 = reinterpret_cast<T16*>(r_row + off);
+
+                                hn::Vec<decltype(d16)> ar16, ag16, ab16, aa16;
+                                hn::Vec<decltype(d16)> br16, bg16, bb16, ba16;
+                                hn::Vec<decltype(d16)> dr16, dg16, db16, da16;
+                                hn::LoadInterleaved4(d16, a16, ar16, ag16, ab16,
+                                                     aa16);
+                                hn::LoadInterleaved4(d16, b16, br16, bg16, bb16,
+                                                     ba16);
+                                hn::LoadInterleaved4(d16, r16, dr16, dg16, db16,
+                                                     da16);
+                                (void)aa16;
+                                (void)ba16;
+                                (void)dr16;
+                                (void)dg16;
+                                (void)db16;
+
+                                auto rr = op(d, hn::PromoteTo(d, ar16),
+                                             hn::PromoteTo(d, br16));
+                                auto rg = op(d, hn::PromoteTo(d, ag16),
+                                             hn::PromoteTo(d, bg16));
+                                auto rb = op(d, hn::PromoteTo(d, ab16),
+                                             hn::PromoteTo(d, bb16));
+
+                                auto rr16 = hn::DemoteTo(d16, rr);
+                                auto rg16 = hn::DemoteTo(d16, rg);
+                                auto rb16 = hn::DemoteTo(d16, rb);
+                                hn::StoreInterleaved4(rr16, rg16, rb16, da16, d16,
+                                                      r16);
+                            } else {
+                                hn::Vec<decltype(d)> ar, ag, ab, aa;
+                                hn::Vec<decltype(d)> br, bg, bb, ba;
+                                hn::Vec<decltype(d)> dr, dg, db, da;
+                                hn::LoadInterleaved4(d, a_row + off, ar, ag, ab,
+                                                     aa);
+                                hn::LoadInterleaved4(d, b_row + off, br, bg, bb,
+                                                     ba);
+                                hn::LoadInterleaved4(d, r_row + off, dr, dg, db,
+                                                     da);
+                                (void)aa;
+                                (void)ba;
+                                (void)dr;
+                                (void)dg;
+                                (void)db;
+
+                                auto rr = op(d, ar, br);
+                                auto rg = op(d, ag, bg);
+                                auto rb = op(d, ab, bb);
+                                hn::StoreInterleaved4(rr, rg, rb, da, d,
+                                                      r_row + off);
+                            }
+                        }
+
+                        for (; x < npixels; ++x) {
+                            const size_t off = x * 4;
+                            if constexpr (std::is_same_v<Rtype, half>) {
+                                r_row[off + 0]
+                                    = half((float)a_row[off + 0]
+                                           * (float)b_row[off + 0]);
+                                r_row[off + 1]
+                                    = half((float)a_row[off + 1]
+                                           * (float)b_row[off + 1]);
+                                r_row[off + 2]
+                                    = half((float)a_row[off + 2]
+                                           * (float)b_row[off + 2]);
+                            } else {
+                                r_row[off + 0] = a_row[off + 0] * b_row[off + 0];
+                                r_row[off + 1] = a_row[off + 1] * b_row[off + 1];
+                                r_row[off + 2] = a_row[off + 2] * b_row[off + 2];
+                            }
+                            // Preserve alpha (off+3).
+                        }
+                    }
+                });
+                return true;
+            }
+        }
+    }
+
     return hwy_binary_perpixel_op<Rtype, Atype, Btype>(R, A, B, roi, nthreads,
-                                                      [](auto /*d*/, auto a, auto b) {
-                                                          return hn::Mul(a, b);
-                                                      });
+                                                       op);
 }
 
 template<class Rtype, class Atype>
@@ -183,6 +305,25 @@ mul_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
                             && ChannelsContiguous<Btype>(Bv, nchannels);
         if (contig)
             return mul_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
+
+        // Handle the common RGBA + RGB ROI strided case (preserving alpha).
+        constexpr bool floaty_strided = (std::is_same_v<Rtype, float>
+                                         || std::is_same_v<Rtype, double>
+                                         || std::is_same_v<Rtype, half>)
+                                        && std::is_same_v<Rtype, Atype>
+                                        && std::is_same_v<Rtype, Btype>;
+        if constexpr (floaty_strided) {
+            if (roi.chbegin == 0 && roi.chend == 3) {
+                const bool contig4 = (Rv.nchannels >= 4 && Av.nchannels >= 4
+                                      && Bv.nchannels >= 4)
+                                     && ChannelsContiguous<Rtype>(Rv, 4)
+                                     && ChannelsContiguous<Atype>(Av, 4)
+                                     && ChannelsContiguous<Btype>(Bv, 4);
+                if (contig4)
+                    return mul_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi,
+                                                             nthreads);
+            }
+        }
     }
 #endif
     return mul_impl_scalar<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
@@ -304,13 +445,149 @@ static bool
 div_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
              int nthreads)
 {
+    auto op = [](auto d, auto a, auto b) {
+        const auto zero = hn::Zero(d);
+        const auto nz   = hn::Ne(b, zero);
+        const auto one  = hn::Set(d, 1);
+        const auto safe_b = hn::IfThenElse(nz, b, one);
+        const auto q      = hn::Div(a, safe_b);
+        return hn::IfThenElse(nz, q, zero);
+    };
+
+    // Special-case: RGBA images but ROI is RGB (strided channel subset). We
+    // still can SIMD the RGB channels by processing full RGBA and preserving
+    // alpha exactly (bitwise) from the destination.
+    if (roi.chbegin == 0 && roi.chend == 3) {
+        // Only support same-type float/half/double in this fast path.
+        constexpr bool floaty = (std::is_same_v<Rtype, float>
+                                 || std::is_same_v<Rtype, double>
+                                 || std::is_same_v<Rtype, half>)
+                                && std::is_same_v<Rtype, Atype>
+                                && std::is_same_v<Rtype, Btype>;
+        if constexpr (floaty) {
+            auto Rv = HwyPixels(R);
+            auto Av = HwyPixels(A);
+            auto Bv = HwyPixels(B);
+            if (Rv.nchannels >= 4 && Av.nchannels >= 4 && Bv.nchannels >= 4
+                && ChannelsContiguous<Rtype>(Rv, 4)
+                && ChannelsContiguous<Atype>(Av, 4)
+                && ChannelsContiguous<Btype>(Bv, 4)) {
+                ROI roi4     = roi;
+                roi4.chbegin = 0;
+                roi4.chend   = 4;
+                using MathT  = typename SimdMathType<Rtype>::type;
+                const hn::ScalableTag<MathT> d;
+                const size_t lanes = hn::Lanes(d);
+                ImageBufAlgo::parallel_image(roi4, nthreads, [&](ROI roi4) {
+                    for (int y = roi4.ybegin; y < roi4.yend; ++y) {
+                        Rtype* r_row       = RoiRowPtr<Rtype>(Rv, y, roi4);
+                        const Atype* a_row = RoiRowPtr<Atype>(Av, y, roi4);
+                        const Btype* b_row = RoiRowPtr<Btype>(Bv, y, roi4);
+                        const size_t npixels = static_cast<size_t>(roi4.width());
+
+                        size_t x = 0;
+                        for (; x + lanes <= npixels; x += lanes) {
+                            const size_t off = x * 4;
+                            if constexpr (std::is_same_v<Rtype, half>) {
+                                using T16  = hwy::float16_t;
+                                auto d16   = hn::Rebind<T16, decltype(d)>();
+                                const T16* a16
+                                    = reinterpret_cast<const T16*>(a_row + off);
+                                const T16* b16
+                                    = reinterpret_cast<const T16*>(b_row + off);
+                                T16* r16 = reinterpret_cast<T16*>(r_row + off);
+
+                                hn::Vec<decltype(d16)> ar16, ag16, ab16, aa16;
+                                hn::Vec<decltype(d16)> br16, bg16, bb16, ba16;
+                                hn::Vec<decltype(d16)> dr16, dg16, db16, da16;
+                                hn::LoadInterleaved4(d16, a16, ar16, ag16, ab16,
+                                                     aa16);
+                                hn::LoadInterleaved4(d16, b16, br16, bg16, bb16,
+                                                     ba16);
+                                hn::LoadInterleaved4(d16, r16, dr16, dg16, db16,
+                                                     da16);
+                                (void)aa16;
+                                (void)ba16;
+                                (void)dr16;
+                                (void)dg16;
+                                (void)db16;
+
+                                auto rr = op(d, hn::PromoteTo(d, ar16),
+                                             hn::PromoteTo(d, br16));
+                                auto rg = op(d, hn::PromoteTo(d, ag16),
+                                             hn::PromoteTo(d, bg16));
+                                auto rb = op(d, hn::PromoteTo(d, ab16),
+                                             hn::PromoteTo(d, bb16));
+
+                                auto rr16 = hn::DemoteTo(d16, rr);
+                                auto rg16 = hn::DemoteTo(d16, rg);
+                                auto rb16 = hn::DemoteTo(d16, rb);
+                                hn::StoreInterleaved4(rr16, rg16, rb16, da16, d16,
+                                                      r16);
+                            } else {
+                                hn::Vec<decltype(d)> ar, ag, ab, aa;
+                                hn::Vec<decltype(d)> br, bg, bb, ba;
+                                hn::Vec<decltype(d)> dr, dg, db, da;
+                                hn::LoadInterleaved4(d, a_row + off, ar, ag, ab,
+                                                     aa);
+                                hn::LoadInterleaved4(d, b_row + off, br, bg, bb,
+                                                     ba);
+                                hn::LoadInterleaved4(d, r_row + off, dr, dg, db,
+                                                     da);
+                                (void)aa;
+                                (void)ba;
+                                (void)dr;
+                                (void)dg;
+                                (void)db;
+
+                                auto rr = op(d, ar, br);
+                                auto rg = op(d, ag, bg);
+                                auto rb = op(d, ab, bb);
+                                hn::StoreInterleaved4(rr, rg, rb, da, d,
+                                                      r_row + off);
+                            }
+                        }
+
+                        for (; x < npixels; ++x) {
+                            const size_t off = x * 4;
+                            if constexpr (std::is_same_v<Rtype, half>) {
+                                const float denom0 = (float)b_row[off + 0];
+                                const float denom1 = (float)b_row[off + 1];
+                                const float denom2 = (float)b_row[off + 2];
+                                r_row[off + 0]
+                                    = (denom0 == 0.0f)
+                                          ? half(0.0f)
+                                          : half((float)a_row[off + 0] / denom0);
+                                r_row[off + 1]
+                                    = (denom1 == 0.0f)
+                                          ? half(0.0f)
+                                          : half((float)a_row[off + 1] / denom1);
+                                r_row[off + 2]
+                                    = (denom2 == 0.0f)
+                                          ? half(0.0f)
+                                          : half((float)a_row[off + 2] / denom2);
+                            } else {
+                                const auto denom0 = b_row[off + 0];
+                                const auto denom1 = b_row[off + 1];
+                                const auto denom2 = b_row[off + 2];
+                                r_row[off + 0]
+                                    = (denom0 == 0) ? 0 : (a_row[off + 0] / denom0);
+                                r_row[off + 1]
+                                    = (denom1 == 0) ? 0 : (a_row[off + 1] / denom1);
+                                r_row[off + 2]
+                                    = (denom2 == 0) ? 0 : (a_row[off + 2] / denom2);
+                            }
+                            // Preserve alpha (off+3).
+                        }
+                    }
+                });
+                return true;
+            }
+        }
+    }
+
     return hwy_binary_perpixel_op<Rtype, Atype, Btype>(R, A, B, roi, nthreads,
-                                                      [](auto d, auto a, auto b) {
-                                                          auto zero = hn::Zero(d);
-                                                          auto mask = hn::Eq(b, zero);
-                                                          return hn::IfThenElse(mask, zero,
-                                                                                hn::Div(a, b));
-                                                      });
+                                                       op);
 }
 #endif  // defined(OIIO_USE_HWY) && OIIO_USE_HWY
 
@@ -331,6 +608,25 @@ div_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
                             && ChannelsContiguous<Btype>(Bv, nchannels);
         if (contig)
             return div_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
+
+        // Handle the common RGBA + RGB ROI strided case (preserving alpha).
+        constexpr bool floaty_strided = (std::is_same_v<Rtype, float>
+                                         || std::is_same_v<Rtype, double>
+                                         || std::is_same_v<Rtype, half>)
+                                        && std::is_same_v<Rtype, Atype>
+                                        && std::is_same_v<Rtype, Btype>;
+        if constexpr (floaty_strided) {
+            if (roi.chbegin == 0 && roi.chend == 3) {
+                const bool contig4 = (Rv.nchannels >= 4 && Av.nchannels >= 4
+                                      && Bv.nchannels >= 4)
+                                     && ChannelsContiguous<Rtype>(Rv, 4)
+                                     && ChannelsContiguous<Atype>(Av, 4)
+                                     && ChannelsContiguous<Btype>(Bv, 4);
+                if (contig4)
+                    return div_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi,
+                                                             nthreads);
+            }
+        }
     }
 #endif
     return div_impl_scalar<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
diff --git a/src/libOpenImageIO/imagebufalgo_test.cpp b/src/libOpenImageIO/imagebufalgo_test.cpp
index 03900983c8..226c80cc8c 100644
--- a/src/libOpenImageIO/imagebufalgo_test.cpp
+++ b/src/libOpenImageIO/imagebufalgo_test.cpp
@@ -532,47 +532,57 @@ test_hwy_strided_roi_fallback()
 
     {
         ImageBuf R0(spec), R1(spec);
+        ImageBufAlgo::fill(R0, { 0.9f, 0.8f, 0.7f, 0.6f });
+        ImageBufAlgo::fill(R1, { 0.9f, 0.8f, 0.7f, 0.6f });
         OIIO::attribute("enable_hwy", 0);
         ImageBufAlgo::add(R0, A, B, roi);
         OIIO::attribute("enable_hwy", 1);
         ImageBufAlgo::add(R1, A, B, roi);
-        auto comp = ImageBufAlgo::compare(R0, R1, 0.0f, 0.0f, roi);
+        auto comp = ImageBufAlgo::compare(R0, R1, 0.0f, 0.0f);
         OIIO_CHECK_EQUAL(comp.maxerror, 0.0f);
     }
     {
         ImageBuf R0(spec), R1(spec);
+        ImageBufAlgo::fill(R0, { 0.9f, 0.8f, 0.7f, 0.6f });
+        ImageBufAlgo::fill(R1, { 0.9f, 0.8f, 0.7f, 0.6f });
         OIIO::attribute("enable_hwy", 0);
         ImageBufAlgo::sub(R0, A, B, roi);
         OIIO::attribute("enable_hwy", 1);
         ImageBufAlgo::sub(R1, A, B, roi);
-        auto comp = ImageBufAlgo::compare(R0, R1, 0.0f, 0.0f, roi);
+        auto comp = ImageBufAlgo::compare(R0, R1, 0.0f, 0.0f);
         OIIO_CHECK_EQUAL(comp.maxerror, 0.0f);
     }
     {
         ImageBuf R0(spec), R1(spec);
+        ImageBufAlgo::fill(R0, { 0.9f, 0.8f, 0.7f, 0.6f });
+        ImageBufAlgo::fill(R1, { 0.9f, 0.8f, 0.7f, 0.6f });
         OIIO::attribute("enable_hwy", 0);
         ImageBufAlgo::mul(R0, A, B, roi);
         OIIO::attribute("enable_hwy", 1);
         ImageBufAlgo::mul(R1, A, B, roi);
-        auto comp = ImageBufAlgo::compare(R0, R1, 0.0f, 0.0f, roi);
+        auto comp = ImageBufAlgo::compare(R0, R1, 0.0f, 0.0f);
         OIIO_CHECK_EQUAL(comp.maxerror, 0.0f);
     }
     {
         ImageBuf R0(spec), R1(spec);
+        ImageBufAlgo::fill(R0, { 0.9f, 0.8f, 0.7f, 0.6f });
+        ImageBufAlgo::fill(R1, { 0.9f, 0.8f, 0.7f, 0.6f });
         OIIO::attribute("enable_hwy", 0);
         ImageBufAlgo::div(R0, A, B, roi);
         OIIO::attribute("enable_hwy", 1);
         ImageBufAlgo::div(R1, A, B, roi);
-        auto comp = ImageBufAlgo::compare(R0, R1, 0.0f, 0.0f, roi);
+        auto comp = ImageBufAlgo::compare(R0, R1, 0.0f, 0.0f);
         OIIO_CHECK_EQUAL(comp.maxerror, 0.0f);
     }
     {
         ImageBuf R0(spec), R1(spec);
+        ImageBufAlgo::fill(R0, { 0.9f, 0.8f, 0.7f, 0.6f });
+        ImageBufAlgo::fill(R1, { 0.9f, 0.8f, 0.7f, 0.6f });
         OIIO::attribute("enable_hwy", 0);
         ImageBufAlgo::mad(R0, A, B, C, roi);
         OIIO::attribute("enable_hwy", 1);
         ImageBufAlgo::mad(R1, A, B, C, roi);
-        auto comp = ImageBufAlgo::compare(R0, R1, 0.0f, 0.0f, roi);
+        auto comp = ImageBufAlgo::compare(R0, R1, 0.0f, 0.0f);
         OIIO_CHECK_EQUAL(comp.maxerror, 0.0f);
     }
 

From adb88ac70676521f9b3d6bffe3a3cc582a25cf57 Mon Sep 17 00:00:00 2001
From: "Vlad (Kuzmin) Erium" <libalias@gmail.com>
Date: Tue, 24 Feb 2026 16:30:20 +0900
Subject: [PATCH 12/70] Add HWY helpers for RGBA-with-RGB ROI

Replace the duplicated ad-hoc SIMD special cases in add/sub/mul/div/mad with generalized HWY helpers that handle the common packed-RGBA-but-ROI-is-RGB case. Introduce PromoteVec/DemoteVec, lane-type mapping for half, interleaved Load/Store helpers (including partial-vector variants), and binary and ternary per-pixel routines that preserve alpha or mask it for native integer ops. Also switch HwyPixels to use pixel/scanline stride, add the necessary forward declarations and includes, and simplify callers to use the new helpers. This broadens support to integer/native ops and reduces code duplication.

Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 src/libOpenImageIO/imagebufalgo_addsub.cpp | 317 ++----------
 src/libOpenImageIO/imagebufalgo_hwy_pvt.h  | 546 +++++++++++++++++++--
 src/libOpenImageIO/imagebufalgo_mad.cpp    | 168 +------
 src/libOpenImageIO/imagebufalgo_muldiv.cpp | 306 +-----------
 4 files changed, 584 insertions(+), 753 deletions(-)

diff --git a/src/libOpenImageIO/imagebufalgo_addsub.cpp b/src/libOpenImageIO/imagebufalgo_addsub.cpp
index bea1d8259b..79b89e8204 100644
--- a/src/libOpenImageIO/imagebufalgo_addsub.cpp
+++ b/src/libOpenImageIO/imagebufalgo_addsub.cpp
@@ -86,125 +86,20 @@ add_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
         return hn::Add(a, b);
     };
 
-    // Special-case: RGBA images but ROI is RGB (strided channel subset). We
-    // still can SIMD the RGB channels by processing full RGBA and preserving
-    // alpha exactly (bitwise) from the destination.
-    if (roi.chbegin == 0 && roi.chend == 3) {
-        // Only support same-type float/half/double in this fast path.
-        constexpr bool floaty = (std::is_same_v<Rtype, float>
-                                 || std::is_same_v<Rtype, double>
-                                 || std::is_same_v<Rtype, half>)
-                                && std::is_same_v<Rtype, Atype>
-                                && std::is_same_v<Rtype, Btype>;
-        if constexpr (floaty) {
-            auto Rv = HwyPixels(R);
-            auto Av = HwyPixels(A);
-            auto Bv = HwyPixels(B);
-            if (Rv.nchannels >= 4 && Av.nchannels >= 4 && Bv.nchannels >= 4
-                && ChannelsContiguous<Rtype>(Rv, 4)
-                && ChannelsContiguous<Atype>(Av, 4)
-                && ChannelsContiguous<Btype>(Bv, 4)) {
-                ROI roi4     = roi;
-                roi4.chbegin = 0;
-                roi4.chend   = 4;
-                using MathT  = typename SimdMathType<Rtype>::type;
-                const hn::ScalableTag<MathT> d;
-                const size_t lanes = hn::Lanes(d);
-                ImageBufAlgo::parallel_image(roi4, nthreads, [&](ROI roi4) {
-                    for (int y = roi4.ybegin; y < roi4.yend; ++y) {
-                        Rtype* r_row       = RoiRowPtr<Rtype>(Rv, y, roi4);
-                        const Atype* a_row = RoiRowPtr<Atype>(Av, y, roi4);
-                        const Btype* b_row = RoiRowPtr<Btype>(Bv, y, roi4);
-                        const size_t npixels = static_cast<size_t>(roi4.width());
-
-                        size_t x = 0;
-                        for (; x + lanes <= npixels; x += lanes) {
-                            const size_t off = x * 4;
-                            if constexpr (std::is_same_v<Rtype, half>) {
-                                using T16  = hwy::float16_t;
-                                auto d16   = hn::Rebind<T16, decltype(d)>();
-                                const T16* a16
-                                    = reinterpret_cast<const T16*>(a_row + off);
-                                const T16* b16
-                                    = reinterpret_cast<const T16*>(b_row + off);
-                                T16* r16 = reinterpret_cast<T16*>(r_row + off);
-
-                                hn::Vec<decltype(d16)> ar16, ag16, ab16, aa16;
-                                hn::Vec<decltype(d16)> br16, bg16, bb16, ba16;
-                                hn::Vec<decltype(d16)> dr16, dg16, db16, da16;
-                                hn::LoadInterleaved4(d16, a16, ar16, ag16, ab16,
-                                                     aa16);
-                                hn::LoadInterleaved4(d16, b16, br16, bg16, bb16,
-                                                     ba16);
-                                hn::LoadInterleaved4(d16, r16, dr16, dg16, db16,
-                                                     da16);
-                                (void)aa16;
-                                (void)ba16;
-                                (void)dr16;
-                                (void)dg16;
-                                (void)db16;
-
-                                auto rr = op(d, hn::PromoteTo(d, ar16),
-                                             hn::PromoteTo(d, br16));
-                                auto rg = op(d, hn::PromoteTo(d, ag16),
-                                             hn::PromoteTo(d, bg16));
-                                auto rb = op(d, hn::PromoteTo(d, ab16),
-                                             hn::PromoteTo(d, bb16));
-
-                                auto rr16 = hn::DemoteTo(d16, rr);
-                                auto rg16 = hn::DemoteTo(d16, rg);
-                                auto rb16 = hn::DemoteTo(d16, rb);
-                                hn::StoreInterleaved4(rr16, rg16, rb16, da16, d16,
-                                                      r16);
-                            } else {
-                                hn::Vec<decltype(d)> ar, ag, ab, aa;
-                                hn::Vec<decltype(d)> br, bg, bb, ba;
-                                hn::Vec<decltype(d)> dr, dg, db, da;
-                                hn::LoadInterleaved4(d, a_row + off, ar, ag, ab,
-                                                     aa);
-                                hn::LoadInterleaved4(d, b_row + off, br, bg, bb,
-                                                     ba);
-                                hn::LoadInterleaved4(d, r_row + off, dr, dg, db,
-                                                     da);
-                                (void)aa;
-                                (void)ba;
-                                (void)dr;
-                                (void)dg;
-                                (void)db;
-
-                                auto rr = op(d, ar, br);
-                                auto rg = op(d, ag, bg);
-                                auto rb = op(d, ab, bb);
-                                hn::StoreInterleaved4(rr, rg, rb, da, d,
-                                                      r_row + off);
-                            }
-                        }
-
-                        for (; x < npixels; ++x) {
-                            const size_t off = x * 4;
-                            if constexpr (std::is_same_v<Rtype, half>) {
-                                r_row[off + 0]
-                                    = half((float)a_row[off + 0]
-                                           + (float)b_row[off + 0]);
-                                r_row[off + 1]
-                                    = half((float)a_row[off + 1]
-                                           + (float)b_row[off + 1]);
-                                r_row[off + 2]
-                                    = half((float)a_row[off + 2]
-                                           + (float)b_row[off + 2]);
-                            } else {
-                                r_row[off + 0] = a_row[off + 0] + b_row[off + 0];
-                                r_row[off + 1] = a_row[off + 1] + b_row[off + 1];
-                                r_row[off + 2] = a_row[off + 2] + b_row[off + 2];
-                            }
-                            // Preserve alpha (off+3).
-                        }
-                    }
-                });
-                return true;
-            }
-        }
+    // Handle packed RGBA images with an RGB ROI (preserve alpha).
+    if constexpr (std::is_integral_v<Rtype> && std::is_same_v<Rtype, Atype>
+                  && std::is_same_v<Rtype, Btype>) {
+        auto op_int = [](auto /*d*/, auto a, auto b) {
+            return hn::SaturatedAdd(a, b);
+        };
+        if (hwy_binary_native_int_perpixel_op_rgba_rgb_roi<Rtype>(R, A, B, roi,
+                                                                 nthreads,
+                                                                 op_int))
+            return true;
     }
+    if (hwy_binary_perpixel_op_rgba_rgb_roi<Rtype, Atype, Btype>(R, A, B, roi,
+                                                                 nthreads, op))
+        return true;
 
     return hwy_binary_perpixel_op<Rtype, Atype, Btype>(R, A, B, roi, nthreads,
                                                        op);
@@ -267,22 +162,15 @@ add_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
         }
 
         // Handle the common RGBA + RGB ROI strided case (preserving alpha).
-        constexpr bool floaty_strided = (std::is_same_v<Rtype, float>
-                                         || std::is_same_v<Rtype, double>
-                                         || std::is_same_v<Rtype, half>)
-                                        && std::is_same_v<Rtype, Atype>
-                                        && std::is_same_v<Rtype, Btype>;
-        if constexpr (floaty_strided) {
-            if (roi.chbegin == 0 && roi.chend == 3) {
-                const bool contig4 = (Rv.nchannels >= 4 && Av.nchannels >= 4
-                                      && Bv.nchannels >= 4)
-                                     && ChannelsContiguous<Rtype>(Rv, 4)
-                                     && ChannelsContiguous<Atype>(Av, 4)
-                                     && ChannelsContiguous<Btype>(Bv, 4);
-                if (contig4)
-                    return add_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi,
-                                                             nthreads);
-            }
+        if (roi.chbegin == 0 && roi.chend == 3) {
+            const bool contig4 = (Rv.nchannels >= 4 && Av.nchannels >= 4
+                                  && Bv.nchannels >= 4)
+                                 && ChannelsContiguous<Rtype>(Rv, 4)
+                                 && ChannelsContiguous<Atype>(Av, 4)
+                                 && ChannelsContiguous<Btype>(Bv, 4);
+            if (contig4)
+                return add_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi,
+                                                         nthreads);
         }
     }
 #endif
@@ -322,131 +210,31 @@ sub_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
         return hn::Sub(a, b);
     };
 
-    // Special-case: RGBA images but ROI is RGB (strided channel subset). We
-    // still can SIMD the RGB channels by processing full RGBA and preserving
-    // alpha exactly (bitwise) from the destination.
-    if (roi.chbegin == 0 && roi.chend == 3) {
-        // Only support same-type float/half/double in this fast path.
-        constexpr bool floaty = (std::is_same_v<Rtype, float>
-                                 || std::is_same_v<Rtype, double>
-                                 || std::is_same_v<Rtype, half>)
-                                && std::is_same_v<Rtype, Atype>
-                                && std::is_same_v<Rtype, Btype>;
-        if constexpr (floaty) {
-            auto Rv = HwyPixels(R);
-            auto Av = HwyPixels(A);
-            auto Bv = HwyPixels(B);
-            if (Rv.nchannels >= 4 && Av.nchannels >= 4 && Bv.nchannels >= 4
-                && ChannelsContiguous<Rtype>(Rv, 4)
-                && ChannelsContiguous<Atype>(Av, 4)
-                && ChannelsContiguous<Btype>(Bv, 4)) {
-                ROI roi4     = roi;
-                roi4.chbegin = 0;
-                roi4.chend   = 4;
-                using MathT  = typename SimdMathType<Rtype>::type;
-                const hn::ScalableTag<MathT> d;
-                const size_t lanes = hn::Lanes(d);
-                ImageBufAlgo::parallel_image(roi4, nthreads, [&](ROI roi4) {
-                    for (int y = roi4.ybegin; y < roi4.yend; ++y) {
-                        Rtype* r_row       = RoiRowPtr<Rtype>(Rv, y, roi4);
-                        const Atype* a_row = RoiRowPtr<Atype>(Av, y, roi4);
-                        const Btype* b_row = RoiRowPtr<Btype>(Bv, y, roi4);
-                        const size_t npixels = static_cast<size_t>(roi4.width());
-
-                        size_t x = 0;
-                        for (; x + lanes <= npixels; x += lanes) {
-                            const size_t off = x * 4;
-                            if constexpr (std::is_same_v<Rtype, half>) {
-                                using T16  = hwy::float16_t;
-                                auto d16   = hn::Rebind<T16, decltype(d)>();
-                                const T16* a16
-                                    = reinterpret_cast<const T16*>(a_row + off);
-                                const T16* b16
-                                    = reinterpret_cast<const T16*>(b_row + off);
-                                T16* r16 = reinterpret_cast<T16*>(r_row + off);
-
-                                hn::Vec<decltype(d16)> ar16, ag16, ab16, aa16;
-                                hn::Vec<decltype(d16)> br16, bg16, bb16, ba16;
-                                hn::Vec<decltype(d16)> dr16, dg16, db16, da16;
-                                hn::LoadInterleaved4(d16, a16, ar16, ag16, ab16,
-                                                     aa16);
-                                hn::LoadInterleaved4(d16, b16, br16, bg16, bb16,
-                                                     ba16);
-                                hn::LoadInterleaved4(d16, r16, dr16, dg16, db16,
-                                                     da16);
-                                (void)aa16;
-                                (void)ba16;
-                                (void)dr16;
-                                (void)dg16;
-                                (void)db16;
-
-                                auto rr = op(d, hn::PromoteTo(d, ar16),
-                                             hn::PromoteTo(d, br16));
-                                auto rg = op(d, hn::PromoteTo(d, ag16),
-                                             hn::PromoteTo(d, bg16));
-                                auto rb = op(d, hn::PromoteTo(d, ab16),
-                                             hn::PromoteTo(d, bb16));
-
-                                auto rr16 = hn::DemoteTo(d16, rr);
-                                auto rg16 = hn::DemoteTo(d16, rg);
-                                auto rb16 = hn::DemoteTo(d16, rb);
-                                hn::StoreInterleaved4(rr16, rg16, rb16, da16, d16,
-                                                      r16);
-                            } else {
-                                hn::Vec<decltype(d)> ar, ag, ab, aa;
-                                hn::Vec<decltype(d)> br, bg, bb, ba;
-                                hn::Vec<decltype(d)> dr, dg, db, da;
-                                hn::LoadInterleaved4(d, a_row + off, ar, ag, ab,
-                                                     aa);
-                                hn::LoadInterleaved4(d, b_row + off, br, bg, bb,
-                                                     ba);
-                                hn::LoadInterleaved4(d, r_row + off, dr, dg, db,
-                                                     da);
-                                (void)aa;
-                                (void)ba;
-                                (void)dr;
-                                (void)dg;
-                                (void)db;
-
-                                auto rr = op(d, ar, br);
-                                auto rg = op(d, ag, bg);
-                                auto rb = op(d, ab, bb);
-                                hn::StoreInterleaved4(rr, rg, rb, da, d,
-                                                      r_row + off);
-                            }
-                        }
-
-                        for (; x < npixels; ++x) {
-                            const size_t off = x * 4;
-                            if constexpr (std::is_same_v<Rtype, half>) {
-                                r_row[off + 0]
-                                    = half((float)a_row[off + 0]
-                                           - (float)b_row[off + 0]);
-                                r_row[off + 1]
-                                    = half((float)a_row[off + 1]
-                                           - (float)b_row[off + 1]);
-                                r_row[off + 2]
-                                    = half((float)a_row[off + 2]
-                                           - (float)b_row[off + 2]);
-                            } else {
-                                r_row[off + 0] = a_row[off + 0] - b_row[off + 0];
-                                r_row[off + 1] = a_row[off + 1] - b_row[off + 1];
-                                r_row[off + 2] = a_row[off + 2] - b_row[off + 2];
-                            }
-                            // Preserve alpha (off+3).
-                        }
-                    }
-                });
-                return true;
-            }
-        }
+    // Handle packed RGBA images with an RGB ROI (preserve alpha).
+    if constexpr (std::is_integral_v<Rtype> && std::is_same_v<Rtype, Atype>
+                  && std::is_same_v<Rtype, Btype>) {
+        auto op_int = [](auto /*d*/, auto a, auto b) {
+            return hn::SaturatedSub(a, b);
+        };
+        if (hwy_binary_native_int_perpixel_op_rgba_rgb_roi<Rtype>(R, A, B, roi,
+                                                                 nthreads,
+                                                                 op_int))
+            return true;
     }
+    if (hwy_binary_perpixel_op_rgba_rgb_roi<Rtype, Atype, Btype>(R, A, B, roi,
+                                                                 nthreads, op))
+        return true;
 
     return hwy_binary_perpixel_op<Rtype, Atype, Btype>(R, A, B, roi, nthreads,
                                                        op);
 }
 #endif  // defined(OIIO_USE_HWY) && OIIO_USE_HWY
 
+template<class Rtype, class Atype, class Btype>
+static bool
+sub_impl_scalar(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
+                int nthreads);
+
 template<class Rtype, class Atype, class Btype>
 static bool
 sub_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
@@ -475,22 +263,15 @@ sub_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
         }
 
         // Handle the common RGBA + RGB ROI strided case (preserving alpha).
-        constexpr bool floaty_strided = (std::is_same_v<Rtype, float>
-                                         || std::is_same_v<Rtype, double>
-                                         || std::is_same_v<Rtype, half>)
-                                        && std::is_same_v<Rtype, Atype>
-                                        && std::is_same_v<Rtype, Btype>;
-        if constexpr (floaty_strided) {
-            if (roi.chbegin == 0 && roi.chend == 3) {
-                const bool contig4 = (Rv.nchannels >= 4 && Av.nchannels >= 4
-                                      && Bv.nchannels >= 4)
-                                     && ChannelsContiguous<Rtype>(Rv, 4)
-                                     && ChannelsContiguous<Atype>(Av, 4)
-                                     && ChannelsContiguous<Btype>(Bv, 4);
-                if (contig4)
-                    return sub_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi,
-                                                             nthreads);
-            }
+        if (roi.chbegin == 0 && roi.chend == 3) {
+            const bool contig4 = (Rv.nchannels >= 4 && Av.nchannels >= 4
+                                  && Bv.nchannels >= 4)
+                                 && ChannelsContiguous<Rtype>(Rv, 4)
+                                 && ChannelsContiguous<Atype>(Av, 4)
+                                 && ChannelsContiguous<Btype>(Bv, 4);
+            if (contig4)
+                return sub_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi,
+                                                         nthreads);
         }
     }
 #endif
diff --git a/src/libOpenImageIO/imagebufalgo_hwy_pvt.h b/src/libOpenImageIO/imagebufalgo_hwy_pvt.h
index 32069b2882..0083002615 100644
--- a/src/libOpenImageIO/imagebufalgo_hwy_pvt.h
+++ b/src/libOpenImageIO/imagebufalgo_hwy_pvt.h
@@ -12,6 +12,7 @@
 #include <cstddef>
 #include <hwy/contrib/math/math-inl.h>
 #include <hwy/highway.h>
+#include <tuple>
 #include <type_traits>
 
 OIIO_NAMESPACE_BEGIN
@@ -37,8 +38,8 @@ HwyPixels(ImageBuf& img)
 {
     const ImageSpec& spec = img.spec();
     return { reinterpret_cast<std::byte*>(img.localpixels()),
-             spec.pixel_bytes(),
-             spec.scanline_bytes(),
+             static_cast<size_t>(img.pixel_stride()),
+             static_cast<size_t>(img.scanline_stride()),
              img.xbegin(),
              img.ybegin(),
              spec.nchannels };
@@ -49,8 +50,8 @@ HwyPixels(const ImageBuf& img)
 {
     const ImageSpec& spec = img.spec();
     return { reinterpret_cast<const std::byte*>(img.localpixels()),
-             spec.pixel_bytes(),
-             spec.scanline_bytes(),
+             static_cast<size_t>(img.pixel_stride()),
+             static_cast<size_t>(img.scanline_stride()),
              img.xbegin(),
              img.ybegin(),
              spec.nchannels };
@@ -108,6 +109,32 @@ template<> struct SimdMathType<double> {
     using type = double;
 };
 
+// Maps a pixel storage type to the lane type used for Highway interleaved
+// loads/stores: identity for every type except half -> hwy::float16_t.
+template<typename T> struct HwyLaneType {
+    using type = T;  // default: lane type is the storage type itself
+};
+template<> struct HwyLaneType<half> {
+    using type = hwy::float16_t;  // half pixels are accessed as float16_t
+};
+template<typename T> using HwyLaneTypeT = typename HwyLaneType<T>::type;
+
+// Forward declarations (needed for templates that use these helpers before
+// their definitions later in this header).
+template<class D, typename SrcT>
+inline std::tuple<hn::Vec<D>, hn::Vec<D>, hn::Vec<D>, hn::Vec<D>>
+LoadInterleaved4Promote(D d, const SrcT* ptr);
+
+template<class D, typename SrcT>
+inline std::tuple<hn::Vec<D>, hn::Vec<D>, hn::Vec<D>, hn::Vec<D>>
+LoadInterleaved4PromoteN(D d, const SrcT* ptr, size_t count);
+
+template<class D, typename DstT, typename VecMathT, typename VecAlphaLaneT>
+inline void
+StoreInterleaved4RgbAlphaPassthrough(D d, DstT* ptr, VecMathT r, VecMathT g,
+                                     VecMathT b,
+                                     VecAlphaLaneT a_passthrough);
+
 // -----------------------------------------------------------------------
 // Load and Promote
 // -----------------------------------------------------------------------
@@ -203,6 +230,59 @@ LoadPromote(D d, const SrcT* ptr)
     }
 }
 
+/// Convert an already-loaded vector `v` of SrcT lanes to the math type of `d`
+/// (meant to mirror LoadPromote(), minus the load): 8/16/32-bit ints scale by
+/// 1/max (signed clamped to >= -1), 64-bit only narrow; other types give zero.
+template<class D, typename SrcT, typename VecT>
+inline auto
+PromoteVec(D d, VecT v)
+{
+    using MathT = typename D::T;
+    if constexpr (std::is_same_v<SrcT, MathT>) {
+        return v;  // lanes are already MathT
+    } else if constexpr (std::is_same_v<SrcT, half>) {
+        return hn::PromoteTo(d, v);  // widen to the math type
+    } else if constexpr (std::is_same_v<SrcT, uint8_t>) {
+        auto v_promoted = hn::ConvertTo(
+            d, hn::PromoteTo(hn::Rebind<int32_t, D>(),
+                             hn::PromoteTo(hn::Rebind<int16_t, D>(), v)));
+        return hn::Mul(v_promoted, hn::Set(d, (MathT)(1.0 / 255.0)));
+    } else if constexpr (std::is_same_v<SrcT, int8_t>) {
+        auto v_promoted = hn::ConvertTo(
+            d, hn::PromoteTo(hn::Rebind<int32_t, D>(),
+                             hn::PromoteTo(hn::Rebind<int16_t, D>(), v)));
+        auto v_norm = hn::Mul(v_promoted, hn::Set(d, (MathT)(1.0 / 127.0)));
+        return hn::Max(v_norm, hn::Set(d, (MathT)-1.0));
+    } else if constexpr (std::is_same_v<SrcT, uint16_t>) {
+        auto v_promoted
+            = hn::ConvertTo(d, hn::PromoteTo(hn::Rebind<int32_t, D>(), v));
+        return hn::Mul(v_promoted, hn::Set(d, (MathT)(1.0 / 65535.0)));
+    } else if constexpr (std::is_same_v<SrcT, int16_t>) {
+        auto v_promoted
+            = hn::ConvertTo(d, hn::PromoteTo(hn::Rebind<int32_t, D>(), v));
+        auto v_norm = hn::Mul(v_promoted, hn::Set(d, (MathT)(1.0 / 32767.0)));
+        return hn::Max(v_norm, hn::Set(d, (MathT)-1.0));
+    } else if constexpr (std::is_same_v<SrcT, uint32_t>) {
+        auto v_promoted = hn::ConvertTo(d, v);
+        return hn::Mul(v_promoted, hn::Set(d, (MathT)(1.0 / 4294967295.0)));
+    } else if constexpr (std::is_same_v<SrcT, int32_t>) {
+        auto v_promoted = hn::ConvertTo(d, v);
+        auto v_norm = hn::Mul(v_promoted,
+                              hn::Set(d, (MathT)(1.0 / 2147483647.0)));
+        return hn::Max(v_norm, hn::Set(d, (MathT)-1.0));
+    } else if constexpr (std::is_same_v<SrcT, uint64_t>) {
+        auto d_u32 = hn::Rebind<uint32_t, D>();
+        auto v_u32 = hn::DemoteTo(d_u32, v);
+        return hn::ConvertTo(d, v_u32);  // note: no normalization here
+    } else if constexpr (std::is_same_v<SrcT, int64_t>) {
+        auto d_i32 = hn::Rebind<int32_t, D>();
+        auto v_i32 = hn::DemoteTo(d_i32, v);
+        return hn::ConvertTo(d, v_i32);  // note: no normalization here
+    } else {
+        return hn::Zero(d);  // unhandled SrcT: all-zero vector
+    }
+}
+
 /// Load and promote partial source data to target SIMD type.
 /// Same as LoadPromote but handles partial vectors (< full lane count).
 /// @param d Highway descriptor tag defining the target SIMD type
@@ -445,6 +525,91 @@ DemoteStore(D d, DstT* ptr, VecT v)
     }
 }
 
+/// Convert math vector `v` to DstT lanes and return it (no store); integers
+/// are scaled by max, rounded and clamped. Meant to mirror DemoteStore().
+template<class D, typename DstT, typename VecT>
+inline auto
+DemoteVec(D d, VecT v)
+{
+    using MathT = typename D::T;
+    using VecD  = hn::Vec<D>;
+    if constexpr (std::is_same_v<DstT, MathT>) {
+        return v;  // already in the destination type
+    } else if constexpr (std::is_same_v<DstT, half>) {
+        auto d16 = hn::Rebind<hwy::float16_t, D>();
+        return hn::DemoteTo(d16, v);  // MathT -> float16 lanes
+    } else if constexpr (std::is_same_v<DstT, uint8_t>) {
+        VecD v_denorm  = hn::Mul((VecD)v, hn::Set(d, (MathT)255.0));
+        VecD v_rounded = hn::Add(v_denorm, hn::Set(d, (MathT)0.5));
+        VecD v_clamped = hn::Max(v_rounded, hn::Zero(d));
+        v_clamped      = hn::Min(v_clamped, hn::Set(d, (MathT)255.0));
+
+        auto d32   = hn::Rebind<int32_t, D>();
+        auto vi32  = hn::ConvertTo(d32, v_clamped);
+        auto d_i16 = hn::Rebind<int16_t, D>();
+        auto v_i16 = hn::DemoteTo(d_i16, vi32);
+        auto d_u8  = hn::Rebind<uint8_t, D>();
+        return hn::DemoteTo(d_u8, v_i16);
+    } else if constexpr (std::is_same_v<DstT, int8_t>) {
+        VecD v_denorm = hn::Mul((VecD)v, hn::Set(d, (MathT)127.0));
+        auto is_neg   = hn::Lt(v_denorm, hn::Zero(d));
+        auto v_bias   = hn::IfThenElse(is_neg, hn::Set(d, (MathT)-0.5),
+                                       hn::Set(d, (MathT)0.5));
+        VecD v_rounded = hn::Add(v_denorm, v_bias);  // sign-matched 0.5 bias
+        VecD v_clamped = hn::Max(v_rounded, hn::Set(d, (MathT)-128.0));
+        v_clamped      = hn::Min(v_clamped, hn::Set(d, (MathT)127.0));
+
+        auto d32   = hn::Rebind<int32_t, D>();
+        auto vi32  = hn::ConvertTo(d32, v_clamped);
+        auto d_i16 = hn::Rebind<int16_t, D>();
+        auto v_i16 = hn::DemoteTo(d_i16, vi32);
+        auto d_i8  = hn::Rebind<int8_t, D>();
+        return hn::DemoteTo(d_i8, v_i16);
+    } else if constexpr (std::is_same_v<DstT, uint16_t>) {
+        VecD v_denorm  = hn::Mul((VecD)v, hn::Set(d, (MathT)65535.0));
+        VecD v_rounded = hn::Add(v_denorm, hn::Set(d, (MathT)0.5));
+        VecD v_clamped = hn::Max(v_rounded, hn::Zero(d));
+        v_clamped      = hn::Min(v_clamped, hn::Set(d, (MathT)65535.0));
+
+        auto d32   = hn::Rebind<int32_t, D>();
+        auto vi32  = hn::ConvertTo(d32, v_clamped);
+        auto d_u16 = hn::Rebind<uint16_t, D>();
+        return hn::DemoteTo(d_u16, vi32);
+    } else if constexpr (std::is_same_v<DstT, int16_t>) {
+        VecD v_denorm = hn::Mul((VecD)v, hn::Set(d, (MathT)32767.0));
+        auto is_neg   = hn::Lt(v_denorm, hn::Zero(d));
+        auto v_bias   = hn::IfThenElse(is_neg, hn::Set(d, (MathT)-0.5),
+                                       hn::Set(d, (MathT)0.5));
+        VecD v_rounded = hn::Add(v_denorm, v_bias);  // sign-matched 0.5 bias
+        VecD v_clamped = hn::Max(v_rounded, hn::Set(d, (MathT)-32768.0));
+        v_clamped      = hn::Min(v_clamped, hn::Set(d, (MathT)32767.0));
+
+        auto d32   = hn::Rebind<int32_t, D>();
+        auto vi32  = hn::ConvertTo(d32, v_clamped);
+        auto d_i16 = hn::Rebind<int16_t, D>();
+        return hn::DemoteTo(d_i16, vi32);
+    } else if constexpr (std::is_same_v<DstT, uint32_t>) {
+        VecD v_denorm  = hn::Mul((VecD)v, hn::Set(d, (MathT)4294967295.0));
+        VecD v_rounded = hn::Add(v_denorm, hn::Set(d, (MathT)0.5));
+        VecD v_clamped = hn::Max(v_rounded, hn::Zero(d));
+        auto d_u32     = hn::Rebind<uint32_t, D>();
+        return hn::ConvertTo(d_u32, v_clamped);  // no explicit upper clamp
+    } else if constexpr (std::is_same_v<DstT, int32_t>) {
+        VecD v_denorm = hn::Mul((VecD)v, hn::Set(d, (MathT)2147483647.0));
+        auto is_neg   = hn::Lt(v_denorm, hn::Zero(d));
+        auto v_bias   = hn::IfThenElse(is_neg, hn::Set(d, (MathT)-0.5),
+                                       hn::Set(d, (MathT)0.5));
+        VecD v_rounded = hn::Add(v_denorm, v_bias);  // sign-matched 0.5 bias
+        VecD v_clamped = hn::Max(v_rounded, hn::Set(d, (MathT)-2147483648.0));
+        v_clamped      = hn::Min(v_clamped, hn::Set(d, (MathT)2147483647.0));
+        auto d_i32     = hn::Rebind<int32_t, D>();
+        return hn::ConvertTo(d_i32, v_clamped);
+    } else {
+        auto d_dst = hn::Rebind<DstT, D>();
+        return hn::Zero(d_dst);  // unhandled DstT: all-zero vector
+    }
+}
+
 /// Demote and store partial SIMD values to destination type.
 /// Same as DemoteStore but handles partial vectors (< full lane count).
 /// @param d Highway descriptor tag for the source SIMD type
@@ -841,6 +1006,270 @@ hwy_binary_native_int_perpixel_op(ImageBuf& R, const ImageBuf& A,
     return true;
 }
 
+// -----------------------------------------------------------------------
+// Per-pixel Ops (ImageBufAlgo, RGBA packed but ROI is RGB)
+// -----------------------------------------------------------------------
+
+/// Demote r/g/b to DstT and scatter the first `count` lanes into 4-channel
+/// interleaved memory at `ptr` (stride 4); the 4th slot is never written.
+template<class D, typename DstT, typename VecT>
+inline void
+StoreInterleaved3DemoteN(D d, DstT* ptr, VecT r, VecT g, VecT b, size_t count)
+{
+    DstT r_demoted[hn::MaxLanes(d)];  // per-channel staging buffers
+    DstT g_demoted[hn::MaxLanes(d)];
+    DstT b_demoted[hn::MaxLanes(d)];
+    DemoteStoreN(d, r_demoted, r, count);
+    DemoteStoreN(d, g_demoted, g, count);
+    DemoteStoreN(d, b_demoted, b, count);
+    for (size_t i = 0; i < count; ++i) {
+        ptr[i * 4 + 0] = r_demoted[i];
+        ptr[i * 4 + 1] = g_demoted[i];
+        ptr[i * 4 + 2] = b_demoted[i];
+        // Slot i*4+3 (alpha) is deliberately left as it was.
+    }
+}
+
+/// Apply binary `op` to channels 0-2 of 4-channel-contiguous R, A, B, keeping
+/// R's 4th channel. Returns false, touching nothing, unless roi channels are
+/// [0,3) and all three buffers pass the >=4-channel/contiguity checks.
+template<typename Rtype, typename Atype, typename Btype, typename OpFunc>
+inline bool
+hwy_binary_perpixel_op_rgba_rgb_roi(ImageBuf& R, const ImageBuf& A,
+                                    const ImageBuf& B, ROI roi, int nthreads,
+                                    OpFunc op)
+{
+    if (roi.chbegin != 0 || roi.chend != 3)
+        return false;  // only ROI channels [0,3) are handled
+
+    auto Rv = HwyPixels(R);
+    auto Av = HwyPixels(A);
+    auto Bv = HwyPixels(B);
+    if (Rv.nchannels < 4 || Av.nchannels < 4 || Bv.nchannels < 4)
+        return false;
+
+    // All buffers must pass ChannelsContiguous(..., 4) (packed RGBA layout).
+    if (!ChannelsContiguous<Rtype>(Rv, 4) || !ChannelsContiguous<Atype>(Av, 4)
+        || !ChannelsContiguous<Btype>(Bv, 4))
+        return false;
+
+    ROI roi4     = roi;
+    roi4.chbegin = 0;
+    roi4.chend   = 4;  // iterate over whole 4-channel pixels
+
+    using MathT = typename SimdMathType<Rtype>::type;
+    const hn::ScalableTag<MathT> d;
+    const size_t lanes = hn::Lanes(d);
+
+    ImageBufAlgo::parallel_image(roi4, nthreads, [&, op](ROI roi4) {
+        for (int y = roi4.ybegin; y < roi4.yend; ++y) {
+            Rtype* r_row       = RoiRowPtr<Rtype>(Rv, y, roi4);
+            const Atype* a_row = RoiRowPtr<Atype>(Av, y, roi4);
+            const Btype* b_row = RoiRowPtr<Btype>(Bv, y, roi4);
+            const size_t npixels = static_cast<size_t>(roi4.width());
+
+            size_t x = 0;
+            for (; x + lanes <= npixels; x += lanes) {
+                const size_t off = x * 4;  // 4 values per pixel
+
+                auto [ar, ag, ab, aa] = LoadInterleaved4Promote(d, a_row + off);
+                auto [br, bg, bb, ba] = LoadInterleaved4Promote(d, b_row + off);
+                (void)aa;
+                (void)ba;
+
+                using DstLaneT = HwyLaneTypeT<Rtype>;
+                auto d_dstlane = hn::Rebind<DstLaneT, decltype(d)>();
+                hn::Vec<decltype(d_dstlane)> dr, dg, db, da;
+                hn::LoadInterleaved4(d_dstlane,
+                                     reinterpret_cast<const DstLaneT*>(
+                                         r_row + off),
+                                     dr, dg, db, da);
+                (void)dr;  // only da (destination alpha) is reused
+                (void)dg;
+                (void)db;
+
+                auto rr = op(d, ar, br);
+                auto rg = op(d, ag, bg);
+                auto rb = op(d, ab, bb);
+                StoreInterleaved4RgbAlphaPassthrough(d, r_row + off, rr, rg, rb,
+                                                     da);
+            }
+
+            const size_t remaining = npixels - x;  // tail: < lanes pixels
+            if (remaining > 0) {
+                const size_t off = x * 4;
+                auto [ar, ag, ab, aa]
+                    = LoadInterleaved4PromoteN(d, a_row + off, remaining);
+                auto [br, bg, bb, ba]
+                    = LoadInterleaved4PromoteN(d, b_row + off, remaining);
+                (void)aa;
+                (void)ba;
+                auto rr = op(d, ar, br);
+                auto rg = op(d, ag, bg);
+                auto rb = op(d, ab, bb);
+                StoreInterleaved3DemoteN(d, r_row + off, rr, rg, rb, remaining);
+            }
+        }
+    });
+
+    return true;
+}
+
+/// Three-input variant: apply `op(d, a, b, c)` to channels 0-2 of packed
+/// RGBA buffers, keeping R's 4th channel; returns false if checks fail.
+template<typename Rtype, typename ABCtype, typename OpFunc>
+inline bool
+hwy_ternary_perpixel_op_rgba_rgb_roi(ImageBuf& R, const ImageBuf& A,
+                                     const ImageBuf& B, const ImageBuf& C,
+                                     ROI roi, int nthreads, OpFunc op)
+{
+    if (roi.chbegin != 0 || roi.chend != 3)
+        return false;  // only ROI channels [0,3) are handled
+
+    auto Rv = HwyPixels(R);
+    auto Av = HwyPixels(A);
+    auto Bv = HwyPixels(B);
+    auto Cv = HwyPixels(C);
+    if (Rv.nchannels < 4 || Av.nchannels < 4 || Bv.nchannels < 4
+        || Cv.nchannels < 4)
+        return false;
+
+    if (!ChannelsContiguous<Rtype>(Rv, 4) || !ChannelsContiguous<ABCtype>(Av, 4)
+        || !ChannelsContiguous<ABCtype>(Bv, 4)
+        || !ChannelsContiguous<ABCtype>(Cv, 4))
+        return false;
+
+    ROI roi4     = roi;
+    roi4.chbegin = 0;
+    roi4.chend   = 4;  // iterate over whole 4-channel pixels
+
+    using MathT = typename SimdMathType<Rtype>::type;
+    const hn::ScalableTag<MathT> d;
+    const size_t lanes = hn::Lanes(d);
+
+    ImageBufAlgo::parallel_image(roi4, nthreads, [&, op](ROI roi4) {
+        for (int y = roi4.ybegin; y < roi4.yend; ++y) {
+            Rtype* r_row         = RoiRowPtr<Rtype>(Rv, y, roi4);
+            const ABCtype* a_row = RoiRowPtr<ABCtype>(Av, y, roi4);
+            const ABCtype* b_row = RoiRowPtr<ABCtype>(Bv, y, roi4);
+            const ABCtype* c_row = RoiRowPtr<ABCtype>(Cv, y, roi4);
+            const size_t npixels = static_cast<size_t>(roi4.width());
+
+            size_t x = 0;
+            for (; x + lanes <= npixels; x += lanes) {
+                const size_t off = x * 4;  // 4 values per pixel
+
+                auto [ar, ag, ab, aa] = LoadInterleaved4Promote(d, a_row + off);
+                auto [br, bg, bb, ba] = LoadInterleaved4Promote(d, b_row + off);
+                auto [cr, cg, cb, ca] = LoadInterleaved4Promote(d, c_row + off);
+                (void)aa;
+                (void)ba;
+                (void)ca;
+
+                using DstLaneT = HwyLaneTypeT<Rtype>;
+                auto d_dstlane = hn::Rebind<DstLaneT, decltype(d)>();
+                hn::Vec<decltype(d_dstlane)> dr, dg, db, da;
+                hn::LoadInterleaved4(d_dstlane,
+                                     reinterpret_cast<const DstLaneT*>(
+                                         r_row + off),
+                                     dr, dg, db, da);
+                (void)dr;  // only da (destination alpha) is reused
+                (void)dg;
+                (void)db;
+
+                auto rr = op(d, ar, br, cr);
+                auto rg = op(d, ag, bg, cg);
+                auto rb = op(d, ab, bb, cb);
+                StoreInterleaved4RgbAlphaPassthrough(d, r_row + off, rr, rg, rb,
+                                                     da);
+            }
+
+            const size_t remaining = npixels - x;  // tail: < lanes pixels
+            if (remaining > 0) {
+                const size_t off = x * 4;
+                auto [ar, ag, ab, aa]
+                    = LoadInterleaved4PromoteN(d, a_row + off, remaining);
+                auto [br, bg, bb, ba]
+                    = LoadInterleaved4PromoteN(d, b_row + off, remaining);
+                auto [cr, cg, cb, ca]
+                    = LoadInterleaved4PromoteN(d, c_row + off, remaining);
+                (void)aa;
+                (void)ba;
+                (void)ca;
+                auto rr = op(d, ar, br, cr);
+                auto rg = op(d, ag, bg, cg);
+                auto rb = op(d, ab, bb, cb);
+                StoreInterleaved3DemoteN(d, r_row + off, rr, rg, rb, remaining);
+            }
+        }
+    });
+
+    return true;
+}
+
+/// Native-integer binary op on raw channel values for the "packed RGBA, RGB
+/// ROI" case; every 4th lane (alpha) is restored from R via a lane mask.
+/// Uses unaligned loads/stores: image rows carry no alignment guarantee.
+template<typename T, typename OpFunc>
+inline bool
+hwy_binary_native_int_perpixel_op_rgba_rgb_roi(ImageBuf& R, const ImageBuf& A,
+                                               const ImageBuf& B, ROI roi,
+                                               int nthreads, OpFunc op)
+{
+    if (roi.chbegin != 0 || roi.chend != 3)
+        return false;  // only ROI channels [0,3) are handled
+
+    auto Rv = HwyPixels(R);
+    auto Av = HwyPixels(A);
+    auto Bv = HwyPixels(B);
+    if (Rv.nchannels < 4 || Av.nchannels < 4 || Bv.nchannels < 4)
+        return false;
+    if (!ChannelsContiguous<T>(Rv, 4) || !ChannelsContiguous<T>(Av, 4)
+        || !ChannelsContiguous<T>(Bv, 4))
+        return false;
+
+    const hn::ScalableTag<T> d;
+    const size_t lanes = hn::Lanes(d);
+    if (lanes & 3)
+        return false;  // need block alignment so the alpha mask stays aligned
+
+    const auto three      = hn::Set(d, T(3));
+    const auto lane_index = hn::Iota(d, T(0));
+    const auto alpha_mask = hn::Eq(hn::And(lane_index, three), three);
+
+    ROI roi4     = roi;
+    roi4.chbegin = 0;
+    roi4.chend   = 4;  // iterate over whole 4-channel pixels
+    ImageBufAlgo::parallel_image(roi4, nthreads, [&, op](ROI roi4) {
+        const size_t n = static_cast<size_t>(roi4.width()) * 4;
+        for (int y = roi4.ybegin; y < roi4.yend; ++y) {
+            T* r_row       = RoiRowPtr<T>(Rv, y, roi4);
+            const T* a_row = RoiRowPtr<T>(Av, y, roi4);
+            const T* b_row = RoiRowPtr<T>(Bv, y, roi4);
+            size_t i = 0;
+            for (; i + lanes <= n; i += lanes) {
+                auto va   = hn::LoadU(d, a_row + i);
+                auto vb   = hn::LoadU(d, b_row + i);
+                auto vold = hn::LoadU(d, r_row + i);
+                auto vnew = op(d, va, vb);
+                auto vres = hn::IfThenElse(alpha_mask, vold, vnew);
+                hn::StoreU(vres, d, r_row + i);
+            }
+            const size_t remaining = n - i;  // tail: < lanes values
+            if (remaining > 0) {
+                auto va   = hn::LoadN(d, a_row + i, remaining);
+                auto vb   = hn::LoadN(d, b_row + i, remaining);
+                auto vold = hn::LoadN(d, r_row + i, remaining);
+                auto vnew = op(d, va, vb);
+                auto vres = hn::IfThenElse(alpha_mask, vold, vnew);
+                hn::StoreN(vres, d, r_row + i, remaining);
+            }
+        }
+    });
+
+    return true;
+}
+
 // -----------------------------------------------------------------------
 // Interleaved Channel Load/Store Helpers
 // -----------------------------------------------------------------------
@@ -852,7 +1281,7 @@ hwy_binary_native_int_perpixel_op(ImageBuf& R, const ImageBuf& A,
 /// @param ptr Pointer to interleaved RGBA data (R0,G0,B0,A0,R1,G1,B1,A1,...)
 /// @return Tuple of (R, G, B, A) SIMD vectors in promoted type
 template<class D, typename SrcT>
-inline auto
+inline std::tuple<hn::Vec<D>, hn::Vec<D>, hn::Vec<D>, hn::Vec<D>>
 LoadInterleaved4Promote(D d, const SrcT* ptr)
 {
     using MathT = typename D::T;
@@ -881,30 +1310,41 @@ LoadInterleaved4Promote(D d, const SrcT* ptr)
 
         return std::make_tuple(r_vec, g_vec, b_vec, a_vec);
     } else {
-        // Generic type promotion - deinterleave manually with normalization
-        const size_t N = hn::Lanes(d);
-        SrcT r_src[hn::MaxLanes(d)];
-        SrcT g_src[hn::MaxLanes(d)];
-        SrcT b_src[hn::MaxLanes(d)];
-        SrcT a_src[hn::MaxLanes(d)];
-
-        for (size_t i = 0; i < N; ++i) {
-            r_src[i] = ptr[i * 4 + 0];
-            g_src[i] = ptr[i * 4 + 1];
-            b_src[i] = ptr[i * 4 + 2];
-            a_src[i] = ptr[i * 4 + 3];
-        }
-
-        // Use LoadPromote for proper normalization of integer types
-        auto r_vec = LoadPromote(d, r_src);
-        auto g_vec = LoadPromote(d, g_src);
-        auto b_vec = LoadPromote(d, b_src);
-        auto a_vec = LoadPromote(d, a_src);
-
+        // Generic type promotion - deinterleave with HWY then promote.
+        auto d_src = hn::Rebind<SrcT, D>();
+        hn::Vec<decltype(d_src)> r_src, g_src, b_src, a_src;
+        hn::LoadInterleaved4(d_src, ptr, r_src, g_src, b_src, a_src);
+        auto r_vec = PromoteVec<D, SrcT>(d, r_src);
+        auto g_vec = PromoteVec<D, SrcT>(d, g_src);
+        auto b_vec = PromoteVec<D, SrcT>(d, b_src);
+        auto a_vec = PromoteVec<D, SrcT>(d, a_src);
         return std::make_tuple(r_vec, g_vec, b_vec, a_vec);
     }
 }
 
+/// Load `count` interleaved RGBA pixels (a partial vector) with type promotion.
+/// `count` is clamped to Lanes(d) so the stack scratch can never be overrun.
+template<class D, typename SrcT>
+inline std::tuple<hn::Vec<D>, hn::Vec<D>, hn::Vec<D>, hn::Vec<D>>
+LoadInterleaved4PromoteN(D d, const SrcT* ptr, size_t count)
+{
+    // Scratch arrays hold MaxLanes(d) items; enforce the count <= lanes contract.
+    const size_t n = count < hn::Lanes(d) ? count : hn::Lanes(d);
+    SrcT r_src[hn::MaxLanes(d)], g_src[hn::MaxLanes(d)];
+    SrcT b_src[hn::MaxLanes(d)], a_src[hn::MaxLanes(d)];
+    for (size_t i = 0; i < n; ++i) {
+        r_src[i] = ptr[i * 4 + 0];
+        g_src[i] = ptr[i * 4 + 1];
+        b_src[i] = ptr[i * 4 + 2];
+        a_src[i] = ptr[i * 4 + 3];
+    }
+    auto r_vec = LoadPromoteN(d, r_src, n);
+    auto g_vec = LoadPromoteN(d, g_src, n);
+    auto b_vec = LoadPromoteN(d, b_src, n);
+    auto a_vec = LoadPromoteN(d, a_src, n);
+    return std::make_tuple(r_vec, g_vec, b_vec, a_vec);
+}
+
 /// Store 4 interleaved channels (RGBA) with type demotion.
 /// For matching types, uses Highway's native StoreInterleaved4.
 /// For type demotion, manually interleaves and stores.
@@ -938,28 +1378,40 @@ StoreInterleaved4Demote(D d, DstT* ptr, VecT r, VecT g, VecT b, VecT a)
         hn::StoreInterleaved4(r16, g16, b16, a16, d16,
                               reinterpret_cast<T16*>(ptr));
     } else {
-        // Generic type demotion - use DemoteStore for each channel then interleave
-        const size_t N = hn::Lanes(d);
-
-        // Temporary arrays for demoted values
-        DstT r_demoted[hn::MaxLanes(d)];
-        DstT g_demoted[hn::MaxLanes(d)];
-        DstT b_demoted[hn::MaxLanes(d)];
-        DstT a_demoted[hn::MaxLanes(d)];
-
-        // Use DemoteStoreN to properly denormalize integer types
-        DemoteStoreN(d, r_demoted, r, N);
-        DemoteStoreN(d, g_demoted, g, N);
-        DemoteStoreN(d, b_demoted, b, N);
-        DemoteStoreN(d, a_demoted, a, N);
-
-        // Interleave the demoted values
-        for (size_t i = 0; i < N; ++i) {
-            ptr[i * 4 + 0] = r_demoted[i];
-            ptr[i * 4 + 1] = g_demoted[i];
-            ptr[i * 4 + 2] = b_demoted[i];
-            ptr[i * 4 + 3] = a_demoted[i];
-        }
+        // Generic type demotion - demote to lane vectors and use HWY's interleaved store.
+        auto d_dst = hn::Rebind<DstT, D>();
+        auto r_dst = DemoteVec<D, DstT>(d, r);
+        auto g_dst = DemoteVec<D, DstT>(d, g);
+        auto b_dst = DemoteVec<D, DstT>(d, b);
+        auto a_dst = DemoteVec<D, DstT>(d, a);
+        hn::StoreInterleaved4(r_dst, g_dst, b_dst, a_dst, d_dst, ptr);
+    }
+}
+
+/// Interleave and store RGBA to `ptr`. R/G/B arrive in the math type of `d` and
+/// are demoted to DstT; alpha is already in destination lanes and is stored as-is.
+template<class D, typename DstT, typename VecMathT, typename VecAlphaLaneT>
+inline void
+StoreInterleaved4RgbAlphaPassthrough(D d, DstT* ptr, VecMathT r, VecMathT g,
+                                     VecMathT b, VecAlphaLaneT a_passthrough)
+{
+    using MathT = typename D::T;
+    if constexpr (std::is_same_v<DstT, MathT>) {
+        // Math lanes already match the destination: store without conversion.
+        hn::StoreInterleaved4(r, g, b, a_passthrough, d, ptr);
+    } else if constexpr (std::is_same_v<DstT, half>) {
+        // Demote RGB to float16 lanes; ptr is reinterpreted as hwy::float16_t*.
+        using F16 = hwy::float16_t;
+        const hn::Rebind<F16, D> dh;
+        hn::StoreInterleaved4(hn::DemoteTo(dh, r), hn::DemoteTo(dh, g),
+                              hn::DemoteTo(dh, b), a_passthrough, dh,
+                              reinterpret_cast<F16*>(ptr));
+    } else {
+        // Every other destination type: demote each color channel via DemoteVec.
+        const hn::Rebind<DstT, D> dd;
+        hn::StoreInterleaved4(DemoteVec<D, DstT>(d, r), DemoteVec<D, DstT>(d, g),
+                              DemoteVec<D, DstT>(d, b), a_passthrough, dd,
+                              ptr);
+    }
+}
 
diff --git a/src/libOpenImageIO/imagebufalgo_mad.cpp b/src/libOpenImageIO/imagebufalgo_mad.cpp
index f8f3e19ddf..eb20c81f0b 100644
--- a/src/libOpenImageIO/imagebufalgo_mad.cpp
+++ b/src/libOpenImageIO/imagebufalgo_mad.cpp
@@ -56,146 +56,9 @@ mad_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B,
         return hn::MulAdd(a, b, c);
     };
 
-    // Special-case: RGBA images but ROI is RGB (strided channel subset). We
-    // still can SIMD the RGB channels by processing full RGBA and preserving
-    // alpha exactly (bitwise) from the destination.
-    if (roi.chbegin == 0 && roi.chend == 3) {
-        // Only support same-type float/half/double in this fast path.
-        constexpr bool floaty = (std::is_same_v<Rtype, float>
-                                 || std::is_same_v<Rtype, double>
-                                 || std::is_same_v<Rtype, half>)
-                                && std::is_same_v<Rtype, ABCtype>;
-        if constexpr (floaty) {
-            auto Rv = HwyPixels(R);
-            auto Av = HwyPixels(A);
-            auto Bv = HwyPixels(B);
-            auto Cv = HwyPixels(C);
-            if (Rv.nchannels >= 4 && Av.nchannels >= 4 && Bv.nchannels >= 4
-                && Cv.nchannels >= 4 && ChannelsContiguous<Rtype>(Rv, 4)
-                && ChannelsContiguous<ABCtype>(Av, 4)
-                && ChannelsContiguous<ABCtype>(Bv, 4)
-                && ChannelsContiguous<ABCtype>(Cv, 4)) {
-                ROI roi4     = roi;
-                roi4.chbegin = 0;
-                roi4.chend   = 4;
-                using MathT  = typename SimdMathType<Rtype>::type;
-                const hn::ScalableTag<MathT> d;
-                const size_t lanes = hn::Lanes(d);
-                ImageBufAlgo::parallel_image(roi4, nthreads, [&](ROI roi4) {
-                    for (int y = roi4.ybegin; y < roi4.yend; ++y) {
-                        Rtype* r_row         = RoiRowPtr<Rtype>(Rv, y, roi4);
-                        const ABCtype* a_row = RoiRowPtr<ABCtype>(Av, y, roi4);
-                        const ABCtype* b_row = RoiRowPtr<ABCtype>(Bv, y, roi4);
-                        const ABCtype* c_row = RoiRowPtr<ABCtype>(Cv, y, roi4);
-                        const size_t npixels = static_cast<size_t>(roi4.width());
-
-                        size_t x = 0;
-                        for (; x + lanes <= npixels; x += lanes) {
-                            const size_t off = x * 4;
-                            if constexpr (std::is_same_v<Rtype, half>) {
-                                using T16  = hwy::float16_t;
-                                auto d16   = hn::Rebind<T16, decltype(d)>();
-                                const T16* a16
-                                    = reinterpret_cast<const T16*>(a_row + off);
-                                const T16* b16
-                                    = reinterpret_cast<const T16*>(b_row + off);
-                                const T16* c16
-                                    = reinterpret_cast<const T16*>(c_row + off);
-                                T16* r16 = reinterpret_cast<T16*>(r_row + off);
-
-                                hn::Vec<decltype(d16)> ar16, ag16, ab16, aa16;
-                                hn::Vec<decltype(d16)> br16, bg16, bb16, ba16;
-                                hn::Vec<decltype(d16)> cr16, cg16, cb16, ca16;
-                                hn::Vec<decltype(d16)> dr16, dg16, db16, da16;
-                                hn::LoadInterleaved4(d16, a16, ar16, ag16, ab16,
-                                                     aa16);
-                                hn::LoadInterleaved4(d16, b16, br16, bg16, bb16,
-                                                     ba16);
-                                hn::LoadInterleaved4(d16, c16, cr16, cg16, cb16,
-                                                     ca16);
-                                hn::LoadInterleaved4(d16, r16, dr16, dg16, db16,
-                                                     da16);
-                                (void)aa16;
-                                (void)ba16;
-                                (void)ca16;
-                                (void)dr16;
-                                (void)dg16;
-                                (void)db16;
-
-                                auto rr = op(d, hn::PromoteTo(d, ar16),
-                                             hn::PromoteTo(d, br16),
-                                             hn::PromoteTo(d, cr16));
-                                auto rg = op(d, hn::PromoteTo(d, ag16),
-                                             hn::PromoteTo(d, bg16),
-                                             hn::PromoteTo(d, cg16));
-                                auto rb = op(d, hn::PromoteTo(d, ab16),
-                                             hn::PromoteTo(d, bb16),
-                                             hn::PromoteTo(d, cb16));
-
-                                auto rr16 = hn::DemoteTo(d16, rr);
-                                auto rg16 = hn::DemoteTo(d16, rg);
-                                auto rb16 = hn::DemoteTo(d16, rb);
-                                hn::StoreInterleaved4(rr16, rg16, rb16, da16, d16,
-                                                      r16);
-                            } else {
-                                hn::Vec<decltype(d)> ar, ag, ab, aa;
-                                hn::Vec<decltype(d)> br, bg, bb, ba;
-                                hn::Vec<decltype(d)> cr, cg, cb, ca;
-                                hn::Vec<decltype(d)> dr, dg, db, da;
-                                hn::LoadInterleaved4(d, a_row + off, ar, ag, ab,
-                                                     aa);
-                                hn::LoadInterleaved4(d, b_row + off, br, bg, bb,
-                                                     ba);
-                                hn::LoadInterleaved4(d, c_row + off, cr, cg, cb,
-                                                     ca);
-                                hn::LoadInterleaved4(d, r_row + off, dr, dg, db,
-                                                     da);
-                                (void)aa;
-                                (void)ba;
-                                (void)ca;
-                                (void)dr;
-                                (void)dg;
-                                (void)db;
-
-                                auto rr = op(d, ar, br, cr);
-                                auto rg = op(d, ag, bg, cg);
-                                auto rb = op(d, ab, bb, cb);
-                                hn::StoreInterleaved4(rr, rg, rb, da, d,
-                                                      r_row + off);
-                            }
-                        }
-
-                        for (; x < npixels; ++x) {
-                            const size_t off = x * 4;
-                            if constexpr (std::is_same_v<Rtype, half>) {
-                                r_row[off + 0]
-                                    = half((float)a_row[off + 0]
-                                           * (float)b_row[off + 0]
-                                           + (float)c_row[off + 0]);
-                                r_row[off + 1]
-                                    = half((float)a_row[off + 1]
-                                           * (float)b_row[off + 1]
-                                           + (float)c_row[off + 1]);
-                                r_row[off + 2]
-                                    = half((float)a_row[off + 2]
-                                           * (float)b_row[off + 2]
-                                           + (float)c_row[off + 2]);
-                            } else {
-                                r_row[off + 0] = a_row[off + 0] * b_row[off + 0]
-                                                 + c_row[off + 0];
-                                r_row[off + 1] = a_row[off + 1] * b_row[off + 1]
-                                                 + c_row[off + 1];
-                                r_row[off + 2] = a_row[off + 2] * b_row[off + 2]
-                                                 + c_row[off + 2];
-                            }
-                            // Preserve alpha (off+3).
-                        }
-                    }
-                });
-                return true;
-            }
-        }
-    }
+    if (hwy_ternary_perpixel_op_rgba_rgb_roi<Rtype, ABCtype>(
+            R, A, B, C, roi, nthreads, op))
+        return true;
 
     return hwy_ternary_perpixel_op<Rtype, ABCtype>(R, A, B, C, roi, nthreads,
                                                    op);
@@ -223,22 +86,15 @@ mad_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, const ImageBuf& C,
             return mad_impl_hwy<Rtype, ABCtype>(R, A, B, C, roi, nthreads);
 
         // Handle the common RGBA + RGB ROI strided case (preserving alpha).
-        constexpr bool floaty_strided = (std::is_same_v<Rtype, float>
-                                         || std::is_same_v<Rtype, double>
-                                         || std::is_same_v<Rtype, half>)
-                                        && std::is_same_v<Rtype, ABCtype>;
-        if constexpr (floaty_strided) {
-            if (roi.chbegin == 0 && roi.chend == 3) {
-                const bool contig4 = (Rv.nchannels >= 4 && Av.nchannels >= 4
-                                      && Bv.nchannels >= 4 && Cv.nchannels >= 4)
-                                     && ChannelsContiguous<Rtype>(Rv, 4)
-                                     && ChannelsContiguous<ABCtype>(Av, 4)
-                                     && ChannelsContiguous<ABCtype>(Bv, 4)
-                                     && ChannelsContiguous<ABCtype>(Cv, 4);
-                if (contig4)
-                    return mad_impl_hwy<Rtype, ABCtype>(R, A, B, C, roi,
-                                                        nthreads);
-            }
+        if (roi.chbegin == 0 && roi.chend == 3) {
+            const bool contig4 = (Rv.nchannels >= 4 && Av.nchannels >= 4
+                                  && Bv.nchannels >= 4 && Cv.nchannels >= 4)
+                                 && ChannelsContiguous<Rtype>(Rv, 4)
+                                 && ChannelsContiguous<ABCtype>(Av, 4)
+                                 && ChannelsContiguous<ABCtype>(Bv, 4)
+                                 && ChannelsContiguous<ABCtype>(Cv, 4);
+            if (contig4)
+                return mad_impl_hwy<Rtype, ABCtype>(R, A, B, C, roi, nthreads);
         }
     }
 #endif
diff --git a/src/libOpenImageIO/imagebufalgo_muldiv.cpp b/src/libOpenImageIO/imagebufalgo_muldiv.cpp
index 3d355cf620..cbfeb07d75 100644
--- a/src/libOpenImageIO/imagebufalgo_muldiv.cpp
+++ b/src/libOpenImageIO/imagebufalgo_muldiv.cpp
@@ -135,125 +135,9 @@ mul_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
         return hn::Mul(a, b);
     };
 
-    // Special-case: RGBA images but ROI is RGB (strided channel subset). We
-    // still can SIMD the RGB channels by processing full RGBA and preserving
-    // alpha exactly (bitwise) from the destination.
-    if (roi.chbegin == 0 && roi.chend == 3) {
-        // Only support same-type float/half/double in this fast path.
-        constexpr bool floaty = (std::is_same_v<Rtype, float>
-                                 || std::is_same_v<Rtype, double>
-                                 || std::is_same_v<Rtype, half>)
-                                && std::is_same_v<Rtype, Atype>
-                                && std::is_same_v<Rtype, Btype>;
-        if constexpr (floaty) {
-            auto Rv = HwyPixels(R);
-            auto Av = HwyPixels(A);
-            auto Bv = HwyPixels(B);
-            if (Rv.nchannels >= 4 && Av.nchannels >= 4 && Bv.nchannels >= 4
-                && ChannelsContiguous<Rtype>(Rv, 4)
-                && ChannelsContiguous<Atype>(Av, 4)
-                && ChannelsContiguous<Btype>(Bv, 4)) {
-                ROI roi4     = roi;
-                roi4.chbegin = 0;
-                roi4.chend   = 4;
-                using MathT  = typename SimdMathType<Rtype>::type;
-                const hn::ScalableTag<MathT> d;
-                const size_t lanes = hn::Lanes(d);
-                ImageBufAlgo::parallel_image(roi4, nthreads, [&](ROI roi4) {
-                    for (int y = roi4.ybegin; y < roi4.yend; ++y) {
-                        Rtype* r_row       = RoiRowPtr<Rtype>(Rv, y, roi4);
-                        const Atype* a_row = RoiRowPtr<Atype>(Av, y, roi4);
-                        const Btype* b_row = RoiRowPtr<Btype>(Bv, y, roi4);
-                        const size_t npixels = static_cast<size_t>(roi4.width());
-
-                        size_t x = 0;
-                        for (; x + lanes <= npixels; x += lanes) {
-                            const size_t off = x * 4;
-                            if constexpr (std::is_same_v<Rtype, half>) {
-                                using T16  = hwy::float16_t;
-                                auto d16   = hn::Rebind<T16, decltype(d)>();
-                                const T16* a16
-                                    = reinterpret_cast<const T16*>(a_row + off);
-                                const T16* b16
-                                    = reinterpret_cast<const T16*>(b_row + off);
-                                T16* r16 = reinterpret_cast<T16*>(r_row + off);
-
-                                hn::Vec<decltype(d16)> ar16, ag16, ab16, aa16;
-                                hn::Vec<decltype(d16)> br16, bg16, bb16, ba16;
-                                hn::Vec<decltype(d16)> dr16, dg16, db16, da16;
-                                hn::LoadInterleaved4(d16, a16, ar16, ag16, ab16,
-                                                     aa16);
-                                hn::LoadInterleaved4(d16, b16, br16, bg16, bb16,
-                                                     ba16);
-                                hn::LoadInterleaved4(d16, r16, dr16, dg16, db16,
-                                                     da16);
-                                (void)aa16;
-                                (void)ba16;
-                                (void)dr16;
-                                (void)dg16;
-                                (void)db16;
-
-                                auto rr = op(d, hn::PromoteTo(d, ar16),
-                                             hn::PromoteTo(d, br16));
-                                auto rg = op(d, hn::PromoteTo(d, ag16),
-                                             hn::PromoteTo(d, bg16));
-                                auto rb = op(d, hn::PromoteTo(d, ab16),
-                                             hn::PromoteTo(d, bb16));
-
-                                auto rr16 = hn::DemoteTo(d16, rr);
-                                auto rg16 = hn::DemoteTo(d16, rg);
-                                auto rb16 = hn::DemoteTo(d16, rb);
-                                hn::StoreInterleaved4(rr16, rg16, rb16, da16, d16,
-                                                      r16);
-                            } else {
-                                hn::Vec<decltype(d)> ar, ag, ab, aa;
-                                hn::Vec<decltype(d)> br, bg, bb, ba;
-                                hn::Vec<decltype(d)> dr, dg, db, da;
-                                hn::LoadInterleaved4(d, a_row + off, ar, ag, ab,
-                                                     aa);
-                                hn::LoadInterleaved4(d, b_row + off, br, bg, bb,
-                                                     ba);
-                                hn::LoadInterleaved4(d, r_row + off, dr, dg, db,
-                                                     da);
-                                (void)aa;
-                                (void)ba;
-                                (void)dr;
-                                (void)dg;
-                                (void)db;
-
-                                auto rr = op(d, ar, br);
-                                auto rg = op(d, ag, bg);
-                                auto rb = op(d, ab, bb);
-                                hn::StoreInterleaved4(rr, rg, rb, da, d,
-                                                      r_row + off);
-                            }
-                        }
-
-                        for (; x < npixels; ++x) {
-                            const size_t off = x * 4;
-                            if constexpr (std::is_same_v<Rtype, half>) {
-                                r_row[off + 0]
-                                    = half((float)a_row[off + 0]
-                                           * (float)b_row[off + 0]);
-                                r_row[off + 1]
-                                    = half((float)a_row[off + 1]
-                                           * (float)b_row[off + 1]);
-                                r_row[off + 2]
-                                    = half((float)a_row[off + 2]
-                                           * (float)b_row[off + 2]);
-                            } else {
-                                r_row[off + 0] = a_row[off + 0] * b_row[off + 0];
-                                r_row[off + 1] = a_row[off + 1] * b_row[off + 1];
-                                r_row[off + 2] = a_row[off + 2] * b_row[off + 2];
-                            }
-                            // Preserve alpha (off+3).
-                        }
-                    }
-                });
-                return true;
-            }
-        }
-    }
+    if (hwy_binary_perpixel_op_rgba_rgb_roi<Rtype, Atype, Btype>(R, A, B, roi,
+                                                                 nthreads, op))
+        return true;
 
     return hwy_binary_perpixel_op<Rtype, Atype, Btype>(R, A, B, roi, nthreads,
                                                        op);
@@ -307,22 +191,15 @@ mul_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
             return mul_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
 
         // Handle the common RGBA + RGB ROI strided case (preserving alpha).
-        constexpr bool floaty_strided = (std::is_same_v<Rtype, float>
-                                         || std::is_same_v<Rtype, double>
-                                         || std::is_same_v<Rtype, half>)
-                                        && std::is_same_v<Rtype, Atype>
-                                        && std::is_same_v<Rtype, Btype>;
-        if constexpr (floaty_strided) {
-            if (roi.chbegin == 0 && roi.chend == 3) {
-                const bool contig4 = (Rv.nchannels >= 4 && Av.nchannels >= 4
-                                      && Bv.nchannels >= 4)
-                                     && ChannelsContiguous<Rtype>(Rv, 4)
-                                     && ChannelsContiguous<Atype>(Av, 4)
-                                     && ChannelsContiguous<Btype>(Bv, 4);
-                if (contig4)
-                    return mul_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi,
-                                                             nthreads);
-            }
+        if (roi.chbegin == 0 && roi.chend == 3) {
+            const bool contig4 = (Rv.nchannels >= 4 && Av.nchannels >= 4
+                                  && Bv.nchannels >= 4)
+                                 && ChannelsContiguous<Rtype>(Rv, 4)
+                                 && ChannelsContiguous<Atype>(Av, 4)
+                                 && ChannelsContiguous<Btype>(Bv, 4);
+            if (contig4)
+                return mul_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi,
+                                                         nthreads);
         }
     }
 #endif
@@ -454,137 +331,9 @@ div_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
         return hn::IfThenElse(nz, q, zero);
     };
 
-    // Special-case: RGBA images but ROI is RGB (strided channel subset). We
-    // still can SIMD the RGB channels by processing full RGBA and preserving
-    // alpha exactly (bitwise) from the destination.
-    if (roi.chbegin == 0 && roi.chend == 3) {
-        // Only support same-type float/half/double in this fast path.
-        constexpr bool floaty = (std::is_same_v<Rtype, float>
-                                 || std::is_same_v<Rtype, double>
-                                 || std::is_same_v<Rtype, half>)
-                                && std::is_same_v<Rtype, Atype>
-                                && std::is_same_v<Rtype, Btype>;
-        if constexpr (floaty) {
-            auto Rv = HwyPixels(R);
-            auto Av = HwyPixels(A);
-            auto Bv = HwyPixels(B);
-            if (Rv.nchannels >= 4 && Av.nchannels >= 4 && Bv.nchannels >= 4
-                && ChannelsContiguous<Rtype>(Rv, 4)
-                && ChannelsContiguous<Atype>(Av, 4)
-                && ChannelsContiguous<Btype>(Bv, 4)) {
-                ROI roi4     = roi;
-                roi4.chbegin = 0;
-                roi4.chend   = 4;
-                using MathT  = typename SimdMathType<Rtype>::type;
-                const hn::ScalableTag<MathT> d;
-                const size_t lanes = hn::Lanes(d);
-                ImageBufAlgo::parallel_image(roi4, nthreads, [&](ROI roi4) {
-                    for (int y = roi4.ybegin; y < roi4.yend; ++y) {
-                        Rtype* r_row       = RoiRowPtr<Rtype>(Rv, y, roi4);
-                        const Atype* a_row = RoiRowPtr<Atype>(Av, y, roi4);
-                        const Btype* b_row = RoiRowPtr<Btype>(Bv, y, roi4);
-                        const size_t npixels = static_cast<size_t>(roi4.width());
-
-                        size_t x = 0;
-                        for (; x + lanes <= npixels; x += lanes) {
-                            const size_t off = x * 4;
-                            if constexpr (std::is_same_v<Rtype, half>) {
-                                using T16  = hwy::float16_t;
-                                auto d16   = hn::Rebind<T16, decltype(d)>();
-                                const T16* a16
-                                    = reinterpret_cast<const T16*>(a_row + off);
-                                const T16* b16
-                                    = reinterpret_cast<const T16*>(b_row + off);
-                                T16* r16 = reinterpret_cast<T16*>(r_row + off);
-
-                                hn::Vec<decltype(d16)> ar16, ag16, ab16, aa16;
-                                hn::Vec<decltype(d16)> br16, bg16, bb16, ba16;
-                                hn::Vec<decltype(d16)> dr16, dg16, db16, da16;
-                                hn::LoadInterleaved4(d16, a16, ar16, ag16, ab16,
-                                                     aa16);
-                                hn::LoadInterleaved4(d16, b16, br16, bg16, bb16,
-                                                     ba16);
-                                hn::LoadInterleaved4(d16, r16, dr16, dg16, db16,
-                                                     da16);
-                                (void)aa16;
-                                (void)ba16;
-                                (void)dr16;
-                                (void)dg16;
-                                (void)db16;
-
-                                auto rr = op(d, hn::PromoteTo(d, ar16),
-                                             hn::PromoteTo(d, br16));
-                                auto rg = op(d, hn::PromoteTo(d, ag16),
-                                             hn::PromoteTo(d, bg16));
-                                auto rb = op(d, hn::PromoteTo(d, ab16),
-                                             hn::PromoteTo(d, bb16));
-
-                                auto rr16 = hn::DemoteTo(d16, rr);
-                                auto rg16 = hn::DemoteTo(d16, rg);
-                                auto rb16 = hn::DemoteTo(d16, rb);
-                                hn::StoreInterleaved4(rr16, rg16, rb16, da16, d16,
-                                                      r16);
-                            } else {
-                                hn::Vec<decltype(d)> ar, ag, ab, aa;
-                                hn::Vec<decltype(d)> br, bg, bb, ba;
-                                hn::Vec<decltype(d)> dr, dg, db, da;
-                                hn::LoadInterleaved4(d, a_row + off, ar, ag, ab,
-                                                     aa);
-                                hn::LoadInterleaved4(d, b_row + off, br, bg, bb,
-                                                     ba);
-                                hn::LoadInterleaved4(d, r_row + off, dr, dg, db,
-                                                     da);
-                                (void)aa;
-                                (void)ba;
-                                (void)dr;
-                                (void)dg;
-                                (void)db;
-
-                                auto rr = op(d, ar, br);
-                                auto rg = op(d, ag, bg);
-                                auto rb = op(d, ab, bb);
-                                hn::StoreInterleaved4(rr, rg, rb, da, d,
-                                                      r_row + off);
-                            }
-                        }
-
-                        for (; x < npixels; ++x) {
-                            const size_t off = x * 4;
-                            if constexpr (std::is_same_v<Rtype, half>) {
-                                const float denom0 = (float)b_row[off + 0];
-                                const float denom1 = (float)b_row[off + 1];
-                                const float denom2 = (float)b_row[off + 2];
-                                r_row[off + 0]
-                                    = (denom0 == 0.0f)
-                                          ? half(0.0f)
-                                          : half((float)a_row[off + 0] / denom0);
-                                r_row[off + 1]
-                                    = (denom1 == 0.0f)
-                                          ? half(0.0f)
-                                          : half((float)a_row[off + 1] / denom1);
-                                r_row[off + 2]
-                                    = (denom2 == 0.0f)
-                                          ? half(0.0f)
-                                          : half((float)a_row[off + 2] / denom2);
-                            } else {
-                                const auto denom0 = b_row[off + 0];
-                                const auto denom1 = b_row[off + 1];
-                                const auto denom2 = b_row[off + 2];
-                                r_row[off + 0]
-                                    = (denom0 == 0) ? 0 : (a_row[off + 0] / denom0);
-                                r_row[off + 1]
-                                    = (denom1 == 0) ? 0 : (a_row[off + 1] / denom1);
-                                r_row[off + 2]
-                                    = (denom2 == 0) ? 0 : (a_row[off + 2] / denom2);
-                            }
-                            // Preserve alpha (off+3).
-                        }
-                    }
-                });
-                return true;
-            }
-        }
-    }
+    if (hwy_binary_perpixel_op_rgba_rgb_roi<Rtype, Atype, Btype>(R, A, B, roi,
+                                                                 nthreads, op))
+        return true;
 
     return hwy_binary_perpixel_op<Rtype, Atype, Btype>(R, A, B, roi, nthreads,
                                                        op);
@@ -610,22 +359,15 @@ div_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
             return div_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
 
         // Handle the common RGBA + RGB ROI strided case (preserving alpha).
-        constexpr bool floaty_strided = (std::is_same_v<Rtype, float>
-                                         || std::is_same_v<Rtype, double>
-                                         || std::is_same_v<Rtype, half>)
-                                        && std::is_same_v<Rtype, Atype>
-                                        && std::is_same_v<Rtype, Btype>;
-        if constexpr (floaty_strided) {
-            if (roi.chbegin == 0 && roi.chend == 3) {
-                const bool contig4 = (Rv.nchannels >= 4 && Av.nchannels >= 4
-                                      && Bv.nchannels >= 4)
-                                     && ChannelsContiguous<Rtype>(Rv, 4)
-                                     && ChannelsContiguous<Atype>(Av, 4)
-                                     && ChannelsContiguous<Btype>(Bv, 4);
-                if (contig4)
-                    return div_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi,
-                                                             nthreads);
-            }
+        if (roi.chbegin == 0 && roi.chend == 3) {
+            const bool contig4 = (Rv.nchannels >= 4 && Av.nchannels >= 4
+                                  && Bv.nchannels >= 4)
+                                 && ChannelsContiguous<Rtype>(Rv, 4)
+                                 && ChannelsContiguous<Atype>(Av, 4)
+                                 && ChannelsContiguous<Btype>(Bv, 4);
+            if (contig4)
+                return div_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi,
+                                                         nthreads);
         }
     }
 #endif

From 7efd58b2e35218a8b582f4cc155617fd55666490 Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Mon, 29 Dec 2025 17:36:38 -0800
Subject: [PATCH 13/70] docs: fix missing docs for `OIIO::attribute()` and
 `OIIO::getattribute()` (#4987)

Rearrangements in 3.1 dropped the list of recognized attributes from
the visible online docs and failed to document the span varieties. We
fix and also reword a lot of the descriptions for clarity and uniformity.

The previous organization was that there were several varieties of attribute(). In the header, the first one had the overall long explanation, including the list of all the recognized attributes. The other ones had short explanations of how they differed. In the docs, each one was referenced explicitly, pulling in its attendant bit of documentation.

What really happened is that in the header, I made the new span-based version the "flagship" one with the full explanation, but I neglected to reference it in the docs, so the long description disappeared.

I could have fixed this by just adding refs for the new functions to the docs, as I originally meant to. But while I was there, I took the opportunity to surround the whole collection with a group marker, and then include the lot of them with a single reference to the group, rather than needing to refer to each function variant individually. And while I was at it, I also reworded (and hopefully improved) some of those explanations.

Signed-off-by: Larry Gritz <lg@larrygritz.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 src/doc/imageioapi.rst            |  45 +------
 src/include/OpenImageIO/imageio.h | 188 +++++++++++++++++++++---------
 2 files changed, 135 insertions(+), 98 deletions(-)

diff --git a/src/doc/imageioapi.rst b/src/doc/imageioapi.rst
index dca5a66da5..1c57c39383 100644
--- a/src/doc/imageioapi.rst
+++ b/src/doc/imageioapi.rst
@@ -215,49 +215,12 @@ These helper functions are not part of any other OpenImageIO class, they
 just exist in the OpenImageIO namespace as general utilities. (See
 :ref:`sec-pythonmiscapi` for the corresponding Python bindings.)
 
-.. doxygenfunction:: OIIO::attribute(string_view, TypeDesc, const void *)
-
-.. cpp:function:: bool OIIO::attribute(string_view name, int val)
-                  bool OIIO::attribute(string_view name, float val)
-                  bool OIIO::attribute(string_view name, string_view val)
-
-    Shortcuts for setting an attribute to a single int, float, or string.
-
-
-.. doxygenfunction:: OIIO::getattribute(string_view, TypeDesc, void *)
-
-
-.. cpp:function:: bool getattribute (string_view name, int &val)
-                  bool getattribute (string_view name, float &val)
-                  bool getattribute (string_view name, char **val)
-                  bool getattribute (string_view name, std::string& val)
-
-    Specialized versions of `getattribute()` in which the data type is
-    implied by the type of the argument (for single int, float, or string).
-    Two string versions exist: one that retrieves it as a `std::string` and
-    another that retrieves it as a `char *`. In all cases, the return value
-    is `true` if the attribute is found and the requested data type
-    conversion was legal.
-
-    EXAMPLES::
-
-        int threads;
-        OIIO::getattribute ("threads", &threads);
-        std::string path;
-        OIIO::getattribute ("plugin_searchpath", path);
-
-.. cpp:function:: int get_int_attribute (string_view name, int defaultvalue=0)
-                  float get_float_attribute (string_view name, float defaultvalue=0)
-                  string_view get_string_attribute (string_view name, string_view defaultvalue="")
-
-    Specialized versions of `getattribute()` for common types, in which the
-    data is returned directly, and a supplied default value is returned if
-    the attribute was not found.
+.. doxygengroup:: OIIO_attribute
+..
 
-    EXAMPLES::
 
-        int threads = OIIO::get_int_attribute ("threads", 0);
-        string_view path = OIIO::get_string_attribute ("plugin_searchpath");
+.. doxygengroup:: OIIO_getattribute
+..
 
 
diff --git a/src/include/OpenImageIO/imageio.h b/src/include/OpenImageIO/imageio.h
index e6c1ab0552..4b9a05304b 100644
--- a/src/include/OpenImageIO/imageio.h
+++ b/src/include/OpenImageIO/imageio.h
@@ -3697,18 +3697,28 @@ OIIO_API bool has_error();
 /// error messages.
 OIIO_API std::string geterror(bool clear = true);
 
-/// `OIIO::attribute()` sets a global attribute (i.e., a property or
-/// option) of OpenImageIO. The `name` designates the name of the attribute,
-/// `type` describes the type of data, and `value` is a pointer to memory
-/// containing the new value for the attribute.
+/// @defgroup OIIO_attribute (global OIIO::attribute())
+/// @{
 ///
-/// If the name is known, valid attribute that matches the type specified,
-/// the attribute will be set to the new value and `attribute()` will return
-/// `true`.  If `name` is not recognized, or if the types do not match
-/// (e.g., `type` is `TypeFloat` but the named attribute is a string), the
-/// attribute will not be modified, and `attribute()` will return `false`.
+/// `OIIO::attribute()` sets a global attribute (i.e., a property or option)
+/// of OpenImageIO. The `name` designates the name of the attribute, `value`
+/// is the value to use for the attribute, and for some varieties of the call,
+/// `type` is a TypeDesc describing the data type.
 ///
-/// The following are the recognized attributes:
+/// Most varieties of the call will return `true` if `name` is a known
+/// attribute and its expected type is compatible with the type specified. If
+/// `name` is not recognized, or if the types do not match (e.g., `type` is
+/// `TypeFloat` but the named attribute is supposed to be a string), the
+/// internal attribute will not be modified, and `attribute()` will return
+/// `false`.
+///
+/// In all cases, it is up to the caller to ensure that `value` is or refers
+/// to the right kind and size of storage for the given type.
+///
+/// Note that all attributes set by this call may also be retrieved by
+/// `OIIO::getattribute()`.
+///
+/// RECOGNIZED ATTRIBUTES
 ///
 /// - `string options`
 ///
@@ -3927,7 +3937,25 @@ OIIO_API std::string geterror(bool clear = true);
 ///   enable globally in an environment where security is a higher priority
 ///   than being tolerant of partially broken image files.
 ///
-/// @version 3.1
+/// EXAMPLES:
+/// ```
+///     // Setting single simple values simply:
+///     bool ok = OIIO::attribute("threads", 1);  // implied: int
+///     ok = OIIO::attribute("plugin_searchpath", "/foo/bar:/baz");  // implied: string
+///
+///     // Setting a more complex value using a span, with explicit type
+///     float missing[4] = { 0.0f, 0.0f, 0.0f, 1.0f };
+///     ok = OIIO::attribute("missingcolor", TypeDesc("float[4]"), make_span(missing));
+/// ```
+///
+/// The different varieties of `OIIO::attribute()` call follow:
+
+/// Set the attribute's value from a span (which may be a single value). The
+/// total size of `value` must match the `type` (if not, an assertion will be
+/// thrown for debug builds of OIIO, an error will be printed for release
+/// builds).
+///
+/// @version 3.1+
 template<typename T>
 inline bool attribute(string_view name, TypeDesc type, span<T> value)
 {
@@ -3936,19 +3964,18 @@ inline bool attribute(string_view name, TypeDesc type, span<T> value)
     return attribute(name, type, OIIO::as_bytes(value));
 }
 
-/// A version of `OIIO::attribute()` that takes its value from a span of
-/// untyped bytes. The total size of `value` must match the `type` (if not, an
-/// assertion will be thrown for debug builds of OIIO, an error will be
-/// printed for release builds).
+/// Set the attribute's value from a span of untyped bytes. The total size of
+/// `value` must match the `type` (if not, an assertion will be thrown for
+/// debug builds of OIIO, an error will be printed for release builds).
 ///
-/// @version 3.1
+/// @version 3.1+
 OIIO_API bool attribute(string_view name, TypeDesc type, cspan<std::byte> value);
 
-/// A version of `OIIO::attribute()` where the `value` is only a pointer
-/// specifying the beginning of the memory where the value should be copied
-/// from. This is "unsafe" in the sense that there is no assurance that it
-/// points to a sufficient amount of memory, so the span-based versions of
-/// `attribute()` are preferred.
+/// Set the named attribute to the contents of memory pointed to by `value`,
+/// with the `type` implying the total size to be copied. This is "unsafe" in
+/// the sense that there is no assurance that it points to a sufficient amount
+/// of memory or to data of the right type, so the span-based versions of
+/// `attribute()` are preferred.
 ///
 /// This was added in version 2.1.
 OIIO_API bool attribute(string_view name, TypeDesc type, const void* value);
@@ -3967,12 +3994,23 @@ inline bool attribute(string_view name, string_view value) {
     const char *s = valstr.c_str();
     return attribute(name, TypeString, &s);
 }
+/// @}
+
 
-/// Get the named global attribute of OpenImageIO, store it in `value`.
-/// Return `true` if found and it was compatible with the type specified,
-/// otherwise return `false` and do not modify the contents of `value`.  It
-/// is up to the caller to ensure that `val` points to the right kind and
-/// size of storage for the given type.
+/// @defgroup OIIO_getattribute (global OIIO::getattribute())
+/// @{
+///
+/// `OIIO::getattribute()` retrieves a named global attribute of OpenImageIO,
+/// and stores it in `value`. These are the retrieval side of the symmetric
+/// set of `OIIO::attribute()` calls.
+///
+/// Most varieties of the call will return `true` if the named attribute was
+/// found and it was compatible with the type specified, otherwise return
+/// `false` and do not modify the contents of `value`.  In all cases, it is up
+/// to the caller to ensure that `value` points to the right kind and size of
+/// storage for the given type.
+///
+/// RECOGNIZED ATTRIBUTES
 ///
 /// In addition to being able to retrieve all the attributes that are
 /// documented as settable by the `OIIO::attribute()` call, `getattribute()`
@@ -4104,8 +4142,32 @@ inline bool attribute(string_view name, string_view value) {
 ///        IBA::resize                  20   0.24s   (avg  12.18ms)
 ///        IBA::zero                     8   0.66ms  (avg   0.08ms)
 ///
+/// EXAMPLES:
+/// ```
+///     // Retrieving a single simple value with success/failure return:
+///     int threads;
+///     bool ok = OIIO::getattribute("threads", threads);
+///     std::string path;
+///     ok = OIIO::getattribute("plugin_searchpath", path);
 ///
-/// @version 3.1
+///     // Directly returning a single simple value, with default to use
+///     // if the attribute is not found:
+///     int threads = OIIO::get_int_attribute("threads", 0);
+///     string_view path = OIIO::get_string_attribute("plugin_searchpath");
+///
+///     // Returning into a span, with explicit type
+///     float missing[4];
+///     ok = OIIO::getattribute("missingcolor", TypeDesc("float[4]"),
+///                             make_span(missing));
+/// ```
+///
+/// The different varieties of `OIIO::getattribute()` call follow:
+
+/// Store the named attribute's current value into a writable span. The total
+/// size of `value` must match the `type` (if not, an assertion will be thrown
+/// for debug OIIO builds, an error will be printed for release builds).
+///
+/// @version 3.1+
 template<typename T>
 inline bool getattribute(string_view name, TypeDesc type, span<T> value)
 {
@@ -4114,37 +4176,37 @@ inline bool getattribute(string_view name, TypeDesc type, span<T> value)
     return OIIO::v3_1::getattribute(name, type, OIIO::as_writable_bytes(value));
 }
 
-/// A version of `getattribute()` that stores the value in a span of
-/// untyped bytes. The total size of `value` must match the `type` (if
-/// not, an assertion will be thrown for debug OIIO builds, an error will
-/// be printed for release builds).
+/// Store the value in a span of untyped bytes. The total size of `value` must
+/// match the `type` (if not, an assertion will be thrown for debug OIIO
+/// builds, an error will be printed for release builds).
 ///
-/// @version 3.1
+/// @version 3.1+
 OIIO_API bool getattribute(string_view name, TypeDesc type,
                            span<std::byte> value);
 
-/// A version of `OIIO::getattribute()` where the `value` is only a pointer
-/// specifying the beginning of the memory where the value should be copied.
-/// This is "unsafe" in the sense that there is no assurance that it points to
-/// a sufficient amount of memory, so the span-based versions of `attribute()`
-/// are preferred.
+/// Store the value into memory pointed to by `val`. This is "unsafe" in the
+/// sense that there is no assurance that it points to a sufficient amount of
+/// memory or will be interpreted as the correct type, so the span-based
+/// versions of `getattribute()` are preferred.
 OIIO_API bool getattribute(string_view name, TypeDesc type, void* val);
 
-/// Shortcut getattribute() for retrieving a single integer. The value is
-/// placed in `value`, and the function returns `true` if the attribute was
-/// found and was legally convertible to an int.
+/// Retrieve a single-integer attribute. The value is placed in `value`, and
+/// the function returns `true` if the attribute was found and was legally
+/// convertible to an int.
 inline bool getattribute (string_view name, int &value) {
     return getattribute (name, TypeInt, &value);
 }
-/// Shortcut getattribute() for retrieving a single float. The value is placed
-/// in `value`, and the function returns `true` if the attribute was found and
-/// was legally convertible to a float.
+
+/// Retrieve a single-float attribute. The value is placed in `value`, and the
+/// function returns `true` if the attribute was found and was legally
+/// convertible to a float.
 inline bool getattribute (string_view name, float &value) {
     return getattribute (name, TypeFloat, &value);
 }
-/// Shortcut getattribute() for retrieving a single string as a `std::string`.
-/// The value is placed in `value`, and the function returns `true` if the
-/// attribute was found.
+
+/// Retrieve a single-string attribute, placed as a `std::string` into
+/// `value`, and the function returns `true` if the attribute was found and
+/// was legally convertible to a string.
 inline bool getattribute (string_view name, std::string &value) {
     ustring s;
     bool ok = getattribute (name, TypeString, &s);
@@ -4152,32 +4214,44 @@ inline bool getattribute (string_view name, std::string &value) {
         value = s.string();
     return ok;
 }
-/// Shortcut getattribute() for retrieving a single string as a `char*`.
-inline bool getattribute (string_view name, char **val) {
-    return getattribute (name, TypeString, val);
+
+/// Retrieve a single-string attribute, placed as a `const char*` into
+/// `*value`, and the function returns `true` if the attribute was found and
+/// was legally convertible to a string. Note that the `const char*`
+/// retrieved is really the characters belonging to a `ustring`, and so is
+/// owned by OIIO and should not be freed by the calling code.
+inline bool getattribute (string_view name, char **value) {
+    return getattribute (name, TypeString, value);
 }
-/// Shortcut getattribute() for retrieving a single integer, with a supplied
-/// default value that will be returned if the attribute is not found or
-/// could not legally be converted to an int.
+
+/// Retrieve a single-integer attribute, with a supplied default value that
+/// will be returned if the attribute is not found or could not legally be
+/// converted to an int.
 inline int get_int_attribute (string_view name, int defaultval=0) {
     int val;
     return getattribute (name, TypeInt, &val) ? val : defaultval;
 }
-/// Shortcut getattribute() for retrieving a single float, with a supplied
-/// default value that will be returned if the attribute is not found or
-/// could not legally be converted to a float.
+
+/// Retrieve a single-float attribute, with a supplied default value that
+/// will be returned if the attribute is not found or could not legally be
+/// converted to a float.
 inline float get_float_attribute (string_view name, float defaultval=0) {
     float val;
     return getattribute (name, TypeFloat, &val) ? val : defaultval;
 }
-/// Shortcut getattribute() for retrieving a single string, with a supplied
-/// default value that will be returned if the attribute is not found.
+
+/// Retrieve a single-string attribute, with a supplied default value that
+/// will be returned if the attribute is not found or could not legally be
+/// converted to a string. The result is returned directly as a
+/// `string_view`.
 inline string_view get_string_attribute (string_view name,
                                  string_view defaultval = string_view()) {
     ustring val;
     return getattribute (name, TypeString, &val) ? string_view(val) : defaultval;
 }
 
+/// @}
+
 
 /// Set the metadata of the `spec` to presume that color space is `name` (or
 /// to assume nothing about the color space if `name` is empty). The core

From 737fdf47ea78cf9848d21a716ed03b06335c58de Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Tue, 30 Dec 2025 11:27:23 -0800
Subject: [PATCH 14/70] deps: Test against libraw 0.21.5 (#4988)

Signed-off-by: Larry Gritz <lg@larrygritz.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 .github/workflows/ci.yml            | 8 ++++----
 INSTALL.md                          | 2 +-
 src/build-scripts/build_libraw.bash | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index eeba96601d..31d7e647ea 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -460,7 +460,7 @@ jobs:
             simd: avx2,f16c
             setenvs: export LIBJPEGTURBO_VERSION=3.1.2
                             LIBPNG_VERSION=v1.6.50
-                            LIBRAW_VERSION=0.21.4
+                            LIBRAW_VERSION=0.21.5
                             LIBTIFF_VERSION=v4.7.1
                             OPENJPEG_VERSION=v2.5.4
                             PTEX_VERSION=v2.5.0
@@ -513,7 +513,7 @@ jobs:
             simd: avx2,f16c
             setenvs: export OpenImageIO_BUILD_LOCAL_DEPS=all
                             OpenImageIO_DEPENDENCY_BUILD_VERBOSE=ON
-                            LIBRAW_VERSION=0.21.4
+                            LIBRAW_VERSION=0.21.5
                             PTEX_VERSION=v2.4.2
                             PUGIXML_VERSION=v1.14
                             WEBP_VERSION=v1.4.0
@@ -543,7 +543,7 @@ jobs:
             python_ver: "3.12"
             setenvs: export LIBJPEGTURBO_VERSION=3.1.2
                             LIBPNG_VERSION=v1.6.50
-                            LIBRAW_VERSION=0.21.4
+                            LIBRAW_VERSION=0.21.5
                             LIBTIFF_VERSION=v4.7.1
                             OPENJPEG_VERSION=v2.5.4
                             PTEX_VERSION=v2.4.3
@@ -564,7 +564,7 @@ jobs:
             python_ver: "3.12"
             setenvs: export LIBJPEGTURBO_VERSION=3.1.2
                             LIBPNG_VERSION=v1.6.50
-                            LIBRAW_VERSION=0.21.4
+                            LIBRAW_VERSION=0.21.5
                             LIBTIFF_VERSION=v4.7.1
                             OPENJPEG_VERSION=v2.5.4
                             PTEX_VERSION=v2.4.3
diff --git a/INSTALL.md b/INSTALL.md
index 68d69aa67e..becea32e95 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -47,7 +47,7 @@ NEW or CHANGED MINIMUM dependencies since the last major release are **bold**.
  * If you want support for PNG files:
      * libPNG >= 1.6.0 (tested though 1.6.50)
  * If you want support for camera "RAW" formats:
-     * LibRaw >= 0.20 (tested though 0.21.4 and master)
+     * LibRaw >= 0.20 (tested through 0.21.5 and master)
  * If you want support for a wide variety of video formats:
      * ffmpeg >= 4.0 (tested through 8.0)
  * If you want support for jpeg 2000 images:
diff --git a/src/build-scripts/build_libraw.bash b/src/build-scripts/build_libraw.bash
index b5114d3aab..2b4b3bef92 100755
--- a/src/build-scripts/build_libraw.bash
+++ b/src/build-scripts/build_libraw.bash
@@ -11,7 +11,7 @@ set -ex
 
 # Which LibRaw to retrieve, how to build it
 LIBRAW_REPO=${LIBRAW_REPO:=https://github.com/LibRaw/LibRaw.git}
-LIBRAW_VERSION=${LIBRAW_VERSION:=0.21.4}
+LIBRAW_VERSION=${LIBRAW_VERSION:=0.21.5}
 
 # Where to install the final results
 LOCAL_DEPS_DIR=${LOCAL_DEPS_DIR:=${PWD}/ext}

From e2fe4040a6b73c09cdeb769dd03d76377f5733b2 Mon Sep 17 00:00:00 2001
From: Jesse Yurkovich <jesse.y@gmail.com>
Date: Tue, 30 Dec 2025 23:45:23 -0800
Subject: [PATCH 15/70] feat(sgi): Implement RLE encoding support for output
 (#4990)

Implement RLE compression support for the SGI output plugin. Reading RLE
encoded images was already supported, but writing was never done up
until this point.

The existing sgi test seems sufficient to catch issues and it covers
input/output of both 1 byte-per-pixel and 2 byte-per-pixel files.

The documentation for the image plugins is sometimes not very clear
about which attributes are relevant for input vs. output. There are
usually 3 sections: Attributes, Attributes for Input, and Attributes for
Output.

Before this PR, SGI mentioned the "compression" attribute in the
"general" Attributes section (rather than say just the Input section),
which caused a bit of grief as the only way to discover that RLE was not
implemented for Output was to glance at the file size of the resulting
file... I had assumed that compression was supported for output too but
discovered that it was not.

Now that this PR implements the attribute for output I've left the
documentation as-is in the "general" Attributes section since it now
applies to both reading and writing. But I'm open to suggestions here.

Signed-off-by: Jesse Yurkovich <jesse.y@gmail.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 src/sgi.imageio/sgioutput.cpp | 249 +++++++++++++++++++++++++++++++---
 1 file changed, 228 insertions(+), 21 deletions(-)

diff --git a/src/sgi.imageio/sgioutput.cpp b/src/sgi.imageio/sgioutput.cpp
index 2d3c9f358d..0eb0928cd3 100644
--- a/src/sgi.imageio/sgioutput.cpp
+++ b/src/sgi.imageio/sgioutput.cpp
@@ -26,12 +26,19 @@ class SgiOutput final : public ImageOutput {
     std::string m_filename;
     std::vector<unsigned char> m_scratch;
     unsigned int m_dither;
-    std::vector<unsigned char> m_tilebuffer;
+    bool m_want_rle;
+    std::vector<unsigned char> m_uncompressed_image;
 
     void init() { ioproxy_clear(); }
 
     bool create_and_write_header();
 
+    bool write_scanline_raw(int y, const unsigned char* data);
+    bool write_scanline_rle(int y, const unsigned char* data, int64_t& offset,
+                            std::vector<int>& start_table,
+                            std::vector<int>& length_table);
+    bool write_buffered_pixels();
+
     /// Helper - write, with error detection
     template<class T>
     bool fwrite(const T* buf, size_t itemsize = sizeof(T), size_t nitems = 1)
@@ -85,10 +92,12 @@ SgiOutput::open(const std::string& name, const ImageSpec& spec, OpenMode mode)
                    ? m_spec.get_int_attribute("oiio:dither", 0)
                    : 0;
 
+    m_want_rle = m_spec.get_string_attribute("compression") == "rle";
+
     // If user asked for tiles -- which this format doesn't support, emulate
-    // it by buffering the whole image.
-    if (m_spec.tile_width && m_spec.tile_height)
-        m_tilebuffer.resize(m_spec.image_bytes());
+    // it by buffering the whole image. RLE is treated similarly.
+    if (m_want_rle || (m_spec.tile_width && m_spec.tile_height))
+        m_uncompressed_image.resize(m_spec.image_bytes());
 
     return create_and_write_header();
 }
@@ -102,32 +111,57 @@ SgiOutput::write_scanline(int y, int z, TypeDesc format, const void* data,
     y    = m_spec.height - y - 1;
     data = to_native_scanline(format, data, xstride, m_scratch, m_dither, y, z);
 
+    // If we are writing RLE data, just copy into the uncompressed buffer
+    if (m_want_rle) {
+        const auto scaneline_size = m_spec.scanline_bytes();
+        memcpy(&m_uncompressed_image[y * scaneline_size], data, scaneline_size);
+
+        return true;
+    }
+
+    return write_scanline_raw(y, (const unsigned char*)data);
+}
+
+
+
+bool
+SgiOutput::write_tile(int x, int y, int z, TypeDesc format, const void* data,
+                      stride_t xstride, stride_t ystride, stride_t zstride)
+{
+    // Emulate tiles by buffering the whole image
+    return copy_tile_to_image_buffer(x, y, z, format, data, xstride, ystride,
+                                     zstride, &m_uncompressed_image[0]);
+}
+
+
+
+bool
+SgiOutput::write_scanline_raw(int y, const unsigned char* data)
+{
     // In SGI format all channels are saved to file separately: first, all
     // channel 1 scanlines are saved, then all channel2 scanlines are saved
     // and so on.
-    //
-    // Note that since SGI images are pretty archaic and most probably
-    // people won't be too picky about full flexibility writing them, we
-    // content ourselves with only writing uncompressed data, and don't
-    // attempt to write with RLE encoding.
 
     size_t bpc = m_spec.format.size();  // bytes per channel
     std::unique_ptr<unsigned char[]> channeldata(
         new unsigned char[m_spec.width * bpc]);
 
     for (int64_t c = 0; c < m_spec.nchannels; ++c) {
-        unsigned char* cdata = (unsigned char*)data + c * bpc;
+        const unsigned char* cdata = data + c * bpc;
         for (int64_t x = 0; x < m_spec.width; ++x) {
             channeldata[x * bpc] = cdata[0];
             if (bpc == 2)
                 channeldata[x * bpc + 1] = cdata[1];
             cdata += m_spec.nchannels * bpc;  // advance to next pixel
         }
+
         if (bpc == 2 && littleendian())
             swap_endian((unsigned short*)&channeldata[0], m_spec.width);
+
         ptrdiff_t scanline_offset = sgi_pvt::SGI_HEADER_LEN
                                     + ptrdiff_t(c * m_spec.height + y)
                                           * m_spec.width * bpc;
+
         ioseek(scanline_offset);
         if (!iowrite(&channeldata[0], 1, m_spec.width * bpc)) {
             return false;
@@ -139,13 +173,179 @@ SgiOutput::write_scanline(int y, int z, TypeDesc format, const void* data,
 
 
+static bool
+data_equals(const unsigned char* data, int bpc, imagesize_t off1,
+            imagesize_t off2)
+{
+    if (bpc == 1) {
+        return data[off1] == data[off2];
+    } else {
+        return data[off1] == data[off2] && data[off1 + 1] == data[off2 + 1];
+    }
+}
+
+
+
+static void
+data_set(unsigned char* data, int bpc, imagesize_t off,
+         const unsigned char* val)
+{
+    if (bpc == 1) {
+        data[off] = val[0];
+    } else {
+        data[off]     = val[1];
+        data[off + 1] = val[0];
+    }
+}
+
+
+
+static void
+data_set(unsigned char* data, int bpc, imagesize_t off, const short val)
+{
+    if (bpc == 1) {
+        data[off] = static_cast<unsigned char>(val);
+    } else {
+        data[off]     = static_cast<unsigned char>(val >> 8);
+        data[off + 1] = static_cast<unsigned char>(val & 0xFF);
+    }
+}
+
+
+
 bool
-SgiOutput::write_tile(int x, int y, int z, TypeDesc format, const void* data,
-                      stride_t xstride, stride_t ystride, stride_t zstride)
+SgiOutput::write_scanline_rle(int y, const unsigned char* data, int64_t& offset,
+                              std::vector<int>& offset_table,
+                              std::vector<int>& length_table)
 {
-    // Emulate tiles by buffering the whole image
-    return copy_tile_to_image_buffer(x, y, z, format, data, xstride, ystride,
-                                     zstride, &m_tilebuffer[0]);
+    const size_t bpc     = m_spec.format.size();  // bytes per channel
+    const size_t xstride = m_spec.nchannels * bpc;
+    const imagesize_t scanline_bytes = m_spec.scanline_bytes();
+
+    // Account for the worst case length when every pixel is different
+    m_scratch.resize(bpc * (m_spec.width + (m_spec.width / 127 + 2)));
+
+    for (int64_t c = 0; c < m_spec.nchannels; ++c) {
+        const unsigned char* cdata = data + c * bpc;
+
+        imagesize_t out = 0;
+        imagesize_t pos = 0;
+        while (pos < scanline_bytes) {
+            imagesize_t start = pos;
+            // Find the first run meeting a minimum length of 3
+            imagesize_t ahead_1 = pos + xstride;
+            imagesize_t ahead_2 = pos + xstride * 2;
+            while (ahead_2 < scanline_bytes
+                   && (!data_equals(cdata, bpc, ahead_1, ahead_2)
+                       || !data_equals(cdata, bpc, pos, ahead_1))) {
+                pos += xstride;
+                ahead_1 += xstride;
+                ahead_2 += xstride;
+            }
+            if (ahead_2 >= scanline_bytes) {
+                // No more runs, just dump the rest as literals
+                pos = scanline_bytes;
+            }
+            int count = int((pos - start) / xstride);
+            while (count) {
+                int todo = (count > 127) ? 127 : count;
+                count -= todo;
+                data_set(m_scratch.data(), bpc, out, 0x80 | todo);
+                out += bpc;
+                while (todo) {
+                    data_set(m_scratch.data(), bpc, out, cdata + start);
+                    out += bpc;
+                    start += xstride;
+                    todo -= 1;
+                }
+            }
+            start = pos;
+            if (start >= scanline_bytes)
+                break;
+            pos += xstride;
+            while (pos < scanline_bytes
+                   && data_equals(cdata, bpc, start, pos)) {
+                pos += xstride;
+            }
+            count = int((pos - start) / xstride);
+            while (count) {
+                int curr_run = (count > 127) ? 127 : count;
+                count -= curr_run;
+                data_set(m_scratch.data(), bpc, out, curr_run);
+                out += bpc;
+                data_set(m_scratch.data(), bpc, out, cdata + start);
+                out += bpc;
+            }
+        }
+        data_set(m_scratch.data(), bpc, out, short(0));
+        out += bpc;
+
+        // Fill in details about the scanline
+        const int table_index     = c * m_spec.height + y;
+        offset_table[table_index] = static_cast<int>(offset);
+        length_table[table_index] = static_cast<int>(out);
+
+        // Write the compressed data
+        if (!iowrite(&m_scratch[0], 1, out))
+            return false;
+        offset += out;
+    }
+
+    return true;
+}
+
+
+
+bool
+SgiOutput::write_buffered_pixels()
+{
+    OIIO_ASSERT(m_uncompressed_image.size());
+
+    const auto scanline_bytes = m_spec.scanline_bytes();
+    if (m_want_rle) {
+        // Prepare RLE tables
+        const int64_t table_size       = m_spec.height * m_spec.nchannels;
+        const int64_t table_size_bytes = table_size * sizeof(int);
+        std::vector<int> offset_table;
+        std::vector<int> length_table;
+        offset_table.resize(table_size);
+        length_table.resize(table_size);
+
+        // Skip over the tables and start at the data area
+        int64_t offset = sgi_pvt::SGI_HEADER_LEN + 2 * table_size_bytes;
+        ioseek(offset);
+
+        // Write RLE compressed data
+        for (int y = 0; y < m_spec.height; ++y) {
+            const unsigned char* scanline_data
+                = &m_uncompressed_image[y * scanline_bytes];
+            if (!write_scanline_rle(y, scanline_data, offset, offset_table,
+                                    length_table))
+                return false;
+        }
+
+        // Write the tables now that they're filled in with offsets/lengths
+        ioseek(sgi_pvt::SGI_HEADER_LEN);
+        if (littleendian()) {
+            swap_endian(&offset_table[0], table_size);
+            swap_endian(&length_table[0], table_size);
+        }
+        if (!iowrite(&offset_table[0], 1, table_size_bytes))
+            return false;
+        if (!iowrite(&length_table[0], 1, table_size_bytes))
+            return false;
+
+    } else {
+        // Write raw data
+        for (int y = 0; y < m_spec.height; ++y) {
+            unsigned char* scanline_data
+                = &m_uncompressed_image[y * scanline_bytes];
+            if (!write_scanline_raw(y, scanline_data))
+                return false;
+        }
+    }
+
+    return true;
 }
 
 
@@ -160,15 +360,22 @@ SgiOutput::close()
 
     bool ok = true;
     if (m_spec.tile_width) {
-        // Handle tile emulation -- output the buffered pixels
-        OIIO_ASSERT(m_tilebuffer.size());
+        // We've been emulating tiles; now dump as scanlines.
+        OIIO_ASSERT(m_uncompressed_image.size());
         ok &= write_scanlines(m_spec.y, m_spec.y + m_spec.height, 0,
-                              m_spec.format, &m_tilebuffer[0]);
-        m_tilebuffer.clear();
-        m_tilebuffer.shrink_to_fit();
+                              m_spec.format, &m_uncompressed_image[0]);
     }
 
+    // If we want RLE encoding or we were tiled, output all the processed scanlines now.
+    if (ok && (m_want_rle || m_spec.tile_width)) {
+        ok &= write_buffered_pixels();
+    }
+
+    m_uncompressed_image.clear();
+    m_uncompressed_image.shrink_to_fit();
+
     init();
+
     return ok;
 }
 
@@ -179,7 +386,7 @@ SgiOutput::create_and_write_header()
 {
     sgi_pvt::SgiHeader sgi_header;
     sgi_header.magic   = sgi_pvt::SGI_MAGIC;
-    sgi_header.storage = sgi_pvt::VERBATIM;
+    sgi_header.storage = m_want_rle ? sgi_pvt::RLE : sgi_pvt::VERBATIM;
     sgi_header.bpc     = m_spec.format.size();
 
     if (m_spec.height == 1 && m_spec.nchannels == 1)

From 8a640bd50c366af15e128e1847c92236784dc25b Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Thu, 1 Jan 2026 11:03:27 -0800
Subject: [PATCH 16/70] docs: Update CHANGES (#4991)

Signed-off-by: Larry Gritz <lg@larrygritz.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 CHANGES.md | 106 +++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 91 insertions(+), 15 deletions(-)

diff --git a/CHANGES.md b/CHANGES.md
index 79d3eb436e..f1ed5b64c0 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -10,29 +10,54 @@ Release 3.2 (target: Sept 2026?) -- compared to 3.1
 * *ImageCache/TextureSystem*:
 * New global attribute queries via OIIO::getattribute():
 * Miscellaneous API changes:
-    - *api*: Versioned namespace to preserve ABI compatibility between minor releases [#4869](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4869) (3.2.0.0)
+  - *api*: Versioned namespace to preserve ABI compatibility between minor releases [#4869](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4869) (3.2.0.0)
+* Color management improvements:
+  - Fix some legacy 'Linear' color references [#4959](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4959) (3.2.0.0)
+  - Auto convert between oiio:ColorSpace and CICP attributes in I/O [#4964](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4964) (by Brecht Van Lommel) (3.0.14.0, 3.2.0.0)
+  - *openexr*: Write OpenEXR colorInteropID metadata based on oiio:ColorSpace [#4967](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4967) (by Brecht Van Lommel) (3.0.14.0, 3.2.0.0)
+  - *jpeg-xl*: CICP read and write support for JPEG-XL [#4968](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4968) (by Brecht Van Lommel) (3.2.0.0, 3.1.9.0)
+  - *jpeg-xl*: ICC read and write for JPEG-XL files (issue 4649) [#4905](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4905) (by shanesmith-dwa) (3.0.14.0, 3.2.0.0)
 ### 🚀  Performance improvements
 ### 🐛  Fixes and feature enhancements
-  - *ffmpeg*: 10 bit video had wrong green channel [#4935](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4935) (by Brecht Van Lommel) (3.1.7.0, 3.2.0.0)
-  - *iff*: Handle non-zero origin, protect against buffer overflows [#4925](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4925) (3.1.7.0, 3.2.0.0)
+  - *IBA*: IBA::compare_Yee() accessed the wrong channel [#4976](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4976) (by Pavan Madduri) (3.2.0.0)
+  - *exif*: Support EXIF 3.0 tags [#4961](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4961) (3.2.0.0)
+  - *imagebuf*: Fix set_pixels bug, didn't consider roi = All [#4949](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4949) (3.2.0.0)
+  - *ffmpeg*: 10 bit video had wrong green channel [#4935](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4935) (by Brecht Van Lommel) (3.2.0.0, 3.1.7.0)
+  - *iff*: Handle non-zero origin, protect against buffer overflows [#4925](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4925) (3.2.0.0, 3.1.7.0)
+  - *jpeg*: Fix wrong pointers/crashing when decoding CMYK jpeg files [#4963](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4963) (3.2.0.0)
+  - *jpeg-2000*: Type warning in assertion in jpeg2000output.cpp [#4952](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4952) (3.2.0.0)
   - *jpeg-xl*: ICC read and write for JPEG-XL files (issue 4649) [#4905](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4905) (by shanesmith-dwa) (3.2.0.0)
-  - *jpeg-xl*: Correctly set Quality for JPEG XL [#4933](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4933) (3.1.7.0, 3.2.0.0)
-  - *openexr*: Support for idManifest and deepImageState (experimental) [#4877](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4877) (3.1.7.0, 3.2.0.0)
-  - *openexr*: ACES Container hint for exr outputs [#4907](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4907) (by Oktay Comu) (3.1.7.0, 3.2.0.0)
+  - *jpeg-xl*: Correctly set Quality for JPEG XL [#4933](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4933) (3.2.0.0, 3.1.7.0)
+  - *jpeg-xl*: CICP read and write support for JPEG XL [#4968](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4968) (by Brecht Van Lommel) (3.2.0.0, 3.1.9.0)
+  - *openexr*: Support for idManifest and deepImageState (experimental) [#4877](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4877) (3.2.0.0, 3.1.7.0)
+  - *openexr*: ACES Container hint for exr outputs [#4907](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4907) (by Oktay Comu) (3.2.0.0, 3.1.7.0)
+  - *openexr*: Write OpenEXR colorInteropID metadata based on oiio:ColorSpace [#4967](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4967) (by Brecht Van Lommel) (3.0.14.0, 3.2.0.0)
+  - *openexr*: Improve attribute translation rules [#4946](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4946) (3.2.0.0)
+  - *openexr*: ACES container writes colorInteropId instead of colorInteropID [#4966](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4966) (by Brecht Van Lommel) (3.2.0.0)
+  - *png*: We were not correctly suppressing hint metadata [#4983](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4983) (3.2.0.0)
+  - *sgi*: Implement RLE encoding support for output [#4990](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4990) (by Jesse Yurkovich) (3.2.0.0)
+  - *webp*: Allow out-of-order scanlines when writing webp [#4973](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4973) (by Pavan Madduri) (3.2.0.0)
 ### 🔧  Internals and developer goodies
+  - *filesystem.h*: Speedup to detect the existence of files on Windows [#4977](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4977) (by JacksonSun-adsk) (3.2.0.0)
 ### 🏗  Build/test/CI and platform ports
 * OIIO's CMake build system and scripts:
-    - *build*: Allow auto-build of just required packages by setting `OpenImageIO_BUILD_MISSING_DEPS` to `required`. [#4927](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4927) (3.1.7.0, 3.2.0.0)
-    - *build*: Make dependency report more clear about what was required [#4929](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4929) (3.1.7.0, 3.2.0.0)
+  - *build*: Allow auto-build of just required packages by setting `OpenImageIO_BUILD_MISSING_DEPS` to `required`. [#4927](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4927) (3.2.0.0, 3.1.7.0)
+  - *build*: Make dependency report more clear about what was required [#4929](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4929) (3.2.0.0, 3.1.7.0)
 * Dependency and platform support:
-    - *build/deps*: Additional auto-build capabilities for dependencies that are not found: GIF library [#4921](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4921) (by Valery Angelique), OpenJPEG [#4911](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4911) (by Danny Greenstein) (3.1.7.0, 3.2.0.0)
+  - *deps*: Additional auto-build capabilities for dependencies that are not found: GIF library [#4921](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4921) (by Valery Angelique), OpenJPEG [#4911](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4911) (by Danny Greenstein) (3.2.0.0, 3.1.7.0)
+  - *deps*: Disable LERC in libTIFF local build script [#4957](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4957) (by LI JI) (3.2.0.0, 3.1.8.0)
+  - *deps*: Test against libraw 0.21.5 [#4988](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4988) (3.2.0.0, 3.1.9.0)
 * Testing and Continuous integration (CI) systems:
-    - *ci*: Python wheel building improvements: use ccache [#4924](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4924) (by Larry Gritz), unbreak wheel release + other enhancements pt 1 [#4937](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4937) (by Zach Lewis) (3.1.7.0, 3.2.0.0)
-    - *ci*: Simplify ci workflow by using build-steps for old aswf containers, too [#4932](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4932) (3.1.7.0, 3.2.0.0)
-    - *ci*: We were not correctly setting fmt version from job options [#4939](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4939) (3.1.7.0, 3.2.0.0)
+  - *tests*: Image_span_test reduce benchmark load for debug and CI renders [#4951](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4951) (3.2.0.0, 3.1.8.0)
+  - *ci*: Python wheel building improvements: use ccache [#4924](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4924) (by Larry Gritz), unbreak wheel release + other enhancements pt 1 [#4937](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4937) (by Zach Lewis) (3.2.0.0, 3.1.7.0)
+  - *ci*: Simplify ci workflow by using build-steps for old aswf containers, too [#4932](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4932) (3.2.0.0, 3.1.7.0)
+  - *ci*: We were not correctly setting fmt version from job options [#4939](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4939) (3.2.0.0, 3.1.7.0)
+  - *ci*: Emergency fix change deprecated sonarqube action [#4969](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4969) (3.2.0.0)
+  - *ci*: Try python 3.13 to fix Mac breakage on CI [#4970](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4970) (3.2.0.0)
 ### 📚  Notable documentation changes
-  - *docs*: Update/correct explanation of "openexr:core" attribute, and typo fixes [#4943](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4943) (3.1.7.0, 3.2.0.0)
+  - *docs*: Update/correct explanation of "openexr:core" attribute, and typo fixes [#4943](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4943) (3.2.0.0, 3.1.7.0)
 ### 🏢  Project Administration
+  - *admin*: Minor rewording in the issue and PR templates [#4982](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4982) (3.2.0.0)
 ### 🤝  Contributors
 
 ---
@@ -40,7 +65,43 @@ Release 3.2 (target: Sept 2026?) -- compared to 3.1
 
 
-Release 3.1.7.0 (Nov 1, 2025) -- compared to 3.1.7.0
+Release 3.1.9.0 (Jan 1, 2026) -- compared to 3.1.8.0
+----------------------------------------------------
+  - Color management improvements:
+      - Auto convert between oiio:ColorSpace and CICP attributes in I/O [#4964](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4964) (by Brecht Van Lommel)
+      - *exr*: Write OpenEXR colorInteropID metadata based on oiio:ColorSpace [#4967](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4967) (by Brecht Van Lommel)
+      - *jpeg-xl*: CICP read and write support for JPEG-XL [#4968](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4968) (by Brecht Van Lommel)
+      - *jpeg-xl*: ICC read and write for JPEG-XL files (issue 4649) [#4905](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4905) (by shanesmith-dwa)
+  - *png*: We were not correctly suppressing hint metadata [#4983](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4983)
+  - *sgi*: Implement RLE encoding support for output [#4990](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4990) (by Jesse Yurkovich)
+  - *webp*: Allow out-of-order scanlines when writing webp [#4973](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4973) (by Pavan Madduri)
+  - *fix/IBA*: IBA::compare_Yee() accessed the wrong channel [#4976](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4976) (by Pavan Madduri)
+  - *perf/filesystem.h*: Speedup to detect the existence of files on Windows [#4977](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4977) (by JacksonSun-adsk)
+  - *ci*: Address tight disk space on GHA runners [#4974](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4974)
+  - *ci*: Optimize install_homebrew_deps by coalescing installs [#4975](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4975)
+  - *ci*: Build_Ptex.bash should build Ptex using C++17 [#4978](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4978)
+  - *ci*: Unbreak CI by adjusting Ubuntu installs [#4981](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4981)
+  - *ci*: Test against libraw 0.21.5 [#4988](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4988)
+  - *docs*: Fix missing docs for `OIIO::attribute()` and `OIIO::getattribute()` [#4987](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4987)
+
+
+Release 3.1.8.0 (Dec 1, 2025) -- compared to 3.1.7.0
+----------------------------------------------------
+  - *exif*: Support EXIF 3.0 tags [#4961](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4961)
+  - *jpeg*: Fix wrong pointers/crashing when decoding CMYK jpeg files [#4963](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4963)
+  - *openexr*: Improve attribute translation rules [#4946](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4946)
+  - *openexr*: ACES container writes colorInteropId instead of colorInteropID [#4966](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4966) (by Brecht Van Lommel)
+  - *color mgmt*: Fix some legacy 'Linear' color references [#4959](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4959)
+  - *imagebuf*: Fix `ImageBuf::set_pixels()` bug, didn't consider roi = All [#4949](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4949)
+  - *tests*: Image_span_test reduce benchmark load for debug and CI renders [#4951](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4951)
+  - *build*: Type warning in assertion in jpeg2000output.cpp [#4952](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4952)
+  - *build*: Disable LERC in libTIFF local build script [#4957](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4957) (by LI JI)
+  - *ci*: Fix broken ci, debug and static cases, bump some latest [#4954](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4954)
+  - *ci*: Unbreak icc/icx CI [#4958](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4958)
+  - *admin*: Update some license notices [#4955](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4955)
+
+
+Release 3.1.7.0 (Nov 1, 2025) -- compared to 3.1.6.1
 ----------------------------------------------------
   - *openexr*: Support for idManifest and deepImageState (experimental) [#4877](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4877) (3.1.7.0)
   - *openexr*: ACES Container hint for exr outputs [#4907](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4907) (by Oktay Comu) (3.1.7.0)
@@ -67,7 +128,7 @@ Release 3.1.6.2 (Oct 3, 2025) -- compared to 3.1.6.1
   - *oiioversion.h*: Restore definition of `OIIO_NAMESPACE_USING` macro [#4920](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4920)
 
 
-Release 3.1 (Oct 2 2025) -- compared to 3.0.x
+Release 3.1 (Oct 2, 2025) -- compared to 3.0.x
 -----------------------------------------------------
 - Beta 1: Aug 22, 2025
 - Beta 2: Sep 19, 2025
@@ -383,6 +444,21 @@ asterisk) had not previously contributed to the project.
 ---
 
 
+Release 3.0.14.0 (Jan 1, 2026) -- compared to 3.0.13.0
+-------------------------------------------------------
+  - *fix(IBA)*: IBA::compare_Yee() accessed the wrong channel [#4976](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4976) (by Pavan Madduri)
+  - *ci*: Test against libraw 0.21.5 [#4988](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4988)
+  - *ci*: Address tight disk space on GHA runners [#4974](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4974)
+
+
+Release 3.0.13.0 (Dec 1, 2025) -- compared to 3.0.12.0
+-------------------------------------------------------
+  - *exif*: Support EXIF 3.0 tags [#4961](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4961)
+  - *build*: Disable LERC in libTIFF local build script [#4957](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4957) (by LI JI)
+  - *ci*: Fix broken ci, debug and static cases, bump some latest [#4954](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4954)
+  - *ci*: Unbreak icc/icx CI [#4958](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4958)
+
+
 Release 3.0.12.0 (Nov 1, 2025) -- compared to 3.0.11.0
 -------------------------------------------------------
   - *iff*: Handle non-zero origin, protect against buffer overflows [#4925](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4925)

From ccf895ea1d66e18df949b4c58a183e2e60e70f32 Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Mon, 5 Jan 2026 10:31:46 -0800
Subject: [PATCH 17/70] deps: libheif 1.21 support (#4992)

Starting with 1.21, libheif seems to change behavior: When no CICP
metadata is present, libheif now returns 2,2,2 (all unspecified) on
read. OIIO convention, though, is to not set the attribute if valid CICP
data is not in the file.

---------

Signed-off-by: Larry Gritz <lg@larrygritz.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 INSTALL.md                     | 2 +-
 src/heif.imageio/heifinput.cpp | 9 ++++++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/INSTALL.md b/INSTALL.md
index becea32e95..1bcc407d68 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -64,7 +64,7 @@ NEW or CHANGED MINIMUM dependencies since the last major release are **bold**.
      * giflib >= 5.0 (tested through 5.2.2)
  * If you want support for HEIF/HEIC or AVIF images:
      * libheif >= 1.11 (1.16 required for correct orientation support,
-       tested through 1.20.2)
+       tested through 1.21.1)
      * libheif must be built with an AV1 encoder/decoder for AVIF support.
  * If you want support for DICOM medical image files:
      * DCMTK >= 3.6.1 (tested through 3.6.9)
diff --git a/src/heif.imageio/heifinput.cpp b/src/heif.imageio/heifinput.cpp
index 349bcdb1d4..9e0230a86c 100644
--- a/src/heif.imageio/heifinput.cpp
+++ b/src/heif.imageio/heifinput.cpp
@@ -287,7 +287,14 @@ HeifInput::seek_subimage(int subimage, int miplevel)
             m_ihandle.get_raw_image_handle(), &nclx);
 
         if (nclx) {
-            if (err.code == heif_error_Ok) {
+            // When CICP metadata is not present in the file, libheif returns
+            // unspecified since v1.21. Ignore it then.
+            if (err.code == heif_error_Ok
+                && !(nclx->color_primaries == heif_color_primaries_unspecified
+                     && nclx->transfer_characteristics
+                            == heif_transfer_characteristic_unspecified
+                     && nclx->matrix_coefficients
+                            == heif_matrix_coefficients_unspecified)) {
                 const int cicp[4] = { int(nclx->color_primaries),
                                       int(nclx->transfer_characteristics),
                                       int(nclx->matrix_coefficients),

From c9bff4420bd36d51de18de2edc68319cd57eef1b Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Thu, 8 Jan 2026 11:47:37 -0800
Subject: [PATCH 18/70] perf: IBA::resample improvements to speed up 20x or
 more (#4993)

For IBA::resample() when bilinear interpolation is used, almost all of
the expense was due to its relying on ImageBuf::interppixel which is
simple but constructs a new ImageBuf::ConstIterator EVERY TIME, which is
very expensive.

Reimplement in a way that reuses a single iterator. This speeds up
IBA::resample by 20x or more typically.

Also refactor resample to pull the handling of deep images into a
separate helper function and out of the main inner loop. And add some
benchmarking.

Signed-off-by: Larry Gritz <lg@larrygritz.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 src/libOpenImageIO/imagebufalgo_test.cpp  |  36 +++++++
 src/libOpenImageIO/imagebufalgo_xform.cpp | 111 ++++++++++++++++++++++
 2 files changed, 147 insertions(+)

diff --git a/src/libOpenImageIO/imagebufalgo_test.cpp b/src/libOpenImageIO/imagebufalgo_test.cpp
index 226c80cc8c..ff1604757f 100644
--- a/src/libOpenImageIO/imagebufalgo_test.cpp
+++ b/src/libOpenImageIO/imagebufalgo_test.cpp
@@ -721,6 +721,41 @@ test_zover()
 
 
+// Benchmark ImageBufAlgo::resample (timing only, no correctness checks)
+void
+test_resample()
+{
+    std::cout << "test resample\n";
+
+    // Timing harness; results are reported in milliseconds
+    Benchmarker bench;
+    bench.units(Benchmarker::Unit::ms);
+
+    ImageSpec spec_hd_rgba_f(1920, 1080, 4, TypeFloat);
+    ImageSpec spec_hd_rgba_u8(1920, 1080, 4, TypeUInt8);
+    ImageBuf buf_hd_rgba_f(spec_hd_rgba_f);
+    ImageBuf buf_hd_rgba_u8(spec_hd_rgba_u8);
+    float red_rgba[] = { 1.0, 0.0, 0.0, 1.0 };
+    ImageBufAlgo::fill(buf_hd_rgba_f, red_rgba);
+    ImageBufAlgo::fill(buf_hd_rgba_u8, red_rgba);
+    ImageBuf smallf(ImageSpec(1024, 512, 4, TypeFloat));
+    ImageBuf smallu8(ImageSpec(1024, 512, 4, TypeUInt8));
+    bench("  IBA::resample HD->1024x512 rgba f->f    interp   ",
+          [&]() { ImageBufAlgo::resample(smallf, buf_hd_rgba_f, true); });
+    bench("  IBA::resample HD->1024x512 rgba f->u8   interp   ",
+          [&]() { ImageBufAlgo::resample(smallu8, buf_hd_rgba_f, true); });
+    bench("  IBA::resample HD->1024x512 rgba u8->u8  interp   ",
+          [&]() { ImageBufAlgo::resample(smallu8, buf_hd_rgba_u8, true); });
+    bench("  IBA::resample HD->1024x512 rgba f->f   no interp ",
+          [&]() { ImageBufAlgo::resample(smallf, buf_hd_rgba_f, false); });
+    bench("  IBA::resample HD->1024x512 rgba f->u8  no interp ",
+          [&]() { ImageBufAlgo::resample(smallu8, buf_hd_rgba_f, false); });
+    bench("  IBA::resample HD->1024x512 rgba u8->u8 no interp ",
+          [&]() { ImageBufAlgo::resample(smallu8, buf_hd_rgba_u8, false); });
+}
+
+
+
 // Tests ImageBufAlgo::compare
 void
 test_compare()
@@ -1662,6 +1697,7 @@ main(int argc, char** argv)
     test_over(TypeFloat);
     test_over(TypeHalf);
     test_zover();
+    test_resample();
     test_compare();
     test_isConstantColor();
     test_isConstantChannel();
diff --git a/src/libOpenImageIO/imagebufalgo_xform.cpp b/src/libOpenImageIO/imagebufalgo_xform.cpp
index ca3fa3e960..3768dc6263 100644
--- a/src/libOpenImageIO/imagebufalgo_xform.cpp
+++ b/src/libOpenImageIO/imagebufalgo_xform.cpp
@@ -1070,6 +1070,35 @@ ImageBufAlgo::fit(const ImageBuf& src, KWArgs options, ROI roi, int nthreads)
 
 
+// Bilinearly interpolate `img` at (x,y) into `pixel`, like the internals of
+// ImageBuf::interppixel(), but reusing the caller's iterator rather than
+// constructing a new one per call (about 20x faster). `it` must already be
+// associated with `img`, but need not be positioned. Always returns true.
+template<class T>
+static bool
+interppixel(const ImageBuf& img, ImageBuf::ConstIterator<T>& it, float x,
+            float y, span<float> pixel, ImageBuf::WrapMode wrap)
+{
+    int n             = std::min(int(pixel.size()), img.spec().nchannels);
+    float* localpixel = OIIO_ALLOCA(float, n * 4);
+    float* p[4]       = { localpixel, localpixel + n, localpixel + 2 * n,
+                          localpixel + 3 * n };
+    x -= 0.5f;
+    y -= 0.5f;
+    int xtexel, ytexel;
+    float xfrac, yfrac;
+    xfrac = floorfrac(x, &xtexel);
+    yfrac = floorfrac(y, &ytexel);
+    it.rerange(xtexel, xtexel + 2, ytexel, ytexel + 2, 0, 1, wrap);
+    for (int i = 0; i < 4; ++i, ++it)
+        for (int c = 0; c < n; ++c)
+            p[i][c] = it[c];  //NOSONAR
+    bilerp(p[0], p[1], p[2], p[3], xfrac, yfrac, n, pixel.data());
+    return true;
+}
+
+
+
 template<typename DSTTYPE, typename SRCTYPE>
 static bool
 resample_scalar(ImageBuf& dst, const ImageBuf& src, bool interpolate, ROI roi,
@@ -1412,6 +1441,88 @@ resample_(ImageBuf& dst, const ImageBuf& src, bool interpolate, ROI roi,
 }
 
 
+static bool
+resample_deep(ImageBuf& dst, const ImageBuf& src, bool interpolate, ROI roi,
+              int nthreads)
+{
+    // If it's deep, figure out the sample allocations first, because
+    // it's not thread-safe to do that simultaneously with copying the
+    // values.
+    const ImageSpec& srcspec(src.spec());
+    const ImageSpec& dstspec(dst.spec());
+    float srcfx          = srcspec.full_x;
+    float srcfy          = srcspec.full_y;
+    float srcfw          = srcspec.full_width;
+    float srcfh          = srcspec.full_height;
+    float dstpixelwidth  = 1.0f / dstspec.full_width;
+    float dstpixelheight = 1.0f / dstspec.full_height;
+    ImageBuf::ConstIterator<float> srcpel(src, roi);
+    ImageBuf::Iterator<float> dstpel(dst, roi);
+    for (; !dstpel.done(); ++dstpel, ++srcpel) {
+        float s   = (dstpel.x() - dstspec.full_x + 0.5f) * dstpixelwidth;
+        float t   = (dstpel.y() - dstspec.full_y + 0.5f) * dstpixelheight;
+        int src_y = ifloor(srcfy + t * srcfh);
+        int src_x = ifloor(srcfx + s * srcfw);
+        srcpel.pos(src_x, src_y, 0);
+        dstpel.set_deep_samples(srcpel.deep_samples());
+    }
+
+    OIIO_ASSERT(src.deep() == dst.deep());
+    ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) {
+        const ImageSpec& srcspec(src.spec());
+        const ImageSpec& dstspec(dst.spec());
+        int nchannels = src.nchannels();
+
+        // Local copies of the source image window, converted to float
+        float srcfx = srcspec.full_x;
+        float srcfy = srcspec.full_y;
+        float srcfw = srcspec.full_width;
+        float srcfh = srcspec.full_height;
+
+        float dstfx          = dstspec.full_x;
+        float dstfy          = dstspec.full_y;
+        float dstfw          = dstspec.full_width;
+        float dstfh          = dstspec.full_height;
+        float dstpixelwidth  = 1.0f / dstfw;
+        float dstpixelheight = 1.0f / dstfh;
+
+        ImageBuf::Iterator<float> out(dst, roi);
+        ImageBuf::ConstIterator<float> srcpel(src);
+        for (int y = roi.ybegin; y < roi.yend; ++y) {
+            // s,t are NDC space
+            float t = (y - dstfy + 0.5f) * dstpixelheight;
+            // src_xf, src_yf are image space float coordinates
+            float src_yf = srcfy + t * srcfh;
+            // src_x, src_y are image space integer coordinates of the floor
+            int src_y = ifloor(src_yf);
+            for (int x = roi.xbegin; x < roi.xend; ++x, ++out) {
+                float s      = (x - dstfx + 0.5f) * dstpixelwidth;
+                float src_xf = srcfx + s * srcfw;
+                int src_x    = ifloor(src_xf);
+                srcpel.pos(src_x, src_y, 0);
+                int nsamps = srcpel.deep_samples();
+                OIIO_DASSERT(nsamps == out.deep_samples());
+                if (!nsamps || nsamps != out.deep_samples())
+                    continue;  // nothing to copy, or sample counts disagree
+                for (int c = 0; c < nchannels; ++c) {
+                    if (dstspec.channelformat(c) == TypeDesc::UINT32)
+                        for (int samp = 0; samp < nsamps; ++samp)
+                            out.set_deep_value(c, samp,
+                                               srcpel.deep_value_uint(c, samp));
+                    else
+                        for (int samp = 0; samp < nsamps; ++samp)
+                            out.set_deep_value(c, samp,
+                                               srcpel.deep_value(c, samp));
+                }
+            }
+        }
+    });
+
+    return true;
+}
+
+
+
 bool
 ImageBufAlgo::resample(ImageBuf& dst, const ImageBuf& src, bool interpolate,
                        ROI roi, int nthreads)

From f078f0578b2e4260d5a0f143af6c83bb66d37c51 Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Fri, 9 Jan 2026 12:33:27 -0800
Subject: [PATCH 19/70] ci/deps: Freetype adjustments (#4999)

* CI test vs the latest freetype 2.14.1
* Bump the version of freetype that we auto-build to the latest (from
2.13.2)
* Simplify BZip2 finding logic, switch to using targets

Signed-off-by: Larry Gritz <lg@larrygritz.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 .github/workflows/ci.yml              | 8 ++++----
 src/build-scripts/build_Freetype.bash | 2 +-
 src/cmake/build_Freetype.cmake        | 6 +++---
 src/cmake/externalpackages.cmake      | 6 ++----
 src/ffmpeg.imageio/CMakeLists.txt     | 2 +-
 src/libOpenImageIO/CMakeLists.txt     | 2 +-
 6 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 31d7e647ea..232ce657ef 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -466,7 +466,7 @@ jobs:
                             PTEX_VERSION=v2.5.0
                             PUGIXML_VERSION=v1.15
                             WEBP_VERSION=v1.6.0
-                            FREETYPE_VERSION=VER-2-14-0
+                            FREETYPE_VERSION=VER-2-14-1
                             USE_OPENVDB=0
             # Ensure we are testing all the deps we think we are. We would
             # like this test to have minimal missing dependencies.
@@ -549,7 +549,7 @@ jobs:
                             PTEX_VERSION=v2.4.3
                             PUGIXML_VERSION=v1.15
                             WEBP_VERSION=v1.6.0
-                            FREETYPE_VERSION=VER-2-14-0
+                            FREETYPE_VERSION=VER-2-14-1
                             USE_OPENVDB=0
           - desc: Linux ARM latest releases clang18 C++20 py3.12 exr3.4 ocio2.4
             nametag: linux-arm-latest-releases-clang
@@ -570,7 +570,7 @@ jobs:
                             PTEX_VERSION=v2.4.3
                             PUGIXML_VERSION=v1.15
                             WEBP_VERSION=v1.6.0
-                            FREETYPE_VERSION=VER-2-14-0
+                            FREETYPE_VERSION=VER-2-14-1
                             USE_OPENVDB=0
 
 
@@ -683,7 +683,7 @@ jobs:
       # built. But we would like to add more dependencies and reduce this list
       # of exceptions in the future.
       required_deps: ${{ matrix.required_deps || 'all' }}
-      optional_deps: ${{ matrix.optional_deps || 'CUDAToolkit;DCMTK;FFmpeg;GIF;JXL;Libheif;LibRaw;Nuke;OpenCV;OpenGL;OpenJPEG;openjph;OpenCV;OpenVDB;Ptex;pystring;Qt5;Qt6;TBB;R3DSDK;${{matrix.optional_deps_append}}' }}
+      optional_deps: ${{ matrix.optional_deps || 'BZip2;CUDAToolkit;DCMTK;FFmpeg;GIF;JXL;Libheif;LibRaw;Nuke;OpenCV;OpenGL;OpenJPEG;openjph;OpenCV;OpenVDB;Ptex;pystring;Qt5;Qt6;TBB;R3DSDK;${{matrix.optional_deps_append}}' }}
     strategy:
       fail-fast: false
       matrix:
diff --git a/src/build-scripts/build_Freetype.bash b/src/build-scripts/build_Freetype.bash
index 41c74f5c20..cd27c00ddd 100755
--- a/src/build-scripts/build_Freetype.bash
+++ b/src/build-scripts/build_Freetype.bash
@@ -11,7 +11,7 @@ set -ex
 
 # Repo and branch/tag/commit of Freetype to download if we don't have it yet
 FREETYPE_REPO=${FREETYPE_REPO:=https://github.com/freetype/freetype.git}
-FREETYPE_VERSION=${FREETYPE_VERSION:=VER-2-13-3}
+FREETYPE_VERSION=${FREETYPE_VERSION:=VER-2-14-1}
 
 # Where to put Freetype repo source (default to the ext area)
 LOCAL_DEPS_DIR=${LOCAL_DEPS_DIR:=${PWD}/ext}
diff --git a/src/cmake/build_Freetype.cmake b/src/cmake/build_Freetype.cmake
index cd9becae46..a38e17151a 100644
--- a/src/cmake/build_Freetype.cmake
+++ b/src/cmake/build_Freetype.cmake
@@ -6,10 +6,10 @@
 # Freetype by hand!
 ######################################################################
 
-set_cache (Freetype_BUILD_VERSION 2.13.2 "Freetype version for local builds")
+set_cache (Freetype_BUILD_VERSION 2.14.1 "Freetype version for local builds")
 set (Freetype_GIT_REPOSITORY "https://github.com/freetype/freetype")
-set (Freetype_GIT_TAG "VER-2-13-2")
-set_cache (Freetype_BUILD_SHARED_LIBS  OFF
+set (Freetype_GIT_TAG "VER-2-14-1")
+set_cache (Freetype_BUILD_SHARED_LIBS ${LOCAL_BUILD_SHARED_LIBS_DEFAULT}
            DOC "Should a local Freetype build, if necessary, build shared libraries" ADVANCED)
 # We would prefer to build a static Freetype, but haven't figured out how to make
 # it all work with the static dependencies, it just makes things complicated
diff --git a/src/cmake/externalpackages.cmake b/src/cmake/externalpackages.cmake
index 55874fdd90..d3e017cb4e 100644
--- a/src/cmake/externalpackages.cmake
+++ b/src/cmake/externalpackages.cmake
@@ -200,11 +200,9 @@ checked_find_package (R3DSDK NO_RECORD_NOTFOUND)  # RED camera
 set (NUKE_VERSION "7.0" CACHE STRING "Nuke version to target")
 checked_find_package (Nuke NO_RECORD_NOTFOUND)
 
-if (FFmpeg_FOUND OR FREETYPE_FOUND)
+if ((FFmpeg_FOUND OR FREETYPE_FOUND OR TARGET Freetype::Freetype)
+    AND NOT TARGET BZip2::BZip2)
     checked_find_package (BZip2)   # Used by ffmpeg and freetype
-    if (NOT BZIP2_FOUND)
-        set (BZIP2_LIBRARIES "")  # TODO: why does it break without this?
-    endif ()
 endif()
 
 
diff --git a/src/ffmpeg.imageio/CMakeLists.txt b/src/ffmpeg.imageio/CMakeLists.txt
index c84ef3c90a..ac76c21473 100644
--- a/src/ffmpeg.imageio/CMakeLists.txt
+++ b/src/ffmpeg.imageio/CMakeLists.txt
@@ -28,7 +28,7 @@ if (FFmpeg_FOUND)
     add_oiio_plugin (ffmpeginput.cpp
                      INCLUDE_DIRS ${FFMPEG_INCLUDES}
                      LINK_LIBRARIES ${FFMPEG_LIBRARIES}
-                                    ${BZIP2_LIBRARIES}
+                                    $<TARGET_NAME_IF_EXISTS:BZip2::BZip2>
                      DEFINITIONS "USE_FFMPEG"
                                  "-DOIIO_FFMPEG_VERSION=\"${FFMPEG_VERSION}\"")
 else()
diff --git a/src/libOpenImageIO/CMakeLists.txt b/src/libOpenImageIO/CMakeLists.txt
index d813606755..262c85f4dd 100644
--- a/src/libOpenImageIO/CMakeLists.txt
+++ b/src/libOpenImageIO/CMakeLists.txt
@@ -163,7 +163,7 @@ target_link_libraries (OpenImageIO
             $<TARGET_NAME_IF_EXISTS:pugixml::pugixml>
             $<TARGET_NAME_IF_EXISTS:TBB::tbb>
             $<TARGET_NAME_IF_EXISTS:Freetype::Freetype>
-            ${BZIP2_LIBRARIES}
+            $<TARGET_NAME_IF_EXISTS:BZip2::BZip2>
             ZLIB::ZLIB
             ${CMAKE_DL_LIBS}
         )

From 0141771a3531c5032c168a8da763c65f5b830748 Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Fri, 9 Jan 2026 12:33:43 -0800
Subject: [PATCH 20/70] ci: Speed up macos15 intel variant by not installing Qt
 (#4998)

The Intel MacOS 15 CI testing is getting dicier... lots of times,
Homebrew doesn't have cached versions of updated packages, so it tries
to build from source, which takes forever. The big culprit today is Qt.
So, basically, just on this one CI job variant, don't ask it to install
Qt. If it's there, it's there. If not, just skip it. It's tested plenty
in other variants.

Signed-off-by: Larry Gritz <lg@larrygritz.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 232ce657ef..06800c60b4 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -626,7 +626,7 @@ jobs:
             python_ver: "3.13"
             simd: sse4.2,avx2
             ctest_test_timeout: 1200
-            setenvs: export MACOSX_DEPLOYMENT_TARGET=12.0
+            setenvs: export MACOSX_DEPLOYMENT_TARGET=12.0 INSTALL_QT=0
             benchmark: 1
           - desc: MacOS-14-ARM aclang15/C++20/py3.13
             runner: macos-14

From 0a3547a35fa27e2648e963267c0186ebc4cce8a8 Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Fri, 9 Jan 2026 14:49:05 -0800
Subject: [PATCH 21/70] ci: don't run non-wheel workflows when only
 pyproject.toml changes (#4997)

Signed-off-by: Larry Gritz <lg@larrygritz.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 .github/workflows/analysis.yml | 1 +
 .github/workflows/ci.yml       | 1 +
 .github/workflows/docs.yml     | 2 ++
 3 files changed, 4 insertions(+)

diff --git a/.github/workflows/analysis.yml b/.github/workflows/analysis.yml
index bdaa0f8c81..db23ee6f55 100644
--- a/.github/workflows/analysis.yml
+++ b/.github/workflows/analysis.yml
@@ -26,6 +26,7 @@ on:
       - '!**/scorecard.yml'
       - '!**/wheel.yml'
       - '!**.properties'
+      - '!pyproject.toml'
       - '!docs/**'
   # Run analysis on PRs only if the branch name indicates that the purpose of
   # the PR is related to the Sonar analysis. We don't run on every PR because
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 06800c60b4..54edda4ee3 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -16,6 +16,7 @@ on:
       - '!**/scorecard.yml'
       - '!**/wheel.yml'
       - '!**.properties'
+      - '!pyproject.toml'
       - '!docs/**'
   pull_request:
     paths:
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index ffc13b91c6..4eb8445841 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -20,6 +20,7 @@ on:
       - '**/run.py'
       - 'src/build-scripts/**'
       - './*.md'
+      - 'pyproject.toml'
   pull_request:
     paths-ignore:
       - '**/ci.yml'
@@ -33,6 +34,7 @@ on:
       - '**/run.py'
       - 'src/build-scripts/**'
       - './*.md'
+      - 'pyproject.toml'
   schedule:
     # Full nightly build
     - cron: "0 8 * * *"

From a9e48fa5127caff689b4bb3c4e95754dda863fdd Mon Sep 17 00:00:00 2001
From: Brad Smith <brad@comstyle.com>
Date: Fri, 9 Jan 2026 19:55:11 -0500
Subject: [PATCH 22/70] fix(build): Fix building on OpenBSD (#5001)

Fixes #5000

Signed-off-by: Brad Smith <brad@comstyle.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 src/include/OpenImageIO/typedesc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/include/OpenImageIO/typedesc.h b/src/include/OpenImageIO/typedesc.h
index 33cf913560..5c38ae6209 100644
--- a/src/include/OpenImageIO/typedesc.h
+++ b/src/include/OpenImageIO/typedesc.h
@@ -409,7 +409,7 @@ template<> struct BaseTypeFromC<uint64_t> { static constexpr TypeDesc::BASETYPE
 template<> struct BaseTypeFromC<const uint64_t> { static constexpr TypeDesc::BASETYPE value = TypeDesc::UINT64; };
 template<> struct BaseTypeFromC<int64_t> { static constexpr TypeDesc::BASETYPE value = TypeDesc::INT64; };
 template<> struct BaseTypeFromC<const int64_t> { static constexpr TypeDesc::BASETYPE value = TypeDesc::INT64; };
-#if defined(__GNUC__) && (ULONG_MAX == 0xffffffffffffffff) && !(defined(__APPLE__) && defined(__MACH__)) || defined(__NetBSD__)
+#if defined(__GNUC__) && (ULONG_MAX == 0xffffffffffffffff) && !(defined(__APPLE__) && defined(__MACH__)) && !defined(__OpenBSD__) || defined(__NetBSD__)
 // Some platforms consider int64_t and long long to be different types, even
 // though they are actually the same size.
 static_assert(!std::is_same_v<unsigned long long, uint64_t>);

From eb77fe931f094d3a41a17bf5e5e87e1844640da3 Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Sat, 10 Jan 2026 09:46:21 -0800
Subject: [PATCH 23/70] admin: Refine PR template to give more visual
 separation (#4995)

Signed-off-by: Larry Gritz <lg@larrygritz.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 .github/PULL_REQUEST_TEMPLATE.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index 536ef4915c..386b60c172 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -1,3 +1,8 @@
+
+
+
+------ :scissors: -------------------------------------------------------------------
+
 YOU MAY DELETE ALL OF THIS IF YOU ALREADY HAVE A DESCRIPTIVE COMMIT MESSAGE!
 
 This is just a template and set of reminders about what constitutes a good PR.

From 3b35e46c77f6f7b043f486528d47e69e57cb427e Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Tue, 13 Jan 2026 12:13:24 -0800
Subject: [PATCH 24/70] build: Fix HARDENING build options (#4996)

Due to typos in the option name in compiler.cmake (HARDENING vs
OIIO_HARDENING, oops), I think we were never really setting the intended
compiler flags. Fix that all up, and repair the other problems that it
revealed -- some compiler version and option combinations weren't happy
with each other, etc.

One notable change is in encode_iptc_iim_one_tag: I think I made it
safer by taking an early out for 0-sized data, and also it needed a
warning suppression when certain gcc hardening levels were used.

---------

Signed-off-by: Larry Gritz <lg@larrygritz.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 .github/workflows/ci.yml    |  3 +-
 src/cmake/compiler.cmake    | 62 +++++++++++++++++++++++++------------
 src/libOpenImageIO/iptc.cpp | 22 +++++++------
 3 files changed, 57 insertions(+), 30 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 54edda4ee3..3a11a36779 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -494,7 +494,7 @@ jobs:
                             PTEX_VERSION=main
                             PUGIXML_VERSION=master
                             WEBP_VERSION=main
-                            OIIO_CMAKE_FLAGS="-DOIIO_HARDENING=2"
+                            OIIO_HARDENING=2
                             EXTRA_DEP_PACKAGES="python3.12-dev python3-numpy"
                             USE_OPENVDB=0
                             FREETYPE_VERSION=master
@@ -518,6 +518,7 @@ jobs:
                             PTEX_VERSION=v2.4.2
                             PUGIXML_VERSION=v1.14
                             WEBP_VERSION=v1.4.0
+                            OIIO_HARDENING=3
           - desc: clang18 C++17 avx2 exr3.1 ocio2.3
             nametag: linux-clang18
             runner: ubuntu-24.04
diff --git a/src/cmake/compiler.cmake b/src/cmake/compiler.cmake
index da4e76da48..8dffb97d36 100644
--- a/src/cmake/compiler.cmake
+++ b/src/cmake/compiler.cmake
@@ -485,30 +485,34 @@ endif ()
 #      recommended default for optimized, shipping code.
 #  2 : enable features that trade off performance for security, recommended
 #      for debugging or deploying in security-sensitive environments.
-#  3 : enable features that have a significant performance impact, only
-#      recommended for debugging.
+#  3 : enable features that have a significant performance impact, to maximize
+#      finding bugs without regard to performance. Only recommended for
+#      debugging.
 #
 # Some documentation:
 # https://best.openssf.org/Compiler-Hardening-Guides/Compiler-Options-Hardening-Guide-for-C-and-C++.html
 # https://www.gnu.org/software/libc/manual/html_node/Source-Fortification.html
 # https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_macros.html
 # https://libcxx.llvm.org/Hardening.html
-#
+# https://www.productive-cpp.com/hardening-cpp-programs-stack-protector/
+# https://medium.com/@simontoth/daily-bit-e-of-c-hardened-mode-of-standard-library-implementations-18be2422c372
+# https://cheatsheetseries.owasp.org/cheatsheets/C-Based_Toolchain_Hardening_Cheat_Sheet.html
+
 if (${CMAKE_BUILD_TYPE} STREQUAL "Debug")
-    set (${PROJ_NAME}_HARDENING_DEFAULT 3)
+    set (${PROJ_NAME}_HARDENING_DEFAULT 2)
 else ()
     set (${PROJ_NAME}_HARDENING_DEFAULT 1)
 endif ()
 set_cache (${PROJ_NAME}_HARDENING ${${PROJ_NAME}_HARDENING_DEFAULT}
            "Turn on security hardening features 0, 1, 2, 3")
 # Implementation:
-if (HARDENING GREATER_EQUAL 1)
+add_compile_definitions (${PROJ_NAME}_HARDENING_DEFAULT=${${PROJ_NAME}_HARDENING})
+if (${PROJ_NAME}_HARDENING GREATER_EQUAL 1)
+    # Enable PIE and pie to build as position-independent executables and
+    # libraries, needed for address space randomization used by some kernels.
+    set (CMAKE_POSITION_INDEPENDENT_CODE ON)
     # Features that should not detectably affect performance
     if (COMPILER_IS_GCC_OR_ANY_CLANG)
-        # Enable PIE and pie to build as position-independent executables,
-        # needed for address space randomiztion used by some kernels.
-        add_compile_options (-fPIE -pie)
-        add_link_options (-fPIE -pie)
         # Protect against stack overwrites. Is allegedly not a performance
         # tradeoff.
         add_compile_options (-fstack-protector-strong)
@@ -516,21 +520,39 @@ if (HARDENING GREATER_EQUAL 1)
     endif ()
     # Defining _FORTIFY_SOURCE provides buffer overflow checks in modern gcc &
     # clang with some compiler-assisted deduction of buffer lengths) for the
-    # many C functions such as memcpy, strcpy, sprintf, etc. See:
-    add_compile_definitions (_FORTIFY_SOURCE=${${PROJ_NAME}_HARDENING})
+    # many C functions such as memcpy, strcpy, sprintf, etc. But it requires
+    # optimization, so we don't do it for debug builds.
+    if ((CMAKE_COMPILER_IS_CLANG OR (GCC_VERSION VERSION_GREATER_EQUAL 14))
+         AND NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
+        add_compile_definitions (_FORTIFY_SOURCE=${${PROJ_NAME}_HARDENING})
+    endif ()
+endif ()
+if (${PROJ_NAME}_HARDENING EQUAL 1)
     # Setting _LIBCPP_HARDENING_MODE enables various hardening features in
     # clang/llvm's libc++ 18.0 and later.
-    add_compiler_definitions (_LIBCPP_HARDENING_MODE=_LIBCPP_HARDENING_MODE_FAST)
-endif ()
-if (HARDENING GREATER_EQUAL 2)
+    if (OIIO_CLANG_VERSION VERSION_GREATER_EQUAL 18.0 OR OIIO_APPLE_CLANG_VERSION VERSION_GREATER_EQUAL 18.0)
+        add_compile_definitions (_LIBCPP_HARDENING_MODE=_LIBCPP_HARDENING_MODE_FAST)
+    endif ()
+elseif (${PROJ_NAME}_HARDENING EQUAL 2)
     # Features that might impact performance measurably
-    add_compile_definitions (_GLIBCXX_ASSERTIONS)
-    add_compiler_definitions (_LIBCPP_HARDENING_MODE=_LIBCPP_HARDENING_MODE_EXTENSIVE)
-endif ()
-if (HARDENING GREATER_EQUAL 3)
+    if (GCC_VERSION VERSION_GREATER_EQUAL 14)
+        # I've had trouble turning this on in older gcc
+        add_compile_definitions (_GLIBCXX_ASSERTIONS)
+    endif ()
+    if (OIIO_CLANG_VERSION VERSION_GREATER_EQUAL 18.0 OR OIIO_APPLE_CLANG_VERSION VERSION_GREATER_EQUAL 18.0)
+        add_compile_definitions (_LIBCPP_HARDENING_MODE=_LIBCPP_HARDENING_MODE_EXTENSIVE)
+    endif ()
+elseif (${PROJ_NAME}_HARDENING EQUAL 3)
     # Debugging features that might impact performance significantly
-    add_compile_definitions (_GLIBCXX_DEBUG)
-    add_compiler_definitions (_LIBCPP_HARDENING_MODE=_LIBCPP_HARDENING_MODE_DEBUG)
+    if (GCC_VERSION VERSION_GREATER_EQUAL 14)
+        # I've had trouble turning this on in older gcc
+        add_compile_definitions (_GLIBCXX_ASSERTIONS)
+        # N.B. _GLIBCXX_DEBUG changes ABI, so don't do this:
+        #   add_compile_definitions (_GLIBCXX_DEBUG)
+    endif ()
+    if (OIIO_CLANG_VERSION VERSION_GREATER_EQUAL 18.0 OR OIIO_APPLE_CLANG_VERSION VERSION_GREATER_EQUAL 18.0)
+        add_compile_definitions (_LIBCPP_HARDENING_MODE=_LIBCPP_HARDENING_MODE_DEBUG)
+    endif ()
 endif ()
 
 
diff --git a/src/libOpenImageIO/iptc.cpp b/src/libOpenImageIO/iptc.cpp
index b8ee573059..eed82eeb45 100644
--- a/src/libOpenImageIO/iptc.cpp
+++ b/src/libOpenImageIO/iptc.cpp
@@ -181,17 +181,21 @@ decode_iptc_iim(const void* iptc, int length, ImageSpec& spec)
 static void
 encode_iptc_iim_one_tag(int tag, string_view data, std::vector<char>& iptc)
 {
-    OIIO_DASSERT(data != nullptr);
+    if (data.size() == 0)
+        return;
+    data = data.substr(0, 0xffff);  // Truncate to prevent 16 bit overflow
+    size_t tagsize = data.size();
     iptc.push_back((char)0x1c);
     iptc.push_back((char)0x02);
     iptc.push_back((char)tag);
-    if (data.size()) {
-        int tagsize = std::min(int(data.size()),
-                               0xffff - 1);  // Prevent 16 bit overflow
-        iptc.push_back((char)(tagsize >> 8));
-        iptc.push_back((char)(tagsize & 0xff));
-        iptc.insert(iptc.end(), data.data(), data.data() + tagsize);
-    }
+    iptc.push_back((char)(tagsize >> 8));
+    iptc.push_back((char)(tagsize & 0xff));
+    OIIO_PRAGMA_WARNING_PUSH
+    OIIO_GCC_ONLY_PRAGMA(GCC diagnostic ignored "-Wstringop-overflow")
+    // Suppress what I'm sure is a false positive warning when
+    // _GLIBCXX_ASSERTIONS is enabled.
+    iptc.insert(iptc.end(), data.begin(), data.end());
+    OIIO_PRAGMA_WARNING_POP
 }
 
 
@@ -208,7 +212,7 @@ encode_iptc_iim(const ImageSpec& spec, std::vector<char>& iptc)
                 std::string allvals = p->get_string(0);
                 std::vector<std::string> tokens;
                 Strutil::split(allvals, tokens, ";");
-                for (auto& token : tokens) {
+                for (auto token : tokens) {
                     token = Strutil::strip(token);
                     if (token.size()) {
                         if (iimtag[i].maxlen && iimtag[i].maxlen < token.size())

From a3305179b71149ee73d3e5f8a64d9d789057be76 Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Wed, 14 Jan 2026 14:16:30 -0800
Subject: [PATCH 25/70] test: Add new ref image for jpeg test (#5007)

Needed for some systems after the changes of #4963.

Ever so slightly different LSB somewhere makes the hashes not match, but
it's a correct visual result.

Signed-off-by: Larry Gritz <lg@larrygritz.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 testsuite/jpeg/ref/out-jpeg9.4.txt | 9 +++++++++
 1 file changed, 9 insertions(+)
 create mode 100644 testsuite/jpeg/ref/out-jpeg9.4.txt

diff --git a/testsuite/jpeg/ref/out-jpeg9.4.txt b/testsuite/jpeg/ref/out-jpeg9.4.txt
new file mode 100644
index 0000000000..bc8035159e
--- /dev/null
+++ b/testsuite/jpeg/ref/out-jpeg9.4.txt
@@ -0,0 +1,9 @@
+Reading src/YCbCrK.jpg
+src/YCbCrK.jpg       :   52 x   52, 3 channel, uint8 jpeg
+    SHA-1: B54FAE77E27EFCEACF27BA796A48DCE6DF262F26
+    channel list: R, G, B
+    jpeg:ColorSpace: "YCbCrK"
+    jpeg:subsampling: "4:4:4"
+    oiio:ColorSpace: "srgb_rec709_scene"
+Comparing "rgb-from-YCbCrK.tif" and "ref/rgb-from-YCbCrK.tif"
+PASS

From 0a9bfefb2e853596542a47ebece883fa9f0c202a Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Thu, 15 Jan 2026 10:54:47 -0800
Subject: [PATCH 26/70] docs: Remove outdated/wrong description in INSTALL.md
 (#5008)

Signed-off-by: Larry Gritz <lg@larrygritz.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 INSTALL.md | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/INSTALL.md b/INSTALL.md
index 1bcc407d68..c822a01052 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -28,12 +28,10 @@ NEW or CHANGED MINIMUM dependencies since the last major release are **bold**.
    through 3.1)
  * zlib >= 1.2.7 (tested through 1.3.1)
  * [fmtlib](https://github.com/fmtlib/fmt) >= 7.0 (tested through 12.0 and master).
-   If not found at build time, this will be automatically downloaded unless
-   the build sets `-DBUILD_MISSING_FMT=OFF`.
+   If not found at build time, this will be automatically downloaded and built.
  * [Robin-map](https://github.com/Tessil/robin-map) (unknown minimum, tested
    through 1.4, which is the recommended version). If not found at build time,
-   this will be automatically downloaded unless the build sets
-   `-DBUILD_MISSING_FMT=OFF`.
+   this will be automatically downloaded and built.
 
 ### Optional dependencies -- features may be disabled if not found
  * If you are building the `iv` viewer (which will be disabled if any of

From 0e7c0660f1d914110d2f8c0b7447efbe9e5f0f7b Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Sat, 17 Jan 2026 08:41:09 -0800
Subject: [PATCH 27/70] ci: Windows runners switched which python version they
 had (#5010)

Yay, I love how these things just break with no notice.

Fixes CI that broke a couple days ago.

---------

Signed-off-by: Larry Gritz <lg@larrygritz.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 .github/workflows/ci.yml                  | 4 ++--
 src/build-scripts/gh-win-installdeps.bash | 4 ++++
 src/cmake/testing.cmake                   | 1 +
 testsuite/runtest.py                      | 4 +---
 4 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 3a11a36779..4406149c73 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -695,13 +695,13 @@ jobs:
             nametag: windows-2022
             vsver: 2022
             generator: "Visual Studio 17 2022"
-            python_ver: "3.9"
+            python_ver: "3.12"
             setenvs: export OPENIMAGEIO_PYTHON_LOAD_DLLS_FROM_PATH=1
           - desc: Windows-2025 VS2022
             runner: windows-2025
             nametag: windows-2025
             vsver: 2022
             generator: "Visual Studio 17 2022"
-            python_ver: "3.9"
+            python_ver: "3.12"
             setenvs: export OPENIMAGEIO_PYTHON_LOAD_DLLS_FROM_PATH=1
             benchmark: 1
diff --git a/src/build-scripts/gh-win-installdeps.bash b/src/build-scripts/gh-win-installdeps.bash
index d4f0283853..48ad421924 100755
--- a/src/build-scripts/gh-win-installdeps.bash
+++ b/src/build-scripts/gh-win-installdeps.bash
@@ -32,6 +32,10 @@ elif [[ "$PYTHON_VERSION" == "3.9" ]] ; then
     export CMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH;/c/hostedtoolcache/windows/Python/3.9.13/x64"
     export Python_EXECUTABLE="/c/hostedtoolcache/windows/Python/3.9.13/x64/python3.exe"
     export PYTHONPATH=$OpenImageIO_ROOT/lib/python${PYTHON_VERSION}/site-packages
+elif [[ "$PYTHON_VERSION" == "3.12" ]] ; then
+    export CMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH;/c/hostedtoolcache/windows/Python/3.12.10/x64"
+    export Python_EXECUTABLE="/c/hostedtoolcache/windows/Python/3.12.10/x64/python3.exe"
+    export PYTHONPATH=$OpenImageIO_ROOT/lib/python${PYTHON_VERSION}/site-packages
 fi
 pip install numpy
 
diff --git a/src/cmake/testing.cmake b/src/cmake/testing.cmake
index 8bd975efbc..72a0cd8843 100644
--- a/src/cmake/testing.cmake
+++ b/src/cmake/testing.cmake
@@ -110,6 +110,7 @@ macro (oiio_add_tests)
                              "OIIO_TESTSUITE_ROOT=${_testsuite}"
                              "OIIO_TESTSUITE_SRC=${_testsrcdir}"
                              "OIIO_TESTSUITE_CUR=${_testdir}"
+                             "Python_EXECUTABLE=${Python3_EXECUTABLE}"
                              ${_ats_ENVIRONMENT})
             if (NOT ${_ats_testdir} STREQUAL "")
                 set_property(TEST ${_testname} APPEND PROPERTY ENVIRONMENT
diff --git a/testsuite/runtest.py b/testsuite/runtest.py
index 5efced462f..052b68434a 100755
--- a/testsuite/runtest.py
+++ b/testsuite/runtest.py
@@ -162,9 +162,7 @@ def newsymlink(src: str, dst: str):
 if os.getenv("Python_EXECUTABLE") :
     pythonbin = os.getenv("Python_EXECUTABLE")
 else :
-    pythonbin = 'python'
-    if os.getenv("PYTHON_VERSION") :
-        pythonbin += os.getenv("PYTHON_VERSION")
+    pythonbin = sys.executable
 #print ("pythonbin = ", pythonbin)
 
 
From e8ee38513c00dabe889516f964c707923994178a Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Sat, 17 Jan 2026 11:10:01 -0800
Subject: [PATCH 28/70] ci: test against libraw 0.22 for 'latest' test variants
 (#5009)

Signed-off-by: Larry Gritz <lg@larrygritz.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 .github/workflows/ci.yml | 8 ++++----
 INSTALL.md               | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 4406149c73..bb8b86f022 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -461,7 +461,7 @@ jobs:
             simd: avx2,f16c
             setenvs: export LIBJPEGTURBO_VERSION=3.1.2
                             LIBPNG_VERSION=v1.6.50
-                            LIBRAW_VERSION=0.21.5
+                            LIBRAW_VERSION=0.22.0
                             LIBTIFF_VERSION=v4.7.1
                             OPENJPEG_VERSION=v2.5.4
                             PTEX_VERSION=v2.5.0
@@ -514,7 +514,7 @@ jobs:
             simd: avx2,f16c
             setenvs: export OpenImageIO_BUILD_LOCAL_DEPS=all
                             OpenImageIO_DEPENDENCY_BUILD_VERBOSE=ON
-                            LIBRAW_VERSION=0.21.5
+                            LIBRAW_VERSION=0.22.0
                             PTEX_VERSION=v2.4.2
                             PUGIXML_VERSION=v1.14
                             WEBP_VERSION=v1.4.0
@@ -545,7 +545,7 @@ jobs:
             python_ver: "3.12"
             setenvs: export LIBJPEGTURBO_VERSION=3.1.2
                             LIBPNG_VERSION=v1.6.50
-                            LIBRAW_VERSION=0.21.5
+                            LIBRAW_VERSION=0.22.0
                             LIBTIFF_VERSION=v4.7.1
                             OPENJPEG_VERSION=v2.5.4
                             PTEX_VERSION=v2.4.3
@@ -566,7 +566,7 @@ jobs:
             python_ver: "3.12"
             setenvs: export LIBJPEGTURBO_VERSION=3.1.2
                             LIBPNG_VERSION=v1.6.50
-                            LIBRAW_VERSION=0.21.5
+                            LIBRAW_VERSION=0.22.0
                             LIBTIFF_VERSION=v4.7.1
                             OPENJPEG_VERSION=v2.5.4
                             PTEX_VERSION=v2.4.3
diff --git a/INSTALL.md b/INSTALL.md
index c822a01052..ee0d73eee8 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -45,7 +45,7 @@ NEW or CHANGED MINIMUM dependencies since the last major release are **bold**.
  * If you want support for PNG files:
      * libPNG >= 1.6.0 (tested though 1.6.50)
  * If you want support for camera "RAW" formats:
-     * LibRaw >= 0.20 (tested though 0.21.5 and master)
+     * LibRaw >= 0.20 (tested through 0.22.0 and master)
  * If you want support for a wide variety of video formats:
      * ffmpeg >= 4.0 (tested through 8.0)
  * If you want support for jpeg 2000 images:

From 8e39e3cba3f2830879973284d88fe160d697e417 Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Sun, 18 Jan 2026 09:24:10 -0800
Subject: [PATCH 29/70] fix: several bug fixes related to internal use of
 image_span (#5004)

* ImageBuf internal buffer span lacked correct chansize. The internal
`m_bufspan` is an `image_span<byte>`, and as such, it needs to remember
the size of the original data type. Otherwise, there's a cascade of
potential errors when it thinks that the individual values are byte
sized.

* In both ImageInput and ImageOutput, several sanity checks of
image_span size versus expectations were incorrect. They were only
checking if the total byte sizes matched expectations, but they are
allowed to disagree when you consider type conversions (in which case,
it's the total number of values that need to match, not the total byte
sizes).

Signed-off-by: Larry Gritz <lg@larrygritz.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 src/libOpenImageIO/imagebuf.cpp    |  9 +--
 src/libOpenImageIO/imageinput.cpp  | 89 +++++++++++++++---------------
 src/libOpenImageIO/imageoutput.cpp | 67 +++++++++++++---------
 3 files changed, 90 insertions(+), 75 deletions(-)

diff --git a/src/libOpenImageIO/imagebuf.cpp b/src/libOpenImageIO/imagebuf.cpp
index 8a5ed32e0a..403a22c545 100644
--- a/src/libOpenImageIO/imagebuf.cpp
+++ b/src/libOpenImageIO/imagebuf.cpp
@@ -150,10 +150,11 @@ class ImageBufImpl {
                      stride_t ystride = AutoStride,
                      stride_t zstride = AutoStride)
     {
-        m_bufspan = image_span(reinterpret_cast<std::byte*>(data),
-                               m_spec.nchannels, m_spec.width, m_spec.height,
-                               m_spec.depth, m_spec.format.size(), xstride,
-                               ystride, zstride);
+        auto formatsize = m_spec.format.size();
+        m_bufspan       = image_span(reinterpret_cast<std::byte*>(data),
+                                     m_spec.nchannels, m_spec.width, m_spec.height,
+                                     m_spec.depth, formatsize, xstride, ystride,
+                                     zstride, formatsize);
     }
 
     bool init_spec(string_view filename, int subimage, int miplevel,
diff --git a/src/libOpenImageIO/imageinput.cpp b/src/libOpenImageIO/imageinput.cpp
index 03af1297c5..3ff8f1a255 100644
--- a/src/libOpenImageIO/imageinput.cpp
+++ b/src/libOpenImageIO/imageinput.cpp
@@ -211,6 +211,40 @@ ImageInput::spec_dimensions(int subimage, int miplevel)
 
 
+// Utility: Make sure the provided data span is the right size for the
+// image described by spec and datatype. If they don't match, issue an
+// error and return false.
+static bool
+check_span_size(ImageInput* in, string_view caller, const ImageSpec& spec,
+                TypeDesc datatype, imagesize_t npixels, int chbegin, int chend,
+                const image_span<std::byte>& data)
+{
+    // One of two things must be correct: Either format is Unknown and the
+    // total byte size needs to match the "native" size, or the format is
+    // concrete and the number of values must match (it's ok if the size
+    // doesn't match, since a data type conversion will occur).
+    if (datatype.is_unknown()) {  // Unknown assumes native chan types
+        size_t sz = npixels * spec.pixel_bytes(chbegin, chend, true);
+        if (sz != data.size_bytes()) {
+            in->errorfmt(
+                "{}: image_span size is incorrect ({} bytes vs {} needed)",
+                caller, data.size_bytes(), sz);
+            return false;
+        }
+    } else {  // single concrete type
+        size_t nvals = npixels * size_t(chend - chbegin);
+        if (nvals != data.nvalues()) {
+            in->errorfmt(
+                "{}: image_span size is incorrect ({} values vs {} needed)",
+                caller, data.nvalues(), nvals);
+            return false;
+        }
+    }
+    return true;
+}
+
+
+
 bool
 ImageInput::read_scanline(int y, int z, TypeDesc format, void* data,
                           stride_t xstride)
@@ -300,16 +334,10 @@ ImageInput::read_scanlines(int subimage, int miplevel, int ybegin, int yend,
                  chend);
         return false;
     }
-    size_t isize = (format == TypeUnknown
-                        ? spec.pixel_bytes(chbegin, chend, true /*native*/)
-                        : format.size() * (chend - chbegin))
-                   * size_t(spec.width);
-    if (isize != data.size_bytes()) {
-        errorfmt(
-            "read_scanlines: Buffer size is incorrect ({} bytes vs {} needed)",
-            isize, data.size_bytes());
+    if (!check_span_size(this, "read_scanlines", m_spec, format,
+                         m_spec.width * size_t(yend - ybegin), chbegin, chend,
+                         data))
         return false;
-    }
 
     // Default implementation (for now): call the old pointer+stride
     return read_scanlines(subimage, miplevel, ybegin, yend, 0, chbegin, chend,
@@ -656,16 +684,11 @@ ImageInput::read_tiles(int subimage, int miplevel, int xbegin, int xend,
         errorfmt("read_tiles: invalid channel range [{},{})", chbegin, chend);
         return false;
     }
-    size_t isize = (format == TypeUnknown
-                        ? spec.pixel_bytes(chbegin, chend, true /*native*/)
-                        : format.size() * (chend - chbegin))
-                   * size_t(xend - xbegin) * size_t(yend - ybegin)
-                   * size_t(zend - zbegin);
-    if (isize != data.size_bytes()) {
-        errorfmt("read_tiles: Buffer size is incorrect ({} bytes vs {} needed)",
-                 isize, data.size_bytes());
+    if (!check_span_size(this, "read_tiles", m_spec, format,
+                         size_t(xend - xbegin) * size_t(yend - ybegin)
+                             * size_t(zend - zbegin),
+                         chbegin, chend, data))
         return false;
-    }
 
     // Default implementation (for now): call the old pointer+stride
     return read_tiles(subimage, miplevel, ybegin, yend, xbegin, xend, zbegin,
@@ -1164,24 +1187,6 @@ bool
 ImageInput::read_image(int subimage, int miplevel, int chbegin, int chend,
                        TypeDesc format, const image_span<std::byte>& data)
 {
-#if 0
-    ImageSpec spec = spec_dimensions(subimage, miplevel);
-    if (chend < 0 || chend > spec.nchannels)
-        chend = spec.nchannels;
-    size_t isize = (format == TypeUnknown
-                        ? spec.pixel_bytes(chbegin, chend, true /*native*/)
-                        : format.size() * (chend - chbegin))
-                   * spec.image_pixels();
-    if (isize != data.size_bytes()) {
-        errorfmt("read_image: Buffer size is incorrect ({} bytes vs {} needed)",
-                 sz, data.size_bytes());
-        return false;
-    }
-
-    // Default implementation (for now): call the old pointer+stride
-    return read_image(subimage, miplevel, chbegin, chend, format, data.data(),
-                      data.xstride(), data.ystride(), data.zstride());
-#else
     OIIO::pvt::LoggedTimer logtime("II::read_image");
     ImageSpec spec;
     int rps = 0;
@@ -1210,16 +1215,9 @@ ImageInput::read_image(int subimage, int miplevel, int chbegin, int chend,
         errorfmt("read_image: invalid channel range [{},{})", chbegin, chend);
         return false;
     }
-    int nchans         = chend - chbegin;
-    bool native        = (format == TypeUnknown);
-    size_t pixel_bytes = native ? spec.pixel_bytes(chbegin, chend, native)
-                                : (format.size() * nchans);
-    size_t isize       = pixel_bytes * spec.image_pixels();
-    if (isize != data.size_bytes()) {
-        errorfmt("read_image: Buffer size is incorrect ({} bytes vs {} needed)",
-                 isize, data.size_bytes());
+    if (!check_span_size(this, "read_image", m_spec, format,
+                         spec.image_pixels(), chbegin, chend, data))
         return false;
-    }
 
     bool ok = true;
     if (spec.tile_width) {  // Tiled image -- rely on read_tiles
@@ -1259,7 +1257,6 @@ ImageInput::read_image(int subimage, int miplevel, int chbegin, int chend,
         }
     }
     return ok;
-#endif
 }
 
 
diff --git a/src/libOpenImageIO/imageoutput.cpp b/src/libOpenImageIO/imageoutput.cpp
index 1b9edaede5..216c2a18f9 100644
--- a/src/libOpenImageIO/imageoutput.cpp
+++ b/src/libOpenImageIO/imageoutput.cpp
@@ -102,6 +102,40 @@ ImageOutput::~ImageOutput()
 
 
+// Utility: Verify that the provided data span is the right size for
+// `npixels` pixels of the image described by spec, interpreted as datatype.
+// On a mismatch, record an error on `out` and return false.
+static bool
+check_span_size(ImageOutput* out, string_view caller, const ImageSpec& spec,
+                TypeDesc datatype, imagesize_t npixels,
+                const image_span<const std::byte>& data)
+{
+    // One of two things must be correct: Either datatype is Unknown and the
+    // total byte size needs to match the "native" size, or the datatype is
+    // concrete and the number of values must match (it's ok if the byte size
+    // doesn't match, since a data type conversion will occur).
+    if (datatype.is_unknown()) {  // Unknown assumes native chan types
+        size_t sz = npixels * spec.pixel_bytes(true);
+        if (sz != data.size_bytes()) {
+            out->errorfmt(
+                "{}: image_span size is incorrect ({} bytes vs {} needed)",
+                caller, data.size_bytes(), sz);
+            return false;
+        }
+    } else {  // single concrete type
+        size_t nvals = npixels * size_t(spec.nchannels);
+        if (nvals != data.nvalues()) {
+            out->errorfmt(
+                "{}: image_span size is incorrect ({} values vs {} needed)",
+                caller, data.nvalues(), nvals);
+            return false;
+        }
+    }
+    return true;
+}
+
+
+
 bool
 ImageOutput::write_scanline(int /*y*/, int /*z*/, TypeDesc /*format*/,
                             const void* /*data*/, stride_t /*xstride*/)
@@ -120,13 +154,9 @@ ImageOutput::write_scanline(int y, TypeDesc format,
         errorfmt("write_scanlines: Invalid scanline index {}", y);
         return false;
     }
-    size_t sz = m_spec.scanline_bytes(format);
-    if (sz != data.size_bytes()) {
-        errorfmt(
-            "write_scanline: Buffer size is incorrect ({} bytes vs {} needed)",
-            sz, data.size_bytes());
+    if (!check_span_size(this, "write_scanline", m_spec, format, m_spec.width,
+                         data))
         return false;
-    }
 
     // Default implementation (for now): call the old pointer+stride
     return write_scanline(y, 0, format, data.data(), data.xstride());
@@ -164,13 +194,9 @@ ImageOutput::write_scanlines(int ybegin, int yend, TypeDesc format,
         errorfmt("write_scanlines: Invalid scanline range {}-{}", ybegin, yend);
         return false;
     }
-    size_t sz = m_spec.scanline_bytes(format) * size_t(yend - ybegin);
-    if (sz != data.size_bytes()) {
-        errorfmt(
-            "write_scanlines: Buffer size is incorrect ({} bytes vs {} needed)",
-            sz, data.size_bytes());
+    if (!check_span_size(this, "write_scanlines", m_spec, format,
+                         m_spec.width * size_t(yend - ybegin), data))
         return false;
-    }
 
     // Default implementation (for now): call the old pointer+stride
     return write_scanlines(ybegin, yend, 0, format, data.data(), data.xstride(),
@@ -194,15 +220,9 @@ bool
 ImageOutput::write_tile(int x, int y, int z, TypeDesc format,
                         const image_span<const std::byte>& data)
 {
-    size_t sz = format == TypeUnknown
-                    ? m_spec.pixel_bytes(true /*native*/)
-                    : m_spec.tile_pixels() * size_t(m_spec.nchannels)
-                          * format.size();
-    if (sz != data.size_bytes()) {
-        errorfmt("write_tile: Buffer size is incorrect ({} bytes vs {} needed)",
-                 sz, data.size_bytes());
+    if (!check_span_size(this, "write_tile", m_spec, format,
+                         m_spec.tile_pixels(), data))
         return false;
-    }
 
     // Default implementation (for now): call the old pointer+stride
     return write_tile(x, y, z, format, data.data(), data.xstride(),
@@ -691,12 +711,9 @@ bool
 ImageOutput::write_image(TypeDesc format,
                          const image_span<const std::byte>& data)
 {
-    size_t sz = m_spec.image_bytes(/*native=*/format == TypeUnknown);
-    if (sz != data.size_bytes()) {
-        errorfmt("write_image: Buffer size is incorrect ({} bytes vs {} needed)",
-                 sz, data.size_bytes());
+    if (!check_span_size(this, "write_image", m_spec, format,
+                         m_spec.image_pixels(), data))
         return false;
-    }
 
     // Default implementation (for now): call the old pointer+stride
     return write_image(format, data.data(), data.xstride(), data.ystride(),

From 71e2949038913a04a8d99c63f3c4160fc424b499 Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Tue, 20 Jan 2026 17:29:48 -0800
Subject: [PATCH 30/70] build: Use libheif exported config if available (#5012)

Needed to change the Libheif::Libheif target we set up ages ago to the
"heif" name that the package's exported config uses.

Also, the warning about static libraries needs to be issued any time we
are using a static libheif, even if we didn't set LINKSTATIC to try to
force all static libraries.

---------

Signed-off-by: Larry Gritz <lg@larrygritz.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 src/cmake/externalpackages.cmake    |  1 +
 src/cmake/modules/FindLibheif.cmake | 10 ++++----
 src/heif.imageio/CMakeLists.txt     | 38 +++++++++++++----------------
 3 files changed, 23 insertions(+), 26 deletions(-)

diff --git a/src/cmake/externalpackages.cmake b/src/cmake/externalpackages.cmake
index d3e017cb4e..ed267c4bdf 100644
--- a/src/cmake/externalpackages.cmake
+++ b/src/cmake/externalpackages.cmake
@@ -163,6 +163,7 @@ checked_find_package (GIF VERSION_MIN 5.0)
 
 # For HEIF/HEIC/AVIF formats
 checked_find_package (Libheif VERSION_MIN 1.11
+                      PREFER_CONFIG
                       RECOMMEND_MIN 1.16
                       RECOMMEND_MIN_REASON "for orientation support")
 
diff --git a/src/cmake/modules/FindLibheif.cmake b/src/cmake/modules/FindLibheif.cmake
index 5d061af6ea..61c93feac7 100644
--- a/src/cmake/modules/FindLibheif.cmake
+++ b/src/cmake/modules/FindLibheif.cmake
@@ -27,7 +27,7 @@ find_library (LIBHEIF_LIBRARY heif
               HINTS
                   ${LIBHEIF_LIBRARY_PATH}
                   ENV LIBHEIF_LIBRARY_PATH
-              DOC "The directory where libheif libraries reside")
+              DOC "The found libheif library")
 
 if (LIBHEIF_INCLUDE_DIR)
     file(STRINGS "${LIBHEIF_INCLUDE_DIR}/libheif/heif_version.h" TMP REGEX "^#define LIBHEIF_VERSION[ \t].*$")
@@ -44,11 +44,11 @@ if (Libheif_FOUND)
     set(LIBHEIF_INCLUDES "${LIBHEIF_INCLUDE_DIR}")
     set(LIBHEIF_LIBRARIES "${LIBHEIF_LIBRARY}")
 
-    if (NOT TARGET Libheif::Libheif)
-        add_library(Libheif::Libheif UNKNOWN IMPORTED)
-        set_target_properties(Libheif::Libheif PROPERTIES
+    if (NOT TARGET heif)
+        add_library(heif UNKNOWN IMPORTED)
+        set_target_properties(heif PROPERTIES
             INTERFACE_INCLUDE_DIRECTORIES "${LIBHEIF_INCLUDES}")
-        set_property(TARGET Libheif::Libheif APPEND PROPERTY
+        set_property(TARGET heif APPEND PROPERTY
             IMPORTED_LOCATION "${LIBHEIF_LIBRARIES}")
     endif ()
 endif()
diff --git a/src/heif.imageio/CMakeLists.txt b/src/heif.imageio/CMakeLists.txt
index 25606a1391..c79b544035 100644
--- a/src/heif.imageio/CMakeLists.txt
+++ b/src/heif.imageio/CMakeLists.txt
@@ -3,31 +3,27 @@
 # https://github.com/AcademySoftwareFoundation/OpenImageIO
 
 if (Libheif_FOUND)
-    if (LINKSTATIC)
-        set (_static_suffixes .lib .a)    
-        set (_static_libraries_found 0)
-
-        foreach (_libeheif_library IN LISTS LIBHEIF_LIBRARIES)
-            get_filename_component (_ext ${_libeheif_library} LAST_EXT)
-            list (FIND _static_suffixes ${_ext} _index)
-            if (${_index} GREATER -1)
-                MATH (EXPR _static_libraries_found "${static_libraries_found}+1")
-            endif()
-        endforeach()
-
-        if (${_static_libraries_found} GREATER 0)
-            message (STATUS "${ColorYellow}")
-            message (STATUS "You are linking OpenImageIO against a static version of libheif, which is LGPL")
-            message (STATUS "licensed. If you intend to redistribute this build of OpenImageIO, we recommend")
-            message (STATUS "that you review the libheif license terms, or you may wish to switch to using a")
-            message (STATUS "dynamically-linked libheif.")
-            message ("${ColorReset}")
+    # Warn about LGPL implications if any libheif library we found is static
+    set (_static_suffixes .lib .a)
+    set (_static_libraries_found 0)
+    foreach (_libeheif_library IN LISTS LIBHEIF_LIBRARIES)
+        get_filename_component (_ext "${_libeheif_library}" LAST_EXT)
+        list (FIND _static_suffixes "${_ext}" _index)
+        if (_index GREATER -1)
+            math (EXPR _static_libraries_found "${_static_libraries_found}+1")
         endif()
+    endforeach()
+    if (_static_libraries_found GREATER 0)
+        message (STATUS "${ColorYellow}")
+        message (STATUS "You are linking OpenImageIO against a static version of libheif, which is LGPL")
+        message (STATUS "licensed. If you intend to redistribute this build of OpenImageIO, we recommend")
+        message (STATUS "that you review the libheif license terms, or you may wish to switch to using a")
+        message (STATUS "dynamically-linked libheif.")
+        message ("${ColorReset}")
     endif()
 
     add_oiio_plugin (heifinput.cpp heifoutput.cpp
-                     INCLUDE_DIRS ${LIBHEIF_INCLUDES}
-                     LINK_LIBRARIES ${LIBHEIF_LIBRARIES}
+                     LINK_LIBRARIES heif
                      DEFINITIONS "USE_HEIF=1")
 else ()
     message (WARNING "heif plugin will not be built")

From 06c2c4218a42471a6b89b41f1a930f135fda3266 Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Thu, 22 Jan 2026 17:19:22 -0800
Subject: [PATCH 31/70] build: fully disable tests when their required
 dependencies are missing (#5005)

We previously merely *marked* tests as "broken" in this case, and
depended on ctest being launched with `-E broken` to ensure those were
skipped. Not everybody did. Let's just truly skip them.

Fixes #4979

Signed-off-by: Larry Gritz <lg@larrygritz.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 Makefile                | 2 +-
 src/cmake/testing.cmake | 9 +++++++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index 830e39a208..115a56eb68 100644
--- a/Makefile
+++ b/Makefile
@@ -293,7 +293,7 @@ test: build
 	@ ${CMAKE} -E cmake_echo_color --switch=$(COLOR) --cyan "Running tests ${TEST_FLAGS}..."
 	@ ( cd ${build_dir} ; \
 	    PYTHONPATH=${PWD}/${build_dir}/lib/python/site-packages \
-	    ctest -E broken ${TEST_FLAGS} \
+	    ctest ${TEST_FLAGS} \
 	  )
 	@ ( if [[ "${CODECOV}" == "1" ]] ; then \
 	      cd ${build_dir} ; \
diff --git a/src/cmake/testing.cmake b/src/cmake/testing.cmake
index 72a0cd8843..bfa588ca7b 100644
--- a/src/cmake/testing.cmake
+++ b/src/cmake/testing.cmake
@@ -47,7 +47,7 @@ set(OIIO_TESTSUITE_IMAGEDIR "${PROJECT_BINARY_DIR}/testsuite" CACHE PATH
 #
 # The optional SUFFIX is appended to the test name.
 #
-# The optinonal ENVIRONMENT is a list of environment variables to set for the
+# The optional ENVIRONMENT is a list of environment variables to set for the
 # test.
 #
 macro (oiio_add_tests)
@@ -56,9 +56,12 @@ macro (oiio_add_tests)
     set (_ats_testdir "${OIIO_TESTSUITE_IMAGEDIR}/${_ats_IMAGEDIR}")
     # If there was a FOUNDVAR param specified and that variable name is
     # not defined, mark the test as broken.
+    set (_test_disabled FALSE)
+    set (_test_notfound FALSE)
     foreach (_var ${_ats_FOUNDVAR})
         if (NOT ${_var})
             set (_ats_LABEL "broken")
+            set (_test_notfound TRUE)
         endif ()
     endforeach ()
     set (_test_disabled 0)
@@ -66,7 +69,7 @@ macro (oiio_add_tests)
         if ((NOT "${${_var}}" STREQUAL "" AND NOT "${${_var}}") OR
             (NOT "$ENV{${_var}}" STREQUAL "" AND NOT "$ENV{${_var}}"))
             set (_ats_LABEL "broken")
-            set (_test_disabled 1)
+            set (_test_disabled TRUE)
         endif ()
     endforeach ()
     # For OCIO 2.2+, have the testsuite use the default built-in config
@@ -74,6 +77,8 @@ macro (oiio_add_tests)
                                   "OIIO_TESTSUITE_OCIOCONFIG=ocio://default")
     if (_test_disabled)
         message (STATUS "Skipping test(s) ${_ats_UNPARSED_ARGUMENTS} because of disabled ${_ats_ENABLEVAR}")
+    elseif (_test_notfound)
+        message (STATUS "Skipping test(s) ${_ats_UNPARSED_ARGUMENTS} because of missing dependency from ${_ats_FOUNDVAR}")
     elseif (_ats_IMAGEDIR AND NOT EXISTS ${_ats_testdir})
         # If the directory containing reference data (images) for the test
         # isn't found, point the user at the URL.

From 218e66476bcb62661ce1c3fa00ab49d582a06d40 Mon Sep 17 00:00:00 2001
From: Brecht Van Lommel <brecht@blender.org>
Date: Fri, 23 Jan 2026 20:37:58 +0100
Subject: [PATCH 32/70] fix(heif): Can not output AVIF when libheif has no HEVC
 support (#5013)

Initializing the encoder to `heif_compression_HEVC` by default throws an
exception when libheif was built without HEVC support, which prevents it
from being used entirely even if there is AVIF support.

So delay that initialization until we are sure which encoding we want.

Not really any practical way to automatically test this.

Signed-off-by: Brecht Van Lommel <brecht@blender.org>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 src/heif.imageio/heifoutput.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/heif.imageio/heifoutput.cpp b/src/heif.imageio/heifoutput.cpp
index 9998ab5a5f..309bc53ac5 100644
--- a/src/heif.imageio/heifoutput.cpp
+++ b/src/heif.imageio/heifoutput.cpp
@@ -49,7 +49,9 @@ class HeifOutput final : public ImageOutput {
     std::unique_ptr<heif::Context> m_ctx;
     heif::ImageHandle m_ihandle;
     heif::Image m_himage;
-    heif::Encoder m_encoder { heif_compression_HEVC };
+    // Undefined until we know the specific requested encoder, because an
+    // exception is thrown if libheif is built without support for it.
+    heif::Encoder m_encoder { heif_compression_undefined };
     std::vector<unsigned char> scratch;
     std::vector<unsigned char> m_tilebuffer;
     int m_bitdepth = 0;
@@ -140,12 +142,13 @@ HeifOutput::open(const std::string& name, const ImageSpec& newspec,
         m_himage.add_plane(heif_channel_interleaved, newspec.width,
                            newspec.height, m_bitdepth);
 
-        m_encoder      = heif::Encoder(heif_compression_HEVC);
         auto compqual  = m_spec.decode_compression_metadata("", 75);
         auto extension = Filesystem::extension(m_filename);
         if (compqual.first == "avif"
             || (extension == ".avif" && compqual.first == "")) {
             m_encoder = heif::Encoder(heif_compression_AV1);
+        } else {
+            m_encoder = heif::Encoder(heif_compression_HEVC);
         }
     } catch (const heif::Error& err) {
         std::string e = err.get_message();

From ae8feecb6db656d7c29889d96f26e37f0863e0d1 Mon Sep 17 00:00:00 2001
From: Jesse Yurkovich <jesse.y@gmail.com>
Date: Tue, 27 Jan 2026 20:24:47 -0800
Subject: [PATCH 33/70] fix(webp): Use correct resolution limits for
 WebpOutput::open (#5016)

The WebP format is very limited in the maximum resolution it supports.
We need to change the values we allow to be much smaller.

Signed-off-by: Jesse Yurkovich <jesse.y@gmail.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 src/webp.imageio/webpoutput.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/webp.imageio/webpoutput.cpp b/src/webp.imageio/webpoutput.cpp
index 2c2acb85cc..784575963e 100644
--- a/src/webp.imageio/webpoutput.cpp
+++ b/src/webp.imageio/webpoutput.cpp
@@ -73,7 +73,7 @@ WebpImageWriter(const uint8_t* img_data, size_t data_size,
 bool
 WebpOutput::open(const std::string& name, const ImageSpec& spec, OpenMode mode)
 {
-    if (!check_open(mode, spec, { 0, 1 << 20, 0, 1 << 20, 0, 1, 0, 4 },
+    if (!check_open(mode, spec, { 0, 16383, 0, 16383, 0, 1, 0, 4 },
                     uint64_t(OpenChecks::Disallow1or2Channel)))
         return false;
 

From 2e2ef42ed152c208f40712f58317124c8f8328f8 Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Wed, 28 Jan 2026 04:09:23 -0800
Subject: [PATCH 34/70] api: IBA::make_texture now honors "maketx:threads" hint
 (#5014)

Nearly all IBA functions take an optional parameter controlling the
threading. But make_texture() did not, so there was no way to control
its thread usage (it would always use the default number of threads).

Without changing the call signature or ABI, this patch merely makes
make_texture honor any "maketx:threads" hint passed in the config
ImageSpec that it already takes to convey all sorts of controls over the
texture creation process. Then this value is passed to any IBA
functions, use of parallel_image, and anything else in the
implementation of make_texture that would end up using the thread pool.

Fixes #4254

---------

Signed-off-by: Larry Gritz <lg@larrygritz.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 src/include/OpenImageIO/imagebufalgo.h |   1 +
 src/libOpenImageIO/maketexture.cpp     | 112 ++++++++++++++-----------
 2 files changed, 65 insertions(+), 48 deletions(-)

diff --git a/src/include/OpenImageIO/imagebufalgo.h b/src/include/OpenImageIO/imagebufalgo.h
index 4e9c80fcdc..4bce5c1b6e 100644
--- a/src/include/OpenImageIO/imagebufalgo.h
+++ b/src/include/OpenImageIO/imagebufalgo.h
@@ -2247,6 +2247,7 @@ enum MakeTextureMode {
 ///                                 the coordinates for normal maps. ("")
 ///    - `maketx:verbose` (int) :   How much detail should go to outstream (0).
 ///    - `maketx:runstats` (int) :  If nonzero, print run stats to outstream (0).
+///    - `maketx:threads` (int) :   Number of threads to use (0 = auto).
 ///    - `maketx:resize` (int) :    If nonzero, resize to power of 2. (0)
 ///    - `maketx:keepaspect` (int): If nonzero, save aspect ratio to metadata. (0)
 ///    - `maketx:nomipmap` (int) :  If nonzero, only output the top MIP level (0).
diff --git a/src/libOpenImageIO/maketexture.cpp b/src/libOpenImageIO/maketexture.cpp
index ca38206523..533cf7dc61 100644
--- a/src/libOpenImageIO/maketexture.cpp
+++ b/src/libOpenImageIO/maketexture.cpp
@@ -510,12 +510,11 @@ bump_to_bumpslopes(ImageBuf& dst, const ImageBuf& src,
             return false;
         }
         is_height = false;
-    } else if (Strutil::iequals(
-                   bumpformat,
-                   "auto")) {  // guess input bump format by analyzing channel count and component
+    } else if (Strutil::iequals(bumpformat, "auto")) {
+        // guess input bump format by analyzing channel count and component
         if (src.spec().nchannels > 2
-            && !ImageBufAlgo::isMonochrome(src))  // maybe it's a normal map?
-            is_height = false;
+            && !ImageBufAlgo::isMonochrome(src, 0.0f, ROI(), nthreads))
+            is_height = false;  // maybe it's a normal map?
         else
             is_height = true;
     } else {
@@ -705,6 +704,8 @@ write_mipmap(ImageBufAlgo::MakeTextureMode mode, std::shared_ptr<ImageBuf>& img,
     ImageSpec outspec      = outspec_template;
     outspec.set_format(outputdatatype);
 
+    int nthreads = configspec.get_int_attribute("maketx:threads");
+
     // Going from float to half is prone to generating Inf values if we had
     // any floats that were out of the range that half can represent. Nobody
     // wants Inf in textures; better to clamp.
@@ -783,7 +784,8 @@ write_mipmap(ImageBufAlgo::MakeTextureMode mode, std::shared_ptr<ImageBuf>& img,
 
     if (clamp_half) {
         std::shared_ptr<ImageBuf> tmp(new ImageBuf);
-        ImageBufAlgo::clamp(*tmp, *img, -HALF_MAX, HALF_MAX, true);
+        ImageBufAlgo::clamp(*tmp, *img, -HALF_MAX, HALF_MAX, true, ROI(),
+                            nthreads);
         std::swap(tmp, img);
     }
     if (!img->write(out)) {
@@ -831,7 +833,8 @@ write_mipmap(ImageBufAlgo::MakeTextureMode mode, std::shared_ptr<ImageBuf>& img,
                     std::shared_ptr<ImageBuf> t(new ImageBuf(smallspec));
                     ImageBufAlgo::channels(*t, *small, outspec.nchannels,
                                            cspan<int>(), cspan<float>(),
-                                           cspan<std::string>(), true);
+                                           cspan<std::string>(), true,
+                                           nthreads);
                     std::swap(t, small);
                 }
                 smallspec.tile_width  = outspec.tile_width;
@@ -863,7 +866,7 @@ write_mipmap(ImageBufAlgo::MakeTextureMode mode, std::shared_ptr<ImageBuf>& img,
                 // and pixel windows match.  Don't worry, the texture
                 // engine doesn't care what the upper MIP levels have
                 // for the window sizes, it uses level 0 to determine
-                // the relatinship between texture 0-1 space (display
+                // the relationship between texture 0-1 space (display
                 // window) and the pixels.
                 smallspec.x      = 0;
                 smallspec.y      = 0;
@@ -875,12 +878,11 @@ write_mipmap(ImageBufAlgo::MakeTextureMode mode, std::shared_ptr<ImageBuf>& img,
 
                 if (filtername == "box" && !orig_was_overscan
                     && sharpen <= 0.0f) {
-                    ImageBufAlgo::parallel_image(get_roi(small->spec()),
-                                                 std::bind(resize_block,
-                                                           std::ref(*small),
-                                                           std::cref(*img), _1,
-                                                           envlatlmode,
-                                                           allow_shift));
+                    ImageBufAlgo::parallel_image(
+                        get_roi(small->spec()), paropt(nthreads), [&](ROI roi) {
+                            resize_block(*small, *img, roi, envlatlmode,
+                                         allow_shift);
+                        });
                 } else {
                     Filter2D* filter = setup_filter(small->spec(), img->spec(),
                                                     filtername);
@@ -901,38 +903,44 @@ write_mipmap(ImageBufAlgo::MakeTextureMode mode, std::shared_ptr<ImageBuf>& img,
                         OIIO::print(outstream, "\n");
                     }
                     if (do_highlight_compensation)
-                        ImageBufAlgo::rangecompress(*img, *img);
+                        ImageBufAlgo::rangecompress(*img, *img, false, ROI(),
+                                                    nthreads);
                     if (sharpen > 0.0f && sharpen_first) {
                         std::shared_ptr<ImageBuf> sharp(new ImageBuf);
                         bool uok = ImageBufAlgo::unsharp_mask(*sharp, *img,
-                                                              sharpenfilt, 3.0,
-                                                              sharpen, 0.0f);
+                                                              sharpenfilt, 3.0f,
+                                                              sharpen, 0.0f,
+                                                              ROI(), nthreads);
                         if (!uok)
                             errorfmt("{}", sharp->geterror());
                         std::swap(img, sharp);
                     }
                     ImageBufAlgo::resize(*small, *img,
-                                         { make_pv("filterptr", filter) });
+                                         { make_pv("filterptr", filter) },
+                                         ROI(), nthreads);
                     if (sharpen > 0.0f && !sharpen_first) {
                         std::shared_ptr<ImageBuf> sharp(new ImageBuf);
                         bool uok = ImageBufAlgo::unsharp_mask(*sharp, *small,
-                                                              sharpenfilt, 3.0,
-                                                              sharpen, 0.0f);
+                                                              sharpenfilt, 3.0f,
+                                                              sharpen, 0.0f,
+                                                              ROI(), nthreads);
                         if (!uok)
                             errorfmt("{}", sharp->geterror());
                         std::swap(small, sharp);
                     }
                     if (do_highlight_compensation) {
-                        ImageBufAlgo::rangeexpand(*small, *small);
+                        ImageBufAlgo::rangeexpand(*small, *small, false, ROI(),
+                                                  nthreads);
                         ImageBufAlgo::clamp(*small, *small, 0.0f,
                                             std::numeric_limits<float>::max(),
-                                            true);
+                                            true, ROI(), nthreads);
                     }
                     Filter2D::destroy(filter);
                 }
             }
             if (clamp_half)
-                ImageBufAlgo::clamp(*small, *small, -HALF_MAX, HALF_MAX, true);
+                ImageBufAlgo::clamp(*small, *small, -HALF_MAX, HALF_MAX, true,
+                                    ROI(), nthreads);
 
             double this_miptime = miptimer();
             stat_miptime += this_miptime;
@@ -1093,6 +1101,8 @@ make_texture_impl(ImageBufAlgo::MakeTextureMode mode, const ImageBuf* input,
     if (!configspec.tile_depth)
         configspec.tile_depth = 1;
 
+    int nthreads = configspec.get_int_attribute("maketx:threads");
+
     bool ignore_unassoc = configspec.get_int_attribute("maketx:ignore_unassoc");
     ImageSpec inconfig;
     if (ignore_unassoc)
@@ -1257,8 +1267,7 @@ make_texture_impl(ImageBufAlgo::MakeTextureMode mode, const ImageBuf* input,
         bool ok = true;
         OIIO_DISPATCH_COMMON_TYPES(ok, "lightprobe_to_envlatl",
                                    lightprobe_to_envlatl, src->spec().format,
-                                   *latlong, *src, true);
-        // lightprobe_to_envlatl(*latlong, *src, true);
+                                   *latlong, *src, true, ROI(), nthreads);
         // Carry on with the lat-long environment map from here on out
         mode = ImageBufAlgo::MakeTxEnvLatl;
         src  = latlong;
@@ -1269,7 +1278,8 @@ make_texture_impl(ImageBufAlgo::MakeTextureMode mode, const ImageBuf* input,
         if (Strutil::iequals(configspec.get_string_attribute("maketx:bumprange",
                                                              "auto"),
                              "auto"))
-            src_pixel_stats = ImageBufAlgo::computePixelStats(*src);
+            src_pixel_stats = ImageBufAlgo::computePixelStats(*src, ROI(),
+                                                              nthreads);
 
         ImageSpec newspec  = src->spec();
         newspec.tile_width = newspec.tile_height = 0;
@@ -1286,7 +1296,8 @@ make_texture_impl(ImageBufAlgo::MakeTextureMode mode, const ImageBuf* input,
         bool ok;
         OIIO_DISPATCH_COMMON_TYPES(ok, "bump_to_bumpslopes", bump_to_bumpslopes,
                                    src->spec().format, *bumpslopes, *src,
-                                   configspec, src_pixel_stats, outstream);
+                                   configspec, src_pixel_stats, outstream,
+                                   ROI(), nthreads);
         // bump_to_bumpslopes(*bumpslopes, *src);
         mode = ImageBufAlgo::MakeTxTexture;
         src  = bumpslopes;
@@ -1330,7 +1341,8 @@ make_texture_impl(ImageBufAlgo::MakeTextureMode mode, const ImageBuf* input,
         std::vector<imagesize_t> hist;
 
         for (int i = 0; i < channels; i++) {
-            hist = ImageBufAlgo::histogram(*src, i, bins, 0.0f, 1.0f);
+            hist = ImageBufAlgo::histogram(*src, i, bins, 0.0f, 1.0f, false,
+                                           ROI(), nthreads);
 
             // Turn the histogram into a non-normalized CDF
             for (uint64_t j = 1; j < bins; j++) {
@@ -1389,7 +1401,7 @@ make_texture_impl(ImageBufAlgo::MakeTextureMode mode, const ImageBuf* input,
     bool compute_stats = (constant_color_detect || opaque_detect
                           || compute_average_color || monochrome_detect);
     if (compute_stats) {
-        pixel_stats = ImageBufAlgo::computePixelStats(*src);
+        pixel_stats = ImageBufAlgo::computePixelStats(*src, ROI(), nthreads);
     }
     double stat_pixelstatstime = alltime.lap();
     STATUS("pixelstats", stat_pixelstatstime);
@@ -1420,7 +1432,7 @@ make_texture_impl(ImageBufAlgo::MakeTextureMode mode, const ImageBuf* input,
             newspec.full_height = newspec.height;
             newspec.full_depth  = newspec.depth;
             src->reset(newspec);
-            ImageBufAlgo::fill(*src, constantColor);
+            ImageBufAlgo::fill(*src, constantColor, ROI(), nthreads);
             if (verbose) {
                 outstream << "  Constant color image detected. ";
                 outstream << "Creating " << newspec.width << "x"
@@ -1441,7 +1453,7 @@ make_texture_impl(ImageBufAlgo::MakeTextureMode mode, const ImageBuf* input,
         std::shared_ptr<ImageBuf> newsrc(new ImageBuf(src->spec()));
         ImageBufAlgo::channels(*newsrc, *src, src->nchannels() - 1,
                                cspan<int>(), cspan<float>(),
-                               cspan<std::string>(), true);
+                               cspan<std::string>(), true, nthreads);
         std::swap(src, newsrc);  // N.B. the old src will delete
     }
 
@@ -1455,14 +1467,14 @@ make_texture_impl(ImageBufAlgo::MakeTextureMode mode, const ImageBuf* input,
         && src->spec().alpha_channel < 0
         && pixel_stats.avg[0] == pixel_stats.avg[1]
         && pixel_stats.avg[0] == pixel_stats.avg[2]
-        && ImageBufAlgo::isMonochrome(*src)) {
+        && ImageBufAlgo::isMonochrome(*src, 0.0f, ROI(), nthreads)) {
         if (verbose)
             OIIO::print(
                 outstream,
                 "  Monochrome image detected. Converting to single channel texture.\n");
         std::shared_ptr<ImageBuf> newsrc(new ImageBuf(src->spec()));
         ImageBufAlgo::channels(*newsrc, *src, 1, cspan<int>(), cspan<float>(),
-                               cspan<std::string>(), true);
+                               cspan<std::string>(), true, nthreads);
         newsrc->specmod().default_channel_names();
         std::swap(src, newsrc);
     }
@@ -1475,7 +1487,8 @@ make_texture_impl(ImageBufAlgo::MakeTextureMode mode, const ImageBuf* input,
                       << std::endl;
         std::shared_ptr<ImageBuf> newsrc(new ImageBuf(src->spec()));
         ImageBufAlgo::channels(*newsrc, *src, nchannels, cspan<int>(),
-                               cspan<float>(), cspan<std::string>(), true);
+                               cspan<float>(), cspan<std::string>(), true,
+                               nthreads);
         std::swap(src, newsrc);
     }
 
@@ -1663,7 +1676,8 @@ make_texture_impl(ImageBufAlgo::MakeTextureMode mode, const ImageBuf* input,
         && (srcspec.format.basetype == TypeDesc::FLOAT
             || srcspec.format.basetype == TypeDesc::HALF
             || srcspec.format.basetype == TypeDesc::DOUBLE)
-        && !ImageBufAlgo::fixNonFinite(*src, *src, fixmode, &pixelsFixed)) {
+        && !ImageBufAlgo::fixNonFinite(*src, *src, fixmode, &pixelsFixed, ROI(),
+                                       nthreads)) {
         errorfmt("Error fixing nans/infs.");
         return false;
     }
@@ -1677,7 +1691,7 @@ make_texture_impl(ImageBufAlgo::MakeTextureMode mode, const ImageBuf* input,
             || srcspec.format.basetype == TypeDesc::HALF
             || srcspec.format.basetype == TypeDesc::DOUBLE)) {
         int found_nonfinite = 0;
-        ImageBufAlgo::parallel_image(get_roi(srcspec),
+        ImageBufAlgo::parallel_image(get_roi(srcspec), nthreads,
                                      std::bind(check_nan_block, std::ref(*src),
                                                _1, std::ref(found_nonfinite)));
         if (found_nonfinite) {
@@ -1736,7 +1750,7 @@ make_texture_impl(ImageBufAlgo::MakeTextureMode mode, const ImageBuf* input,
             outstream << "  Unpremulting image..." << std::endl;
 
         if (!ImageBufAlgo::colorconvert(*ccSrc, *src, processor.get(),
-                                        unpremult)) {
+                                        unpremult, ROI(), nthreads)) {
             errorfmt("Error applying color conversion to image.");
             return false;
         }
@@ -1838,10 +1852,12 @@ make_texture_impl(ImageBufAlgo::MakeTextureMode mode, const ImageBuf* input,
         toplevel.reset(new ImageBuf(dstspec));
         if ((resize_filter == "box" || resize_filter == "triangle")
             && !orig_was_overscan) {
-            ImageBufAlgo::parallel_image(
-                get_roi(dstspec),
-                std::bind(resize_block, std::ref(*toplevel), std::cref(*src),
-                          _1, envlatlmode, allow_shift != 0));
+            ImageBufAlgo::parallel_image(get_roi(dstspec), nthreads,
+                                         [&](ROI roi) {
+                                             resize_block(*toplevel, *src, roi,
+                                                          envlatlmode,
+                                                          allow_shift != 0);
+                                         });
         } else {
             Filter2D* filter = setup_filter(toplevel->spec(), src->spec(),
                                             resize_filter);
@@ -1850,7 +1866,8 @@ make_texture_impl(ImageBufAlgo::MakeTextureMode mode, const ImageBuf* input,
                 return false;
             }
             ImageBufAlgo::resize(*toplevel, *src,
-                                 { make_pv("filterptr", filter) });
+                                 { make_pv("filterptr", filter) }, ROI(),
+                                 nthreads);
             Filter2D::destroy(filter);
         }
     }
@@ -1906,12 +1923,11 @@ make_texture_impl(ImageBufAlgo::MakeTextureMode mode, const ImageBuf* input,
         addlHashData << "keepaspect=1 ";
 
     const int sha1_blocksize = 256;
-    std::string hash_digest
-        = configspec.get_int_attribute("maketx:hash", 1)
-              ? ImageBufAlgo::computePixelHashSHA1(*toplevel,
-                                                   addlHashData.str(),
-                                                   ROI::All(), sha1_blocksize)
-              : "";
+    std::string hash_digest  = configspec.get_int_attribute("maketx:hash", 1)
+                                   ? ImageBufAlgo::computePixelHashSHA1(
+                                      *toplevel, addlHashData.str(), ROI::All(),
+                                      sha1_blocksize, nthreads)
+                                   : "";
     if (hash_digest.length()) {
         if (out->supports("arbitrary_metadata")) {
             dstspec.attribute("oiio:SHA-1", hash_digest);

From 94364135c3fd03ef16325138bff88c5c14f44f2f Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Wed, 28 Jan 2026 07:54:22 -0800
Subject: [PATCH 35/70] api(ImageBuf):
 IB::localpixels_as_[writable_]byte_image_span (#5011)

Utilities to ask the IB for the local pixels as an untyped span of
bytes. This is a "safe" alternative to `localpixels()`, which just
returns a single pointer: these methods instead return an image_span
that understands the sizes and strides of the buffer.

---------

Signed-off-by: Larry Gritz <lg@larrygritz.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 src/include/OpenImageIO/imagebuf.h | 10 ++++++++++
 src/libOpenImageIO/imagebuf.cpp    | 16 ++++++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/src/include/OpenImageIO/imagebuf.h b/src/include/OpenImageIO/imagebuf.h
index adc6709d48..eb0227688d 100644
--- a/src/include/OpenImageIO/imagebuf.h
+++ b/src/include/OpenImageIO/imagebuf.h
@@ -1369,6 +1369,16 @@ class OIIO_API ImageBuf {
     void* localpixels();
     const void* localpixels() const;
 
+    /// Return an `image_span<const std::byte>` giving the extent and layout
+    /// of "local" pixel memory, if they are fully in RAM and not backed by an
+    /// ImageCache, or an empty span otherwise.
+    image_span<const std::byte> localpixels_as_byte_image_span() const;
+
+    /// Return an `image_span<std::byte>` giving the extent and layout of
+    /// "local" pixel memory, if they are fully in RAM and not backed by an
+    /// ImageCache, and it is a writable IB, or an empty span otherwise.
+    image_span<std::byte> localpixels_as_writable_byte_image_span();
+
     /// Pixel-to-pixel stride within the localpixels memory.
     stride_t pixel_stride() const;
     /// Scanline-to-scanline stride within the localpixels memory.
diff --git a/src/libOpenImageIO/imagebuf.cpp b/src/libOpenImageIO/imagebuf.cpp
index 403a22c545..fdf14fe447 100644
--- a/src/libOpenImageIO/imagebuf.cpp
+++ b/src/libOpenImageIO/imagebuf.cpp
@@ -2140,6 +2140,22 @@ ImageBuf::localpixels()
 
 
+image_span<const std::byte>
+ImageBuf::localpixels_as_byte_image_span() const
+{
+    return m_impl->m_bufspan;
+}
+
+
+
+image_span<std::byte>
+ImageBuf::localpixels_as_writable_byte_image_span()
+{
+    return m_impl->m_readonly ? image_span<std::byte>() : m_impl->m_bufspan;
+}
+
+
+
 const void*
 ImageBuf::localpixels() const
 {

From c358b74e7c2f148e6e1e8e22ab0d934bf692ee30 Mon Sep 17 00:00:00 2001
From: Brecht Van Lommel <brecht@blender.org>
Date: Wed, 28 Jan 2026 19:13:00 +0100
Subject: [PATCH 36/70] fix(heif): Error saving multiple images with different
 bit depths (#5018)

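This variable should not have been static. Its initializer depends on
m_bitdepth, and a static local array is initialized only the first
time open() reaches it, so later images kept the first image's chroma
choice.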

Signed-off-by: Brecht Van Lommel <brecht@blender.org>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 src/heif.imageio/heifoutput.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/heif.imageio/heifoutput.cpp b/src/heif.imageio/heifoutput.cpp
index 309bc53ac5..315f6b1a60 100644
--- a/src/heif.imageio/heifoutput.cpp
+++ b/src/heif.imageio/heifoutput.cpp
@@ -128,7 +128,7 @@ HeifOutput::open(const std::string& name, const ImageSpec& newspec,
     try {
         m_ctx.reset(new heif::Context);
         m_himage = heif::Image();
-        static heif_chroma chromas[/*nchannels*/]
+        const heif_chroma chromas[/*nchannels*/]
             = { heif_chroma_undefined, heif_chroma_monochrome,
                 heif_chroma_undefined,
                 (m_bitdepth == 8) ? heif_chroma_interleaved_RGB

From 2b9dd6d27251084bf3b6fa503fade6c66368fa14 Mon Sep 17 00:00:00 2001
From: Brecht Van Lommel <brecht@blender.org>
Date: Wed, 28 Jan 2026 19:17:32 +0100
Subject: [PATCH 37/70] feat(heif): Add IOProxy for input and output (#5017)

Add IOProxy support similar to other file formats.

MyHeifWriter was renamed to HeifWriter for consistency with other code.

All input and output now goes through the proxy, so this is covered by
existing tests.

Signed-off-by: Brecht Van Lommel <brecht@blender.org>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 src/doc/builtinplugins.rst      |  8 ++++++
 src/heif.imageio/heifinput.cpp  | 50 ++++++++++++++++++++++++++++-----
 src/heif.imageio/heifoutput.cpp | 37 +++++++++++-------------
 3 files changed, 68 insertions(+), 27 deletions(-)

diff --git a/src/doc/builtinplugins.rst b/src/doc/builtinplugins.rst
index f6eea37efd..05c66fe352 100644
--- a/src/doc/builtinplugins.rst
+++ b/src/doc/builtinplugins.rst
@@ -806,6 +806,10 @@ attributes are supported:
        having Orientation 1). If zero, then libheif will not reorient the
        image and the Orientation metadata will be set to reflect the camera
        orientation.
+   * - ``oiio:ioproxy``
+     - ptr
+     - Pointer to a ``Filesystem::IOProxy`` that will handle the I/O, for
+       example by reading from memory rather than the file system.
 
 **Configuration settings for HEIF output**
 
@@ -824,6 +828,10 @@ control aspects of the writing itself:
      - If supplied, can be ``"heic"`` or ``"avif"``, but may optionally have a
        quality value appended, like ``"heic:90"``. Quality can be 1-100, with
        100 meaning lossless. The default is 75.
+   * - ``oiio:ioproxy``
+     - ptr
+     - Pointer to a ``Filesystem::IOProxy`` that will handle the I/O, for
+       example by writing to memory rather than the file system.
 
 
diff --git a/src/heif.imageio/heifinput.cpp b/src/heif.imageio/heifinput.cpp
index 9e0230a86c..f4b78aae65 100644
--- a/src/heif.imageio/heifinput.cpp
+++ b/src/heif.imageio/heifinput.cpp
@@ -32,6 +32,35 @@
 
 OIIO_PLUGIN_NAMESPACE_BEGIN
 
+
+class HeifReader final : public heif::Context::Reader {
+public:
+    HeifReader(Filesystem::IOProxy* ioproxy)
+        : m_ioproxy(ioproxy)
+    {
+        m_ioproxy->seek(0);
+    }
+    int64_t get_position() const override { return m_ioproxy->tell(); }
+    int read(void* data, size_t size) override
+    {
+        return m_ioproxy->read(data, size) == size ? 0 : -1;
+    }
+    int seek(int64_t position) override
+    {
+        return m_ioproxy->seek(position) ? 0 : -1;
+    }
+    heif_reader_grow_status wait_for_file_size(int64_t target_size) override
+    {
+        return target_size <= int64_t(m_ioproxy->size())
+                   ? heif_reader_grow_status_size_reached
+                   : heif_reader_grow_status_size_beyond_eof;
+    }
+
+private:
+    Filesystem::IOProxy* m_ioproxy;
+};
+
+
 class HeifInput final : public ImageInput {
 public:
     HeifInput() {}
@@ -39,13 +68,13 @@ class HeifInput final : public ImageInput {
     const char* format_name(void) const override { return "heif"; }
     int supports(string_view feature) const override
     {
-        return feature == "exif"
+        return feature == "exif" || feature == "ioproxy"
 #if LIBHEIF_HAVE_VERSION(1, 9, 0)
                || feature == "cicp"
 #endif
             ;
     }
-    bool valid_file(const std::string& filename) const override;
+    bool valid_file(Filesystem::IOProxy* ioproxy) const override;
     bool open(const std::string& name, ImageSpec& newspec) override;
     bool open(const std::string& name, ImageSpec& newspec,
               const ImageSpec& config) override;
@@ -67,6 +96,7 @@ class HeifInput final : public ImageInput {
     bool m_do_associate            = false;
     bool m_reorient                = true;
     std::unique_ptr<heif::Context> m_ctx;
+    std::unique_ptr<HeifReader> m_reader;
     heif_item_id m_primary_id;             // id of primary image
     std::vector<heif_item_id> m_item_ids;  // ids of all other images
     heif::ImageHandle m_ihandle;
@@ -74,7 +104,6 @@ class HeifInput final : public ImageInput {
 };
 
 
-
 void
 oiio_heif_init()
 {
@@ -111,10 +140,12 @@ OIIO_PLUGIN_EXPORTS_END
 
 
 bool
-HeifInput::valid_file(const std::string& filename) const
+HeifInput::valid_file(Filesystem::IOProxy* ioproxy) const
 {
+    if (!ioproxy || ioproxy->mode() != Filesystem::IOProxy::Mode::Read)
+        return false;
     uint8_t magic[12];
-    if (Filesystem::read_bytes(filename, magic, sizeof(magic)) != sizeof(magic))
+    if (ioproxy->pread(magic, sizeof(magic), 0) != sizeof(magic))
         return false;
     heif_filetype_result filetype_check = heif_check_filetype(magic,
                                                               sizeof(magic));
@@ -141,7 +172,12 @@ HeifInput::open(const std::string& name, ImageSpec& newspec,
     m_filename = name;
     m_subimage = -1;
 
+    ioproxy_retrieve_from_config(config);
+    if (!ioproxy_use_or_open(name))
+        return false;
+
     m_ctx.reset(new heif::Context);
+    m_reader.reset(new HeifReader(ioproxy()));
     m_himage  = heif::Image();
     m_ihandle = heif::ImageHandle();
 
@@ -150,8 +186,7 @@ HeifInput::open(const std::string& name, ImageSpec& newspec,
     m_reorient = config.get_int_attribute("oiio:reorient", 1);
 
     try {
-        m_ctx->read_from_file(name);
-        // FIXME: should someday be read_from_reader to give full flexibility
+        m_ctx->read_from_reader(*m_reader);
 
         m_item_ids   = m_ctx->get_list_of_top_level_image_IDs();
         m_primary_id = m_ctx->get_primary_image_ID();
@@ -187,6 +222,7 @@ HeifInput::close()
     m_himage  = heif::Image();
     m_ihandle = heif::ImageHandle();
     m_ctx.reset();
+    m_reader.reset();
     m_subimage                = -1;
     m_num_subimages           = 0;
     m_associated_alpha        = true;
diff --git a/src/heif.imageio/heifoutput.cpp b/src/heif.imageio/heifoutput.cpp
index 315f6b1a60..8cfa40afd7 100644
--- a/src/heif.imageio/heifoutput.cpp
+++ b/src/heif.imageio/heifoutput.cpp
@@ -29,7 +29,8 @@ class HeifOutput final : public ImageOutput {
     const char* format_name(void) const override { return "heif"; }
     int supports(string_view feature) const override
     {
-        return feature == "alpha" || feature == "exif" || feature == "tiles"
+        return feature == "alpha" || feature == "exif" || feature == "ioproxy"
+               || feature == "tiles"
 #if LIBHEIF_HAVE_VERSION(1, 9, 0)
                || feature == "cicp"
 #endif
@@ -58,12 +59,9 @@ class HeifOutput final : public ImageOutput {
 };
 
 
-
-namespace {
-
-class MyHeifWriter final : public heif::Context::Writer {
+class HeifWriter final : public heif::Context::Writer {
 public:
-    MyHeifWriter(Filesystem::IOProxy* ioproxy)
+    HeifWriter(Filesystem::IOProxy* ioproxy)
         : m_ioproxy(ioproxy)
     {
     }
@@ -84,9 +82,6 @@ class MyHeifWriter final : public heif::Context::Writer {
     Filesystem::IOProxy* m_ioproxy = nullptr;
 };
 
-}  // namespace
-
-
 
 OIIO_PLUGIN_EXPORTS_BEGIN
 
@@ -114,6 +109,11 @@ HeifOutput::open(const std::string& name, const ImageSpec& newspec,
 
     m_filename = name;
 
+    ioproxy_retrieve_from_config(m_spec);
+    if (!ioproxy_use_or_open(name)) {
+        return false;
+    }
+
     m_bitdepth = m_spec.format.size() > TypeUInt8.size() ? 10 : 8;
     m_bitdepth = m_spec.get_int_attribute("oiio:BitsPerSample", m_bitdepth);
     if (m_bitdepth == 10 || m_bitdepth == 12) {
@@ -221,7 +221,9 @@ HeifOutput::write_tile(int x, int y, int z, TypeDesc format, const void* data,
 bool
 HeifOutput::close()
 {
-    if (!m_ctx) {  // already closed
+    if (!m_ctx || !ioproxy_opened()) {  // already closed
+        m_ctx.reset();
+        ioproxy_clear();
         return true;
     }
 
@@ -286,25 +288,20 @@ HeifOutput::close()
 #endif
         }
         m_ctx->set_primary_image(m_ihandle);
-        Filesystem::IOFile ioproxy(m_filename, Filesystem::IOProxy::Write);
-        if (ioproxy.mode() != Filesystem::IOProxy::Write) {
-            errorfmt("Could not open \"{}\"", m_filename);
-            ok = false;
-        } else {
-            MyHeifWriter writer(&ioproxy);
-            m_ctx->write(writer);
-        }
+        HeifWriter writer(ioproxy());
+        m_ctx->write(writer);
     } catch (const heif::Error& err) {
         std::string e = err.get_message();
         errorfmt("{}", e.empty() ? "unknown exception" : e.c_str());
-        return false;
+        ok = false;
     } catch (const std::exception& err) {
         std::string e = err.what();
         errorfmt("{}", e.empty() ? "unknown exception" : e.c_str());
-        return false;
+        ok = false;
     }
 
     m_ctx.reset();
+    ioproxy_clear();
     return ok;
 }
 

From 70f643db69588d6d74e3621862292fa4b3d0ee65 Mon Sep 17 00:00:00 2001
From: Brecht Van Lommel <brecht@blender.org>
Date: Thu, 29 Jan 2026 23:56:15 +0100
Subject: [PATCH 38/70] fix(webp): Missing oiio:UnassociatedAlpha on input
 (#5020)

Like other file formats, the returned ImageSpec should indicate if the
image contains unassociated alpha.

This was an oversight in #4770.

Test added.

Signed-off-by: Brecht Van Lommel <brecht@blender.org>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 src/webp.imageio/webpinput.cpp     | 5 ++++-
 testsuite/webp/ref/out-webp1.1.txt | 6 ++++++
 testsuite/webp/run.py              | 3 +++
 3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/src/webp.imageio/webpinput.cpp b/src/webp.imageio/webpinput.cpp
index 09cc08c4cd..bbe35d9822 100644
--- a/src/webp.imageio/webpinput.cpp
+++ b/src/webp.imageio/webpinput.cpp
@@ -214,8 +214,11 @@ WebpInput::open(const std::string& name, ImageSpec& spec,
     // Make space for the decoded image
     m_decoded_image.reset(new uint8_t[m_spec.image_bytes()]);
 
-    if (config.get_int_attribute("oiio:UnassociatedAlpha", 0) == 1)
+    if (config.get_int_attribute("oiio:UnassociatedAlpha", 0) == 1) {
         m_keep_unassociated_alpha = true;
+        if (m_spec.alpha_channel != -1)
+            m_spec.attribute("oiio:UnassociatedAlpha", 1);
+    }
 
     seek_subimage(0, 0);
     spec = m_spec;
diff --git a/testsuite/webp/ref/out-webp1.1.txt b/testsuite/webp/ref/out-webp1.1.txt
index f7e0d095de..92ec6d9f0f 100644
--- a/testsuite/webp/ref/out-webp1.1.txt
+++ b/testsuite/webp/ref/out-webp1.1.txt
@@ -18,3 +18,9 @@ Reading ../oiio-images/webp/4.webp
     SHA-1: 8F42E3DCCE6FE15146BA06C440C15B7831F60572
     channel list: R, G, B
     oiio:ColorSpace: "srgb_rec709_scene"
+Reading rgba.webp
+rgba.webp            :   64 x   64, 4 channel, uint8 webp
+    SHA-1: 897256B6709E1A4DA9DABA92B6BDE39CCFCCD8C1
+    channel list: R, G, B, A
+    oiio:ColorSpace: "srgb_rec709_scene"
+    oiio:UnassociatedAlpha: 1
diff --git a/testsuite/webp/run.py b/testsuite/webp/run.py
index a0a5c53510..3e61dd5d81 100755
--- a/testsuite/webp/run.py
+++ b/testsuite/webp/run.py
@@ -11,3 +11,6 @@
     # a lossy format and is not stable under the round trip
     # command += rw_command (OIIO_TESTSUITE_IMAGEDIR, f,
     #                        extraargs='-attrib compression lossless')
+
+command += oiiotool ("--create 64x64 4 -o rgba.webp")
+command += info_command ("rgba.webp", "--iconfig oiio:UnassociatedAlpha 1", safematch=True)

From a513661ce712601a1ccdeb3ce9072c7396b5db24 Mon Sep 17 00:00:00 2001
From: Zach Lewis <zachlewis@users.noreply.github.com>
Date: Fri, 30 Jan 2026 13:43:49 -0500
Subject: [PATCH 39/70] build(ocio): bump build ver to 2.5.1 (#5022)

Makes OpenColorIO 2.5.1 the default version when building OCIO locally.
This will cause the released python wheels to build against the
currently-latest version of OCIO.

Signed-off-by: Zach Lewis <zachcanbereached@gmail.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 src/cmake/build_OpenColorIO.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cmake/build_OpenColorIO.cmake b/src/cmake/build_OpenColorIO.cmake
index 8502992696..ea68c003c6 100644
--- a/src/cmake/build_OpenColorIO.cmake
+++ b/src/cmake/build_OpenColorIO.cmake
@@ -6,7 +6,7 @@
 # OpenColorIO by hand!
 ######################################################################
 
-set_cache (OpenColorIO_BUILD_VERSION 2.4.2 "OpenColorIO version for local builds")
+set_cache (OpenColorIO_BUILD_VERSION 2.5.1 "OpenColorIO version for local builds")
 set (OpenColorIO_GIT_REPOSITORY "https://github.com/AcademySoftwareFoundation/OpenColorIO")
 set (OpenColorIO_GIT_TAG "v${OpenColorIO_BUILD_VERSION}")
 set_cache (OpenColorIO_BUILD_SHARED_LIBS  OFF

From bbdd55a635d0c691aba775c4fc22e34494342c50 Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Sat, 31 Jan 2026 13:41:10 -0800
Subject: [PATCH 40/70] ci: lock bleeding edge to pybind11 latest version
 (#5024)

There's something in pybind11 master at the moment that is crashing in
its destructors. It's been causing our "bleeding edge" test to fail for
over a week now. I'm tired of our test failing, so I'm locking down to
the last known working version. Will check back periodically and return
to testing against pybind11 master after they have fixed it.

Signed-off-by: Larry Gritz <lg@larrygritz.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index bb8b86f022..a227cdbd7b 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -482,7 +482,7 @@ jobs:
             fmt_ver: master
             opencolorio_ver: main
             openexr_ver: main
-            pybind11_ver: master
+            pybind11_ver: v3.0.1
             python_ver: "3.12"
             simd: avx2,f16c
             benchmark: 1

From 1581ba888bdce40508eaa2fff9a2ccf0aeb35d06 Mon Sep 17 00:00:00 2001
From: Valery Angelique <71804886+vangeliq@users.noreply.github.com>
Date: Sat, 31 Jan 2026 17:52:00 -0800
Subject: [PATCH 41/70] feat(iv): flip, rotate and save image (#5003)

Fixes #4715

- Added save functionality
- Added flipping and rotation capabilities via metadata.
- Hotkeys: Ctrl-Shift-L and Ctrl-Shift-R to rotate the image left and
  right, respectively.

Note that the actual pixel data remains unchanged, so only image formats
that support the rotation metadata will reflect the changes.

I thought that it made sense for the flip and rotate actions to be
under the Tools menu, similar to where the macOS Preview app places
them. Keyboard shortcuts also follow the ones that Preview uses.

Tests

Used a .jpeg photo to test:
- saving
- making changes with/without saving and re-opening the image
- flipping and rotation from different initial rotation/mirrored states

Otherwise there doesn't seem to be a test suite for iv?

Signed-off-by: valery <71804886+vangeliq@users.noreply.github.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 src/iv/imageviewer.cpp | 100 +++++++++++++++++++++++++++++++++++++++++
 src/iv/imageviewer.h   |  15 +++++++
 2 files changed, 115 insertions(+)

diff --git a/src/iv/imageviewer.cpp b/src/iv/imageviewer.cpp
index 12db1aa772..f4a4924a5e 100644
--- a/src/iv/imageviewer.cpp
+++ b/src/iv/imageviewer.cpp
@@ -520,6 +520,22 @@ ImageViewer::createActions()
     closeupAvgPixelsBox->setToolTip(closeupAvgPixelsTooltip);
     closeupAvgPixelsLabel->setToolTip(closeupAvgPixelsTooltip);
 
+
+    rotateLeftAct = new QAction(tr("&Rotate Left"), this);
+    rotateLeftAct->setShortcut(tr("Ctrl+Shift+L"));
+    connect(rotateLeftAct, SIGNAL(triggered()), this, SLOT(rotateLeft()));
+
+    rotateRightAct = new QAction(tr("&Rotate Right"), this);
+    rotateRightAct->setShortcut(tr("Ctrl+Shift+R"));
+    connect(rotateRightAct, SIGNAL(triggered()), this, SLOT(rotateRight()));
+
+    flipHorizontalAct = new QAction(tr("&Flip Horizontal"), this);
+    connect(flipHorizontalAct, SIGNAL(triggered()), this,
+            SLOT(flipHorizontal()));
+
+    flipVerticalAct = new QAction(tr("&Flip Vertical"), this);
+    connect(flipVerticalAct, SIGNAL(triggered()), this, SLOT(flipVertical()));
+
     // Connect signals to ensure closeupAvgPixelsBox value is always <= closeupPixelsBox value
     connect(closeupPixelsBox, QOverload<int>::of(&QSpinBox::valueChanged),
             [this](int value) {
@@ -780,6 +796,11 @@ ImageViewer::createMenus()
     toolsMenu->addAction(toggleAreaSampleAct);
     toolsMenu->addMenu(slideMenu);
     toolsMenu->addMenu(sortMenu);
+    toolsMenu->addSeparator();
+    toolsMenu->addAction(rotateLeftAct);
+    toolsMenu->addAction(rotateRightAct);
+    toolsMenu->addAction(flipHorizontalAct);
+    toolsMenu->addAction(flipVerticalAct);
 
     // Menus, toolbars, & status
     // Annotate
@@ -2455,3 +2476,82 @@ ImageViewer::areaSampleMode() const
 {
     return m_areaSampleMode;
 }
+
+
+void
+ImageViewer::rotateLeft()
+{
+    IvImage* img = cur();
+    if (!img)
+        return;
+
+    ImageSpec* spec = curspecmod();
+
+    int curr_orientation = spec->get_int_attribute("Orientation", 1);
+
+    if (curr_orientation >= 1 && curr_orientation <= 8) {
+        static int next_orientation[] = { 0, 8, 5, 6, 7, 4, 1, 2, 3 };
+        curr_orientation              = next_orientation[curr_orientation];
+        spec->attribute("Orientation", curr_orientation);
+    }
+    displayCurrentImage();
+}
+
+
+void
+ImageViewer::rotateRight()
+{
+    IvImage* img = cur();
+    if (!img)
+        return;
+
+    ImageSpec* spec      = curspecmod();
+    int curr_orientation = spec->get_int_attribute("Orientation", 1);
+
+    if (curr_orientation >= 1 && curr_orientation <= 8) {
+        static int next_orientation[] = { 0, 6, 7, 8, 5, 2, 3, 4, 1 };
+        curr_orientation              = next_orientation[curr_orientation];
+        spec->attribute("Orientation", curr_orientation);
+    }
+    displayCurrentImage();
+}
+
+
+void
+ImageViewer::flipHorizontal()
+{
+    IvImage* img = cur();
+    if (!img)
+        return;
+
+    ImageSpec* spec = curspecmod();
+
+    int curr_orientation = spec->get_int_attribute("Orientation", 1);
+
+    if (curr_orientation >= 1 && curr_orientation <= 8) {
+        static int next_orientation[] = { 0, 2, 1, 4, 3, 6, 5, 8, 7 };
+        curr_orientation              = next_orientation[curr_orientation];
+        spec->attribute("Orientation", curr_orientation);
+    }
+    displayCurrentImage();
+}
+
+
+void
+ImageViewer::flipVertical()
+{
+    IvImage* img = cur();
+    if (!img)
+        return;
+
+    ImageSpec* spec = curspecmod();
+
+    int curr_orientation = spec->get_int_attribute("Orientation", 1);
+
+    if (curr_orientation >= 1 && curr_orientation <= 8) {
+        static int next_orientation[] = { 0, 4, 3, 2, 1, 8, 7, 6, 5 };
+        curr_orientation              = next_orientation[curr_orientation];
+        spec->attribute("Orientation", curr_orientation);
+    }
+    displayCurrentImage();
+}
\ No newline at end of file
diff --git a/src/iv/imageviewer.h b/src/iv/imageviewer.h
index 3a255647bb..763aaf0dd7 100644
--- a/src/iv/imageviewer.h
+++ b/src/iv/imageviewer.h
@@ -219,6 +219,14 @@ class ImageViewer final : public QMainWindow {
         return img ? &img->spec() : NULL;
     }
 
+    /// Return a modifiable ref to the current image spec, or NULL if there is no
+    /// current image.
+    ImageSpec* curspecmod(void) const
+    {
+        IvImage* img = cur();
+        return img ? &img->specmod() : NULL;
+    }
+
     bool pixelviewOn(void) const
     {
         return showPixelviewWindowAct && showPixelviewWindowAct->isChecked();
@@ -334,6 +342,11 @@ private slots:
     void editPreferences();      ///< Edit viewer preferences
     void toggleAreaSample();     ///< Use area probe
 
+    void rotateLeft();
+    void rotateRight();
+    void flipHorizontal();
+    void flipVertical();
+
     void useOCIOAction(bool checked);
     void ocioColorSpaceAction();
     void ocioDisplayViewAction();
@@ -404,6 +417,8 @@ private slots:
     QAction* showPixelviewWindowAct;
     QAction* toggleAreaSampleAct;
     QAction* toggleWindowGuidesAct;
+    QAction *rotateLeftAct, *rotateRightAct, *flipHorizontalAct,
+        *flipVerticalAct;
     QMenu *fileMenu, *editMenu, /**imageMenu,*/ *viewMenu, *toolsMenu,
         *helpMenu;
     QMenu* openRecentMenu;

From bf494524789a74a42cde57e763554c7a32e8423c Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Sun, 1 Feb 2026 12:31:31 -0800
Subject: [PATCH 42/70] CHANGES updates (#5028)

Reflecting this month's releases and other things that recently went
into main.

Signed-off-by: Larry Gritz <lg@larrygritz.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 CHANGES.md | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 CREDITS.md |  3 +++
 2 files changed, 66 insertions(+)

diff --git a/CHANGES.md b/CHANGES.md
index f1ed5b64c0..df7e647fea 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -6,8 +6,11 @@ Release 3.2 (target: Sept 2026?) -- compared to 3.1
 * *New image file format support:*
 * *oiiotool new features and major improvements*:
 * *Command line utilities*:
+  - *iv*: Flip, rotate and save image [#5003](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/5003) (by Valery Angelique) (3.2.0.0)
 * *ImageBuf/ImageBufAlgo*:
+  - *ImageBuf*: IB::localpixels_as_[writable_]byte_image_span [#5011](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/5011) (3.2.0.0, 3.1.10.0)
 * *ImageCache/TextureSystem*:
+  - *api/TS*: `IBA::make_texture()` now honors "maketx:threads" hint [#5014](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/5014) (3.2.0.0, 3.1.10.0)
 * New global attribute queries via OIIO::getattribute():
 * Miscellaneous API changes:
   - *api*: Versioned namespace to preserve ABI compatibility between minor releases [#4869](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4869) (3.2.0.0)
@@ -17,12 +20,20 @@ Release 3.2 (target: Sept 2026?) -- compared to 3.1
   - *openexr*: Write OpenEXR colorInteropID metadata based on oiio:ColorSpace [#4967](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4967) (by Brecht Van Lommel) (3.0.14.0, 3.2.0.0)
   - *jpeg-xl*: CICP read and write support for JPEG-XL [#4968](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4968) (by Brecht Van Lommel) (3.2.0.0, 3.1.9.0)
   - *jpeg-xl*: ICC read and write for JPEG-XL files (issue 4649) [#4905](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4905) (by shanesmith-dwa) (3.0.14.0, 3.2.0.0)
+* Other notable new feature:
+  - *heif*: Add IOProxy support for both input and output [#5017](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/5017) (by Brecht Van Lommel) (3.2.0.0, 3.1.10.0)
+
 ### 🚀  Performance improvements
+  - *perf*: `ImageBufAlgo::resample` and `oiiotool --resample` improvements to speed up 20x or more [#4993](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4993) (3.2.0.0, 3.1.10.0)
+
 ### 🐛  Fixes and feature enhancements
   - *IBA*: IBA::compare_Yee() accessed the wrong channel [#4976](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4976) (by Pavan Madduri) (3.2.0.0)
   - *exif*: Support EXIF 3.0 tags [#4961](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4961) (3.2.0.0)
   - *imagebuf*: Fix set_pixels bug, didn't consider roi = All [#4949](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4949) (3.2.0.0)
   - *ffmpeg*: 10 bit video had wrong green channel [#4935](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4935) (by Brecht Van Lommel) (3.2.0.0, 3.1.7.0)
+  - *heif*: Add IOProxy support for both input and output [#5017](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/5017) (by Brecht Van Lommel) (3.2.0.0, 3.1.10.0)
+  - *heif*: Fix: Could not output AVIF when libheif has no HEVC support [#5013](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/5013) (by Brecht Van Lommel) (3.2.0.0, 3.1.10.0)
+  - *heif*: Fix error saving multiple images with different bit depths [#5018](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/5018) (by Brecht Van Lommel) (3.2.0.0, 3.1.10.0)
   - *iff*: Handle non-zero origin, protect against buffer overflows [#4925](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4925) (3.2.0.0, 3.1.7.0)
   - *jpeg*: Fix wrong pointers/crashing when decoding CMYK jpeg files [#4963](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4963) (3.2.0.0)
   - *jpeg-2000*: Type warning in assertion in jpeg2000output.cpp [#4952](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4952) (3.2.0.0)
@@ -37,27 +48,45 @@ Release 3.2 (target: Sept 2026?) -- compared to 3.1
   - *png*: We were not correctly suppressing hint metadata [#4983](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4983) (3.2.0.0)
   - *sgi*: Implement RLE encoding support for output [#4990](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4990) (by Jesse Yurkovich) (3.2.0.0)
   - *webp*: Allow out-of-order scanlines when writing webp [#4973](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4973) (by Pavan Madduri) (3.2.0.0)
+  - *webp*: Use correct resolution limits for WebpOutput::open [#5016](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/5016) (by Jesse Yurkovich) (3.2.0.0, 3.1.10.0)
+  - *webp*: Fix missing oiio:UnassociatedAlpha on input [#5020](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/5020) (by Brecht Van Lommel) (3.2.0.0, 3.1.10.0)
 ### 🔧  Internals and developer goodies
+  - *fix*: Several bug fixes related to internal use of image_span [#5004](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/5004) (3.2.0.0, 3.1.10.0)
   - *filesystem.h*: Speedup to detect the existence of files on Windows [#4977](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4977) (by JacksonSun-adsk) (3.2.0.0)
 ### 🏗  Build/test/CI and platform ports
 * OIIO's CMake build system and scripts:
   - *build*: Allow auto-build of just required packages by setting `OpenImageIO_BUILD_MISSING_DEPS` to `required`. [#4927](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4927) (3.2.0.0, 3.1.7.0)
   - *build*: Make dependency report more clear about what was required [#4929](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4929) (3.2.0.0, 3.1.7.0)
+  - *build*: Fix HARDENING build options [#4996](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4996) (3.2.0.0)
+  - *build*: Fully disable tests when their required dependencies are missing [#5005](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/5005) (3.2.0.0, 3.1.10.0)
 * Dependency and platform support:
   - *deps*: Additional auto-build capabilities for dependencies that are not found: GIF library [#4921](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4921) (by Valery Angelique), OpenJPEG [#4911](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4911) (by Danny Greenstein) (3.2.0.0, 3.1.7.0)
   - *deps*: Disable LERC in libTIFF local build script [#4957](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4957) (by LI JI) (3.2.0.0, 3.1.8.0)
   - *deps*: Test against libraw 0.21.5 [#4988](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4988) (3.2.0.0, 3.1.9.0)
+  - *build/platforms*: Fix building on OpenBSD [#5001](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/5001) (by Brad Smith) (3.2.0.0, 3.1.10.0)
+  - *build/deps*: Bump OCIO auto-build ver to 2.5.1 [#5022](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/5022) (by Zach Lewis) (3.2.0.0, 3.1.10.0)
+  - *build/deps*: Use libheif exported config if available [#5012](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/5012) (3.2.0.0, 3.1.10.0)
+  - *build/deps*: Libheif 1.21 support [#4992](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4992) (3.2.0.0, 3.1.10.0)
 * Testing and Continuous integration (CI) systems:
   - *tests*: Image_span_test reduce benchmark load for debug and CI renders [#4951](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4951) (3.2.0.0, 3.1.8.0)
+  - *tests*: Add new ref image for jpeg test [#5007](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/5007) (3.2.0.0, 3.1.10.0)
   - *ci*: Python wheel building improvements: use ccache [#4924](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4924) (by Larry Gritz), unbreak wheel release + other enhancements pt 1 [#4937](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4937) (by Zach Lewis) (3.2.0.0, 3.1.7.0)
   - *ci*: Simplify ci workflow by using build-steps for old aswf containers, too [#4932](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4932) (3.2.0.0, 3.1.7.0)
   - *ci*: We were not correctly setting fmt version from job options [#4939](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4939) (3.2.0.0, 3.1.7.0)
   - *ci*: Emergency fix change deprecated sonarqube action [#4969](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4969) (3.2.0.0)
   - *ci*: Try python 3.13 to fix Mac breakage on CI [#4970](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4970) (3.2.0.0)
+  - *ci*: Freetype adjustments [#4999](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4999) (3.2.0.0)
+  - *ci*: Speed up macos15 intel variant by not installing Qt [#4998](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4998) (3.2.0.0, 3.1.10.0)
+  - *ci*: Don't run non-wheel workflows when only pyproject.toml changes [#4997](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4997) (3.2.0.0, 3.1.10.0)
+  - *ci*: Windows runners switched which python version they had [#5010](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/5010) (3.2.0.0, 3.1.10.0)
+  - *ci*: Test against libraw 0.22 for 'latest' test variants [#5009](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/5009) (3.2.0.0, 3.1.10.0)
+  - *ci*: Lock bleeding edge to pybind11 latest version [#5024](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/5024) (3.2.0.0, 3.1.10.0)
 ### 📚  Notable documentation changes
   - *docs*: Update/correct explanation of "openexr:core" attribute, and typo fixes [#4943](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4943) (3.2.0.0, 3.1.7.0)
+  - *docs*: Remove outdated/wrong description in INSTALL.md [#5008](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/5008) (3.2.0.0)
 ### 🏢  Project Administration
   - *admin*: Minor rewording in the issue and PR templates [#4982](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4982) (3.2.0.0)
+  - *admin*: Refine PR template to give more visual separation [#4995](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4995) (3.2.0.0)
 ### 🤝  Contributors
 
 ---
@@ -65,6 +94,30 @@ Release 3.2 (target: Sept 2026?) -- compared to 3.1
 
 
+Release 3.1.10.0 (Feb 1, 2026) -- compared to 3.1.9.0
+-----------------------------------------------------
+  - *perf*: `IBA::resample()` and `oiiotool --resample` improvements to speed up 20x or more [#4993](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4993)
+  - *ImageBuf*: IB::localpixels_as_[writable_]byte_image_span [#5011](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/5011)
+  - *ImageBufAlgo*: IBA::make_texture now honors "maketx:threads" hint [#5014](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/5014)
+  - *heif*: Add IOProxy for input and output [#5017](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/5017) (by Brecht Van Lommel)
+  - *heif*: Can not output AVIF when libheif has no HEVC support [#5013](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/5013) (by Brecht Van Lommel)
+  - *heif*: Error saving multiple images with different bit depths [#5018](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/5018) (by Brecht Van Lommel)
+  - *webp*: Use correct resolution limits for WebpOutput::open [#5016](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/5016) (by Jesse Yurkovich)
+  - *webp*: Missing oiio:UnassociatedAlpha on input [#5020](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/5020) (by Brecht Van Lommel)
+  - *fix*: Several bug fixes related to internal use of image_span [#5004](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/5004)
+  - *build*: Fix building on OpenBSD [#5001](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/5001) (by Brad Smith)
+  - *deps*: Libheif 1.21 support [#4992](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4992)
+  - *deps*: Bump OCIO auto-build ver to 2.5.1 [#5022](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/5022) (by Zach Lewis)
+  - *deps*: Use libheif exported config if available [#5012](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/5012)
+  - *tests*: Add new ref image for jpeg test [#5007](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/5007)
+  - *tests*: Fully disable tests when their required dependencies are missing [#5005](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/5005)
+  - *ci*: Speed up macos15 intel variant by not installing Qt [#4998](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4998)
+  - *ci*: Don't run non-wheel workflows when only pyproject.toml changes [#4997](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4997)
+  - *ci*: Windows runners switched which python version they had [#5010](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/5010)
+  - *ci*: Test against libraw 0.22 for 'latest' test variants [#5009](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/5009)
+  - *ci*: Lock bleeding edge to pybind11 latest version [#5024](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/5024)
+
+
 Release 3.1.9.0 (Jan 1, 2026) -- compared to 3.1.8.0
 ----------------------------------------------------
   - Color management improvements:
@@ -444,6 +497,16 @@ asterisk) had not previously contributed to the project.
 ---
 
 
+Release 3.0.15.0 (Feb 1, 2026) -- compared to 3.0.14.0
+-------------------------------------------------------
+  - *heif*: Can not output AVIF when libheif has no HEVC support [#5013](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/5013) (by Brecht Van Lommel)
+  - *heif*: Error saving multiple images with different bit depths [#5018](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/5018) (by Brecht Van Lommel)
+  - *webp*: Use correct resolution limits for WebpOutput::open [#5016](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/5016) (by Jesse Yurkovich)
+  - *ci*: Speed up macos15 intel variant by not installing Qt [#4998](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4998)
+  - *ci*: Windows runners switched which python version they had [#5010](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/5010)
+  - *ci*: Lock bleeding edge to pybind11 latest version [#5024](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/5024)
+
+
 Release 3.0.14.0 (Jan 1, 2026) -- compared to 3.0.13.0
 -------------------------------------------------------
   - *fix(IBA)*: IBA::compare_Yee() accessed the wrong channel [#4976](https://github.com/AcademySoftwareFoundation/OpenImageIO/pull/4976) (by Pavan Madduri)
diff --git a/CREDITS.md b/CREDITS.md
index f3044fd4bf..ed1bb4c15b 100644
--- a/CREDITS.md
+++ b/CREDITS.md
@@ -110,6 +110,7 @@ lg@openimageio.org
 * Imarz
 * Irena Damsky
 * Ismael Cortes
+* Jackson Sun
 * Jan Hettenkofer
 * Jan Honsbrok
 * Jens Lindgren
@@ -191,6 +192,7 @@ lg@openimageio.org
 * Paul Franz
 * Paul Melis
 * Paul Molodowitch
+* Pavan Madduri
 * Pavel Karneliuk
 * Pete Larabell
 * Peter Horvath
@@ -223,6 +225,7 @@ lg@openimageio.org
 * Sergio Rojas
 * Shane Ambler
 * Shane Smith
+* Shashvat K. Singh
 * Simon Boorer
 * Solomon Boulos
 * SRHMorris

From e3f22f72f6cf44d251f2d69e623c4be91f39ab93 Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Sun, 1 Feb 2026 12:33:10 -0800
Subject: [PATCH 43/70] testing: Adjust test comparison thresholds for Mac ARM
 (#5026)

Even though we have CI testing on Mac with ARM CPU that were passing,
after getting a new laptop, I saw some test failures that were due to
just a few pixels on a few tests needing a higher comparison threshold.
Results are correct, just different due to the math. I guess this
machine (CPU? build flags? specific compiler or library versions?) is
ever so slightly different than the CI Macs, so I caught a few more
instances that needed to be adjusted.

I tried to increase the thresholds as little as possible to fix the
problem.

Signed-off-by: Larry Gritz <lg@larrygritz.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 testsuite/texture-crop/run.py            | 1 +
 testsuite/texture-interp-bilinear/run.py | 1 +
 testsuite/texture-skinny/run.py          | 1 +
 testsuite/texture-uint8/run.py           | 1 +
 4 files changed, 4 insertions(+)

diff --git a/testsuite/texture-crop/run.py b/testsuite/texture-crop/run.py
index 9973dc6afc..5623052c2e 100755
--- a/testsuite/texture-crop/run.py
+++ b/testsuite/texture-crop/run.py
@@ -4,6 +4,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # https://github.com/AcademySoftwareFoundation/OpenImageIO
 
+hardfail = 0.02
 
 command += oiiotool("../common/grid.tif --crop 512x512+200+100 -o grid-crop.tif")
 command += maketx_command ("grid-crop.tif", "grid-crop.tx")
diff --git a/testsuite/texture-interp-bilinear/run.py b/testsuite/texture-interp-bilinear/run.py
index 0617c8319f..ee9dbed8b1 100755
--- a/testsuite/texture-interp-bilinear/run.py
+++ b/testsuite/texture-interp-bilinear/run.py
@@ -4,6 +4,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # https://github.com/AcademySoftwareFoundation/OpenImageIO
 
+hardfail = 0.02
 
 command = testtex_command ("../common/textures/grid.tx",
                            extraargs = "-interpmode 1  -d uint8 -o out.tif")
diff --git a/testsuite/texture-skinny/run.py b/testsuite/texture-skinny/run.py
index 8d2f16c7d7..c61eb79d98 100755
--- a/testsuite/texture-skinny/run.py
+++ b/testsuite/texture-skinny/run.py
@@ -4,6 +4,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # https://github.com/AcademySoftwareFoundation/OpenImageIO
 
+hardfail = 0.013
 
 command = testtex_command ("src/vertgrid.tx", " --scalest 4 1 ")
 outputs = [ "out.exr" ]
diff --git a/testsuite/texture-uint8/run.py b/testsuite/texture-uint8/run.py
index efe91e14c1..af786ad492 100755
--- a/testsuite/texture-uint8/run.py
+++ b/testsuite/texture-uint8/run.py
@@ -4,6 +4,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # https://github.com/AcademySoftwareFoundation/OpenImageIO
 
+hardfail = 0.021
 
 command = testtex_command ("../common/textures/grid.tx")
 outputs = [ "out.exr" ]

From 65a46fc6a5ab88121f0d7d65c49aba0893120d4d Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Mon, 2 Feb 2026 12:30:05 -0800
Subject: [PATCH 44/70] fix: conform certain attrib names "exif:*" to our
 "Exif:*" convention (#5025)

I think it was basically harmless, since we do all the metadata name
comparisons using case-insensitive comparisons. But we use "Exif:" as
our prefix for Exif data throughout OIIO by convention, and there was
this tiny handful of places where we said "exif:".

Signed-off-by: Larry Gritz <lg@larrygritz.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 src/jpeg.imageio/jpegoutput.cpp | 6 +++---
 src/libOpenImageIO/xmp.cpp      | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/jpeg.imageio/jpegoutput.cpp b/src/jpeg.imageio/jpegoutput.cpp
index f6058deb25..dc7a7af3fe 100644
--- a/src/jpeg.imageio/jpegoutput.cpp
+++ b/src/jpeg.imageio/jpegoutput.cpp
@@ -370,9 +370,9 @@ void
 JpgOutput::resmeta_to_density()
 {
     // Clear cruft from Exif that might confuse us
-    m_spec.erase_attribute("exif:XResolution");
-    m_spec.erase_attribute("exif:YResolution");
-    m_spec.erase_attribute("exif:ResolutionUnit");
+    m_spec.erase_attribute("Exif:XResolution");
+    m_spec.erase_attribute("Exif:YResolution");
+    m_spec.erase_attribute("Exif:ResolutionUnit");
 
     string_view resunit = m_spec.get_string_attribute("ResolutionUnit");
     if (Strutil::iequals(resunit, "none"))
diff --git a/src/libOpenImageIO/xmp.cpp b/src/libOpenImageIO/xmp.cpp
index bc1a777a5e..f919afbfd7 100644
--- a/src/libOpenImageIO/xmp.cpp
+++ b/src/libOpenImageIO/xmp.cpp
@@ -98,8 +98,8 @@ static XMPtag xmptag[] = {
     { "tiff:Software", "Software", TypeDesc::STRING, TiffRedundant },
 
     { "exif:ColorSpace", "Exif:ColorSpace", TypeDesc::INT, ExifRedundant },
-    { "exif:PixelXDimension", "", TypeDesc::INT, ExifRedundant|TiffRedundant},
-    { "exif:PixelYDimension", "", TypeDesc::INT, ExifRedundant|TiffRedundant },
+    { "exif:PixelXDimension", "Exif:PixelXDimension", TypeDesc::INT, ExifRedundant|TiffRedundant},
+    { "exif:PixelYDimension", "Exif:PixelYDimension", TypeDesc::INT, ExifRedundant|TiffRedundant },
     { "exifEX:PhotographicSensitivity", "Exif:ISOSpeedRatings", TypeDesc::INT, ExifRedundant },
 
     { "xmp:CreateDate", "DateTime", TypeDesc::STRING, DateConversion|TiffRedundant },

From 093793523bb494c11ff3d1c6521692e1a526af9b Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Mon, 2 Feb 2026 13:42:10 -0800
Subject: [PATCH 45/70] fix(win): `oiiotool --buildinfo` misreported platform
 on MSVS (#5027)

Need to test some MSVS-specific macros to determine what architecture to
report.

And especially, if it doesn't know the processor architecture, it still
should be *appending* that to the platform, not replacing it! This
caused MSVS-compiled OIIO on Windows to report "unknown arch?"

Signed-off-by: Larry Gritz <lg@larrygritz.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 src/libOpenImageIO/imageio.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/libOpenImageIO/imageio.cpp b/src/libOpenImageIO/imageio.cpp
index aa8babf9b4..4873c0f09e 100644
--- a/src/libOpenImageIO/imageio.cpp
+++ b/src/libOpenImageIO/imageio.cpp
@@ -258,14 +258,15 @@ oiio_build_platform()
     platform = "UnknownOS";
 #endif
     platform += "/";
-#if defined(__x86_64__)
+#if defined(__x86_64__) || defined(_M_AMD64)
     platform += "x86_64";
-#elif defined(__i386__)
+#elif defined(__i386__) || defined(_M_IX86)
     platform += "i386";
-#elif defined(_M_ARM64) || defined(__aarch64__) || defined(__aarch64)
+#elif defined(_M_ARM64) || defined(__aarch64__) || defined(__aarch64) \
+    || defined(__ARM_ARCH)
     platform += "ARM";
 #else
-    platform = "unknown arch?";
+    platform += "unknown arch?";
 #endif
     return platform;
 }

From a13b5990f20f3499ca0cabfd878e9e6de85d645a Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Thu, 5 Feb 2026 19:08:29 -0800
Subject: [PATCH 46/70] testing: Add testsuite/heif ref output for libheif 1.21
 + avif support (#5031)

Signed-off-by: Larry Gritz <lg@larrygritz.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 .../heif/ref/out-libheif1.21-with-av1.txt     | 164 ++++++++++++++++++
 1 file changed, 164 insertions(+)
 create mode 100644 testsuite/heif/ref/out-libheif1.21-with-av1.txt

diff --git a/testsuite/heif/ref/out-libheif1.21-with-av1.txt b/testsuite/heif/ref/out-libheif1.21-with-av1.txt
new file mode 100644
index 0000000000..b22dcca2c5
--- /dev/null
+++ b/testsuite/heif/ref/out-libheif1.21-with-av1.txt
@@ -0,0 +1,164 @@
+Reading ref/IMG_7702_small.heic
+ref/IMG_7702_small.heic :  512 x  300, 3 channel, uint8 heif
+    SHA-1: 2380C124F8338910013FEA75C9C64C23567A3156
+    channel list: R, G, B
+    DateTime: "2019:01:21 16:10:54"
+    ExposureTime: 0.030303
+    FNumber: 1.8
+    Make: "Apple"
+    Model: "iPhone 7"
+    Orientation: 1 (normal)
+    ResolutionUnit: 2 (inches)
+    Software: "12.1.2"
+    XResolution: 72
+    YResolution: 72
+    Exif:ApertureValue: 1.69599 (f/1.8)
+    Exif:BrightnessValue: 3.99501
+    Exif:ColorSpace: 65535
+    Exif:DateTimeDigitized: "2019:01:21 16:10:54"
+    Exif:DateTimeOriginal: "2019:01:21 16:10:54"
+    Exif:ExifVersion: "0221"
+    Exif:ExposureBiasValue: 0
+    Exif:ExposureMode: 0 (auto)
+    Exif:ExposureProgram: 2 (normal program)
+    Exif:Flash: 24 (no flash, auto flash)
+    Exif:FlashPixVersion: "0100"
+    Exif:FocalLength: 3.99 (3.99 mm)
+    Exif:FocalLengthIn35mmFilm: 28
+    Exif:LensMake: "Apple"
+    Exif:LensModel: "iPhone 7 back camera 3.99mm f/1.8"
+    Exif:LensSpecification: 3.99, 3.99, 1.8, 1.8
+    Exif:MeteringMode: 5 (pattern)
+    Exif:PhotographicSensitivity: 20
+    Exif:PixelXDimension: 4032
+    Exif:PixelYDimension: 3024
+    Exif:SceneCaptureType: 0 (standard)
+    Exif:SensingMethod: 2 (1-chip color area)
+    Exif:ShutterSpeedValue: 5.03599 (1/32 s)
+    Exif:SubsecTimeDigitized: "006"
+    Exif:SubsecTimeOriginal: "006"
+    Exif:WhiteBalance: 0 (auto)
+    oiio:ColorSpace: "srgb_rec709_scene"
+Reading ref/Chimera-AV1-8bit-162.avif
+ref/Chimera-AV1-8bit-162.avif :  480 x  270, 3 channel, uint8 heif
+    SHA-1: F8FDAF1BD56A21E3AF99CF8EE7FA45434D2826C7
+    channel list: R, G, B
+    oiio:ColorSpace: "srgb_rec709_scene"
+Reading ref/test-10bit.avif
+ref/test-10bit.avif  :   16 x   16, 4 channel, uint10 heif
+    SHA-1: A217653C4E10FEBF080E26F9FC78F572184B1FDA
+    channel list: R, G, B, A
+    Software: "OpenImageIO 3.2.0.0dev : B4BD496D92983E84F1FD621682CAB821C1E2126C"
+    Exif:ExifVersion: "0230"
+    Exif:FlashPixVersion: "0100"
+    Exif:ImageHistory: "oiiotool --pattern fill:topleft=1,0,0,1:topright=0,1,0,1:bottomleft=0,0,1,1:bottomright=1,1,1,1 16x16 4 -d uint16 -o test16.png"
+    heif:UnassociatedAlpha: 1
+    oiio:BitsPerSample: 10
+    oiio:ColorSpace: "srgb_rec709_scene"
+Reading ../oiio-images/heif/greyhounds-looking-for-a-table.heic
+../oiio-images/heif/greyhounds-looking-for-a-table.heic : 3024 x 4032, 3 channel, uint8 heif
+    SHA-1: 8064B23A1A995B0D6525AFB5248EEC6C730BBB6C
+    channel list: R, G, B
+    DateTime: "2023:09:28 09:44:03"
+    ExposureTime: 0.0135135
+    FNumber: 2.4
+    Make: "Apple"
+    Model: "iPhone 12 Pro"
+    Orientation: 1 (normal)
+    ResolutionUnit: 2 (inches)
+    Software: "16.7"
+    XResolution: 72
+    YResolution: 72
+    Exif:ApertureValue: 2.52607 (f/2.4)
+    Exif:BrightnessValue: 2.7506
+    Exif:ColorSpace: 65535
+    Exif:CompositeImage: 2
+    Exif:DateTimeDigitized: "2023:09:28 09:44:03"
+    Exif:DateTimeOriginal: "2023:09:28 09:44:03"
+    Exif:DigitalZoomRatio: 1.3057
+    Exif:ExifVersion: "0232"
+    Exif:ExposureBiasValue: 0
+    Exif:ExposureMode: 0 (auto)
+    Exif:ExposureProgram: 2 (normal program)
+    Exif:Flash: 16 (no flash, flash suppression)
+    Exif:FocalLength: 1.54 (1.54 mm)
+    Exif:FocalLengthIn35mmFilm: 17
+    Exif:LensMake: "Apple"
+    Exif:LensModel: "iPhone 12 Pro back triple camera 1.54mm f/2.4"
+    Exif:LensSpecification: 1.54, 6, 1.6, 2.4
+    Exif:MeteringMode: 5 (pattern)
+    Exif:OffsetTime: "+02:00"
+    Exif:OffsetTimeDigitized: "+02:00"
+    Exif:OffsetTimeOriginal: "+02:00"
+    Exif:PhotographicSensitivity: 320
+    Exif:PixelXDimension: 4032
+    Exif:PixelYDimension: 3024
+    Exif:SensingMethod: 2 (1-chip color area)
+    Exif:ShutterSpeedValue: 6.20983 (1/74 s)
+    Exif:SubsecTimeDigitized: "886"
+    Exif:SubsecTimeOriginal: "886"
+    Exif:WhiteBalance: 0 (auto)
+    GPS:Altitude: 3.24105 (3.24105 m)
+    GPS:AltitudeRef: 0 (above sea level)
+    GPS:DateStamp: "2023:09:28"
+    GPS:DestBearing: 90.2729
+    GPS:DestBearingRef: "T" (true north)
+    GPS:HPositioningError: 5.1893
+    GPS:ImgDirection: 90.2729
+    GPS:ImgDirectionRef: "T" (true north)
+    GPS:Latitude: 41, 50, 58.43
+    GPS:LatitudeRef: "N"
+    GPS:Longitude: 3, 7, 31.98
+    GPS:LongitudeRef: "E"
+    GPS:Speed: 0.171966
+    GPS:SpeedRef: "K" (km/hour)
+    oiio:ColorSpace: "srgb_rec709_scene"
+    oiio:OriginalOrientation: 6
+Reading ../oiio-images/heif/sewing-threads.heic
+../oiio-images/heif/sewing-threads.heic : 4000 x 3000, 3 channel, uint8 heif
+    SHA-1: 44551A0A8AADD2C71B504681F2BAE3F7863EF9B9
+    channel list: R, G, B
+    DateTime: "2023:12:12 18:39:16"
+    ExposureTime: 0.04
+    FNumber: 1.8
+    Make: "samsung"
+    Model: "SM-A326B"
+    Orientation: 1 (normal)
+    ResolutionUnit: 2 (inches)
+    Software: "A326BXXS8CWK2"
+    XResolution: 72
+    YResolution: 72
+    Exif:ApertureValue: 1.69 (f/1.8)
+    Exif:BrightnessValue: 1.19
+    Exif:ColorSpace: 1
+    Exif:DateTimeDigitized: "2023:12:12 18:39:16"
+    Exif:DateTimeOriginal: "2023:12:12 18:39:16"
+    Exif:DigitalZoomRatio: 1
+    Exif:ExifVersion: "0220"
+    Exif:ExposureBiasValue: 0
+    Exif:ExposureMode: 0 (auto)
+    Exif:ExposureProgram: 2 (normal program)
+    Exif:Flash: 0 (no flash)
+    Exif:FocalLength: 4.6 (4.6 mm)
+    Exif:FocalLengthIn35mmFilm: 25
+    Exif:MaxApertureValue: 1.69 (f/1.8)
+    Exif:MeteringMode: 2 (center-weighted average)
+    Exif:OffsetTime: "+01:00"
+    Exif:OffsetTimeOriginal: "+01:00"
+    Exif:PhotographicSensitivity: 500
+    Exif:PixelXDimension: 4000
+    Exif:PixelYDimension: 3000
+    Exif:SceneCaptureType: 0 (standard)
+    Exif:ShutterSpeedValue: 0.04 (1/1 s)
+    Exif:SubsecTime: "576"
+    Exif:SubsecTimeDigitized: "576"
+    Exif:SubsecTimeOriginal: "576"
+    Exif:WhiteBalance: 0 (auto)
+    Exif:YCbCrPositioning: 1
+    GPS:Altitude: 292 (292 m)
+    GPS:AltitudeRef: 0 (above sea level)
+    GPS:Latitude: 41, 43, 33.821
+    GPS:LatitudeRef: "N"
+    GPS:Longitude: 1, 49, 34.0187
+    GPS:LongitudeRef: "E"
+    oiio:ColorSpace: "srgb_rec709_scene"

From 8251bba087ac19de58976cbbf4437ae910fe4b38 Mon Sep 17 00:00:00 2001
From: Jesse Yurkovich <jesse.y@gmail.com>
Date: Fri, 6 Feb 2026 13:55:43 -0800
Subject: [PATCH 47/70] cleanup: remove left over tile emulation code for
 various formats (#5029)

Since the OpenImageIO 2.5 series, when calls to `check_open` were added,
any format that did not declare support for "tiles" would immediately
fail to open. But many of the formats which attempted to emulate tiles,
by buffering the contents and writing it all as scanlines at the end,
were not updated. All of the tile emulation code for these formats is
effectively dead-code and untested.

Remove the tile emulation code from these formats.

An example of what the failure currently looks like:
```python
>>> out = oiio.ImageOutput.create("test.png")
>>> spec = oiio.ImageSpec(64, 64, 3, 'uint8')
>>> spec.tile_width = 64
>>> out.open("test.png", spec)
False

>>> out.geterror()
'png does not support tiled images'
```

No tests were impacted.

Signed-off-by: Jesse Yurkovich <jesse.y@gmail.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 src/bmp.imageio/bmpoutput.cpp     | 37 +------------------------------
 src/dpx.imageio/dpxoutput.cpp     | 36 +-----------------------------
 src/hdr.imageio/hdroutput.cpp     | 31 +-------------------------
 src/ico.imageio/icooutput.cpp     | 30 +------------------------
 src/jpeg.imageio/jpegoutput.cpp   | 33 ++-------------------------
 src/png.imageio/pngoutput.cpp     | 30 +------------------------
 src/pnm.imageio/pnmoutput.cpp     | 29 +-----------------------
 src/rla.imageio/rlaoutput.cpp     | 30 +------------------------
 src/targa.imageio/targaoutput.cpp | 31 +-------------------------
 src/zfile.imageio/zfile.cpp       | 36 +-----------------------------
 10 files changed, 11 insertions(+), 312 deletions(-)

diff --git a/src/bmp.imageio/bmpoutput.cpp b/src/bmp.imageio/bmpoutput.cpp
index c6e772799c..e60ae820ff 100644
--- a/src/bmp.imageio/bmpoutput.cpp
+++ b/src/bmp.imageio/bmpoutput.cpp
@@ -27,9 +27,6 @@ class BmpOutput final : public ImageOutput {
     bool close(void) override;
     bool write_scanline(int y, int z, TypeDesc format, const void* data,
                         stride_t xstride) override;
-    bool write_tile(int x, int y, int z, TypeDesc format, const void* data,
-                    stride_t xstride, stride_t ystride,
-                    stride_t zstride) override;
 
 private:
     int64_t m_padded_scanline_size;
@@ -38,7 +35,6 @@ class BmpOutput final : public ImageOutput {
     bmp_pvt::DibInformationHeader m_dib_header;
     int64_t m_image_start;
     unsigned int m_dither;
-    std::vector<unsigned char> m_tilebuffer;
     std::vector<unsigned char> m_scratch;
     std::vector<unsigned char> m_buf;  // more tmp space for write_scanline
 
@@ -110,11 +106,6 @@ BmpOutput::open(const std::string& name, const ImageSpec& spec, OpenMode mode)
 
     m_image_start = iotell();
 
-    // If user asked for tiles -- which this format doesn't support, emulate
-    // it by buffering the whole image.
-    if (m_spec.tile_width && m_spec.tile_height)
-        m_tilebuffer.resize(m_spec.image_bytes());
-
     return true;
 }
 
@@ -157,23 +148,6 @@ BmpOutput::write_scanline(int y, int z, TypeDesc format, const void* data,
 }
 
 
-
-bool
-BmpOutput::write_tile(int x, int y, int z, TypeDesc format, const void* data,
-                      stride_t xstride, stride_t ystride, stride_t zstride)
-{
-    if (!ioproxy_opened()) {
-        errorfmt("write_tile called but file is not open.");
-        return false;
-    }
-
-    // Emulate tiles by buffering the whole image
-    return copy_tile_to_image_buffer(x, y, z, format, data, xstride, ystride,
-                                     zstride, m_tilebuffer.data());
-}
-
-
-
 bool
 BmpOutput::close(void)
 {
@@ -182,17 +156,8 @@ BmpOutput::close(void)
         return true;
     }
 
-    bool ok = true;
-    if (m_spec.tile_width && m_tilebuffer.size()) {
-        // Handle tile emulation -- output the buffered pixels
-        OIIO_DASSERT(m_tilebuffer.size());
-        ok &= write_scanlines(m_spec.y, m_spec.y + m_spec.height, 0,
-                              m_spec.format, m_tilebuffer.data());
-        std::vector<unsigned char>().swap(m_tilebuffer);
-    }
-
     init();
-    return ok;
+    return true;
 }
 
 
diff --git a/src/dpx.imageio/dpxoutput.cpp b/src/dpx.imageio/dpxoutput.cpp
index 4db1103a3c..719a38da1a 100644
--- a/src/dpx.imageio/dpxoutput.cpp
+++ b/src/dpx.imageio/dpxoutput.cpp
@@ -46,9 +46,6 @@ class DPXOutput final : public ImageOutput {
     bool close() override;
     bool write_scanline(int y, int z, TypeDesc format, const void* data,
                         stride_t xstride) override;
-    bool write_tile(int x, int y, int z, TypeDesc format, const void* data,
-                    stride_t xstride, stride_t ystride,
-                    stride_t zstride) override;
 
 private:
     OutStream* m_stream = nullptr;
@@ -69,7 +66,6 @@ class DPXOutput final : public ImageOutput {
     std::vector<ImageSpec> m_subimage_specs;
     bool m_write_pending;  // subimage buffer needs to be written
     unsigned int m_dither;
-    std::vector<unsigned char> m_tilebuffer;
 
     // Initialize private members to pre-opened state
     void init(void)
@@ -417,11 +413,6 @@ DPXOutput::open(const std::string& name, const ImageSpec& userspec,
                    ? spec0.get_int_attribute("oiio:dither", 0)
                    : 0;
 
-    // If user asked for tiles -- which this format doesn't support, emulate
-    // it by buffering the whole image.
-    if (spec0.tile_width && spec0.tile_height)
-        m_tilebuffer.resize(spec0.image_bytes());
-
     return prep_subimage(m_subimage, true);
 }
 
@@ -593,16 +584,7 @@ DPXOutput::close()
         return true;
     }
 
-    bool ok = true;
-    const ImageSpec& spec_s(m_subimage_specs[m_subimage]);
-    if (spec_s.tile_width && m_tilebuffer.size()) {
-        // Handle tile emulation -- output the buffered pixels
-        ok &= write_scanlines(spec_s.y, spec_s.y + spec_s.height, 0,
-                              spec_s.format, &m_tilebuffer[0]);
-        std::vector<unsigned char>().swap(m_tilebuffer);
-    }
-
-    ok &= write_buffer();
+    bool ok = write_buffer();
     m_dpx.Finish();
     init();  // Reset to initial state
     return ok;
@@ -644,22 +626,6 @@ DPXOutput::write_scanline(int y, int z, TypeDesc format, const void* data,
 
 
-bool
-DPXOutput::write_tile(int x, int y, int z, TypeDesc format, const void* data,
-                      stride_t xstride, stride_t ystride, stride_t zstride)
-{
-    if (!is_opened()) {
-        errorfmt("write_tile called but file is not open.");
-        return false;
-    }
-
-    // Emulate tiles by buffering the whole image
-    return copy_tile_to_image_buffer(x, y, z, format, data, xstride, ystride,
-                                     zstride, &m_tilebuffer[0]);
-}
-
-
-
 dpx::Characteristic
 DPXOutput::get_characteristic_from_string(const std::string& str)
 {
diff --git a/src/hdr.imageio/hdroutput.cpp b/src/hdr.imageio/hdroutput.cpp
index 5feb97042a..e682a20079 100644
--- a/src/hdr.imageio/hdroutput.cpp
+++ b/src/hdr.imageio/hdroutput.cpp
@@ -26,14 +26,10 @@ class HdrOutput final : public ImageOutput {
               OpenMode mode) override;
     bool write_scanline(int y, int z, TypeDesc format, const void* data,
                         stride_t xstride) override;
-    bool write_tile(int x, int y, int z, TypeDesc format, const void* data,
-                    stride_t xstride, stride_t ystride,
-                    stride_t zstride) override;
     bool close() override;
 
 private:
     std::vector<unsigned char> scratch;
-    std::vector<unsigned char> m_tilebuffer;
 
     void init(void) { ioproxy_clear(); }
 
@@ -226,11 +222,6 @@ HdrOutput::open(const std::string& name, const ImageSpec& newspec,
     if (!iowritefmt("-Y {} +X {}\n", m_spec.height, m_spec.width))
         return false;
 
-    // If user asked for tiles -- which this format doesn't support, emulate
-    // it by buffering the whole image.
-    if (m_spec.tile_width && m_spec.tile_height)
-        m_tilebuffer.resize(m_spec.image_bytes());
-
     return true;
 }
 
@@ -246,17 +237,6 @@ HdrOutput::write_scanline(int /*y*/, int /*z*/, TypeDesc format,
 
 
-bool
-HdrOutput::write_tile(int x, int y, int z, TypeDesc format, const void* data,
-                      stride_t xstride, stride_t ystride, stride_t zstride)
-{
-    // Emulate tiles by buffering the whole image
-    return copy_tile_to_image_buffer(x, y, z, format, data, xstride, ystride,
-                                     zstride, &m_tilebuffer[0]);
-}
-
-
-
 bool
 HdrOutput::close()
 {
@@ -265,18 +245,9 @@ HdrOutput::close()
         return true;
     }
 
-    bool ok = true;
-    if (m_spec.tile_width) {
-        // We've been emulating tiles; now dump as scanlines.
-        OIIO_ASSERT(m_tilebuffer.size());
-        ok &= write_scanlines(m_spec.y, m_spec.y + m_spec.height, 0,
-                              m_spec.format, &m_tilebuffer[0]);
-        std::vector<unsigned char>().swap(m_tilebuffer);
-    }
-
     init();
 
-    return ok;
+    return true;
 }
 
 OIIO_PLUGIN_NAMESPACE_END
diff --git a/src/ico.imageio/icooutput.cpp b/src/ico.imageio/icooutput.cpp
index 18e5fdb5a3..b6f3eed9de 100644
--- a/src/ico.imageio/icooutput.cpp
+++ b/src/ico.imageio/icooutput.cpp
@@ -32,9 +32,6 @@ class ICOOutput final : public ImageOutput {
     bool close() override;
     bool write_scanline(int y, int z, TypeDesc format, const void* data,
                         stride_t xstride) override;
-    bool write_tile(int x, int y, int z, TypeDesc format, const void* data,
-                    stride_t xstride, stride_t ystride,
-                    stride_t zstride) override;
 
 private:
     std::string m_filename;                ///< Stash the filename
@@ -47,7 +44,6 @@ class ICOOutput final : public ImageOutput {
     int m_and_slb;  ///< AND mask scanline length in bytes
     int m_bpp;      ///< Bits per pixel
     unsigned int m_dither;
-    std::vector<unsigned char> m_tilebuffer;
 
     png_structp m_png;  ///< PNG read structure pointer
     png_infop m_info;   ///< PNG image info structure pointer
@@ -361,11 +357,6 @@ ICOOutput::open(const std::string& name, const ImageSpec& userspec,
         fseek(m_file, m_offset + sizeof(bmi), SEEK_SET);
     }
 
-    // If user asked for tiles -- which this format doesn't support, emulate
-    // it by buffering the whole image.
-    if (m_spec.tile_width && m_spec.tile_height)
-        m_tilebuffer.resize(m_spec.image_bytes());
-
     return true;
 }
 
@@ -392,15 +383,6 @@ ICOOutput::close()
         return true;
     }
 
-    bool ok = true;
-    if (m_spec.tile_width) {
-        // Handle tile emulation -- output the buffered pixels
-        OIIO_ASSERT(m_tilebuffer.size());
-        ok &= write_scanlines(m_spec.y, m_spec.y + m_spec.height, 0,
-                              m_spec.format, &m_tilebuffer[0]);
-        std::vector<unsigned char>().swap(m_tilebuffer);
-    }
-
     if (m_png) {
         PNG_pvt::write_end(m_png, m_info);
         if (m_png || m_info)
@@ -411,7 +393,7 @@ ICOOutput::close()
     fclose(m_file);
     m_file = NULL;
     init();  // re-initialize
-    return ok;
+    return true;
 }
 
 
@@ -516,14 +498,4 @@ ICOOutput::write_scanline(int y, int z, TypeDesc format, const void* data,
 
 
-bool
-ICOOutput::write_tile(int x, int y, int z, TypeDesc format, const void* data,
-                      stride_t xstride, stride_t ystride, stride_t zstride)
-{
-    // Emulate tiles by buffering the whole image
-    return copy_tile_to_image_buffer(x, y, z, format, data, xstride, ystride,
-                                     zstride, &m_tilebuffer[0]);
-}
-
-
 OIIO_PLUGIN_NAMESPACE_END
diff --git a/src/jpeg.imageio/jpegoutput.cpp b/src/jpeg.imageio/jpegoutput.cpp
index dc7a7af3fe..68156aad0d 100644
--- a/src/jpeg.imageio/jpegoutput.cpp
+++ b/src/jpeg.imageio/jpegoutput.cpp
@@ -40,9 +40,6 @@ class JpgOutput final : public ImageOutput {
               OpenMode mode = Create) override;
     bool write_scanline(int y, int z, TypeDesc format, const void* data,
                         stride_t xstride) override;
-    bool write_tile(int x, int y, int z, TypeDesc format, const void* data,
-                    stride_t xstride, stride_t ystride,
-                    stride_t zstride) override;
     bool close() override;
     bool copy_image(ImageInput* in) override;
 
@@ -55,7 +52,7 @@ class JpgOutput final : public ImageOutput {
     struct jpeg_error_mgr c_jerr;
     jvirt_barray_ptr* m_copy_coeffs;
     struct jpeg_decompress_struct* m_copy_decompressor;
-    std::vector<unsigned char> m_tilebuffer;
+
     // m_outbuffer/m_outsize are used for jpeg-to-memory
     unsigned char* m_outbuffer = nullptr;
 #if OIIO_JPEG_LIB_VERSION >= 94
@@ -356,11 +353,6 @@ JpgOutput::open(const std::string& name, const ImageSpec& newspec,
 
     m_dither = m_spec.get_int_attribute("oiio:dither", 0);
 
-    // If user asked for tiles -- which JPEG doesn't support, emulate it by
-    // buffering the whole image.
-    if (m_spec.tile_width && m_spec.tile_height)
-        m_tilebuffer.resize(m_spec.image_bytes());
-
     return true;
 }
 
@@ -517,17 +509,6 @@ JpgOutput::write_scanline(int y, int z, TypeDesc format, const void* data,
 
 
-bool
-JpgOutput::write_tile(int x, int y, int z, TypeDesc format, const void* data,
-                      stride_t xstride, stride_t ystride, stride_t zstride)
-{
-    // Emulate tiles by buffering the whole image
-    return copy_tile_to_image_buffer(x, y, z, format, data, xstride, ystride,
-                                     zstride, &m_tilebuffer[0]);
-}
-
-
-
 bool
 JpgOutput::close()
 {
@@ -536,16 +517,6 @@ JpgOutput::close()
         return true;
     }
 
-    bool ok = true;
-
-    if (m_spec.tile_width) {
-        // We've been emulating tiles; now dump as scanlines.
-        OIIO_DASSERT(m_tilebuffer.size());
-        ok &= write_scanlines(m_spec.y, m_spec.y + m_spec.height, 0,
-                              m_spec.format, &m_tilebuffer[0]);
-        std::vector<unsigned char>().swap(m_tilebuffer);  // free it
-    }
-
     if (m_next_scanline < spec().height && m_copy_coeffs == NULL) {
         // But if we've only written some scanlines, write the rest to avoid
         // errors
@@ -578,7 +549,7 @@ JpgOutput::close()
     }
 
     init();
-    return ok;
+    return true;
 }
 
 
diff --git a/src/png.imageio/pngoutput.cpp b/src/png.imageio/pngoutput.cpp
index 00e11947c9..7d06eca25f 100644
--- a/src/png.imageio/pngoutput.cpp
+++ b/src/png.imageio/pngoutput.cpp
@@ -38,9 +38,6 @@ class PNGOutput final : public ImageOutput {
     bool write_scanlines(int ybegin, int yend, int z, TypeDesc format,
                          const void* data, stride_t xstride = AutoStride,
                          stride_t ystride = AutoStride) override;
-    bool write_tile(int x, int y, int z, TypeDesc format, const void* data,
-                    stride_t xstride, stride_t ystride,
-                    stride_t zstride) override;
 
 private:
     std::string m_filename;  ///< Stash the filename
@@ -55,7 +52,6 @@ class PNGOutput final : public ImageOutput {
     float m_gamma = 1.0f;   ///< Gamma to use for alpha conversion
     std::vector<unsigned char> m_scratch;
     std::vector<png_text> m_pngtext;
-    std::vector<unsigned char> m_tilebuffer;
     bool m_err = false;
 
     // Initialize private members to pre-opened state
@@ -240,11 +236,6 @@ PNGOutput::open(const std::string& name, const ImageSpec& userspec,
     m_convert_alpha = m_spec.alpha_channel != -1
                       && !m_spec.get_int_attribute("oiio:UnassociatedAlpha", 0);
 
-    // If user asked for tiles -- which this format doesn't support, emulate
-    // it by buffering the whole image.
-    if (m_spec.tile_width && m_spec.tile_height)
-        m_tilebuffer.resize(m_spec.image_bytes());
-
     return true;
 }
 
@@ -258,15 +249,6 @@ PNGOutput::close()
         return true;
     }
 
-    bool ok = true;
-    if (m_spec.tile_width) {
-        // Handle tile emulation -- output the buffered pixels
-        OIIO_ASSERT(m_tilebuffer.size());
-        ok &= write_scanlines(m_spec.y, m_spec.y + m_spec.height, 0,
-                              m_spec.format, &m_tilebuffer[0]);
-        std::vector<unsigned char>().swap(m_tilebuffer);
-    }
-
     if (m_png) {
         PNG_pvt::write_end(m_png, m_info);
         if (m_png || m_info)
@@ -276,7 +258,7 @@ PNGOutput::close()
     }
 
     init();  // re-initialize
-    return ok;
+    return true;
 }
 
 
@@ -457,14 +439,4 @@ PNGOutput::write_scanlines(int ybegin, int yend, int z, TypeDesc format,
 
 
-bool
-PNGOutput::write_tile(int x, int y, int z, TypeDesc format, const void* data,
-                      stride_t xstride, stride_t ystride, stride_t zstride)
-{
-    // Emulate tiles by buffering the whole image
-    return copy_tile_to_image_buffer(x, y, z, format, data, xstride, ystride,
-                                     zstride, &m_tilebuffer[0]);
-}
-
-
 OIIO_PLUGIN_NAMESPACE_END
diff --git a/src/pnm.imageio/pnmoutput.cpp b/src/pnm.imageio/pnmoutput.cpp
index 54596a3afa..1bb94b9677 100644
--- a/src/pnm.imageio/pnmoutput.cpp
+++ b/src/pnm.imageio/pnmoutput.cpp
@@ -30,9 +30,6 @@ class PNMOutput final : public ImageOutput {
     bool write_scanlines(int ybegin, int yend, int z, TypeDesc format,
                          const void* data, stride_t xstride = AutoStride,
                          stride_t ystride = AutoStride) override;
-    bool write_tile(int x, int y, int z, TypeDesc format, const void* data,
-                    stride_t xstride, stride_t ystride,
-                    stride_t zstride) override;
 
 private:
     std::string m_filename;  // Stash the filename
@@ -42,7 +39,6 @@ class PNMOutput final : public ImageOutput {
 
     unsigned int m_dither;
     std::vector<unsigned char> m_scratch;
-    std::vector<unsigned char> m_tilebuffer;
 
     void init(void) { ioproxy_clear(); }
 
@@ -310,10 +306,6 @@ PNMOutput::open(const std::string& name, const ImageSpec& userspec,
             ok &= iowritefmt("{}\n", scale);
         }
     }
-    // If user asked for tiles -- which this format doesn't support, emulate
-    // it by buffering the whole image.
-    if (m_spec.tile_width && m_spec.tile_height)
-        m_tilebuffer.resize(m_spec.image_bytes());
 
     return ok;
 }
@@ -326,17 +318,8 @@ PNMOutput::close()
     if (!ioproxy_opened())  // already closed
         return true;
 
-    bool ok = true;
-    if (m_spec.tile_width) {
-        // Handle tile emulation -- output the buffered pixels
-        OIIO_DASSERT(m_tilebuffer.size());
-        ok &= ImageOutput::write_scanlines(m_spec.y, m_spec.y + m_spec.height,
-                                           0, m_spec.format, &m_tilebuffer[0]);
-        m_tilebuffer.shrink_to_fit();
-    }
-
     init();
-    return ok;
+    return true;
 }
 
 
@@ -422,14 +405,4 @@ PNMOutput::write_scanlines(int ybegin, int yend, int z, TypeDesc format,
 
 
-bool
-PNMOutput::write_tile(int x, int y, int z, TypeDesc format, const void* data,
-                      stride_t xstride, stride_t ystride, stride_t zstride)
-{
-    // Emulate tiles by buffering the whole image
-    return copy_tile_to_image_buffer(x, y, z, format, data, xstride, ystride,
-                                     zstride, &m_tilebuffer[0]);
-}
-
-
 OIIO_PLUGIN_NAMESPACE_END
diff --git a/src/rla.imageio/rlaoutput.cpp b/src/rla.imageio/rlaoutput.cpp
index fec2736d51..6afcad128f 100644
--- a/src/rla.imageio/rlaoutput.cpp
+++ b/src/rla.imageio/rlaoutput.cpp
@@ -35,16 +35,12 @@ class RLAOutput final : public ImageOutput {
     bool close() override;
     bool write_scanline(int y, int z, TypeDesc format, const void* data,
                         stride_t xstride) override;
-    bool write_tile(int x, int y, int z, TypeDesc format, const void* data,
-                    stride_t xstride, stride_t ystride,
-                    stride_t zstride) override;
 
 private:
     std::vector<unsigned char> m_scratch;
     RLAHeader m_rla;                   ///< Wavefront RLA header
     std::vector<uint32_t> m_sot;       ///< Scanline offset table
     std::vector<unsigned char> m_rle;  ///< Run record buffer for RLE
-    std::vector<unsigned char> m_tilebuffer;
     unsigned int m_dither;
 
     // Initialize private members to pre-opened state
@@ -340,11 +336,6 @@ RLAOutput::open(const std::string& name, const ImageSpec& userspec,
     m_sot.resize(m_spec.height, (int32_t)0);
     write(&m_sot[0], m_sot.size());
 
-    // If user asked for tiles -- which this format doesn't support, emulate
-    // it by buffering the whole image.
-    if (m_spec.tile_width && m_spec.tile_height)
-        m_tilebuffer.resize(m_spec.image_bytes());
-
     return true;
 }
 
@@ -382,22 +373,13 @@ RLAOutput::close()
         return true;
     }
 
-    bool ok = true;
-    if (m_spec.tile_width) {
-        // Handle tile emulation -- output the buffered pixels
-        OIIO_DASSERT(m_tilebuffer.size());
-        ok &= write_scanlines(m_spec.y, m_spec.y + m_spec.height, 0,
-                              m_spec.format, &m_tilebuffer[0]);
-        std::vector<unsigned char>().swap(m_tilebuffer);
-    }
-
     // Now that all scanlines have been output, return to write the
     // correct scanline offset table to file and close the stream.
     ioseek(sizeof(RLAHeader));
     write(m_sot.data(), m_sot.size());
 
     init();  // re-initialize
-    return ok;
+    return true;
 }
 
 
@@ -545,14 +527,4 @@ RLAOutput::write_scanline(int y, int z, TypeDesc format, const void* data,
 
 
-bool
-RLAOutput::write_tile(int x, int y, int z, TypeDesc format, const void* data,
-                      stride_t xstride, stride_t ystride, stride_t zstride)
-{
-    // Emulate tiles by buffering the whole image
-    return copy_tile_to_image_buffer(x, y, z, format, data, xstride, ystride,
-                                     zstride, &m_tilebuffer[0]);
-}
-
-
 OIIO_PLUGIN_NAMESPACE_END
diff --git a/src/targa.imageio/targaoutput.cpp b/src/targa.imageio/targaoutput.cpp
index b580da9907..ef75e2c0bb 100644
--- a/src/targa.imageio/targaoutput.cpp
+++ b/src/targa.imageio/targaoutput.cpp
@@ -37,9 +37,6 @@ class TGAOutput final : public ImageOutput {
     bool close() override;
     bool write_scanline(int y, int z, TypeDesc format, const void* data,
                         stride_t xstride) override;
-    bool write_tile(int x, int y, int z, TypeDesc format, const void* data,
-                    stride_t xstride, stride_t ystride,
-                    stride_t zstride) override;
     bool set_thumbnail(const ImageBuf& thumb) override;
 
 private:
@@ -50,7 +47,6 @@ class TGAOutput final : public ImageOutput {
     std::vector<unsigned char> m_scratch;
     int m_idlen;  ///< Length of the TGA ID block
     unsigned int m_dither;
-    std::vector<unsigned char> m_tilebuffer;
     ImageBuf m_thumb;
 
     // Initialize private members to pre-opened state
@@ -239,11 +235,6 @@ TGAOutput::open(const std::string& name, const ImageSpec& userspec,
         }
     }
 
-    // If user asked for tiles -- which this format doesn't support, emulate
-    // it by buffering the whole image.
-    if (m_spec.tile_width && m_spec.tile_height)
-        m_tilebuffer.resize(m_spec.image_bytes());
-
     return true;
 }
 
@@ -421,16 +412,7 @@ TGAOutput::close()
         return true;
     }
 
-    bool ok = true;
-    if (m_spec.tile_width) {
-        // Handle tile emulation -- output the buffered pixels
-        OIIO_ASSERT(m_tilebuffer.size());
-        ok &= write_scanlines(m_spec.y, m_spec.y + m_spec.height, 0,
-                              m_spec.format, &m_tilebuffer[0]);
-        m_tilebuffer.shrink_to_fit();
-    }
-
-    ok &= write_tga20_data_fields();
+    bool ok = write_tga20_data_fields();
 
     init();  // re-initialize
     return ok;
@@ -681,17 +663,6 @@ TGAOutput::write_scanline(int y, int z, TypeDesc format, const void* data,
 
 
-bool
-TGAOutput::write_tile(int x, int y, int z, TypeDesc format, const void* data,
-                      stride_t xstride, stride_t ystride, stride_t zstride)
-{
-    // Emulate tiles by buffering the whole image
-    return copy_tile_to_image_buffer(x, y, z, format, data, xstride, ystride,
-                                     zstride, &m_tilebuffer[0]);
-}
-
-
-
 bool
 TGAOutput::set_thumbnail(const ImageBuf& thumb)
 {
diff --git a/src/zfile.imageio/zfile.cpp b/src/zfile.imageio/zfile.cpp
index 65b3f4fc54..d2e326aa4a 100644
--- a/src/zfile.imageio/zfile.cpp
+++ b/src/zfile.imageio/zfile.cpp
@@ -92,16 +92,12 @@ class ZfileOutput final : public ImageOutput {
     bool close() override;
     bool write_scanline(int y, int z, TypeDesc format, const void* data,
                         stride_t xstride) override;
-    bool write_tile(int x, int y, int z, TypeDesc format, const void* data,
-                    stride_t xstride, stride_t ystride,
-                    stride_t zstride) override;
 
 private:
     std::string m_filename;  ///< Stash the filename
     FILE* m_file;            ///< Open image handle for not compressed
     gzFile m_gz;             ///< Handle for compressed files
     std::vector<unsigned char> m_scratch;
-    std::vector<unsigned char> m_tilebuffer;
 
     bool opened() const { return m_file || m_gz; }
 
@@ -112,7 +108,6 @@ class ZfileOutput final : public ImageOutput {
         m_gz   = 0;
         m_filename.clear();
         m_scratch.clear();
-        m_tilebuffer.clear();
     }
 };
 
@@ -306,11 +301,6 @@ ZfileOutput::open(const std::string& name, const ImageSpec& userspec,
         return false;
     }
 
-    // If user asked for tiles -- which this format doesn't support, emulate
-    // it by buffering the whole image.this form
-    if (m_spec.tile_width && m_spec.tile_height)
-        m_tilebuffer.resize(m_spec.image_bytes());
-
     return true;
 }
 
@@ -324,15 +314,6 @@ ZfileOutput::close()
         return true;
     }
 
-    bool ok = true;
-    if (m_spec.tile_width && m_tilebuffer.size()) {
-        // We've been emulating tiles; now dump as scanlines.
-        ok &= write_scanlines(m_spec.y, m_spec.y + m_spec.height, 0,
-                              m_spec.format, m_tilebuffer.data());
-        m_tilebuffer.clear();
-        m_tilebuffer.shrink_to_fit();
-    }
-
     if (m_gz) {
         gzclose(m_gz);
         m_gz = 0;
@@ -343,7 +324,7 @@ ZfileOutput::close()
     }
 
     init();  // re-initialize
-    return ok;
+    return true;
 }
 
 
@@ -382,19 +363,4 @@ ZfileOutput::write_scanline(int y, int /*z*/, TypeDesc format, const void* data,
 
 
-bool
-ZfileOutput::write_tile(int x, int y, int z, TypeDesc format, const void* data,
-                        stride_t xstride, stride_t ystride, stride_t zstride)
-{
-    if (!opened()) {
-        errorfmt("File not open");
-        return false;
-    }
-    // Emulate tiles by buffering the whole image
-    OIIO_ASSERT(m_tilebuffer.data());
-    return copy_tile_to_image_buffer(x, y, z, format, data, xstride, ystride,
-                                     zstride, m_tilebuffer.data());
-}
-
-
 OIIO_PLUGIN_NAMESPACE_END

From d60bb81b6850308c82996d066a8f289aceba7bf4 Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Fri, 6 Feb 2026 13:57:03 -0800
Subject: [PATCH 48/70] api: OIIO_CONTRACT_ASSERT and other hardening
 improvements (#5006)

Review: we have long had two assertion macros: OIIO_ASSERT which aborts
upon failure in Debug builds and prints but continues in Release builds,
and OIIO_DASSERT which aborts in Debug builds and is completely inactive
for Release builds.

Inspired by C++26 contracts, and increasingly available "hardening
modes" in major compilers (especially with the LLVM/clang project's
libc++), I'm introducing some new verification helpers.

New macro `OIIO_CONTRACT_ASSERT` more closely mimics C++26
contract_assert in many ways, and perhaps will simply wrap C++
contract_assert when C++26 is on our menu.

Important ways that OIIO_CONTRACT_ASSERT differs from OIIO_ASSERT and
OIIO_DASSERT:

* Keeping in line with C++ contracts, there are 4 possible responses to
a failed contract assertion: Ignore, Observe (print only), Enforce
(print and abort) and Quick-Enforce (just abort).

* Also define hardening levels: None, Fast, Extensive, and Debug,
mimicking the levels of libc++. The idea is that maybe there will be
some CONTRACT_ASSERT checks you only want to do for certain hardening
levels.

* By default, the contract failure response is Enforce, unless it's both
a release build and the hardening level is set to None (in which case
the response will be Ignore). But it's also overrideable optionally on a
per-translation-unit basis by setting OIIO_ASSERTION_RESPONSE_DEFAULT
before any OIIO headers are included (though obviously that only applies
to inline functions or templates, not to any already-compiled code in
the library).

* Macros for explicit hardening levels: OIIO_HARDENING_ASSERT_FAST(),
EXTENSIVE(), and DEBUG(), which call CONTRACT_ASSERT only when the
hardening level is what's required or stricter.

I also changed the bounds checking in operator[] of string_view, span,
and image_span to use the contract assertions. Note that this adds a
tiny bit of overhead, since the default is "enforce" for release builds
(previously, using OIIO_DASSERT, it did no checks for release builds).
But the benchmarks seem to indicate that the perf difference is barely
measurable.

I added some benchmarking that proves that the bounds check adds a
minute overhead to an element access for a trivial `span<float>`, maybe
even indiscernible. Here are benchmarks comparing raw pointer access,
std::array access, span access with the new checks, span access
carefully bypassing the tests.

Linux workstation, gcc-11, on my work computer:

    pointer operator[]:     647.8 ns (+/- 0.1ns)
    std::array operator[]:  647.8 ns (+/- 0.1ns)
    span operator[] :       657.6 ns (+/- 0.5ns)
    span unsafe indexing:   648.2 ns (+/- 0.2ns)
    span range      :       648.1 ns (+/- 0.1ns)

These are the most stable tests I have, with the least trial-to-trial
variation, and show about a 1.5% speed hit on the bounds-checked span
access itself, which I think will be truly un-measurable in the context
of being interleaved with any other operations that you do with the data
you pull from the span.

Mac Intel, Apple Clang 17, on my (old) personal laptop: (much more
variable timing, probably from MacOS scheduler quirks)

    pointer operator[]:     929.2 ns (+/- 6.7ns)
    std::array operator[]:  913.1 ns (+/- 20.6ns)
    span operator[] :       905.8 ns (+/- 13.3ns)
    span unsafe indexing:   913.9 ns (+/- 16.6ns)
    span range      :       916.4 ns (+/- 20.3ns)

You can see that here there is no obvious penalty, in fact it appears a
little faster, but all within the timing uncertainty of the multiple
trials, so statistically it's hard to discern any penalty.

And a couple more for good measure from our CI, but note that because
these are uncontrolled machines somewhere on the GitHub cloud, the
timings might not be as reliable:

Windows, MSVS 2022:

    pointer operator[]:    3716.3 ns (+/- 6.3ns)
    std::array operator[]: 3715.5 ns (+/- 3.4ns)
    span operator[] :      3715.6 ns (+/- 2.6ns)
    span unsafe indexing:  3712.1 ns (+/- 0.7ns)
    span range      :      3714.2 ns (+/- 2.9ns)

Linux, gcc-14, C++20:

    pointer operator[]:    1130.9 ns (+/- 0.2ns),  884.2 k/s
    std::array operator[]: 1132.0 ns (+/- 0.4ns),  883.4 k/s
    span operator[] :      1133.7 ns (+/- 0.4ns),  882.1 k/s
    span unsafe indexing:  1134.2 ns (+/- 1.6ns),  881.7 k/s
    span range      :      1133.9 ns (+/- 0.7ns),  881.9 k/s

MacOS ARM:

    pointer operator[]:    3456.6 ns (+/- 7.5ns)
    std::array operator[]: 3466.8 ns (+/- 12.2ns)
    span operator[] :      3610.9 ns (+/- 11.0ns)
    span unsafe indexing:  3607.4 ns (+/- 4.9ns)
    span range      :      3612.4 ns (+/- 12.2ns)

Windows with MSVS and Linux with newer g++ don't appear to show any
penalty, and the bracketing of trial times indicates that maybe it's
consistent enough to be meaningful? I can't think of anything I'm doing
wrong here that would throw off the timing or disable the range checking
on these tests.

For MacOS ARM, the span looks like it has about a 4% penalty versus raw
pointers? But OTOH, span bounds-checked vs non-checked vs range-for are
all the same, so maybe the speed vs raw pointer is something else
entirely?

Also please note that a preferred way to avoid these extra bounds checks
entirely is to change an index-oriented loop like

    span s;
    for (size_t i = 0; i < s.size(); ++i)
        foo(s[i]);   // maybe bounds check on each iteration?

to a range based loop:

    span s;
    for (auto& v : s)
        foo(v);

which should be inherently safe and require no in-loop checks at all.

---------

Signed-off-by: Larry Gritz <lg@larrygritz.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 src/build-scripts/ci-benchmark.bash   |   7 +-
 src/cmake/compiler.cmake              |   6 +-
 src/include/OpenImageIO/dassert.h     | 152 ++++++++++++++++++++++++++
 src/include/OpenImageIO/hash.h        |   7 +-
 src/include/OpenImageIO/image_span.h  |  26 +++--
 src/include/OpenImageIO/span.h        |  14 ++-
 src/include/OpenImageIO/string_view.h |  22 ++--
 src/libutil/errorhandler.cpp          |  11 ++
 src/libutil/span_test.cpp             |  91 ++++++++++++++-
 src/libutil/strutil.cpp               |   2 +-
 10 files changed, 308 insertions(+), 30 deletions(-)

diff --git a/src/build-scripts/ci-benchmark.bash b/src/build-scripts/ci-benchmark.bash
index 7d5a5f0a33..d3b3551b1b 100755
--- a/src/build-scripts/ci-benchmark.bash
+++ b/src/build-scripts/ci-benchmark.bash
@@ -13,13 +13,14 @@ ls build
 ls $BUILD_BIN_DIR
 
 mkdir -p build/benchmarks
-for t in image_span_test ; do
+for t in image_span_test span_test ; do
     echo
     echo
     echo "$t"
     echo "========================================================"
-    ${BUILD_BIN_DIR}/$t > build/benchmarks/$t.out
-    cat build/benchmarks/$t.out
+    OpenImageIO_CI=0 ${BUILD_BIN_DIR}/$t | tee build/benchmarks/$t.out
+    # Note: set OpenImageIO_CI=0 to avoid CI-specific automatic reduction of
+    # the number of trials and iterations.
     echo "========================================================"
     echo "========================================================"
     echo
diff --git a/src/cmake/compiler.cmake b/src/cmake/compiler.cmake
index 8dffb97d36..6936ac51b9 100644
--- a/src/cmake/compiler.cmake
+++ b/src/cmake/compiler.cmake
@@ -499,12 +499,12 @@ endif ()
 # https://cheatsheetseries.owasp.org/cheatsheets/C-Based_Toolchain_Hardening_Cheat_Sheet.html
 
 if (${CMAKE_BUILD_TYPE} STREQUAL "Debug")
-    set (${PROJ_NAME}_HARDENING_DEFAULT 2)
+    set (${PROJ_NAME}_HARDENING_DEFAULT 2)  # Extensive
 else ()
-    set (${PROJ_NAME}_HARDENING_DEFAULT 1)
+    set (${PROJ_NAME}_HARDENING_DEFAULT 1)  # Fast
 endif ()
 set_cache (${PROJ_NAME}_HARDENING ${${PROJ_NAME}_HARDENING_DEFAULT}
-           "Turn on security hardening features 0, 1, 2, 3")
+           "Turn on security hardening features 0=none, 1=fast, 2=extensive, 3=debug")
 # Implementation:
 add_compile_definitions (${PROJ_NAME}_HARDENING_DEFAULT=${${PROJ_NAME}_HARDENING})
 if (${PROJ_NAME}_HARDENING GREATER_EQUAL 1)
diff --git a/src/include/OpenImageIO/dassert.h b/src/include/OpenImageIO/dassert.h
index db7cf65976..db3e8992b9 100644
--- a/src/include/OpenImageIO/dassert.h
+++ b/src/include/OpenImageIO/dassert.h
@@ -9,9 +9,161 @@
 #include <cstdio>
 #include <cstdlib>
 
+#include <OpenImageIO/oiioversion.h>
 #include <OpenImageIO/platform.h>
 
 
+
+// General resources about security and hardening for C++:
+//
+// https://best.openssf.org/Compiler-Hardening-Guides/Compiler-Options-Hardening-Guide-for-C-and-C++.html
+// https://www.gnu.org/software/libc/manual/html_node/Source-Fortification.html
+// https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_macros.html
+// https://libcxx.llvm.org/Hardening.html
+// https://cheatsheetseries.owasp.org/cheatsheets/C-Based_Toolchain_Hardening_Cheat_Sheet.html
+// https://stackoverflow.com/questions/13544512/what-is-the-most-hardened-set-of-options-for-gcc-compiling-c-c
+// https://medium.com/@simontoth/daily-bit-e-of-c-hardened-mode-of-standard-library-implementations-18be2422c372
+// https://en.cppreference.com/w/cpp/contract
+// https://en.cppreference.com/w/cpp/language/contracts
+
+
+
+// Define hardening levels for OIIO: which checks should we do?
+// NONE - YOLO mode, no extra checks (not recommended)
+// FAST - Minimal checks that have low performance impact
+// EXTENSIVE - More thorough checks, may impact performance
+// DEBUG - Maximum checks, for debugging purposes
+#define OIIO_HARDENING_NONE 0
+#define OIIO_HARDENING_FAST 1
+#define OIIO_HARDENING_EXTENSIVE 2
+#define OIIO_HARDENING_DEBUG 3
+
+// OIIO_HARDENING_DEFAULT defines the default hardening level we actually use.
+// By default, we use FAST for release builds and DEBUG for debug builds. But
+// it can be overridden:
+// - For OIIO internals, at OIIO build time with the `OIIO_HARDENING` CMake
+//   variable.
+// - For other projects using OIIO's headers, any translation unit may
+//   override this by defining OIIO_HARDENING_DEFAULT before including any
+//   OIIO headers. But note that this only affects calls to inline functions
+//   or templates defined in the headers. Non-inline functions compiled into
+//   the OIIO library itself will have been compiled with whatever hardening
+//   level was selected when the library was built.
+#ifndef OIIO_HARDENING_DEFAULT
+#    ifdef NDEBUG
+#        define OIIO_HARDENING_DEFAULT OIIO_HARDENING_FAST
+#    else
+#        define OIIO_HARDENING_DEFAULT OIIO_HARDENING_DEBUG
+#    endif
+#endif
+
+
+// Choices for what to do when a contract assertion fails.
+// This mimics the C++26 standard's std::contract behavior.
+#define OIIO_ASSERTION_RESPONSE_IGNORE 0
+#define OIIO_ASSERTION_RESPONSE_OBSERVE 1
+#define OIIO_ASSERTION_RESPONSE_ENFORCE 2
+#define OIIO_ASSERTION_RESPONSE_QUICK_ENFORCE 3
+
+// OIIO_ASSERTION_RESPONSE_DEFAULT defines the default response to failed
+// contract assertions. By default, we enforce them, UNLESS we are a release
+// mode build that has set the hardening mode to NONE.  But any translation
+// unit (including clients of OIIO) may override this by defining
+// OIIO_ASSERTION_RESPONSE_DEFAULT before including any OIIO headers. But note
+// that this only affects calls to inline functions or templates defined in
+// the headers. Non-inline functions compiled into the OIIO library itself
+// will have been compiled with whatever response was selected when the
+// library was built.
+#ifndef OIIO_ASSERTION_RESPONSE_DEFAULT
+#    if OIIO_HARDENING_DEFAULT == OIIO_HARDENING_NONE && defined(NDEBUG)
+#        define OIIO_ASSERTION_RESPONSE_DEFAULT OIIO_ASSERTION_RESPONSE_IGNORE
+#    else
+#        define OIIO_ASSERTION_RESPONSE_DEFAULT OIIO_ASSERTION_RESPONSE_ENFORCE
+#    endif
+#endif
+
+
+
+// `OIIO_CONTRACT_ASSERT(condition)` checks if the condition is met, and if
+// not, calls the contract violation handler with the appropriate response.
+// `OIIO_CONTRACT_ASSERT_MSG(condition, msg)` is the same, but allows a
+// different message to be passed to the handler.
+#if OIIO_ASSERTION_RESPONSE_DEFAULT == OIIO_ASSERTION_RESPONSE_IGNORE
+#    define OIIO_CONTRACT_ASSERT_MSG(condition, message) (void)0
+#elif OIIO_ASSERTION_RESPONSE_DEFAULT == OIIO_ASSERTION_RESPONSE_QUICK_ENFORCE
+#    define OIIO_CONTRACT_ASSERT_MSG(condition, message) \
+        (OIIO_LIKELY(condition) ? ((void)0) : (std::abort(), (void)0))
+#elif OIIO_ASSERTION_RESPONSE_DEFAULT == OIIO_ASSERTION_RESPONSE_OBSERVE
+#    define OIIO_CONTRACT_ASSERT_MSG(condition, message)                      \
+        (OIIO_LIKELY(condition) ? ((void)0)                                   \
+                                : (OIIO::contract_violation_handler(          \
+                                       __FILE__ ":" OIIO_STRINGIZE(__LINE__), \
+                                       OIIO_PRETTY_FUNCTION, message),        \
+                                   (void)0))
+#elif OIIO_ASSERTION_RESPONSE_DEFAULT == OIIO_ASSERTION_RESPONSE_ENFORCE
+#    define OIIO_CONTRACT_ASSERT_MSG(condition, message)                      \
+        (OIIO_LIKELY(condition) ? ((void)0)                                   \
+                                : (OIIO::contract_violation_handler(          \
+                                       __FILE__ ":" OIIO_STRINGIZE(__LINE__), \
+                                       OIIO_PRETTY_FUNCTION, message),        \
+                                   std::abort(), (void)0))
+#else
+#    error "Unknown OIIO_ASSERTION_RESPONSE_DEFAULT"
+#endif
+
+#define OIIO_CONTRACT_ASSERT(condition) \
+    OIIO_CONTRACT_ASSERT_MSG(condition, #condition)
+
+// Macros to use to select whether or not to do a contract check, based on the
+// hardening level:
+// - OIIO_HARDENING_ASSERT_FAST : only checks contract for >= FAST hardening.
+// - OIIO_HARDENING_ASSERT_EXTENSIVE : only checks contract for >= EXTENSIVE.
+// - OIIO_HARDENING_ASSERT_DEBUG : only checks contract for DEBUG hardening.
+#if OIIO_HARDENING_DEFAULT >= OIIO_HARDENING_FAST
+#    define OIIO_HARDENING_ASSERT_FAST_MSG(condition, message) \
+        OIIO_CONTRACT_ASSERT_MSG(condition, message)
+#else
+#    define OIIO_HARDENING_ASSERT_FAST_MSG(...) (void)0
+#endif
+
+#if OIIO_HARDENING_DEFAULT >= OIIO_HARDENING_EXTENSIVE
+#    define OIIO_HARDENING_ASSERT_EXTENSIVE_MSG(condition, message) \
+        OIIO_CONTRACT_ASSERT_MSG(condition, message)
+#else
+#    define OIIO_HARDENING_ASSERT_EXTENSIVE_MSG(...) (void)0
+#endif
+
+#if OIIO_HARDENING_DEFAULT >= OIIO_HARDENING_DEBUG
+#    define OIIO_HARDENING_ASSERT_DEBUG_MSG(condition, message) \
+        OIIO_CONTRACT_ASSERT_MSG(condition, message)
+#else
+#    define OIIO_HARDENING_ASSERT_DEBUG_MSG(...) (void)0
+#endif
+
+#define OIIO_HARDENING_ASSERT_NONE(condition) \
+    OIIO_CONTRACT_ASSERT_MSG(condition, #condition)
+#define OIIO_HARDENING_ASSERT_FAST(condition) \
+    OIIO_HARDENING_ASSERT_FAST_MSG(condition, #condition)
+#define OIIO_HARDENING_ASSERT_EXTENSIVE(condition) \
+    OIIO_HARDENING_ASSERT_EXTENSIVE_MSG(condition, #condition)
+#define OIIO_HARDENING_ASSERT_DEBUG(condition) \
+    OIIO_HARDENING_ASSERT_DEBUG_MSG(condition, #condition)
+
+
+OIIO_NAMESPACE_3_1_BEGIN
+// Internal contract assertion handler
+OIIO_UTIL_API void
+contract_violation_handler(const char* location, const char* function,
+                           const char* msg = "");
+OIIO_NAMESPACE_3_1_END
+
+OIIO_NAMESPACE_BEGIN
+#ifndef OIIO_DOXYGEN
+using v3_1::contract_violation_handler;
+#endif
+OIIO_NAMESPACE_END
+
+
 /// OIIO_ABORT_IF_DEBUG is a call to abort() for debug builds, but does
 /// nothing for release builds.
 #ifndef NDEBUG
diff --git a/src/include/OpenImageIO/hash.h b/src/include/OpenImageIO/hash.h
index 8424869a97..9005040b86 100644
--- a/src/include/OpenImageIO/hash.h
+++ b/src/include/OpenImageIO/hash.h
@@ -247,8 +247,11 @@ strhash (string_view s)
     size_t len = s.length();
     if (! len) return 0;
     unsigned int h = 0;
-    for (size_t i = 0;  i < len;  ++i) {
-        h += (unsigned char)(s[i]);
+    for (auto c : s) {
+        // Note: by using range for here, instead of looping over indices and
+        // calling operator[] to get each char, we avoid the bounds checking
+        // that operator[] does.
+        h += (unsigned char)(c);
         h += h << 10;
         h ^= h >> 6;
     }
diff --git a/src/include/OpenImageIO/image_span.h b/src/include/OpenImageIO/image_span.h
index 2684c15504..0ee7d2dcaf 100644
--- a/src/include/OpenImageIO/image_span.h
+++ b/src/include/OpenImageIO/image_span.h
@@ -271,18 +271,30 @@ template<typename T, size_t Rank = 4> class image_span {
     /// Return a pointer to the value at channel c, pixel (x,y,z).
     inline T* getptr(int c, int x, int y, int z = 0) const
     {
-        // Bounds check in debug mode
-        OIIO_DASSERT(unsigned(c) < unsigned(nchannels())
-                     && unsigned(x) < unsigned(width())
-                     && unsigned(y) < unsigned(height())
-                     && unsigned(z) < unsigned(depth()));
         if constexpr (Rank == 2) {
             OIIO_DASSERT(y == 0 && z == 0);
+            OIIO_CONTRACT_ASSERT(unsigned(c) < unsigned(nchannels())
+                                 && unsigned(x) < unsigned(width()));
+            return (T*)((char*)data() + c * chanstride() + x * xstride());
         } else if constexpr (Rank == 3) {
             OIIO_DASSERT(z == 0);
+            OIIO_CONTRACT_ASSERT(unsigned(c) < unsigned(nchannels())
+                                 && unsigned(x) < unsigned(width())
+                                 && unsigned(y) < unsigned(height()));
+            return (T*)((char*)data() + c * chanstride() + x * xstride()
+                        + y * ystride());
+        } else {
+            // Rank == 4
+            OIIO_CONTRACT_ASSERT(unsigned(c) < unsigned(nchannels())
+                                 && unsigned(x) < unsigned(width())
+                                 && unsigned(y) < unsigned(height())
+                                 && unsigned(z) < unsigned(depth()));
+            return (T*)((char*)data() + c * chanstride() + x * xstride()
+                        + y * ystride() + z * zstride());
         }
-        return (T*)((char*)data() + c * chanstride() + x * xstride()
-                    + y * ystride() + z * zstride());
+#ifdef __INTEL_COMPILER
+        return nullptr;  // should never get here, but icc is confused
+#endif
     }
 
     /// Return a pointer to the value at channel 0, pixel (x,y,z).
diff --git a/src/include/OpenImageIO/span.h b/src/include/OpenImageIO/span.h
index f1c49dafdc..b184809a75 100644
--- a/src/include/OpenImageIO/span.h
+++ b/src/include/OpenImageIO/span.h
@@ -207,28 +207,28 @@ class span {
     /// optimized builds, there is no bounds check.  Note: this is different
     /// from C++ std::span, which never bounds checks `operator[]`.
     constexpr reference operator[] (size_type idx) const {
-        OIIO_DASSERT(idx < m_size && "OIIO::span::operator[] range check");
+        OIIO_CONTRACT_ASSERT(idx < m_size);
         return m_data[idx];
     }
     constexpr reference operator() (size_type idx) const {
-        OIIO_DASSERT(idx < m_size && "OIIO::span::operator() range check");
+        OIIO_CONTRACT_ASSERT(idx < m_size);
         return m_data[idx];
     }
     /// Bounds-checked access, throws an assertion if out of range.
     reference at (size_type idx) const {
         if (idx >= size())
-            throw (std::out_of_range ("OpenImageIO::span::at"));
+            throw (std::out_of_range ("OIIO::span::at"));
         return m_data[idx];
     }
 
     /// The first element of the span.
     constexpr reference front() const noexcept {
-        OIIO_DASSERT(m_size >= 1);
+        OIIO_CONTRACT_ASSERT(m_size >= 1);
         return m_data[0];
     }
     /// The last element of the span.
     constexpr reference back() const noexcept {
-        OIIO_DASSERT(m_size >= 1);
+        OIIO_CONTRACT_ASSERT(m_size >= 1);
         return m_data[size() - 1];
     }
 
@@ -374,14 +374,16 @@ class span_strided {
     constexpr stride_type stride() const noexcept { return m_stride; }
 
     constexpr reference operator[] (size_type idx) const {
+        OIIO_CONTRACT_ASSERT(idx < m_size);
         return m_data[m_stride*idx];
     }
     constexpr reference operator() (size_type idx) const {
+        OIIO_CONTRACT_ASSERT(idx < m_size);
         return m_data[m_stride*idx];
     }
     reference at (size_type idx) const {
         if (idx >= size())
-            throw (std::out_of_range ("OpenImageIO::span_strided::at"));
+            throw (std::out_of_range ("OIIO::span_strided::at"));
         return m_data[m_stride*idx];
     }
     constexpr reference front() const noexcept { return m_data[0]; }
diff --git a/src/include/OpenImageIO/string_view.h b/src/include/OpenImageIO/string_view.h
index b959bb9d5d..d07854ceb6 100644
--- a/src/include/OpenImageIO/string_view.h
+++ b/src/include/OpenImageIO/string_view.h
@@ -14,6 +14,7 @@
 #include <stdexcept>
 #include <string>
 
+#include <OpenImageIO/dassert.h>
 #include <OpenImageIO/export.h>
 #include <OpenImageIO/oiioversion.h>
 #include <OpenImageIO/platform.h>
@@ -205,11 +206,12 @@ class basic_string_view {
     /// Is the view empty, containing no characters?
     constexpr bool empty() const noexcept { return m_len == 0; }
 
-    /// Element access of an individual character. For debug build, does
-    /// bounds check with assertion. For optimized builds, there is no bounds
-    /// check.  Note: this is different from C++ std::span, which never bounds
-    /// checks `operator[]`.
-    constexpr const_reference operator[](size_type pos) const { return m_chars[pos]; }
+    /// Element access of an individual character. For debug or hardened
+    /// builds, does bounds check with assertion.
+    constexpr const_reference operator[](size_type pos) const {
+        OIIO_CONTRACT_ASSERT(pos < m_len);
+        return m_chars[pos];
+    }
     /// Element access with bounds checking and exception if out of bounds.
     constexpr const_reference at(size_t pos) const
     {
@@ -218,9 +220,15 @@ class basic_string_view {
         return m_chars[pos];
     }
     /// The first character of the view.
-    constexpr const_reference front() const { return m_chars[0]; }
+    constexpr const_reference front() const {
+        OIIO_CONTRACT_ASSERT(m_len >= 1);
+        return m_chars[0];
+    }
     /// The last character of the view.
-    constexpr const_reference back() const { return m_chars[m_len - 1]; }
+    constexpr const_reference back() const {
+        OIIO_CONTRACT_ASSERT(m_len >= 1);
+        return m_chars[m_len - 1];
+    }
     /// Return the underlying data pointer to the first character.
     constexpr const_pointer data() const noexcept { return m_chars; }
 
diff --git a/src/libutil/errorhandler.cpp b/src/libutil/errorhandler.cpp
index d0d1a2c385..417ed2aca8 100644
--- a/src/libutil/errorhandler.cpp
+++ b/src/libutil/errorhandler.cpp
@@ -58,4 +58,15 @@ ErrorHandler::operator()(int errcode, const std::string& msg)
     fflush(stderr);
 }
 
+
+
+void
+contract_violation_handler(const char* location, const char* function,
+                           const char* msg)
+{
+    Strutil::print(stderr, "{} ({}): Contract assertion failed: {}\n", location,
+                   function, msg ? msg : "");
+    fflush(stderr);
+}
+
 OIIO_NAMESPACE_3_1_END
diff --git a/src/libutil/span_test.cpp b/src/libutil/span_test.cpp
index 041f59644b..2e030214e4 100644
--- a/src/libutil/span_test.cpp
+++ b/src/libutil/span_test.cpp
@@ -6,6 +6,8 @@
 #include <iostream>
 #include <vector>
 
+#include <OpenImageIO/argparse.h>
+#include <OpenImageIO/benchmark.h>
 #include <OpenImageIO/image_span.h>
 #include <OpenImageIO/span.h>
 #include <OpenImageIO/strided_ptr.h>
@@ -14,6 +16,34 @@
 using namespace OIIO;
 
 
+static int iterations = 100000;
+static int ntrials    = 5;
+
+// Intentionally not static so the compiler can't optimize away its value
+int Nlen_unknown = 0;
+
+
+
+static void
+getargs(int argc, char* argv[])
+{
+    ArgParse ap;
+    ap.intro(
+          "span_test -- unit test and benchmarks for OpenImageIO/span.h\n" OIIO_INTRO_STRING)
+        .usage("span_test [options]");
+
+    ap.arg("--iters %d", &iterations)
+        .help(Strutil::fmt::format("Number of iterations (default: {})",
+                                   iterations));
+    ap.arg("--trials %d", &ntrials).help("Number of trials");
+
+    // Hidden option so the compiler can't know the benchmark loop length
+    ap.arg("--unknown %d", &Nlen_unknown).hidden();
+
+    ap.parse_args(argc, (const char**)argv);
+}
+
+
 
 void
 test_span()
@@ -457,9 +487,67 @@ test_spanzero()
 
 
+void
+benchmark_span()
+{
+    Benchmarker bench;
+    bench.iterations(iterations).trials(ntrials);
+    const size_t N = 1000;
+    // bench.work(N);
+    std::array<float, N> fstdarr;
+    std::fill(fstdarr.begin(), fstdarr.end(), 1.0f);
+    size_t Nlen = Nlen_unknown ? size_t(Nlen_unknown) : N;
+    bench("pointer operator[]", [&]() {
+        float* fptr(fstdarr.data());
+        float t = 0.0f;
+        for (size_t i = 0; i < Nlen; ++i)
+            DoNotOptimize(t += fptr[i]);
+    });
+    bench("std::array operator[]", [&]() {
+        float t = 0.0f;
+        for (size_t i = 0; i < Nlen; ++i)
+            DoNotOptimize(t += fstdarr[i]);
+    });
+    bench("span operator[]", [&]() {
+        span<float> fspan(fstdarr);
+        float t = 0.0f;
+        for (size_t i = 0; i < Nlen; ++i)
+            DoNotOptimize(t += fspan[i]);
+    });
+    bench("span unsafe indexing", [&]() {
+        span<float> fspan(fstdarr);
+        float t = 0.0f;
+        for (size_t i = 0; i < Nlen; ++i)
+            DoNotOptimize(t += fspan.data()[i]);
+    });
+    bench("span range", [&]() {
+        span<float> fspan(fstdarr);
+        float t = 0.0f;
+        for (auto x : fspan)
+            DoNotOptimize(t += x);
+    });
+}
+
+
+
 int
-main(int /*argc*/, char* /*argv*/[])
+main(int argc, char* argv[])
 {
+    // For the sake of test time, reduce the default number of benchmarking
+    // trials and iterations for DEBUG, CI, and code coverage builds. Explicit
+    // use of --iters or --trials will override this, since it comes before
+    // the getargs() call.
+    if (Strutil::eval_as_bool(Sysutil::getenv("OpenImageIO_CI"))
+#if !defined(NDEBUG) || defined(OIIO_CODE_COVERAGE)
+        || true
+#endif
+    ) {
+        iterations /= 10;
+        ntrials = 1;
+    }
+
+    getargs(argc, argv);
+
     test_span();
     test_span_mutable();
     test_span_initlist();
@@ -475,6 +563,7 @@ main(int /*argc*/, char* /*argv*/[])
     test_spancpy();
     test_spanset();
     test_spanzero();
+    benchmark_span();
 
     return unit_test_failures;
 }
diff --git a/src/libutil/strutil.cpp b/src/libutil/strutil.cpp
index 17060728d7..dc053df65e 100644
--- a/src/libutil/strutil.cpp
+++ b/src/libutil/strutil.cpp
@@ -152,7 +152,7 @@ c_str(string_view str)
     // in C++17 string_view. So maybe we'll find ourselves relying on it a
     // lot less, and therefore the performance hit of doing it foolproof
     // won't be as onerous.
-    if (str[str.size()] == 0)  // 0-terminated
+    if (str.data()[str.size()] == 0)  // 0-terminated
         return str.data();
 
     // Rare case: may not be 0-terminated. Bite the bullet and construct a

From d287b87a507cd421bdc39768f0edc63e3618e3d5 Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Fri, 6 Feb 2026 22:10:51 -0800
Subject: [PATCH 49/70] ci: Don't install OpenCV on Mac Intel job variant
 (#5032)

Mac Intel is getting long in the tooth, and quite often the Homebrew
packages for Intel are found to be uncached and will try to build from
source. When it's OpenCV, that's disastrous for our CI build times, it
can get stalled for hours building all of OpenCV and its dependencies.
So disable it for that one build variant.

Signed-off-by: Larry Gritz <lg@larrygritz.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 .github/workflows/ci.yml                     | 4 +++-
 src/build-scripts/install_homebrew_deps.bash | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index a227cdbd7b..83cf2b0f3e 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -628,7 +628,9 @@ jobs:
             python_ver: "3.13"
             simd: sse4.2,avx2
             ctest_test_timeout: 1200
-            setenvs: export MACOSX_DEPLOYMENT_TARGET=12.0 INSTALL_QT=0
+            setenvs: export MACOSX_DEPLOYMENT_TARGET=12.0
+                            INSTALL_QT=0 INSTALL_OPENCV=0
+            optional_deps_append: 'OpenCV;Qt5;Qt6'
             benchmark: 1
           - desc: MacOS-14-ARM aclang15/C++20/py3.13
             runner: macos-14
diff --git a/src/build-scripts/install_homebrew_deps.bash b/src/build-scripts/install_homebrew_deps.bash
index 62a433e614..e8cfe2bc59 100755
--- a/src/build-scripts/install_homebrew_deps.bash
+++ b/src/build-scripts/install_homebrew_deps.bash
@@ -47,7 +47,7 @@ if [[ "$OIIO_BREW_INSTALL_PACKAGES" == "" ]] ; then
         robin-map \
         tbb \
         "
-    if [[ "${USE_OPENCV}" != "0" ]] && [[ "${INSTALL_OPENCV:=1}" != "0" ]] ; then
+    if [[ "${USE_OPENCV:=}" != "0" ]] && [[ "${INSTALL_OPENCV:=1}" != "0" ]] ; then
         OIIO_BREW_INSTALL_PACKAGES+=" opencv"
     fi
     if [[ "${USE_QT:=1}" != "0" ]] && [[ "${INSTALL_QT:=1}" != "0" ]] ; then

From adb8b37b96518690a927b4869d232c6ebaa38cef Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Sat, 7 Feb 2026 03:04:06 -0800
Subject: [PATCH 50/70] fix(bmp): detect corrupt files where palette doesn't
 match bpp (#5030)

Extra protections for corrupted BMP files that claim to be palette
images, but have a BPP that doesn't support palette images. Also an
extra guard around accessing the palette array if it is empty.

Add an extra test case for this kind of corruption.

Signed-off-by: Larry Gritz <lg@larrygritz.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 src/bmp.imageio/bmpinput.cpp               |  12 ++++++++++--
 testsuite/bmp/ref/out.txt                  |   3 +++
 testsuite/bmp/run.py                       |   1 +
 testsuite/bmp/src/palette32bit-corrupt.bmp | Bin 0 -> 67 bytes
 4 files changed, 14 insertions(+), 2 deletions(-)
 create mode 100644 testsuite/bmp/src/palette32bit-corrupt.bmp

diff --git a/src/bmp.imageio/bmpinput.cpp b/src/bmp.imageio/bmpinput.cpp
index a39d0b2061..ba43d4fc5f 100644
--- a/src/bmp.imageio/bmpinput.cpp
+++ b/src/bmp.imageio/bmpinput.cpp
@@ -259,6 +259,13 @@ BmpInput::open(const std::string& name, ImageSpec& newspec,
     case WINDOWS_V5: m_spec.attribute("bmp:version", 5); break;
     }
 
+    if (m_dib_header.cpalete && !m_colortable.size()) {
+        errorfmt(
+            "BMP error: bad BPP ({}) for palette image -- presumed corrupt file",
+            m_dib_header.bpp);
+        return false;
+    }
+
     // Default presumption is that a BMP file is meant to look reasonable on a
     // display, so assume it's sRGB. This is not really correct -- see the
     // comments below.
@@ -391,8 +398,9 @@ BmpInput::read_native_scanline(int subimage, int miplevel, int y, int /*z*/,
 
     size_t scanline_bytes = m_spec.scanline_bytes();
     uint8_t* mscanline    = (uint8_t*)data;
-    if (m_dib_header.compression == RLE4_COMPRESSION
-        || m_dib_header.compression == RLE8_COMPRESSION) {
+    if ((m_dib_header.compression == RLE4_COMPRESSION
+         || m_dib_header.compression == RLE8_COMPRESSION)
+        && m_colortable.size()) {
         for (int x = 0; x < m_spec.width; ++x) {
             int p = m_uncompressed[(m_spec.height - 1 - y) * m_spec.width + x];
             auto& c              = colortable(p);
diff --git a/testsuite/bmp/ref/out.txt b/testsuite/bmp/ref/out.txt
index 13fc651063..2f7323ea0c 100644
--- a/testsuite/bmp/ref/out.txt
+++ b/testsuite/bmp/ref/out.txt
@@ -298,3 +298,6 @@ oiiotool ERROR: read : "src/bad-y.bmp": BMP might be corrupted, it is referencin
 BMP error reading rle-compressed image
 Full command line was:
 > oiiotool --info -v -a --hash src/bad-y.bmp
+oiiotool ERROR: read : "src/palette32bit-corrupt.bmp": BMP error: bad BPP (32) for palette image -- presumed corrupt file
+Full command line was:
+> oiiotool --info -v -a --hash src/palette32bit-corrupt.bmp
diff --git a/testsuite/bmp/run.py b/testsuite/bmp/run.py
index 0e53937bdf..5f49b64682 100755
--- a/testsuite/bmp/run.py
+++ b/testsuite/bmp/run.py
@@ -33,3 +33,4 @@
 # See if we handle these corrupt files with useful error messages
 command += info_command ("src/decodecolormap-corrupt.bmp")
 command += info_command ("src/bad-y.bmp")
+command += info_command ("src/palette32bit-corrupt.bmp")
diff --git a/testsuite/bmp/src/palette32bit-corrupt.bmp b/testsuite/bmp/src/palette32bit-corrupt.bmp
new file mode 100644
index 0000000000000000000000000000000000000000..1f140906b4df76395903521430ce90e38010e5f8
GIT binary patch
literal 67
zcmZ?rm0@Q912Yx|1`Qyq9*7-)n2|vNh#453F#$<_AcKJ+9mEHMYu7;1Obq`S7}$3H
F2LQ9{2f_dV

literal 0
HcmV?d00001


From 527aee2e6365addad162a95bb8c5d6823ca6c97e Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Tue, 10 Feb 2026 17:14:57 -0800
Subject: [PATCH 51/70] fix(tiff): Fix TIFF output crash for multi-count Exif
 metadata (#5035)

Fixes #5023

This was crashing when writing TIFF information that was supposed to be
arrays of more than one rational, but in fact was provided as a single
value: it was reading past the end of a memory array.

I noticed that this whole region needs a cleanup, this is not the only
problem. But a full overhaul seems too risky to backport, so my strategy
is as follows:

* THIS fix first, which I will backport right away to 3.0 and 3.1.

* I will then submit a separate PR (already implemented and tested) that
is a much more complete fix and overhaul of this portion of the code
(and other places). That will get merged into main when approved.

* After the second PR is merged, I'll hold it in main for a while to
test its safety, and then decide if it seems ok to backport to 3.1 (but
definitely not 3.0).

Signed-off-by: Larry Gritz <lg@larrygritz.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 src/tiff.imageio/tiffoutput.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/tiff.imageio/tiffoutput.cpp b/src/tiff.imageio/tiffoutput.cpp
index efac72345b..77657329de 100644
--- a/src/tiff.imageio/tiffoutput.cpp
+++ b/src/tiff.imageio/tiffoutput.cpp
@@ -1141,16 +1141,16 @@ TIFFOutput::write_exif_data()
             if (tifftype == TIFF_ASCII) {
                 ok = TIFFSetField(m_tif, tag, *(char**)p.data());
             } else if ((tifftype == TIFF_SHORT || tifftype == TIFF_LONG)
-                       && p.type() == TypeDesc::SHORT) {
+                       && p.type() == TypeDesc::SHORT && count == 1) {
                 ok = TIFFSetField(m_tif, tag, (int)*(short*)p.data());
             } else if ((tifftype == TIFF_SHORT || tifftype == TIFF_LONG)
-                       && p.type() == TypeDesc::INT) {
+                       && p.type() == TypeDesc::INT && count == 1) {
                 ok = TIFFSetField(m_tif, tag, *(int*)p.data());
             } else if ((tifftype == TIFF_RATIONAL || tifftype == TIFF_SRATIONAL)
-                       && p.type() == TypeDesc::FLOAT) {
+                       && p.type() == TypeDesc::FLOAT && count == 1) {
                 ok = TIFFSetField(m_tif, tag, *(float*)p.data());
             } else if ((tifftype == TIFF_RATIONAL || tifftype == TIFF_SRATIONAL)
-                       && p.type() == TypeDesc::DOUBLE) {
+                       && p.type() == TypeDesc::DOUBLE && count == 1) {
                 ok = TIFFSetField(m_tif, tag, *(double*)p.data());
             }
             if (!ok) {

From 289d454cf29dd01ac8ab62fa3ab8eab7fa9416b7 Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Tue, 10 Feb 2026 21:06:41 -0800
Subject: [PATCH 52/70] build: Raise fmt auto-build version to 12.1, handle
 Windows flags (#5039)

Bump the version of 'fmt' library that we download and build (if not
found) from 10.2 to 12.1.

Some other touch-ups in build_fmt.cmake.

Also, we have seen that recent fmt versions will fail to compile on MSVC
unless using the `/utf-8` compiler flag, so ensure that is used and also
passed on to other clients of libOpenImageIO_Util (which expose
templates using those headers).

Signed-off-by: Larry Gritz <lg@larrygritz.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 src/cmake/build_fmt.cmake  | 6 ++----
 src/libutil/CMakeLists.txt | 6 ++++++
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/cmake/build_fmt.cmake b/src/cmake/build_fmt.cmake
index 95a497c70a..2b41502e55 100644
--- a/src/cmake/build_fmt.cmake
+++ b/src/cmake/build_fmt.cmake
@@ -6,7 +6,7 @@
 # fmt by hand!
 ######################################################################
 
-set_cache (fmt_BUILD_VERSION 10.2.1 "fmt version for local builds")
+set_cache (fmt_BUILD_VERSION 12.1.0 "fmt version for local builds")
 set (fmt_GIT_REPOSITORY "https://github.com/fmtlib/fmt")
 set (fmt_GIT_TAG "${fmt_BUILD_VERSION}")
 # Note: fmt doesn't put "v" in front of version for its git tags
@@ -22,8 +22,6 @@ build_dependency_with_cmake(fmt
         -D FMT_TEST=OFF
     )
 
-# Set some things up that we'll need for a subsequent find_package to work
-set (fmt_ROOT ${fmt_INSTALL_DIR})
-
 # Signal to caller that we need to find again at the installed location
 set (fmt_REFIND TRUE)
+set (fmt_VERSION ${fmt_BUILD_VERSION})
diff --git a/src/libutil/CMakeLists.txt b/src/libutil/CMakeLists.txt
index 526aa6f023..2a159e449b 100644
--- a/src/libutil/CMakeLists.txt
+++ b/src/libutil/CMakeLists.txt
@@ -51,6 +51,12 @@ function (setup_oiio_util_library targetname)
     target_link_options(${targetname} PRIVATE
                                ${${PROJECT_NAME}_link_options})
 
+    if (MSVC AND fmt_VERSION VERSION_GREATER_EQUAL 11.0)
+        # For MSVC, Unicode support requires compiling with /utf-8, and fmt
+        # needs this. This line adapted from fmt's CMakeLists.txt file.
+        target_compile_options(${targetname} PUBLIC $<$<AND:$<COMPILE_LANGUAGE:CXX>,$<CXX_COMPILER_ID:MSVC>>:/utf-8>)
+    endif ()
+
     target_include_directories (${targetname}
             PUBLIC
                 $<BUILD_INTERFACE:${OpenImageIO_LOCAL_DEPS_ROOT}/include>

From 7a17ba45a7742b54e975e90e4472863da7948c25 Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Wed, 11 Feb 2026 16:04:03 -0800
Subject: [PATCH 53/70] fix(tiff): Improve TIFF robustness for non-matching
 tag/metadata types (#5036)

This is a more comprehensive fix for issues discovered in PR #5035.

The original problem reported in Issue #5023 was a crash when writing
TIFF information that was supposed to be arrays of more than one
rational: it was reading past the end of a memory array.

#5035 is a minimal, immediate fix to address the crashes. But in the
process, I saw a number of ways in which we were dropping metadata on
the floor when the types didn't exactly match, but that we *could*
handle with automatic conversion.

The new cases that we handle with this PR are:

* Exif RESOLUTIONUNIT tag is a short, but by convention we store it by
the name as a string in OIIO metadata, so we need to convert back to a
code (we did so for the main TIFF metadata, but not for Exif in TIFF).
* Handle Exif "version" and "flashpixversion" metadata which have
unusual encoding in TIFF files (they are 4-character strings, but must
be stored in a TIFF tag of type BYTES, not as the usual type ASCII that
most strings use).
* Handle things that TIFF insists are ASCII but that come to us as
metadata that isn't a string. Easy -- our `ParamValue.get_string()`
automatically converts other things like ints or floats into string
representation.
* Much more flexibility in automatically converting among the signed and
unsigned, 16 and 32 bit, integer types when the metadata in our
ImageSpec is integer but not the specific type of integer that TIFF/Exif
thinks it should be.

This doesn't appear to change the results of anything in our testsuite,
but it's possible that some non-TIFF-to-TIFF image conversions that
contain Exif data may now do certain type conversions properly instead
of just silently dropping the metadata that had non-matching (but
reasonably valid) types.

Additionally, to do this nicely, I ended up adding a new TypeURational
alias in typedesc.h (similar to TypeRational, but the case where both
numerator and denominator are unsigned ints).

And also fixed a random comment typo I noticed in tiffinput.cpp.

---------

Signed-off-by: Larry Gritz <lg@larrygritz.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 src/include/OpenImageIO/typedesc.h |   2 +
 src/libutil/typedesc.cpp           |   8 +
 src/tiff.imageio/tiffinput.cpp     |   2 +-
 src/tiff.imageio/tiffoutput.cpp    | 225 ++++++++++++++++++-----------
 4 files changed, 152 insertions(+), 85 deletions(-)

diff --git a/src/include/OpenImageIO/typedesc.h b/src/include/OpenImageIO/typedesc.h
index 5c38ae6209..b546dcd0e0 100644
--- a/src/include/OpenImageIO/typedesc.h
+++ b/src/include/OpenImageIO/typedesc.h
@@ -385,6 +385,7 @@ inline constexpr TypeDesc TypeHalf (TypeDesc::HALF);
 inline constexpr TypeDesc TypeTimeCode (TypeDesc::UINT, TypeDesc::SCALAR, TypeDesc::TIMECODE, 2);
 inline constexpr TypeDesc TypeKeyCode (TypeDesc::INT, TypeDesc::SCALAR, TypeDesc::KEYCODE, 7);
 inline constexpr TypeDesc TypeRational(TypeDesc::INT, TypeDesc::VEC2, TypeDesc::RATIONAL);
+inline constexpr TypeDesc TypeURational(TypeDesc::UINT, TypeDesc::VEC2, TypeDesc::RATIONAL);
 inline constexpr TypeDesc TypePointer(TypeDesc::PTR);
 inline constexpr TypeDesc TypeUstringhash(TypeDesc::USTRINGHASH);
 
@@ -648,6 +649,7 @@ using v3_1::TypeHalf;
 using v3_1::TypeTimeCode;
 using v3_1::TypeKeyCode;
 using v3_1::TypeRational;
+using v3_1::TypeURational;
 using v3_1::TypePointer;
 using v3_1::TypeUstringhash;
 #endif
diff --git a/src/libutil/typedesc.cpp b/src/libutil/typedesc.cpp
index 1c58346a7e..99dc09fed7 100644
--- a/src/libutil/typedesc.cpp
+++ b/src/libutil/typedesc.cpp
@@ -325,6 +325,8 @@ TypeDesc::fromstring(string_view typestring)
         t = OIIO::TypeTimeCode;
     else if (type == "rational")
         t = OIIO::TypeRational;
+    else if (type == "urational")
+        t = OIIO::TypeURational;
     else if (type == "box2i")
         t = OIIO::TypeBox2i;
     else if (type == "box3i")
@@ -890,6 +892,12 @@ convert_type(TypeDesc srctype, const void* src, TypeDesc dsttype, void* dst,
         ((float*)dst)[0] = den ? float(num) / float(den) : 0.0f;
         return true;
     }
+    if (dsttype == TypeFloat && srctype == TypeURational) {
+        auto num         = ((const uint32_t*)src)[0];
+        auto den         = ((const uint32_t*)src)[1];
+        ((float*)dst)[0] = den ? float(num) / float(den) : 0.0f;
+        return true;
+    }
     if (dsttype == TypeFloat && srctype == TypeString) {
         // Only succeed for a string if it exactly holds something that
         // exactly parses to a float value.
diff --git a/src/tiff.imageio/tiffinput.cpp b/src/tiff.imageio/tiffinput.cpp
index ac360d2c32..06d1394424 100644
--- a/src/tiff.imageio/tiffinput.cpp
+++ b/src/tiff.imageio/tiffinput.cpp
@@ -671,7 +671,7 @@ static std::pair<int, const char*>  tiff_input_compressions[] = {
     { COMPRESSION_NEXT,          "next" },        // NeXT 2-bit RLE
     { COMPRESSION_CCITTRLEW,     "ccittrle2" },   // #1 w/ word alignment
     { COMPRESSION_PACKBITS,      "packbits" },    // Macintosh RLE
-    { COMPRESSION_THUNDERSCAN,   "thunderscan" }, // ThundeScan RLE
+    { COMPRESSION_THUNDERSCAN,   "thunderscan" }, // ThunderScan RLE
     { COMPRESSION_IT8CTPAD,      "IT8CTPAD" },    // IT8 CT w/ patting
     { COMPRESSION_IT8LW,         "IT8LW" },       // IT8 linework RLE
     { COMPRESSION_IT8MP,         "IT8MP" },       // IT8 monochrome picture
diff --git a/src/tiff.imageio/tiffoutput.cpp b/src/tiff.imageio/tiffoutput.cpp
index 77657329de..5852d7d810 100644
--- a/src/tiff.imageio/tiffoutput.cpp
+++ b/src/tiff.imageio/tiffoutput.cpp
@@ -149,8 +149,7 @@ class TIFFOutput final : public ImageOutput {
     void fix_bitdepth(void* data, int nvals);
 
     // Add a parameter to the output
-    bool put_parameter(const std::string& name, TypeDesc type,
-                       const void* data);
+    bool put_parameter(const ParamValue& metadata);
     bool write_exif_data();
 
     // Make our best guess about whether the spec is describing data that
@@ -921,10 +920,8 @@ TIFFOutput::open(const std::string& name, const ImageSpec& userspec,
     }
 
     // Deal with all other params
-    for (size_t p = 0; p < m_spec.extra_attribs.size(); ++p)
-        put_parameter(m_spec.extra_attribs[p].name().string(),
-                      m_spec.extra_attribs[p].type(),
-                      m_spec.extra_attribs[p].data());
+    for (const auto& p : m_spec.extra_attribs)
+        put_parameter(p);
 
     if (m_spec.get_int_attribute("tiff:write_iptc")) {
         // Enable IPTC block writing only if "tiff_write_iptc" hint is explicitly
@@ -956,120 +953,125 @@ TIFFOutput::open(const std::string& name, const ImageSpec& userspec,
 
 
+inline int
+resunit_to_code(string_view s)
+{
+    if (Strutil::iequals(s, "none"))
+        return RESUNIT_NONE;
+    else if (Strutil::iequals(s, "in") || Strutil::iequals(s, "inch"))
+        return RESUNIT_INCH;
+    else if (Strutil::iequals(s, "cm"))
+        return RESUNIT_CENTIMETER;
+    return 0;
+}
+
+
+
 bool
-TIFFOutput::put_parameter(const std::string& name, TypeDesc type,
-                          const void* data)
+TIFFOutput::put_parameter(const ParamValue& param)
 {
-    if (!data || (type == TypeString && *(char**)data == nullptr)) {
+    ustring name  = param.uname();
+    TypeDesc type = param.type();
+    if (!param.data()
+        || (type == TypeString && *(char**)param.data() == nullptr)) {
         // we got a null pointer, don't set the field
         return false;
     }
-    if (Strutil::iequals(name, "Artist") && type == TypeDesc::STRING) {
-        TIFFSetField(m_tif, TIFFTAG_ARTIST, *(char**)data);
+    if (Strutil::iequals(name, "Artist")) {
+        TIFFSetField(m_tif, TIFFTAG_ARTIST, param.get_string().c_str());
         return true;
     }
-    if (Strutil::iequals(name, "Copyright") && type == TypeDesc::STRING) {
-        TIFFSetField(m_tif, TIFFTAG_COPYRIGHT, *(char**)data);
+    if (Strutil::iequals(name, "Copyright")) {
+        TIFFSetField(m_tif, TIFFTAG_COPYRIGHT, param.get_string().c_str());
         return true;
     }
-    if (Strutil::iequals(name, "DateTime") && type == TypeDesc::STRING) {
-        TIFFSetField(m_tif, TIFFTAG_DATETIME, *(char**)data);
+    if (Strutil::iequals(name, "DateTime") && type == TypeString) {
+        TIFFSetField(m_tif, TIFFTAG_DATETIME, param.get_string().c_str());
         return true;
     }
-    if ((Strutil::iequals(name, "name")
-         || Strutil::iequals(name, "DocumentName"))
-        && type == TypeDesc::STRING) {
-        TIFFSetField(m_tif, TIFFTAG_DOCUMENTNAME, *(char**)data);
+    if (Strutil::iequals(name, "name")
+        || Strutil::iequals(name, "DocumentName")) {
+        TIFFSetField(m_tif, TIFFTAG_DOCUMENTNAME, param.get_string().c_str());
         return true;
     }
-    if (Strutil::iequals(name, "fovcot") && type == TypeDesc::FLOAT) {
-        double d = *(float*)data;
-        TIFFSetField(m_tif, TIFFTAG_PIXAR_FOVCOT, d);
+    if (Strutil::iequals(name, "fovcot") && type == TypeFloat) {
+        TIFFSetField(m_tif, TIFFTAG_PIXAR_FOVCOT, param.get_float());
         return true;
     }
-    if ((Strutil::iequals(name, "host")
-         || Strutil::iequals(name, "HostComputer"))
-        && type == TypeDesc::STRING) {
-        TIFFSetField(m_tif, TIFFTAG_HOSTCOMPUTER, *(char**)data);
+    if (Strutil::iequals(name, "host")
+        || Strutil::iequals(name, "HostComputer")) {
+        TIFFSetField(m_tif, TIFFTAG_HOSTCOMPUTER, param.get_string().c_str());
         return true;
     }
     if ((Strutil::iequals(name, "description")
          || Strutil::iequals(name, "ImageDescription"))
-        && type == TypeDesc::STRING) {
-        TIFFSetField(m_tif, TIFFTAG_IMAGEDESCRIPTION, *(char**)data);
+        && type == TypeString) {
+        TIFFSetField(m_tif, TIFFTAG_IMAGEDESCRIPTION,
+                     param.get_string().c_str());
         return true;
     }
-    if (Strutil::iequals(name, "tiff:Predictor") && type == TypeDesc::INT) {
-        m_predictor = *(int*)data;
+    if (Strutil::iequals(name, "tiff:Predictor")) {
+        m_predictor = param.get_int();
         TIFFSetField(m_tif, TIFFTAG_PREDICTOR, m_predictor);
         return true;
     }
-    if (Strutil::iequals(name, "ResolutionUnit") && type == TypeDesc::STRING) {
-        const char* s = *(char**)data;
-        bool ok       = true;
-        if (Strutil::iequals(s, "none"))
-            TIFFSetField(m_tif, TIFFTAG_RESOLUTIONUNIT, RESUNIT_NONE);
-        else if (Strutil::iequals(s, "in") || Strutil::iequals(s, "inch"))
-            TIFFSetField(m_tif, TIFFTAG_RESOLUTIONUNIT, RESUNIT_INCH);
-        else if (Strutil::iequals(s, "cm"))
-            TIFFSetField(m_tif, TIFFTAG_RESOLUTIONUNIT, RESUNIT_CENTIMETER);
-        else
-            ok = false;
-        return ok;
+    if (Strutil::iequals(name, "ResolutionUnit") && type == TypeString) {
+        if (int r = resunit_to_code(param.get_string())) {
+            TIFFSetField(m_tif, TIFFTAG_RESOLUTIONUNIT, r);
+            return true;
+        }
+        return false;
     }
     if (Strutil::iequals(name, "tiff:RowsPerStrip")
         && !m_spec.tile_width /* don't set rps for tiled files */
         && m_planarconfig == PLANARCONFIG_CONTIG /* only for contig */) {
-        if (type == TypeDesc::INT) {
-            m_rowsperstrip = *(int*)data;
-        } else if (type == TypeDesc::STRING) {
-            // Back-compatibility with Entropy and PRMan
-            m_rowsperstrip = Strutil::stoi(*(char**)data);
-        } else {
+        int rps = param.get_int();
+        if (rps <= 0)
             return false;
-        }
-        m_rowsperstrip = clamp(m_rowsperstrip, 1, m_spec.height);
+        m_rowsperstrip = clamp(rps, 1, m_spec.height);
         TIFFSetField(m_tif, TIFFTAG_ROWSPERSTRIP, m_rowsperstrip);
         return true;
     }
-    if (Strutil::iequals(name, "Make") && type == TypeDesc::STRING) {
-        TIFFSetField(m_tif, TIFFTAG_MAKE, *(char**)data);
+    if (Strutil::iequals(name, "Make")) {
+        TIFFSetField(m_tif, TIFFTAG_MAKE, param.get_string().c_str());
         return true;
     }
-    if (Strutil::iequals(name, "Model") && type == TypeDesc::STRING) {
-        TIFFSetField(m_tif, TIFFTAG_MODEL, *(char**)data);
+    if (Strutil::iequals(name, "Model")) {
+        TIFFSetField(m_tif, TIFFTAG_MODEL, param.get_string().c_str());
         return true;
     }
-    if (Strutil::iequals(name, "Software") && type == TypeDesc::STRING) {
-        TIFFSetField(m_tif, TIFFTAG_SOFTWARE, *(char**)data);
+    if (Strutil::iequals(name, "Software")) {
+        TIFFSetField(m_tif, TIFFTAG_SOFTWARE, param.get_string().c_str());
         return true;
     }
-    if (Strutil::iequals(name, "tiff:SubFileType") && type == TypeDesc::INT) {
-        TIFFSetField(m_tif, TIFFTAG_SUBFILETYPE, *(int*)data);
+    if (Strutil::iequals(name, "tiff:SubFileType")) {
+        TIFFSetField(m_tif, TIFFTAG_SUBFILETYPE, param.get_int());
         return true;
     }
-    if (Strutil::iequals(name, "textureformat") && type == TypeDesc::STRING) {
-        TIFFSetField(m_tif, TIFFTAG_PIXAR_TEXTUREFORMAT, *(char**)data);
+    if (Strutil::iequals(name, "textureformat")) {
+        TIFFSetField(m_tif, TIFFTAG_PIXAR_TEXTUREFORMAT,
+                     param.get_string().c_str());
         return true;
     }
-    if (Strutil::iequals(name, "wrapmodes") && type == TypeDesc::STRING) {
-        TIFFSetField(m_tif, TIFFTAG_PIXAR_WRAPMODES, *(char**)data);
+    if (Strutil::iequals(name, "wrapmodes")) {
+        TIFFSetField(m_tif, TIFFTAG_PIXAR_WRAPMODES,
+                     param.get_string().c_str());
         return true;
     }
     if (Strutil::iequals(name, "worldtocamera") && type == TypeMatrix) {
-        TIFFSetField(m_tif, TIFFTAG_PIXAR_MATRIX_WORLDTOCAMERA, data);
+        TIFFSetField(m_tif, TIFFTAG_PIXAR_MATRIX_WORLDTOCAMERA, param.data());
         return true;
     }
     if (Strutil::iequals(name, "worldtoscreen") && type == TypeMatrix) {
-        TIFFSetField(m_tif, TIFFTAG_PIXAR_MATRIX_WORLDTOSCREEN, data);
+        TIFFSetField(m_tif, TIFFTAG_PIXAR_MATRIX_WORLDTOSCREEN, param.data());
         return true;
     }
-    if (Strutil::iequals(name, "XResolution") && type == TypeDesc::FLOAT) {
-        TIFFSetField(m_tif, TIFFTAG_XRESOLUTION, *(float*)data);
+    if (Strutil::iequals(name, "XResolution")) {
+        TIFFSetField(m_tif, TIFFTAG_XRESOLUTION, param.get_float());
         return true;
     }
-    if (Strutil::iequals(name, "YResolution") && type == TypeDesc::FLOAT) {
-        TIFFSetField(m_tif, TIFFTAG_YRESOLUTION, *(float*)data);
+    if (Strutil::iequals(name, "YResolution")) {
+        TIFFSetField(m_tif, TIFFTAG_YRESOLUTION, param.get_float());
         return true;
     }
     return false;
@@ -1134,27 +1136,82 @@ TIFFOutput::write_exif_data()
         int tag, tifftype, count;
         if (exif_tag_lookup(p.name(), tag, tifftype, count)
             && tifftype != TIFF_NOTYPE) {
+            bool ok      = false;
+            bool handled = false;
+            // Some special cases first
             if (tag == EXIF_SECURITYCLASSIFICATION || tag == EXIF_IMAGEHISTORY
-                || tag == EXIF_PHOTOGRAPHICSENSITIVITY)
+                || tag == EXIF_PHOTOGRAPHICSENSITIVITY) {
                 continue;  // libtiff doesn't understand these
-            bool ok = false;
-            if (tifftype == TIFF_ASCII) {
-                ok = TIFFSetField(m_tif, tag, *(char**)p.data());
-            } else if ((tifftype == TIFF_SHORT || tifftype == TIFF_LONG)
-                       && p.type() == TypeDesc::SHORT && count == 1) {
-                ok = TIFFSetField(m_tif, tag, (int)*(short*)p.data());
-            } else if ((tifftype == TIFF_SHORT || tifftype == TIFF_LONG)
-                       && p.type() == TypeDesc::INT && count == 1) {
-                ok = TIFFSetField(m_tif, tag, *(int*)p.data());
-            } else if ((tifftype == TIFF_RATIONAL || tifftype == TIFF_SRATIONAL)
-                       && p.type() == TypeDesc::FLOAT && count == 1) {
-                ok = TIFFSetField(m_tif, tag, *(float*)p.data());
+            }
+            if (tag == TIFFTAG_RESOLUTIONUNIT && p.type() == TypeString) {
+                // OIIO stores resolution unit as a string, but libtiff wants
+                // it as a short code, so we have to convert.
+                if (int r = resunit_to_code(p.get_string())) {
+                    ok = TIFFSetField(m_tif, TIFFTAG_RESOLUTIONUNIT, r);
+                }
+                handled = true;
+            } else if (tag == EXIF_EXIFVERSION || tag == EXIF_FLASHPIXVERSION) {
+                if (p.type() == TypeString) {
+                    // These tags are a 4-byte array of chars, but we
+                    // allow users to set it as a string. Convert it if needed.
+                    std::string version = p.get_string();
+                    if (version.size() >= 4) {
+                        ok = TIFFSetField(m_tif, tag, version.c_str());
+                    }
+                    handled = true;
+                } else if (p.type() == TypeInt) {
+                    std::string s = Strutil::fmt::format("{:04}", p.get_int());
+                    if (s.size() == 4)
+                        ok = TIFFSetField(m_tif, tag, s.c_str());
+                    handled = true;
+                }
+            }
+            // General cases...
+            else if (tifftype == TIFF_ASCII) {
+                ok      = TIFFSetField(m_tif, tag, p.get_string().c_str());
+                handled = true;
+            } else if (tifftype == TIFF_SHORT || tifftype == TIFF_SSHORT
+                       || tifftype == TIFF_LONG || tifftype == TIFF_SLONG) {
+                if ((p.type() == TypeInt16 || p.type() == TypeInt32
+                     || p.type() == TypeUInt16 || p.type() == TypeUInt32)
+                    && count == 1) {
+                    // Passing our kinda-int as TIFF kinda-int
+                    ok      = TIFFSetField(m_tif, tag, p.get_int());
+                    handled = true;
+                } else if (p.type() == TypeString && count == 1) {
+                    // Passing our string as TIFF kinda-int -- convert as long
+                    // as the string looks like an int.
+                    std::string s = p.get_string();
+                    if (Strutil::string_is_int(s)) {
+                        int val = Strutil::stoi(s);
+                        ok      = TIFFSetField(m_tif, tag, val);
+                        handled = true;
+                    }
+                }
             } else if ((tifftype == TIFF_RATIONAL || tifftype == TIFF_SRATIONAL)
-                       && p.type() == TypeDesc::DOUBLE && count == 1) {
-                ok = TIFFSetField(m_tif, tag, *(double*)p.data());
+                       && (p.type() == TypeFloat || p.type() == TypeDesc::DOUBLE
+                           || p.type() == TypeUInt16 || p.type() == TypeUInt32
+                           || p.type() == TypeInt16 || p.type() == TypeInt32
+                           || p.type() == TypeRational
+                           || p.type() == TypeURational)
+                       && count == 1) {
+                // If the tag is a rational, there are a number of types we
+                // can force into that form by converting to and then passing
+                // a float.
+                ok      = TIFFSetField(m_tif, tag, p.get_float());
+                handled = true;
+            }
+            if (!handled) {
+#    ifndef NDEBUG
+                print("Unhandled EXIF {} ({}) / tag {} tifftype {} count {}\n",
+                      p.name(), p.type(), tag, tifftype, count);
+#    endif
             }
+            // NOTE: We are not handling arrays of values, just scalars.
             if (!ok) {
-                // std::cout << "Unhandled EXIF " << p.name() << " " << p.type() << "\n";
+                // print(
+                //     "Error handling EXIF {} ({}) / tag {} tifftype {} count {}\n",
+                //     p.name(), p.type(), tag, tifftype, count);
             }
         }
     }

From 61f3882a1cde325066803ae5246c9e3de7a94d3e Mon Sep 17 00:00:00 2001
From: Lumina Wang <143051772+adskWangl@users.noreply.github.com>
Date: Thu, 12 Feb 2026 14:51:37 -0500
Subject: [PATCH 54/70] fix: gamma precision (#5038)

This is a PR proposing to keep the gamma precision.

In some cases, we need more precise gamma values, while the existing
rounding operation loses most of the precision. This change will
continue to use rounded values to calculate and store color space
information, but retain the original value in the "oiio:Gamma" attribute.
In addition, it tidies up existing code by removing the duplicated
rounding from the individual format readers.

I've verified with png/exif.png & python-colorconfig tests. No
regression is introduced.

Signed-off-by: Lumina Wang <lumina.wang@autodesk.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 src/hdr.imageio/hdrinput.cpp      |  5 -----
 src/libOpenImageIO/color_ocio.cpp | 18 +++++++++++-------
 src/png.imageio/png_pvt.h         |  5 -----
 src/rla.imageio/rlainput.cpp      |  5 -----
 src/targa.imageio/targainput.cpp  |  5 -----
 5 files changed, 11 insertions(+), 27 deletions(-)

diff --git a/src/hdr.imageio/hdrinput.cpp b/src/hdr.imageio/hdrinput.cpp
index 850899d89d..9627b9b3ed 100644
--- a/src/hdr.imageio/hdrinput.cpp
+++ b/src/hdr.imageio/hdrinput.cpp
@@ -304,12 +304,7 @@ HdrInput::RGBE_ReadHeader()
             found_FORMAT_line = true;
             /* LG says no:    break;       // format found so break out of loop */
         } else if (Strutil::parse_values(line, "GAMMA=", span<float>(tempf))) {
-            // Round gamma to the nearest hundredth to prevent stupid
-            // precision choices and make it easier for apps to make
-            // decisions based on known gamma values. For example, you want
-            // 2.2, not 2.19998.
             float g = float(1.0 / tempf);
-            g       = roundf(100.0 * g) / 100.0f;
             set_colorspace_rec709_gamma(m_spec, g);
         } else if (Strutil::parse_values(line,
                                          "EXPOSURE=", span<float>(tempf))) {
diff --git a/src/libOpenImageIO/color_ocio.cpp b/src/libOpenImageIO/color_ocio.cpp
index a365cd9b48..b458673840 100644
--- a/src/libOpenImageIO/color_ocio.cpp
+++ b/src/libOpenImageIO/color_ocio.cpp
@@ -2803,21 +2803,25 @@ ColorConfig::set_colorspace(ImageSpec& spec, string_view colorspace) const
 void
 ColorConfig::set_colorspace_rec709_gamma(ImageSpec& spec, float gamma) const
 {
-    gamma = std::round(gamma * 100.0f) / 100.0f;
-    if (fabsf(gamma - 1.0f) <= 0.01f) {
+    // Round gamma to the nearest hundredth to prevent stupid precision choices
+    // and make it easier for apps to make decisions based on known gamma values.
+    float g_rounded = std::round(gamma * 100.0f) / 100.0f;
+    if (fabsf(g_rounded - 1.0f) <= 0.01f) {
         set_colorspace(spec, "lin_rec709_scene");
-    } else if (fabsf(gamma - 1.8f) <= 0.01f) {
+    } else if (fabsf(g_rounded - 1.8f) <= 0.01f) {
         set_colorspace(spec, "g18_rec709_scene");
         spec.attribute("oiio:Gamma", 1.8f);
-    } else if (fabsf(gamma - 2.2f) <= 0.01f) {
+    } else if (fabsf(g_rounded - 2.2f) <= 0.01f) {
         set_colorspace(spec, "g22_rec709_scene");
         spec.attribute("oiio:Gamma", 2.2f);
-    } else if (fabsf(gamma - 2.4f) <= 0.01f) {
+    } else if (fabsf(g_rounded - 2.4f) <= 0.01f) {
         set_colorspace(spec, "g24_rec709_scene");
         spec.attribute("oiio:Gamma", 2.4f);
     } else {
-        set_colorspace(spec, Strutil::fmt::format("g{}_rec709_scene",
-                                                  std::lround(gamma * 10.0f)));
+        set_colorspace(spec,
+                       Strutil::fmt::format("g{}_rec709_scene",
+                                            std::lround(g_rounded * 10.0f)));
+        // Preserve the original gamma value for use in color conversions.
         spec.attribute("oiio:Gamma", gamma);
     }
 }
diff --git a/src/png.imageio/png_pvt.h b/src/png.imageio/png_pvt.h
index f7b6bbbb62..3631910192 100644
--- a/src/png.imageio/png_pvt.h
+++ b/src/png.imageio/png_pvt.h
@@ -226,12 +226,7 @@ read_info(png_structp& sp, png_infop& ip, int& bit_depth, int& color_type,
     if (png_get_sRGB(sp, ip, &srgb_intent)) {
         spec.attribute("oiio:ColorSpace", "srgb_rec709_scene");
     } else if (png_get_gAMA(sp, ip, &gamma) && gamma > 0.0) {
-        // Round gamma to the nearest hundredth to prevent stupid
-        // precision choices and make it easier for apps to make
-        // decisions based on known gamma values. For example, you want
-        // 2.2, not 2.19998.
         float g = float(1.0 / gamma);
-        g       = roundf(100.0f * g) / 100.0f;
         set_colorspace_rec709_gamma(spec, g);
     } else {
         // If there's no info at all, assume sRGB.
diff --git a/src/rla.imageio/rlainput.cpp b/src/rla.imageio/rlainput.cpp
index bd1215dacc..4befaffe2b 100644
--- a/src/rla.imageio/rlainput.cpp
+++ b/src/rla.imageio/rlainput.cpp
@@ -397,11 +397,6 @@ RLAInput::seek_subimage(int subimage, int miplevel)
 
     float gamma = Strutil::from_string<float>(m_rla.Gamma);
     if (gamma > 0.f) {
-        // Round gamma to the nearest hundredth to prevent stupid
-        // precision choices and make it easier for apps to make
-        // decisions based on known gamma values. For example, you want
-        // 2.2, not 2.19998.
-        gamma = roundf(100.0 * gamma) / 100.0f;
         set_colorspace_rec709_gamma(m_spec, gamma);
     }
 
diff --git a/src/targa.imageio/targainput.cpp b/src/targa.imageio/targainput.cpp
index 984f772ffa..45453965ea 100644
--- a/src/targa.imageio/targainput.cpp
+++ b/src/targa.imageio/targainput.cpp
@@ -435,11 +435,6 @@ TGAInput::read_tga2_header()
             if (bigendian())
                 swap_endian(&buf.s[0], 2);
             float gamma = (float)buf.s[0] / (float)buf.s[1];
-            // Round gamma to the nearest hundredth to prevent stupid
-            // precision choices and make it easier for apps to make
-            // decisions based on known gamma values. For example, you want
-            // 2.2, not 2.19998.
-            gamma = roundf(100.0 * gamma) / 100.0f;
             set_colorspace_rec709_gamma(m_spec, gamma);
         }
 

From dc86d6676927c320301e18ca82e9019ab0eb6d7b Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Thu, 12 Feb 2026 21:29:24 -0800
Subject: [PATCH 55/70] ci: Turn off nightly workflows for user forks (#5042)

Also switch to a better idiom for detecting if we're a fork.

Signed-off-by: Larry Gritz <lg@larrygritz.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 .github/workflows/analysis.yml  |  2 +-
 .github/workflows/ci.yml        | 11 +++++------
 .github/workflows/docs.yml      |  2 +-
 .github/workflows/scorecard.yml |  3 +--
 .github/workflows/wheel.yml     | 20 +++++++-------------
 5 files changed, 15 insertions(+), 23 deletions(-)

diff --git a/.github/workflows/analysis.yml b/.github/workflows/analysis.yml
index db23ee6f55..aaf69defe6 100644
--- a/.github/workflows/analysis.yml
+++ b/.github/workflows/analysis.yml
@@ -52,7 +52,7 @@ jobs:
     name: "SonarCloud Analysis"
     # Exclude runs on forks, since only the main org has the SonarCloud
     # account credentials.
-    if: github.repository == 'AcademySoftwareFoundation/OpenImageIO'
+    if: github.event.repository.fork == false
     uses: ./.github/workflows/build-steps.yml
     # Must let the called steps workflow inherit necessary secrets
     secrets:
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 83cf2b0f3e..8947ea5682 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -27,7 +27,6 @@ on:
   schedule:
     # Full nightly build
     - cron: "0 8 * * *"
-      if: github.repository == 'AcademySoftwareFoundation/OpenImageIO'
   workflow_dispatch:
     # This allows manual triggering of the workflow from the web
 
@@ -42,7 +41,7 @@ concurrency:
 jobs:
 
   aswf-old:
-    if: ${{ ! contains(github.ref, 'windows-only') && ! contains(github.ref, 'macos-only') }}
+    if: ${{ (github.event.repository.fork == false || github.event_name != 'schedule') && ! contains(github.ref, 'windows-only') && ! contains(github.ref, 'macos-only') }}
     name: "(old) ${{matrix.desc}}"
     uses: ./.github/workflows/build-steps.yml
     with:
@@ -191,7 +190,7 @@ jobs:
   # Linux Tests using ASWF-docker containers
   #
   linux-aswf:
-    if: ${{ ! contains(github.ref, 'windows-only') && ! contains(github.ref, 'macos-only') }}
+    if: ${{ (github.event.repository.fork == false || github.event_name != 'schedule') && ! contains(github.ref, 'windows-only') && ! contains(github.ref, 'macos-only') }}
     name: "${{matrix.desc}}"
     uses: ./.github/workflows/build-steps.yml
     with:
@@ -388,7 +387,7 @@ jobs:
   # Linux Tests using GHA Ubuntu runners directly
   #
   linux-ubuntu:
-    if: ${{ ! contains(github.ref, 'windows-only') && ! contains(github.ref, 'macos-only') }}
+    if: ${{ (github.event.repository.fork == false || github.event_name != 'schedule') && ! contains(github.ref, 'windows-only') && ! contains(github.ref, 'macos-only') }}
     name: "${{matrix.desc}}"
     uses: ./.github/workflows/build-steps.yml
     with:
@@ -580,7 +579,7 @@ jobs:
   # MacOS Tests
   #
   macos:
-    if: ${{ ! contains(github.ref, 'windows-only') && ! contains(github.ref, 'linux-only') }}
+    if: ${{ (github.event.repository.fork == false || github.event_name != 'schedule') && ! contains(github.ref, 'windows-only') && ! contains(github.ref, 'linux-only') }}
     name: "${{matrix.desc}}"
     uses: ./.github/workflows/build-steps.yml
     with:
@@ -653,7 +652,7 @@ jobs:
   # Windows Tests
   #
   windows:
-    if: ${{ ! contains(github.ref, 'linux-only') && ! contains(github.ref, 'macos-only') }}
+    if: ${{ (github.event.repository.fork == false || github.event_name != 'schedule') && ! contains(github.ref, 'linux-only') && ! contains(github.ref, 'macos-only') }}
     name: "${{matrix.desc}}"
     uses: ./.github/workflows/build-steps.yml
     with:
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index 4eb8445841..9d3e3c0da4 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -38,7 +38,6 @@ on:
   schedule:
     # Full nightly build
     - cron: "0 8 * * *"
-      if: github.repository == 'AcademySoftwareFoundation/OpenImageIO'
   workflow_dispatch:
     # This allows manual triggering of the workflow from the web
 
@@ -53,6 +52,7 @@ concurrency:
 jobs:
   docs:
     name: "Docs / ${{matrix.desc}}"
+    if: ${{ github.event_name != 'schedule' || github.event.repository.fork == false }}
     uses: ./.github/workflows/build-steps.yml
     with:
       nametag: ${{ matrix.nametag || 'unnamed!' }}
diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml
index 41f18cd349..18c775e87d 100644
--- a/.github/workflows/scorecard.yml
+++ b/.github/workflows/scorecard.yml
@@ -10,7 +10,6 @@ on:
   push:
     # Run on pushes to main, but only the official repo, not forks
     branches: [ "main" ]
-    if: github.event.pull_request.head.repo.full_name == github.repository
   pull_request:
     # Only run on individual PRs if the workflows changed
     paths:
@@ -28,7 +27,7 @@ concurrency:
 jobs:
   analysis:
     name: Scorecards analysis
-    if: github.repository == 'AcademySoftwareFoundation/OpenImageIO'
+    if: github.event.repository.fork == false
     runs-on: ubuntu-latest
     permissions:
       # Needed to upload the results to code-scanning dashboard.
diff --git a/.github/workflows/wheel.yml b/.github/workflows/wheel.yml
index 84f6145a0d..70eaa59e32 100644
--- a/.github/workflows/wheel.yml
+++ b/.github/workflows/wheel.yml
@@ -59,8 +59,7 @@ jobs:
     name: Build SDist
     runs-on: ubuntu-latest
     if: |
-      github.event_name != 'schedule' ||
-      github.repository == 'AcademySoftwareFoundation/OpenImageIO'
+      github.event_name != 'schedule' || github.event.repository.fork == false
 
     steps:
 
@@ -86,8 +85,7 @@ jobs:
     name: Build wheels on Linux
     runs-on: ubuntu-latest
     if: |
-      github.event_name != 'schedule' ||
-      github.repository == 'AcademySoftwareFoundation/OpenImageIO'
+      github.event_name != 'schedule' || github.event.repository.fork == false
     strategy:
       matrix:
         include:
@@ -192,8 +190,7 @@ jobs:
     name: Build wheels on Linux ARM
     runs-on: ubuntu-24.04-arm
     if: |
-      github.event_name != 'schedule' ||
-      github.repository == 'AcademySoftwareFoundation/OpenImageIO'
+      github.event_name != 'schedule' || github.event.repository.fork == false
     strategy:
         matrix:
           include:
@@ -294,8 +291,7 @@ jobs:
     name: Build wheels on macOS
     runs-on: macos-15-intel
     if: |
-      github.event_name != 'schedule' ||
-      github.repository == 'AcademySoftwareFoundation/OpenImageIO'
+      github.event_name != 'schedule' || github.event.repository.fork == false
     strategy:
       matrix:
         include:
@@ -383,8 +379,7 @@ jobs:
     name: Build wheels on macOS ARM
     runs-on: macos-14
     if: |
-      github.event_name != 'schedule' ||
-      github.repository == 'AcademySoftwareFoundation/OpenImageIO'
+      github.event_name != 'schedule' || github.event.repository.fork == false
     strategy:
       matrix:
         include:
@@ -463,8 +458,7 @@ jobs:
     name: Build wheels on Windows
     runs-on: windows-2022
     if: |
-      github.event_name != 'schedule' ||
-      github.repository == 'AcademySoftwareFoundation/OpenImageIO'
+      github.event_name != 'schedule' || github.event.repository.fork == false
     strategy:
       matrix:
         include:
@@ -522,7 +516,7 @@ jobs:
     runs-on: ubuntu-latest
     permissions:
       id-token: write
-    if: github.event_name == 'push' && (startsWith(github.event.ref, 'refs/tags/v3.0.') || startsWith(github.event.ref, 'refs/tags/v3.1.')) && github.repository == 'AcademySoftwareFoundation/OpenImageIO'
+    if: github.event_name == 'push' && (startsWith(github.event.ref, 'refs/tags/v3.0.') || startsWith(github.event.ref, 'refs/tags/v3.1.')) && github.event.repository.fork == false
     steps:
       - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
 

From 5e42e7bbbdf7d974c26639d3ec73f092b02245b5 Mon Sep 17 00:00:00 2001
From: Brecht Van Lommel <brecht@blender.org>
Date: Fri, 13 Feb 2026 21:38:19 +0100
Subject: [PATCH 56/70] feat(heif): Monochrome channel read and write support,
 fix crash (#5043)

Implement support for reading and writing monochrome. Reading requires
libheif 1.17+ for heif_image_handle_get_preferred_decoding_colorspace.

Previously, writing a single-channel image would cause an exception due
to wrong parameters, but close() would continue writing the image and
crash. Destroy m_ctx on exception to prevent that for other potential
errors.

Test added for monochrome read and write.

---------

Signed-off-by: Brecht Van Lommel <brecht@blender.org>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 INSTALL.md                                    |   4 +-
 src/cmake/externalpackages.cmake              |   2 +-
 src/doc/builtinplugins.rst                    |   7 +
 src/heif.imageio/heifinput.cpp                |  61 ++++--
 src/heif.imageio/heifoutput.cpp               |  21 +-
 testsuite/heif/ref/out-libheif1.12-orient.txt |  14 ++
 .../heif/ref/out-libheif1.21-with-av1.txt     |  12 ++
 testsuite/heif/ref/out-libheif1.21.txt        | 196 ++++++++++++++++++
 testsuite/heif/ref/out-libheif1.4.txt         |  14 ++
 testsuite/heif/ref/out-libheif1.5.txt         |  14 ++
 testsuite/heif/ref/out-libheif1.9-alt2.txt    |  14 ++
 .../heif/ref/out-libheif1.9-with-av1-alt2.txt |  14 ++
 .../heif/ref/out-libheif1.9-with-av1.txt      |  14 ++
 testsuite/heif/ref/out-libheif1.9.txt         |  14 ++
 testsuite/heif/run.py                         |   6 +
 15 files changed, 381 insertions(+), 26 deletions(-)
 create mode 100644 testsuite/heif/ref/out-libheif1.21.txt

diff --git a/INSTALL.md b/INSTALL.md
index ee0d73eee8..a52378c5ba 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -61,8 +61,8 @@ NEW or CHANGED MINIMUM dependencies since the last major release are **bold**.
  * If you want support for GIF images:
      * giflib >= 5.0 (tested through 5.2.2)
  * If you want support for HEIF/HEIC or AVIF images:
-     * libheif >= 1.11 (1.16 required for correct orientation support,
-       tested through 1.21.1)
+     * libheif >= 1.11 (1.16 required for correct orientation support and
+       1.17 required for monochrome HEIC support; tested through 1.21.1)
      * libheif must be built with an AV1 encoder/decoder for AVIF support.
  * If you want support for DICOM medical image files:
      * DCMTK >= 3.6.1 (tested through 3.6.9)
diff --git a/src/cmake/externalpackages.cmake b/src/cmake/externalpackages.cmake
index ed267c4bdf..c9b729d736 100644
--- a/src/cmake/externalpackages.cmake
+++ b/src/cmake/externalpackages.cmake
@@ -165,7 +165,7 @@ checked_find_package (GIF VERSION_MIN 5.0)
 checked_find_package (Libheif VERSION_MIN 1.11
                       PREFER_CONFIG
                       RECOMMEND_MIN 1.16
-                      RECOMMEND_MIN_REASON "for orientation support")
+                      RECOMMEND_MIN_REASON "1.16 for orientation support, 1.17 for monochrome support")
 
 checked_find_package (LibRaw
                       VERSION_MIN 0.20.0
diff --git a/src/doc/builtinplugins.rst b/src/doc/builtinplugins.rst
index 05c66fe352..92c8a5f6bf 100644
--- a/src/doc/builtinplugins.rst
+++ b/src/doc/builtinplugins.rst
@@ -837,6 +837,13 @@ control aspects of the writing itself:
 
 |
 
+**Additional notes and limitations**
+
+* The underlying libheif dependency must be 1.16 or newer to support the
+  "oiio:reorient" configuration option and the "heif:Orientation" metadata.
+* The underlying libheif dependency must be 1.17 or newer to support
+  monochrome HEIC images.
+
 .. _sec-bundledplugins-ico:
 
 ICO
diff --git a/src/heif.imageio/heifinput.cpp b/src/heif.imageio/heifinput.cpp
index f4b78aae65..a49350a215 100644
--- a/src/heif.imageio/heifinput.cpp
+++ b/src/heif.imageio/heifinput.cpp
@@ -262,15 +262,40 @@ HeifInput::seek_subimage(int subimage, int miplevel)
     }
 
     m_has_alpha = m_ihandle.has_alpha_channel();
-    auto chroma = m_has_alpha        ? (m_bitdepth > 8)
-                                           ? littleendian()
-                                                 ? heif_chroma_interleaved_RRGGBBAA_LE
-                                                 : heif_chroma_interleaved_RRGGBBAA_BE
-                                           : heif_chroma_interleaved_RGBA
-                  : (m_bitdepth > 8) ? littleendian()
-                                           ? heif_chroma_interleaved_RRGGBB_LE
-                                           : heif_chroma_interleaved_RRGGBB_BE
-                                     : heif_chroma_interleaved_RGB;
+
+    bool is_monochrome = false;
+
+#if LIBHEIF_NUMERIC_VERSION >= MAKE_LIBHEIF_VERSION(1, 17, 0, 0)
+    heif_colorspace preferred_colorspace = heif_colorspace_undefined;
+    heif_chroma preferred_chroma         = heif_chroma_undefined;
+
+    if (heif_image_handle_get_preferred_decoding_colorspace(
+            m_ihandle.get_raw_image_handle(), &preferred_colorspace,
+            &preferred_chroma)
+            .code
+        == heif_error_Ok) {
+        is_monochrome = preferred_colorspace == heif_colorspace_monochrome;
+    }
+#endif
+
+    const heif_chroma chroma
+        = (is_monochrome)    ? heif_chroma_monochrome
+          : m_has_alpha      ? (m_bitdepth > 8)
+                                   ? littleendian()
+                                         ? heif_chroma_interleaved_RRGGBBAA_LE
+                                         : heif_chroma_interleaved_RRGGBBAA_BE
+                                   : heif_chroma_interleaved_RGBA
+          : (m_bitdepth > 8) ? littleendian()
+                                   ? heif_chroma_interleaved_RRGGBB_LE
+                                   : heif_chroma_interleaved_RRGGBB_BE
+                             : heif_chroma_interleaved_RGB;
+    const heif_colorspace colorspace = is_monochrome
+                                           ? heif_colorspace_monochrome
+                                           : heif_colorspace_RGB;
+    const heif_channel channel       = is_monochrome ? heif_channel_Y
+                                                     : heif_channel_interleaved;
+    const int nchannels              = is_monochrome ? 1 : m_has_alpha ? 4 : 3;
+
 #if 0
     try {
         m_himage = m_ihandle.decode_image(heif_colorspace_RGB, chroma);
@@ -290,8 +315,8 @@ HeifInput::seek_subimage(int subimage, int miplevel)
     // print("Got decoding options version {}\n", options->version);
     struct heif_image* img_tmp = nullptr;
     struct heif_error herr = heif_decode_image(m_ihandle.get_raw_image_handle(),
-                                               &img_tmp, heif_colorspace_RGB,
-                                               chroma, options.get());
+                                               &img_tmp, colorspace, chroma,
+                                               options.get());
     if (img_tmp)
         m_himage = heif::Image(img_tmp);
     if (herr.code != heif_error_Ok || !img_tmp) {
@@ -301,9 +326,8 @@ HeifInput::seek_subimage(int subimage, int miplevel)
     }
 #endif
 
-    m_spec = ImageSpec(m_himage.get_width(heif_channel_interleaved),
-                       m_himage.get_height(heif_channel_interleaved),
-                       m_has_alpha ? 4 : 3,
+    m_spec = ImageSpec(m_himage.get_width(channel),
+                       m_himage.get_height(channel), nchannels,
                        (m_bitdepth > 8) ? TypeUInt16 : TypeUInt8);
 
     if (m_bitdepth > 8) {
@@ -492,12 +516,13 @@ HeifInput::read_native_scanline(int subimage, int miplevel, int y, int /*z*/,
 #else
     int ystride = 0;
 #endif
+    const heif_channel channel = m_spec.nchannels == 1
+                                     ? heif_channel_Y
+                                     : heif_channel_interleaved;
 #if LIBHEIF_NUMERIC_VERSION >= MAKE_LIBHEIF_VERSION(1, 20, 2, 0)
-    const uint8_t* hdata = m_himage.get_plane2(heif_channel_interleaved,
-                                               &ystride);
+    const uint8_t* hdata = m_himage.get_plane2(channel, &ystride);
 #else
-    const uint8_t* hdata = m_himage.get_plane(heif_channel_interleaved,
-                                              &ystride);
+    const uint8_t* hdata = m_himage.get_plane(channel, &ystride);
 #endif
     if (!hdata) {
         errorfmt("Unknown read error");
diff --git a/src/heif.imageio/heifoutput.cpp b/src/heif.imageio/heifoutput.cpp
index 8cfa40afd7..2a2fcf2744 100644
--- a/src/heif.imageio/heifoutput.cpp
+++ b/src/heif.imageio/heifoutput.cpp
@@ -137,10 +137,16 @@ HeifOutput::open(const std::string& name, const ImageSpec& newspec,
                 (m_bitdepth == 8) ? heif_chroma_interleaved_RGBA
                 : littleendian()  ? heif_chroma_interleaved_RRGGBBAA_LE
                                   : heif_chroma_interleaved_RRGGBBAA_BE };
-        m_himage.create(newspec.width, newspec.height, heif_colorspace_RGB,
+        const heif_colorspace colorspace = (m_spec.nchannels == 1)
+                                               ? heif_colorspace_monochrome
+                                               : heif_colorspace_RGB;
+        const heif_channel channel       = (m_spec.nchannels == 1)
+                                               ? heif_channel_Y
+                                               : heif_channel_interleaved;
+
+        m_himage.create(newspec.width, newspec.height, colorspace,
                         chromas[m_spec.nchannels]);
-        m_himage.add_plane(heif_channel_interleaved, newspec.width,
-                           newspec.height, m_bitdepth);
+        m_himage.add_plane(channel, newspec.width, newspec.height, m_bitdepth);
 
         auto compqual  = m_spec.decode_compression_metadata("", 75);
         auto extension = Filesystem::extension(m_filename);
@@ -153,10 +159,12 @@ HeifOutput::open(const std::string& name, const ImageSpec& newspec,
     } catch (const heif::Error& err) {
         std::string e = err.get_message();
         errorfmt("{}", e.empty() ? "unknown exception" : e.c_str());
+        m_ctx.reset();
         return false;
     } catch (const std::exception& err) {
         std::string e = err.what();
         errorfmt("{}", e.empty() ? "unknown exception" : e.c_str());
+        m_ctx.reset();
         return false;
     }
 
@@ -180,10 +188,13 @@ HeifOutput::write_scanline(int y, int /*z*/, TypeDesc format, const void* data,
 #else
     int hystride = 0;
 #endif
+    const heif_channel hchannel = (m_spec.nchannels == 1)
+                                      ? heif_channel_Y
+                                      : heif_channel_interleaved;
 #if LIBHEIF_NUMERIC_VERSION >= MAKE_LIBHEIF_VERSION(1, 20, 2, 0)
-    uint8_t* hdata = m_himage.get_plane2(heif_channel_interleaved, &hystride);
+    uint8_t* hdata = m_himage.get_plane2(hchannel, &hystride);
 #else
-    uint8_t* hdata = m_himage.get_plane(heif_channel_interleaved, &hystride);
+    uint8_t* hdata = m_himage.get_plane(hchannel, &hystride);
 #endif
     hdata += hystride * (y - m_spec.y);
     if (m_bitdepth == 10 || m_bitdepth == 12) {
diff --git a/testsuite/heif/ref/out-libheif1.12-orient.txt b/testsuite/heif/ref/out-libheif1.12-orient.txt
index 875e3ac54e..2765da7eda 100644
--- a/testsuite/heif/ref/out-libheif1.12-orient.txt
+++ b/testsuite/heif/ref/out-libheif1.12-orient.txt
@@ -182,3 +182,17 @@ Reading ../oiio-images/heif/sewing-threads.heic
     GPS:Longitude: 1, 49, 34.0187
     GPS:LongitudeRef: "E"
     oiio:ColorSpace: "srgb_rec709_scene"
+Reading mono-8bit.avif
+mono-8bit.avif       :   64 x   64, 3 channel, uint10 heif
+    SHA-1: 4E361351029D39379C73580BCD4C9859E4B73ADE
+    channel list: R, G, B
+    CICP: 2, 2, 6, 1
+    oiio:BitsPerSample: 10
+    oiio:ColorSpace: "srgb_rec709_scene"
+Reading mono-10bit.avif
+mono-10bit.avif      :   64 x   64, 3 channel, uint10 heif
+    SHA-1: 4E361351029D39379C73580BCD4C9859E4B73ADE
+    channel list: R, G, B
+    CICP: 2, 2, 6, 1
+    oiio:BitsPerSample: 10
+    oiio:ColorSpace: "srgb_rec709_scene"
diff --git a/testsuite/heif/ref/out-libheif1.21-with-av1.txt b/testsuite/heif/ref/out-libheif1.21-with-av1.txt
index b22dcca2c5..7843a32b31 100644
--- a/testsuite/heif/ref/out-libheif1.21-with-av1.txt
+++ b/testsuite/heif/ref/out-libheif1.21-with-av1.txt
@@ -162,3 +162,15 @@ Reading ../oiio-images/heif/sewing-threads.heic
     GPS:Longitude: 1, 49, 34.0187
     GPS:LongitudeRef: "E"
     oiio:ColorSpace: "srgb_rec709_scene"
+Reading mono-8bit.avif
+mono-8bit.avif       :   64 x   64, 1 channel, uint10 heif
+    SHA-1: 09BE4368A01BE26600CA54D797477ABC5A37CB7B
+    channel list: Y
+    oiio:BitsPerSample: 10
+    oiio:ColorSpace: "srgb_rec709_scene"
+Reading mono-10bit.avif
+mono-10bit.avif      :   64 x   64, 1 channel, uint10 heif
+    SHA-1: 09BE4368A01BE26600CA54D797477ABC5A37CB7B
+    channel list: Y
+    oiio:BitsPerSample: 10
+    oiio:ColorSpace: "srgb_rec709_scene"
diff --git a/testsuite/heif/ref/out-libheif1.21.txt b/testsuite/heif/ref/out-libheif1.21.txt
new file mode 100644
index 0000000000..5a89c9e2ae
--- /dev/null
+++ b/testsuite/heif/ref/out-libheif1.21.txt
@@ -0,0 +1,196 @@
+Reading ref/IMG_7702_small.heic
+ref/IMG_7702_small.heic :  512 x  300, 3 channel, uint8 heif
+    SHA-1: 2380C124F8338910013FEA75C9C64C23567A3156
+    channel list: R, G, B
+    DateTime: "2019:01:21 16:10:54"
+    ExposureTime: 0.030303
+    FNumber: 1.8
+    Make: "Apple"
+    Model: "iPhone 7"
+    Orientation: 1 (normal)
+    ResolutionUnit: 2 (inches)
+    Software: "12.1.2"
+    XResolution: 72
+    YResolution: 72
+    Exif:ApertureValue: 1.69599 (f/1.8)
+    Exif:BrightnessValue: 3.99501
+    Exif:ColorSpace: 65535
+    Exif:DateTimeDigitized: "2019:01:21 16:10:54"
+    Exif:DateTimeOriginal: "2019:01:21 16:10:54"
+    Exif:ExifVersion: "0221"
+    Exif:ExposureBiasValue: 0
+    Exif:ExposureMode: 0 (auto)
+    Exif:ExposureProgram: 2 (normal program)
+    Exif:Flash: 24 (no flash, auto flash)
+    Exif:FlashPixVersion: "0100"
+    Exif:FocalLength: 3.99 (3.99 mm)
+    Exif:FocalLengthIn35mmFilm: 28
+    Exif:LensMake: "Apple"
+    Exif:LensModel: "iPhone 7 back camera 3.99mm f/1.8"
+    Exif:LensSpecification: 3.99, 3.99, 1.8, 1.8
+    Exif:MeteringMode: 5 (pattern)
+    Exif:PhotographicSensitivity: 20
+    Exif:PixelXDimension: 4032
+    Exif:PixelYDimension: 3024
+    Exif:SceneCaptureType: 0 (standard)
+    Exif:SensingMethod: 2 (1-chip color area)
+    Exif:ShutterSpeedValue: 5.03599 (1/32 s)
+    Exif:SubsecTimeDigitized: "006"
+    Exif:SubsecTimeOriginal: "006"
+    Exif:WhiteBalance: 0 (auto)
+    oiio:ColorSpace: "srgb_rec709_scene"
+Reading ref/Chimera-AV1-8bit-162.avif
+ref/Chimera-AV1-8bit-162.avif :  480 x  270, 3 channel, uint8 heif
+    SHA-1: F8FDAF1BD56A21E3AF99CF8EE7FA45434D2826C7
+    channel list: R, G, B
+    oiio:ColorSpace: "srgb_rec709_scene"
+Reading ref/test-10bit.avif
+ref/test-10bit.avif  :   16 x   16, 4 channel, uint10 heif
+    SHA-1: A217653C4E10FEBF080E26F9FC78F572184B1FDA
+    channel list: R, G, B, A
+    Software: "OpenImageIO 3.2.0.0dev : B4BD496D92983E84F1FD621682CAB821C1E2126C"
+    Exif:ExifVersion: "0230"
+    Exif:FlashPixVersion: "0100"
+    Exif:ImageHistory: "oiiotool --pattern fill:topleft=1,0,0,1:topright=0,1,0,1:bottomleft=0,0,1,1:bottomright=1,1,1,1 16x16 4 -d uint16 -o test16.png"
+    heif:UnassociatedAlpha: 1
+    oiio:BitsPerSample: 10
+    oiio:ColorSpace: "srgb_rec709_scene"
+Reading cicp_pq.avif
+cicp_pq.avif         :   16 x   16, 4 channel, uint10 heif
+    SHA-1: 0F3CAB52D479BC23E9C981DBADDFEF1F792E5540
+    channel list: R, G, B, A
+    CICP: 9, 16, 9, 1
+    Exif:ExifVersion: "0230"
+    Exif:FlashPixVersion: "0100"
+    heif:UnassociatedAlpha: 1
+    oiio:BitsPerSample: 10
+    oiio:ColorSpace: "pq_rec2020_display"
+Reading colorspace_hlg.avif
+colorspace_hlg.avif  :   16 x   16, 4 channel, uint10 heif
+    SHA-1: 0F3CAB52D479BC23E9C981DBADDFEF1F792E5540
+    channel list: R, G, B, A
+    CICP: 9, 18, 9, 1
+    Exif:ExifVersion: "0230"
+    Exif:FlashPixVersion: "0100"
+    heif:UnassociatedAlpha: 1
+    oiio:BitsPerSample: 10
+    oiio:ColorSpace: "hlg_rec2020_display"
+Reading ../oiio-images/heif/greyhounds-looking-for-a-table.heic
+../oiio-images/heif/greyhounds-looking-for-a-table.heic : 3024 x 4032, 3 channel, uint8 heif
+    SHA-1: 8064B23A1A995B0D6525AFB5248EEC6C730BBB6C
+    channel list: R, G, B
+    DateTime: "2023:09:28 09:44:03"
+    ExposureTime: 0.0135135
+    FNumber: 2.4
+    Make: "Apple"
+    Model: "iPhone 12 Pro"
+    Orientation: 1 (normal)
+    ResolutionUnit: 2 (inches)
+    Software: "16.7"
+    XResolution: 72
+    YResolution: 72
+    Exif:ApertureValue: 2.52607 (f/2.4)
+    Exif:BrightnessValue: 2.7506
+    Exif:ColorSpace: 65535
+    Exif:CompositeImage: 2
+    Exif:DateTimeDigitized: "2023:09:28 09:44:03"
+    Exif:DateTimeOriginal: "2023:09:28 09:44:03"
+    Exif:DigitalZoomRatio: 1.3057
+    Exif:ExifVersion: "0232"
+    Exif:ExposureBiasValue: 0
+    Exif:ExposureMode: 0 (auto)
+    Exif:ExposureProgram: 2 (normal program)
+    Exif:Flash: 16 (no flash, flash suppression)
+    Exif:FocalLength: 1.54 (1.54 mm)
+    Exif:FocalLengthIn35mmFilm: 17
+    Exif:LensMake: "Apple"
+    Exif:LensModel: "iPhone 12 Pro back triple camera 1.54mm f/2.4"
+    Exif:LensSpecification: 1.54, 6, 1.6, 2.4
+    Exif:MeteringMode: 5 (pattern)
+    Exif:OffsetTime: "+02:00"
+    Exif:OffsetTimeDigitized: "+02:00"
+    Exif:OffsetTimeOriginal: "+02:00"
+    Exif:PhotographicSensitivity: 320
+    Exif:PixelXDimension: 4032
+    Exif:PixelYDimension: 3024
+    Exif:SensingMethod: 2 (1-chip color area)
+    Exif:ShutterSpeedValue: 6.20983 (1/74 s)
+    Exif:SubsecTimeDigitized: "886"
+    Exif:SubsecTimeOriginal: "886"
+    Exif:WhiteBalance: 0 (auto)
+    GPS:Altitude: 3.24105 (3.24105 m)
+    GPS:AltitudeRef: 0 (above sea level)
+    GPS:DateStamp: "2023:09:28"
+    GPS:DestBearing: 90.2729
+    GPS:DestBearingRef: "T" (true north)
+    GPS:HPositioningError: 5.1893
+    GPS:ImgDirection: 90.2729
+    GPS:ImgDirectionRef: "T" (true north)
+    GPS:Latitude: 41, 50, 58.43
+    GPS:LatitudeRef: "N"
+    GPS:Longitude: 3, 7, 31.98
+    GPS:LongitudeRef: "E"
+    GPS:Speed: 0.171966
+    GPS:SpeedRef: "K" (km/hour)
+    oiio:ColorSpace: "srgb_rec709_scene"
+    oiio:OriginalOrientation: 6
+Reading ../oiio-images/heif/sewing-threads.heic
+../oiio-images/heif/sewing-threads.heic : 4000 x 3000, 3 channel, uint8 heif
+    SHA-1: 44551A0A8AADD2C71B504681F2BAE3F7863EF9B9
+    channel list: R, G, B
+    DateTime: "2023:12:12 18:39:16"
+    ExposureTime: 0.04
+    FNumber: 1.8
+    Make: "samsung"
+    Model: "SM-A326B"
+    Orientation: 1 (normal)
+    ResolutionUnit: 2 (inches)
+    Software: "A326BXXS8CWK2"
+    XResolution: 72
+    YResolution: 72
+    Exif:ApertureValue: 1.69 (f/1.8)
+    Exif:BrightnessValue: 1.19
+    Exif:ColorSpace: 1
+    Exif:DateTimeDigitized: "2023:12:12 18:39:16"
+    Exif:DateTimeOriginal: "2023:12:12 18:39:16"
+    Exif:DigitalZoomRatio: 1
+    Exif:ExifVersion: "0220"
+    Exif:ExposureBiasValue: 0
+    Exif:ExposureMode: 0 (auto)
+    Exif:ExposureProgram: 2 (normal program)
+    Exif:Flash: 0 (no flash)
+    Exif:FocalLength: 4.6 (4.6 mm)
+    Exif:FocalLengthIn35mmFilm: 25
+    Exif:MaxApertureValue: 1.69 (f/1.8)
+    Exif:MeteringMode: 2 (center-weighted average)
+    Exif:OffsetTime: "+01:00"
+    Exif:OffsetTimeOriginal: "+01:00"
+    Exif:PhotographicSensitivity: 500
+    Exif:PixelXDimension: 4000
+    Exif:PixelYDimension: 3000
+    Exif:SceneCaptureType: 0 (standard)
+    Exif:ShutterSpeedValue: 0.04 (1/1 s)
+    Exif:SubsecTime: "576"
+    Exif:SubsecTimeDigitized: "576"
+    Exif:SubsecTimeOriginal: "576"
+    Exif:WhiteBalance: 0 (auto)
+    Exif:YCbCrPositioning: 1
+    GPS:Altitude: 292 (292 m)
+    GPS:AltitudeRef: 0 (above sea level)
+    GPS:Latitude: 41, 43, 33.821
+    GPS:LatitudeRef: "N"
+    GPS:Longitude: 1, 49, 34.0187
+    GPS:LongitudeRef: "E"
+    oiio:ColorSpace: "srgb_rec709_scene"
+Reading mono-8bit.avif
+mono-8bit.avif       :   64 x   64, 1 channel, uint10 heif
+    SHA-1: 09BE4368A01BE26600CA54D797477ABC5A37CB7B
+    channel list: Y
+    oiio:BitsPerSample: 10
+    oiio:ColorSpace: "srgb_rec709_scene"
+Reading mono-10bit.avif
+mono-10bit.avif      :   64 x   64, 1 channel, uint10 heif
+    SHA-1: 09BE4368A01BE26600CA54D797477ABC5A37CB7B
+    channel list: Y
+    oiio:BitsPerSample: 10
+    oiio:ColorSpace: "srgb_rec709_scene"
diff --git a/testsuite/heif/ref/out-libheif1.4.txt b/testsuite/heif/ref/out-libheif1.4.txt
index 9d8304f14a..457e6045f2 100644
--- a/testsuite/heif/ref/out-libheif1.4.txt
+++ b/testsuite/heif/ref/out-libheif1.4.txt
@@ -182,3 +182,17 @@ Reading ../oiio-images/heif/sewing-threads.heic
     GPS:Longitude: 1, 49, 34.0187
     GPS:LongitudeRef: "E"
     oiio:ColorSpace: "srgb_rec709_scene"
+Reading mono-8bit.avif
+mono-8bit.avif       :   64 x   64, 3 channel, uint10 heif
+    SHA-1: 4E361351029D39379C73580BCD4C9859E4B73ADE
+    channel list: R, G, B
+    CICP: 2, 2, 6, 1
+    oiio:BitsPerSample: 10
+    oiio:ColorSpace: "srgb_rec709_scene"
+Reading mono-10bit.avif
+mono-10bit.avif      :   64 x   64, 3 channel, uint10 heif
+    SHA-1: 4E361351029D39379C73580BCD4C9859E4B73ADE
+    channel list: R, G, B
+    CICP: 2, 2, 6, 1
+    oiio:BitsPerSample: 10
+    oiio:ColorSpace: "srgb_rec709_scene"
diff --git a/testsuite/heif/ref/out-libheif1.5.txt b/testsuite/heif/ref/out-libheif1.5.txt
index 9dfd4ff23d..b3dbaf12a3 100644
--- a/testsuite/heif/ref/out-libheif1.5.txt
+++ b/testsuite/heif/ref/out-libheif1.5.txt
@@ -182,3 +182,17 @@ Reading ../oiio-images/heif/sewing-threads.heic
     GPS:Longitude: 1, 49, 34.0187
     GPS:LongitudeRef: "E"
     oiio:ColorSpace: "srgb_rec709_scene"
+Reading mono-8bit.avif
+mono-8bit.avif       :   64 x   64, 3 channel, uint10 heif
+    SHA-1: 4E361351029D39379C73580BCD4C9859E4B73ADE
+    channel list: R, G, B
+    CICP: 2, 2, 6, 1
+    oiio:BitsPerSample: 10
+    oiio:ColorSpace: "srgb_rec709_scene"
+Reading mono-10bit.avif
+mono-10bit.avif      :   64 x   64, 3 channel, uint10 heif
+    SHA-1: 4E361351029D39379C73580BCD4C9859E4B73ADE
+    channel list: R, G, B
+    CICP: 2, 2, 6, 1
+    oiio:BitsPerSample: 10
+    oiio:ColorSpace: "srgb_rec709_scene"
diff --git a/testsuite/heif/ref/out-libheif1.9-alt2.txt b/testsuite/heif/ref/out-libheif1.9-alt2.txt
index f6448d4836..a36c6b8a63 100644
--- a/testsuite/heif/ref/out-libheif1.9-alt2.txt
+++ b/testsuite/heif/ref/out-libheif1.9-alt2.txt
@@ -146,3 +146,17 @@ Reading ../oiio-images/heif/sewing-threads.heic
     GPS:Longitude: 1, 49, 34.0187
     GPS:LongitudeRef: "E"
     oiio:ColorSpace: "srgb_rec709_scene"
+Reading mono-8bit.avif
+mono-8bit.avif       :   64 x   64, 3 channel, uint10 heif
+    SHA-1: 4E361351029D39379C73580BCD4C9859E4B73ADE
+    channel list: R, G, B
+    CICP: 2, 2, 6, 1
+    oiio:BitsPerSample: 10
+    oiio:ColorSpace: "srgb_rec709_scene"
+Reading mono-10bit.avif
+mono-10bit.avif      :   64 x   64, 3 channel, uint10 heif
+    SHA-1: 4E361351029D39379C73580BCD4C9859E4B73ADE
+    channel list: R, G, B
+    CICP: 2, 2, 6, 1
+    oiio:BitsPerSample: 10
+    oiio:ColorSpace: "srgb_rec709_scene"
diff --git a/testsuite/heif/ref/out-libheif1.9-with-av1-alt2.txt b/testsuite/heif/ref/out-libheif1.9-with-av1-alt2.txt
index c938a6fe73..36ca82c6f4 100644
--- a/testsuite/heif/ref/out-libheif1.9-with-av1-alt2.txt
+++ b/testsuite/heif/ref/out-libheif1.9-with-av1-alt2.txt
@@ -182,3 +182,17 @@ Reading ../oiio-images/heif/sewing-threads.heic
     GPS:Longitude: 1, 49, 34.0187
     GPS:LongitudeRef: "E"
     oiio:ColorSpace: "srgb_rec709_scene"
+Reading mono-8bit.avif
+mono-8bit.avif       :   64 x   64, 3 channel, uint10 heif
+    SHA-1: 4E361351029D39379C73580BCD4C9859E4B73ADE
+    channel list: R, G, B
+    CICP: 2, 2, 6, 1
+    oiio:BitsPerSample: 10
+    oiio:ColorSpace: "srgb_rec709_scene"
+Reading mono-10bit.avif
+mono-10bit.avif      :   64 x   64, 3 channel, uint10 heif
+    SHA-1: 4E361351029D39379C73580BCD4C9859E4B73ADE
+    channel list: R, G, B
+    CICP: 2, 2, 6, 1
+    oiio:BitsPerSample: 10
+    oiio:ColorSpace: "srgb_rec709_scene"
diff --git a/testsuite/heif/ref/out-libheif1.9-with-av1.txt b/testsuite/heif/ref/out-libheif1.9-with-av1.txt
index f6d7ca55a5..bf56e7c97b 100644
--- a/testsuite/heif/ref/out-libheif1.9-with-av1.txt
+++ b/testsuite/heif/ref/out-libheif1.9-with-av1.txt
@@ -182,3 +182,17 @@ Reading ../oiio-images/heif/sewing-threads.heic
     GPS:Longitude: 1, 49, 34.0187
     GPS:LongitudeRef: "E"
     oiio:ColorSpace: "srgb_rec709_scene"
+Reading mono-8bit.avif
+mono-8bit.avif       :   64 x   64, 3 channel, uint10 heif
+    SHA-1: 4E361351029D39379C73580BCD4C9859E4B73ADE
+    channel list: R, G, B
+    CICP: 2, 2, 6, 1
+    oiio:BitsPerSample: 10
+    oiio:ColorSpace: "srgb_rec709_scene"
+Reading mono-10bit.avif
+mono-10bit.avif      :   64 x   64, 3 channel, uint10 heif
+    SHA-1: 4E361351029D39379C73580BCD4C9859E4B73ADE
+    channel list: R, G, B
+    CICP: 2, 2, 6, 1
+    oiio:BitsPerSample: 10
+    oiio:ColorSpace: "srgb_rec709_scene"
diff --git a/testsuite/heif/ref/out-libheif1.9.txt b/testsuite/heif/ref/out-libheif1.9.txt
index 2778c33493..0772246c63 100644
--- a/testsuite/heif/ref/out-libheif1.9.txt
+++ b/testsuite/heif/ref/out-libheif1.9.txt
@@ -146,3 +146,17 @@ Reading ../oiio-images/heif/sewing-threads.heic
     GPS:Longitude: 1, 49, 34.0187
     GPS:LongitudeRef: "E"
     oiio:ColorSpace: "srgb_rec709_scene"
+Reading mono-8bit.avif
+mono-8bit.avif       :   64 x   64, 3 channel, uint10 heif
+    SHA-1: 4E361351029D39379C73580BCD4C9859E4B73ADE
+    channel list: R, G, B
+    CICP: 2, 2, 6, 1
+    oiio:BitsPerSample: 10
+    oiio:ColorSpace: "srgb_rec709_scene"
+Reading mono-10bit.avif
+mono-10bit.avif      :   64 x   64, 3 channel, uint10 heif
+    SHA-1: 4E361351029D39379C73580BCD4C9859E4B73ADE
+    channel list: R, G, B
+    CICP: 2, 2, 6, 1
+    oiio:BitsPerSample: 10
+    oiio:ColorSpace: "srgb_rec709_scene"
diff --git a/testsuite/heif/run.py b/testsuite/heif/run.py
index 0a3250e374..5d5945d870 100755
--- a/testsuite/heif/run.py
+++ b/testsuite/heif/run.py
@@ -22,5 +22,11 @@
 for f in files:
     command = command + info_command (os.path.join(OIIO_TESTSUITE_IMAGEDIR, f))
 
+command += oiiotool("--pattern checker:color1=1:color2=0 64x64 1 -o mono-8bit.avif")
+command += info_command("mono-8bit.avif", safematch=True)
+
+command += oiiotool("--pattern checker:color1=1:color2=0 64x64 1 -d uint10 -o mono-10bit.avif")
+command += info_command("mono-10bit.avif", safematch=True)
+
 # avif conversion is expected to fail if libheif is built without AV1 support
 failureok = 1

From 1e22b6c7d67de581bb210f7f93a5e80543791ba5 Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Sat, 14 Feb 2026 04:59:46 -0800
Subject: [PATCH 57/70] fix(tiff): Correctly read TIFF EXIF fields for
 ExifVersion and FlashPixVersion (#5045)

This allows us to correctly read the ExifVersion and FlashPixVersion
metadata in an EXIF block of a TIFF file.

Signed-off-by: Larry Gritz <lg@larrygritz.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 src/libOpenImageIO/exif.cpp                 | 10 +++--
 src/tiff.imageio/tiffinput.cpp              | 45 ++++++++++++++++++---
 src/tiff.imageio/tiffoutput.cpp             | 10 +++--
 testsuite/tiff-suite/ref/out-alt.txt        |  4 ++
 testsuite/tiff-suite/ref/out-alt2.txt       |  4 ++
 testsuite/tiff-suite/ref/out-jpeg9b.txt     |  4 ++
 testsuite/tiff-suite/ref/out-jpeg9d-alt.txt |  4 ++
 testsuite/tiff-suite/ref/out.txt            |  4 ++
 8 files changed, 71 insertions(+), 14 deletions(-)

diff --git a/src/libOpenImageIO/exif.cpp b/src/libOpenImageIO/exif.cpp
index ab95d881ef..c418dec1a0 100644
--- a/src/libOpenImageIO/exif.cpp
+++ b/src/libOpenImageIO/exif.cpp
@@ -260,13 +260,15 @@ print_dir_entry(std::ostream& out, const TagMap& tagmap,
 
     switch (dir.tdir_type) {
     case TIFF_ASCII:
+#    ifdef EXIF_TIFF_UTF8
     case EXIF_TIFF_UTF8:
+#    endif
         OIIO::print(out, "'{}'", string_view(mydata, dir.tdir_count));
         break;
     case TIFF_RATIONAL: {
         const unsigned int* u = (unsigned int*)mydata;
         for (size_t i = 0; i < dir.tdir_count; ++i)
-            OIIO::print(out, "{}/{} = {} ", u[2 * i], << u[2 * i + 1],
+            OIIO::print(out, "{}/{} = {} ", u[2 * i], u[2 * i + 1],
                         (double)u[2 * i] / (double)u[2 * i + 1]);
     } break;
     case TIFF_SRATIONAL: {
@@ -449,7 +451,7 @@ static const TagInfo exif_tag_table[] = {
     { EXIF_SPECTRALSENSITIVITY,"Exif:SpectralSensitivity",	TIFF_ASCII, 0 },
     { EXIF_ISOSPEEDRATINGS,	"Exif:ISOSpeedRatings",	TIFF_SHORT, 1 },
     { EXIF_OECF,	        "Exif:OECF",	TIFF_NOTYPE, 1 },	 // skip it
-    { EXIF_EXIFVERSION,	"Exif:ExifVersion",	TIFF_UNDEFINED, 1, version4char_handler },	 // skip it
+    { EXIF_EXIFVERSION,	"Exif:ExifVersion",	TIFF_UNDEFINED, 1, version4char_handler },
     { EXIF_DATETIMEORIGINAL,	"Exif:DateTimeOriginal",	TIFF_ASCII, 0 },
     { EXIF_DATETIMEDIGITIZED,"Exif:DateTimeDigitized",   TIFF_ASCII, 0 },
     { EXIF_OFFSETTIME,"Exif:OffsetTime",   TIFF_ASCII, 0 },
@@ -475,7 +477,7 @@ static const TagInfo exif_tag_table[] = {
     { EXIF_SUBSECTIME,	"Exif:SubsecTime",	        TIFF_ASCII, 0 },
     { EXIF_SUBSECTIMEORIGINAL,"Exif:SubsecTimeOriginal",	TIFF_ASCII, 0 },
     { EXIF_SUBSECTIMEDIGITIZED,"Exif:SubsecTimeDigitized",	TIFF_ASCII, 0 },
-    { EXIF_FLASHPIXVERSION,	"Exif:FlashPixVersion",	TIFF_UNDEFINED, 1, version4char_handler },	// skip "Exif:FlashPixVesion",	TIFF_NOTYPE, 1 },
+    { EXIF_FLASHPIXVERSION,	"Exif:FlashPixVersion",	TIFF_UNDEFINED, 1, version4char_handler },
     { EXIF_COLORSPACE,	"Exif:ColorSpace",	TIFF_SHORT, 1 },
     { EXIF_PIXELXDIMENSION,	"Exif:PixelXDimension",	TIFF_LONG, 1 },
     { EXIF_PIXELYDIMENSION,	"Exif:PixelYDimension",	TIFF_LONG, 1 },
@@ -1202,7 +1204,7 @@ decode_exif(cspan<uint8_t> exif, ImageSpec& spec)
 
 #if DEBUG_EXIF_READ
     std::cerr << "Exif dump:\n";
-    for (size_t i = 0; i < std::min(200L, exif.size()); ++i) {
+    for (size_t i = 0; i < std::min<size_t>(200, exif.size()); ++i) {
         if ((i % 16) == 0)
             std::cerr << "[" << i << "] ";
         if (exif[i] >= ' ')
diff --git a/src/tiff.imageio/tiffinput.cpp b/src/tiff.imageio/tiffinput.cpp
index 06d1394424..8233de9284 100644
--- a/src/tiff.imageio/tiffinput.cpp
+++ b/src/tiff.imageio/tiffinput.cpp
@@ -280,7 +280,8 @@ class TIFFInput final : public ImageInput {
 
     OIIO_NODISCARD
     bool safe_tiffgetfield(string_view name OIIO_MAYBE_UNUSED, int tag,
-                           TypeDesc expected, void* dest)
+                           TypeDesc expected, void* dest,
+                           const uint32_t* count = nullptr)
     {
         TypeDesc type = tiffgetfieldtype(tag);
         // Caller expects a specific type and the tag doesn't match? Punt.
@@ -295,6 +296,11 @@ class TIFFInput final : public ImageInput {
         int readcount = TIFFFieldReadCount(field);
         if (!passcount && readcount > 0) {
             return TIFFGetField(m_tif, tag, dest);
+        } else if (passcount && readcount <= 0) {
+            uint32_t mycount = 0;
+            if (!count)
+                count = &mycount;
+            return TIFFGetField(m_tif, tag, count, dest);
         }
         // OIIO::debugfmt(" stgf {} tag {} {} datatype {} passcount {} readcount {}\n",
         //                name, tag, type, int(TIFFFieldDataType(field)), passcount, readcount);
@@ -396,21 +402,48 @@ class TIFFInput final : public ImageInput {
     // add it in the obvious way to m_spec under the name 'oiioname'.
     void find_tag(int tifftag, TIFFDataType tifftype, string_view oiioname)
     {
+        if (tifftype == TIFF_NOTYPE)
+            return;  // TIFF_NOTYPE marks a tag we deliberately skip
         auto info = find_field(tifftag, tifftype);
         if (!info) {
             // Something has gone wrong, libtiff doesn't think the field type
             // is the same as we do.
             return;
         }
-        if (tifftype == TIFF_ASCII)
+        tifftype  = TIFFFieldDataType(info);
+        int count = TIFFFieldReadCount(info);
+        if (tifftype == TIFF_ASCII) {
             get_string_attribute(oiioname, tifftag);
-        else if (tifftype == TIFF_SHORT)
+            return;
+        } else if (tifftype == TIFF_SHORT) {
             get_short_attribute(oiioname, tifftag);
-        else if (tifftype == TIFF_LONG)
+            return;
+        } else if (tifftype == TIFF_LONG) {
             get_int_attribute(oiioname, tifftag);
-        else if (tifftype == TIFF_RATIONAL || tifftype == TIFF_SRATIONAL
-                 || tifftype == TIFF_FLOAT || tifftype == TIFF_DOUBLE)
+            return;
+        } else if (tifftype == TIFF_RATIONAL || tifftype == TIFF_SRATIONAL
+                   || tifftype == TIFF_FLOAT || tifftype == TIFF_DOUBLE) {
             get_float_attribute(oiioname, tifftag);
+            return;
+        }
+        // special cases follow
+        if (tifftype == TIFF_UNDEFINED) {
+            if ((tifftag == EXIF_EXIFVERSION || tifftag == EXIF_FLASHPIXVERSION)
+                && count == 4) {
+                char* ptr = nullptr;
+                if (safe_tiffgetfield(oiioname, tifftag, TypeUnknown, &ptr)
+                    && ptr && ptr[0]) {
+                    std::string str(ptr, 4);
+                    m_spec.attribute(oiioname, str);
+                }
+                return;
+            }
+        }
+#if 0
+        print("Unhandled TIFF tag {} type {} count {} pass {} for {}\n",
+              tifftag, int(tifftype), count, TIFFFieldPassCount(info),
+              oiioname);
+#endif
     }
 
     // If we're at scanline y, where does the next strip start?
diff --git a/src/tiff.imageio/tiffoutput.cpp b/src/tiff.imageio/tiffoutput.cpp
index 5852d7d810..eb0291f1f1 100644
--- a/src/tiff.imageio/tiffoutput.cpp
+++ b/src/tiff.imageio/tiffoutput.cpp
@@ -1202,16 +1202,18 @@ TIFFOutput::write_exif_data()
                 handled = true;
             }
             if (!handled) {
-#    ifndef NDEBUG
+#    if 0
                 print("Unhandled EXIF {} ({}) / tag {} tifftype {} count {}\n",
                       p.name(), p.type(), tag, tifftype, count);
 #    endif
             }
             // NOTE: We are not handling arrays of values, just scalars.
             if (!ok) {
-                // print(
-                //     "Error handling EXIF {} ({}) / tag {} tifftype {} count {}\n",
-                //     p.name(), p.type(), tag, tifftype, count);
+#    if 0
+                print(
+                    "Error handling EXIF {} ({}) / tag {} tifftype {} count {}\n",
+                    p.name(), p.type(), tag, tifftype, count);
+#    endif
             }
         }
     }
diff --git a/testsuite/tiff-suite/ref/out-alt.txt b/testsuite/tiff-suite/ref/out-alt.txt
index 20a7ce2efb..5a65b8c5b9 100644
--- a/testsuite/tiff-suite/ref/out-alt.txt
+++ b/testsuite/tiff-suite/ref/out-alt.txt
@@ -68,9 +68,11 @@ Reading ../oiio-images/libtiffpic/dscf0013.tif
     Exif:ColorSpace: 1
     Exif:DateTimeDigitized: "2004:11:10 00:00:31"
     Exif:DateTimeOriginal: "2004:11:10 00:00:31"
+    Exif:ExifVersion: "0210"
     Exif:ExposureBiasValue: 0
     Exif:ExposureProgram: 2 (normal program)
     Exif:Flash: 1 (flash fired)
+    Exif:FlashPixVersion: "0100"
     Exif:FocalLength: 7.4 (7.4 mm)
     Exif:FocalPlaneResolutionUnit: 3 (cm)
     Exif:FocalPlaneXResolution: 847
@@ -238,10 +240,12 @@ Reading ../oiio-images/libtiffpic/pc260001.tif
     Exif:DateTimeDigitized: "2005:12:26 17:09:35"
     Exif:DateTimeOriginal: "2005:12:26 17:09:35"
     Exif:DigitalZoomRatio: 0
+    Exif:ExifVersion: "0221"
     Exif:ExposureBiasValue: 0
     Exif:ExposureMode: 0 (auto)
     Exif:ExposureProgram: 2 (normal program)
     Exif:Flash: 89 (flash fired, auto flash, red-eye reduction)
+    Exif:FlashPixVersion: "0100"
     Exif:FocalLength: 17.8 (17.8 mm)
     Exif:LightSource: 0 (unknown)
     Exif:MaxApertureValue: 3 (f/2.8)
diff --git a/testsuite/tiff-suite/ref/out-alt2.txt b/testsuite/tiff-suite/ref/out-alt2.txt
index 3a14b41bc5..ad3878cc23 100644
--- a/testsuite/tiff-suite/ref/out-alt2.txt
+++ b/testsuite/tiff-suite/ref/out-alt2.txt
@@ -68,9 +68,11 @@ Reading ../oiio-images/libtiffpic/dscf0013.tif
     Exif:ColorSpace: 1
     Exif:DateTimeDigitized: "2004:11:10 00:00:31"
     Exif:DateTimeOriginal: "2004:11:10 00:00:31"
+    Exif:ExifVersion: "0210"
     Exif:ExposureBiasValue: 0
     Exif:ExposureProgram: 2 (normal program)
     Exif:Flash: 1 (flash fired)
+    Exif:FlashPixVersion: "0100"
     Exif:FocalLength: 7.4 (7.4 mm)
     Exif:FocalPlaneResolutionUnit: 3 (cm)
     Exif:FocalPlaneXResolution: 847
@@ -238,10 +240,12 @@ Reading ../oiio-images/libtiffpic/pc260001.tif
     Exif:DateTimeDigitized: "2005:12:26 17:09:35"
     Exif:DateTimeOriginal: "2005:12:26 17:09:35"
     Exif:DigitalZoomRatio: 0
+    Exif:ExifVersion: "0221"
     Exif:ExposureBiasValue: 0
     Exif:ExposureMode: 0 (auto)
     Exif:ExposureProgram: 2 (normal program)
     Exif:Flash: 89 (flash fired, auto flash, red-eye reduction)
+    Exif:FlashPixVersion: "0100"
     Exif:FocalLength: 17.8 (17.8 mm)
     Exif:LightSource: 0 (unknown)
     Exif:MaxApertureValue: 3 (f/2.8)
diff --git a/testsuite/tiff-suite/ref/out-jpeg9b.txt b/testsuite/tiff-suite/ref/out-jpeg9b.txt
index 354a70c52e..494a935621 100644
--- a/testsuite/tiff-suite/ref/out-jpeg9b.txt
+++ b/testsuite/tiff-suite/ref/out-jpeg9b.txt
@@ -68,9 +68,11 @@ Reading ../oiio-images/libtiffpic/dscf0013.tif
     Exif:ColorSpace: 1
     Exif:DateTimeDigitized: "2004:11:10 00:00:31"
     Exif:DateTimeOriginal: "2004:11:10 00:00:31"
+    Exif:ExifVersion: "0210"
     Exif:ExposureBiasValue: 0
     Exif:ExposureProgram: 2 (normal program)
     Exif:Flash: 1 (flash fired)
+    Exif:FlashPixVersion: "0100"
     Exif:FocalLength: 7.4 (7.4 mm)
     Exif:FocalPlaneResolutionUnit: 3 (cm)
     Exif:FocalPlaneXResolution: 847
@@ -238,10 +240,12 @@ Reading ../oiio-images/libtiffpic/pc260001.tif
     Exif:DateTimeDigitized: "2005:12:26 17:09:35"
     Exif:DateTimeOriginal: "2005:12:26 17:09:35"
     Exif:DigitalZoomRatio: 0
+    Exif:ExifVersion: "0221"
     Exif:ExposureBiasValue: 0
     Exif:ExposureMode: 0 (auto)
     Exif:ExposureProgram: 2 (normal program)
     Exif:Flash: 89 (flash fired, auto flash, red-eye reduction)
+    Exif:FlashPixVersion: "0100"
     Exif:FocalLength: 17.8 (17.8 mm)
     Exif:LightSource: 0 (unknown)
     Exif:MaxApertureValue: 3 (f/2.8)
diff --git a/testsuite/tiff-suite/ref/out-jpeg9d-alt.txt b/testsuite/tiff-suite/ref/out-jpeg9d-alt.txt
index fb5da67d19..423c4a2218 100644
--- a/testsuite/tiff-suite/ref/out-jpeg9d-alt.txt
+++ b/testsuite/tiff-suite/ref/out-jpeg9d-alt.txt
@@ -68,9 +68,11 @@ Reading ../oiio-images/libtiffpic/dscf0013.tif
     Exif:ColorSpace: 1
     Exif:DateTimeDigitized: "2004:11:10 00:00:31"
     Exif:DateTimeOriginal: "2004:11:10 00:00:31"
+    Exif:ExifVersion: "0210"
     Exif:ExposureBiasValue: 0
     Exif:ExposureProgram: 2 (normal program)
     Exif:Flash: 1 (flash fired)
+    Exif:FlashPixVersion: "0100"
     Exif:FocalLength: 7.4 (7.4 mm)
     Exif:FocalPlaneResolutionUnit: 3 (cm)
     Exif:FocalPlaneXResolution: 847
@@ -238,10 +240,12 @@ Reading ../oiio-images/libtiffpic/pc260001.tif
     Exif:DateTimeDigitized: "2005:12:26 17:09:35"
     Exif:DateTimeOriginal: "2005:12:26 17:09:35"
     Exif:DigitalZoomRatio: 0
+    Exif:ExifVersion: "0221"
     Exif:ExposureBiasValue: 0
     Exif:ExposureMode: 0 (auto)
     Exif:ExposureProgram: 2 (normal program)
     Exif:Flash: 89 (flash fired, auto flash, red-eye reduction)
+    Exif:FlashPixVersion: "0100"
     Exif:FocalLength: 17.8 (17.8 mm)
     Exif:LightSource: 0 (unknown)
     Exif:MaxApertureValue: 3 (f/2.8)
diff --git a/testsuite/tiff-suite/ref/out.txt b/testsuite/tiff-suite/ref/out.txt
index 30a318664a..a28ac71677 100644
--- a/testsuite/tiff-suite/ref/out.txt
+++ b/testsuite/tiff-suite/ref/out.txt
@@ -68,9 +68,11 @@ Reading ../oiio-images/libtiffpic/dscf0013.tif
     Exif:ColorSpace: 1
     Exif:DateTimeDigitized: "2004:11:10 00:00:31"
     Exif:DateTimeOriginal: "2004:11:10 00:00:31"
+    Exif:ExifVersion: "0210"
     Exif:ExposureBiasValue: 0
     Exif:ExposureProgram: 2 (normal program)
     Exif:Flash: 1 (flash fired)
+    Exif:FlashPixVersion: "0100"
     Exif:FocalLength: 7.4 (7.4 mm)
     Exif:FocalPlaneResolutionUnit: 3 (cm)
     Exif:FocalPlaneXResolution: 847
@@ -238,10 +240,12 @@ Reading ../oiio-images/libtiffpic/pc260001.tif
     Exif:DateTimeDigitized: "2005:12:26 17:09:35"
     Exif:DateTimeOriginal: "2005:12:26 17:09:35"
     Exif:DigitalZoomRatio: 0
+    Exif:ExifVersion: "0221"
     Exif:ExposureBiasValue: 0
     Exif:ExposureMode: 0 (auto)
     Exif:ExposureProgram: 2 (normal program)
     Exif:Flash: 89 (flash fired, auto flash, red-eye reduction)
+    Exif:FlashPixVersion: "0100"
     Exif:FocalLength: 17.8 (17.8 mm)
     Exif:LightSource: 0 (unknown)
     Exif:MaxApertureValue: 3 (f/2.8)

From 055c618a1c25982b54be11f7553e09061702647c Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Thu, 19 Feb 2026 10:28:06 -0800
Subject: [PATCH 58/70] fix(oiiotool): Fix expression BOTTOM when there are
 exactly two images (#5046)

Fixes #5044

Oops, the logic was a little mixed up when there were exactly two
images. One reason that this was a special case is that conceptually,
there is just a stack, but the implementation is that there is a
separate variable for the top item, and then the actual stack is all the
other items.

Also add more thorough testing of TOP/BOTTOM, including what happens for
2, 1, and also 0 items on the image stack (errors in that last case).

Signed-off-by: Larry Gritz <lg@larrygritz.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 src/oiiotool/expressions.cpp           |  2 +-
 testsuite/oiiotool-control/ref/out.txt | 16 +++++++++++++++-
 testsuite/oiiotool-control/run.py      | 19 ++++++++++++++++++-
 3 files changed, 34 insertions(+), 3 deletions(-)

diff --git a/src/oiiotool/expressions.cpp b/src/oiiotool/expressions.cpp
index fbb8568666..edc68fa88f 100644
--- a/src/oiiotool/expressions.cpp
+++ b/src/oiiotool/expressions.cpp
@@ -156,7 +156,7 @@ Oiiotool::express_parse_atom(const string_view expr, string_view& s,
         if (Strutil::parse_prefix(s, "TOP")) {
             img = curimg;
         } else if (Strutil::parse_prefix(s, "BOTTOM")) {
-            img = (image_stack.size() <= 1) ? curimg : image_stack[0];
+            img = image_stack.empty() ? curimg : image_stack[0];
         } else if (Strutil::parse_prefix(s, "IMG[")) {
             std::string until_bracket = Strutil::parse_until(s, "]");
             if (until_bracket.empty() || !Strutil::parse_char(s, ']')) {
diff --git a/testsuite/oiiotool-control/ref/out.txt b/testsuite/oiiotool-control/ref/out.txt
index 999e3d598b..c8fed296cf 100644
--- a/testsuite/oiiotool-control/ref/out.txt
+++ b/testsuite/oiiotool-control/ref/out.txt
@@ -1,5 +1,19 @@
-Stack holds [0] = d.tif, [1] = c.tif, [2] = b.tif
+Stack holds [0] = d.tif, [1] = c.tif, [2] = b.tif , [3] = a.tif
 TOP = d.tif, BOTTOM = a.tif
+Stack holds [0] = b.tif, [1] = a.tif
+TOP = b.tif, BOTTOM = a.tif
+Stack holds [0] = a.tif
+TOP = a.tif, BOTTOM = a.tif
+Stack is empty
+oiiotool ERROR: expression : not a valid image at char 4 of 'TOP.filename'
+Full command line was:
+> oiiotool --echo "Stack is empty" --echo "TOP = {TOP.filename}"
+TOP = TOP.filename
+Stack is empty
+oiiotool ERROR: expression : not a valid image at char 7 of 'BOTTOM.filename'
+Full command line was:
+> oiiotool --echo "Stack is empty" --echo "BOTTOM = {BOTTOM.filename}"
+BOTTOM = BOTTOM.filename
 Stack bottom to top:
   a.tif
   b.tif
diff --git a/testsuite/oiiotool-control/run.py b/testsuite/oiiotool-control/run.py
index 51c9a03f1b..c165df5fed 100755
--- a/testsuite/oiiotool-control/run.py
+++ b/testsuite/oiiotool-control/run.py
@@ -23,9 +23,26 @@
 # Test TOP, BOTTOM, IMG[]
 # TOP should be c.tif, BOTTOM should be a.tif
 command += oiiotool ("a.tif b.tif c.tif d.tif " +
-                     "--echo \"Stack holds [0] = {IMG[0].filename}, [1] = {IMG[1].filename}, [2] = {IMG[2].filename}\" " +
+                     "--echo \"Stack holds [0] = {IMG[0].filename}, [1] = {IMG[1].filename}, [2] = {IMG[2].filename} , [3] = {IMG[3].filename}\" " +
+                     "--echo \"TOP = {TOP.filename}, BOTTOM = {BOTTOM.filename}\" "
+                     )
+# Regression test (Issue #5044): make sure BOTTOM works correctly for 0-2 images
+command += oiiotool ("a.tif b.tif " +
+                     "--echo \"Stack holds [0] = {IMG[0].filename}, [1] = {IMG[1].filename}\" " +
+                     "--echo \"TOP = {TOP.filename}, BOTTOM = {BOTTOM.filename}\" "
+                     )
+command += oiiotool ("a.tif " +
+                     "--echo \"Stack holds [0] = {IMG[0].filename}\" " +
                      "--echo \"TOP = {TOP.filename}, BOTTOM = {BOTTOM.filename}\" "
                      )
+# Empty -- should get an error about TOP and BOTTOM not being available
+command += oiiotool ("--echo \"Stack is empty\" " +
+                     "--echo \"TOP = {TOP.filename}\" "
+                     )
+command += oiiotool ("--echo \"Stack is empty\" " +
+                     "--echo \"BOTTOM = {BOTTOM.filename}\" "
+                     )
+
 # Test --pop, --popbottom, --stackreverse, --stackclear, --stackextract
 command += oiiotool (
       "a.tif b.tif c.tif d.tif "

From 03dddc1178a03afd40c409cbb762d8ae5654d02c Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Thu, 19 Feb 2026 22:47:59 -0800
Subject: [PATCH 59/70] build: self-builder logic fixes for deep vs shallow
 clones (#5034)

* cmake utility build_dependency_with_cmake was unconditionally doing a
shallow clone and using `clone -b`, but that only works if it's got a
branch or tag name, not if it has a commit hash. So change the logic so
it does a shallow clone only if GIT_TAG is specified but GIT_COMMIT is
not.
* pybind11 self-builder is modified to allow a git commit override.

Signed-off-by: Larry Gritz <lg@larrygritz.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 src/cmake/build_pybind11.cmake   |  7 +++++--
 src/cmake/dependency_utils.cmake | 10 +++++-----
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/src/cmake/build_pybind11.cmake b/src/cmake/build_pybind11.cmake
index e7a6f3e282..b6c827f764 100644
--- a/src/cmake/build_pybind11.cmake
+++ b/src/cmake/build_pybind11.cmake
@@ -8,9 +8,12 @@
 
 set_cache (pybind11_BUILD_VERSION 3.0.1 "pybind11 version for local builds")
 set (pybind11_GIT_REPOSITORY "https://github.com/pybind/pybind11")
-set (pybind11_GIT_TAG "v${pybind11_BUILD_VERSION}")
+set_cache (pybind11_GIT_TAG "v${pybind11_BUILD_VERSION}"
+           "pybind11 git tag to checkout")
+set_cache (pybind11_GIT_COMMIT ""
+           "pybind11 specific commit to checkout (overrides tag if set)")
 set_cache (pybind11_BUILD_SHARED_LIBS ${LOCAL_BUILD_SHARED_LIBS_DEFAULT}
-           DOC "Should a local pybind11 build, if necessary, build shared libraries" ADVANCED)
+           "Should a local pybind11 build, if necessary, build shared libraries" ADVANCED)
 
 string (MAKE_C_IDENTIFIER ${pybind11_BUILD_VERSION} pybind11_VERSION_IDENT)
 
diff --git a/src/cmake/dependency_utils.cmake b/src/cmake/dependency_utils.cmake
index 25a1172555..c6468663c1 100644
--- a/src/cmake/dependency_utils.cmake
+++ b/src/cmake/dependency_utils.cmake
@@ -609,7 +609,7 @@ macro (build_dependency_with_cmake pkgname)
         # noValueKeywords:
         "NOINSTALL"
         # singleValueKeywords:
-        "GIT_REPOSITORY;GIT_TAG;GIT_COMMIT;VERSION;SOURCE_SUBDIR;GIT_SHALLOW;QUIET"
+        "GIT_REPOSITORY;GIT_TAG;GIT_COMMIT;VERSION;SOURCE_SUBDIR;QUIET"
         # multiValueKeywords:
         "CMAKE_ARGS"
         # argsToParse:
@@ -629,8 +629,10 @@ macro (build_dependency_with_cmake pkgname)
 
     unset (${pkgname}_GIT_CLONE_ARGS)
     unset (_pkg_exec_quiet)
-    if (_pkg_GIT_SHALLOW OR "${_pkg_GIT_SHALLOW}" STREQUAL "")
-        list (APPEND ${pkgname}_GIT_CLONE_ARGS --depth 1)
+    if (NOT "${_pkg_GIT_TAG}" STREQUAL "" AND "${_pkg_GIT_COMMIT}" STREQUAL "")
+        # Only a tag/branch was given (no commit hash): `clone -b` can take
+        # it directly, so a shallow clone is enough.
+        list (APPEND ${pkgname}_GIT_CLONE_ARGS -b ${_pkg_GIT_TAG} --depth 1)
     endif ()
     if (_pkg_QUIET OR "${_pkg_QUIET}" STREQUAL "")
         list (APPEND ${pkgname}_GIT_CLONE_ARGS -q ERROR_VARIABLE ${pkgname}_clone_errors)
@@ -641,12 +643,10 @@ macro (build_dependency_with_cmake pkgname)
     find_package (Git REQUIRED)
     if (NOT IS_DIRECTORY ${${pkgname}_LOCAL_SOURCE_DIR})
         message (STATUS "COMMAND ${GIT_EXECUTABLE} clone ${_pkg_GIT_REPOSITORY} "
-                                "-b ${_pkg_GIT_TAG} "
                                 "${${pkgname}_LOCAL_SOURCE_DIR} "
                                 "${${pkgname}_GIT_CLONE_ARGS} "
                         "${_pkg_exec_quiet}")
         execute_process(COMMAND ${GIT_EXECUTABLE} clone ${_pkg_GIT_REPOSITORY}
-                                -b ${_pkg_GIT_TAG}
                                 ${${pkgname}_LOCAL_SOURCE_DIR}
                                 ${${pkgname}_GIT_CLONE_ARGS}
                         ${_pkg_exec_quiet})

From 9b58893ac5fa291f50b19ff42f0ea5f0a84f284c Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Sat, 21 Feb 2026 14:06:07 -0800
Subject: [PATCH 60/70] test: imageinout_test: add benchmark of read and write
 speed vs tile size (#5037)

For various tile sizes (and scanline), benchmark how long it takes to
read and write a 4k x 2k image.

Signed-off-by: Larry Gritz <lg@larrygritz.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 .github/workflows/ci.yml               |  3 +
 src/libOpenImageIO/imageinout_test.cpp | 96 +++++++++++++++++++++++---
 2 files changed, 88 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 8947ea5682..00d8d1548e 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -316,6 +316,7 @@ jobs:
             container: aswf/ci-oiio:2025
             cxx_std: 17
             build_type: Debug
+            ctest_test_timeout: "240"
             python_ver: "3.11"
             simd: "avx2,f16c"
             fmt_ver: 11.2.0
@@ -697,6 +698,7 @@ jobs:
             vsver: 2022
             generator: "Visual Studio 17 2022"
             python_ver: "3.12"
+            ctest_test_timeout: "240"
             setenvs: export OPENIMAGEIO_PYTHON_LOAD_DLLS_FROM_PATH=1
           - desc: Windows-2025 VS2022
             runner: windows-2025
@@ -704,5 +706,6 @@ jobs:
             vsver: 2022
             generator: "Visual Studio 17 2022"
             python_ver: "3.12"
+            ctest_test_timeout: "240"
             setenvs: export OPENIMAGEIO_PYTHON_LOAD_DLLS_FROM_PATH=1
             benchmark: 1
diff --git a/src/libOpenImageIO/imageinout_test.cpp b/src/libOpenImageIO/imageinout_test.cpp
index bcf4b072b8..98cf25cc49 100644
--- a/src/libOpenImageIO/imageinout_test.cpp
+++ b/src/libOpenImageIO/imageinout_test.cpp
@@ -101,8 +101,8 @@ make_test_image(string_view formatname)
 
 static bool
 checked_write(ImageOutput* out, string_view filename, const ImageSpec& spec,
-              TypeDesc type, const void* data, bool do_asserts = true,
-              std::string* errmsg          = nullptr,
+              TypeDesc type, image_span<const std::byte> data,
+              bool do_asserts = true, std::string* errmsg = nullptr,
               Filesystem::IOProxy* ioproxy = nullptr)
 {
     if (errmsg)
@@ -117,7 +117,7 @@ checked_write(ImageOutput* out, string_view filename, const ImageSpec& spec,
         if (errmsg)
             *errmsg = OIIO::geterror();
         else
-            std::cout << "      " << OIIO::geterror() << "\n";
+            print("      {}\n", OIIO::geterror());
         return false;
     }
 
@@ -131,19 +131,27 @@ checked_write(ImageOutput* out, string_view filename, const ImageSpec& spec,
 
 static bool
 checked_read(ImageInput* in, string_view filename,
-             std::vector<unsigned char>& data, bool already_opened = false,
-             bool do_asserts = true, std::string* errmsg = nullptr)
+             std::vector<unsigned char>& data, TypeDesc datatype = TypeFloat,
+             bool already_opened = false, bool do_asserts = true,
+             std::string* errmsg = nullptr)
 {
     if (errmsg)
         *errmsg = "";
+    std::unique_ptr<ImageInput> in_local;
+    if (!in) {
+        in_local       = ImageInput::create(filename);
+        in             = in_local.get();
+        already_opened = false;
+    }
+    OIIO_CHECK_ASSERT(in && "Failed to create input");
     if (!already_opened) {
         ImageSpec spec;
         CHECKED(in, open(filename, spec));
     }
     data.resize(in->spec().image_pixels() * in->spec().nchannels
-                * sizeof(float));
+                * datatype.size());
     CHECKED(in,
-            read_image(0, 0, 0, in->spec().nchannels, TypeFloat, data.data()));
+            read_image(0, 0, 0, in->spec().nchannels, datatype, data.data()));
     CHECKED(in, close());
     return true;
 }
@@ -164,7 +172,8 @@ test_write_proxy(string_view formatname, string_view extension,
     // Use ImageOutput.write_image interface to write to outproxy
     Filesystem::IOVecOutput outproxy;
     ok = checked_write(nullptr, disk_filename, buf.spec(), buf.spec().format,
-                       buf.localpixels(), true, nullptr, &outproxy);
+                       buf.localpixels_as_writable_byte_image_span(), true,
+                       nullptr, &outproxy);
 
     // Use ImageBuf.write interface to write to outproxybuf
     Filesystem::IOVecOutput outproxybuf;
@@ -281,7 +290,7 @@ test_read_proxy(string_view formatname, string_view extension,
     OIIO_CHECK_ASSERT(in && "Failed to open input with proxy");
     if (in) {
         std::vector<unsigned char> readpixels;
-        ok &= checked_read(in.get(), memname, readpixels, true);
+        ok &= checked_read(in.get(), memname, readpixels, TypeFloat, true);
         OIIO_ASSERT(readpixels.size() == nvalues * sizeof(float));
         ok &= test_pixel_match({ (const float*)readpixels.data(), nvalues },
                                { (const float*)buf.localpixels(), nvalues },
@@ -331,7 +340,8 @@ test_write_unwritable(string_view extension, const ImageBuf& buf)
     if (badout) {
         std::string errmsg;
         ok = checked_write(badout.get(), bad_filename, buf.spec(),
-                           buf.spec().format, buf.localpixels(),
+                           buf.spec().format,
+                           buf.localpixels_as_byte_image_span(),
                            /*do_asserts=*/false, &errmsg);
         if (!ok)
             std::cout << term.ansi("green", "OK") << " ("
@@ -391,7 +401,7 @@ test_all_formats()
 
         std::cout << "    Writing " << filename << " ... ";
         ok = checked_write(out.get(), filename, buf.spec(), buf.spec().format,
-                           orig_pixels);
+                           buf.localpixels_as_writable_byte_image_span());
         if (ok)
             std::cout << term.ansi("green", "OK\n");
 
@@ -531,6 +541,68 @@ test_read_tricky_sizes()
 
 
+void
+benchmark_tile_sizes(string_view extension, TypeDesc datatype,
+                     int tilestart = 4)
+{
+    const int test_res = 4096;
+    std::vector<int> tile_sizes;
+    for (int ts = tilestart; ts <= test_res / 2; ts *= 2)
+        tile_sizes.push_back(ts);
+    ImageSpec test_image_spec(4096, 2048, 4, datatype);
+    ImageBuf buf(test_image_spec);
+    static float colors[4][4] = { { 0.1f, 0.1f, 0.1f, 1.0f },
+                                  { 1.0f, 0.0f, 0.0f, 1.0f },
+                                  { 0.0f, 1.0f, 0.0f, 1.0f },
+                                  { 0.0f, 0.0f, 1.0f, 1.0f } };
+    // ImageBufAlgo::fill(buf, make_cspan(colors[1], 4));
+    ImageBufAlgo::fill(buf, colors[0], colors[1], colors[2], colors[3]);
+    buf.write(Strutil::format("test.{}", extension));
+
+    Benchmarker bench;
+    bench.units(Benchmarker::Unit::ms);
+    bench.iterations(1);
+    bench.trials(5);
+    print("\nBenchmarking write/read for {} under different tile sizes\n",
+          extension);
+
+    // Write a scanline file
+    auto scanline_filename = Strutil::format("test_scanline.{}", extension);
+    bench(Strutil::format("  write {} scanline ", extension), [&]() {
+        checked_write(nullptr, scanline_filename, test_image_spec, datatype,
+                      buf.localpixels_as_byte_image_span());
+    });
+
+    // Write tiled files of different sizes
+    for (auto ts : tile_sizes) {
+        test_image_spec.tile_width  = ts;
+        test_image_spec.tile_height = ts;
+        test_image_spec.tile_depth  = ts ? 1 : 0;
+        auto filename = Strutil::format("test_tile_{:04}.{}", ts, extension);
+        bench(Strutil::format("  write {} tile {}", extension, ts), [&]() {
+            checked_write(nullptr, filename, test_image_spec, datatype,
+                          buf.localpixels_as_byte_image_span());
+        });
+    }
+
+    // read the scanline file (and delete it when we're done)
+    std::vector<unsigned char> readbuffer(test_image_spec.image_bytes());
+    bench(Strutil::format("  read {} scanline ", extension), [&]() {
+        checked_read(nullptr, scanline_filename, readbuffer, datatype);
+    });
+    Filesystem::remove(scanline_filename);
+
+    // read the tiled files of different sizes (and delete when done)
+    for (auto ts : tile_sizes) {
+        auto filename = Strutil::format("test_tile_{:04}.{}", ts, extension);
+        bench(Strutil::format("  read {} tile {}", extension, ts),
+              [&]() { checked_read(nullptr, filename, readbuffer, datatype); });
+        Filesystem::remove(filename);
+    }
+}
+
+
+
 int
 main(int argc, char* argv[])
 {
@@ -549,6 +621,8 @@ main(int argc, char* argv[])
 
     test_all_formats();
     test_read_tricky_sizes();
+    benchmark_tile_sizes("exr", TypeHalf, 4);
+    benchmark_tile_sizes("tif", TypeUInt16, 16);
 
     return unit_test_failures;
 }

From 6517f07694d14341211dcbf3421c022e014c9bee Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Sat, 21 Feb 2026 14:06:32 -0800
Subject: [PATCH 61/70] build: Remove support for deprecated Intel icc compiler
 (#5040)

Intel icc is deprecated and hasn't had a release for a few years. It's
holding us back, both by making us work around an ever growing number of
icc bugs and limitations that will never be fixed, as well as not
allowing us to upgrade minimum versions of certain dependencies, because
icc can't correctly compile newer versions (as an example, it cannot use
a 'fmt' library newer than the oldest we support, 7.0).

So it's time to thank icc for its service and put it on the ice floe for
the polar bears to eat. This is of course in main (future 3.2), and will
not be backported to release branches, since we never stop support of a
dependency or toolchain of existing releases. People requiring icc for
whatever reason may keep using OIIO 3.1 or older.

We will continue to support and test icx, the fully supported Intel
LLVM-based compiler.

---------

Signed-off-by: Larry Gritz <lg@larrygritz.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 .github/workflows/ci.yml              |  49 +-
 CHANGES.md                            |   1 +
 INSTALL.md                            |   4 +-
 src/build-scripts/gh-installdeps.bash |   9 +-
 src/include/OpenImageIO/benchmark.h   |   2 +-
 src/include/OpenImageIO/bit.h         |  38 +-
 src/include/OpenImageIO/image_span.h  |   3 -
 src/include/OpenImageIO/platform.h    |  20 +-
 src/include/OpenImageIO/simd.h        |  60 --
 src/libOpenImageIO/formatspec.cpp     |   1 -
 src/libOpenImageIO/imageio.cpp        |   4 +-
 src/libutil/sysutil.cpp               |   2 -
 testsuite/tiff-depths/ref/out-icc.txt | 876 --------------------------
 13 files changed, 30 insertions(+), 1039 deletions(-)
 delete mode 100644 testsuite/tiff-depths/ref/out-icc.txt

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 00d8d1548e..bc24a1179b 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -241,39 +241,6 @@ jobs:
             pybind11_ver: v2.10.0
             setenvs: export PUGIXML_VERSION=v1.13
             optional_deps_append: 'LibRaw;Ptex;Qt6'
-          - desc: VFX2023 icc/C++17 py3.10 exr3.1 ocio2.3 qt5.15
-            nametag: linux-vfx2023.icc
-            runner: ubuntu-latest
-            container: aswf/ci-osl:2023
-            opencolorio_ver: v2.3.0
-            python_ver: "3.10"
-            # simd: "avx2,f16c"
-            fmt_ver: 7.1.3
-            # icc MUST use this older FMT version
-            pybind11_ver: v2.9.0
-            setenvs: export USE_ICC=1 USE_OPENVDB=0 USE_OPENCV=0
-                            OIIO_EXTRA_CPP_ARGS="-fp-model=precise"
-                            FREETYPE_VERSION=VER-2-13-0
-                            DISABLE_libuhdr=1
-            # For icc, use fp-model precise to eliminate needless LSB errors
-            # that make test results differ from other platforms.
-            optional_deps_append: "LibRaw;Ptex;Qt6"
-          - desc: VFX2025 icx/C++17 py3.11 exr3.3 ocio2.4 qt5.15
-            nametag: linux-vfx2023.icx
-            runner: ubuntu-latest
-            container: aswf/ci-oiio:2025
-            cc_compiler: icx
-            cxx_compiler: icpx
-            fmt_ver: 11.2.0
-            python_ver: "3.11"
-            pybind11_ver: v2.13.6
-            simd: "avx2,f16c"
-            benchmark: 1
-            setenvs: export USE_OPENVDB=0 USE_OPENCV=0
-                            UHDR_CMAKE_C_COMPILER=gcc
-                            UHDR_CMAKE_CXX_COMPILER=g++
-            # Building libuhdr with icx results in test failures
-            optional_deps_append: "LibRaw;Ptex;openjph;Qt6"
           - desc: VFX2024 gcc11/C++17 py3.11 exr3.2 ocio2.3
             nametag: linux-vfx2024
             runner: ubuntu-latest
@@ -336,6 +303,22 @@ jobs:
           #   setenvs: export PUGIXML_VERSION=v1.15
           #                   BUILD_SHARED_LIBS=OFF
           #   optional_deps_append: "openjph;Qt6"
+          - desc: VFX2025 icx/C++17 py3.11 exr3.3 ocio2.4 qt5.15
+            nametag: linux-vfx2025.icx
+            runner: ubuntu-latest
+            container: aswf/ci-oiio:2025
+            cc_compiler: icx
+            cxx_compiler: icpx
+            fmt_ver: 11.2.0
+            python_ver: "3.11"
+            pybind11_ver: v2.13.6
+            simd: "avx2,f16c"
+            benchmark: 1
+            setenvs: export USE_OPENVDB=0 USE_OPENCV=0
+                            UHDR_CMAKE_C_COMPILER=gcc
+                            UHDR_CMAKE_CXX_COMPILER=g++
+            # Building libuhdr with icx results in test failures
+            optional_deps_append: "LibRaw;Ptex;openjph;Qt6"
           - desc: VFX2026 gcc14/C++20 py3.13 exr3.4 ocio2.4
             nametag: linux-vfx2026
             runner: ubuntu-latest
diff --git a/CHANGES.md b/CHANGES.md
index df7e647fea..8344cea2e9 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -2,6 +2,7 @@ Release 3.2 (target: Sept 2026?) -- compared to 3.1
 ---------------------------------------------------
 
 ### New minimum dependencies and compatibility changes:
+  - The deprecated icc compiler is no longer supported. (3.2.0.0)
 ### ⛰️  New features and public API changes:
 * *New image file format support:*
 * *oiiotool new features and major improvements*:
diff --git a/INSTALL.md b/INSTALL.md
index a52378c5ba..8e96afc2b1 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -17,8 +17,8 @@ NEW or CHANGED MINIMUM dependencies since the last major release are **bold**.
  * C++17 or higher (also builds with C++20 and C++23)
      * The default build mode is C++17. This can be controlled by via the
        CMake configuration flag: `-DCMAKE_CXX_STANDARD=20`, etc.
- * Compilers: gcc 9.3 - 14.2, **clang 10** - 20, MSVS 2017 - 2022 (v19.14
-   and up), Intel icc 19+, Intel OneAPI C++ compiler 2022+.
+ * Compilers: gcc 9.3 - 14.2, clang 10 - 20, MSVS 2017 - 2022 (v19.14
+   and up), Intel OneAPI C++ compiler 2022+.
  * CMake >= 3.18.2 (tested through 4.1)
  * Imath >= 3.1 (tested through 3.2 and main)
  * OpenEXR >= 3.1 (tested through 3.4 and main)
diff --git a/src/build-scripts/gh-installdeps.bash b/src/build-scripts/gh-installdeps.bash
index 0dabcf61e0..5ce3e0afee 100755
--- a/src/build-scripts/gh-installdeps.bash
+++ b/src/build-scripts/gh-installdeps.bash
@@ -69,14 +69,7 @@ if [[ "$ASWF_ORG" != ""  ]] ; then
         time pip3 install ${PIP_INSTALLS} || true
     fi
 
-    if [[ "$CXX" == "icpc" || "$CC" == "icc" || "$USE_ICC" != "" ]] ; then
-        # Lock down icc to 2022.1 because newer versions hosted on the Intel
-        # repo require a glibc too new for the ASWF CentOS7-based containers
-        # we run CI on.
-        sudo cp src/build-scripts/oneAPI.repo /etc/yum.repos.d
-        sudo /usr/bin/yum install -y intel-oneapi-compiler-dpcpp-cpp-and-cpp-classic-2022.1.0.x86_64
-        set +e; source /opt/intel/oneapi/setvars.sh --config oneapi_2022.1.0.cfg; set -e
-    elif [[ "$CXX" == "icpc" || "$CC" == "icc" || "$USE_ICC" != "" || "$CXX" == "icpx" || "$CC" == "icx" || "$USE_ICX" != "" ]] ; then
+    if [[ "$CXX" == "icpx" || "$CC" == "icx" || "$USE_ICX" != "" ]] ; then
         sudo cp src/build-scripts/oneAPI.repo /etc/yum.repos.d
         sudo yum install -y intel-oneapi-compiler-dpcpp-cpp-and-cpp-classic
         # If we needed to lock down to a particular version, we could:
diff --git a/src/include/OpenImageIO/benchmark.h b/src/include/OpenImageIO/benchmark.h
index bfcb8b8235..47727a7513 100644
--- a/src/include/OpenImageIO/benchmark.h
+++ b/src/include/OpenImageIO/benchmark.h
@@ -15,7 +15,7 @@
 #include <OpenImageIO/timer.h>
 
 
-#if (((OIIO_GNUC_VERSION && NDEBUG) || OIIO_CLANG_VERSION >= 30500 || OIIO_APPLE_CLANG_VERSION >= 70000 || defined(__INTEL_COMPILER)  || defined(__INTEL_LLVM_COMPILER)) \
+#if (((OIIO_GNUC_VERSION && NDEBUG) || OIIO_CLANG_VERSION >= 30500 || OIIO_APPLE_CLANG_VERSION >= 70000 || defined(__INTEL_LLVM_COMPILER)) \
       && (defined(__x86_64__) || defined(__i386__))) \
     || defined(_MSC_VER)
 #define OIIO_DONOTOPT_FORECINLINE OIIO_FORCEINLINE
diff --git a/src/include/OpenImageIO/bit.h b/src/include/OpenImageIO/bit.h
index 08526f3b85..520f35ba93 100644
--- a/src/include/OpenImageIO/bit.h
+++ b/src/include/OpenImageIO/bit.h
@@ -37,40 +37,6 @@ bitcast(const From& from) noexcept
     return result;
 }
 
-#if defined(__INTEL_COMPILER)
-// For Intel icc, using the memcpy implementation above will cause a loop with
-// a bitcast to fail to vectorize, but using the intrinsics below will allow
-// it to vectorize. For icx, as well as gcc and clang, the same optimal code
-// is generated (even in a vectorized loop) for memcpy. We can probably remove
-// these intrinsics once we drop support for icc.
-template<>
-OIIO_NODISCARD OIIO_FORCEINLINE uint32_t
-bitcast<uint32_t, float>(const float& val) noexcept
-{
-    return static_cast<uint32_t>(_castf32_u32(val));
-}
-
-template<>
-OIIO_NODISCARD OIIO_FORCEINLINE int32_t
-bitcast<int32_t, float>(const float& val) noexcept
-{
-    return static_cast<int32_t>(_castf32_u32(val));
-}
-
-template<>
-OIIO_NODISCARD OIIO_FORCEINLINE float
-bitcast<float, uint32_t>(const uint32_t& val) noexcept
-{
-    return _castu32_f32(val);
-}
-
-template<>
-OIIO_NODISCARD OIIO_FORCEINLINE float
-bitcast<float, int32_t>(const int32_t& val) noexcept
-{
-    return _castu32_f32(val);
-}
-#endif
 
 
 OIIO_NODISCARD OIIO_FORCEINLINE OIIO_HOSTDEVICE int
@@ -112,9 +78,7 @@ byteswap(T n)
 
 
-#if (OIIO_GNUC_VERSION || OIIO_ANY_CLANG     \
-     || OIIO_INTEL_CLASSIC_COMPILER_VERSION) \
-    && !defined(__CUDACC__)
+#if (OIIO_GNUC_VERSION || OIIO_ANY_CLANG) && !defined(__CUDACC__)
 // CPU gcc and compatible can use these intrinsics, 8-15x faster
 
 template<>
diff --git a/src/include/OpenImageIO/image_span.h b/src/include/OpenImageIO/image_span.h
index 0ee7d2dcaf..3cea3215ee 100644
--- a/src/include/OpenImageIO/image_span.h
+++ b/src/include/OpenImageIO/image_span.h
@@ -292,9 +292,6 @@ template<typename T, size_t Rank = 4> class image_span {
             return (T*)((char*)data() + c * chanstride() + x * xstride()
                         + y * ystride() + z * zstride());
         }
-#ifdef __INTEL_COMPILER
-        return nullptr;  // should never get here, but icc is confused
-#endif
     }
 
     /// Return a pointer to the value at channel 0, pixel (x,y,z).
diff --git a/src/include/OpenImageIO/platform.h b/src/include/OpenImageIO/platform.h
index aeba989947..a2b134da4b 100644
--- a/src/include/OpenImageIO/platform.h
+++ b/src/include/OpenImageIO/platform.h
@@ -345,8 +345,6 @@
 #    define OIIO_ALIGN(size) __attribute__((aligned(size)))
 #elif defined(_MSC_VER)
 #    define OIIO_ALIGN(size) __declspec(align(size))
-#elif defined(__INTEL_COMPILER)
-#    define OIIO_ALIGN(size) __declspec(align((size)))
 #else
 #    define OIIO_ALIGN(size) alignas(size)
 #endif
@@ -370,7 +368,7 @@
 //     if (OIIO_UNLIKELY(x)) ...   // if you think x will rarely be true
 // Caveat: Programmers are notoriously bad at guessing this, so it
 // should be used only with thorough benchmarking.
-#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
+#if defined(__GNUC__) || defined(__clang__)
 #    define OIIO_LIKELY(x) (__builtin_expect(bool(x), true))
 #    define OIIO_UNLIKELY(x) (__builtin_expect(bool(x), false))
 #else
@@ -387,7 +385,7 @@
 #    define OIIO_FORCEINLINE __inline__
 #elif defined(__GNUC__) || defined(__clang__) || __has_attribute(always_inline)
 #    define OIIO_FORCEINLINE inline __attribute__((always_inline))
-#elif defined(_MSC_VER) || defined(__INTEL_COMPILER)
+#elif defined(_MSC_VER)
 #    define OIIO_FORCEINLINE __forceinline
 #else
 #    define OIIO_FORCEINLINE inline
@@ -399,7 +397,7 @@
 // optimizations by knowing that calling the function cannot possibly alter
 // any other memory. This declaration goes after the function declaration:
 //   int blah (int arg) OIIO_PURE_FUNC;
-#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER) || __has_attribute(pure)
+#if defined(__GNUC__) || defined(__clang__) || __has_attribute(pure)
 #    define OIIO_PURE_FUNC __attribute__((pure))
 #elif defined(_MSC_VER)
 #    define OIIO_PURE_FUNC /* seems not supported by MSVS */
@@ -413,7 +411,7 @@
 // no side effects. This is even more strict than 'pure', and allows even
 // more optimizations (such as eliminating multiple calls to the function
 // that have the exact same argument values).
-#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER) || __has_attribute(const)
+#if defined(__GNUC__) || defined(__clang__) || __has_attribute(const)
 #    define OIIO_CONST_FUNC __attribute__((const))
 #elif defined(_MSC_VER)
 #    define OIIO_CONST_FUNC /* seems not supported by MSVS */
@@ -430,7 +428,7 @@
 // OIIO_RESTRICT is a parameter attribute that indicates a promise that the
 // parameter definitely will not alias any other parameters in such a way
 // that creates a data dependency. Use with caution!
-#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) || defined(__INTEL_COMPILER)
+#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)
 #  define OIIO_RESTRICT __restrict
 #else
 #  define OIIO_RESTRICT
@@ -474,7 +472,7 @@
 // false positives that you can't easily get rid of.
 // This should work for any clang >= 3.3 and gcc >= 4.8, which are
 // guaranteed by our minimum requirements.
-#if defined(__clang__) || (OIIO_GNUC_VERSION > 90000 && !defined(__INTEL_COMPILER)) \
+#if defined(__clang__) || OIIO_GNUC_VERSION > 90000 \
                        || __has_attribute(no_sanitize_address)
 #    define OIIO_NO_SANITIZE_ADDRESS __attribute__((no_sanitize_address))
 #else
@@ -485,8 +483,7 @@
 // OIIO_NO_SANITIZE_UNDEFINED can be used to mark a function that you don't
 // want undefined behavior sanitizer to catch. Only use this if you know there
 // are false positives that you can't easily get rid of.
-#if defined(__clang__) || (OIIO_GNUC_VERSION > 90000 && !defined(__INTEL_COMPILER)) \
-                       || __has_attribute(no_sanitize)
+#if defined(__clang__) || OIIO_GNUC_VERSION > 90000 || __has_attribute(no_sanitize)
 #    define OIIO_NO_SANITIZE_UNDEFINED __attribute__((no_sanitize("undefined")))
 #else
 #    define OIIO_NO_SANITIZE_UNDEFINED
@@ -628,10 +625,7 @@ template <typename T, class... Args>
 inline T* aligned_new(Args&&... args) {
     static_assert(alignof(T) > alignof(void*), "Type doesn't seem to be over-aligned, aligned_new is not required");
     void* ptr = aligned_malloc(sizeof(T), alignof(T));
-    OIIO_PRAGMA_WARNING_PUSH
-    OIIO_INTEL_PRAGMA(warning disable 873)
     return ptr ? new (ptr) T(std::forward<Args>(args)...) : nullptr;
-    OIIO_PRAGMA_WARNING_POP
 }
 
 template <typename T>
diff --git a/src/include/OpenImageIO/simd.h b/src/include/OpenImageIO/simd.h
index 9bb0b97093..37d8476729 100644
--- a/src/include/OpenImageIO/simd.h
+++ b/src/include/OpenImageIO/simd.h
@@ -1020,12 +1020,7 @@ class vint4 {
     vint4& operator=(int a) { load(a); return *this; }
 
     /// Assignment from another vint4
-#if !defined(__INTEL_COMPILER)
     vint4& operator=(const vint4& other) = default;
-#else
-    // For explanation of the necessity of this, see implementation comment.
-    vint4& operator=(const vint4& other);
-#endif
 
     /// Component access (get)
     int operator[] (int i) const;
@@ -1314,12 +1309,7 @@ class vint8 {
     vint8& operator=(int a) { load(a); return *this; }
 
     /// Assignment from another vint8
-#if !defined(__INTEL_COMPILER)
     vint8& operator=(const vint8& other) = default;
-#else
-    // For explanation of the necessity of this, see implementation comment.
-    vint8& operator=(const vint8& other);
-#endif
 
     /// Component access (get)
     int operator[] (int i) const;
@@ -1614,12 +1604,7 @@ class vint16 {
     vint16& operator=(int a) { load(a); return *this; }
 
     /// Assignment from another vint16
-#if !defined(__INTEL_COMPILER)
     vint16& operator=(const vint16& other) = default;
-#else
-    // For explanation of the necessity of this, see implementation comment.
-    vint16& operator=(const vint16& other);
-#endif
 
     /// Component access (get)
     int operator[] (int i) const;
@@ -4106,18 +4091,6 @@ OIIO_FORCEINLINE bool none (const vbool16& v) { return reduce_or(v) == false; }
 //////////////////////////////////////////////////////////////////////
 // vint4 implementation
 
-#if defined(__INTEL_COMPILER)
-// For reasons we don't understand, all sorts of failures crop up only on icc
-// if we make this =default. Although we still support icc for now, it's a
-// discontinued compiler, so we special-case it here rather than spend a lot
-// of time investigating what might be broken (and would of course never be
-// fixed if it's a compiler bug).
-OIIO_FORCEINLINE vint4& vint4::operator=(const vint4& other) {
-    m_simd = other.m_simd;
-    return *this;
-}
-#endif
-
 OIIO_FORCEINLINE int vint4::operator[] (int i) const {
     OIIO_DASSERT(i<elements);
     return m_val[i];
@@ -5004,18 +4977,6 @@ OIIO_FORCEINLINE vint4 safe_mod (const vint4& a, int b) {
 //////////////////////////////////////////////////////////////////////
 // vint8 implementation
 
-#if defined(__INTEL_COMPILER)
-// For reasons we don't understand, all sorts of failures crop up only on icc
-// if we make this =default. Although we still support icc for now, it's a
-// discontinued compiler, so we special-case it here rather than spend a lot
-// of time investigating what might be broken (and would of course never be
-// fixed if it's a compiler bug).
-OIIO_FORCEINLINE vint8& vint8::operator=(const vint8& other) {
-    m_simd = other.m_simd;
-    return *this;
-}
-#endif
-
 OIIO_FORCEINLINE int vint8::operator[] (int i) const {
     OIIO_DASSERT(i<elements);
     return m_val[i];
@@ -5834,18 +5795,6 @@ OIIO_FORCEINLINE vint8 safe_mod (const vint8& a, int b) {
 //////////////////////////////////////////////////////////////////////
 // vint16 implementation
 
-#if defined(__INTEL_COMPILER)
-// For reasons we don't understand, all sorts of failures crop up only on icc
-// if we make this =default. Although we still support icc for now, it's a
-// discontinued compiler, so we special-case it here rather than spend a lot
-// of time investigating what might be broken (and would of course never be
-// fixed if it's a compiler bug).
-OIIO_FORCEINLINE vint16& vint16::operator=(const vint16& other) {
-    m_simd = other.m_simd;
-    return *this;
-}
-#endif
-
 OIIO_FORCEINLINE int vint16::operator[] (int i) const {
     OIIO_DASSERT(i<elements);
     return m_val[i];
@@ -10307,15 +10256,6 @@ template<> struct fmt::formatter<OIIO::simd::matrix44>
     : OIIO::pvt::array_formatter<OIIO::simd::matrix44, float, 16> {};
 
 
-// Allow C++ metaprogramming to understand that the simd types are trivially
-// copyable (i.e. memcpy to copy simd types is fine).
-#if defined(__INTEL_COMPILER)
-// Necessary because we have to define the vint types copy constructors on icc
-template<> struct std::is_trivially_copyable<OIIO::simd::vint4> : std::true_type {};
-template<> struct std::is_trivially_copyable<OIIO::simd::vint8> : std::true_type {};
-template<> struct std::is_trivially_copyable<OIIO::simd::vint16> : std::true_type {};
-#endif
-
 
 #undef SIMD_DO
 #undef SIMD_CONSTRUCT
diff --git a/src/libOpenImageIO/formatspec.cpp b/src/libOpenImageIO/formatspec.cpp
index 81374b9624..efd9eff586 100644
--- a/src/libOpenImageIO/formatspec.cpp
+++ b/src/libOpenImageIO/formatspec.cpp
@@ -41,7 +41,6 @@ inline void
 get_default_quantize_(long long& quant_min, long long& quant_max) noexcept
 {
     OIIO_PRAGMA_WARNING_PUSH
-    OIIO_INTEL_PRAGMA(warning disable 173)
     if (std::numeric_limits<T>::is_integer) {
         quant_min = (long long)std::numeric_limits<T>::min();
         quant_max = (long long)std::numeric_limits<T>::max();
diff --git a/src/libOpenImageIO/imageio.cpp b/src/libOpenImageIO/imageio.cpp
index 4873c0f09e..f8b5afa7af 100644
--- a/src/libOpenImageIO/imageio.cpp
+++ b/src/libOpenImageIO/imageio.cpp
@@ -221,9 +221,7 @@ oiio_build_compiler()
     using Strutil::fmt::format;
 
     std::string comp;
-#if OIIO_INTEL_CLASSIC_COMPILER_VERSION
-    comp = format("Intel icc {}", OIIO_INTEL_CLASSIC_COMPILER_VERSION);
-#elif OIIO_INTEL_LLVM_COMPILER
+#if OIIO_INTEL_LLVM_COMPILER
     comp = format("Intel icx {}.{}", __clang_major__, __clang_minor__);
 #elif OIIO_APPLE_CLANG_VERSION
     comp = format("Apple clang {}.{}", __clang_major__, __clang_minor__);
diff --git a/src/libutil/sysutil.cpp b/src/libutil/sysutil.cpp
index 03e9127473..95762cd59d 100644
--- a/src/libutil/sysutil.cpp
+++ b/src/libutil/sysutil.cpp
@@ -73,8 +73,6 @@
 #    define HAVE_STACKTRACE 1
 #endif
 
-OIIO_INTEL_PRAGMA(warning disable 2196)
-
 
 OIIO_NAMESPACE_3_1_BEGIN
 
diff --git a/testsuite/tiff-depths/ref/out-icc.txt b/testsuite/tiff-depths/ref/out-icc.txt
deleted file mode 100644
index ae0e53bea4..0000000000
--- a/testsuite/tiff-depths/ref/out-icc.txt
+++ /dev/null
@@ -1,876 +0,0 @@
-Reading ../oiio-images/libtiffpic/depth/flower-minisblack-02.tif
-../oiio-images/libtiffpic/depth/flower-minisblack-02.tif :   73 x   43, 1 channel, uint2 tiff
-    SHA-1: F6BD9D10FB0DD8E9AC62DEBBB743A78FC48D3C9B
-    channel list: Y
-    compression: "none"
-    DocumentName: "flower-minisblack-02.tif"
-    Orientation: 1 (normal)
-    PixelAspectRatio: 1
-    planarconfig: "contig"
-    ResolutionUnit: "in"
-    Software: "GraphicsMagick 1.2 unreleased Q32 http://www.GraphicsMagick.org/"
-    XResolution: 72
-    YResolution: 72
-    oiio:BitsPerSample: 2
-    tiff:Compression: 1
-    tiff:PhotometricInterpretation: 1
-    tiff:PlanarConfiguration: 1
-    tiff:RowsPerStrip: 431
-Comparing "../oiio-images/libtiffpic/depth/flower-minisblack-02.tif" and "flower-minisblack-02.tif"
-PASS
-flower-minisblack-02.tif :   73 x   43, 1 channel, uint2 tiff
-    SHA-1: F6BD9D10FB0DD8E9AC62DEBBB743A78FC48D3C9B
-../oiio-images/libtiffpic/depth/flower-minisblack-02.tif :   73 x   43, 1 channel, uint2 tiff
-    SHA-1: F6BD9D10FB0DD8E9AC62DEBBB743A78FC48D3C9B
-Reading ../oiio-images/libtiffpic/depth/flower-minisblack-04.tif
-../oiio-images/libtiffpic/depth/flower-minisblack-04.tif :   73 x   43, 1 channel, uint4 tiff
-    SHA-1: 8C0CF14B3B585F4B1F249C681BEDEA4CB63E3EDD
-    channel list: Y
-    compression: "none"
-    DocumentName: "flower-minisblack-04.tif"
-    Orientation: 1 (normal)
-    PixelAspectRatio: 1
-    planarconfig: "contig"
-    ResolutionUnit: "in"
-    Software: "GraphicsMagick 1.2 unreleased Q32 http://www.GraphicsMagick.org/"
-    XResolution: 72
-    YResolution: 72
-    oiio:BitsPerSample: 4
-    tiff:Compression: 1
-    tiff:PhotometricInterpretation: 1
-    tiff:PlanarConfiguration: 1
-    tiff:RowsPerStrip: 221
-Comparing "../oiio-images/libtiffpic/depth/flower-minisblack-04.tif" and "flower-minisblack-04.tif"
-PASS
-flower-minisblack-04.tif :   73 x   43, 1 channel, uint4 tiff
-    SHA-1: 8C0CF14B3B585F4B1F249C681BEDEA4CB63E3EDD
-../oiio-images/libtiffpic/depth/flower-minisblack-04.tif :   73 x   43, 1 channel, uint4 tiff
-    SHA-1: 8C0CF14B3B585F4B1F249C681BEDEA4CB63E3EDD
-Reading ../oiio-images/libtiffpic/depth/flower-minisblack-06.tif
-../oiio-images/libtiffpic/depth/flower-minisblack-06.tif :   73 x   43, 1 channel, uint6 tiff
-    SHA-1: AE809BFEF36E3E0047343655231200A916D83492
-    channel list: Y
-    compression: "none"
-    DocumentName: "flower-minisblack-06.tif"
-    Orientation: 1 (normal)
-    PixelAspectRatio: 1
-    planarconfig: "contig"
-    ResolutionUnit: "in"
-    Software: "GraphicsMagick 1.2 unreleased Q32 http://www.GraphicsMagick.org/"
-    XResolution: 72
-    YResolution: 72
-    oiio:BitsPerSample: 6
-    tiff:Compression: 1
-    tiff:PhotometricInterpretation: 1
-    tiff:PlanarConfiguration: 1
-    tiff:RowsPerStrip: 148
-Comparing "../oiio-images/libtiffpic/depth/flower-minisblack-06.tif" and "flower-minisblack-06.tif"
-PASS
-flower-minisblack-06.tif :   73 x   43, 1 channel, uint6 tiff
-    SHA-1: AE809BFEF36E3E0047343655231200A916D83492
-../oiio-images/libtiffpic/depth/flower-minisblack-06.tif :   73 x   43, 1 channel, uint6 tiff
-    SHA-1: AE809BFEF36E3E0047343655231200A916D83492
-Reading ../oiio-images/libtiffpic/depth/flower-minisblack-08.tif
-../oiio-images/libtiffpic/depth/flower-minisblack-08.tif :   73 x   43, 1 channel, uint8 tiff
-    SHA-1: 1A909C8E70CC479D8A35BAA9BFEDDCBF4BF46FDC
-    channel list: Y
-    compression: "none"
-    DocumentName: "flower-minisblack-08.tif"
-    Orientation: 1 (normal)
-    PixelAspectRatio: 1
-    planarconfig: "contig"
-    ResolutionUnit: "in"
-    Software: "GraphicsMagick 1.2 unreleased Q32 http://www.GraphicsMagick.org/"
-    XResolution: 72
-    YResolution: 72
-    oiio:BitsPerSample: 8
-    tiff:Compression: 1
-    tiff:PhotometricInterpretation: 1
-    tiff:PlanarConfiguration: 1
-    tiff:RowsPerStrip: 112
-Comparing "../oiio-images/libtiffpic/depth/flower-minisblack-08.tif" and "flower-minisblack-08.tif"
-PASS
-flower-minisblack-08.tif :   73 x   43, 1 channel, uint8 tiff
-    SHA-1: 1A909C8E70CC479D8A35BAA9BFEDDCBF4BF46FDC
-../oiio-images/libtiffpic/depth/flower-minisblack-08.tif :   73 x   43, 1 channel, uint8 tiff
-    SHA-1: 1A909C8E70CC479D8A35BAA9BFEDDCBF4BF46FDC
-Reading ../oiio-images/libtiffpic/depth/flower-minisblack-10.tif
-../oiio-images/libtiffpic/depth/flower-minisblack-10.tif :   73 x   43, 1 channel, uint10 tiff
-    SHA-1: E9240FEF19CC8EF5EBBF2EE4A10EDF25E51C67B4
-    channel list: Y
-    compression: "none"
-    DocumentName: "flower-minisblack-10.tif"
-    Orientation: 1 (normal)
-    PixelAspectRatio: 1
-    planarconfig: "contig"
-    ResolutionUnit: "in"
-    Software: "GraphicsMagick 1.2 unreleased Q32 http://www.GraphicsMagick.org/"
-    XResolution: 72
-    YResolution: 72
-    oiio:BitsPerSample: 10
-    tiff:Compression: 1
-    tiff:PhotometricInterpretation: 1
-    tiff:PlanarConfiguration: 1
-    tiff:RowsPerStrip: 89
-Comparing "../oiio-images/libtiffpic/depth/flower-minisblack-10.tif" and "flower-minisblack-10.tif"
-PASS
-flower-minisblack-10.tif :   73 x   43, 1 channel, uint10 tiff
-    SHA-1: E9240FEF19CC8EF5EBBF2EE4A10EDF25E51C67B4
-../oiio-images/libtiffpic/depth/flower-minisblack-10.tif :   73 x   43, 1 channel, uint10 tiff
-    SHA-1: E9240FEF19CC8EF5EBBF2EE4A10EDF25E51C67B4
-Reading ../oiio-images/libtiffpic/depth/flower-minisblack-12.tif
-../oiio-images/libtiffpic/depth/flower-minisblack-12.tif :   73 x   43, 1 channel, uint12 tiff
-    SHA-1: AAE977957ED6AAC647967192A74E4AD55FF75811
-    channel list: Y
-    compression: "none"
-    DocumentName: "flower-minisblack-12.tif"
-    Orientation: 1 (normal)
-    PixelAspectRatio: 1
-    planarconfig: "contig"
-    ResolutionUnit: "in"
-    Software: "GraphicsMagick 1.2 unreleased Q32 http://www.GraphicsMagick.org/"
-    XResolution: 72
-    YResolution: 72
-    oiio:BitsPerSample: 12
-    tiff:Compression: 1
-    tiff:PhotometricInterpretation: 1
-    tiff:PlanarConfiguration: 1
-    tiff:RowsPerStrip: 74
-Comparing "../oiio-images/libtiffpic/depth/flower-minisblack-12.tif" and "flower-minisblack-12.tif"
-PASS
-flower-minisblack-12.tif :   73 x   43, 1 channel, uint12 tiff
-    SHA-1: AAE977957ED6AAC647967192A74E4AD55FF75811
-../oiio-images/libtiffpic/depth/flower-minisblack-12.tif :   73 x   43, 1 channel, uint12 tiff
-    SHA-1: AAE977957ED6AAC647967192A74E4AD55FF75811
-Reading ../oiio-images/libtiffpic/depth/flower-minisblack-14.tif
-../oiio-images/libtiffpic/depth/flower-minisblack-14.tif :   73 x   43, 1 channel, uint14 tiff
-    SHA-1: C1B9CA21C227EF11626EF0C58BD49769EEF48363
-    channel list: Y
-    compression: "none"
-    DocumentName: "flower-minisblack-14.tif"
-    Orientation: 1 (normal)
-    PixelAspectRatio: 1
-    planarconfig: "contig"
-    ResolutionUnit: "in"
-    Software: "GraphicsMagick 1.2 unreleased Q32 http://www.GraphicsMagick.org/"
-    XResolution: 72
-    YResolution: 72
-    oiio:BitsPerSample: 14
-    tiff:Compression: 1
-    tiff:PhotometricInterpretation: 1
-    tiff:PlanarConfiguration: 1
-    tiff:RowsPerStrip: 64
-Comparing "../oiio-images/libtiffpic/depth/flower-minisblack-14.tif" and "flower-minisblack-14.tif"
-PASS
-flower-minisblack-14.tif :   73 x   43, 1 channel, uint14 tiff
-    SHA-1: C1B9CA21C227EF11626EF0C58BD49769EEF48363
-../oiio-images/libtiffpic/depth/flower-minisblack-14.tif :   73 x   43, 1 channel, uint14 tiff
-    SHA-1: C1B9CA21C227EF11626EF0C58BD49769EEF48363
-Reading ../oiio-images/libtiffpic/depth/flower-minisblack-16.tif
-../oiio-images/libtiffpic/depth/flower-minisblack-16.tif :   73 x   43, 1 channel, uint16 tiff
-    SHA-1: 7EBB74E46C869CA0D6D091183732214B6A75173A
-    channel list: Y
-    compression: "none"
-    DocumentName: "flower-minisblack-16.tif"
-    Orientation: 1 (normal)
-    PixelAspectRatio: 1
-    planarconfig: "contig"
-    ResolutionUnit: "in"
-    Software: "GraphicsMagick 1.2 unreleased Q32 http://www.GraphicsMagick.org/"
-    XResolution: 72
-    YResolution: 72
-    oiio:BitsPerSample: 16
-    tiff:Compression: 1
-    tiff:PhotometricInterpretation: 1
-    tiff:PlanarConfiguration: 1
-    tiff:RowsPerStrip: 56
-Comparing "../oiio-images/libtiffpic/depth/flower-minisblack-16.tif" and "flower-minisblack-16.tif"
-PASS
-flower-minisblack-16.tif :   73 x   43, 1 channel, uint16 tiff
-    SHA-1: 7EBB74E46C869CA0D6D091183732214B6A75173A
-../oiio-images/libtiffpic/depth/flower-minisblack-16.tif :   73 x   43, 1 channel, uint16 tiff
-    SHA-1: 7EBB74E46C869CA0D6D091183732214B6A75173A
-Reading ../oiio-images/libtiffpic/depth/flower-minisblack-24.tif
-../oiio-images/libtiffpic/depth/flower-minisblack-24.tif :   73 x   43, 1 channel, uint24 tiff
-    SHA-1: BBFA6633ECF3FF686DB36F6DD00F8A359D2B1DAF
-    channel list: Y
-    compression: "none"
-    DocumentName: "flower-minisblack-24.tif"
-    Orientation: 1 (normal)
-    PixelAspectRatio: 1
-    planarconfig: "contig"
-    ResolutionUnit: "in"
-    Software: "GraphicsMagick 1.2 unreleased Q8 http://www.GraphicsMagick.org/"
-    XResolution: 72
-    YResolution: 72
-    oiio:BitsPerSample: 24
-    tiff:Compression: 1
-    tiff:PhotometricInterpretation: 1
-    tiff:PlanarConfiguration: 1
-    tiff:RowsPerStrip: 37
-Comparing "../oiio-images/libtiffpic/depth/flower-minisblack-24.tif" and "flower-minisblack-24.tif"
-PASS
-flower-minisblack-24.tif :   73 x   43, 1 channel, uint24 tiff
-    SHA-1: BBFA6633ECF3FF686DB36F6DD00F8A359D2B1DAF
-../oiio-images/libtiffpic/depth/flower-minisblack-24.tif :   73 x   43, 1 channel, uint24 tiff
-    SHA-1: BBFA6633ECF3FF686DB36F6DD00F8A359D2B1DAF
-Reading ../oiio-images/libtiffpic/depth/flower-minisblack-32.tif
-../oiio-images/libtiffpic/depth/flower-minisblack-32.tif :   73 x   43, 1 channel, uint tiff
-    SHA-1: C98FB1125C7210E380E3F86DFCAEFF49A16742E0
-    channel list: Y
-    compression: "none"
-    DocumentName: "flower-minisblack-32.tif"
-    Orientation: 1 (normal)
-    PixelAspectRatio: 1
-    planarconfig: "contig"
-    ResolutionUnit: "in"
-    Software: "GraphicsMagick 1.2 unreleased Q32 http://www.GraphicsMagick.org/"
-    XResolution: 72
-    YResolution: 72
-    oiio:BitsPerSample: 32
-    tiff:Compression: 1
-    tiff:PhotometricInterpretation: 1
-    tiff:PlanarConfiguration: 1
-    tiff:RowsPerStrip: 28
-Comparing "../oiio-images/libtiffpic/depth/flower-minisblack-32.tif" and "flower-minisblack-32.tif"
-PASS
-flower-minisblack-32.tif :   73 x   43, 1 channel, uint tiff
-    SHA-1: C98FB1125C7210E380E3F86DFCAEFF49A16742E0
-../oiio-images/libtiffpic/depth/flower-minisblack-32.tif :   73 x   43, 1 channel, uint tiff
-    SHA-1: C98FB1125C7210E380E3F86DFCAEFF49A16742E0
-Reading ../oiio-images/libtiffpic/depth/flower-palette-02.tif
-../oiio-images/libtiffpic/depth/flower-palette-02.tif :   73 x   43, 3 channel, uint8 tiff
-    SHA-1: 52B3033465AA01129BAE149FF96CBB49877DAB7C
-    channel list: R, G, B
-    compression: "none"
-    DocumentName: "flower-palette-02.tif"
-    Orientation: 1 (normal)
-    PixelAspectRatio: 1
-    planarconfig: "contig"
-    ResolutionUnit: "in"
-    Software: "GraphicsMagick 1.2 unreleased Q32 http://www.GraphicsMagick.org/"
-    XResolution: 72
-    YResolution: 72
-    oiio:BitsPerSample: 8
-    tiff:BitsPerSample: 2
-    tiff:ColorSpace: "palette"
-    tiff:Compression: 1
-    tiff:PhotometricInterpretation: 3
-    tiff:PlanarConfiguration: 1
-    tiff:RowsPerStrip: 431
-Comparing "../oiio-images/libtiffpic/depth/flower-palette-02.tif" and "flower-palette-02.tif"
-PASS
-flower-palette-02.tif :   73 x   43, 3 channel, uint8 tiff
-    SHA-1: 52B3033465AA01129BAE149FF96CBB49877DAB7C
-../oiio-images/libtiffpic/depth/flower-palette-02.tif :   73 x   43, 3 channel, uint8 tiff
-    SHA-1: 52B3033465AA01129BAE149FF96CBB49877DAB7C
-Reading ../oiio-images/libtiffpic/depth/flower-palette-04.tif
-../oiio-images/libtiffpic/depth/flower-palette-04.tif :   73 x   43, 3 channel, uint8 tiff
-    SHA-1: C6E40A3D134F1A29E153FE15459D8DE657CB7F9C
-    channel list: R, G, B
-    compression: "none"
-    DocumentName: "flower-palette-04.tif"
-    Orientation: 1 (normal)
-    PixelAspectRatio: 1
-    planarconfig: "contig"
-    ResolutionUnit: "in"
-    Software: "GraphicsMagick 1.2 unreleased Q32 http://www.GraphicsMagick.org/"
-    XResolution: 72
-    YResolution: 72
-    oiio:BitsPerSample: 8
-    tiff:BitsPerSample: 4
-    tiff:ColorSpace: "palette"
-    tiff:Compression: 1
-    tiff:PhotometricInterpretation: 3
-    tiff:PlanarConfiguration: 1
-    tiff:RowsPerStrip: 221
-Comparing "../oiio-images/libtiffpic/depth/flower-palette-04.tif" and "flower-palette-04.tif"
-PASS
-flower-palette-04.tif :   73 x   43, 3 channel, uint8 tiff
-    SHA-1: C6E40A3D134F1A29E153FE15459D8DE657CB7F9C
-../oiio-images/libtiffpic/depth/flower-palette-04.tif :   73 x   43, 3 channel, uint8 tiff
-    SHA-1: C6E40A3D134F1A29E153FE15459D8DE657CB7F9C
-Reading ../oiio-images/libtiffpic/depth/flower-palette-08.tif
-../oiio-images/libtiffpic/depth/flower-palette-08.tif :   73 x   43, 3 channel, uint8 tiff
-    SHA-1: 0ADA355BABFE9866F3D88AF7CA3AAC69D7DC036D
-    channel list: R, G, B
-    compression: "none"
-    DocumentName: "flower-palette-08.tif"
-    Orientation: 1 (normal)
-    PixelAspectRatio: 1
-    planarconfig: "contig"
-    ResolutionUnit: "in"
-    Software: "GraphicsMagick 1.2 unreleased Q32 http://www.GraphicsMagick.org/"
-    XResolution: 72
-    YResolution: 72
-    oiio:BitsPerSample: 8
-    tiff:ColorSpace: "palette"
-    tiff:Compression: 1
-    tiff:PhotometricInterpretation: 3
-    tiff:PlanarConfiguration: 1
-    tiff:RowsPerStrip: 112
-Comparing "../oiio-images/libtiffpic/depth/flower-palette-08.tif" and "flower-palette-08.tif"
-PASS
-flower-palette-08.tif :   73 x   43, 3 channel, uint8 tiff
-    SHA-1: 0ADA355BABFE9866F3D88AF7CA3AAC69D7DC036D
-../oiio-images/libtiffpic/depth/flower-palette-08.tif :   73 x   43, 3 channel, uint8 tiff
-    SHA-1: 0ADA355BABFE9866F3D88AF7CA3AAC69D7DC036D
-Reading ../oiio-images/libtiffpic/depth/flower-palette-16.tif
-../oiio-images/libtiffpic/depth/flower-palette-16.tif :   73 x   43, 3 channel, uint8 tiff
-    SHA-1: 543285C6812105A1DA3B8ADA691D5DA3AE89B10D
-    channel list: R, G, B
-    compression: "none"
-    DocumentName: "flower-palette-16.tif"
-    Orientation: 1 (normal)
-    PixelAspectRatio: 1
-    planarconfig: "contig"
-    ResolutionUnit: "in"
-    Software: "GraphicsMagick 1.2 unreleased Q32 http://www.GraphicsMagick.org/"
-    XResolution: 72
-    YResolution: 72
-    oiio:BitsPerSample: 16
-    tiff:ColorSpace: "palette"
-    tiff:Compression: 1
-    tiff:PhotometricInterpretation: 3
-    tiff:PlanarConfiguration: 1
-    tiff:RowsPerStrip: 56
-Comparing "../oiio-images/libtiffpic/depth/flower-palette-16.tif" and "flower-palette-16.tif"
-PASS
-flower-palette-16.tif :   73 x   43, 3 channel, uint8 tiff
-    SHA-1: 543285C6812105A1DA3B8ADA691D5DA3AE89B10D
-../oiio-images/libtiffpic/depth/flower-palette-16.tif :   73 x   43, 3 channel, uint8 tiff
-    SHA-1: 543285C6812105A1DA3B8ADA691D5DA3AE89B10D
-Reading ../oiio-images/libtiffpic/depth/flower-rgb-contig-02.tif
-../oiio-images/libtiffpic/depth/flower-rgb-contig-02.tif :   73 x   43, 3 channel, uint2 tiff
-    SHA-1: D68490C8E508DEBECEE4DF3A9E5DA0523CD5C302
-    channel list: R, G, B
-    compression: "none"
-    DocumentName: "flower-rgb-contig-02.tif"
-    Orientation: 1 (normal)
-    PixelAspectRatio: 1
-    planarconfig: "contig"
-    ResolutionUnit: "in"
-    Software: "GraphicsMagick 1.2 unreleased Q32 http://www.GraphicsMagick.org/"
-    XResolution: 72
-    YResolution: 72
-    oiio:BitsPerSample: 2
-    tiff:Compression: 1
-    tiff:PhotometricInterpretation: 2
-    tiff:PlanarConfiguration: 1
-    tiff:RowsPerStrip: 148
-Comparing "../oiio-images/libtiffpic/depth/flower-rgb-contig-02.tif" and "flower-rgb-contig-02.tif"
-PASS
-flower-rgb-contig-02.tif :   73 x   43, 3 channel, uint2 tiff
-    SHA-1: D68490C8E508DEBECEE4DF3A9E5DA0523CD5C302
-../oiio-images/libtiffpic/depth/flower-rgb-contig-02.tif :   73 x   43, 3 channel, uint2 tiff
-    SHA-1: D68490C8E508DEBECEE4DF3A9E5DA0523CD5C302
-Reading ../oiio-images/libtiffpic/depth/flower-rgb-contig-04.tif
-../oiio-images/libtiffpic/depth/flower-rgb-contig-04.tif :   73 x   43, 3 channel, uint4 tiff
-    SHA-1: A5920C9D08B9E25C96D55FDF72F46F589AE7643D
-    channel list: R, G, B
-    compression: "none"
-    DocumentName: "flower-rgb-contig-04.tif"
-    Orientation: 1 (normal)
-    PixelAspectRatio: 1
-    planarconfig: "contig"
-    ResolutionUnit: "in"
-    Software: "GraphicsMagick 1.2 unreleased Q32 http://www.GraphicsMagick.org/"
-    XResolution: 72
-    YResolution: 72
-    oiio:BitsPerSample: 4
-    tiff:Compression: 1
-    tiff:PhotometricInterpretation: 2
-    tiff:PlanarConfiguration: 1
-    tiff:RowsPerStrip: 74
-Comparing "../oiio-images/libtiffpic/depth/flower-rgb-contig-04.tif" and "flower-rgb-contig-04.tif"
-PASS
-flower-rgb-contig-04.tif :   73 x   43, 3 channel, uint4 tiff
-    SHA-1: A5920C9D08B9E25C96D55FDF72F46F589AE7643D
-../oiio-images/libtiffpic/depth/flower-rgb-contig-04.tif :   73 x   43, 3 channel, uint4 tiff
-    SHA-1: A5920C9D08B9E25C96D55FDF72F46F589AE7643D
-Reading ../oiio-images/libtiffpic/depth/flower-rgb-contig-08.tif
-../oiio-images/libtiffpic/depth/flower-rgb-contig-08.tif :   73 x   43, 3 channel, uint8 tiff
-    SHA-1: 35D01526DE5F904B7978B8EA16192A28389E276F
-    channel list: R, G, B
-    compression: "none"
-    DocumentName: "flower-rgb-contig-08.tif"
-    Orientation: 1 (normal)
-    PixelAspectRatio: 1
-    planarconfig: "contig"
-    ResolutionUnit: "in"
-    Software: "GraphicsMagick 1.2 unreleased Q32 http://www.GraphicsMagick.org/"
-    XResolution: 72
-    YResolution: 72
-    oiio:BitsPerSample: 8
-    tiff:Compression: 1
-    tiff:PhotometricInterpretation: 2
-    tiff:PlanarConfiguration: 1
-    tiff:RowsPerStrip: 37
-Comparing "../oiio-images/libtiffpic/depth/flower-rgb-contig-08.tif" and "flower-rgb-contig-08.tif"
-PASS
-flower-rgb-contig-08.tif :   73 x   43, 3 channel, uint8 tiff
-    SHA-1: 35D01526DE5F904B7978B8EA16192A28389E276F
-../oiio-images/libtiffpic/depth/flower-rgb-contig-08.tif :   73 x   43, 3 channel, uint8 tiff
-    SHA-1: 35D01526DE5F904B7978B8EA16192A28389E276F
-Reading ../oiio-images/libtiffpic/depth/flower-rgb-contig-10.tif
-../oiio-images/libtiffpic/depth/flower-rgb-contig-10.tif :   73 x   43, 3 channel, uint10 tiff
-    SHA-1: 0C41DF861699CF536C581721EF17B01D1EFB5D86
-    channel list: R, G, B
-    compression: "none"
-    DocumentName: "flower-rgb-contig-10.tif"
-    Orientation: 1 (normal)
-    PixelAspectRatio: 1
-    planarconfig: "contig"
-    ResolutionUnit: "in"
-    Software: "GraphicsMagick 1.2 unreleased Q32 http://www.GraphicsMagick.org/"
-    XResolution: 72
-    YResolution: 72
-    oiio:BitsPerSample: 10
-    tiff:Compression: 1
-    tiff:PhotometricInterpretation: 2
-    tiff:PlanarConfiguration: 1
-    tiff:RowsPerStrip: 29
-Comparing "../oiio-images/libtiffpic/depth/flower-rgb-contig-10.tif" and "flower-rgb-contig-10.tif"
-PASS
-flower-rgb-contig-10.tif :   73 x   43, 3 channel, uint10 tiff
-    SHA-1: 0C41DF861699CF536C581721EF17B01D1EFB5D86
-../oiio-images/libtiffpic/depth/flower-rgb-contig-10.tif :   73 x   43, 3 channel, uint10 tiff
-    SHA-1: 0C41DF861699CF536C581721EF17B01D1EFB5D86
-Reading ../oiio-images/libtiffpic/depth/flower-rgb-contig-12.tif
-../oiio-images/libtiffpic/depth/flower-rgb-contig-12.tif :   73 x   43, 3 channel, uint12 tiff
-    SHA-1: E61083B50548C7D304A45735452FD05C1814677B
-    channel list: R, G, B
-    compression: "none"
-    DocumentName: "flower-rgb-contig-12.tif"
-    Orientation: 1 (normal)
-    PixelAspectRatio: 1
-    planarconfig: "contig"
-    ResolutionUnit: "in"
-    Software: "GraphicsMagick 1.2 unreleased Q32 http://www.GraphicsMagick.org/"
-    XResolution: 72
-    YResolution: 72
-    oiio:BitsPerSample: 12
-    tiff:Compression: 1
-    tiff:PhotometricInterpretation: 2
-    tiff:PlanarConfiguration: 1
-    tiff:RowsPerStrip: 24
-Comparing "../oiio-images/libtiffpic/depth/flower-rgb-contig-12.tif" and "flower-rgb-contig-12.tif"
-PASS
-flower-rgb-contig-12.tif :   73 x   43, 3 channel, uint12 tiff
-    SHA-1: E61083B50548C7D304A45735452FD05C1814677B
-../oiio-images/libtiffpic/depth/flower-rgb-contig-12.tif :   73 x   43, 3 channel, uint12 tiff
-    SHA-1: E61083B50548C7D304A45735452FD05C1814677B
-Reading ../oiio-images/libtiffpic/depth/flower-rgb-contig-14.tif
-../oiio-images/libtiffpic/depth/flower-rgb-contig-14.tif :   73 x   43, 3 channel, uint14 tiff
-    SHA-1: DD060DA62BB8F5903C5087796B2D05A682BE8ADA
-    channel list: R, G, B
-    compression: "none"
-    DocumentName: "flower-rgb-contig-14.tif"
-    Orientation: 1 (normal)
-    PixelAspectRatio: 1
-    planarconfig: "contig"
-    ResolutionUnit: "in"
-    Software: "GraphicsMagick 1.2 unreleased Q32 http://www.GraphicsMagick.org/"
-    XResolution: 72
-    YResolution: 72
-    oiio:BitsPerSample: 14
-    tiff:Compression: 1
-    tiff:PhotometricInterpretation: 2
-    tiff:PlanarConfiguration: 1
-    tiff:RowsPerStrip: 21
-Comparing "../oiio-images/libtiffpic/depth/flower-rgb-contig-14.tif" and "flower-rgb-contig-14.tif"
-PASS
-flower-rgb-contig-14.tif :   73 x   43, 3 channel, uint14 tiff
-    SHA-1: DD060DA62BB8F5903C5087796B2D05A682BE8ADA
-../oiio-images/libtiffpic/depth/flower-rgb-contig-14.tif :   73 x   43, 3 channel, uint14 tiff
-    SHA-1: DD060DA62BB8F5903C5087796B2D05A682BE8ADA
-Reading ../oiio-images/libtiffpic/depth/flower-rgb-contig-16.tif
-../oiio-images/libtiffpic/depth/flower-rgb-contig-16.tif :   73 x   43, 3 channel, uint16 tiff
-    SHA-1: 19F69706D5C52FC9510A3C20F9A43361FEF2AC9D
-    channel list: R, G, B
-    compression: "none"
-    DocumentName: "flower-rgb-contig-16.tif"
-    Orientation: 1 (normal)
-    PixelAspectRatio: 1
-    planarconfig: "contig"
-    ResolutionUnit: "in"
-    Software: "GraphicsMagick 1.2 unreleased Q32 http://www.GraphicsMagick.org/"
-    XResolution: 72
-    YResolution: 72
-    oiio:BitsPerSample: 16
-    tiff:Compression: 1
-    tiff:PhotometricInterpretation: 2
-    tiff:PlanarConfiguration: 1
-    tiff:RowsPerStrip: 18
-Comparing "../oiio-images/libtiffpic/depth/flower-rgb-contig-16.tif" and "flower-rgb-contig-16.tif"
-PASS
-flower-rgb-contig-16.tif :   73 x   43, 3 channel, uint16 tiff
-    SHA-1: 19F69706D5C52FC9510A3C20F9A43361FEF2AC9D
-../oiio-images/libtiffpic/depth/flower-rgb-contig-16.tif :   73 x   43, 3 channel, uint16 tiff
-    SHA-1: 19F69706D5C52FC9510A3C20F9A43361FEF2AC9D
-Reading ../oiio-images/libtiffpic/depth/flower-rgb-contig-24.tif
-../oiio-images/libtiffpic/depth/flower-rgb-contig-24.tif :   73 x   43, 3 channel, uint24 tiff
-    SHA-1: 6234B3CE28DFDF0FE6B1BCC29F62393696AF79A5
-    channel list: R, G, B
-    compression: "none"
-    DocumentName: "flower-rgb-contig-24.tif"
-    Orientation: 1 (normal)
-    PixelAspectRatio: 1
-    planarconfig: "contig"
-    ResolutionUnit: "in"
-    Software: "GraphicsMagick 1.2 unreleased Q8 http://www.GraphicsMagick.org/"
-    XResolution: 72
-    YResolution: 72
-    oiio:BitsPerSample: 24
-    tiff:Compression: 1
-    tiff:PhotometricInterpretation: 2
-    tiff:PlanarConfiguration: 1
-    tiff:RowsPerStrip: 12
-Comparing "../oiio-images/libtiffpic/depth/flower-rgb-contig-24.tif" and "flower-rgb-contig-24.tif"
-PASS
-flower-rgb-contig-24.tif :   73 x   43, 3 channel, uint24 tiff
-    SHA-1: 6234B3CE28DFDF0FE6B1BCC29F62393696AF79A5
-../oiio-images/libtiffpic/depth/flower-rgb-contig-24.tif :   73 x   43, 3 channel, uint24 tiff
-    SHA-1: 6234B3CE28DFDF0FE6B1BCC29F62393696AF79A5
-Reading ../oiio-images/libtiffpic/depth/flower-rgb-contig-32.tif
-../oiio-images/libtiffpic/depth/flower-rgb-contig-32.tif :   73 x   43, 3 channel, uint tiff
-    SHA-1: 04DAF56E34180687DB7FA12E7EE8EC3A3E40DAB8
-    channel list: R, G, B
-    compression: "none"
-    DocumentName: "flower-rgb-contig-32.tif"
-    Orientation: 1 (normal)
-    PixelAspectRatio: 1
-    planarconfig: "contig"
-    ResolutionUnit: "in"
-    Software: "GraphicsMagick 1.2 unreleased Q32 http://www.GraphicsMagick.org/"
-    XResolution: 72
-    YResolution: 72
-    oiio:BitsPerSample: 32
-    tiff:Compression: 1
-    tiff:PhotometricInterpretation: 2
-    tiff:PlanarConfiguration: 1
-    tiff:RowsPerStrip: 9
-Comparing "../oiio-images/libtiffpic/depth/flower-rgb-contig-32.tif" and "flower-rgb-contig-32.tif"
-PASS
-flower-rgb-contig-32.tif :   73 x   43, 3 channel, uint tiff
-    SHA-1: 04DAF56E34180687DB7FA12E7EE8EC3A3E40DAB8
-../oiio-images/libtiffpic/depth/flower-rgb-contig-32.tif :   73 x   43, 3 channel, uint tiff
-    SHA-1: 04DAF56E34180687DB7FA12E7EE8EC3A3E40DAB8
-Reading ../oiio-images/libtiffpic/depth/flower-rgb-planar-02.tif
-../oiio-images/libtiffpic/depth/flower-rgb-planar-02.tif :   73 x   43, 3 channel, uint2 tiff
-    SHA-1: D68490C8E508DEBECEE4DF3A9E5DA0523CD5C302
-    channel list: R, G, B
-    compression: "none"
-    DocumentName: "flower-rgb-planar-02.tif"
-    Orientation: 1 (normal)
-    PixelAspectRatio: 1
-    planarconfig: "separate"
-    ResolutionUnit: "in"
-    Software: "GraphicsMagick 1.2 unreleased Q32 http://www.GraphicsMagick.org/"
-    XResolution: 72
-    YResolution: 72
-    oiio:BitsPerSample: 2
-    tiff:Compression: 1
-    tiff:PhotometricInterpretation: 2
-    tiff:PlanarConfiguration: 2
-    tiff:RowsPerStrip: 431
-Comparing "../oiio-images/libtiffpic/depth/flower-rgb-planar-02.tif" and "flower-rgb-planar-02.tif"
-PASS
-flower-rgb-planar-02.tif :   73 x   43, 3 channel, uint2 tiff
-    SHA-1: D68490C8E508DEBECEE4DF3A9E5DA0523CD5C302
-../oiio-images/libtiffpic/depth/flower-rgb-planar-02.tif :   73 x   43, 3 channel, uint2 tiff
-    SHA-1: D68490C8E508DEBECEE4DF3A9E5DA0523CD5C302
-Reading ../oiio-images/libtiffpic/depth/flower-rgb-planar-04.tif
-../oiio-images/libtiffpic/depth/flower-rgb-planar-04.tif :   73 x   43, 3 channel, uint4 tiff
-    SHA-1: A5920C9D08B9E25C96D55FDF72F46F589AE7643D
-    channel list: R, G, B
-    compression: "none"
-    DocumentName: "flower-rgb-planar-04.tif"
-    Orientation: 1 (normal)
-    PixelAspectRatio: 1
-    planarconfig: "separate"
-    ResolutionUnit: "in"
-    Software: "GraphicsMagick 1.2 unreleased Q32 http://www.GraphicsMagick.org/"
-    XResolution: 72
-    YResolution: 72
-    oiio:BitsPerSample: 4
-    tiff:Compression: 1
-    tiff:PhotometricInterpretation: 2
-    tiff:PlanarConfiguration: 2
-    tiff:RowsPerStrip: 221
-Comparing "../oiio-images/libtiffpic/depth/flower-rgb-planar-04.tif" and "flower-rgb-planar-04.tif"
-PASS
-flower-rgb-planar-04.tif :   73 x   43, 3 channel, uint4 tiff
-    SHA-1: A5920C9D08B9E25C96D55FDF72F46F589AE7643D
-../oiio-images/libtiffpic/depth/flower-rgb-planar-04.tif :   73 x   43, 3 channel, uint4 tiff
-    SHA-1: A5920C9D08B9E25C96D55FDF72F46F589AE7643D
-Reading ../oiio-images/libtiffpic/depth/flower-rgb-planar-08.tif
-../oiio-images/libtiffpic/depth/flower-rgb-planar-08.tif :   73 x   43, 3 channel, uint8 tiff
-    SHA-1: 35D01526DE5F904B7978B8EA16192A28389E276F
-    channel list: R, G, B
-    compression: "none"
-    DocumentName: "flower-rgb-planar-08.tif"
-    Orientation: 1 (normal)
-    PixelAspectRatio: 1
-    planarconfig: "separate"
-    ResolutionUnit: "in"
-    Software: "GraphicsMagick 1.2 unreleased Q32 http://www.GraphicsMagick.org/"
-    XResolution: 72
-    YResolution: 72
-    oiio:BitsPerSample: 8
-    tiff:Compression: 1
-    tiff:PhotometricInterpretation: 2
-    tiff:PlanarConfiguration: 2
-    tiff:RowsPerStrip: 112
-Comparing "../oiio-images/libtiffpic/depth/flower-rgb-planar-08.tif" and "flower-rgb-planar-08.tif"
-PASS
-flower-rgb-planar-08.tif :   73 x   43, 3 channel, uint8 tiff
-    SHA-1: 35D01526DE5F904B7978B8EA16192A28389E276F
-../oiio-images/libtiffpic/depth/flower-rgb-planar-08.tif :   73 x   43, 3 channel, uint8 tiff
-    SHA-1: 35D01526DE5F904B7978B8EA16192A28389E276F
-Reading ../oiio-images/libtiffpic/depth/flower-rgb-planar-10.tif
-../oiio-images/libtiffpic/depth/flower-rgb-planar-10.tif :   73 x   43, 3 channel, uint10 tiff
-    SHA-1: 0C41DF861699CF536C581721EF17B01D1EFB5D86
-    channel list: R, G, B
-    compression: "none"
-    DocumentName: "flower-rgb-planar-10.tif"
-    Orientation: 1 (normal)
-    PixelAspectRatio: 1
-    planarconfig: "separate"
-    ResolutionUnit: "in"
-    Software: "GraphicsMagick 1.2 unreleased Q32 http://www.GraphicsMagick.org/"
-    XResolution: 72
-    YResolution: 72
-    oiio:BitsPerSample: 10
-    tiff:Compression: 1
-    tiff:PhotometricInterpretation: 2
-    tiff:PlanarConfiguration: 2
-    tiff:RowsPerStrip: 89
-Comparing "../oiio-images/libtiffpic/depth/flower-rgb-planar-10.tif" and "flower-rgb-planar-10.tif"
-PASS
-flower-rgb-planar-10.tif :   73 x   43, 3 channel, uint10 tiff
-    SHA-1: 0C41DF861699CF536C581721EF17B01D1EFB5D86
-../oiio-images/libtiffpic/depth/flower-rgb-planar-10.tif :   73 x   43, 3 channel, uint10 tiff
-    SHA-1: 0C41DF861699CF536C581721EF17B01D1EFB5D86
-Reading ../oiio-images/libtiffpic/depth/flower-rgb-planar-12.tif
-../oiio-images/libtiffpic/depth/flower-rgb-planar-12.tif :   73 x   43, 3 channel, uint12 tiff
-    SHA-1: E61083B50548C7D304A45735452FD05C1814677B
-    channel list: R, G, B
-    compression: "none"
-    DocumentName: "flower-rgb-planar-12.tif"
-    Orientation: 1 (normal)
-    PixelAspectRatio: 1
-    planarconfig: "separate"
-    ResolutionUnit: "in"
-    Software: "GraphicsMagick 1.2 unreleased Q32 http://www.GraphicsMagick.org/"
-    XResolution: 72
-    YResolution: 72
-    oiio:BitsPerSample: 12
-    tiff:Compression: 1
-    tiff:PhotometricInterpretation: 2
-    tiff:PlanarConfiguration: 2
-    tiff:RowsPerStrip: 74
-Comparing "../oiio-images/libtiffpic/depth/flower-rgb-planar-12.tif" and "flower-rgb-planar-12.tif"
-PASS
-flower-rgb-planar-12.tif :   73 x   43, 3 channel, uint12 tiff
-    SHA-1: E61083B50548C7D304A45735452FD05C1814677B
-../oiio-images/libtiffpic/depth/flower-rgb-planar-12.tif :   73 x   43, 3 channel, uint12 tiff
-    SHA-1: E61083B50548C7D304A45735452FD05C1814677B
-Reading ../oiio-images/libtiffpic/depth/flower-rgb-planar-14.tif
-../oiio-images/libtiffpic/depth/flower-rgb-planar-14.tif :   73 x   43, 3 channel, uint14 tiff
-    SHA-1: DD060DA62BB8F5903C5087796B2D05A682BE8ADA
-    channel list: R, G, B
-    compression: "none"
-    DocumentName: "flower-rgb-planar-14.tif"
-    Orientation: 1 (normal)
-    PixelAspectRatio: 1
-    planarconfig: "separate"
-    ResolutionUnit: "in"
-    Software: "GraphicsMagick 1.2 unreleased Q32 http://www.GraphicsMagick.org/"
-    XResolution: 72
-    YResolution: 72
-    oiio:BitsPerSample: 14
-    tiff:Compression: 1
-    tiff:PhotometricInterpretation: 2
-    tiff:PlanarConfiguration: 2
-    tiff:RowsPerStrip: 64
-Comparing "../oiio-images/libtiffpic/depth/flower-rgb-planar-14.tif" and "flower-rgb-planar-14.tif"
-PASS
-flower-rgb-planar-14.tif :   73 x   43, 3 channel, uint14 tiff
-    SHA-1: DD060DA62BB8F5903C5087796B2D05A682BE8ADA
-../oiio-images/libtiffpic/depth/flower-rgb-planar-14.tif :   73 x   43, 3 channel, uint14 tiff
-    SHA-1: DD060DA62BB8F5903C5087796B2D05A682BE8ADA
-Reading ../oiio-images/libtiffpic/depth/flower-rgb-planar-16.tif
-../oiio-images/libtiffpic/depth/flower-rgb-planar-16.tif :   73 x   43, 3 channel, uint16 tiff
-    SHA-1: 19F69706D5C52FC9510A3C20F9A43361FEF2AC9D
-    channel list: R, G, B
-    compression: "none"
-    DocumentName: "flower-rgb-planar-16.tif"
-    Orientation: 1 (normal)
-    PixelAspectRatio: 1
-    planarconfig: "separate"
-    ResolutionUnit: "in"
-    Software: "GraphicsMagick 1.2 unreleased Q32 http://www.GraphicsMagick.org/"
-    XResolution: 72
-    YResolution: 72
-    oiio:BitsPerSample: 16
-    tiff:Compression: 1
-    tiff:PhotometricInterpretation: 2
-    tiff:PlanarConfiguration: 2
-    tiff:RowsPerStrip: 56
-Comparing "../oiio-images/libtiffpic/depth/flower-rgb-planar-16.tif" and "flower-rgb-planar-16.tif"
-PASS
-flower-rgb-planar-16.tif :   73 x   43, 3 channel, uint16 tiff
-    SHA-1: 19F69706D5C52FC9510A3C20F9A43361FEF2AC9D
-../oiio-images/libtiffpic/depth/flower-rgb-planar-16.tif :   73 x   43, 3 channel, uint16 tiff
-    SHA-1: 19F69706D5C52FC9510A3C20F9A43361FEF2AC9D
-Reading ../oiio-images/libtiffpic/depth/flower-rgb-planar-24.tif
-../oiio-images/libtiffpic/depth/flower-rgb-planar-24.tif :   73 x   43, 3 channel, uint24 tiff
-    SHA-1: 6234B3CE28DFDF0FE6B1BCC29F62393696AF79A5
-    channel list: R, G, B
-    compression: "none"
-    DocumentName: "flower-rgb-planar-24.tif"
-    Orientation: 1 (normal)
-    PixelAspectRatio: 1
-    planarconfig: "separate"
-    ResolutionUnit: "in"
-    Software: "GraphicsMagick 1.2 unreleased Q8 http://www.GraphicsMagick.org/"
-    XResolution: 72
-    YResolution: 72
-    oiio:BitsPerSample: 24
-    tiff:Compression: 1
-    tiff:PhotometricInterpretation: 2
-    tiff:PlanarConfiguration: 2
-    tiff:RowsPerStrip: 37
-Comparing "../oiio-images/libtiffpic/depth/flower-rgb-planar-24.tif" and "flower-rgb-planar-24.tif"
-PASS
-flower-rgb-planar-24.tif :   73 x   43, 3 channel, uint24 tiff
-    SHA-1: 6234B3CE28DFDF0FE6B1BCC29F62393696AF79A5
-../oiio-images/libtiffpic/depth/flower-rgb-planar-24.tif :   73 x   43, 3 channel, uint24 tiff
-    SHA-1: 6234B3CE28DFDF0FE6B1BCC29F62393696AF79A5
-Reading ../oiio-images/libtiffpic/depth/flower-rgb-planar-32.tif
-../oiio-images/libtiffpic/depth/flower-rgb-planar-32.tif :   73 x   43, 3 channel, uint tiff
-    SHA-1: 04DAF56E34180687DB7FA12E7EE8EC3A3E40DAB8
-    channel list: R, G, B
-    compression: "none"
-    DocumentName: "flower-rgb-planar-32.tif"
-    Orientation: 1 (normal)
-    PixelAspectRatio: 1
-    planarconfig: "separate"
-    ResolutionUnit: "in"
-    Software: "GraphicsMagick 1.2 unreleased Q32 http://www.GraphicsMagick.org/"
-    XResolution: 72
-    YResolution: 72
-    oiio:BitsPerSample: 32
-    tiff:Compression: 1
-    tiff:PhotometricInterpretation: 2
-    tiff:PlanarConfiguration: 2
-    tiff:RowsPerStrip: 28
-Comparing "../oiio-images/libtiffpic/depth/flower-rgb-planar-32.tif" and "flower-rgb-planar-32.tif"
-PASS
-flower-rgb-planar-32.tif :   73 x   43, 3 channel, uint tiff
-    SHA-1: 04DAF56E34180687DB7FA12E7EE8EC3A3E40DAB8
-../oiio-images/libtiffpic/depth/flower-rgb-planar-32.tif :   73 x   43, 3 channel, uint tiff
-    SHA-1: 04DAF56E34180687DB7FA12E7EE8EC3A3E40DAB8
-Reading ../oiio-images/libtiffpic/depth/flower-separated-contig-08.tif
-../oiio-images/libtiffpic/depth/flower-separated-contig-08.tif :   73 x   43, 3 channel, uint8 tiff
-    SHA-1: F739D368D37AB99D237FA1358A2EECE913245226
-    channel list: R, G, B
-    compression: "none"
-    DocumentName: "flower-separated-contig-08.tif"
-    Orientation: 1 (normal)
-    PixelAspectRatio: 1
-    planarconfig: "contig"
-    ResolutionUnit: "in"
-    Software: "GraphicsMagick 1.2 unreleased Q32 http://www.GraphicsMagick.org/"
-    XResolution: 72
-    YResolution: 72
-    oiio:BitsPerSample: 8
-    tiff:ColorSpace: "CMYK"
-    tiff:Compression: 1
-    tiff:PhotometricInterpretation: 5
-    tiff:PlanarConfiguration: 1
-    tiff:RowsPerStrip: 28
-Comparing "../oiio-images/libtiffpic/depth/flower-separated-contig-08.tif" and "flower-separated-contig-08.tif"
-PASS
-flower-separated-contig-08.tif :   73 x   43, 3 channel, uint8 tiff
-    SHA-1: F739D368D37AB99D237FA1358A2EECE913245226
-../oiio-images/libtiffpic/depth/flower-separated-contig-08.tif :   73 x   43, 3 channel, uint8 tiff
-    SHA-1: F739D368D37AB99D237FA1358A2EECE913245226
-Reading ../oiio-images/libtiffpic/depth/flower-separated-contig-16.tif
-../oiio-images/libtiffpic/depth/flower-separated-contig-16.tif :   73 x   43, 3 channel, uint16 tiff
-    SHA-1: BBAA06ABCADF65F9323FDA979421A54F5B2E53D0
-    channel list: R, G, B
-    compression: "none"
-    DocumentName: "flower-separated-contig-16.tif"
-    Orientation: 1 (normal)
-    PixelAspectRatio: 1
-    planarconfig: "contig"
-    ResolutionUnit: "in"
-    Software: "GraphicsMagick 1.2 unreleased Q32 http://www.GraphicsMagick.org/"
-    XResolution: 72
-    YResolution: 72
-    oiio:BitsPerSample: 16
-    tiff:ColorSpace: "CMYK"
-    tiff:Compression: 1
-    tiff:PhotometricInterpretation: 5
-    tiff:PlanarConfiguration: 1
-    tiff:RowsPerStrip: 14
-Comparing "../oiio-images/libtiffpic/depth/flower-separated-contig-16.tif" and "flower-separated-contig-16.tif"
-PASS
-flower-separated-contig-16.tif :   73 x   43, 3 channel, uint8 tiff
-    SHA-1: E55335D12E9A20EFB0A5EAE80F1801DF5A9BEE12
-../oiio-images/libtiffpic/depth/flower-separated-contig-16.tif :   73 x   43, 3 channel, uint16 tiff
-    SHA-1: BBAA06ABCADF65F9323FDA979421A54F5B2E53D0
-Reading ../oiio-images/libtiffpic/depth/flower-separated-planar-08.tif
-../oiio-images/libtiffpic/depth/flower-separated-planar-08.tif :   73 x   43, 3 channel, uint8 tiff
-    SHA-1: F739D368D37AB99D237FA1358A2EECE913245226
-    channel list: R, G, B
-    compression: "none"
-    DocumentName: "flower-separated-planar-08.tif"
-    Orientation: 1 (normal)
-    PixelAspectRatio: 1
-    planarconfig: "separate"
-    ResolutionUnit: "in"
-    Software: "GraphicsMagick 1.2 unreleased Q32 http://www.GraphicsMagick.org/"
-    XResolution: 72
-    YResolution: 72
-    oiio:BitsPerSample: 8
-    tiff:ColorSpace: "CMYK"
-    tiff:Compression: 1
-    tiff:PhotometricInterpretation: 5
-    tiff:PlanarConfiguration: 2
-    tiff:RowsPerStrip: 112
-Comparing "../oiio-images/libtiffpic/depth/flower-separated-planar-08.tif" and "flower-separated-planar-08.tif"
-PASS
-flower-separated-planar-08.tif :   73 x   43, 3 channel, uint8 tiff
-    SHA-1: F739D368D37AB99D237FA1358A2EECE913245226
-../oiio-images/libtiffpic/depth/flower-separated-planar-08.tif :   73 x   43, 3 channel, uint8 tiff
-    SHA-1: F739D368D37AB99D237FA1358A2EECE913245226
-Reading ../oiio-images/libtiffpic/depth/flower-separated-planar-16.tif
-../oiio-images/libtiffpic/depth/flower-separated-planar-16.tif :   73 x   43, 3 channel, uint16 tiff
-    SHA-1: BBAA06ABCADF65F9323FDA979421A54F5B2E53D0
-    channel list: R, G, B
-    compression: "none"
-    DocumentName: "flower-separated-planar-16.tif"
-    Orientation: 1 (normal)
-    PixelAspectRatio: 1
-    planarconfig: "separate"
-    ResolutionUnit: "in"
-    Software: "GraphicsMagick 1.2 unreleased Q32 http://www.GraphicsMagick.org/"
-    XResolution: 72
-    YResolution: 72
-    oiio:BitsPerSample: 16
-    tiff:ColorSpace: "CMYK"
-    tiff:Compression: 1
-    tiff:PhotometricInterpretation: 5
-    tiff:PlanarConfiguration: 2
-    tiff:RowsPerStrip: 56
-Comparing "../oiio-images/libtiffpic/depth/flower-separated-planar-16.tif" and "flower-separated-planar-16.tif"
-PASS
-flower-separated-planar-16.tif :   73 x   43, 3 channel, uint8 tiff
-    SHA-1: E55335D12E9A20EFB0A5EAE80F1801DF5A9BEE12
-../oiio-images/libtiffpic/depth/flower-separated-planar-16.tif :   73 x   43, 3 channel, uint16 tiff
-    SHA-1: BBAA06ABCADF65F9323FDA979421A54F5B2E53D0
-Comparing "cmyk_as_cmyk.tif" and "ref/cmyk_as_cmyk.tif"
-PASS

From 98c83de5681bcbcaee3e3d200e3d612134d26b57 Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Sat, 21 Feb 2026 16:31:45 -0800
Subject: [PATCH 62/70] build(deps): Raise minimum fmt library version to 9.0
 (#5041)

The previous minimum, 7.0, dated from mid-2020.

We are raising it now (in main / future 3.2 only) to 9.0, which dates from
mid-2022, so we're still supporting several versions and/or years back.

Because this changes minimum dependency versions, it will NOT be
backported to release branches (3.1 or earlier).

I had to remove the CI test variant for icc, because ancient icc can't
correctly build newer versions of fmt, it seems. There is a separate PR
to simply drop icc from our list of supported compilers.

If anybody wants to argue for pulling the minimum up even farther (say,
to fmt 10.0, released in 2023, so still supporting 3 years back), which
would simplify even more places, I would consider it.

Signed-off-by: Larry Gritz <lg@larrygritz.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 .github/workflows/ci.yml             |  8 ++++----
 CHANGES.md                           |  2 ++
 INSTALL.md                           |  6 +++---
 src/cmake/externalpackages.cmake     |  4 ++--
 src/include/OpenImageIO/detail/fmt.h | 21 +++++----------------
 src/include/OpenImageIO/strutil.h    | 23 +----------------------
 src/include/OpenImageIO/typedesc.h   |  2 +-
 src/include/OpenImageIO/ustring.h    |  6 ++----
 src/libutil/typedesc_test.cpp        |  5 +----
 src/libutil/ustring_test.cpp         |  4 +---
 10 files changed, 22 insertions(+), 59 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index bc24a1179b..94a95efbf7 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -91,7 +91,7 @@ jobs:
             cxx_std: 17
             python_ver: 3.9
             simd: "avx2,f16c"
-            fmt_ver: 8.1.1
+            fmt_ver: 9.0.0
             opencolorio_ver: v2.3.0
             pybind11_ver: v2.9.0
             setenvs: export FREETYPE_VERSION=VER-2-12-0
@@ -123,7 +123,7 @@ jobs:
             vfxyear: 2022
             old_node: 1
             cxx_std: 17
-            fmt_ver: 7.0.1
+            fmt_ver: 9.0.0
             opencolorio_ver: v2.3.0
             openexr_ver: v3.1.0
             pybind11_ver: v2.7.0
@@ -145,7 +145,7 @@ jobs:
             cc_compiler: clang
             cxx_compiler: clang++
             cxx_std: 17
-            fmt_ver: 7.0.1
+            fmt_ver: 9.0.0
             opencolorio_ver: v2.3.0
             openexr_ver: v3.1.0
             pybind11_ver: v2.7.0
@@ -167,7 +167,7 @@ jobs:
             vfxyear: 2022
             old_node: 1
             cxx_std: 17
-            fmt_ver: 7.0.1
+            fmt_ver: 9.0.0
             opencolorio_ver: v2.3.0
             openexr_ver: v3.1.0
             pybind11_ver: v2.7.0
diff --git a/CHANGES.md b/CHANGES.md
index 8344cea2e9..01302429ba 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -3,6 +3,8 @@ Release 3.2 (target: Sept 2026?) -- compared to 3.1
 
 ### New minimum dependencies and compatibility changes:
   - The deprecated icc compiler is no longer supported. (3.2.0.0)
+  - **fmt**: Minimum required version is now 9.0 (was 7.0).
+
 ### ⛰️  New features and public API changes:
 * *New image file format support:*
 * *oiiotool new features and major improvements*:
diff --git a/INSTALL.md b/INSTALL.md
index 8e96afc2b1..8d662290f1 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -23,11 +23,11 @@ NEW or CHANGED MINIMUM dependencies since the last major release are **bold**.
  * Imath >= 3.1 (tested through 3.2 and main)
  * OpenEXR >= 3.1 (tested through 3.4 and main)
  * libTIFF >= 4.0 (tested through 4.7 and master)
- * *OpenColorIO >= 2.3* (tested through 2.5 and main)
+ * OpenColorIO >= 2.3 (tested through 2.5 and main)
  * libjpeg >= 8 (tested through jpeg9e), or libjpeg-turbo >= 2.1 (tested
    through 3.1)
  * zlib >= 1.2.7 (tested through 1.3.1)
- * [fmtlib](https://github.com/fmtlib/fmt) >= 7.0 (tested through 12.0 and master).
+ * **[fmtlib](https://github.com/fmtlib/fmt) >= 9.0** (tested through 12.1 and master).
    If not found at build time, this will be automatically downloaded and built.
  * [Robin-map](https://github.com/Tessil/robin-map) (unknown minimum, tested
    through 1.4, which is the recommended version). If not found at build time,
@@ -39,7 +39,7 @@ NEW or CHANGED MINIMUM dependencies since the last major release are **bold**.
      * Qt5 >= 5.6 (tested through 5.15) or Qt6 (tested through 6.9)
      * OpenGL
  * If you are building the Python bindings or running the testsuite:
-     * **Python >= 3.9** (tested through 3.13).
+     * Python >= 3.9 (tested through 3.13).
      * pybind11 >= 2.7 (tested through 3.0)
      * NumPy (tested through 2.2.4)
  * If you want support for PNG files:
diff --git a/src/cmake/externalpackages.cmake b/src/cmake/externalpackages.cmake
index c9b729d736..623bc1ca5c 100644
--- a/src/cmake/externalpackages.cmake
+++ b/src/cmake/externalpackages.cmake
@@ -236,9 +236,9 @@ checked_find_package (Robinmap REQUIRED
                      )
 
 # fmtlib
-option (OIIO_INTERNALIZE_FMT "Copy fmt headers into <install>/include/OpenImageIO/detail/fmt" ON)
+set_option (OIIO_INTERNALIZE_FMT "Copy fmt headers into <install>/include/OpenImageIO/detail/fmt" ON)
 checked_find_package (fmt REQUIRED
-                      VERSION_MIN 7.0
+                      VERSION_MIN 9.0
                       BUILD_LOCAL missing
                      )
 get_target_property(FMT_INCLUDE_DIR fmt::fmt-header-only INTERFACE_INCLUDE_DIRECTORIES)
diff --git a/src/include/OpenImageIO/detail/fmt.h b/src/include/OpenImageIO/detail/fmt.h
index b68ea14e02..3558152ae7 100644
--- a/src/include/OpenImageIO/detail/fmt.h
+++ b/src/include/OpenImageIO/detail/fmt.h
@@ -70,12 +70,9 @@ OIIO_PRAGMA_WARNING_PUSH
 
 OIIO_PRAGMA_WARNING_POP
 
-// At some point a method signature changed
-#if FMT_VERSION >= 90000
-#    define OIIO_FMT_CUSTOM_FORMATTER_CONST const
-#else
-#    define OIIO_FMT_CUSTOM_FORMATTER_CONST
-#endif
+// DEPRECATED(3.2): This definition is obsolete and should be removed at the
+// next ABI compatibility boundary.
+#define OIIO_FMT_CUSTOM_FORMATTER_CONST const
 
 
 OIIO_NAMESPACE_3_1_BEGIN
@@ -132,18 +129,14 @@ template<typename T,
 struct index_formatter : format_parser_with_separator {
     // inherits parse() from format_parser_with_separator
     template<typename FormatContext>
-    auto format(const T& v, FormatContext& ctx) OIIO_FMT_CUSTOM_FORMATTER_CONST
+    auto format(const T& v, FormatContext& ctx) const
     {
         std::string vspec = elem_fmt.size() ? fmt::format("{{:{}}}", elem_fmt)
                                             : std::string("{}");
         for (size_t i = 0; i < size_t(v.size()); ++i) {
             if (i)
                 fmt::format_to(ctx.out(), "{}", sep == ',' ? ", " : " ");
-#if FMT_VERSION >= 80000
             fmt::format_to(ctx.out(), fmt::runtime(vspec), v[i]);
-#else
-            fmt::format_to(ctx.out(), vspec, v[i]);
-#endif
         }
         return ctx.out();
     }
@@ -177,19 +170,15 @@ template<typename T, typename Elem, int Size>
 struct array_formatter : format_parser_with_separator {
     // inherits parse() from format_parser_with_separator
     template<typename FormatContext>
-    auto format(const T& v, FormatContext& ctx) OIIO_FMT_CUSTOM_FORMATTER_CONST
+    auto format(const T& v, FormatContext& ctx) const
     {
         std::string vspec = elem_fmt.size() ? fmt::format("{{:{}}}", elem_fmt)
                                             : std::string("{}");
         for (int i = 0; i < Size; ++i) {
             if (i)
                 fmt::format_to(ctx.out(), "{}", sep == ',' ? ", " : " ");
-#if FMT_VERSION >= 80000
             fmt::format_to(ctx.out(), fmt::runtime(vspec),
                            ((const Elem*)&v)[i]);
-#else
-            fmt::format_to(ctx.out(), vspec, ((const Elem*)&v)[i]);
-#endif
         }
         return ctx.out();
     }
diff --git a/src/include/OpenImageIO/strutil.h b/src/include/OpenImageIO/strutil.h
index c446c49689..f06518d78f 100644
--- a/src/include/OpenImageIO/strutil.h
+++ b/src/include/OpenImageIO/strutil.h
@@ -204,7 +204,6 @@ namespace sync {
 /// Output is fully thread-safe (the outputs are "atomic" to the file or
 /// stream), and if the stream is buffered, it is flushed after the output).
 
-#if FMT_VERSION >= 70000
 template<typename Str, typename... Args>
 inline void print (FILE *file, const Str& fmt, Args&&... args)
 {
@@ -223,26 +222,6 @@ inline void print (std::ostream &file, const Str& fmt, Args&&... args)
     sync_output (file, ::fmt::vformat(fmt, ::fmt::make_format_args(args...)));
 }
 
-#else
-
-template<typename... Args>
-inline void print (FILE *file, const char* fmt, Args&&... args)
-{
-    sync_output (file, ::fmt::format(fmt, std::forward<Args>(args)...));
-}
-
-template<typename... Args>
-inline void print (const char* fmt, Args&&... args)
-{
-    print(stdout, fmt, std::forward<Args>(args)...);
-}
-
-template<typename... Args>
-inline void print (std::ostream &file, const char* fmt, Args&&... args)
-{
-    sync_output (file, ::fmt::format(fmt, std::forward<Args>(args)...));
-}
-#endif
 } // namespace sync
 
 
@@ -275,7 +254,7 @@ void print (FILE *file, const char* fmt, const Args&... args);
 template<typename... Args>
 void print (std::ostream &file, const char* fmt, const Args&... args);
 
-#elif FMT_VERSION >= 70000 && !OIIO_PRINT_IS_SYNCHRONIZED
+#elif !OIIO_PRINT_IS_SYNCHRONIZED
 using ::fmt::print;
 #else
 using sync::print;
diff --git a/src/include/OpenImageIO/typedesc.h b/src/include/OpenImageIO/typedesc.h
index b546dcd0e0..94eee60633 100644
--- a/src/include/OpenImageIO/typedesc.h
+++ b/src/include/OpenImageIO/typedesc.h
@@ -681,7 +681,7 @@ struct fmt::formatter<OIIO::TypeDesc> {
     }
 
     template <typename FormatContext>
-    auto format(const OIIO::TypeDesc& t, FormatContext& ctx) OIIO_FMT_CUSTOM_FORMATTER_CONST
+    auto format(const OIIO::TypeDesc& t, FormatContext& ctx) const
     {
         // C++14:   auto format(const OIIO::TypeDesc& p, FormatContext& ctx) const {
         // ctx.out() is an output iterator to write to.
diff --git a/src/include/OpenImageIO/ustring.h b/src/include/OpenImageIO/ustring.h
index aba90ffdf5..b3e172dea3 100644
--- a/src/include/OpenImageIO/ustring.h
+++ b/src/include/OpenImageIO/ustring.h
@@ -1146,8 +1146,7 @@ FMT_BEGIN_NAMESPACE
 
 template<> struct formatter<OIIO::ustring> : formatter<fmt::string_view, char> {
     template<typename FormatContext>
-    auto format(const OIIO::ustring& u,
-                FormatContext& ctx) OIIO_FMT_CUSTOM_FORMATTER_CONST
+    auto format(const OIIO::ustring& u, FormatContext& ctx) const
     {
         return formatter<fmt::string_view, char>::format({ u.data(), u.size() },
                                                          ctx);
@@ -1157,8 +1156,7 @@ template<> struct formatter<OIIO::ustring> : formatter<fmt::string_view, char> {
 template<>
 struct formatter<OIIO::ustringhash> : formatter<fmt::string_view, char> {
     template<typename FormatContext>
-    auto format(const OIIO::ustringhash& h,
-                FormatContext& ctx) OIIO_FMT_CUSTOM_FORMATTER_CONST
+    auto format(const OIIO::ustringhash& h, FormatContext& ctx) const
     {
         OIIO::ustring u(h);
         return formatter<fmt::string_view, char>::format({ u.data(), u.size() },
diff --git a/src/libutil/typedesc_test.cpp b/src/libutil/typedesc_test.cpp
index b7c2c57ae8..cb6f7b2b3d 100644
--- a/src/libutil/typedesc_test.cpp
+++ b/src/libutil/typedesc_test.cpp
@@ -69,10 +69,7 @@ test_type(string_view textrep, TypeDesc constructed,
         tostring_formatting fm(tostring_formatting::STDFORMAT);
         fm.aggregate_sep = ", ";
         fm.array_sep     = ", ";
-#if FMT_VERSION < 70100
-        fm.float_fmt = "{:g}";
-#endif
-        std::string s = tostring(constructed, &value, fm);
+        std::string s    = tostring(constructed, &value, fm);
         if (valuerep.size()) {
             OIIO_CHECK_EQUAL(s, valuerep);
             Strutil::print("  {}\n", s);
diff --git a/src/libutil/ustring_test.cpp b/src/libutil/ustring_test.cpp
index 6e0a883401..3b4ff692e4 100644
--- a/src/libutil/ustring_test.cpp
+++ b/src/libutil/ustring_test.cpp
@@ -17,9 +17,7 @@
 #include <OpenImageIO/unittest.h>
 #include <OpenImageIO/ustring.h>
 
-#if FMT_VERSION >= 90000
-#    include <OpenImageIO/detail/fmt/std.h>
-#endif
+#include <OpenImageIO/detail/fmt/std.h>
 
 
 using namespace OIIO;

From 77c1bf6396075447967721c186d5a17c48f9b90b Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Mon, 23 Feb 2026 12:32:49 -0800
Subject: [PATCH 63/70] ci: temporarily disable python stub checking (#5061)

The CI stub generation has been broken for a few days, failing CI every
time. The checked-in stub files seem fine. Just turn off this check
until we can figure out why it is broken.

Signed-off-by: Larry Gritz <lg@larrygritz.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 pyproject.toml | 39 ++++++++++++++++++++++-----------------
 1 file changed, 22 insertions(+), 17 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 29e7ede18d..04f176cbd4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -131,23 +131,28 @@ WebP_BUILD_VERSION = "1.5.0"
 [tool.cibuildwheel.windows.environment]
 SKBUILD_CMAKE_BUILD_TYPE = "MinSizeRel"
 
-[[tool.cibuildwheel.overrides]]
-# Trigger the build & validation of the python stubs for certain platforms.
-# The test command acts as a kind of "post-build" callback where it's possible
-# for the stub-generator to import the freshly-built wheel.
-# There are two entry-points which are designed to call generate_stubs.py through
-# this test command:
-# - `make pystubs` is called during local development to generate the
-#   stubs and copy them into the git repo to be committed and reviewed.
-# - in CI, the cibuildwheel action is used to validate that the stubs match what
-#   has been committed to the repo.
-test-requires = "mypy~=1.15.0 stubgenlib~=0.1.0"
-# Note: the python version here must be kept in sync with src/python/stubs/CMakeLists.txt
-select = "cp311-manylinux_*64"
-inherit.test-command = "append"
-test-command = [
-    "python {project}/src/python/stubs/generate_stubs.py --out-path '/output' --validate-path '{project}/src/python/stubs/OpenImageIO/__init__.pyi'",
-]
+# Temporarily disabled test below. The CI stub generation seems broken, and
+# is failing CI every time. Just turn off this check until we can figure out
+# why it is broken.
+# ----
+# [[tool.cibuildwheel.overrides]]
+# # Trigger the build & validation of the python stubs for certain platforms.
+# # The test command acts as a kind of "post-build" callback where it's possible
+# # for the stub-generator to import the freshly-built wheel.
+# # There are two entry-points which are designed to call generate_stubs.py through
+# # this test command:
+# # - `make pystubs` is called during local development to generate the
+# #   stubs and copy them into the git repo to be committed and reviewed.
+# # - in CI, the cibuildwheel action is used to validate that the stubs match what
+# #   has been committed to the repo.
+# test-requires = "mypy~=1.15.0 stubgenlib~=0.1.0"
+# # Note: the python version here must be kept in sync with src/python/stubs/CMakeLists.txt
+# select = "cp311-manylinux_*64"
+# inherit.test-command = "append"
+# test-command = [
+#     "python {project}/src/python/stubs/generate_stubs.py --out-path '/output' --validate-path '{project}/src/python/stubs/OpenImageIO/__init__.pyi'",
+# ]
+# ----
 
 [tool.mypy]
 files = [

From a841554ddbcc7fd8e37a7585ed740254b6e45ac7 Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Mon, 23 Feb 2026 20:31:49 -0800
Subject: [PATCH 64/70] fix: address fmath.h warning with ispow2 (#5033)

I was seeing warnings when the ispow2 function template is instantiated
for unsigned types, where the `x >= 0` clause is meaningless. Use
`if constexpr` to eliminate that pointless test for unsigned types.

Signed-off-by: Larry Gritz <lg@larrygritz.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 src/include/OpenImageIO/fmath.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/include/OpenImageIO/fmath.h b/src/include/OpenImageIO/fmath.h
index bb73ff14ea..ea31eb43ce 100644
--- a/src/include/OpenImageIO/fmath.h
+++ b/src/include/OpenImageIO/fmath.h
@@ -144,7 +144,11 @@ ispow2(T x) noexcept
     // Numerous references for this bit trick are on the web.  The
     // principle is that x is a power of 2 <=> x == 1<<b <=> x-1 is
     // all 1 bits for bits < b.
-    return (x & (x - 1)) == 0 && (x >= 0);
+    if constexpr (std::is_signed<T>::value) {
+        return (x & (x - 1)) == 0 && (x >= 0);
+    } else {
+        return (x & (x - 1)) == 0;
+    }
 }
 
 
From ecdeabffd0672a8835a98dd6edd1fb4a4f847345 Mon Sep 17 00:00:00 2001
From: Shane Smith <shane.smith@dreamworks.com>
Date: Mon, 23 Feb 2026 20:36:56 -0800
Subject: [PATCH 65/70] jxl: Extending JXL CICP support to include P3 / color
 primaries 12 (#5054)

I tested out the JPEG XL CICP support and noticed that color primaries
12 was not supported. This pull request extends P3 support to cover
color primaries 12.
Note: color primaries 11 uses the DCI white point and color primaries 12
uses the D65 white point.

The JxlPrimaries enum only covers P3 primaries as value 11 and not 12;
see

https://github.com/libjxl/libjxl/blob/main/lib/include/jxl/color_encoding.h#L55-L75

Further code is therefore required to account for this on read and
write.

Tests for read and write of color primaries 11 and 12 were added.

Signed-off-by: Shane Smith <shane.smith@dreamworks.com>
Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 src/jpegxl.imageio/jxlinput.cpp  | 12 ++++++---
 src/jpegxl.imageio/jxloutput.cpp | 18 +++++++++----
 testsuite/jxl/ref/out.txt        | 45 ++++++++++++++++++++++++++++++++
 testsuite/jxl/run.py             |  6 +++++
 4 files changed, 73 insertions(+), 8 deletions(-)

diff --git a/src/jpegxl.imageio/jxlinput.cpp b/src/jpegxl.imageio/jxlinput.cpp
index b0f90cf4f1..88f2ccea06 100644
--- a/src/jpegxl.imageio/jxlinput.cpp
+++ b/src/jpegxl.imageio/jxlinput.cpp
@@ -383,9 +383,15 @@ JxlInput::open(const std::string& name, ImageSpec& newspec)
     if (have_color_encoding && color_encoding.primaries != JXL_PRIMARIES_CUSTOM
         && color_encoding.white_point != JXL_WHITE_POINT_CUSTOM
         && color_encoding.transfer_function != JXL_TRANSFER_FUNCTION_GAMMA) {
-        const int cicp[4] = { color_encoding.primaries,
-                              color_encoding.transfer_function, 0 /* RGB */,
-                              1 /* Full range */ };
+        int color_primaries = color_encoding.primaries;
+        // JxlPrimaries enum only covers P3 primaries as value 11 and not 12
+        // but CICP has separate code values based on white point.
+        if (color_primaries == JXL_PRIMARIES_P3
+            && color_encoding.white_point == JXL_WHITE_POINT_D65) {
+            color_primaries = 12;
+        }
+        const int cicp[4] = { color_primaries, color_encoding.transfer_function,
+                              0 /* RGB */, 1 /* Full range */ };
         m_spec.attribute("CICP", TypeDesc(TypeDesc::INT, 4), cicp);
         const ColorConfig& colorconfig(ColorConfig::default_colorconfig());
         string_view interop_id = colorconfig.get_color_interop_id(cicp);
diff --git a/src/jpegxl.imageio/jxloutput.cpp b/src/jpegxl.imageio/jxloutput.cpp
index 1a4a64e858..37fff33817 100644
--- a/src/jpegxl.imageio/jxloutput.cpp
+++ b/src/jpegxl.imageio/jxloutput.cpp
@@ -571,7 +571,18 @@ JxlOutput::save_image(const void* data)
         // primaries and white point are not currently used but could help
         // support more CICP codes.
         JxlColorEncoding color_encoding {};
-        color_encoding.primaries         = JxlPrimaries(cicp[0]);
+        color_encoding.primaries = JxlPrimaries(cicp[0]);
+        // CICP primaries 11 and 12 both represent P3, but with different white points.
+        if (cicp[0] == 11) {
+            color_encoding.white_point = JXL_WHITE_POINT_DCI;
+        }
+        // JxlPrimaries enum only covers P3 primaries as value 11 and not 12.
+        else if (cicp[0] == 12) {
+            color_encoding.primaries   = JXL_PRIMARIES_P3;
+            color_encoding.white_point = JXL_WHITE_POINT_D65;
+        } else {
+            color_encoding.white_point = JXL_WHITE_POINT_D65;
+        }
         color_encoding.transfer_function = JxlTransferFunction(cicp[1]);
         color_encoding.color_space       = JXL_COLOR_SPACE_RGB;
 
@@ -581,10 +592,7 @@ JxlOutput::save_image(const void* data)
         switch (color_encoding.primaries) {
         case JXL_PRIMARIES_SRGB:
         case JXL_PRIMARIES_2100:
-        case JXL_PRIMARIES_P3:
-            supported_primaries        = true;
-            color_encoding.white_point = JXL_WHITE_POINT_D65;
-            break;
+        case JXL_PRIMARIES_P3: supported_primaries = true; break;
         case JXL_PRIMARIES_CUSTOM:  // Not an actual CICP code in JXL
             break;
         }
diff --git a/testsuite/jxl/ref/out.txt b/testsuite/jxl/ref/out.txt
index 83b97fbba6..0ed56fe485 100644
--- a/testsuite/jxl/ref/out.txt
+++ b/testsuite/jxl/ref/out.txt
@@ -42,3 +42,48 @@ tahoe-cicp-pq.jxl    :  128 x   96, 3 channel, uint8 jpegxl
     ICCProfile:profile_version: "4.4.0"
     ICCProfile:rendering_intent: "Perceptual"
     oiio:ColorSpace: "pq_rec2020_display"
+Reading tahoe-cicp-dcip3.jxl
+tahoe-cicp-dcip3.jxl :  128 x   96, 3 channel, uint8 jpegxl
+    SHA-1: 069F1A3E5567349C2D34E535B29913029EF1B09C
+    channel list: R, G, B
+    CICP: 11, 17, 0, 1
+    ICCProfile: 0, 0, 2, 24, 106, 120, 108, 32, 4, 64, 0, 0, 109, 110, 116, 114, ... [536 x uint8]
+    ICCProfile:attributes: "Reflective, Glossy, Positive, Color"
+    ICCProfile:cmm_type: 1786276896
+    ICCProfile:color_space: "RGB"
+    ICCProfile:copyright: "CC0"
+    ICCProfile:creation_date: "2019:12:01 00:00:00"
+    ICCProfile:creator_signature: "6a786c20"
+    ICCProfile:device_class: "Display device profile"
+    ICCProfile:flags: "Not Embedded, Independent"
+    ICCProfile:manufacturer: "0"
+    ICCProfile:model: "0"
+    ICCProfile:platform_signature: "Apple Computer, Inc."
+    ICCProfile:profile_connection_space: "XYZ"
+    ICCProfile:profile_description: "RGB_DCI_DCI_Per_DCI"
+    ICCProfile:profile_size: 536
+    ICCProfile:profile_version: "4.4.0"
+    ICCProfile:rendering_intent: "Perceptual"
+Reading tahoe-cicp-displayp3.jxl
+tahoe-cicp-displayp3.jxl :  128 x   96, 3 channel, uint8 jpegxl
+    SHA-1: 069F1A3E5567349C2D34E535B29913029EF1B09C
+    channel list: R, G, B
+    CICP: 12, 13, 0, 1
+    ICCProfile: 0, 0, 2, 4, 106, 120, 108, 32, 4, 64, 0, 0, 109, 110, 116, 114, ... [516 x uint8]
+    ICCProfile:attributes: "Reflective, Glossy, Positive, Color"
+    ICCProfile:cmm_type: 1786276896
+    ICCProfile:color_space: "RGB"
+    ICCProfile:copyright: "CC0"
+    ICCProfile:creation_date: "2019:12:01 00:00:00"
+    ICCProfile:creator_signature: "6a786c20"
+    ICCProfile:device_class: "Display device profile"
+    ICCProfile:flags: "Not Embedded, Independent"
+    ICCProfile:manufacturer: "0"
+    ICCProfile:model: "0"
+    ICCProfile:platform_signature: "Apple Computer, Inc."
+    ICCProfile:profile_connection_space: "XYZ"
+    ICCProfile:profile_description: "DisplayP3"
+    ICCProfile:profile_size: 516
+    ICCProfile:profile_version: "4.4.0"
+    ICCProfile:rendering_intent: "Perceptual"
+    oiio:ColorSpace: "srgb_p3d65_scene"
diff --git a/testsuite/jxl/run.py b/testsuite/jxl/run.py
index 78b7a19eba..a3299e02a9 100755
--- a/testsuite/jxl/run.py
+++ b/testsuite/jxl/run.py
@@ -13,6 +13,12 @@
 command += oiiotool ("../common/tahoe-tiny.tif --cicp \"9,16,9,1\" -o tahoe-cicp-pq.jxl")
 command += info_command ("tahoe-cicp-pq.jxl", safematch=True)
 
+command += oiiotool ("../common/tahoe-tiny.tif --cicp \"11,17,0,1\" -o tahoe-cicp-dcip3.jxl")
+command += info_command ("tahoe-cicp-dcip3.jxl", safematch=True)
+
+command += oiiotool ("../common/tahoe-tiny.tif --cicp \"12,13,0,1\" -o tahoe-cicp-displayp3.jxl")
+command += info_command ("tahoe-cicp-displayp3.jxl", safematch=True)
+
 outputs = [
             "test-jxl.icc",
             "out.txt"

From 429b57d3b75cf0634592a4652c154480a056cb1c Mon Sep 17 00:00:00 2001
From: "Vlad (Kuzmin) Erium" <libalias@gmail.com>
Date: Tue, 24 Feb 2026 16:36:49 +0900
Subject: [PATCH 66/70] clang-format

Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 src/libOpenImageIO/imagebufalgo_addsub.cpp | 42 ++++++-----
 src/libOpenImageIO/imagebufalgo_hwy_pvt.h  | 81 +++++++++++-----------
 src/libOpenImageIO/imagebufalgo_mad.cpp    | 12 ++--
 src/libOpenImageIO/imagebufalgo_muldiv.cpp | 22 +++---
 4 files changed, 76 insertions(+), 81 deletions(-)

diff --git a/src/libOpenImageIO/imagebufalgo_addsub.cpp b/src/libOpenImageIO/imagebufalgo_addsub.cpp
index 79b89e8204..01ba5daa19 100644
--- a/src/libOpenImageIO/imagebufalgo_addsub.cpp
+++ b/src/libOpenImageIO/imagebufalgo_addsub.cpp
@@ -72,9 +72,10 @@ add_impl_hwy_native_int(ImageBuf& R, const ImageBuf& A, const ImageBuf& B,
                         ROI roi, int nthreads)
 {
     return hwy_binary_native_int_perpixel_op<T>(R, A, B, roi, nthreads,
-                                               [](auto /*d*/, auto a, auto b) {
-                                                   return hn::SaturatedAdd(a, b);
-                                               });
+                                                [](auto /*d*/, auto a, auto b) {
+                                                    return hn::SaturatedAdd(a,
+                                                                            b);
+                                                });
 }
 
 template<class Rtype, class Atype, class Btype>
@@ -82,9 +83,7 @@ static bool
 add_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
              int nthreads)
 {
-    auto op = [](auto /*d*/, auto a, auto b) {
-        return hn::Add(a, b);
-    };
+    auto op = [](auto /*d*/, auto a, auto b) { return hn::Add(a, b); };
 
     // Handle packed RGBA images with an RGB ROI (preserve alpha).
     if constexpr (std::is_integral_v<Rtype> && std::is_same_v<Rtype, Atype>
@@ -93,8 +92,8 @@ add_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
             return hn::SaturatedAdd(a, b);
         };
         if (hwy_binary_native_int_perpixel_op_rgba_rgb_roi<Rtype>(R, A, B, roi,
-                                                                 nthreads,
-                                                                 op_int))
+                                                                  nthreads,
+                                                                  op_int))
             return true;
     }
     if (hwy_binary_perpixel_op_rgba_rgb_roi<Rtype, Atype, Btype>(R, A, B, roi,
@@ -142,9 +141,9 @@ add_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
 #if defined(OIIO_USE_HWY) && OIIO_USE_HWY
     if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels()
         && B.localpixels()) {
-        auto Rv = HwyPixels(R);
-        auto Av = HwyPixels(A);
-        auto Bv = HwyPixels(B);
+        auto Rv             = HwyPixels(R);
+        auto Av             = HwyPixels(A);
+        auto Bv             = HwyPixels(B);
         const int nchannels = RoiNChannels(roi);
         const bool contig   = ChannelsContiguous<Rtype>(Rv, nchannels)
                             && ChannelsContiguous<Atype>(Av, nchannels)
@@ -196,9 +195,10 @@ sub_impl_hwy_native_int(ImageBuf& R, const ImageBuf& A, const ImageBuf& B,
                         ROI roi, int nthreads)
 {
     return hwy_binary_native_int_perpixel_op<T>(R, A, B, roi, nthreads,
-                                               [](auto /*d*/, auto a, auto b) {
-                                                   return hn::SaturatedSub(a, b);
-                                               });
+                                                [](auto /*d*/, auto a, auto b) {
+                                                    return hn::SaturatedSub(a,
+                                                                            b);
+                                                });
 }
 
 template<class Rtype, class Atype, class Btype>
@@ -206,9 +206,7 @@ static bool
 sub_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
              int nthreads)
 {
-    auto op = [](auto /*d*/, auto a, auto b) {
-        return hn::Sub(a, b);
-    };
+    auto op = [](auto /*d*/, auto a, auto b) { return hn::Sub(a, b); };
 
     // Handle packed RGBA images with an RGB ROI (preserve alpha).
     if constexpr (std::is_integral_v<Rtype> && std::is_same_v<Rtype, Atype>
@@ -217,8 +215,8 @@ sub_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
             return hn::SaturatedSub(a, b);
         };
         if (hwy_binary_native_int_perpixel_op_rgba_rgb_roi<Rtype>(R, A, B, roi,
-                                                                 nthreads,
-                                                                 op_int))
+                                                                  nthreads,
+                                                                  op_int))
             return true;
     }
     if (hwy_binary_perpixel_op_rgba_rgb_roi<Rtype, Atype, Btype>(R, A, B, roi,
@@ -243,9 +241,9 @@ sub_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
 #if defined(OIIO_USE_HWY) && OIIO_USE_HWY
     if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels()
         && B.localpixels()) {
-        auto Rv = HwyPixels(R);
-        auto Av = HwyPixels(A);
-        auto Bv = HwyPixels(B);
+        auto Rv             = HwyPixels(R);
+        auto Av             = HwyPixels(A);
+        auto Bv             = HwyPixels(B);
         const int nchannels = RoiNChannels(roi);
         const bool contig   = ChannelsContiguous<Rtype>(Rv, nchannels)
                             && ChannelsContiguous<Atype>(Av, nchannels)
diff --git a/src/libOpenImageIO/imagebufalgo_hwy_pvt.h b/src/libOpenImageIO/imagebufalgo_hwy_pvt.h
index 0083002615..b97d4b8c30 100644
--- a/src/libOpenImageIO/imagebufalgo_hwy_pvt.h
+++ b/src/libOpenImageIO/imagebufalgo_hwy_pvt.h
@@ -132,8 +132,7 @@ LoadInterleaved4PromoteN(D d, const SrcT* ptr, size_t count);
 template<class D, typename DstT, typename VecMathT, typename VecAlphaLaneT>
 inline void
 StoreInterleaved4RgbAlphaPassthrough(D d, DstT* ptr, VecMathT r, VecMathT g,
-                                     VecMathT b,
-                                     VecAlphaLaneT a_passthrough);
+                                     VecMathT b, VecAlphaLaneT a_passthrough);
 
 // -----------------------------------------------------------------------
 // Load and Promote
@@ -267,8 +266,8 @@ PromoteVec(D d, VecT v)
         return hn::Mul(v_promoted, hn::Set(d, (MathT)(1.0 / 4294967295.0)));
     } else if constexpr (std::is_same_v<SrcT, int32_t>) {
         auto v_promoted = hn::ConvertTo(d, v);
-        auto v_norm = hn::Mul(v_promoted,
-                              hn::Set(d, (MathT)(1.0 / 2147483647.0)));
+        auto v_norm     = hn::Mul(v_promoted,
+                                  hn::Set(d, (MathT)(1.0 / 2147483647.0)));
         return hn::Max(v_norm, hn::Set(d, (MathT)-1.0));
     } else if constexpr (std::is_same_v<SrcT, uint64_t>) {
         auto d_u32 = hn::Rebind<uint32_t, D>();
@@ -551,10 +550,10 @@ DemoteVec(D d, VecT v)
         auto d_u8  = hn::Rebind<uint8_t, D>();
         return hn::DemoteTo(d_u8, v_i16);
     } else if constexpr (std::is_same_v<DstT, int8_t>) {
-        VecD v_denorm = hn::Mul((VecD)v, hn::Set(d, (MathT)127.0));
-        auto is_neg   = hn::Lt(v_denorm, hn::Zero(d));
-        auto v_bias   = hn::IfThenElse(is_neg, hn::Set(d, (MathT)-0.5),
-                                       hn::Set(d, (MathT)0.5));
+        VecD v_denorm  = hn::Mul((VecD)v, hn::Set(d, (MathT)127.0));
+        auto is_neg    = hn::Lt(v_denorm, hn::Zero(d));
+        auto v_bias    = hn::IfThenElse(is_neg, hn::Set(d, (MathT)-0.5),
+                                        hn::Set(d, (MathT)0.5));
         VecD v_rounded = hn::Add(v_denorm, v_bias);
         VecD v_clamped = hn::Max(v_rounded, hn::Set(d, (MathT)-128.0));
         v_clamped      = hn::Min(v_clamped, hn::Set(d, (MathT)127.0));
@@ -576,10 +575,10 @@ DemoteVec(D d, VecT v)
         auto d_u16 = hn::Rebind<uint16_t, D>();
         return hn::DemoteTo(d_u16, vi32);
     } else if constexpr (std::is_same_v<DstT, int16_t>) {
-        VecD v_denorm = hn::Mul((VecD)v, hn::Set(d, (MathT)32767.0));
-        auto is_neg   = hn::Lt(v_denorm, hn::Zero(d));
-        auto v_bias   = hn::IfThenElse(is_neg, hn::Set(d, (MathT)-0.5),
-                                       hn::Set(d, (MathT)0.5));
+        VecD v_denorm  = hn::Mul((VecD)v, hn::Set(d, (MathT)32767.0));
+        auto is_neg    = hn::Lt(v_denorm, hn::Zero(d));
+        auto v_bias    = hn::IfThenElse(is_neg, hn::Set(d, (MathT)-0.5),
+                                        hn::Set(d, (MathT)0.5));
         VecD v_rounded = hn::Add(v_denorm, v_bias);
         VecD v_clamped = hn::Max(v_rounded, hn::Set(d, (MathT)-32768.0));
         v_clamped      = hn::Min(v_clamped, hn::Set(d, (MathT)32767.0));
@@ -595,10 +594,10 @@ DemoteVec(D d, VecT v)
         auto d_u32     = hn::Rebind<uint32_t, D>();
         return hn::ConvertTo(d_u32, v_clamped);
     } else if constexpr (std::is_same_v<DstT, int32_t>) {
-        VecD v_denorm = hn::Mul((VecD)v, hn::Set(d, (MathT)2147483647.0));
-        auto is_neg   = hn::Lt(v_denorm, hn::Zero(d));
-        auto v_bias   = hn::IfThenElse(is_neg, hn::Set(d, (MathT)-0.5),
-                                       hn::Set(d, (MathT)0.5));
+        VecD v_denorm  = hn::Mul((VecD)v, hn::Set(d, (MathT)2147483647.0));
+        auto is_neg    = hn::Lt(v_denorm, hn::Zero(d));
+        auto v_bias    = hn::IfThenElse(is_neg, hn::Set(d, (MathT)-0.5),
+                                        hn::Set(d, (MathT)0.5));
         VecD v_rounded = hn::Add(v_denorm, v_bias);
         VecD v_clamped = hn::Max(v_rounded, hn::Set(d, (MathT)-2147483648.0));
         v_clamped      = hn::Min(v_clamped, hn::Set(d, (MathT)2147483647.0));
@@ -932,15 +931,15 @@ RunHwyTernaryCmd(Rtype* r, const ABCtype* a, const ABCtype* b, const ABCtype* c,
 /// covers the full pixel).
 template<typename Rtype, typename Atype, typename Btype, typename OpFunc>
 inline bool
-hwy_binary_perpixel_op(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
-                       int nthreads, OpFunc op)
+hwy_binary_perpixel_op(ImageBuf& R, const ImageBuf& A, const ImageBuf& B,
+                       ROI roi, int nthreads, OpFunc op)
 {
     auto Rv = HwyPixels(R);
     auto Av = HwyPixels(A);
     auto Bv = HwyPixels(B);
     ImageBufAlgo::parallel_image(roi, nthreads, [&, op](ROI roi) {
         const int nchannels = RoiNChannels(roi);
-        const size_t n = static_cast<size_t>(roi.width())
+        const size_t n      = static_cast<size_t>(roi.width())
                          * static_cast<size_t>(nchannels);
         for (int y = roi.ybegin; y < roi.yend; ++y) {
             Rtype* r_row       = RoiRowPtr<Rtype>(Rv, y, roi);
@@ -967,7 +966,7 @@ hwy_ternary_perpixel_op(ImageBuf& R, const ImageBuf& A, const ImageBuf& B,
     auto Cv = HwyPixels(C);
     ImageBufAlgo::parallel_image(roi, nthreads, [&, op](ROI roi) {
         const int nchannels = RoiNChannels(roi);
-        const size_t n = static_cast<size_t>(roi.width())
+        const size_t n      = static_cast<size_t>(roi.width())
                          * static_cast<size_t>(nchannels);
         for (int y = roi.ybegin; y < roi.yend; ++y) {
             Rtype* r_row         = RoiRowPtr<Rtype>(Rv, y, roi);
@@ -994,7 +993,7 @@ hwy_binary_native_int_perpixel_op(ImageBuf& R, const ImageBuf& A,
     auto Bv = HwyPixels(B);
     ImageBufAlgo::parallel_image(roi, nthreads, [&, op](ROI roi) {
         const int nchannels = RoiNChannels(roi);
-        const size_t n = static_cast<size_t>(roi.width())
+        const size_t n      = static_cast<size_t>(roi.width())
                          * static_cast<size_t>(nchannels);
         for (int y = roi.ybegin; y < roi.yend; ++y) {
             T* r_row       = RoiRowPtr<T>(Rv, y, roi);
@@ -1063,9 +1062,9 @@ hwy_binary_perpixel_op_rgba_rgb_roi(ImageBuf& R, const ImageBuf& A,
 
     ImageBufAlgo::parallel_image(roi4, nthreads, [&, op](ROI roi4) {
         for (int y = roi4.ybegin; y < roi4.yend; ++y) {
-            Rtype* r_row       = RoiRowPtr<Rtype>(Rv, y, roi4);
-            const Atype* a_row = RoiRowPtr<Atype>(Av, y, roi4);
-            const Btype* b_row = RoiRowPtr<Btype>(Bv, y, roi4);
+            Rtype* r_row         = RoiRowPtr<Rtype>(Rv, y, roi4);
+            const Atype* a_row   = RoiRowPtr<Atype>(Av, y, roi4);
+            const Btype* b_row   = RoiRowPtr<Btype>(Bv, y, roi4);
             const size_t npixels = static_cast<size_t>(roi4.width());
 
             size_t x = 0;
@@ -1081,8 +1080,8 @@ hwy_binary_perpixel_op_rgba_rgb_roi(ImageBuf& R, const ImageBuf& A,
                 auto d_dstlane = hn::Rebind<DstLaneT, decltype(d)>();
                 hn::Vec<decltype(d_dstlane)> dr, dg, db, da;
                 hn::LoadInterleaved4(d_dstlane,
-                                     reinterpret_cast<const DstLaneT*>(
-                                         r_row + off),
+                                     reinterpret_cast<const DstLaneT*>(r_row
+                                                                       + off),
                                      dr, dg, db, da);
                 (void)dr;
                 (void)dg;
@@ -1097,11 +1096,11 @@ hwy_binary_perpixel_op_rgba_rgb_roi(ImageBuf& R, const ImageBuf& A,
 
             const size_t remaining = npixels - x;
             if (remaining > 0) {
-                const size_t off = x * 4;
-                auto [ar, ag, ab, aa]
-                    = LoadInterleaved4PromoteN(d, a_row + off, remaining);
-                auto [br, bg, bb, ba]
-                    = LoadInterleaved4PromoteN(d, b_row + off, remaining);
+                const size_t off      = x * 4;
+                auto [ar, ag, ab, aa] = LoadInterleaved4PromoteN(d, a_row + off,
+                                                                 remaining);
+                auto [br, bg, bb, ba] = LoadInterleaved4PromoteN(d, b_row + off,
+                                                                 remaining);
                 (void)aa;
                 (void)ba;
                 auto rr = op(d, ar, br);
@@ -1170,8 +1169,8 @@ hwy_ternary_perpixel_op_rgba_rgb_roi(ImageBuf& R, const ImageBuf& A,
                 auto d_dstlane = hn::Rebind<DstLaneT, decltype(d)>();
                 hn::Vec<decltype(d_dstlane)> dr, dg, db, da;
                 hn::LoadInterleaved4(d_dstlane,
-                                     reinterpret_cast<const DstLaneT*>(
-                                         r_row + off),
+                                     reinterpret_cast<const DstLaneT*>(r_row
+                                                                       + off),
                                      dr, dg, db, da);
                 (void)dr;
                 (void)dg;
@@ -1186,13 +1185,13 @@ hwy_ternary_perpixel_op_rgba_rgb_roi(ImageBuf& R, const ImageBuf& A,
 
             const size_t remaining = npixels - x;
             if (remaining > 0) {
-                const size_t off = x * 4;
-                auto [ar, ag, ab, aa]
-                    = LoadInterleaved4PromoteN(d, a_row + off, remaining);
-                auto [br, bg, bb, ba]
-                    = LoadInterleaved4PromoteN(d, b_row + off, remaining);
-                auto [cr, cg, cb, ca]
-                    = LoadInterleaved4PromoteN(d, c_row + off, remaining);
+                const size_t off      = x * 4;
+                auto [ar, ag, ab, aa] = LoadInterleaved4PromoteN(d, a_row + off,
+                                                                 remaining);
+                auto [br, bg, bb, ba] = LoadInterleaved4PromoteN(d, b_row + off,
+                                                                 remaining);
+                auto [cr, cg, cb, ca] = LoadInterleaved4PromoteN(d, c_row + off,
+                                                                 remaining);
                 (void)aa;
                 (void)ba;
                 (void)ca;
@@ -1246,7 +1245,7 @@ hwy_binary_native_int_perpixel_op_rgba_rgb_roi(ImageBuf& R, const ImageBuf& A,
             T* r_row       = RoiRowPtr<T>(Rv, y, roi4);
             const T* a_row = RoiRowPtr<T>(Av, y, roi4);
             const T* b_row = RoiRowPtr<T>(Bv, y, roi4);
-            size_t i = 0;
+            size_t i       = 0;
             for (; i + lanes <= n; i += lanes) {
                 auto va   = hn::Load(d, a_row + i);
                 auto vb   = hn::Load(d, b_row + i);
diff --git a/src/libOpenImageIO/imagebufalgo_mad.cpp b/src/libOpenImageIO/imagebufalgo_mad.cpp
index eb20c81f0b..30ad86e9e1 100644
--- a/src/libOpenImageIO/imagebufalgo_mad.cpp
+++ b/src/libOpenImageIO/imagebufalgo_mad.cpp
@@ -56,8 +56,8 @@ mad_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B,
         return hn::MulAdd(a, b, c);
     };
 
-    if (hwy_ternary_perpixel_op_rgba_rgb_roi<Rtype, ABCtype>(
-            R, A, B, C, roi, nthreads, op))
+    if (hwy_ternary_perpixel_op_rgba_rgb_roi<Rtype, ABCtype>(R, A, B, C, roi,
+                                                             nthreads, op))
         return true;
 
     return hwy_ternary_perpixel_op<Rtype, ABCtype>(R, A, B, C, roi, nthreads,
@@ -73,10 +73,10 @@ mad_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, const ImageBuf& C,
 #if defined(OIIO_USE_HWY) && OIIO_USE_HWY
     if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels()
         && B.localpixels() && C.localpixels()) {
-        auto Rv = HwyPixels(R);
-        auto Av = HwyPixels(A);
-        auto Bv = HwyPixels(B);
-        auto Cv = HwyPixels(C);
+        auto Rv             = HwyPixels(R);
+        auto Av             = HwyPixels(A);
+        auto Bv             = HwyPixels(B);
+        auto Cv             = HwyPixels(C);
         const int nchannels = RoiNChannels(roi);
         const bool contig   = ChannelsContiguous<Rtype>(Rv, nchannels)
                             && ChannelsContiguous<ABCtype>(Av, nchannels)
diff --git a/src/libOpenImageIO/imagebufalgo_muldiv.cpp b/src/libOpenImageIO/imagebufalgo_muldiv.cpp
index cbfeb07d75..97d2c94faf 100644
--- a/src/libOpenImageIO/imagebufalgo_muldiv.cpp
+++ b/src/libOpenImageIO/imagebufalgo_muldiv.cpp
@@ -131,9 +131,7 @@ static bool
 mul_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
              int nthreads)
 {
-    auto op = [](auto /*d*/, auto a, auto b) {
-        return hn::Mul(a, b);
-    };
+    auto op = [](auto /*d*/, auto a, auto b) { return hn::Mul(a, b); };
 
     if (hwy_binary_perpixel_op_rgba_rgb_roi<Rtype, Atype, Btype>(R, A, B, roi,
                                                                  nthreads, op))
@@ -180,9 +178,9 @@ mul_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
 #if defined(OIIO_USE_HWY) && OIIO_USE_HWY
     if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels()
         && B.localpixels()) {
-        auto Rv = HwyPixels(R);
-        auto Av = HwyPixels(A);
-        auto Bv = HwyPixels(B);
+        auto Rv             = HwyPixels(R);
+        auto Av             = HwyPixels(A);
+        auto Bv             = HwyPixels(B);
         const int nchannels = RoiNChannels(roi);
         const bool contig   = ChannelsContiguous<Rtype>(Rv, nchannels)
                             && ChannelsContiguous<Atype>(Av, nchannels)
@@ -323,9 +321,9 @@ div_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
              int nthreads)
 {
     auto op = [](auto d, auto a, auto b) {
-        const auto zero = hn::Zero(d);
-        const auto nz   = hn::Ne(b, zero);
-        const auto one  = hn::Set(d, 1);
+        const auto zero   = hn::Zero(d);
+        const auto nz     = hn::Ne(b, zero);
+        const auto one    = hn::Set(d, 1);
         const auto safe_b = hn::IfThenElse(nz, b, one);
         const auto q      = hn::Div(a, safe_b);
         return hn::IfThenElse(nz, q, zero);
@@ -348,9 +346,9 @@ div_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi,
 #if defined(OIIO_USE_HWY) && OIIO_USE_HWY
     if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels()
         && B.localpixels()) {
-        auto Rv = HwyPixels(R);
-        auto Av = HwyPixels(A);
-        auto Bv = HwyPixels(B);
+        auto Rv             = HwyPixels(R);
+        auto Av             = HwyPixels(A);
+        auto Bv             = HwyPixels(B);
         const int nchannels = RoiNChannels(roi);
         const bool contig   = ChannelsContiguous<Rtype>(Rv, nchannels)
                             && ChannelsContiguous<Atype>(Av, nchannels)

From 01575cc62a9c4f0aee6489576c327e6f937f62f3 Mon Sep 17 00:00:00 2001
From: "Vlad (Kuzmin) Erium" <libalias@gmail.com>
Date: Tue, 24 Feb 2026 18:32:41 +0900
Subject: [PATCH 67/70] Remove duplicated resample_deep function

Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 src/libOpenImageIO/imagebufalgo_xform.cpp | 79 -----------------------
 1 file changed, 79 deletions(-)

diff --git a/src/libOpenImageIO/imagebufalgo_xform.cpp b/src/libOpenImageIO/imagebufalgo_xform.cpp
index 3768dc6263..a1f3e14114 100644
--- a/src/libOpenImageIO/imagebufalgo_xform.cpp
+++ b/src/libOpenImageIO/imagebufalgo_xform.cpp
@@ -1441,85 +1441,6 @@ resample_(ImageBuf& dst, const ImageBuf& src, bool interpolate, ROI roi,
 }
 
 
-static bool
-resample_deep(ImageBuf& dst, const ImageBuf& src, bool interpolate, ROI roi,
-              int nthreads)
-{
-    // If it's deep, figure out the sample allocations first, because
-    // it's not thread-safe to do that simultaneously with copying the
-    // values.
-    const ImageSpec& srcspec(src.spec());
-    const ImageSpec& dstspec(dst.spec());
-    float srcfx          = srcspec.full_x;
-    float srcfy          = srcspec.full_y;
-    float srcfw          = srcspec.full_width;
-    float srcfh          = srcspec.full_height;
-    float dstpixelwidth  = 1.0f / dstspec.full_width;
-    float dstpixelheight = 1.0f / dstspec.full_height;
-    ImageBuf::ConstIterator<float> srcpel(src, roi);
-    ImageBuf::Iterator<float> dstpel(dst, roi);
-    for (; !dstpel.done(); ++dstpel, ++srcpel) {
-        float s   = (dstpel.x() - dstspec.full_x + 0.5f) * dstpixelwidth;
-        float t   = (dstpel.y() - dstspec.full_y + 0.5f) * dstpixelheight;
-        int src_y = ifloor(srcfy + t * srcfh);
-        int src_x = ifloor(srcfx + s * srcfw);
-        srcpel.pos(src_x, src_y, 0);
-        dstpel.set_deep_samples(srcpel.deep_samples());
-    }
-
-    OIIO_ASSERT(src.deep() == dst.deep());
-    ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) {
-        const ImageSpec& srcspec(src.spec());
-        const ImageSpec& dstspec(dst.spec());
-        int nchannels = src.nchannels();
-
-        // Local copies of the source image window, converted to float
-        float srcfx = srcspec.full_x;
-        float srcfy = srcspec.full_y;
-        float srcfw = srcspec.full_width;
-        float srcfh = srcspec.full_height;
-
-        float dstfx          = dstspec.full_x;
-        float dstfy          = dstspec.full_y;
-        float dstfw          = dstspec.full_width;
-        float dstfh          = dstspec.full_height;
-        float dstpixelwidth  = 1.0f / dstfw;
-        float dstpixelheight = 1.0f / dstfh;
-
-        ImageBuf::Iterator<float> out(dst, roi);
-        ImageBuf::ConstIterator<float> srcpel(src);
-        for (int y = roi.ybegin; y < roi.yend; ++y) {
-            // s,t are NDC space
-            float t = (y - dstfy + 0.5f) * dstpixelheight;
-            // src_xf, src_xf are image space float coordinates
-            float src_yf = srcfy + t * srcfh;
-            // src_x, src_y are image space integer coordinates of the floor
-            int src_y = ifloor(src_yf);
-            for (int x = roi.xbegin; x < roi.xend; ++x, ++out) {
-                float s      = (x - dstfx + 0.5f) * dstpixelwidth;
-                float src_xf = srcfx + s * srcfw;
-                int src_x    = ifloor(src_xf);
-                srcpel.pos(src_x, src_y, 0);
-                int nsamps = srcpel.deep_samples();
-                OIIO_DASSERT(nsamps == out.deep_samples());
-                if (!nsamps || nsamps != out.deep_samples())
-                    continue;
-                for (int c = 0; c < nchannels; ++c) {
-                    if (dstspec.channelformat(c) == TypeDesc::UINT32)
-                        for (int samp = 0; samp < nsamps; ++samp)
-                            out.set_deep_value(c, samp,
-                                               srcpel.deep_value_uint(c, samp));
-                    else
-                        for (int samp = 0; samp < nsamps; ++samp)
-                            out.set_deep_value(c, samp,
-                                               srcpel.deep_value(c, samp));
-                }
-            }
-        }
-    });
-
-    return true;
-}
 
 
From 30d20c33be776d29acd63fa129096ecd54e68b6f Mon Sep 17 00:00:00 2001
From: "Vlad (Kuzmin) Erium" <libalias@gmail.com>
Date: Tue, 24 Feb 2026 19:01:13 +0900
Subject: [PATCH 68/70] clang-format fix: align roi assignment in imagebufalgo_test.cpp

Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 src/libOpenImageIO/imagebufalgo_test.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/libOpenImageIO/imagebufalgo_test.cpp b/src/libOpenImageIO/imagebufalgo_test.cpp
index ff1604757f..74f798c31a 100644
--- a/src/libOpenImageIO/imagebufalgo_test.cpp
+++ b/src/libOpenImageIO/imagebufalgo_test.cpp
@@ -526,7 +526,7 @@ test_hwy_strided_roi_fallback()
     ImageBufAlgo::fill(B, { 0.1f, 0.3f, 0.5f, 0.7f });
     ImageBufAlgo::fill(C, { 0.05f, 0.05f, 0.05f, 0.05f });
 
-    ROI roi = get_roi(A.spec());
+    ROI roi     = get_roi(A.spec());
     roi.chbegin = 0;
     roi.chend   = 3;  // RGB only => non-contiguous for RGBA interleaving
 

From 9b0e7d5d922e9eb85077559bb9502919ba765a03 Mon Sep 17 00:00:00 2001
From: "Vlad (Kuzmin) Erium" <libalias@gmail.com>
Date: Tue, 24 Feb 2026 19:03:27 +0900
Subject: [PATCH 69/70] clang-format fix: drop extra blank lines before ImageBufAlgo::resample

Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 src/libOpenImageIO/imagebufalgo_xform.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/libOpenImageIO/imagebufalgo_xform.cpp b/src/libOpenImageIO/imagebufalgo_xform.cpp
index a1f3e14114..43008a7882 100644
--- a/src/libOpenImageIO/imagebufalgo_xform.cpp
+++ b/src/libOpenImageIO/imagebufalgo_xform.cpp
@@ -1442,8 +1442,6 @@ resample_(ImageBuf& dst, const ImageBuf& src, bool interpolate, ROI roi,
 
 
-
-
 bool
 ImageBufAlgo::resample(ImageBuf& dst, const ImageBuf& src, bool interpolate,
                        ROI roi, int nthreads)

From bcf110f82abd0698abb78b1fcf67a313608e0f8c Mon Sep 17 00:00:00 2001
From: "Vlad (Kuzmin) Erium" <libalias@gmail.com>
Date: Tue, 24 Feb 2026 20:29:27 +0900
Subject: [PATCH 70/70] clang-format fix: one entry per line in IBA::fit recognized[] list

Signed-off-by: Vlad (Kuzmin) Erium <libalias@gmail.com>
---
 src/libOpenImageIO/imagebufalgo_xform.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/libOpenImageIO/imagebufalgo_xform.cpp b/src/libOpenImageIO/imagebufalgo_xform.cpp
index 43008a7882..7e06e0cbcc 100644
--- a/src/libOpenImageIO/imagebufalgo_xform.cpp
+++ b/src/libOpenImageIO/imagebufalgo_xform.cpp
@@ -936,7 +936,11 @@ ImageBufAlgo::fit(ImageBuf& dst, const ImageBuf& src, KWArgs options, ROI roi,
     OIIO::pvt::LoggedTimer logtime("IBA::fit");
 
     static const ustring recognized[] = {
-        filtername_us, filterwidth_us, filterptr_us, fillmode_us, exact_us,
+        filtername_us,
+        filterwidth_us,
+        filterptr_us,
+        fillmode_us,
+        exact_us,
 #if 0 /* Not currently recognized */
         wrap_us,
         edgeclamp_us,