From 65ae304a24406cc58a199e63cb6d4a621b7a2507 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Be=C3=9Fler?= <danielb@uni-bremen.de>
Date: Tue, 30 Dec 2025 14:24:27 +0100
Subject: [PATCH 01/15] Restructured workflow

---
 .github/workflows/main.yml | 93 +++++++++++++++++++++++++-------------
 1 file changed, 62 insertions(+), 31 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index c577f58841..08855db299 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -8,7 +8,8 @@ on:
   release:
     types: [published]
 jobs:
-  build:
+  ##############################
+  build-linux:
     runs-on: ubuntu-24.04
     steps:
     # Note: some steps require the checkout in the root directory
@@ -40,12 +41,6 @@ jobs:
       with:
         comment_title: Compilation
         compile_result_file: build/make-output.txt
-    - name: Create debian package
-      if: ${{github.event_name == 'push' || github.event_name == 'release'}}
-      shell: bash
-      run: |
-        cd build
-        cpack
     - name: Run unit tests
       if: ${{github.event_name == 'push' || github.event_name == 'pull_request'}}
       shell: bash
@@ -59,12 +54,71 @@ jobs:
         junit_files: "gtest-regen.xml"
         action_fail: true
         action_fail_on_inconclusive: true
-    #####
+    - name: Upload test results
+      uses: actions/upload-artifact@v4
+      with:
+        name: test results
+        path: ./gtest-regen.xml
+  ##############################
+  debian-package:
+    runs-on: ubuntu-24.04
+    steps:
+    - uses: actions/checkout@v3
+    - name: Create debian package
+      if: ${{github.event_name == 'push' || github.event_name == 'release'}}
+      shell: bash
+      run: |
+        sudo apt-get update -y -qq
+        sudo apt-get install libgl-dev \
+            libboost-thread-dev libboost-system-dev libboost-date-time-dev libboost-filesystem-dev \
+            libboost-regex-dev libboost-timer-dev \
+            libassimp-dev libopenal-dev libdevil-dev libfreetype-dev libbullet-dev \
+            libglew-dev libglu1-mesa-dev libgl1-mesa-dev \
+            libavcodec-dev libavformat-dev libavutil-dev libswscale-dev \
+            libpng-dev libalut-dev \
+            qtbase5-dev libgtest-dev \
+            doxygen graphviz \
+            nlohmann-json3-dev
+        mkdir build
+        cd build
+        cmake ../ -DCMAKE_BUILD_TYPE=Release
+        cpack
+    - name: Release debian package
+      if: github.event_name == 'release'
+      shell: bash
+      env:
+        GITHUB_TOKEN: ${{ github.TOKEN }}
+      run: |
+        gh release upload ${{github.event.release.tag_name}} ./build/regen-*.deb
+    - name: Upload debian package
+      if: github.event_name == 'push'
+      uses: actions/upload-artifact@v4
+      with:
+        name: debian package
+        path: ./build/regen-*.deb
+  ##############################
+  doxygen:
+    runs-on: ubuntu-24.04
+    steps:
+    - uses: actions/checkout@v3
     - name: Run doxygen
       if: ${{github.event_name == 'push' || github.event_name == 'release'}}
       shell: bash
       run: |
+        sudo apt-get update -y -qq
+        sudo apt-get install libgl-dev \
+            libboost-thread-dev libboost-system-dev libboost-date-time-dev libboost-filesystem-dev \
+            libboost-regex-dev libboost-timer-dev \
+            libassimp-dev libopenal-dev libdevil-dev libfreetype-dev libbullet-dev \
+            libglew-dev libglu1-mesa-dev libgl1-mesa-dev \
+            libavcodec-dev libavformat-dev libavutil-dev libswscale-dev \
+            libpng-dev libalut-dev \
+            qtbase5-dev libgtest-dev \
+            doxygen graphviz \
+            nlohmann-json3-dev
+        mkdir build
         cd build
+        cmake ../ -DCMAKE_BUILD_TYPE=Release
         cmake --build . --target doc
         cp -r ../img regen/doc/html/
     - name: Extract version tag
@@ -88,26 +142,3 @@ jobs:
         BRANCH: gh-pages
         # The folder the action should deploy.
         FOLDER: build/regen/doc/html
-        # The folder in the target branch
-        TARGET_FOLDER: ${{ env.REGEN_DOCU_VERSION }}
-        CLEAN: true
-        SINGLE_COMMIT: true
-    #####
-    - name: Release debian package
-      if: github.event_name == 'release'
-      shell: bash
-      env:
-        GITHUB_TOKEN: ${{ github.TOKEN }}
-      run: |
-        gh release upload ${{github.event.release.tag_name}} ./build/regen-*.deb
-    - name: Upload debian package
-      if: github.event_name == 'push'
-      uses: actions/upload-artifact@v4
-      with:
-        name: debian package
-        path: ./build/regen-*.deb
-    - name: Upload test results
-      uses: actions/upload-artifact@v4
-      with:
-        name: test results
-        path: ./gtest-regen.xml

From 6b2718ba68519f9c06c13e703a734fd9b32b1e32 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Be=C3=9Fler?= <danielb@uni-bremen.de>
Date: Tue, 30 Dec 2025 15:02:16 +0100
Subject: [PATCH 02/15] Added -j$(nproc)

---
 .github/workflows/main.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 08855db299..ab10c977e9 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -31,7 +31,7 @@ jobs:
         mkdir build
         cd build
         cmake ../ -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=ON -DBUILD_VIDEO_PLAYER=ON
-        make 2> >(tee "make-output.txt")
+        make -j$(nproc) 2> >(tee "make-output.txt")
     - name: Annotate compilation warnings/errors
       if: ${{github.event_name == 'pull_request'}}
       uses: JacobDomagala/CompileResult@master

From 67057edc30c0cef32aaba94fd2dd43efeae0fbcf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Be=C3=9Fler?= <danielb@uni-bremen.de>
Date: Tue, 30 Dec 2025 15:10:29 +0100
Subject: [PATCH 03/15] CI improvements

---
 .github/workflows/main.yml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index ab10c977e9..2d67726030 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -17,6 +17,7 @@ jobs:
     - name: Build REGEN workspace
       shell: bash
       run: |
+        set -euo pipefail
         sudo apt-get update -y -qq
         sudo apt-get install libgl-dev \
             libboost-thread-dev libboost-system-dev libboost-date-time-dev libboost-filesystem-dev \
@@ -61,6 +62,7 @@ jobs:
         path: ./gtest-regen.xml
   ##############################
   debian-package:
+    needs: build-linux
     runs-on: ubuntu-24.04
     steps:
     - uses: actions/checkout@v3
@@ -68,6 +70,7 @@ jobs:
       if: ${{github.event_name == 'push' || github.event_name == 'release'}}
       shell: bash
       run: |
+        set -euo pipefail
         sudo apt-get update -y -qq
         sudo apt-get install libgl-dev \
             libboost-thread-dev libboost-system-dev libboost-date-time-dev libboost-filesystem-dev \
@@ -98,6 +101,7 @@ jobs:
         path: ./build/regen-*.deb
   ##############################
   doxygen:
+    needs: build-linux
     runs-on: ubuntu-24.04
     steps:
     - uses: actions/checkout@v3
@@ -105,6 +109,7 @@ jobs:
       if: ${{github.event_name == 'push' || github.event_name == 'release'}}
       shell: bash
       run: |
+        set -euo pipefail
         sudo apt-get update -y -qq
         sudo apt-get install libgl-dev \
             libboost-thread-dev libboost-system-dev libboost-date-time-dev libboost-filesystem-dev \

From ff4dd1398223661ff1efb6e4f140699b4f5f5d38 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Be=C3=9Fler?= <danielb@uni-bremen.de>
Date: Tue, 30 Dec 2025 15:29:42 +0100
Subject: [PATCH 04/15] Do not use -mfpmath=sse with apple

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9284168bcf..bb7c009730 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -75,7 +75,7 @@ endif()
 # given in the books "Effective C++" and "More Effective C++"
 # add_definitions( -Weffc++ )
 
-if(UNIX) # gcc options
+if(UNIX AND NOT APPLE)
     add_definitions( -mfpmath=sse -march=native )
 endif()
 

From 5dd254f9fce76b3fcd6133f006a5268a8bee3eb1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Be=C3=9Fler?= <danielb@uni-bremen.de>
Date: Wed, 31 Dec 2025 13:35:01 +0100
Subject: [PATCH 05/15] Added more badges

---
 README.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/README.md b/README.md
index 0880beb1db..57a0ebe785 100644
--- a/README.md
+++ b/README.md
@@ -11,6 +11,11 @@
 </p>
 
 ![CI](https://github.com/daniel86/regen/workflows/CI/badge.svg)
+![Warnings](https://img.shields.io/badge/compiler%20warnings-clean-brightgreen)
+[![Docs](https://img.shields.io/badge/docs-online-blue)](https://daniel86.github.io/regen/)
+![Debian](https://img.shields.io/badge/debian-.deb%20package-blue)
+![GitHub release](https://img.shields.io/github/v/release/daniel86/regen?include_prereleases)
+![License](https://img.shields.io/github/license/daniel86/regen)
 
 `regen` -- **Real-time Graphics Engine** -- is a modular OpenGL-based C++ engine designed for research and experimentation in real-time rendering, GPU compute, and virtual world simulation.
 

From 8950bfef383dc662f276d45a75190f6a279613dc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Be=C3=9Fler?= <danielb@uni-bremen.de>
Date: Wed, 31 Dec 2025 16:05:23 +0100
Subject: [PATCH 06/15] Added missing include

---
 regen/shader/includer.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/regen/shader/includer.h b/regen/shader/includer.h
index fc0ea12a88..5be3ec9a50 100644
--- a/regen/shader/includer.h
+++ b/regen/shader/includer.h
@@ -6,6 +6,7 @@
 #include <string>
 #include <map>
 #include <set>
+#include <list>
 
 namespace regen {
   /**

From b9c6132c46d1f2e41eeb28912738c1cbd5f7465d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Be=C3=9Fler?= <danielb@uni-bremen.de>
Date: Thu, 1 Jan 2026 14:06:21 +0100
Subject: [PATCH 07/15] Optional unit test build

---
 CMakeLists.txt | 50 +++++++++++++++++++++++++++++++-------------------
 1 file changed, 31 insertions(+), 19 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index bb7c009730..ad4f679dbf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -75,9 +75,9 @@ endif()
 # given in the books "Effective C++" and "More Effective C++"
 # add_definitions( -Weffc++ )
 
-if(UNIX AND NOT APPLE)
-    add_definitions( -mfpmath=sse -march=native )
-endif()
+# -march=native enables all instruction subsets supported by the local machine
+# e.g. SSE2, SSE3, SSE4, AVX, AVX2, etc.
+add_definitions( -march=native )
 
 # perform more aggressive floating-point optimizations
 # add_definitions( -ffast-math )
@@ -107,6 +107,10 @@ find_package(Boost ${Boost_MIN_VERSION}
 find_package(Threads REQUIRED)
 # Font library: text rendering support
 find_package(Freetype REQUIRED)
+# Note: On Linux, it seems needed in addition to link against brotlidec
+if (UNIX AND NOT APPLE)
+    set(FREETYPE_LIBRARIES ${FREETYPE_LIBRARIES} -lbrotlidec)
+endif()
 # JSON library: for serialization
 find_package(nlohmann_json REQUIRED)
 set(JSON_LIBRARIES nlohmann_json::nlohmann_json)
@@ -184,8 +188,10 @@ message(STATUS "  Models:${ASSIMP_INCLUDE_DIRS};")
 message(STATUS "  Fonts:${FREETYPE_INCLUDE_DIRS};")
 message(STATUS "  Physics:${BULLET_INCLUDE_DIRS};")
 
-enable_testing()
-find_package(GTest REQUIRED)
+if (BUILD_UNIT_TESTS)
+    enable_testing()
+    find_package(GTest REQUIRED)
+endif ()
 
 if (HAS_AV_LIBS)
     # allow includes without AL/ prefix. Some openAL versions require this.
@@ -199,7 +205,7 @@ set(REGEN_LIBRARIES
     ${Boost_LIBRARIES}
     ${JSON_LIBRARIES}
     ${ASSIMP_LIBRARIES}
-    ${FREETYPE_LIBRARIES} -lbrotlidec
+    ${FREETYPE_LIBRARIES}
     ${IMG_LIBRARIES}
     ${AV_LIBRARIES}
     ${CMAKE_THREAD_LIBS_INIT}
@@ -239,6 +245,10 @@ endif()
 ###########
 ###########
 
+if(REGEN_EXTRA_INCLUDE_DIRS)
+    include_directories(${REGEN_EXTRA_INCLUDE_DIRS})
+endif()
+
 # allow includes like <regen/xxx.h> even if the engine is not installed
 include_directories(.)
 # allow include of generated files
@@ -260,19 +270,21 @@ install(FILES img/icon-small.png DESTINATION ${SHARE_INSTALL_PATH}/img)
 ########## Unit Testings
 ##############
 
-# add an executable target for GTest.
-# but the testing code is partly in the *knowrob* library
-# where gtest won't find them without using the "--no-as-needed"
-# flag for the linker.
-add_executable(all_gtests
-        tests/gtests.cpp
-        tests/shapes/quad-tree-test.cpp)
-target_link_libraries(all_gtests
-        -Wl,--whole-archive,--no-as-needed
-        regen
-        -Wl,--no-whole-archive
-        ${Boost_Python_COMPONENT}
-        ${GTEST_MAIN_LIBRARIES})
+if (BUILD_UNIT_TESTS)
+    # add an executable target for GTest.
+    # but the testing code is partly in the *knowrob* library
+    # where gtest won't find them without using the "--no-as-needed"
+    # flag for the linker.
+    add_executable(all_gtests
+            tests/gtests.cpp
+            tests/shapes/quad-tree-test.cpp)
+    target_link_libraries(all_gtests
+            -Wl,--whole-archive,--no-as-needed
+            regen
+            -Wl,--no-whole-archive
+            ${Boost_Python_COMPONENT}
+            ${GTEST_MAIN_LIBRARIES})
+endif ()
 
 ##############
 ########## packaging

From ff545d3bd34b2f5963203da898b6872d6bf43a06 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Be=C3=9Fler?= <danielb@uni-bremen.de>
Date: Thu, 1 Jan 2026 14:07:54 +0100
Subject: [PATCH 08/15] Migrated to SIMDe interface

---
 regen/compute/radix-sort-cpu.h |  18 +-
 regen/compute/simd.h           | 342 +++++++++++++++++----------------
 regen/compute/threading.h      |   6 +-
 3 files changed, 187 insertions(+), 179 deletions(-)

diff --git a/regen/compute/radix-sort-cpu.h b/regen/compute/radix-sort-cpu.h
index 3374b8ebe4..3ea2b0366f 100644
--- a/regen/compute/radix-sort-cpu.h
+++ b/regen/compute/radix-sort-cpu.h
@@ -139,23 +139,23 @@ namespace regen {
 					if constexpr (KEY_TYPE_BITS == 16) {
 						// Gather 8 keys manually, and promote to 32-bit
 						for (int k = 0; k < 8; ++k) tmpKeys32[k] = static_cast<int32_t>(keys[src[keyIdx+k]]);
-						r0 = _mm256_load_si256(reinterpret_cast<const __m256i*>(tmpKeys32));
-						r0 = _mm256_and_si256(_mm256_srli_epi32(r0, SHIFT), mask);
+						r0 = simde_mm256_load_si256(reinterpret_cast<const simde__m256i*>(tmpKeys32));
+						r0 = simde_mm256_and_si256(_mm256_srli_epi32(r0, SHIFT), mask);
 						keyIdx += 8; // processed 8 keys, not 16!
 					}
 					else if constexpr (KEY_TYPE_BITS == 32) {
-						simd::Register_i idx = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&src[keyIdx]));
+						simd::Register_i idx = simde_mm256_loadu_si256(reinterpret_cast<const simde__m256i*>(&src[keyIdx]));
 						// Gather 8 scattered keys, and apply shift and mask to get bucket ids
-						r0 = _mm256_i32gather_epi32(reinterpret_cast<const int*>(keys), idx, 4);
-						r0 = _mm256_and_si256(_mm256_srli_epi32(r0, SHIFT), mask);
+						r0 = simde_mm256_i32gather_epi32(reinterpret_cast<const int*>(keys), idx, 4);
+						r0 = simde_mm256_and_si256(_mm256_srli_epi32(r0, SHIFT), mask);
 						keyIdx += KEYS_PER_SIMD_PASS;
 					}
 					else if constexpr (KEY_TYPE_BITS == 64) {
 						// note: values have 32 bits, use __m128i to load only 4
-						__m128i idx32 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&src[keyIdx]));
+						simde__m128i idx32 = simde_mm_loadu_si128(reinterpret_cast<const simde__m128i*>(&src[keyIdx]));
 						// Gather 4 scattered keys, and apply shift and mask to get bucket ids
-						r0 = _mm256_i32gather_epi64(reinterpret_cast<const long long*>(keys), idx32, 8);
-						r0 = _mm256_and_si256(_mm256_srli_epi64(r0, SHIFT), mask);
+						r0 = simde_mm256_i32gather_epi64(reinterpret_cast<const long long*>(keys), idx32, 8);
+						r0 = simde_mm256_and_si256(simde_mm256_srli_epi64(r0, SHIFT), mask);
 						keyIdx += KEYS_PER_SIMD_PASS;
 					}
 					else {
@@ -163,7 +163,7 @@ namespace regen {
 						break;
 					}
 					// Store results into tmpBins_ and increment histogram
-					_mm256_storeu_si256(reinterpret_cast<__m256i*>(tmpBins_), r0);
+					simde_mm256_storeu_si256(reinterpret_cast<simde__m256i*>(tmpBins_), r0);
 					for (auto x : tmpBins_) ++histogram_[x];
 				}
 			}
diff --git a/regen/compute/simd.h b/regen/compute/simd.h
index b47af9a8e8..ef16cbac07 100644
--- a/regen/compute/simd.h
+++ b/regen/compute/simd.h
@@ -4,21 +4,31 @@
 #include <regen/compute/vector.h>
 #include <regen/memory/aligned-allocator.h>
 
-// NOTE: Check for REGEN_HAS_SIMD, if it is not defined, the SIMD operations will be disabled
-//       and the code here will fall back to scalar operations.
-// NOLINTBEGIN(portability-simd-intrinsics)
-#if defined(__AVX__)
-	#include <immintrin.h> // AVX
-	#define REGEN_SIMD_MODE AVX
-	#define REGEN_SIMD_WIDTH 8
-	#define REGEN_HAS_SIMD
-#elif defined(__SSE__)
-	#include <xmmintrin.h> // SSE
-	#define REGEN_SIMD_MODE SSE
-	#define REGEN_SIMD_WIDTH 4
-	#define REGEN_HAS_SIMD
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include <simde/simde-common.h>
+#include <simde/x86/avx.h>
+#include <simde/x86/avx2.h>
+#include <simde/x86/fma.h>
+#include <simde/x86/sse.h>
+#include <simde/x86/sse2.h>
+
+#define REGEN_SIMD_NONE 0
+#define REGEN_SIMD_SSE 1
+#define REGEN_SIMD_AVX 2
+
+#if defined(SIMDE_NATURAL_VECTOR_SIZE)
+	#if SIMDE_NATURAL_VECTOR_SIZE >= 32
+		#define REGEN_SIMD_MODE  REGEN_SIMD_AVX
+		#define REGEN_SIMD_WIDTH 8
+	#elif SIMDE_NATURAL_VECTOR_SIZE >= 16
+		#define REGEN_SIMD_MODE  REGEN_SIMD_SSE
+		#define REGEN_SIMD_WIDTH 4
+	#else
+		#define REGEN_SIMD_MODE  REGEN_SIMD_NONE
+		#define REGEN_SIMD_WIDTH 1
+	#endif
 #else
-	#define REGEN_SIMD_MODE NONE
+	#define REGEN_SIMD_MODE  REGEN_SIMD_NONE
 	#define REGEN_SIMD_WIDTH 1
 #endif
 
@@ -32,236 +42,236 @@ namespace regen::simd {
 		return bitIndex;
 	}
 
-#if REGEN_SIMD_MODE == AVX
+#if REGEN_SIMD_MODE == REGEN_SIMD_AVX
 	static constexpr int8_t RegisterMask = 0xFF; // 8 bits for AVX
-	using Register = __m256; // 8 floats
-	using Register_i = __m256i; // 8 integers
+	using Register = simde__m256; // 8 floats
+	using Register_i = simde__m256i; // 8 integers
 
-	inline __m256 set1_ps(float v) { return _mm256_set1_ps(v); }
-	inline __m256i set1_epi32(int32_t v) { return _mm256_set1_epi32(v); }
-	inline __m256i set1_epi16(uint16_t v) { return _mm256_set1_epi16(v); }
-	inline __m256i set1_epi64(int64_t v) { return _mm256_set1_epi64x(v); }
-	inline __m256i set1_epi64u(uint64_t v) { return _mm256_set1_epi64x(v); }
+	inline Register set1_ps(float v) { return simde_mm256_set1_ps(v); }
+	inline Register_i set1_epi32(int32_t v) { return simde_mm256_set1_epi32(v); }
+	inline Register_i set1_epi16(uint16_t v) { return simde_mm256_set1_epi16(v); }
+	inline Register_i set1_epi64(int64_t v) { return simde_mm256_set1_epi64x(v); }
+	inline Register_i set1_epi64u(uint64_t v) { return simde_mm256_set1_epi64x(v); }
 
-	inline __m256 setzero_ps() { return _mm256_setzero_ps(); }
-	inline __m256i setzero_si256() { return _mm256_setzero_si256(); }
+	inline Register setzero_ps() { return simde_mm256_setzero_ps(); }
+	inline Register_i setzero_si256() { return simde_mm256_setzero_si256(); }
 
-	inline __m256 load_ps(const float *p) { return _mm256_load_ps(p); }
-	inline __m256 loadu_ps(const float *p) { return _mm256_loadu_ps(p); }
+	inline Register load_ps(const float *p) { return simde_mm256_load_ps(p); }
+	inline Register loadu_ps(const float *p) { return simde_mm256_loadu_ps(p); }
 
-	inline __m256i load_si256(const uint16_t *p) {
-		return _mm256_load_si256(reinterpret_cast<const __m256i*>(p));
+	inline Register_i load_si256(const uint16_t *p) {
+		return simde_mm256_load_si256(reinterpret_cast<const Register_i*>(p));
 	}
-	inline __m256i load_si256(const uint32_t *p) {
-		return _mm256_load_si256(reinterpret_cast<const __m256i*>(p));
+	inline Register_i load_si256(const uint32_t *p) {
+		return simde_mm256_load_si256(reinterpret_cast<const Register_i*>(p));
 	}
-	inline __m256i load_si256(const uint64_t *p) {
-		return _mm256_load_si256(reinterpret_cast<const __m256i*>(p));
+	inline Register_i load_si256(const uint64_t *p) {
+		return simde_mm256_load_si256(reinterpret_cast<const Register_i*>(p));
 	}
-	inline __m256i load_si256(const int32_t *p) {
-		return _mm256_load_si256(reinterpret_cast<const __m256i*>(p));
+	inline Register_i load_si256(const int32_t *p) {
+		return simde_mm256_load_si256(reinterpret_cast<const Register_i*>(p));
 	}
 
-	inline __m256i loadu_si256(const uint16_t *p) {
-		return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
+	inline Register_i loadu_si256(const uint16_t *p) {
+		return simde_mm256_loadu_si256(reinterpret_cast<const Register_i*>(p));
 	}
-	inline __m256i loadu_si256(const uint32_t *p) {
-		return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
+	inline Register_i loadu_si256(const uint32_t *p) {
+		return simde_mm256_loadu_si256(reinterpret_cast<const Register_i*>(p));
 	}
-	inline __m256i loadu_si256(const uint64_t *p) {
-		return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
+	inline Register_i loadu_si256(const uint64_t *p) {
+		return simde_mm256_loadu_si256(reinterpret_cast<const Register_i*>(p));
 	}
-	inline __m256i loadu_si256(const int32_t *p) {
-		return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
+	inline Register_i loadu_si256(const int32_t *p) {
+		return simde_mm256_loadu_si256(reinterpret_cast<const Register_i*>(p));
 	}
 
-	inline __m256 epi_to_ps(const __m256i &v) { return _mm256_castsi256_ps(v); }
+	inline Register epi_to_ps(const Register_i &v) { return simde_mm256_castsi256_ps(v); }
 
-	inline __m256 i32gather_ps(const float *p, const __m256i &indices) {
-		return _mm256_i32gather_ps(p, indices, sizeof(float));
+	inline Register i32gather_ps(const float *p, const Register_i &indices) {
+		return simde_mm256_i32gather_ps(p, indices, sizeof(float));
 	}
 
-	inline void storeu_ps(float *p, const __m256 &v) { _mm256_storeu_ps(p, v); }
-	inline void store_ps(float *p, const __m256 &v) { _mm256_store_ps(p, v); }
+	inline void storeu_ps(float *p, const Register &v) { simde_mm256_storeu_ps(p, v); }
+	inline void store_ps(float *p, const Register &v) { simde_mm256_store_ps(p, v); }
 
-	inline void storeu_epi32(int32_t *p, const __m256i &v) {
-		_mm256_storeu_si256(reinterpret_cast<__m256i*>(p), v);
+	inline void storeu_epi32(int32_t *p, const Register_i &v) {
+		simde_mm256_storeu_si256(reinterpret_cast<Register_i*>(p), v);
 	}
-	inline void storeu_epi32(uint32_t *p, const __m256i &v) {
-		_mm256_storeu_si256(reinterpret_cast<__m256i*>(p), v);
+	inline void storeu_epi32(uint32_t *p, const Register_i &v) {
+		simde_mm256_storeu_si256(reinterpret_cast<Register_i*>(p), v);
 	}
-	inline void store_epi32(int32_t *p, const __m256i &v) {
-		_mm256_store_si256(reinterpret_cast<__m256i*>(p), v);
+	inline void store_epi32(int32_t *p, const Register_i &v) {
+		simde_mm256_store_si256(reinterpret_cast<Register_i*>(p), v);
 	}
-	inline void store_epi32(uint32_t *p, const __m256i &v) {
-		_mm256_store_si256(reinterpret_cast<__m256i*>(p), v);
+	inline void store_epi32(uint32_t *p, const Register_i &v) {
+		simde_mm256_store_si256(reinterpret_cast<Register_i*>(p), v);
 	}
 
-	inline __m256 add_ps(const __m256 &a, const __m256 &b) { return _mm256_add_ps(a, b); }
-	inline __m256 sub_ps(const __m256 &a, const __m256 &b) { return _mm256_sub_ps(a, b); }
-	inline __m256 mul_ps(const __m256 &a, const __m256 &b) { return _mm256_mul_ps(a, b); }
-	inline __m256 div_ps(const __m256 &a, const __m256 &b) { return _mm256_div_ps(a, b); }
+	inline Register add_ps(const Register &a, const Register &b) { return simde_mm256_add_ps(a, b); }
+	inline Register sub_ps(const Register &a, const Register &b) { return simde_mm256_sub_ps(a, b); }
+	inline Register mul_ps(const Register &a, const Register &b) { return simde_mm256_mul_ps(a, b); }
+	inline Register div_ps(const Register &a, const Register &b) { return simde_mm256_div_ps(a, b); }
 
 	/**
 	 * Fused multiply-add: (a * b) + c
 	 */
-	inline __m256 mul_add_ps(const __m256 &a, const __m256 &b, const __m256 &c) {
-		return _mm256_fmadd_ps(a, b, c);
+	inline Register mul_add_ps(const Register &a, const Register &b, const Register &c) {
+		return simde_mm256_fmadd_ps(a, b, c);
 	}
 
-	inline __m256i add_epi32(const __m256i &a, const __m256i &b) { return _mm256_add_epi32(a, b); }
-	inline __m256i sub_epi32(const __m256i &a, const __m256i &b) { return _mm256_sub_epi32(a, b); }
-	inline __m256i mul_epi32(const __m256i &a, const __m256i &b) { return _mm256_mullo_epi32(a, b); }
+	inline Register_i add_epi32(const Register_i &a, const Register_i &b) { return simde_mm256_add_epi32(a, b); }
+	inline Register_i sub_epi32(const Register_i &a, const Register_i &b) { return simde_mm256_sub_epi32(a, b); }
+	inline Register_i mul_epi32(const Register_i &a, const Register_i &b) { return simde_mm256_mullo_epi32(a, b); }
 
-	inline __m256 min_ps(const __m256 &a, const __m256 &b) { return _mm256_min_ps(a, b); }
-	inline __m256 max_ps(const __m256 &a, const __m256 &b) { return _mm256_max_ps(a, b); }
-	inline __m256 sqrt_ps(const __m256 &a) { return _mm256_sqrt_ps(a); }
+	inline Register min_ps(const Register &a, const Register &b) { return simde_mm256_min_ps(a, b); }
+	inline Register max_ps(const Register &a, const Register &b) { return simde_mm256_max_ps(a, b); }
+	inline Register sqrt_ps(const Register &a) { return simde_mm256_sqrt_ps(a); }
 
-	inline __m256i min_epi32(const __m256i &a, const __m256i &b) { return _mm256_min_epi32(a, b); }
-	inline __m256i max_epi32(const __m256i &a, const __m256i &b) { return _mm256_max_epi32(a, b); }
+	inline Register_i min_epi32(const Register_i &a, const Register_i &b) { return simde_mm256_min_epi32(a, b); }
+	inline Register_i max_epi32(const Register_i &a, const Register_i &b) { return simde_mm256_max_epi32(a, b); }
 
 	/**
-	 * Horizontal sum of all elements in an __m256
+	 * Horizontal sum of all elements in an Register
 	 */
-	inline float hsum_ps(__m256 v) {
-		__m128 vlow  = _mm256_castps256_ps128(v);        // low 128
-		__m128 vhigh = _mm256_extractf128_ps(v, 1);   // high 128
-		__m128 sum   = _mm_add_ps(vlow, vhigh);       // add low and high parts
-		__m128 shuf  = _mm_movehdup_ps(sum);             // (sum.y, sum.y, sum.w, sum.w)
-		__m128 sums  = _mm_add_ps(sum, shuf);
-		shuf         = _mm_movehl_ps(shuf, sums);     // high half of sums
-		sums         = _mm_add_ss(sums, shuf);
-		return _mm_cvtss_f32(sums);
+	inline float hsum_ps(Register v) {
+		simde__m128 vlow  = simde_mm256_castps256_ps128(v);        // low 128
+		simde__m128 vhigh = simde_mm256_extractf128_ps(v, 1);   // high 128
+		simde__m128 sum   = simde_mm_add_ps(vlow, vhigh);       // add low and high parts
+		simde__m128 shuf  = simde_mm_movehdup_ps(sum);             // (sum.y, sum.y, sum.w, sum.w)
+		simde__m128 sums  = simde_mm_add_ps(sum, shuf);
+		shuf         = simde_mm_movehl_ps(shuf, sums);     // high half of sums
+		sums         = simde_mm_add_ss(sums, shuf);
+		return simde_mm_cvtss_f32(sums);
 	}
 
 	/**
-	 * Horizontal min of all elements in an __m256
+	 * Horizontal min of all elements in an Register
 	 */
-	inline float hmin_ps(__m256 v) {
-		__m128 vlow  = _mm256_castps256_ps128(v);        // low 128
-		__m128 vhigh = _mm256_extractf128_ps(v, 1);   // high 128
-		__m128 min   = _mm_min_ps(vlow, vhigh);       // min low and high parts
-		__m128 shuf  = _mm_movehdup_ps(min);             // (min.y, min.y, min.w, min.w)
-		__m128 mins  = _mm_min_ps(min, shuf);
-		shuf         = _mm_movehl_ps(shuf, mins);     // high half of mins
-		mins         = _mm_min_ss(mins, shuf);
-		return _mm_cvtss_f32(mins);
+	inline float hmin_ps(Register v) {
+		simde__m128 vlow  = simde_mm256_castps256_ps128(v);        // low 128
+		simde__m128 vhigh = simde_mm256_extractf128_ps(v, 1);   // high 128
+		simde__m128 min   = simde_mm_min_ps(vlow, vhigh);       // min low and high parts
+		simde__m128 shuf  = simde_mm_movehdup_ps(min);             // (min.y, min.y, min.w, min.w)
+		simde__m128 mins  = simde_mm_min_ps(min, shuf);
+		shuf         = simde_mm_movehl_ps(shuf, mins);     // high half of mins
+		mins         = simde_mm_min_ss(mins, shuf);
+		return simde_mm_cvtss_f32(mins);
 	}
 
 	/**
-	 * Horizontal max of all elements in an __m256
+	 * Horizontal max of all elements in an Register
 	 */
-	inline float hmax_ps(__m256 v) {
-		__m128 vlow  = _mm256_castps256_ps128(v);        // low 128
-		__m128 vhigh = _mm256_extractf128_ps(v, 1);   // high 128
-		__m128 max   = _mm_max_ps(vlow, vhigh);       // max low and high parts
-		__m128 shuf  = _mm_movehdup_ps(max);             // (max.y, max.y, max.w, max.w)
-		__m128 maxs  = _mm_max_ps(max, shuf);
-		shuf         = _mm_movehl_ps(shuf, maxs);     // high half of maxs
-		maxs         = _mm_max_ss(maxs, shuf);
-		return _mm_cvtss_f32(maxs);
+	inline float hmax_ps(Register v) {
+		simde__m128 vlow  = simde_mm256_castps256_ps128(v);        // low 128
+		simde__m128 vhigh = simde_mm256_extractf128_ps(v, 1);   // high 128
+		simde__m128 max   = simde_mm_max_ps(vlow, vhigh);       // max low and high parts
+		simde__m128 shuf  = simde_mm_movehdup_ps(max);             // (max.y, max.y, max.w, max.w)
+		simde__m128 maxs  = simde_mm_max_ps(max, shuf);
+		shuf         = simde_mm_movehl_ps(shuf, maxs);     // high half of maxs
+		maxs         = simde_mm_max_ss(maxs, shuf);
+		return simde_mm_cvtss_f32(maxs);
 	}
 
-	inline __m256 rcp_ps(const __m256 &a) { return _mm256_rcp_ps(a); }
+	inline Register rcp_ps(const Register &a) { return simde_mm256_rcp_ps(a); }
 
-	inline __m256 cmp_lt(const __m256 &a, const __m256 &b) {
-		return _mm256_cmp_ps(a, b, _CMP_LT_OQ);
+	inline Register cmp_lt(const Register &a, const Register &b) {
+		return simde_mm256_cmp_ps(a, b, _CMP_LT_OQ);
 	}
-	inline __m256 cmp_gt(const __m256 &a, const __m256 &b) {
-		return _mm256_cmp_ps(a, b, _CMP_GT_OQ);
+	inline Register cmp_gt(const Register &a, const Register &b) {
+		return simde_mm256_cmp_ps(a, b, _CMP_GT_OQ);
 	}
-	inline __m256 cmp_eq(const __m256 &a, const __m256 &b) {
-		return _mm256_cmp_ps(a, b, _CMP_EQ_OQ);
+	inline Register cmp_eq(const Register &a, const Register &b) {
+		return simde_mm256_cmp_ps(a, b, _CMP_EQ_OQ);
 	}
-	inline __m256 cmp_neq(const __m256 &a, const __m256 &b) {
-		return _mm256_cmp_ps(a, b, _CMP_NEQ_OQ);
+	inline Register cmp_neq(const Register &a, const Register &b) {
+		return simde_mm256_cmp_ps(a, b, _CMP_NEQ_OQ);
 	}
-	inline __m256 cmp_or(const __m256 &a, const __m256 &b) {
-		return _mm256_or_ps(a, b);
+	inline Register cmp_or(const Register &a, const Register &b) {
+		return simde_mm256_or_ps(a, b);
 	}
-	inline __m256 cmp_and(const __m256 &a, const __m256 &b) {
-		return _mm256_and_ps(a, b);
+	inline Register cmp_and(const Register &a, const Register &b) {
+		return simde_mm256_and_ps(a, b);
 	}
-	inline __m256 cmp_and_not(const __m256 &a, const __m256 &b) {
-		return _mm256_andnot_ps(a, b);
+	inline Register cmp_and_not(const Register &a, const Register &b) {
+		return simde_mm256_andnot_ps(a, b);
 	}
 
-	inline __m256i cvttps_epi32(const __m256 &a) { return _mm256_cvttps_epi32(a); }
+	inline Register_i cvttps_epi32(const Register &a) { return simde_mm256_cvttps_epi32(a); }
 
-	inline int movemask_ps(const __m256 &v) { return _mm256_movemask_ps(v); }
+	inline int movemask_ps(const Register &v) { return simde_mm256_movemask_ps(v); }
 
-	inline __m256 blendv_ps(const __m256 &a, const __m256 &b, const __m256 &mask) {
-		return _mm256_blendv_ps(a, b, mask);
+	inline Register blendv_ps(const Register &a, const Register &b, const Register &mask) {
+		return simde_mm256_blendv_ps(a, b, mask);
 	}
 
-#elif REGEN_SIMD_MODE == SSE
+#elif REGEN_SIMD_MODE == REGEN_SIMD_SSE
 	static constexpr int8_t RegisterMask = 0x0F; // 4 bits for SSE
 	using Register = __m128; // 4 floats
 	using Register_i = __m128i; // 4 integers
 
-	inline __m128 set1_ps(float v) { return _mm_set1_ps(v); }
-	inline __m128i set1_epi32(int32_t v) { return _mm_set1_epi32(v); }
+	inline Register set1_ps(float v) { return simde_mm_set1_ps(v); }
+	inline Register_i set1_epi32(int32_t v) { return simde_mm_set1_epi32(v); }
 
-	inline __m128 setzero_ps() { return _mm_setzero_ps(); }
-	inline __m128i setzero_si256() { return _mm_setzero_si128(); }
+	inline Register setzero_ps() { return simde_mm_setzero_ps(); }
+	inline Register_i setzero_si256() { return simde_mm_setzero_si128(); }
 
-	inline __m128 load_ps(const float *p) { return _mm_load_ps(p); }
-	inline __m128 loadu_ps(const float *p) { return _mm_loadu_ps(p); }
+	inline Register load_ps(const float *p) { return simde_mm_load_ps(p); }
+	inline Register loadu_ps(const float *p) { return simde_mm_loadu_ps(p); }
 
-	inline __m128i loadu_si256(const uint32_t *p) {
-		return _mm_loadu_si128(reinterpret_cast<const __m128i*>(indices));
+	inline Register_i loadu_si256(const uint32_t *p) {
+		return simde_mm_loadu_si128(reinterpret_cast<const Register_i*>(indices));
 	}
 
-	inline __m128 epi_to_ps(const __m128i &v) { return _mm_castsi128_ps(v); }
+	inline Register epi_to_ps(const Register_i &v) { return simde_mm_castsi128_ps(v); }
 
-	inline __m128 i32gather_ps(const float *p, const __m128i &indices) {
-		return _mm_i32gather_ps(p, indices, sizeof(float));
+	inline Register i32gather_ps(const float *p, const Register_i &indices) {
+		return simde_mm_i32gather_ps(p, indices, sizeof(float));
 	}
 
-	inline void storeu_ps(float *p, const __m128 &v) { _mm_storeu_ps(p, v); }
+	inline void storeu_ps(float *p, const Register &v) { simde_mm_storeu_ps(p, v); }
 
-	inline __m128 add_ps(const __m128 &a, const __m128 &b) { return _mm_add_ps(a, b); }
-	inline __m128 sub_ps(const __m128 &a, const __m128 &b) { return _mm_sub_ps(a, b); }
-	inline __m128 mul_ps(const __m128 &a, const __m128 &b) { return _mm_mul_ps(a, b); }
-	inline __m128 div_ps(const __m128 &a, const __m128 &b) { return _mm_div_ps(a, b); }
+	inline Register add_ps(const Register &a, const Register &b) { return simde_mm_add_ps(a, b); }
+	inline Register sub_ps(const Register &a, const Register &b) { return simde_mm_sub_ps(a, b); }
+	inline Register mul_ps(const Register &a, const Register &b) { return simde_mm_mul_ps(a, b); }
+	inline Register div_ps(const Register &a, const Register &b) { return simde_mm_div_ps(a, b); }
 
-	inline __m128i add_epi32(const __m128i &a, const __m128i &b) { return _mm_add_epi32(a, b); }
-	inline __m128i sub_epi32(const __m128i &a, const __m128i &b) { return _mm_sub_epi32(a, b); }
-	inline __m128i mul_epi32(const __m128i &a, const __m128i &b) { return _mm_mullo_epi32(a, b); }
+	inline Register_i add_epi32(const Register_i &a, const Register_i &b) { return simde_mm_add_epi32(a, b); }
+	inline Register_i sub_epi32(const Register_i &a, const Register_i &b) { return simde_mm_sub_epi32(a, b); }
+	inline Register_i mul_epi32(const Register_i &a, const Register_i &b) { return simde_mm_mullo_epi32(a, b); }
 
-	inline __m128 min_ps(const __m128 &a, const __m128 &b) { return _mm_min_ps(a, b); }
-	inline __m128 max_ps(const __m128 &a, const __m128 &b) { return _mm_max_ps(a, b); }
-	inline __m128 sqrt_ps(const __m128 &a) { return _mm_sqrt_ps(a); }
+	inline Register min_ps(const Register &a, const Register &b) { return simde_mm_min_ps(a, b); }
+	inline Register max_ps(const Register &a, const Register &b) { return simde_mm_max_ps(a, b); }
+	inline Register sqrt_ps(const Register &a) { return simde_mm_sqrt_ps(a); }
 
-	inline __m128i min_epi32(const __m128i &a, const __m128i &b) { return _mm_min_epi32(a, b); }
-	inline __m128i max_epi32(const __m128i &a, const __m128i &b) { return _mm_max_epi32(a, b); }
+	inline Register_i min_epi32(const Register_i &a, const Register_i &b) { return simde_mm_min_epi32(a, b); }
+	inline Register_i max_epi32(const Register_i &a, const Register_i &b) { return simde_mm_max_epi32(a, b); }
 
-	inline float hsum_ps(__m128 v) {
-		__m128 shuf = _mm_movehdup_ps(v);  // (v1, v1, v3, v3)
-		__m128 sums = _mm_add_ps(v, shuf);
-		shuf = _mm_movehl_ps(shuf, sums); // (v2 + v3, v3, -, -)
-		sums = _mm_add_ss(sums, shuf);
-		return _mm_cvtss_f32(sums);
+	inline float hsum_ps(Register v) {
+		Register shuf = simde_mm_movehdup_ps(v);  // (v1, v1, v3, v3)
+		Register sums = simde_mm_add_ps(v, shuf);
+		shuf = simde_mm_movehl_ps(shuf, sums); // (v2 + v3, v3, -, -)
+		sums = simde_mm_add_ss(sums, shuf);
+		return simde_mm_cvtss_f32(sums);
 	}
 
-	inline __m128 rcp_ps(const __m128 &a) { return _mm_rcp_ps(a); }
+	inline Register rcp_ps(const Register &a) { return simde_mm_rcp_ps(a); }
 
-	inline __m128 cmp_lt(const __m128 &a, const __m128 &b)  { return _mm_cmplt_ps(a, b); }
-	inline __m128 cmp_gt(const __m128 &a, const __m128 &b)  { return _mm_cmplt_ps(b, a); }
-	inline __m128 cmp_eq(const __m128 &a, const __m128 &b)  { return _mm_cmpeq_ps(a, b); }
-	inline __m128 cmp_neq(const __m128 &a, const __m128 &b) {
-		__m128 eq = _mm_cmpeq_ps(a, b);
-		return _mm_andnot_ps(eq, _mm_castsi128_ps(_mm_set1_epi32(-1)));  // ~eq & all_ones
+	inline Register cmp_lt(const Register &a, const Register &b)  { return simde_mm_cmplt_ps(a, b); }
+	inline Register cmp_gt(const Register &a, const Register &b)  { return simde_mm_cmplt_ps(b, a); }
+	inline Register cmp_eq(const Register &a, const Register &b)  { return simde_mm_cmpeq_ps(a, b); }
+	inline Register cmp_neq(const Register &a, const Register &b) {
+		Register eq = simde_mm_cmpeq_ps(a, b);
+		return simde_mm_andnot_ps(eq, simde_mm_castsi128_ps(simde_mm_set1_epi32(-1)));  // ~eq & all_ones
 	}
-	inline __m128 cmp_or(const __m128 &a, const __m128 &b) { return _mm_or_ps(a, b); }
-	inline __m128 cmp_and(const __m128 &a, const __m128 &b) { return _mm_and_ps(a, b); }
+	inline Register cmp_or(const Register &a, const Register &b) { return simde_mm_or_ps(a, b); }
+	inline Register cmp_and(const Register &a, const Register &b) { return simde_mm_and_ps(a, b); }
 
-	inline __m128i cvttps_epi32(const __m128 &a) { return _mm_cvttps_epi32(a); }
+	inline Register_i cvttps_epi32(const Register &a) { return simde_mm_cvttps_epi32(a); }
 
-	inline int movemask_ps(const __m128 &v) { return _mm_movemask_ps(v); }
+	inline int movemask_ps(const Register &v) { return simde_mm_movemask_ps(v); }
 
-	inline __m128 blendv_ps(const __m128 &a, const __m128 &b, const __m128 &mask) {
-		return _mm_blendv_ps(a, b, mask);
+	inline Register blendv_ps(const Register &a, const Register &b, const Register &mask) {
+		return simde_mm_blendv_ps(a, b, mask);
 	}
 
 #else // Fallback to scalar operations
@@ -752,7 +762,7 @@ namespace regen {
 		}
 
 		static BatchOf_int32 castFloatBatch(const BatchOf_float &v) {
-			return BatchOf_int32{_mm256_castps_si256(v.c)};
+			return BatchOf_int32{simde_mm256_castps_si256(v.c)};
 		}
 
 		/**
@@ -832,7 +842,7 @@ namespace regen {
 		}
 
 		BatchOf_int32 operator&(const BatchOf_int32 &other) const {
-			return BatchOf_int32{_mm256_and_si256(c, other.c)};
+			return BatchOf_int32{simde_mm256_and_si256(c, other.c)};
 		}
 
 		static BatchOf_int32 allZeros() {
@@ -1199,7 +1209,7 @@ namespace regen {
 
 		/**
 		 * Computes the length squared of each vector in the batch.
-		 * @return __m128 containing the length squared for each vector.
+		 * @return Register containing the length squared for each vector.
 		 */
 		BatchOf_float lengthSquared() const {
 			return x*x + y*y + z*z;
@@ -1272,6 +1282,4 @@ namespace regen {
 	template<typename T> using vectorSIMD = std::vector<T, AlignedAllocator<T, 32>>;
 }
 
-// NOLINTEND(portability-simd-intrinsics)
-
 #endif /* REGEN_SIMD_H_ */
diff --git a/regen/compute/threading.h b/regen/compute/threading.h
index 250905e299..9a2d5c2082 100644
--- a/regen/compute/threading.h
+++ b/regen/compute/threading.h
@@ -9,8 +9,8 @@
 #include "regen/memory/aligned-allocator.h"
 
 #if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86)
-    #include <immintrin.h>
-    #define CPU_PAUSE() _mm_pause()
+    #include <regen/compute/simd.h>
+    #define CPU_PAUSE() simde_mm_pause()
 #elif defined(__aarch64__) || defined(__arm__)
     #define CPU_PAUSE() asm volatile("yield" ::: "memory")
 #else
@@ -373,7 +373,7 @@ namespace regen {
 #if 0
 			int spins = 0;
 			while (numJobsRemaining_.load(std::memory_order_acquire) > 0u) {
-				if (++spins < 1000) _mm_pause();
+				if (++spins < 1000) simde_mm_pause();
 				else std::this_thread::yield();
 			}
 #else

From f8b2446061f995a58ccb0d778df354821ecaa41e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Be=C3=9Fler?= <danielb@uni-bremen.de>
Date: Thu, 1 Jan 2026 14:08:11 +0100
Subject: [PATCH 09/15] Added macos build

---
 .github/workflows/main.yml | 27 ++++++++++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 2d67726030..aad0b6884f 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -12,7 +12,6 @@ jobs:
   build-linux:
     runs-on: ubuntu-24.04
     steps:
-    # Note: some steps require the checkout in the root directory
     - uses: actions/checkout@v3
     - name: Build REGEN workspace
       shell: bash
@@ -31,13 +30,16 @@ jobs:
             nlohmann-json3-dev
         mkdir build
         cd build
-        cmake ../ -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=ON -DBUILD_VIDEO_PLAYER=ON
+        cmake ../ \
+          -DCMAKE_BUILD_TYPE=Release \
+          -DBUILD_UNIT_TESTS=ON \
+          -DBUILD_TESTS=ON \
+          -DBUILD_VIDEO_PLAYER=ON
         make -j$(nproc) 2> >(tee "make-output.txt")
     - name: Annotate compilation warnings/errors
       if: ${{github.event_name == 'pull_request'}}
       uses: JacobDomagala/CompileResult@master
       # just so that in case this step fails, the workflow doesn't stop.
-      # this is done as it is unclear how well the action is maintained.
       continue-on-error: true
       with:
         comment_title: Compilation
@@ -61,6 +63,25 @@ jobs:
         name: test results
         path: ./gtest-regen.xml
   ##############################
+  build-macos:
+    runs-on: macos-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Install dependencies
+        run: brew install cmake gcc simde boost assimp openal-soft freetype bullet \
+            glew qt5 doxygen graphviz devil ffmpeg alut nlohmann-json googletest
+      - name: Build REGEN workspace
+        run: |
+          mkdir build
+          cd build
+          cmake \
+            -DCMAKE_C_COMPILER=/opt/homebrew/bin/gcc-14 -DCMAKE_CXX_COMPILER=/opt/homebrew/bin/g++-14 \
+            -DCMAKE_PREFIX_PATH="/opt/homebrew/opt/qt@5" \
+            -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=ON \
+            -DREGEN_EXTRA_INCLUDE_DIRS="/opt/homebrew/opt/openal-soft/include/" \
+            ../
+          make
+  ##############################
   debian-package:
     needs: build-linux
     runs-on: ubuntu-24.04

From e876c04ca8760538a0b50c5698e613f9d47a9620 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Be=C3=9Fler?= <danielb@uni-bremen.de>
Date: Thu, 1 Jan 2026 14:09:25 +0100
Subject: [PATCH 10/15] Added macos build

---
 .github/workflows/main.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index aad0b6884f..e6b5314ab2 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -68,8 +68,7 @@ jobs:
     steps:
       - uses: actions/checkout@v3
       - name: Install dependencies
-        run: brew install cmake gcc simde boost assimp openal-soft freetype bullet \
-            glew qt5 doxygen graphviz devil ffmpeg alut nlohmann-json googletest
+        run: brew install cmake gcc simde boost assimp openal-soft freetype bullet glew qt5 doxygen graphviz devil ffmpeg alut nlohmann-json googletest
       - name: Build REGEN workspace
         run: |
           mkdir build

From 3ad96dd3fd0ed8ec9c89cd73e11407b396ab1286 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Be=C3=9Fler?= <danielb@uni-bremen.de>
Date: Thu, 1 Jan 2026 14:14:51 +0100
Subject: [PATCH 11/15] Added simde dependency

---
 .github/workflows/main.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index e6b5314ab2..09cd378dc9 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -20,7 +20,7 @@ jobs:
         sudo apt-get update -y -qq
         sudo apt-get install libgl-dev \
             libboost-thread-dev libboost-system-dev libboost-date-time-dev libboost-filesystem-dev \
-            libboost-regex-dev libboost-timer-dev \
+            libboost-regex-dev libboost-timer-dev libsimde-dev \
             libassimp-dev libopenal-dev libdevil-dev libfreetype-dev libbullet-dev \
             libglew-dev libglu1-mesa-dev libgl1-mesa-dev \
             libavcodec-dev libavformat-dev libavutil-dev libswscale-dev \
@@ -94,7 +94,7 @@ jobs:
         sudo apt-get update -y -qq
         sudo apt-get install libgl-dev \
             libboost-thread-dev libboost-system-dev libboost-date-time-dev libboost-filesystem-dev \
-            libboost-regex-dev libboost-timer-dev \
+            libboost-regex-dev libboost-timer-dev libsimde-dev \
             libassimp-dev libopenal-dev libdevil-dev libfreetype-dev libbullet-dev \
             libglew-dev libglu1-mesa-dev libgl1-mesa-dev \
             libavcodec-dev libavformat-dev libavutil-dev libswscale-dev \
@@ -133,7 +133,7 @@ jobs:
         sudo apt-get update -y -qq
         sudo apt-get install libgl-dev \
             libboost-thread-dev libboost-system-dev libboost-date-time-dev libboost-filesystem-dev \
-            libboost-regex-dev libboost-timer-dev \
+            libboost-regex-dev libboost-timer-dev libsimde-dev \
             libassimp-dev libopenal-dev libdevil-dev libfreetype-dev libbullet-dev \
             libglew-dev libglu1-mesa-dev libgl1-mesa-dev \
             libavcodec-dev libavformat-dev libavutil-dev libswscale-dev \

From 3647bc682edb2ef6e0f99088182285b8899b662f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Be=C3=9Fler?= <danielb@uni-bremen.de>
Date: Thu, 1 Jan 2026 14:32:20 +0100
Subject: [PATCH 12/15] Refactored MacOS CI into its own workflow

---
 .github/workflows/{main.yml => ci-linux.yml} | 21 +-------------
 .github/workflows/ci-macos.yml               | 30 ++++++++++++++++++++
 README.md                                    |  3 +-
 3 files changed, 33 insertions(+), 21 deletions(-)
 rename .github/workflows/{main.yml => ci-linux.yml} (88%)
 create mode 100644 .github/workflows/ci-macos.yml

diff --git a/.github/workflows/main.yml b/.github/workflows/ci-linux.yml
similarity index 88%
rename from .github/workflows/main.yml
rename to .github/workflows/ci-linux.yml
index 09cd378dc9..8b57a63eb7 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/ci-linux.yml
@@ -1,5 +1,4 @@
-
-name: CI
+name: CI Linux
 on:
   push:
     branches: [ dev ]
@@ -63,24 +62,6 @@ jobs:
         name: test results
         path: ./gtest-regen.xml
   ##############################
-  build-macos:
-    runs-on: macos-latest
-    steps:
-      - uses: actions/checkout@v3
-      - name: Install dependencies
-        run: brew install cmake gcc simde boost assimp openal-soft freetype bullet glew qt5 doxygen graphviz devil ffmpeg alut nlohmann-json googletest
-      - name: Build REGEN workspace
-        run: |
-          mkdir build
-          cd build
-          cmake \
-            -DCMAKE_C_COMPILER=/opt/homebrew/bin/gcc-14 -DCMAKE_CXX_COMPILER=/opt/homebrew/bin/g++-14 \
-            -DCMAKE_PREFIX_PATH="/opt/homebrew/opt/qt@5" \
-            -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=ON \
-            -DREGEN_EXTRA_INCLUDE_DIRS="/opt/homebrew/opt/openal-soft/include/" \
-            ../
-          make
-  ##############################
   debian-package:
     needs: build-linux
     runs-on: ubuntu-24.04
diff --git a/.github/workflows/ci-macos.yml b/.github/workflows/ci-macos.yml
new file mode 100644
index 0000000000..b719cc780d
--- /dev/null
+++ b/.github/workflows/ci-macos.yml
@@ -0,0 +1,30 @@
+name: CI MacOS
+on:
+  push:
+    branches: [ dev ]
+  pull_request:
+    branches: [ dev ]
+  release:
+    types: [published]
+
+jobs:
+  build-macos:
+    runs-on: macos-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Install dependencies
+        run: |
+          brew update \
+          brew install cmake gcc simde boost assimp openal-soft freetype bullet glew qt5 doxygen graphviz devil ffmpeg alut nlohmann-json googletest
+      - name: Build REGEN workspace
+        run: |
+          mkdir build
+          cd build
+          cmake \
+            -DCMAKE_C_COMPILER=/opt/homebrew/bin/gcc-14 -DCMAKE_CXX_COMPILER=/opt/homebrew/bin/g++-14 \
+            -DCMAKE_PREFIX_PATH="/opt/homebrew/opt/qt@5" \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DBUILD_TESTS=ON \
+            -DREGEN_EXTRA_INCLUDE_DIRS="/opt/homebrew/opt/openal-soft/include/" \
+            ../
+          make
diff --git a/README.md b/README.md
index 57a0ebe785..6ba1d8645b 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,8 @@
     <img src="img/gallery.gif" width="160" height="100" />
 </p>
 
-![CI](https://github.com/daniel86/regen/workflows/CI/badge.svg)
+![Linux](https://github.com/daniel86/regen/actions/workflows/ci-linux.yml/badge.svg)
+![MacOS](https://github.com/daniel86/regen/actions/workflows/ci-macos.yml/badge.svg)
 ![Warnings](https://img.shields.io/badge/compiler%20warnings-clean-brightgreen)
 [![Docs](https://img.shields.io/badge/docs-online-blue)](https://daniel86.github.io/regen/)
 ![Debian](https://img.shields.io/badge/debian-.deb%20package-blue)

From bac3005ed45f7f538f1a6e3bf1f6f5e2b29bc2a4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Be=C3=9Fler?= <danielb@uni-bremen.de>
Date: Thu, 1 Jan 2026 14:33:08 +0100
Subject: [PATCH 13/15] Refactored MacOS CI into its own workflow

---
 .github/workflows/ci-macos.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/ci-macos.yml b/.github/workflows/ci-macos.yml
index b719cc780d..409bc2394c 100644
--- a/.github/workflows/ci-macos.yml
+++ b/.github/workflows/ci-macos.yml
@@ -14,7 +14,6 @@ jobs:
       - uses: actions/checkout@v3
       - name: Install dependencies
         run: |
-          brew update \
           brew install cmake gcc simde boost assimp openal-soft freetype bullet glew qt5 doxygen graphviz devil ffmpeg alut nlohmann-json googletest
       - name: Build REGEN workspace
         run: |

From 3fd2e96a63f6ad641836079b603e43728fe3f795 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Be=C3=9Fler?= <danielb@uni-bremen.de>
Date: Thu, 1 Jan 2026 14:41:35 +0100
Subject: [PATCH 14/15] Added set-linux action to avoid some redundancy in
 workflow

---
 .github/actions/setup-linux/action.yml | 19 ++++++++++++++
 .github/workflows/ci-linux.yml         | 36 +++-----------------------
 2 files changed, 22 insertions(+), 33 deletions(-)
 create mode 100644 .github/actions/setup-linux/action.yml

diff --git a/.github/actions/setup-linux/action.yml b/.github/actions/setup-linux/action.yml
new file mode 100644
index 0000000000..b443e3a210
--- /dev/null
+++ b/.github/actions/setup-linux/action.yml
@@ -0,0 +1,19 @@
+name: Setup Linux build environment
+description: Install dependencies
+runs:
+  using: "composite"
+  steps:
+    - shell: bash
+      run: |
+        set -euo pipefail
+        sudo apt-get update -y -qq
+        sudo apt-get install libgl-dev \
+            libboost-thread-dev libboost-system-dev libboost-date-time-dev libboost-filesystem-dev \
+            libboost-regex-dev libboost-timer-dev libsimde-dev \
+            libassimp-dev libopenal-dev libdevil-dev libfreetype-dev libbullet-dev \
+            libglew-dev libglu1-mesa-dev libgl1-mesa-dev \
+            libavcodec-dev libavformat-dev libavutil-dev libswscale-dev \
+            libpng-dev libalut-dev \
+            qtbase5-dev libgtest-dev \
+            doxygen graphviz \
+            nlohmann-json3-dev
diff --git a/.github/workflows/ci-linux.yml b/.github/workflows/ci-linux.yml
index 8b57a63eb7..785792a2b5 100644
--- a/.github/workflows/ci-linux.yml
+++ b/.github/workflows/ci-linux.yml
@@ -12,21 +12,11 @@ jobs:
     runs-on: ubuntu-24.04
     steps:
     - uses: actions/checkout@v3
+    - uses: ./.github/actions/setup-linux
     - name: Build REGEN workspace
       shell: bash
       run: |
         set -euo pipefail
-        sudo apt-get update -y -qq
-        sudo apt-get install libgl-dev \
-            libboost-thread-dev libboost-system-dev libboost-date-time-dev libboost-filesystem-dev \
-            libboost-regex-dev libboost-timer-dev libsimde-dev \
-            libassimp-dev libopenal-dev libdevil-dev libfreetype-dev libbullet-dev \
-            libglew-dev libglu1-mesa-dev libgl1-mesa-dev \
-            libavcodec-dev libavformat-dev libavutil-dev libswscale-dev \
-            libpng-dev libalut-dev \
-            qtbase5-dev libgtest-dev \
-            doxygen graphviz \
-            nlohmann-json3-dev
         mkdir build
         cd build
         cmake ../ \
@@ -67,22 +57,12 @@ jobs:
     runs-on: ubuntu-24.04
     steps:
     - uses: actions/checkout@v3
+    - uses: ./.github/actions/setup-linux
     - name: Create debian package
       if: ${{github.event_name == 'push' || github.event_name == 'release'}}
       shell: bash
       run: |
         set -euo pipefail
-        sudo apt-get update -y -qq
-        sudo apt-get install libgl-dev \
-            libboost-thread-dev libboost-system-dev libboost-date-time-dev libboost-filesystem-dev \
-            libboost-regex-dev libboost-timer-dev libsimde-dev \
-            libassimp-dev libopenal-dev libdevil-dev libfreetype-dev libbullet-dev \
-            libglew-dev libglu1-mesa-dev libgl1-mesa-dev \
-            libavcodec-dev libavformat-dev libavutil-dev libswscale-dev \
-            libpng-dev libalut-dev \
-            qtbase5-dev libgtest-dev \
-            doxygen graphviz \
-            nlohmann-json3-dev
         mkdir build
         cd build
         cmake ../ -DCMAKE_BUILD_TYPE=Release
@@ -106,22 +86,12 @@ jobs:
     runs-on: ubuntu-24.04
     steps:
     - uses: actions/checkout@v3
+    - uses: ./.github/actions/setup-linux
     - name: Run doxygen
       if: ${{github.event_name == 'push' || github.event_name == 'release'}}
       shell: bash
       run: |
         set -euo pipefail
-        sudo apt-get update -y -qq
-        sudo apt-get install libgl-dev \
-            libboost-thread-dev libboost-system-dev libboost-date-time-dev libboost-filesystem-dev \
-            libboost-regex-dev libboost-timer-dev libsimde-dev \
-            libassimp-dev libopenal-dev libdevil-dev libfreetype-dev libbullet-dev \
-            libglew-dev libglu1-mesa-dev libgl1-mesa-dev \
-            libavcodec-dev libavformat-dev libavutil-dev libswscale-dev \
-            libpng-dev libalut-dev \
-            qtbase5-dev libgtest-dev \
-            doxygen graphviz \
-            nlohmann-json3-dev
         mkdir build
         cd build
         cmake ../ -DCMAKE_BUILD_TYPE=Release

From 167ab5e8678c31ee4eb1378c2c36d1545b118a0d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Be=C3=9Fler?= <danielb@uni-bremen.de>
Date: Thu, 1 Jan 2026 15:05:00 +0100
Subject: [PATCH 15/15] Some fixes

---
 CMakeLists.txt                 | 8 +++++---
 regen/compute/radix-sort-cpu.h | 4 ++--
 regen/compute/simd.h           | 6 +++---
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ad4f679dbf..281b34302c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -75,9 +75,11 @@ endif()
 # given in the books "Effective C++" and "More Effective C++"
 # add_definitions( -Weffc++ )
 
-# -march=native enables all instruction subsets supported by the local machine
-# e.g. SSE2, SSE3, SSE4, AVX, AVX2, etc.
-add_definitions( -march=native )
+if(UNIX)
+    # -march=native enables all instruction subsets supported by the local machine
+    # e.g. SSE2, SSE3, SSE4, AVX, AVX2, etc.
+    add_definitions( -march=native )
+endif()
 
 # perform more aggressive floating-point optimizations
 # add_definitions( -ffast-math )
diff --git a/regen/compute/radix-sort-cpu.h b/regen/compute/radix-sort-cpu.h
index 3ea2b0366f..6493311f7b 100644
--- a/regen/compute/radix-sort-cpu.h
+++ b/regen/compute/radix-sort-cpu.h
@@ -140,14 +140,14 @@ namespace regen {
 						// Gather 8 keys manually, and promote to 32-bit
 						for (int k = 0; k < 8; ++k) tmpKeys32[k] = static_cast<int32_t>(keys[src[keyIdx+k]]);
 						r0 = simde_mm256_load_si256(reinterpret_cast<const simde__m256i*>(tmpKeys32));
-						r0 = simde_mm256_and_si256(_mm256_srli_epi32(r0, SHIFT), mask);
+						r0 = simde_mm256_and_si256(simde_mm256_srli_epi32(r0, SHIFT), mask);
 						keyIdx += 8; // processed 8 keys, not 16!
 					}
 					else if constexpr (KEY_TYPE_BITS == 32) {
 						simd::Register_i idx = simde_mm256_loadu_si256(reinterpret_cast<const simde__m256i*>(&src[keyIdx]));
 						// Gather 8 scattered keys, and apply shift and mask to get bucket ids
 						r0 = simde_mm256_i32gather_epi32(reinterpret_cast<const int*>(keys), idx, 4);
-						r0 = simde_mm256_and_si256(_mm256_srli_epi32(r0, SHIFT), mask);
+						r0 = simde_mm256_and_si256(simde_mm256_srli_epi32(r0, SHIFT), mask);
 						keyIdx += KEYS_PER_SIMD_PASS;
 					}
 					else if constexpr (KEY_TYPE_BITS == 64) {
diff --git a/regen/compute/simd.h b/regen/compute/simd.h
index ef16cbac07..ed06be8761 100644
--- a/regen/compute/simd.h
+++ b/regen/compute/simd.h
@@ -206,8 +206,8 @@ namespace regen::simd {
 
 #elif REGEN_SIMD_MODE == REGEN_SIMD_SSE
 	static constexpr int8_t RegisterMask = 0x0F; // 4 bits for SSE
-	using Register = __m128; // 4 floats
-	using Register_i = __m128i; // 4 integers
+	using Register = simde__m128; // 4 floats
+	using Register_i = simde__m128i; // 4 integers
 
 	inline Register set1_ps(float v) { return simde_mm_set1_ps(v); }
 	inline Register_i set1_epi32(int32_t v) { return simde_mm_set1_epi32(v); }
@@ -219,7 +219,7 @@ namespace regen::simd {
 	inline Register loadu_ps(const float *p) { return simde_mm_loadu_ps(p); }
 
 	inline Register_i loadu_si256(const uint32_t *p) {
-		return simde_mm_loadu_si128(reinterpret_cast<const Register_i*>(indices));
+		return simde_mm_loadu_si128(reinterpret_cast<const Register_i*>(p));
 	}
 
 	inline Register epi_to_ps(const Register_i &v) { return simde_mm_castsi128_ps(v); }