From 65ae304a24406cc58a199e63cb6d4a621b7a2507 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Be=C3=9Fler?= Date: Tue, 30 Dec 2025 14:24:27 +0100 Subject: [PATCH 01/15] Restructured workflow --- .github/workflows/main.yml | 93 +++++++++++++++++++++++++------------- 1 file changed, 62 insertions(+), 31 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index c577f58841..08855db299 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -8,7 +8,8 @@ on: release: types: [published] jobs: - build: + ############################## + build-linux: runs-on: ubuntu-24.04 steps: # Note: some steps require the checkout in the root directory @@ -40,12 +41,6 @@ jobs: with: comment_title: Compilation compile_result_file: build/make-output.txt - - name: Create debian package - if: ${{github.event_name == 'push' || github.event_name == 'release'}} - shell: bash - run: | - cd build - cpack - name: Run unit tests if: ${{github.event_name == 'push' || github.event_name == 'pull_request'}} shell: bash @@ -59,12 +54,71 @@ jobs: junit_files: "gtest-regen.xml" action_fail: true action_fail_on_inconclusive: true - ##### + - name: Upload test results + uses: actions/upload-artifact@v4 + with: + name: test results + path: ./gtest-regen.xml + ############################## + debian-package: + runs-on: ubuntu-24.04 + steps: + - uses: actions/checkout@v3 + - name: Create debian package + if: ${{github.event_name == 'push' || github.event_name == 'release'}} + shell: bash + run: | + sudo apt-get update -y -qq + sudo apt-get install libgl-dev \ + libboost-thread-dev libboost-system-dev libboost-date-time-dev libboost-filesystem-dev \ + libboost-regex-dev libboost-timer-dev \ + libassimp-dev libopenal-dev libdevil-dev libfreetype-dev libbullet-dev \ + libglew-dev libglu1-mesa-dev libgl1-mesa-dev \ + libavcodec-dev libavformat-dev libavutil-dev libswscale-dev \ + libpng-dev libalut-dev \ + qtbase5-dev libgtest-dev \ + doxygen graphviz \ + nlohmann-json3-dev + mkdir build + cd build + cmake ../ -DCMAKE_BUILD_TYPE=Release + cpack + - name: Release debian package + if: github.event_name == 'release' + shell: bash + env: + GITHUB_TOKEN: ${{ github.TOKEN }} + run: | + gh release upload ${{github.event.release.tag_name}} ./build/regen-*.deb + - name: Upload debian package + if: github.event_name == 'push' + uses: actions/upload-artifact@v4 + with: + name: debian package + path: ./build/regen-*.deb + ############################## + doxygen: + runs-on: ubuntu-24.04 + steps: + - uses: actions/checkout@v3 - name: Run doxygen if: ${{github.event_name == 'push' || github.event_name == 'release'}} shell: bash run: | + sudo apt-get update -y -qq + sudo apt-get install libgl-dev \ + libboost-thread-dev libboost-system-dev libboost-date-time-dev libboost-filesystem-dev \ + libboost-regex-dev libboost-timer-dev \ + libassimp-dev libopenal-dev libdevil-dev libfreetype-dev libbullet-dev \ + libglew-dev libglu1-mesa-dev libgl1-mesa-dev \ + libavcodec-dev libavformat-dev libavutil-dev libswscale-dev \ + libpng-dev libalut-dev \ + qtbase5-dev libgtest-dev \ + doxygen graphviz \ + nlohmann-json3-dev + mkdir build cd build + cmake ../ -DCMAKE_BUILD_TYPE=Release cmake --build . --target doc cp -r ../img regen/doc/html/ - name: Extract version tag @@ -88,26 +142,3 @@ jobs: BRANCH: gh-pages # The folder the action should deploy. FOLDER: build/regen/doc/html - # The folder in the target branch - TARGET_FOLDER: ${{ env.REGEN_DOCU_VERSION }} - CLEAN: true - SINGLE_COMMIT: true - ##### - - name: Release debian package - if: github.event_name == 'release' - shell: bash - env: - GITHUB_TOKEN: ${{ github.TOKEN }} - run: | - gh release upload ${{github.event.release.tag_name}} ./build/regen-*.deb - - name: Upload debian package - if: github.event_name == 'push' - uses: actions/upload-artifact@v4 - with: - name: debian package - path: ./build/regen-*.deb - - name: Upload test results - uses: actions/upload-artifact@v4 - with: - name: test results - path: ./gtest-regen.xml From 6b2718ba68519f9c06c13e703a734fd9b32b1e32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Be=C3=9Fler?= Date: Tue, 30 Dec 2025 15:02:16 +0100 Subject: [PATCH 02/15] Added -j$(nproc) --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 08855db299..ab10c977e9 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -31,7 +31,7 @@ jobs: mkdir build cd build cmake ../ -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=ON -DBUILD_VIDEO_PLAYER=ON - make 2> >(tee "make-output.txt") + make -j$(nproc) 2> >(tee "make-output.txt") - name: Annotate compilation warnings/errors if: ${{github.event_name == 'pull_request'}} uses: JacobDomagala/CompileResult@master From 67057edc30c0cef32aaba94fd2dd43efeae0fbcf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Be=C3=9Fler?= Date: Tue, 30 Dec 2025 15:10:29 +0100 Subject: [PATCH 03/15] CI improvements --- .github/workflows/main.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index ab10c977e9..2d67726030 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -17,6 +17,7 @@ jobs: - name: Build REGEN workspace shell: bash run: | + set -euo pipefail sudo apt-get update -y -qq sudo apt-get install libgl-dev \ libboost-thread-dev libboost-system-dev libboost-date-time-dev libboost-filesystem-dev \ @@ -61,6 +62,7 @@ jobs: path: ./gtest-regen.xml ############################## debian-package: + needs: build-linux runs-on: ubuntu-24.04 steps: - uses: actions/checkout@v3 @@ -68,6 +70,7 @@ jobs: if: ${{github.event_name == 'push' || github.event_name == 'release'}} shell: bash run: | + set -euo pipefail sudo apt-get update -y -qq sudo apt-get install libgl-dev \ libboost-thread-dev libboost-system-dev libboost-date-time-dev libboost-filesystem-dev \ @@ -98,6 +101,7 @@ jobs: path: ./build/regen-*.deb ############################## doxygen: + needs: build-linux runs-on: ubuntu-24.04 steps: - uses: actions/checkout@v3 @@ -105,6 +109,7 @@ jobs: if: ${{github.event_name == 'push' || github.event_name == 'release'}} shell: bash run: | + set -euo pipefail sudo apt-get update -y -qq sudo apt-get install libgl-dev \ libboost-thread-dev libboost-system-dev libboost-date-time-dev libboost-filesystem-dev \ From ff4dd1398223661ff1efb6e4f140699b4f5f5d38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Be=C3=9Fler?= Date: Tue, 30 Dec 2025 15:29:42 +0100 Subject: [PATCH 04/15] Do not use -mfpmath=sse with apple --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9284168bcf..bb7c009730 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -75,7 +75,7 @@ endif() # given in the books "Effective C++" and "More Effective C++" # add_definitions( -Weffc++ ) -if(UNIX) # gcc options +if(UNIX AND NOT APPLE) add_definitions( -mfpmath=sse -march=native ) endif() From 5dd254f9fce76b3fcd6133f006a5268a8bee3eb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Be=C3=9Fler?= Date: Wed, 31 Dec 2025 13:35:01 +0100 Subject: [PATCH 05/15] Added more badges --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 0880beb1db..57a0ebe785 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,11 @@

![CI](https://github.com/daniel86/regen/workflows/CI/badge.svg) +![Warnings](https://img.shields.io/badge/compiler%20warnings-clean-brightgreen) +[![Docs](https://img.shields.io/badge/docs-online-blue)](https://daniel86.github.io/regen/) +![Debian](https://img.shields.io/badge/debian-.deb%20package-blue) +![GitHub release](https://img.shields.io/github/v/release/daniel86/regen?include_prereleases) +![License](https://img.shields.io/github/license/daniel86/regen) `regen` -- **Real-time Graphics Engine** -- is a modular OpenGL-based C++ engine designed for research and experimentation in real-time rendering, GPU compute, and virtual world simulation. From 8950bfef383dc662f276d45a75190f6a279613dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Be=C3=9Fler?= Date: Wed, 31 Dec 2025 16:05:23 +0100 Subject: [PATCH 06/15] Added missing include --- regen/shader/includer.h | 1 + 1 file changed, 1 insertion(+) diff --git a/regen/shader/includer.h b/regen/shader/includer.h index fc0ea12a88..5be3ec9a50 100644 --- a/regen/shader/includer.h +++ b/regen/shader/includer.h @@ -6,6 +6,7 @@ #include #include #include +#include namespace regen { /** From b9c6132c46d1f2e41eeb28912738c1cbd5f7465d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Be=C3=9Fler?= Date: Thu, 1 Jan 2026 14:06:21 +0100 Subject: [PATCH 07/15] Optional unit test build --- CMakeLists.txt | 50 +++++++++++++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 19 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bb7c009730..ad4f679dbf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -75,9 +75,9 @@ endif() # given in the books "Effective C++" and "More Effective C++" # add_definitions( -Weffc++ ) -if(UNIX AND NOT APPLE) - add_definitions( -mfpmath=sse -march=native ) -endif() +# -march=native enables all instruction subsets supported by the local machine +# e.g. SSE2, SSE3, SSE4, AVX, AVX2, etc. +add_definitions( -march=native ) # perform more aggressive floating-point optimizations # add_definitions( -ffast-math ) @@ -107,6 +107,10 @@ find_package(Boost ${Boost_MIN_VERSION} find_package(Threads REQUIRED) # Font library: text rendering support find_package(Freetype REQUIRED) +# Note: On Linux, it seems needed in addition to link against brotlidec +if (UNIX AND NOT APPLE) + set(FREETYPE_LIBRARIES ${FREETYPE_LIBRARIES} -lbrotlidec) +endif() # JSON library: for serialization find_package(nlohmann_json REQUIRED) set(JSON_LIBRARIES nlohmann_json::nlohmann_json) @@ -184,8 +188,10 @@ message(STATUS " Models:${ASSIMP_INCLUDE_DIRS};") message(STATUS " Fonts:${FREETYPE_INCLUDE_DIRS};") message(STATUS " Physics:${BULLET_INCLUDE_DIRS};") -enable_testing() -find_package(GTest REQUIRED) +if (BUILD_UNIT_TESTS) + enable_testing() + find_package(GTest REQUIRED) +endif () if (HAS_AV_LIBS) # allow includes without AL/ prefix. Some openAL versions require this. @@ -199,7 +205,7 @@ set(REGEN_LIBRARIES ${Boost_LIBRARIES} ${JSON_LIBRARIES} ${ASSIMP_LIBRARIES} - ${FREETYPE_LIBRARIES} -lbrotlidec + ${FREETYPE_LIBRARIES} ${IMG_LIBRARIES} ${AV_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT} @@ -239,6 +245,10 @@ endif() ########### ########### +if(REGEN_EXTRA_INCLUDE_DIRS) + include_directories(${REGEN_EXTRA_INCLUDE_DIRS}) +endif() + # allow includes like even if the engine is not installed include_directories(.) # allow include of generated files @@ -260,19 +270,21 @@ install(FILES img/icon-small.png DESTINATION ${SHARE_INSTALL_PATH}/img) ########## Unit Testings ############## -# add an executable target for GTest. -# but the testing code is partly in the *knowrob* library -# where gtest won't find them without using the "--no-as-needed" -# flag for the linker. -add_executable(all_gtests - tests/gtests.cpp - tests/shapes/quad-tree-test.cpp) -target_link_libraries(all_gtests - -Wl,--whole-archive,--no-as-needed - regen - -Wl,--no-whole-archive - ${Boost_Python_COMPONENT} - ${GTEST_MAIN_LIBRARIES}) +if (BUILD_UNIT_TESTS) + # add an executable target for GTest. + # but the testing code is partly in the *knowrob* library + # where gtest won't find them without using the "--no-as-needed" + # flag for the linker. + add_executable(all_gtests + tests/gtests.cpp + tests/shapes/quad-tree-test.cpp) + target_link_libraries(all_gtests + -Wl,--whole-archive,--no-as-needed + regen + -Wl,--no-whole-archive + ${Boost_Python_COMPONENT} + ${GTEST_MAIN_LIBRARIES}) +endif () ############## ########## packaging From ff545d3bd34b2f5963203da898b6872d6bf43a06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Be=C3=9Fler?= Date: Thu, 1 Jan 2026 14:07:54 +0100 Subject: [PATCH 08/15] Migrated to SIMDe interface --- regen/compute/radix-sort-cpu.h | 18 +- regen/compute/simd.h | 342 +++++++++++++++++---------------- regen/compute/threading.h | 6 +- 3 files changed, 187 insertions(+), 179 deletions(-) diff --git a/regen/compute/radix-sort-cpu.h b/regen/compute/radix-sort-cpu.h index 3374b8ebe4..3ea2b0366f 100644 --- a/regen/compute/radix-sort-cpu.h +++ b/regen/compute/radix-sort-cpu.h @@ -139,23 +139,23 @@ namespace regen { if constexpr (KEY_TYPE_BITS == 16) { // Gather 8 keys manually, and promote to 32-bit for (int k = 0; k < 8; ++k) tmpKeys32[k] = static_cast(keys[src[keyIdx+k]]); - r0 = _mm256_load_si256(reinterpret_cast(tmpKeys32)); - r0 = _mm256_and_si256(_mm256_srli_epi32(r0, SHIFT), mask); + r0 = simde_mm256_load_si256(reinterpret_cast(tmpKeys32)); + r0 = simde_mm256_and_si256(_mm256_srli_epi32(r0, SHIFT), mask); keyIdx += 8; // processed 8 keys, not 16! } else if constexpr (KEY_TYPE_BITS == 32) { - simd::Register_i idx = _mm256_loadu_si256(reinterpret_cast(&src[keyIdx])); + simd::Register_i idx = simde_mm256_loadu_si256(reinterpret_cast(&src[keyIdx])); // Gather 8 scattered keys, and apply shift and mask to get bucket ids - r0 = _mm256_i32gather_epi32(reinterpret_cast(keys), idx, 4); - r0 = _mm256_and_si256(_mm256_srli_epi32(r0, SHIFT), mask); + r0 = simde_mm256_i32gather_epi32(reinterpret_cast(keys), idx, 4); + r0 = simde_mm256_and_si256(_mm256_srli_epi32(r0, SHIFT), mask); keyIdx += KEYS_PER_SIMD_PASS; } else if constexpr (KEY_TYPE_BITS == 64) { // note: values have 32 bits, use __m128i to load only 4 - __m128i idx32 = _mm_loadu_si128(reinterpret_cast(&src[keyIdx])); + simde__m128i idx32 = simde_mm_loadu_si128(reinterpret_cast(&src[keyIdx])); // Gather 4 scattered keys, and apply shift and mask to get bucket ids - r0 = _mm256_i32gather_epi64(reinterpret_cast(keys), idx32, 8); - r0 = _mm256_and_si256(_mm256_srli_epi64(r0, SHIFT), mask); + r0 = simde_mm256_i32gather_epi64(reinterpret_cast(keys), idx32, 8); + r0 = simde_mm256_and_si256(simde_mm256_srli_epi64(r0, SHIFT), mask); keyIdx += KEYS_PER_SIMD_PASS; } else { @@ -163,7 +163,7 @@ namespace regen { break; } // Store results into tmpBins_ and increment histogram - _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmpBins_), r0); + simde_mm256_storeu_si256(reinterpret_cast(tmpBins_), r0); for (auto x : tmpBins_) ++histogram_[x]; } } diff --git a/regen/compute/simd.h b/regen/compute/simd.h index b47af9a8e8..ef16cbac07 100644 --- a/regen/compute/simd.h +++ b/regen/compute/simd.h @@ -4,21 +4,31 @@ #include #include -// NOTE: Check for REGEN_HAS_SIMD, if it is not defined, the SIMD operations will be disabled -// and the code here will fall back to scalar operations. -// NOLINTBEGIN(portability-simd-intrinsics) -#if defined(__AVX__) - #include // AVX - #define REGEN_SIMD_MODE AVX - #define REGEN_SIMD_WIDTH 8 - #define REGEN_HAS_SIMD -#elif defined(__SSE__) - #include // SSE - #define REGEN_SIMD_MODE SSE - #define REGEN_SIMD_WIDTH 4 - #define REGEN_HAS_SIMD +#define SIMDE_ENABLE_NATIVE_ALIASES +#include +#include +#include +#include +#include +#include + +#define REGEN_SIMD_NONE 0 +#define REGEN_SIMD_SSE 1 +#define REGEN_SIMD_AVX 2 + +#if defined(SIMDE_NATURAL_VECTOR_SIZE) + #if SIMDE_NATURAL_VECTOR_SIZE >= 32 + #define REGEN_SIMD_MODE REGEN_SIMD_AVX + #define REGEN_SIMD_WIDTH 8 + #elif SIMDE_NATURAL_VECTOR_SIZE >= 16 + #define REGEN_SIMD_MODE REGEN_SIMD_SSE + #define REGEN_SIMD_WIDTH 4 + #else + #define REGEN_SIMD_MODE REGEN_SIMD_NONE + #define REGEN_SIMD_WIDTH 1 + #endif #else - #define REGEN_SIMD_MODE NONE + #define REGEN_SIMD_MODE REGEN_SIMD_NONE #define REGEN_SIMD_WIDTH 1 #endif @@ -32,236 +42,236 @@ namespace regen::simd { return bitIndex; } -#if REGEN_SIMD_MODE == AVX +#if REGEN_SIMD_MODE == REGEN_SIMD_AVX static constexpr int8_t RegisterMask = 0xFF; // 8 bits for AVX - using Register = __m256; // 8 floats - using Register_i = __m256i; // 8 integers + using Register = simde__m256; // 8 floats + using Register_i = simde__m256i; // 8 integers - inline __m256 set1_ps(float v) { return _mm256_set1_ps(v); } - inline __m256i set1_epi32(int32_t v) { return _mm256_set1_epi32(v); } - inline __m256i set1_epi16(uint16_t v) { return _mm256_set1_epi16(v); } - inline __m256i set1_epi64(int64_t v) { return _mm256_set1_epi64x(v); } - inline __m256i set1_epi64u(uint64_t v) { return _mm256_set1_epi64x(v); } + inline Register set1_ps(float v) { return simde_mm256_set1_ps(v); } + inline Register_i set1_epi32(int32_t v) { return simde_mm256_set1_epi32(v); } + inline Register_i set1_epi16(uint16_t v) { return simde_mm256_set1_epi16(v); } + inline Register_i set1_epi64(int64_t v) { return simde_mm256_set1_epi64x(v); } + inline Register_i set1_epi64u(uint64_t v) { return simde_mm256_set1_epi64x(v); } - inline __m256 setzero_ps() { return _mm256_setzero_ps(); } - inline __m256i setzero_si256() { return _mm256_setzero_si256(); } + inline Register setzero_ps() { return simde_mm256_setzero_ps(); } + inline Register_i setzero_si256() { return simde_mm256_setzero_si256(); } - inline __m256 load_ps(const float *p) { return _mm256_load_ps(p); } - inline __m256 loadu_ps(const float *p) { return _mm256_loadu_ps(p); } + inline Register load_ps(const float *p) { return simde_mm256_load_ps(p); } + inline Register loadu_ps(const float *p) { return simde_mm256_loadu_ps(p); } - inline __m256i load_si256(const uint16_t *p) { - return _mm256_load_si256(reinterpret_cast(p)); + inline Register_i load_si256(const uint16_t *p) { + return simde_mm256_load_si256(reinterpret_cast(p)); } - inline __m256i load_si256(const uint32_t *p) { - return _mm256_load_si256(reinterpret_cast(p)); + inline Register_i load_si256(const uint32_t *p) { + return simde_mm256_load_si256(reinterpret_cast(p)); } - inline __m256i load_si256(const uint64_t *p) { - return _mm256_load_si256(reinterpret_cast(p)); + inline Register_i load_si256(const uint64_t *p) { + return simde_mm256_load_si256(reinterpret_cast(p)); } - inline __m256i load_si256(const int32_t *p) { - return _mm256_load_si256(reinterpret_cast(p)); + inline Register_i load_si256(const int32_t *p) { + return simde_mm256_load_si256(reinterpret_cast(p)); } - inline __m256i loadu_si256(const uint16_t *p) { - return _mm256_loadu_si256(reinterpret_cast(p)); + inline Register_i loadu_si256(const uint16_t *p) { + return simde_mm256_loadu_si256(reinterpret_cast(p)); } - inline __m256i loadu_si256(const uint32_t *p) { - return _mm256_loadu_si256(reinterpret_cast(p)); + inline Register_i loadu_si256(const uint32_t *p) { + return simde_mm256_loadu_si256(reinterpret_cast(p)); } - inline __m256i loadu_si256(const uint64_t *p) { - return _mm256_loadu_si256(reinterpret_cast(p)); + inline Register_i loadu_si256(const uint64_t *p) { + return simde_mm256_loadu_si256(reinterpret_cast(p)); } - inline __m256i loadu_si256(const int32_t *p) { - return _mm256_loadu_si256(reinterpret_cast(p)); + inline Register_i loadu_si256(const int32_t *p) { + return simde_mm256_loadu_si256(reinterpret_cast(p)); } - inline __m256 epi_to_ps(const __m256i &v) { return _mm256_castsi256_ps(v); } + inline Register epi_to_ps(const Register_i &v) { return simde_mm256_castsi256_ps(v); } - inline __m256 i32gather_ps(const float *p, const __m256i &indices) { - return _mm256_i32gather_ps(p, indices, sizeof(float)); + inline Register i32gather_ps(const float *p, const Register_i &indices) { + return simde_mm256_i32gather_ps(p, indices, sizeof(float)); } - inline void storeu_ps(float *p, const __m256 &v) { _mm256_storeu_ps(p, v); } - inline void store_ps(float *p, const __m256 &v) { _mm256_store_ps(p, v); } + inline void storeu_ps(float *p, const Register &v) { simde_mm256_storeu_ps(p, v); } + inline void store_ps(float *p, const Register &v) { simde_mm256_store_ps(p, v); } - inline void storeu_epi32(int32_t *p, const __m256i &v) { - _mm256_storeu_si256(reinterpret_cast<__m256i*>(p), v); + inline void storeu_epi32(int32_t *p, const Register_i &v) { + simde_mm256_storeu_si256(reinterpret_cast(p), v); } - inline void storeu_epi32(uint32_t *p, const __m256i &v) { - _mm256_storeu_si256(reinterpret_cast<__m256i*>(p), v); + inline void storeu_epi32(uint32_t *p, const Register_i &v) { + simde_mm256_storeu_si256(reinterpret_cast(p), v); } - inline void store_epi32(int32_t *p, const __m256i &v) { - _mm256_store_si256(reinterpret_cast<__m256i*>(p), v); + inline void store_epi32(int32_t *p, const Register_i &v) { + simde_mm256_store_si256(reinterpret_cast(p), v); } - inline void store_epi32(uint32_t *p, const __m256i &v) { - _mm256_store_si256(reinterpret_cast<__m256i*>(p), v); + inline void store_epi32(uint32_t *p, const Register_i &v) { + simde_mm256_store_si256(reinterpret_cast(p), v); } - inline __m256 add_ps(const __m256 &a, const __m256 &b) { return _mm256_add_ps(a, b); } - inline __m256 sub_ps(const __m256 &a, const __m256 &b) { return _mm256_sub_ps(a, b); } - inline __m256 mul_ps(const __m256 &a, const __m256 &b) { return _mm256_mul_ps(a, b); } - inline __m256 div_ps(const __m256 &a, const __m256 &b) { return _mm256_div_ps(a, b); } + inline Register add_ps(const Register &a, const Register &b) { return simde_mm256_add_ps(a, b); } + inline Register sub_ps(const Register &a, const Register &b) { return simde_mm256_sub_ps(a, b); } + inline Register mul_ps(const Register &a, const Register &b) { return simde_mm256_mul_ps(a, b); } + inline Register div_ps(const Register &a, const Register &b) { return simde_mm256_div_ps(a, b); } /** * Fused multiply-add: (a * b) + c */ - inline __m256 mul_add_ps(const __m256 &a, const __m256 &b, const __m256 &c) { - return _mm256_fmadd_ps(a, b, c); + inline Register mul_add_ps(const Register &a, const Register &b, const Register &c) { + return simde_mm256_fmadd_ps(a, b, c); } - inline __m256i add_epi32(const __m256i &a, const __m256i &b) { return _mm256_add_epi32(a, b); } - inline __m256i sub_epi32(const __m256i &a, const __m256i &b) { return _mm256_sub_epi32(a, b); } - inline __m256i mul_epi32(const __m256i &a, const __m256i &b) { return _mm256_mullo_epi32(a, b); } + inline Register_i add_epi32(const Register_i &a, const Register_i &b) { return simde_mm256_add_epi32(a, b); } + inline Register_i sub_epi32(const Register_i &a, const Register_i &b) { return simde_mm256_sub_epi32(a, b); } + inline Register_i mul_epi32(const Register_i &a, const Register_i &b) { return simde_mm256_mullo_epi32(a, b); } - inline __m256 min_ps(const __m256 &a, const __m256 &b) { return _mm256_min_ps(a, b); } - inline __m256 max_ps(const __m256 &a, const __m256 &b) { return _mm256_max_ps(a, b); } - inline __m256 sqrt_ps(const __m256 &a) { return _mm256_sqrt_ps(a); } + inline Register min_ps(const Register &a, const Register &b) { return simde_mm256_min_ps(a, b); } + inline Register max_ps(const Register &a, const Register &b) { return simde_mm256_max_ps(a, b); } + inline Register sqrt_ps(const Register &a) { return simde_mm256_sqrt_ps(a); } - inline __m256i min_epi32(const __m256i &a, const __m256i &b) { return _mm256_min_epi32(a, b); } - inline __m256i max_epi32(const __m256i &a, const __m256i &b) { return _mm256_max_epi32(a, b); } + inline Register_i min_epi32(const Register_i &a, const Register_i &b) { return simde_mm256_min_epi32(a, b); } + inline Register_i max_epi32(const Register_i &a, const Register_i &b) { return simde_mm256_max_epi32(a, b); } /** - * Horizontal sum of all elements in an __m256 + * Horizontal sum of all elements in an Register */ - inline float hsum_ps(__m256 v) { - __m128 vlow = _mm256_castps256_ps128(v); // low 128 - __m128 vhigh = _mm256_extractf128_ps(v, 1); // high 128 - __m128 sum = _mm_add_ps(vlow, vhigh); // add low and high parts - __m128 shuf = _mm_movehdup_ps(sum); // (sum.y, sum.y, sum.w, sum.w) - __m128 sums = _mm_add_ps(sum, shuf); - shuf = _mm_movehl_ps(shuf, sums); // high half of sums - sums = _mm_add_ss(sums, shuf); - return _mm_cvtss_f32(sums); + inline float hsum_ps(Register v) { + simde__m128 vlow = simde_mm256_castps256_ps128(v); // low 128 + simde__m128 vhigh = simde_mm256_extractf128_ps(v, 1); // high 128 + simde__m128 sum = simde_mm_add_ps(vlow, vhigh); // add low and high parts + simde__m128 shuf = simde_mm_movehdup_ps(sum); // (sum.y, sum.y, sum.w, sum.w) + simde__m128 sums = simde_mm_add_ps(sum, shuf); + shuf = simde_mm_movehl_ps(shuf, sums); // high half of sums + sums = simde_mm_add_ss(sums, shuf); + return simde_mm_cvtss_f32(sums); } /** - * Horizontal min of all elements in an __m256 + * Horizontal min of all elements in an Register */ - inline float hmin_ps(__m256 v) { - __m128 vlow = _mm256_castps256_ps128(v); // low 128 - __m128 vhigh = _mm256_extractf128_ps(v, 1); // high 128 - __m128 min = _mm_min_ps(vlow, vhigh); // min low and high parts - __m128 shuf = _mm_movehdup_ps(min); // (min.y, min.y, min.w, min.w) - __m128 mins = _mm_min_ps(min, shuf); - shuf = _mm_movehl_ps(shuf, mins); // high half of mins - mins = _mm_min_ss(mins, shuf); - return _mm_cvtss_f32(mins); + inline float hmin_ps(Register v) { + simde__m128 vlow = simde_mm256_castps256_ps128(v); // low 128 + simde__m128 vhigh = simde_mm256_extractf128_ps(v, 1); // high 128 + simde__m128 min = simde_mm_min_ps(vlow, vhigh); // min low and high parts + simde__m128 shuf = simde_mm_movehdup_ps(min); // (min.y, min.y, min.w, min.w) + simde__m128 mins = simde_mm_min_ps(min, shuf); + shuf = simde_mm_movehl_ps(shuf, mins); // high half of mins + mins = simde_mm_min_ss(mins, shuf); + return simde_mm_cvtss_f32(mins); } /** - * Horizontal max of all elements in an __m256 + * Horizontal max of all elements in an Register */ - inline float hmax_ps(__m256 v) { - __m128 vlow = _mm256_castps256_ps128(v); // low 128 - __m128 vhigh = _mm256_extractf128_ps(v, 1); // high 128 - __m128 max = _mm_max_ps(vlow, vhigh); // max low and high parts - __m128 shuf = _mm_movehdup_ps(max); // (max.y, max.y, max.w, max.w) - __m128 maxs = _mm_max_ps(max, shuf); - shuf = _mm_movehl_ps(shuf, maxs); // high half of maxs - maxs = _mm_max_ss(maxs, shuf); - return _mm_cvtss_f32(maxs); + inline float hmax_ps(Register v) { + simde__m128 vlow = simde_mm256_castps256_ps128(v); // low 128 + simde__m128 vhigh = simde_mm256_extractf128_ps(v, 1); // high 128 + simde__m128 max = simde_mm_max_ps(vlow, vhigh); // max low and high parts + simde__m128 shuf = simde_mm_movehdup_ps(max); // (max.y, max.y, max.w, max.w) + simde__m128 maxs = simde_mm_max_ps(max, shuf); + shuf = simde_mm_movehl_ps(shuf, maxs); // high half of maxs + maxs = simde_mm_max_ss(maxs, shuf); + return simde_mm_cvtss_f32(maxs); } - inline __m256 rcp_ps(const __m256 &a) { return _mm256_rcp_ps(a); } + inline Register rcp_ps(const Register &a) { return simde_mm256_rcp_ps(a); } - inline __m256 cmp_lt(const __m256 &a, const __m256 &b) { - return _mm256_cmp_ps(a, b, _CMP_LT_OQ); + inline Register cmp_lt(const Register &a, const Register &b) { + return simde_mm256_cmp_ps(a, b, _CMP_LT_OQ); } - inline __m256 cmp_gt(const __m256 &a, const __m256 &b) { - return _mm256_cmp_ps(a, b, _CMP_GT_OQ); + inline Register cmp_gt(const Register &a, const Register &b) { + return simde_mm256_cmp_ps(a, b, _CMP_GT_OQ); } - inline __m256 cmp_eq(const __m256 &a, const __m256 &b) { - return _mm256_cmp_ps(a, b, _CMP_EQ_OQ); + inline Register cmp_eq(const Register &a, const Register &b) { + return simde_mm256_cmp_ps(a, b, _CMP_EQ_OQ); } - inline __m256 cmp_neq(const __m256 &a, const __m256 &b) { - return _mm256_cmp_ps(a, b, _CMP_NEQ_OQ); + inline Register cmp_neq(const Register &a, const Register &b) { + return simde_mm256_cmp_ps(a, b, _CMP_NEQ_OQ); } - inline __m256 cmp_or(const __m256 &a, const __m256 &b) { - return _mm256_or_ps(a, b); + inline Register cmp_or(const Register &a, const Register &b) { + return simde_mm256_or_ps(a, b); } - inline __m256 cmp_and(const __m256 &a, const __m256 &b) { - return _mm256_and_ps(a, b); + inline Register cmp_and(const Register &a, const Register &b) { + return simde_mm256_and_ps(a, b); } - inline __m256 cmp_and_not(const __m256 &a, const __m256 &b) { - return _mm256_andnot_ps(a, b); + inline Register cmp_and_not(const Register &a, const Register &b) { + return simde_mm256_andnot_ps(a, b); } - inline __m256i cvttps_epi32(const __m256 &a) { return _mm256_cvttps_epi32(a); } + inline Register_i cvttps_epi32(const Register &a) { return simde_mm256_cvttps_epi32(a); } - inline int movemask_ps(const __m256 &v) { return _mm256_movemask_ps(v); } + inline int movemask_ps(const Register &v) { return simde_mm256_movemask_ps(v); } - inline __m256 blendv_ps(const __m256 &a, const __m256 &b, const __m256 &mask) { - return _mm256_blendv_ps(a, b, mask); + inline Register blendv_ps(const Register &a, const Register &b, const Register &mask) { + return simde_mm256_blendv_ps(a, b, mask); } -#elif REGEN_SIMD_MODE == SSE +#elif REGEN_SIMD_MODE == REGEN_SIMD_SSE static constexpr int8_t RegisterMask = 0x0F; // 4 bits for SSE using Register = __m128; // 4 floats using Register_i = __m128i; // 4 integers - inline __m128 set1_ps(float v) { return _mm_set1_ps(v); } - inline __m128i set1_epi32(int32_t v) { return _mm_set1_epi32(v); } + inline Register set1_ps(float v) { return simde_mm_set1_ps(v); } + inline Register_i set1_epi32(int32_t v) { return simde_mm_set1_epi32(v); } - inline __m128 setzero_ps() { return _mm_setzero_ps(); } - inline __m128i setzero_si256() { return _mm_setzero_si128(); } + inline Register setzero_ps() { return simde_mm_setzero_ps(); } + inline Register_i setzero_si256() { return simde_mm_setzero_si128(); } - inline __m128 load_ps(const float *p) { return _mm_load_ps(p); } - inline __m128 loadu_ps(const float *p) { return _mm_loadu_ps(p); } + inline Register load_ps(const float *p) { return simde_mm_load_ps(p); } + inline Register loadu_ps(const float *p) { return simde_mm_loadu_ps(p); } - inline __m128i loadu_si256(const uint32_t *p) { - return _mm_loadu_si128(reinterpret_cast(indices)); + inline Register_i loadu_si256(const uint32_t *p) { + return simde_mm_loadu_si128(reinterpret_cast(indices)); } - inline __m128 epi_to_ps(const __m128i &v) { return _mm_castsi128_ps(v); } + inline Register epi_to_ps(const Register_i &v) { return simde_mm_castsi128_ps(v); } - inline __m128 i32gather_ps(const float *p, const __m128i &indices) { - return _mm_i32gather_ps(p, indices, sizeof(float)); + inline Register i32gather_ps(const float *p, const Register_i &indices) { + return simde_mm_i32gather_ps(p, indices, sizeof(float)); } - inline void storeu_ps(float *p, const __m128 &v) { _mm_storeu_ps(p, v); } + inline void storeu_ps(float *p, const Register &v) { simde_mm_storeu_ps(p, v); } - inline __m128 add_ps(const __m128 &a, const __m128 &b) { return _mm_add_ps(a, b); } - inline __m128 sub_ps(const __m128 &a, const __m128 &b) { return _mm_sub_ps(a, b); } - inline __m128 mul_ps(const __m128 &a, const __m128 &b) { return _mm_mul_ps(a, b); } - inline __m128 div_ps(const __m128 &a, const __m128 &b) { return _mm_div_ps(a, b); } + inline Register add_ps(const Register &a, const Register &b) { return simde_mm_add_ps(a, b); } + inline Register sub_ps(const Register &a, const Register &b) { return simde_mm_sub_ps(a, b); } + inline Register mul_ps(const Register &a, const Register &b) { return simde_mm_mul_ps(a, b); } + inline Register div_ps(const Register &a, const Register &b) { return simde_mm_div_ps(a, b); } - inline __m128i add_epi32(const __m128i &a, const __m128i &b) { return _mm_add_epi32(a, b); } - inline __m128i sub_epi32(const __m128i &a, const __m128i &b) { return _mm_sub_epi32(a, b); } - inline __m128i mul_epi32(const __m128i &a, const __m128i &b) { return _mm_mullo_epi32(a, b); } + inline Register_i add_epi32(const Register_i &a, const Register_i &b) { return simde_mm_add_epi32(a, b); } + inline Register_i sub_epi32(const Register_i &a, const Register_i &b) { return simde_mm_sub_epi32(a, b); } + inline Register_i mul_epi32(const Register_i &a, const Register_i &b) { return simde_mm_mullo_epi32(a, b); } - inline __m128 min_ps(const __m128 &a, const __m128 &b) { return _mm_min_ps(a, b); } - inline __m128 max_ps(const __m128 &a, const __m128 &b) { return _mm_max_ps(a, b); } - inline __m128 sqrt_ps(const __m128 &a) { return _mm_sqrt_ps(a); } + inline Register min_ps(const Register &a, const Register &b) { return simde_mm_min_ps(a, b); } + inline Register max_ps(const Register &a, const Register &b) { return simde_mm_max_ps(a, b); } + inline Register sqrt_ps(const Register &a) { return simde_mm_sqrt_ps(a); } - inline __m128i min_epi32(const __m128i &a, const __m128i &b) { return _mm_min_epi32(a, b); } - inline __m128i max_epi32(const __m128i &a, const __m128i &b) { return _mm_max_epi32(a, b); } + inline Register_i min_epi32(const Register_i &a, const Register_i &b) { return simde_mm_min_epi32(a, b); } + inline Register_i max_epi32(const Register_i &a, const Register_i &b) { return simde_mm_max_epi32(a, b); } - inline float hsum_ps(__m128 v) { - __m128 shuf = _mm_movehdup_ps(v); // (v1, v1, v3, v3) - __m128 sums = _mm_add_ps(v, shuf); - shuf = _mm_movehl_ps(shuf, sums); // (v2 + v3, v3, -, -) - sums = _mm_add_ss(sums, shuf); - return _mm_cvtss_f32(sums); + inline float hsum_ps(Register v) { + Register shuf = simde_mm_movehdup_ps(v); // (v1, v1, v3, v3) + Register sums = simde_mm_add_ps(v, shuf); + shuf = simde_mm_movehl_ps(shuf, sums); // (v2 + v3, v3, -, -) + sums = simde_mm_add_ss(sums, shuf); + return simde_mm_cvtss_f32(sums); } - inline __m128 rcp_ps(const __m128 &a) { return _mm_rcp_ps(a); } + inline Register rcp_ps(const Register &a) { return simde_mm_rcp_ps(a); } - inline __m128 cmp_lt(const __m128 &a, const __m128 &b) { return _mm_cmplt_ps(a, b); } - inline __m128 cmp_gt(const __m128 &a, const __m128 &b) { return _mm_cmplt_ps(b, a); } - inline __m128 cmp_eq(const __m128 &a, const __m128 &b) { return _mm_cmpeq_ps(a, b); } - inline __m128 cmp_neq(const __m128 &a, const __m128 &b) { - __m128 eq = _mm_cmpeq_ps(a, b); - return _mm_andnot_ps(eq, _mm_castsi128_ps(_mm_set1_epi32(-1))); // ~eq & all_ones + inline Register cmp_lt(const Register &a, const Register &b) { return simde_mm_cmplt_ps(a, b); } + inline Register cmp_gt(const Register &a, const Register &b) { return simde_mm_cmplt_ps(b, a); } + inline Register cmp_eq(const Register &a, const Register &b) { return simde_mm_cmpeq_ps(a, b); } + inline Register cmp_neq(const Register &a, const Register &b) { + Register eq = simde_mm_cmpeq_ps(a, b); + return simde_mm_andnot_ps(eq, simde_mm_castsi128_ps(simde_mm_set1_epi32(-1))); // ~eq & all_ones } - inline __m128 cmp_or(const __m128 &a, const __m128 &b) { return _mm_or_ps(a, b); } - inline __m128 cmp_and(const __m128 &a, const __m128 &b) { return _mm_and_ps(a, b); } + inline Register cmp_or(const Register &a, const Register &b) { return simde_mm_or_ps(a, b); } + inline Register cmp_and(const Register &a, const Register &b) { return simde_mm_and_ps(a, b); } - inline __m128i cvttps_epi32(const __m128 &a) { return _mm_cvttps_epi32(a); } + inline Register_i cvttps_epi32(const Register &a) { return simde_mm_cvttps_epi32(a); } - inline int movemask_ps(const __m128 &v) { return _mm_movemask_ps(v); } + inline int movemask_ps(const Register &v) { return simde_mm_movemask_ps(v); } - inline __m128 blendv_ps(const __m128 &a, const __m128 &b, const __m128 &mask) { - return _mm_blendv_ps(a, b, mask); + inline Register blendv_ps(const Register &a, const Register &b, const Register &mask) { + return simde_mm_blendv_ps(a, b, mask); } #else // Fallback to scalar operations @@ -752,7 +762,7 @@ namespace regen { } static BatchOf_int32 castFloatBatch(const BatchOf_float &v) { - return BatchOf_int32{_mm256_castps_si256(v.c)}; + return BatchOf_int32{simde_mm256_castps_si256(v.c)}; } /** @@ -832,7 +842,7 @@ namespace regen { } BatchOf_int32 operator&(const BatchOf_int32 &other) const { - return BatchOf_int32{_mm256_and_si256(c, other.c)}; + return BatchOf_int32{simde_mm256_and_si256(c, other.c)}; } static BatchOf_int32 allZeros() { @@ -1199,7 +1209,7 @@ namespace regen { /** * Computes the length squared of each vector in the batch. - * @return __m128 containing the length squared for each vector. + * @return Register containing the length squared for each vector. */ BatchOf_float lengthSquared() const { return x*x + y*y + z*z; @@ -1272,6 +1282,4 @@ namespace regen { template using vectorSIMD = std::vector>; } -// NOLINTEND(portability-simd-intrinsics) - #endif /* REGEN_SIMD_H_ */ diff --git a/regen/compute/threading.h b/regen/compute/threading.h index 250905e299..9a2d5c2082 100644 --- a/regen/compute/threading.h +++ b/regen/compute/threading.h @@ -9,8 +9,8 @@ #include "regen/memory/aligned-allocator.h" #if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86) - #include - #define CPU_PAUSE() _mm_pause() + #include + #define CPU_PAUSE() simde_mm_pause() #elif defined(__aarch64__) || defined(__arm__) #define CPU_PAUSE() asm volatile("yield" ::: "memory") #else @@ -373,7 +373,7 @@ namespace regen { #if 0 int spins = 0; while (numJobsRemaining_.load(std::memory_order_acquire) > 0u) { - if (++spins < 1000) _mm_pause(); + if (++spins < 1000) simde_mm_pause(); else std::this_thread::yield(); } #else From f8b2446061f995a58ccb0d778df354821ecaa41e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Be=C3=9Fler?= Date: Thu, 1 Jan 2026 14:08:11 +0100 Subject: [PATCH 09/15] Added macos build --- .github/workflows/main.yml | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 2d67726030..aad0b6884f 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -12,7 +12,6 @@ jobs: build-linux: runs-on: ubuntu-24.04 steps: - # Note: some steps require the checkout in the root directory - uses: actions/checkout@v3 - name: Build REGEN workspace shell: bash @@ -31,13 +30,16 @@ jobs: nlohmann-json3-dev mkdir build cd build - cmake ../ -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=ON -DBUILD_VIDEO_PLAYER=ON + cmake ../ \ + -DCMAKE_BUILD_TYPE=Release \ + -DBUILD_UNIT_TESTS=ON \ + -DBUILD_TESTS=ON \ + -DBUILD_VIDEO_PLAYER=ON make -j$(nproc) 2> >(tee "make-output.txt") - name: Annotate compilation warnings/errors if: ${{github.event_name == 'pull_request'}} uses: JacobDomagala/CompileResult@master # just so that in case this step fails, the workflow doesn't stop. - # this is done as it is unclear how well the action is maintained. continue-on-error: true with: comment_title: Compilation @@ -61,6 +63,25 @@ jobs: name: test results path: ./gtest-regen.xml ############################## + build-macos: + runs-on: macos-latest + steps: + - uses: actions/checkout@v3 + - name: Install dependencies + run: brew install cmake gcc simde boost assimp openal-soft freetype bullet \ + glew qt5 doxygen graphviz devil ffmpeg alut nlohmann-json googletest + - name: Build REGEN workspace + run: | + mkdir build + cd build + cmake \ + -DCMAKE_C_COMPILER=/opt/homebrew/bin/gcc-14 -DCMAKE_CXX_COMPILER=/opt/homebrew/bin/g++-14 \ + -DCMAKE_PREFIX_PATH="/opt/homebrew/opt/qt@5" \ + -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=ON \ + -DREGEN_EXTRA_INCLUDE_DIRS="/opt/homebrew/opt/openal-soft/include/" \ + ../ + make + ############################## debian-package: needs: build-linux runs-on: ubuntu-24.04 From e876c04ca8760538a0b50c5698e613f9d47a9620 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Be=C3=9Fler?= Date: Thu, 1 Jan 2026 14:09:25 +0100 Subject: [PATCH 10/15] Added macos build --- .github/workflows/main.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index aad0b6884f..e6b5314ab2 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -68,8 +68,7 @@ jobs: steps: - uses: actions/checkout@v3 - name: Install dependencies - run: brew install cmake gcc simde boost assimp openal-soft freetype bullet \ - glew qt5 doxygen graphviz devil ffmpeg alut nlohmann-json googletest + run: brew install cmake gcc simde boost assimp openal-soft freetype bullet glew qt5 doxygen graphviz devil ffmpeg alut nlohmann-json googletest - name: Build REGEN workspace run: | mkdir build From 3ad96dd3fd0ed8ec9c89cd73e11407b396ab1286 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Be=C3=9Fler?= Date: Thu, 1 Jan 2026 14:14:51 +0100 Subject: [PATCH 11/15] Added simde dependency --- .github/workflows/main.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index e6b5314ab2..09cd378dc9 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -20,7 +20,7 @@ jobs: sudo apt-get update -y -qq sudo apt-get install libgl-dev \ libboost-thread-dev libboost-system-dev libboost-date-time-dev libboost-filesystem-dev \ - libboost-regex-dev libboost-timer-dev \ + libboost-regex-dev libboost-timer-dev libsimde-dev \ libassimp-dev libopenal-dev libdevil-dev libfreetype-dev libbullet-dev \ libglew-dev libglu1-mesa-dev libgl1-mesa-dev \ libavcodec-dev libavformat-dev libavutil-dev libswscale-dev \ @@ -94,7 +94,7 @@ jobs: sudo apt-get update -y -qq sudo apt-get install libgl-dev \ libboost-thread-dev libboost-system-dev libboost-date-time-dev libboost-filesystem-dev \ - libboost-regex-dev libboost-timer-dev \ + libboost-regex-dev libboost-timer-dev libsimde-dev \ libassimp-dev libopenal-dev libdevil-dev libfreetype-dev libbullet-dev \ libglew-dev libglu1-mesa-dev libgl1-mesa-dev \ libavcodec-dev libavformat-dev libavutil-dev libswscale-dev \ @@ -133,7 +133,7 @@ jobs: sudo apt-get update -y -qq sudo apt-get install libgl-dev \ libboost-thread-dev libboost-system-dev libboost-date-time-dev libboost-filesystem-dev \ - libboost-regex-dev libboost-timer-dev \ + libboost-regex-dev libboost-timer-dev libsimde-dev \ libassimp-dev libopenal-dev libdevil-dev libfreetype-dev libbullet-dev \ libglew-dev libglu1-mesa-dev libgl1-mesa-dev \ libavcodec-dev libavformat-dev libavutil-dev libswscale-dev \ From 3647bc682edb2ef6e0f99088182285b8899b662f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Be=C3=9Fler?= Date: Thu, 1 Jan 2026 14:32:20 +0100 Subject: [PATCH 12/15] Refactored MacOS CI into its own workflow --- .github/workflows/{main.yml => ci-linux.yml} | 21 +------------- .github/workflows/ci-macos.yml | 30 ++++++++++++++++++++ README.md | 3 +- 3 files changed, 33 insertions(+), 21 deletions(-) rename .github/workflows/{main.yml => ci-linux.yml} (88%) create mode 100644 .github/workflows/ci-macos.yml diff --git a/.github/workflows/main.yml b/.github/workflows/ci-linux.yml similarity index 88% rename from .github/workflows/main.yml rename to .github/workflows/ci-linux.yml index 09cd378dc9..8b57a63eb7 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/ci-linux.yml @@ -1,5 +1,4 @@ - -name: CI +name: CI Linux on: push: branches: [ dev ] @@ -63,24 +62,6 @@ jobs: name: test results path: ./gtest-regen.xml ############################## - build-macos: - runs-on: macos-latest - steps: - - uses: actions/checkout@v3 - - name: Install dependencies - run: brew install cmake gcc simde boost assimp openal-soft freetype bullet glew qt5 doxygen graphviz devil ffmpeg alut nlohmann-json googletest - - name: Build REGEN workspace - run: | - mkdir build - cd build - cmake \ - -DCMAKE_C_COMPILER=/opt/homebrew/bin/gcc-14 -DCMAKE_CXX_COMPILER=/opt/homebrew/bin/g++-14 \ - -DCMAKE_PREFIX_PATH="/opt/homebrew/opt/qt@5" \ - -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=ON \ - -DREGEN_EXTRA_INCLUDE_DIRS="/opt/homebrew/opt/openal-soft/include/" \ - ../ - make - ############################## debian-package: needs: build-linux runs-on: ubuntu-24.04 diff --git a/.github/workflows/ci-macos.yml b/.github/workflows/ci-macos.yml new file mode 100644 index 0000000000..b719cc780d --- /dev/null +++ b/.github/workflows/ci-macos.yml @@ -0,0 +1,30 @@ +name: CI MacOS +on: + push: + branches: [ dev ] + pull_request: + branches: [ dev ] + release: + types: [published] + +jobs: + build-macos: + runs-on: macos-latest + steps: + - uses: actions/checkout@v3 + - name: Install dependencies + run: | + brew update \ + brew install cmake gcc simde boost assimp openal-soft freetype bullet glew qt5 doxygen graphviz devil ffmpeg alut nlohmann-json googletest + - name: Build REGEN workspace + run: | + mkdir build + cd build + cmake \ + -DCMAKE_C_COMPILER=/opt/homebrew/bin/gcc-14 -DCMAKE_CXX_COMPILER=/opt/homebrew/bin/g++-14 \ + -DCMAKE_PREFIX_PATH="/opt/homebrew/opt/qt@5" \ + -DCMAKE_BUILD_TYPE=Release \ + -DBUILD_TESTS=ON \ + -DREGEN_EXTRA_INCLUDE_DIRS="/opt/homebrew/opt/openal-soft/include/" \ + ../ + make diff --git a/README.md b/README.md index 57a0ebe785..6ba1d8645b 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,8 @@

-![CI](https://github.com/daniel86/regen/workflows/CI/badge.svg) +![Linux](https://github.com/daniel86/regen/actions/workflows/ci-linux.yml/badge.svg) +![MacOS](https://github.com/daniel86/regen/actions/workflows/ci-macos.yml/badge.svg) ![Warnings](https://img.shields.io/badge/compiler%20warnings-clean-brightgreen) [![Docs](https://img.shields.io/badge/docs-online-blue)](https://daniel86.github.io/regen/) ![Debian](https://img.shields.io/badge/debian-.deb%20package-blue) From bac3005ed45f7f538f1a6e3bf1f6f5e2b29bc2a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Be=C3=9Fler?= Date: Thu, 1 Jan 2026 14:33:08 +0100 Subject: [PATCH 13/15] Refactored MacOS CI into its own workflow --- .github/workflows/ci-macos.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/ci-macos.yml b/.github/workflows/ci-macos.yml index b719cc780d..409bc2394c 100644 --- a/.github/workflows/ci-macos.yml +++ b/.github/workflows/ci-macos.yml @@ -14,7 +14,6 @@ jobs: - uses: actions/checkout@v3 - name: Install dependencies run: | - brew update \ brew install cmake gcc simde boost assimp openal-soft freetype bullet glew qt5 doxygen graphviz devil ffmpeg alut nlohmann-json googletest - name: Build REGEN workspace run: | From 3fd2e96a63f6ad641836079b603e43728fe3f795 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Be=C3=9Fler?= Date: Thu, 1 Jan 2026 14:41:35 +0100 Subject: [PATCH 14/15] Added set-linux action to avoid some redundancy in workflow --- .github/actions/setup-linux/action.yml | 19 ++++++++++++++ .github/workflows/ci-linux.yml | 36 +++----------------------- 2 files changed, 22 insertions(+), 33 deletions(-) create mode 100644 .github/actions/setup-linux/action.yml diff --git a/.github/actions/setup-linux/action.yml b/.github/actions/setup-linux/action.yml new file mode 100644 index 0000000000..b443e3a210 --- /dev/null +++ b/.github/actions/setup-linux/action.yml @@ -0,0 +1,19 @@ +name: Setup Linux build environment +description: Install dependencies +runs: + using: "composite" + steps: + - shell: bash + run: | + set -euo pipefail + sudo apt-get update -y -qq + sudo apt-get install libgl-dev \ + libboost-thread-dev libboost-system-dev libboost-date-time-dev libboost-filesystem-dev \ + libboost-regex-dev libboost-timer-dev libsimde-dev \ + libassimp-dev libopenal-dev libdevil-dev libfreetype-dev libbullet-dev \ + libglew-dev libglu1-mesa-dev libgl1-mesa-dev \ + libavcodec-dev libavformat-dev libavutil-dev libswscale-dev \ + libpng-dev libalut-dev \ + qtbase5-dev libgtest-dev \ + doxygen graphviz \ + nlohmann-json3-dev diff --git a/.github/workflows/ci-linux.yml b/.github/workflows/ci-linux.yml index 8b57a63eb7..785792a2b5 100644 --- a/.github/workflows/ci-linux.yml +++ b/.github/workflows/ci-linux.yml @@ -12,21 +12,11 @@ jobs: runs-on: ubuntu-24.04 steps: - uses: actions/checkout@v3 + - uses: ./.github/actions/setup-linux - name: Build REGEN workspace shell: bash run: | set -euo pipefail - sudo apt-get update -y -qq - sudo apt-get install libgl-dev \ - libboost-thread-dev libboost-system-dev libboost-date-time-dev libboost-filesystem-dev \ - libboost-regex-dev libboost-timer-dev libsimde-dev \ - libassimp-dev libopenal-dev libdevil-dev libfreetype-dev libbullet-dev \ - libglew-dev libglu1-mesa-dev libgl1-mesa-dev \ - libavcodec-dev libavformat-dev libavutil-dev libswscale-dev \ - libpng-dev libalut-dev \ - qtbase5-dev libgtest-dev \ - doxygen graphviz \ - nlohmann-json3-dev mkdir build cd build cmake ../ \ @@ -67,22 +57,12 @@ jobs: runs-on: ubuntu-24.04 steps: - uses: actions/checkout@v3 + - uses: ./.github/actions/setup-linux - name: Create debian package if: ${{github.event_name == 'push' || github.event_name == 'release'}} shell: bash run: | set -euo pipefail - sudo apt-get update -y -qq - sudo apt-get install libgl-dev \ - libboost-thread-dev libboost-system-dev libboost-date-time-dev libboost-filesystem-dev \ - libboost-regex-dev libboost-timer-dev libsimde-dev \ - libassimp-dev libopenal-dev libdevil-dev libfreetype-dev libbullet-dev \ - libglew-dev libglu1-mesa-dev libgl1-mesa-dev \ - libavcodec-dev libavformat-dev libavutil-dev libswscale-dev \ - libpng-dev libalut-dev \ - qtbase5-dev libgtest-dev \ - doxygen graphviz \ - nlohmann-json3-dev mkdir build cd build cmake ../ -DCMAKE_BUILD_TYPE=Release @@ -106,22 +86,12 @@ jobs: runs-on: ubuntu-24.04 steps: - uses: actions/checkout@v3 + - uses: ./.github/actions/setup-linux - name: Run doxygen if: ${{github.event_name == 'push' || github.event_name == 'release'}} shell: bash run: | set -euo pipefail - sudo apt-get update -y -qq - sudo apt-get install libgl-dev \ - libboost-thread-dev libboost-system-dev libboost-date-time-dev libboost-filesystem-dev \ - libboost-regex-dev libboost-timer-dev libsimde-dev \ - libassimp-dev libopenal-dev libdevil-dev libfreetype-dev libbullet-dev \ - libglew-dev libglu1-mesa-dev libgl1-mesa-dev \ - libavcodec-dev libavformat-dev libavutil-dev libswscale-dev \ - libpng-dev libalut-dev \ - qtbase5-dev libgtest-dev \ - doxygen graphviz \ - nlohmann-json3-dev mkdir build cd build cmake ../ -DCMAKE_BUILD_TYPE=Release From 167ab5e8678c31ee4eb1378c2c36d1545b118a0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Be=C3=9Fler?= Date: Thu, 1 Jan 2026 15:05:00 +0100 Subject: [PATCH 15/15] Some fixes --- CMakeLists.txt | 8 +++++--- regen/compute/radix-sort-cpu.h | 4 ++-- regen/compute/simd.h | 6 +++--- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ad4f679dbf..281b34302c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -75,9 +75,11 @@ endif() # given in the books "Effective C++" and "More Effective C++" # add_definitions( -Weffc++ ) -# -march=native enables all instruction subsets supported by the local machine -# e.g. SSE2, SSE3, SSE4, AVX, AVX2, etc. -add_definitions( -march=native ) +if(UNIX) + # -march=native enables all instruction subsets supported by the local machine + # e.g. SSE2, SSE3, SSE4, AVX, AVX2, etc. + add_definitions( -march=native ) +endif() # perform more aggressive floating-point optimizations # add_definitions( -ffast-math ) diff --git a/regen/compute/radix-sort-cpu.h b/regen/compute/radix-sort-cpu.h index 3ea2b0366f..6493311f7b 100644 --- a/regen/compute/radix-sort-cpu.h +++ b/regen/compute/radix-sort-cpu.h @@ -140,14 +140,14 @@ namespace regen { // Gather 8 keys manually, and promote to 32-bit for (int k = 0; k < 8; ++k) tmpKeys32[k] = static_cast(keys[src[keyIdx+k]]); r0 = simde_mm256_load_si256(reinterpret_cast(tmpKeys32)); - r0 = simde_mm256_and_si256(_mm256_srli_epi32(r0, SHIFT), mask); + r0 = simde_mm256_and_si256(simde_mm256_srli_epi32(r0, SHIFT), mask); keyIdx += 8; // processed 8 keys, not 16! } else if constexpr (KEY_TYPE_BITS == 32) { simd::Register_i idx = simde_mm256_loadu_si256(reinterpret_cast(&src[keyIdx])); // Gather 8 scattered keys, and apply shift and mask to get bucket ids r0 = simde_mm256_i32gather_epi32(reinterpret_cast(keys), idx, 4); - r0 = simde_mm256_and_si256(_mm256_srli_epi32(r0, SHIFT), mask); + r0 = simde_mm256_and_si256(simde_mm256_srli_epi32(r0, SHIFT), mask); keyIdx += KEYS_PER_SIMD_PASS; } else if constexpr (KEY_TYPE_BITS == 64) { diff --git a/regen/compute/simd.h b/regen/compute/simd.h index ef16cbac07..ed06be8761 100644 --- a/regen/compute/simd.h +++ b/regen/compute/simd.h @@ -206,8 +206,8 @@ namespace regen::simd { #elif REGEN_SIMD_MODE == REGEN_SIMD_SSE static constexpr int8_t RegisterMask = 0x0F; // 4 bits for SSE - using Register = __m128; // 4 floats - using Register_i = __m128i; // 4 integers + using Register = simde__m128; // 4 floats + using Register_i = simde__m128i; // 4 integers inline Register set1_ps(float v) { return simde_mm_set1_ps(v); } inline Register_i set1_epi32(int32_t v) { return simde_mm_set1_epi32(v); } @@ -219,7 +219,7 @@ namespace regen::simd { inline Register loadu_ps(const float *p) { return simde_mm_loadu_ps(p); } inline Register_i loadu_si256(const uint32_t *p) { - return simde_mm_loadu_si128(reinterpret_cast(indices)); + return simde_mm_loadu_si128(reinterpret_cast(p)); } inline Register epi_to_ps(const Register_i &v) { return simde_mm_castsi128_ps(v); }