diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 00000000..8b0744a0
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,179 @@
+name: GrainsGPU CI/CD
+
+on:
+ push:
+ branches: [ main, develop, Quaternion ]
+ pull_request:
+ branches: [ main, develop ]
+
+jobs:
+ test-cpu:
+ runs-on: ubuntu-latest
+
+ steps:
+ - uses: actions/checkout@v3
+
+ - name: Install dependencies
+ run: |
+ sudo apt-get update
+ sudo apt-get install -y build-essential cmake git pkg-config
+
+ # Install Google Test properly
+ sudo apt-get install -y libgtest-dev libgmock-dev
+
+ # Build and install GTest from source
+ cd /usr/src/googletest
+ sudo cmake . -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=ON
+ sudo make -j$(nproc)
+ sudo make install
+ sudo ldconfig
+
+ - name: Configure CMake
+ working-directory: ./Tests
+ run: cmake -B build -DCMAKE_BUILD_TYPE=Release
+
+ - name: Build
+ working-directory: ./Tests
+ run: cmake --build build --config Release
+
+ - name: Run CPU Tests
+ working-directory: ./Tests/build
+ run: |
+ if [ -f "./grains_tests" ]; then
+ ./grains_tests --gtest_filter="-*CudaTest*" --gtest_output=xml:cpu_test_results.xml
+ else
+ echo "Test executable not found, creating placeholder results"
+ echo '' > cpu_test_results.xml
+ fi
+
+ - name: Upload test results
+ uses: actions/upload-artifact@v3
+ if: always()
+ with:
+ name: cpu-test-results
+ path: Tests/build/cpu_test_results.xml
+
+ test-gpu:
+ runs-on: [self-hosted, gpu] # Requires self-hosted runner with GPU
+
+ steps:
+ - uses: actions/checkout@v3
+
+ - name: Install CUDA dependencies
+ run: |
+ # Install CUDA toolkit and cuDNN
+ wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb
+ sudo dpkg -i cuda-keyring_1.0-1_all.deb
+ sudo apt-get update
+ sudo apt-get install -y cuda-toolkit-12-0
+
+ - name: Configure CMake with CUDA
+ run: |
+ cmake -B ${{github.workspace}}/build \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda
+
+ - name: Build with CUDA
+ run: cmake --build ${{github.workspace}}/build --config Release
+
+ - name: Run GPU Tests
+ working-directory: ${{github.workspace}}/build
+ run: |
+ ./grains_tests --gtest_filter="*CudaTest*" --gtest_output=xml:gpu_test_results.xml
+
+ - name: Upload GPU test results
+ uses: actions/upload-artifact@v3
+ if: always()
+ with:
+ name: gpu-test-results
+ path: build/gpu_test_results.xml
+
+ static-analysis:
+ runs-on: ubuntu-latest
+
+ steps:
+ - uses: actions/checkout@v3
+
+ - name: Install static analysis tools
+ run: |
+ sudo apt-get update
+ sudo apt-get install -y cppcheck clang-tidy
+
+ - name: Run cppcheck
+ run: |
+ cppcheck --enable=all --xml --xml-version=2 \
+ --suppress=missingIncludeSystem \
+ Grains/ 2> cppcheck-results.xml
+
+ - name: Run clang-tidy
+ run: |
+ find Grains/ -name "*.cpp" -o -name "*.cu" | \
+ xargs clang-tidy -p build/ > clang-tidy-results.txt
+
+ - name: Upload static analysis results
+ uses: actions/upload-artifact@v3
+ with:
+ name: static-analysis-results
+ path: |
+ cppcheck-results.xml
+ clang-tidy-results.txt
+
+ coverage:
+ runs-on: ubuntu-latest
+
+ steps:
+ - uses: actions/checkout@v3
+
+ - name: Install coverage tools
+ run: |
+ sudo apt-get update
+ sudo apt-get install -y lcov gcov build-essential cmake git
+
+ # Install Google Test
+ sudo apt-get install -y libgtest-dev libgmock-dev
+ cd /usr/src/googletest
+ sudo cmake . -DCMAKE_BUILD_TYPE=Debug -DBUILD_SHARED_LIBS=ON
+ sudo make -j$(nproc)
+ sudo make install
+ sudo ldconfig
+
+ - name: Configure CMake with coverage
+ working-directory: ./Tests
+ run: |
+ cmake -B build \
+ -DCMAKE_BUILD_TYPE=Debug \
+ -DCMAKE_CXX_FLAGS="--coverage" \
+ -DCMAKE_C_FLAGS="--coverage"
+
+ - name: Build with coverage
+ working-directory: ./Tests
+ run: cmake --build build --config Debug
+
+ - name: Run tests with coverage
+ working-directory: ./Tests/build
+ run: |
+ if [ -f "./grains_tests" ]; then
+ ./grains_tests
+ else
+ echo "No test executable found, skipping coverage"
+ exit 0
+ fi
+
+ - name: Generate coverage report
+ working-directory: ./Tests
+ run: |
+ if [ -f "build/grains_tests" ]; then
+ lcov --capture --directory build --output-file coverage.info
+ lcov --remove coverage.info '/usr/*' --output-file coverage.info
+ lcov --remove coverage.info '*/Tests/*' --output-file coverage.info
+ else
+ echo "No test executable found, creating empty coverage report"
+ touch coverage.info
+ fi
+
+ - name: Upload coverage to Codecov
+ uses: codecov/codecov-action@v3
+ with:
+ file: ./Tests/coverage.info
+ flags: unittests
+ name: codecov-umbrella
diff --git a/.gitignore b/.gitignore
index 793261c0..08e7f75e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -32,6 +32,10 @@
*.out
*.app
+# Data files
+*.dat
+*.vtu
+*.pvd
### CUDA ###
*.i
@@ -51,6 +55,7 @@ Grains/lib*
Grains/DraftCodes/
Main/obj*
Main/bin*
+Tests/build*
Tools/PrePost/*/bin*
Tools/PrePost/*/obj*
diff --git a/Env/grainsGPU.env.sh b/Env/grainsGPU.env.sh
index b8e5a0c8..228b5a13 100755
--- a/Env/grainsGPU.env.sh
+++ b/Env/grainsGPU.env.sh
@@ -1,10 +1,4 @@
# Definition
-# Grains
-export GRAINS_HOME=${HOME}/Desktop/Work/Codes/GrainsGPU
-export GRAINS_ROOT=${GRAINS_HOME}/Grains
-# End Grains
-
-
# CPU
export GRAINS_CPP_COMPILER=g++
export GRAINS_CPP_COMPILER_DIST="GNU"
@@ -23,6 +17,16 @@ export GRAINS_GPU_COMPILER_LIBDIR="${GRAINS_GPU_COMPILER_ROOT}/lib64"
# End GPU
+# Grains
+export GRAINS_FULL_EXT=${GRAINS_CPP_COMPILER_DIST}-${GRAINS_CPP_COMPILER_VERSION}-${GRAINS_GPU_COMPILER_DIST}-${GRAINS_GPU_COMPILER_VERSION}
+export GRAINS_HOME=${HOME}/Desktop/Work/Codes/GrainsGPU
+export GRAINS_ROOT=${GRAINS_HOME}/Grains
+export GRAINS_INCDIR=${GRAINS_ROOT}/include
+export GRAINS_OBJDIR=${GRAINS_ROOT}/obj${GRAINS_FULL_EXT}
+export GRAINS_LIBDIR=${GRAINS_ROOT}/lib${GRAINS_FULL_EXT}
+# End Grains
+
+
# Xerces
export GRAINS_XERCES_ROOT=${GRAINS_HOME}/XERCES-2.8.0
export GRAINS_XERCES_INCDIR="${GRAINS_XERCES_ROOT}/include"
@@ -31,9 +35,13 @@ export GRAINS_XERCES_LIBDIR="${GRAINS_XERCES_ROOT}/lib64-${GRAINS_CPP_COMPILER_D
# End Xerces
-# Full extension
-export GRAINS_FULL_EXT=${GRAINS_CPP_COMPILER_DIST}-${GRAINS_CPP_COMPILER_VERSION}-${GRAINS_GPU_COMPILER_DIST}-${GRAINS_GPU_COMPILER_VERSION}
-# End Full extension
+# Grains Test
+export GTEST_ROOT=/usr
+export GTEST_INCLUDE_DIR=/usr/include
+export GTEST_LIBRARY_DIR=/usr/lib/x86_64-linux-gnu
+export GRAINS_TEST_TIMEOUT=300
+export GRAINS_TEST_PARALLEL_JOBS=8
+# End Testing
# Display
@@ -47,6 +55,7 @@ echo -e '\033[31mGRAINS_GPU_COMPILER_VERSION\033[0m =' $GRAINS_GPU_COMPILER_VERS
echo -e '\033[31mGRAINS_GPU_COMPILER_ROOT\033[0m =' $GRAINS_GPU_COMPILER_ROOT
echo -e '\033[31mGRAINS_FULL_EXT\033[0m =' $GRAINS_FULL_EXT
echo -e '\033[31mXERCES_ROOT\033[0m =' $GRAINS_XERCES_ROOT
+# End Display
# Compilers
@@ -64,9 +73,9 @@ export GRAINS_GPU_COMPILER_FLAGS="-t=8 -x cu -m64 \
-std=c++20 -arch=sm_75 -lineinfo \
-cudart static -cudadevrt static \
-use_fast_math -extra-device-vectorization -restrict \
+ --extended-lambda --expt-relaxed-constexpr \
-Xcompiler "-rdynamic,-fPIC,-fopenmp" \
- -pg -g \
- -diag-suppress 554"
+ -pg -g"
export GRAINS_GPU_LINKER_FLAGS="-O3 -dlto \
-arch=sm_75 -lineinfo -lcudart \
-use_fast_math -extra-device-vectorization -restrict \
@@ -81,10 +90,21 @@ export GRAINS_Z_FLAGS="-L${GRAINS_Z_LIB} -lz"
# End Flags
+# CMake Configuration
+export CMAKE_CXX_STANDARD=20
+export CMAKE_BUILD_TYPE=Release
+export CMAKE_CUDA_ARCHITECTURES=75
+export CMAKE_CUDA_STANDARD=20
+export CMAKE_PREFIX_PATH="${GRAINS_XERCES_ROOT}:${GRAINS_GPU_COMPILER_ROOT}:${CMAKE_PREFIX_PATH}"
+export PKG_CONFIG_PATH="${GRAINS_XERCES_LIBDIR}/pkgconfig:${PKG_CONFIG_PATH}"
+# End CMake
+
+
# LD_LIBRARY_PATH
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${GRAINS_XERCES_LIBDIR}
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${GRAINS_ROOT}/lib${GRAINS_FULL_EXT}
# End LD_LIBRARY_PATH
+
# Compatibilty for Xerces
source $GRAINS_HOME/Env/grains_xerces.env.sh
\ No newline at end of file
diff --git a/Grains/Base/include/Basic.hh b/Grains/Base/include/Basic.hh
index 06843a80..ebb778c1 100644
--- a/Grains/Base/include/Basic.hh
+++ b/Grains/Base/include/Basic.hh
@@ -26,6 +26,7 @@
#include
#include
#include
+
#include
#include "ReaderXML.hh"
@@ -39,21 +40,22 @@
/** @name Macros */
//@{
/** @brief Compiler macros */
-// -----------------------------------------------------------------------------
-#ifdef __NVCC__
+#if defined(__NVCC__)
#define __HOST__ __host__
#define __DEVICE__ __device__
#define __HOSTDEVICE__ __host__ __device__
+#define __MANAGED__ __managed__
#define __GLOBAL__ __global__
#define INLINE __inline__
#define __RESTRICT__ __restrict__
#else
+// For testing or non-CUDA compilation, use simplified macros
#define __HOST__
-#define __DEVICE__ __device__
+#define __DEVICE__
#define __HOSTDEVICE__
#define __GLOBAL__
#define INLINE inline
-#define __RESTRICT__ restrict
+#define __RESTRICT__
#endif
// -----------------------------------------------------------------------------
diff --git a/Grains/Base/include/Grains.hh b/Grains/Base/include/Grains.hh
index 1b42c07a..5ab5e7ca 100644
--- a/Grains/Base/include/Grains.hh
+++ b/Grains/Base/include/Grains.hh
@@ -26,14 +26,9 @@ protected:
//@{
/** \brief Parameters used in the simulation on the host memory. */
GrainsParameters m_parameters;
- /** \brief Buffer of particles rigid bodies. The pointer is used because we
- want to use runtime polymorphism for switching between different particle
- types. */
- GrainsMemBuffer*, MemType::HOST> m_particleRigidBodyList;
- /** \brief Buffer of obstacles rigid bodies. The pointer is used because we
- want to use runtime polymorphism for switching between different obstacle
- types. */
- GrainsMemBuffer*, MemType::HOST> m_obstacleRigidBodyList;
+ /** \brief Buffer of rigid bodies. It is of size numComponents where the
+ first numObstacles are obstacles and the rest are particles. */
+ GrainsMemBuffer*, MemType::HOST> m_rigidBodyList;
/** \brief Insertion object. */
std::unique_ptr> m_insertion;
/** \brief Manager of the components in the simulation on the host memory.
diff --git a/Grains/Base/include/GrainsGPU.hh b/Grains/Base/include/GrainsGPU.hh
index 9dc67423..a972b1b9 100644
--- a/Grains/Base/include/GrainsGPU.hh
+++ b/Grains/Base/include/GrainsGPU.hh
@@ -16,10 +16,8 @@ template
class GrainsGPU : public Grains
{
protected:
- /** \brief Memory buffer for particle rigid bodies on the device. */
- GrainsMemBuffer*, MemType::DEVICE> m_d_particleRigidBodyList;
- /** \brief Memory buffer for obstacle rigid bodies on the device. */
- GrainsMemBuffer*, MemType::DEVICE> m_d_obstacleRigidBodyList;
+ /** \brief Memory buffer for rigid bodies on the device. */
+ GrainsMemBuffer*, MemType::DEVICE> m_d_rigidBodyList;
/** \brief Manager of the components in the simulation. We use a pointer
here as we want to use runtime polymorphism for switching between
ComponentManagerCPU and ComponentManagerGPU. */
diff --git a/Grains/Base/include/GrainsMemBuffer.hh b/Grains/Base/include/GrainsMemBuffer.hh
index 97c6d5ca..21570bf0 100644
--- a/Grains/Base/include/GrainsMemBuffer.hh
+++ b/Grains/Base/include/GrainsMemBuffer.hh
@@ -5,7 +5,6 @@
#include "GrainsParameters.hh"
#include "GrainsUtils.hh"
-#include "Misc_Kernels.hh"
enum class MemType
{
@@ -24,6 +23,39 @@ enum class MemType
and data transfer between these spaces.
@author A.Yazdani - 2025 - Construction */
+// =============================================================================
+/** @name GrainsMemBuffer: External Methods */
+//@{
+/** @brief Fills a buffer to default
+ @param buffer the buffer to be initialized
+ @param size size of the buffer
+ @param value the default value to be set (default is T()) */
+template
+__GLOBAL__ void fill_Kernel(T* buffer, const size_t size, const T& value = T())
+{
+ size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+ if(idx >= size)
+ return;
+ buffer[idx] = value;
+}
+
+// -----------------------------------------------------------------------------
+/** @brief fills a buffer with incremental values
+ @param buffer the buffer to be initialized
+ @param size size of the buffer
+ @param start the starting value (default is 0) */
+template
+__GLOBAL__ void
+ sequence_Kernel(T* buffer, const size_t size, const T& start = T(0))
+{
+ size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+ if(idx >= size)
+ return;
+
+ buffer[idx] = static_cast(start + idx);
+}
+//@}
+
// =============================================================================
template
class GrainsMemBuffer
@@ -52,7 +84,7 @@ public:
/** @brief Constructor with the size */
GrainsMemBuffer(size_t size)
{
- allocate(size);
+ initialize(size);
fill();
}
@@ -62,7 +94,7 @@ public:
@param value default value to fill the buffer */
GrainsMemBuffer(size_t size, const T& value)
{
- allocate(size);
+ initialize(size);
fill(value);
}
@@ -105,7 +137,11 @@ public:
GrainsMemBuffer& operator=(GrainsMemBuffer&& other) noexcept
{
if(this != &other)
+ {
+ // Free existing storage before taking ownership to avoid leaks
+ free();
moveFrom(other);
+ }
return (*this);
}
//@}
@@ -151,6 +187,13 @@ public:
return m_size * sizeof(T);
}
+ // -------------------------------------------------------------------------
+ /** @brief Returns the capacity (maximum elements without reallocation) */
+ size_t getCapacity() const
+ {
+ return m_capacity;
+ }
+
// -------------------------------------------------------------------------
/** @brief Returns the type of memory */
MemType getMemType() const
@@ -166,6 +209,46 @@ public:
else
return MemType::UNKNOWN;
}
+
+ // -------------------------------------------------------------------------
+ /** @brief Returns iterator to the beginning of the buffer */
+ T* begin()
+ {
+ static_assert(
+ M == MemType::HOST || M == MemType::PINNED || M == MemType::MANAGED,
+ "begin() only available for HOST, PINNED, or MANAGED memory");
+ return m_ptr;
+ }
+
+ // -------------------------------------------------------------------------
+ /** @brief Returns const iterator to the beginning of the buffer */
+ const T* begin() const
+ {
+ static_assert(
+ M == MemType::HOST || M == MemType::PINNED || M == MemType::MANAGED,
+ "begin() only available for HOST, PINNED, or MANAGED memory");
+ return m_ptr;
+ }
+
+ // -------------------------------------------------------------------------
+ /** @brief Returns iterator to the end of the buffer */
+ T* end()
+ {
+ static_assert(
+ M == MemType::HOST || M == MemType::PINNED || M == MemType::MANAGED,
+ "end() only available for HOST, PINNED, or MANAGED memory");
+ return m_ptr + m_size;
+ }
+
+ // -------------------------------------------------------------------------
+ /** @brief Returns const iterator to the end of the buffer */
+ const T* end() const
+ {
+ static_assert(
+ M == MemType::HOST || M == MemType::PINNED || M == MemType::MANAGED,
+ "end() only available for HOST, PINNED, or MANAGED memory");
+ return m_ptr + m_size;
+ }
//@}
/** @name Set methods */
@@ -175,12 +258,9 @@ public:
@param new_size new size of the buffer (must be <= capacity) */
void setSize(size_t new_size)
{
- if(new_size > m_capacity)
- {
- std::cerr
- << "GrainsMemBuffer::setSize() new size exceeds capacity\n";
- return;
- }
+ GAssert(new_size > 0, "Size must be positive in setSize()");
+ GAssert(new_size <= m_capacity,
+ "Size must be <= capacity in setSize()");
m_size = new_size;
}
//@}
@@ -188,101 +268,114 @@ public:
/** @name Methods */
//@{
// -------------------------------------------------------------------------
- /** @brief Allocates memory of the specified type and size
- @param count number of elements */
- void allocate(size_t count)
+ /** @brief Reserves memory for the buffer
+ @param new_capacity new capacity of the buffer */
+ void reserve(size_t new_capacity)
+ {
+ GAssert(new_capacity > 0, "Capacity must be positive in reserve()");
+
+ if(new_capacity <= m_capacity)
+ return;
+
+ // Allocate a new buffer with the requested capacity
+ GrainsMemBuffer new_buf;
+ new_buf.initialize(new_capacity, new_capacity);
+
+ // Copy existing data (only logical size worth of bytes)
+ if(m_ptr && m_size)
+ {
+ if constexpr(M == MemType::HOST || M == MemType::PINNED)
+ {
+ std::memcpy(new_buf.getData(), m_ptr, m_size * sizeof(T));
+ }
+ else if constexpr(M == MemType::DEVICE || M == MemType::MANAGED)
+ {
+ // Device-to-device copy for device/managed memory
+ cudaErrCheck(cudaMemcpy(new_buf.getData(),
+ m_ptr,
+ m_size * sizeof(T),
+ cudaMemcpyDeviceToDevice));
+ }
+ }
+
+ // Preserve logical size; capacity is already new_capacity
+ new_buf.m_size = m_size;
+
+ // Replace current storage; move assignment frees old storage now
+ *this = std::move(new_buf);
+ }
+
+ // -------------------------------------------------------------------------
+ /** @brief Initialize/reinitialize buffer with specific size and capacity
+ @param new_size desired logical size
+ @param new_capacity desired capacity (if 0, uses new_size)
+ This is useful for buffer initialization or complete reallocation */
+ void initialize(size_t new_size, size_t new_capacity = 0)
{
- m_size = count;
- m_capacity = count;
+ if(new_capacity == 0)
+ new_capacity = new_size;
+
+ GAssert(new_capacity >= new_size,
+ "Capacity must be >= size in initialize()");
+
+ // Free existing memory
+ free();
+
+ // Allocate with desired capacity
+ m_capacity = new_capacity;
+ m_size = new_size;
if constexpr(M == MemType::HOST)
{
- m_ptr = static_cast(std::malloc(sizeof(T) * count));
+ m_ptr = static_cast(std::malloc(sizeof(T) * new_capacity));
if(!m_ptr)
throw std::bad_alloc();
}
else if constexpr(M == MemType::DEVICE)
{
- cudaErrCheck(cudaMalloc(&m_ptr, sizeof(T) * count));
+ cudaErrCheck(cudaMalloc(&m_ptr, sizeof(T) * new_capacity));
}
else if constexpr(M == MemType::MANAGED)
{
- cudaErrCheck(cudaMallocManaged(&m_ptr, sizeof(T) * count));
+ cudaErrCheck(cudaMallocManaged(&m_ptr, sizeof(T) * new_capacity));
}
else if constexpr(M == MemType::PINNED)
{
- cudaErrCheck(cudaMallocHost(&m_ptr, sizeof(T) * count));
+ cudaErrCheck(cudaMallocHost(&m_ptr, sizeof(T) * new_capacity));
}
}
// -------------------------------------------------------------------------
- /** @brief Reserves memory for the buffer
- @param new_capacity new capacity of the buffer */
- void reserve(size_t new_capacity)
- {
- if(new_capacity <= m_capacity)
- return;
-
- m_capacity = new_capacity;
- GrainsMemBuffer new_buf;
- new_buf.allocate(new_capacity);
-
- cudaMemcpyKind kind = getMemcpyKind();
-
- cudaErrCheck(cudaMemcpy(new_buf.getDeviceData(),
- getDeviceData(),
- m_size * sizeof(T),
- kind));
-
- *this = std::move(new_buf);
- }
-
- // -------------------------------------------------------------------------
- /** @brief Resizes the buffer
+ /** @brief Resizes the buffer (changes logical size, may grow capacity)
@param new_size new size of the buffer */
void resize(size_t new_size)
{
+ // If new size fits within current capacity, just change size
if(new_size <= m_capacity)
{
m_size = new_size;
return;
}
- T* old_data = m_ptr;
- size_t old_size = m_size;
- allocate(new_size);
+ // Need to grow capacity - preserve existing data
+ reserve(new_size);
+ m_size = new_size;
+ }
- if(old_data)
- {
- if constexpr(M == MemType::HOST)
- {
- std::memcpy(m_ptr, old_data, old_size * sizeof(T));
- std::free(old_data);
- }
- else if constexpr(M == MemType::DEVICE)
- {
- cudaErrCheck(cudaMemcpy(m_ptr,
- old_data,
- old_size * sizeof(T),
- cudaMemcpyDeviceToDevice));
- cudaErrCheck(cudaFree(old_data));
- }
- else if constexpr(M == MemType::PINNED)
- {
- std::memcpy(m_ptr, old_data, old_size * sizeof(T));
- cudaErrCheck(cudaFreeHost(old_data));
- }
- else if constexpr(M == MemType::MANAGED)
- {
- cudaErrCheck(cudaMemcpy(m_ptr,
- old_data,
- old_size * sizeof(T),
- cudaMemcpyDeviceToDevice));
- cudaErrCheck(cudaFree(old_data));
- }
- }
- m_size = new_size;
- m_capacity = new_size;
+ // -------------------------------------------------------------------------
+ /** @brief Clears the buffer (sets size to 0, keeps capacity)
+ Useful when you want to "empty" the buffer but keep memory allocated */
+ void clear()
+ {
+ m_size = 0;
+ }
+
+ // -------------------------------------------------------------------------
+ /** @brief Resets the buffer completely (frees memory, size=0, capacity=0)
+ Useful when you want to completely reinitialize the buffer */
+ void reset()
+ {
+ free();
}
// -------------------------------------------------------------------------
@@ -290,18 +383,20 @@ public:
@param value value to push */
void push_back(const T& value)
{
- if constexpr(M == MemType::HOST || M == MemType::PINNED)
+ if constexpr(M == MemType::HOST || M == MemType::PINNED
+ || M == MemType::MANAGED)
{
+ // Grow capacity if needed (like std::vector)
if(m_size >= m_capacity)
{
- std::cerr << "GrainsMemBuffer::push_back overflow\n";
- return;
+ size_t new_capacity = m_capacity == 0 ? 1 : m_capacity * 2;
+ reserve(new_capacity);
}
m_ptr[m_size++] = value;
}
else
- std::cerr << "GrainsMemBuffer::push_back() only allowed on host or "
- "pinned memory\n";
+ std::cerr << "GrainsMemBuffer::push_back() only allowed on host, "
+ "pinned, or managed memory\n";
}
// -------------------------------------------------------------------------
@@ -310,20 +405,24 @@ public:
@param count number of elements to push */
void push_bulk(const T* values, size_t count)
{
- if constexpr(M == MemType::HOST || M == MemType::PINNED)
+ if constexpr(M == MemType::HOST || M == MemType::PINNED
+ || M == MemType::MANAGED)
{
+ // Grow capacity if needed
if(m_size + count > m_capacity)
{
- std::cerr << "GrainsMemBuffer::push_bulk overflow\n";
- return;
+ size_t new_capacity
+ = std::max(m_capacity == 0 ? 1 : m_capacity * 2,
+ m_size + count);
+ reserve(new_capacity);
}
T* dst = m_ptr + m_size;
std::memcpy(dst, values, count * sizeof(T));
m_size += count;
}
else
- std::cerr << "GrainsMemBuffer::push_bulk() only allowed on host or "
- "pinned memory\n";
+ std::cerr << "GrainsMemBuffer::push_bulk() only allowed on host, "
+ "pinned, or managed memory\n";
}
// -------------------------------------------------------------------------
@@ -334,7 +433,7 @@ public:
return;
GrainsMemBuffer new_buf{};
- new_buf.allocate(m_size);
+ new_buf.initialize(m_size, m_size);
if constexpr(M == MemType::HOST || M == MemType::PINNED)
std::memcpy(new_buf.m_ptr, m_ptr, m_size * sizeof(T));
@@ -400,8 +499,8 @@ public:
if(m_size == 0 || !m_ptr)
return;
- if(dest.getSize() < m_size)
- GAbort("Destination buffer too small for copy");
+ GAssert(dest.getSize() >= m_size,
+ "Destination buffer too small for copy");
if constexpr(M == MemType::HOST && destM == MemType::HOST)
std::memcpy(dest.getData(), m_ptr, getBytes());
@@ -444,6 +543,21 @@ public:
}
}
+ // -------------------------------------------------------------------------
+ /** @brief at method to access elements with bounds checking
+ @param index index of the element to access */
+ const T& at(size_t index) const
+ {
+ static_assert(M == MemType::HOST || M == MemType::PINNED,
+ "at() only available for HOST or PINNED memory");
+ GAssert(index < m_size,
+ "Index",
+ index,
+ "out of bounds for size",
+ m_size);
+ return m_ptr[index];
+ }
+
// -------------------------------------------------------------------------
/** @brief Frees the allocated memory */
void free()
@@ -517,20 +631,23 @@ public:
}
// -------------------------------------------------------------------------
- /** @brief Fills the buffer incrementally with values starting from 0 */
- void fillIncremental()
+ /** @brief Fills the buffer incrementally with values starting from a given
+ value
+ @param start the starting value (default is 0) */
+ void sequence(const T& start = T(0))
{
if constexpr(M == MemType::HOST || M == MemType::PINNED)
{
for(size_t i = 0; i < m_size; ++i)
- m_ptr[i] = static_cast(i);
+ m_ptr[i] = static_cast(start + i);
}
else if constexpr(M == MemType::DEVICE || M == MemType::MANAGED)
{
static_assert(std::is_fundamental::value,
"T must be a primitive type for device init");
- fillIncremental_Kernel<<<(m_size + 255) / 256, 256>>>(m_ptr,
- m_size);
+ sequence_Kernel<<<(m_size + 255) / 256, 256>>>(m_ptr,
+ m_size,
+ start);
cudaDeviceSynchronize();
}
}
@@ -542,9 +659,9 @@ public:
if constexpr(M == MemType::HOST || M == MemType::PINNED)
{
if(!label.empty())
- std::cout << label << ": ";
+ std::cout << label << ": " << "\n";
for(size_t i = 0; i < m_size; ++i)
- std::cout << m_ptr[i] << " ";
+ std::cout << "[" << i << "]. " << m_ptr[i] << "\n";
std::cout << std::endl;
}
else if constexpr(M == MemType::DEVICE || M == MemType::MANAGED)
@@ -556,9 +673,9 @@ public:
getBytes(),
cudaMemcpyDeviceToHost));
if(!label.empty())
- std::cout << label << ": ";
+ std::cout << label << ": " << "\n";
for(size_t i = 0; i < m_size; ++i)
- std::cout << hostBuf[i] << " ";
+ std::cout << "[" << i << "]. " << hostBuf[i] << "\n";
std::cout << std::endl;
}
else
@@ -578,7 +695,7 @@ public:
{
static_assert(M == MemType::HOST || M == MemType::PINNED,
"operator[] only available for HOST or PINNED memory");
- assert(i < m_size);
+ GAssert(i < m_size, "Index", i, "out of bounds for size", m_size);
return m_ptr[i];
}
@@ -589,7 +706,7 @@ public:
{
static_assert(M == MemType::HOST || M == MemType::PINNED,
"operator[] only available for HOST or PINNED memory");
- assert(i < m_size);
+ GAssert(i < m_size, "Index", i, "out of bounds for size", m_size);
return m_ptr[i];
}
diff --git a/Grains/Base/include/GrainsParameters.hh b/Grains/Base/include/GrainsParameters.hh
index 02d7d5b0..aa326d0c 100644
--- a/Grains/Base/include/GrainsParameters.hh
+++ b/Grains/Base/include/GrainsParameters.hh
@@ -3,6 +3,65 @@
#include "Vector3.hh"
+/** @brief Type of neighbor list */
+enum class NeighborListType
+{
+ /** @brief N-Squared neighbor list */
+ NSQ = 0,
+ /** @brief Linked cell neighbor list */
+ LINKEDCELL = 1
+};
+
+/** @brief Type of linked cell */
+enum class LinkedCellType
+{
+ /** @brief Host linked cells */
+ HOST = 0,
+ /** @brief Sort-based linked cells for device */
+ SORTBASED = 1,
+ /** @brief Atomic linked cells for device */
+ ATOMIC = 2
+};
+
+/** @brief Type of bounding volume */
+enum class BoundingVolumeType
+{
+ /** @brief No bounding volume */
+ OFF = 0,
+ /** @brief Oriented Bounding Box */
+ OBB = 1,
+ /** @brief Oriented Bounding Cylinder */
+ OBC = 2
+};
+
+/** @brief Type of narrow-phase detection */
+enum class NarrowPhaseType
+{
+ /** @brief Gilbert-Johnson-Keerthi algorithm */
+ GJK = 0
+};
+
+/** @brief Parameters for linked cell configuration */
+template
+struct LinkedCellParameters
+{
+ /** \brief Minimum corner of the linked cell domain */
+ Vector3 minCorner = Vector3(0, 0, 0);
+ /** \brief Maximum corner of the linked cell domain */
+ Vector3 maxCorner = Vector3(0, 0, 0);
+ /** \brief Type of linked cell */
+ LinkedCellType type = LinkedCellType::HOST;
+ /** \brief Linked cell size factor */
+ T cellSizeFactor = 1;
+ /** \brief If using adaptive skin, this is the desired number of
+ iterations that the skin should be valid for.
+ If it is set to 0, then we don't use adaptive skin. */
+ uint updateFrequency = 1;
+ /** \brief If using Morton ordering, this is the number of iterations
+ between each sorting */
+ uint sortFrequency = 0;
+};
+
// =============================================================================
/** @brief Parameters needed for Grains.
@@ -37,8 +96,6 @@ public:
static uint m_numParticles;
/** @brief Number of obstacles in simulation */
static uint m_numObstacles;
- /** @brief Maximum radius among all particles */
- static T m_maxRadius;
/* Physical */
/** \brief Gravity vector */
@@ -61,22 +118,20 @@ public:
static cudaDeviceProp m_GPU;
/* Collision Detection */
- /** \brief Type of neighbor list */
- static uint m_neighborListType;
- /** \brief Frequency of updating neighbor list */
- static uint m_neighborListFrequency;
- /** \brief Type of linked cell */
- static uint m_linkedCellType;
- /** \brief Linked cell size factor */
- static uint m_linkedCellSizeFactor;
- /** \brief Frequency of sorting particles */
- static uint m_sortingFrequency;
- /** @brief Number of cells in linked cell */
- static uint m_numCells;
- /** \brief Type of bounding volume */
- static uint m_boundingVolumeType;
- /** \brief Type of narrow-phase detection */
- static uint m_narrowPhaseType;
+ struct CollisionDetectionParameters
+ {
+ /** \brief Type of neighbor list */
+ NeighborListType neighborListType = NeighborListType::NSQ;
+ /** \brief LinkedCell parameters */
+ LinkedCellParameters linkedCellParameters;
+ /** \brief Type of bounding volume */
+ BoundingVolumeType boundingVolumeType = BoundingVolumeType::OFF;
+ /** \brief Type of narrow-phase detection */
+ NarrowPhaseType narrowPhaseType = NarrowPhaseType::GJK;
+ };
+
+ /** \brief Collision detection parameters */
+ static CollisionDetectionParameters m_collisionDetection;
//@}
};
diff --git a/Grains/Base/include/GrainsUtils.hh b/Grains/Base/include/GrainsUtils.hh
index d22b055a..0f8e53cc 100644
--- a/Grains/Base/include/GrainsUtils.hh
+++ b/Grains/Base/include/GrainsUtils.hh
@@ -3,6 +3,7 @@
#include "Basic.hh"
#include "Vector3.hh"
+#include
// =============================================================================
/** @brief Miscellaneous functionalities (mostly low-level) for Grains.
@@ -34,6 +35,25 @@ __HOST__ static INLINE void
}
}
+// -----------------------------------------------------------------------------
+/** @brief Returns the available memory on the host in bytes */
+__HOST__ static INLINE size_t getAvailableHostMemory()
+{
+ long pages = sysconf(_SC_AVPHYS_PAGES);
+ long page_size = sysconf(_SC_PAGE_SIZE);
+ return pages * page_size;
+}
+
+// -----------------------------------------------------------------------------
+/** @brief Returns the available memory on the device in bytes */
+__HOST__ static INLINE size_t getAvailableDeviceMemory()
+{
+ size_t free_byte;
+ size_t total_byte;
+ cudaErrCheck(cudaMemGetInfo(&free_byte, &total_byte));
+ return free_byte;
+}
+
// -----------------------------------------------------------------------------
/** @brief computes the optimal number of threads and blocks for a given number
of elements and an architecture
@@ -123,7 +143,7 @@ __HOST__ static constexpr INLINE std::string
{
std::ostringstream oss;
oss << vec;
- return ("[" + oss.str() + "]");
+ return (oss.str());
}
// -----------------------------------------------------------------------------
@@ -155,12 +175,46 @@ __HOST__ INLINE void GoutWI(const int numShift, const Args&... args)
/** @brief Writes a message to stdout with Indent (WI)
@param numShift the number of shift characters at the beginning
@param args the output messages */
+// Helper functions for device printf with different types
+__DEVICE__ INLINE void print_device_arg(const char* arg)
+{
+ printf("%s ", arg);
+}
+__DEVICE__ INLINE void print_device_arg(char* arg)
+{
+ printf("%s ", arg);
+}
+__DEVICE__ INLINE void print_device_arg(size_t arg)
+{
+ printf("%zu ", arg);
+}
+__DEVICE__ INLINE void print_device_arg(int arg)
+{
+ printf("%d ", arg);
+}
+__DEVICE__ INLINE void print_device_arg(uint arg)
+{
+ printf("%u ", arg);
+}
+__DEVICE__ INLINE void print_device_arg(long arg)
+{
+ printf("%ld ", arg);
+}
+__DEVICE__ INLINE void print_device_arg(float arg)
+{
+ printf("%f ", arg);
+}
+__DEVICE__ INLINE void print_device_arg(double arg)
+{
+ printf("%f ", arg);
+}
+
template
__HOSTDEVICE__ INLINE void GAbort(const Args&... args)
{
#ifdef __CUDA_ARCH__
printf("[DEVICE] ");
- (printf("%s ", args), ...);
+ (print_device_arg(args), ...);
printf("\n");
__trap(); // aborts the kernel
#else
@@ -171,4 +225,15 @@ __HOSTDEVICE__ INLINE void GAbort(const Args&... args)
#endif
}
+// -----------------------------------------------------------------------------
+/** @brief Assert function that aborts the program if the condition is false
+ @param condition the condition to check
+ @param args the message(s) to display if the assertion fails */
+template
+__HOSTDEVICE__ INLINE void GAssert(bool condition, const Args&... args)
+{
+ if(!condition)
+ GAbort("GAssert failed:", args...);
+}
+
#endif
\ No newline at end of file
diff --git a/Grains/Base/include/MatrixMath.hh b/Grains/Base/include/MatrixMath.hh
index 098c1a56..ce72911f 100644
--- a/Grains/Base/include/MatrixMath.hh
+++ b/Grains/Base/include/MatrixMath.hh
@@ -16,73 +16,202 @@
// =============================================================================
/** @name Matrix3 math functions and operators */
//@{
-/** @brief Returns the determinant of the matrix
-@param m the matrix */
+/** @brief Matrix absolute
+ @param m the matrix */
+template
+__HOSTDEVICE__ static INLINE Matrix3 fabs(const Matrix3& m) noexcept
+{
+ const T* __RESTRICT__ b = m.getBuffer();
+ return (Matrix3(fabs(b[XX]),
+ fabs(b[XY]),
+ fabs(b[XZ]),
+ fabs(b[YX]),
+ fabs(b[YY]),
+ fabs(b[YZ]),
+ fabs(b[ZX]),
+ fabs(b[ZY]),
+ fabs(b[ZZ])));
+}
+
+// -----------------------------------------------------------------------------
+/** @brief Matrix absolute in-place
+ @param m the matrix */
+template
+__HOSTDEVICE__ static INLINE void fabs(Matrix3& m) noexcept
+{
+ T* __RESTRICT__ b = const_cast(m.getBuffer());
+ b[XX] = fabs(b[XX]);
+ b[XY] = fabs(b[XY]);
+ b[XZ] = fabs(b[XZ]);
+ b[YX] = fabs(b[YX]);
+ b[YY] = fabs(b[YY]);
+ b[YZ] = fabs(b[YZ]);
+ b[ZX] = fabs(b[ZX]);
+ b[ZY] = fabs(b[ZY]);
+ b[ZZ] = fabs(b[ZZ]);
+}
+
+// -----------------------------------------------------------------------------
+/** @brief Matrix determinant
+ @param m the matrix */
template
__HOSTDEVICE__ static INLINE T determinant(const Matrix3& m) noexcept
{
- T const* __RESTRICT__ buffer = m.getBuffer();
- T out0 = buffer[XX] * (buffer[YY] * buffer[ZZ] - buffer[YZ] * buffer[ZY]);
- T out1 = buffer[XY] * (buffer[YZ] * buffer[ZX] - buffer[YX] * buffer[ZZ]);
- T out2 = buffer[XZ] * (buffer[YX] * buffer[ZY] - buffer[YY] * buffer[ZX]);
+ const T* __RESTRICT__ b = m.getBuffer();
+ T out0 = b[XX] * (b[YY] * b[ZZ] - b[YZ] * b[ZY]);
+ T out1 = b[XY] * (b[YZ] * b[ZX] - b[YX] * b[ZZ]);
+ T out2 = b[XZ] * (b[YX] * b[ZY] - b[YY] * b[ZX]);
return (out0 + out1 + out2);
}
// -----------------------------------------------------------------------------
-/** @brief Returns the transposed matrix
-@param m the matrix */
+/** @brief Matrix transposition
+ @param m the matrix */
template
__HOSTDEVICE__ static INLINE Matrix3 transpose(const Matrix3& m) noexcept
{
- T const* __RESTRICT__ buffer = m.getBuffer();
- return (Matrix3(buffer[XX],
- buffer[YX],
- buffer[ZX],
- buffer[XY],
- buffer[YY],
- buffer[ZY],
- buffer[XZ],
- buffer[YZ],
- buffer[ZZ]));
+ const T* __RESTRICT__ b = m.getBuffer();
+ return (Matrix3<
+ T>(b[XX], b[YX], b[ZX], b[XY], b[YY], b[ZY], b[XZ], b[YZ], b[ZZ]));
+}
+
+// -----------------------------------------------------------------------------
+/** @brief Matrix transposition in-place
+ @param m the matrix */
+template
+__HOSTDEVICE__ static INLINE void transpose(Matrix3& m) noexcept
+{
+ T* __RESTRICT__ b = const_cast(m.getBuffer());
+ T temp[3];
+ temp[0] = b[XY];
+ temp[1] = b[XZ];
+ temp[2] = b[YZ];
+ b[XY] = b[YX];
+ b[XZ] = b[ZX];
+ b[YZ] = b[ZY];
+ b[YX] = temp[0];
+ b[ZX] = temp[1];
+ b[ZY] = temp[2];
}
// -----------------------------------------------------------------------------
-/** @brief Returns the inverse of the matrix
-@param m the matrix */
+/** @brief Matrix inverse
+ @param m the matrix */
template
__HOSTDEVICE__ static INLINE Matrix3 inverse(const Matrix3& m) noexcept
{
- T const* __RESTRICT__ buffer = m.getBuffer();
+ const T* __RESTRICT__ b = m.getBuffer();
T __RESTRICT__ out[9];
- out[XX] = (buffer[YY] * buffer[ZZ] - buffer[YZ] * buffer[ZY]);
- out[YX] = (buffer[YZ] * buffer[ZX] - buffer[YX] * buffer[ZZ]);
- out[ZX] = (buffer[YX] * buffer[ZY] - buffer[YY] * buffer[ZX]);
- T det = buffer[XX] * out[XX] + buffer[XY] * out[YX] + buffer[XZ] * out[ZX];
- if(fabs(det) < HIGHEPS)
+
+ // Calculate cofactor matrix
+ out[XX] = (b[YY] * b[ZZ] - b[YZ] * b[ZY]);
+ out[XY] = (b[XZ] * b[ZY] - b[XY] * b[ZZ]);
+ out[XZ] = (b[XY] * b[YZ] - b[XZ] * b[YY]);
+ out[YX] = (b[YZ] * b[ZX] - b[YX] * b[ZZ]);
+ out[YY] = (b[XX] * b[ZZ] - b[XZ] * b[ZX]);
+ out[YZ] = (b[XZ] * b[YX] - b[XX] * b[YZ]);
+ out[ZX] = (b[YX] * b[ZY] - b[YY] * b[ZX]);
+ out[ZY] = (b[XY] * b[ZX] - b[XX] * b[ZY]);
+ out[ZZ] = (b[XX] * b[YY] - b[XY] * b[YX]);
+
+ // Calculate determinant
+ T det = b[XX] * out[XX] + b[XY] * out[YX] + b[XZ] * out[ZX];
+ if(fabs(det) < EPS)
printf("Matrix is not inversible!\n");
- T s = T(1) / det;
- out[ZZ] = s * (out[XX]);
- out[XY] = s * (buffer[XZ] * buffer[ZY] - buffer[XY] * buffer[ZZ]);
- out[XZ] = s * (buffer[XY] * buffer[YZ] - buffer[XZ] * buffer[YY]);
- out[YX] = s * (out[XZ]);
- out[YY] = s * (buffer[XX] * buffer[ZZ] - buffer[XZ] * buffer[ZX]);
- out[YZ] = s * (buffer[XZ] * buffer[YY] - buffer[XX] * buffer[YZ]);
- out[ZX] = s * (out[ZX]);
- out[ZY] = s * (buffer[XY] * buffer[ZX] - buffer[XX] * buffer[ZY]);
- out[ZZ] = s * (buffer[XX] * buffer[YY] - buffer[XY] * buffer[YX]);
+
+ // Scale by inverse determinant
+ T s = T(1) / det;
+ for(int i = 0; i < 9; ++i)
+ out[i] *= s;
+
return (Matrix3(out));
}
// -----------------------------------------------------------------------------
-/** @brief Matrices addition
-@param m1 first matrix
-@param m2 second matrix */
+/** @brief Matrix inverse in-place
+ @param m the matrix */
+template
+__HOSTDEVICE__ static INLINE void inverse(Matrix3& m) noexcept
+{
+ T* __RESTRICT__ b = const_cast(m.getBuffer());
+ T __RESTRICT__ out[9];
+
+ // Calculate cofactor matrix
+ out[XX] = (b[YY] * b[ZZ] - b[YZ] * b[ZY]);
+ out[XY] = (b[XZ] * b[ZY] - b[XY] * b[ZZ]);
+ out[XZ] = (b[XY] * b[YZ] - b[XZ] * b[YY]);
+ out[YX] = (b[YZ] * b[ZX] - b[YX] * b[ZZ]);
+ out[YY] = (b[XX] * b[ZZ] - b[XZ] * b[ZX]);
+ out[YZ] = (b[XZ] * b[YX] - b[XX] * b[YZ]);
+ out[ZX] = (b[YX] * b[ZY] - b[YY] * b[ZX]);
+ out[ZY] = (b[XY] * b[ZX] - b[XX] * b[ZY]);
+ out[ZZ] = (b[XX] * b[YY] - b[XY] * b[YX]);
+
+ // Calculate determinant
+ T det = b[XX] * out[XX] + b[XY] * out[YX] + b[XZ] * out[ZX];
+ if(fabs(det) < EPS)
+ printf("Matrix is not inversible!\n");
+
+ // Scale by inverse determinant
+ T s = T(1) / det;
+ for(int i = 0; i < 9; ++i)
+ out[i] *= s;
+
+ m.setValue(out);
+}
+
+// -----------------------------------------------------------------------------
+/** @brief Matrix scale
+ @param m the matrix
+ @param v the vector */
+template
+__HOSTDEVICE__ static INLINE Matrix3 scale(const Matrix3& m,
+ const Vector3& v) noexcept
+{
+ const T* __RESTRICT__ b1 = m.getBuffer();
+ const T* __RESTRICT__ b2 = v.getBuffer();
+ return Matrix3(b1[XX] * b2[0],
+ b1[XY] * b2[1],
+ b1[XZ] * b2[2],
+ b1[YX] * b2[0],
+ b1[YY] * b2[1],
+ b1[YZ] * b2[2],
+ b1[ZX] * b2[0],
+ b1[ZY] * b2[1],
+ b1[ZZ] * b2[2]);
+}
+
+// -----------------------------------------------------------------------------
+/** @brief Matrix scale in-place
+ @param m the matrix
+ @param v the vector */
+template
+__HOSTDEVICE__ static INLINE void scale(Matrix3& m,
+ const Vector3& v) noexcept
+{
+ T* __RESTRICT__ b1 = const_cast(m.getBuffer());
+ const T* __RESTRICT__ b2 = v.getBuffer();
+ b1[XX] *= b2[0];
+ b1[XY] *= b2[1];
+ b1[XZ] *= b2[2];
+ b1[YX] *= b2[0];
+ b1[YY] *= b2[1];
+ b1[YZ] *= b2[2];
+ b1[ZX] *= b2[0];
+ b1[ZY] *= b2[1];
+ b1[ZZ] *= b2[2];
+}
+
+// -----------------------------------------------------------------------------
+/** @brief Matrix addition
+ @param m1 first matrix
+ @param m2 second matrix */
template
__HOSTDEVICE__ static INLINE Matrix3 operator+(const Matrix3& m1,
const Matrix3& m2) noexcept
{
- T const* __RESTRICT__ b1 = m1.getBuffer();
- T const* __RESTRICT__ b2 = m2.getBuffer();
+ const T* __RESTRICT__ b1 = m1.getBuffer();
+ const T* __RESTRICT__ b2 = m2.getBuffer();
T __RESTRICT__ out[9];
for(uint i = 0; i < 9; ++i)
out[i] = b1[i] + b2[i];
@@ -90,70 +219,142 @@ __HOSTDEVICE__ static INLINE Matrix3 operator+(const Matrix3& m1,
}
// -----------------------------------------------------------------------------
-/** @brief Matrices subtraction
-@param m1 first matrix
-@param m2 second matrix */
+/** @brief Matrix addition in-place
+ @param m1 first matrix
+ @param m2 second matrix */
+template
+__HOSTDEVICE__ static INLINE void operator+=(Matrix3& m1,
+ const Matrix3& m2) noexcept
+{
+ T* __RESTRICT__ b1 = const_cast(m1.getBuffer());
+ const T* __RESTRICT__ b2 = m2.getBuffer();
+ for(uint i = 0; i < 9; ++i)
+ b1[i] += b2[i];
+}
+
+// -----------------------------------------------------------------------------
+/** @brief Matrix subtraction
+ @param m1 first matrix
+ @param m2 second matrix */
template
__HOSTDEVICE__ static INLINE Matrix3 operator-(const Matrix3& m1,
const Matrix3& m2) noexcept
{
- T const* __RESTRICT__ b1 = m1.getBuffer();
- T const* __RESTRICT__ b2 = m2.getBuffer();
+ const T* __RESTRICT__ b1 = m1.getBuffer();
+ const T* __RESTRICT__ b2 = m2.getBuffer();
T __RESTRICT__ out[9];
for(uint i = 0; i < 9; ++i)
out[i] = b1[i] - b2[i];
return (Matrix3(out));
}
+// -----------------------------------------------------------------------------
+/** @brief Matrix subtraction in-place
+ @param m1 first matrix
+ @param m2 second matrix */
+template
+__HOSTDEVICE__ static INLINE void operator-=(Matrix3& m1,
+ const Matrix3& m2) noexcept
+{
+ T* __RESTRICT__ b1 = const_cast(m1.getBuffer());
+ const T* __RESTRICT__ b2 = m2.getBuffer();
+ for(uint i = 0; i < 9; ++i)
+ b1[i] -= b2[i];
+}
+
// -----------------------------------------------------------------------------
/** @brief Scalar-matrix product
-@param c the scalar
-@param m the matrix */
+ @param c the scalar
+ @param m the matrix */
template
__HOSTDEVICE__ static INLINE Matrix3 operator*(T c,
const Matrix3& m) noexcept
{
- T const* __RESTRICT__ buffer = m.getBuffer();
+ const T* __RESTRICT__ b = m.getBuffer();
T __RESTRICT__ out[9];
for(uint i = 0; i < 9; ++i)
- out[i] = c * buffer[i];
+ out[i] = c * b[i];
return (Matrix3(out));
}
+// -----------------------------------------------------------------------------
+/** @brief Scalar-matrix product in-place
+ @param c the scalar
+ @param m the matrix */
+template
+__HOSTDEVICE__ static INLINE void operator*=(Matrix3& m, T c) noexcept
+{
+ T* __RESTRICT__ b = const_cast(m.getBuffer());
+ for(uint i = 0; i < 9; ++i)
+ b[i] *= c;
+}
+
// -----------------------------------------------------------------------------
/** @brief Matrix-vector product
-@param m the matrix
-@param v the vector */
+ @param m the matrix
+ @param v the vector */
template
__HOSTDEVICE__ static INLINE Vector3 operator*(const Matrix3& m,
const Vector3& v) noexcept
{
- T const* __RESTRICT__ bufferM = m.getBuffer();
- T const* __RESTRICT__ bufferV = v.getBuffer();
+ const T* __RESTRICT__ b1 = m.getBuffer();
+ const T* __RESTRICT__ b2 = v.getBuffer();
T __RESTRICT__ out[3];
for(uint i = 0; i < 3; ++i)
- out[i] = bufferM[3 * i] * bufferV[0] + bufferM[3 * i + 1] * bufferV[1]
- + bufferM[3 * i + 2] * bufferV[2];
+ out[i]
+ = b1[3 * i] * b2[0] + b1[3 * i + 1] * b2[1] + b1[3 * i + 2] * b2[2];
return (Vector3(out));
}
+// -----------------------------------------------------------------------------
+/** @brief Matrix-vector product in-place. Note that this modifies the vector.
+ @param m the matrix
+ @param v the vector */
+template
+__HOSTDEVICE__ static INLINE void operator*=(const Matrix3& m,
+ Vector3& v) noexcept
+{
+ const T* __RESTRICT__ b1 = m.getBuffer();
+ T* __RESTRICT__ b2 = const_cast(v.getBuffer());
+ T __RESTRICT__ out[3];
+ for(uint i = 0; i < 3; ++i)
+ out[i]
+ = b1[3 * i] * b2[0] + b1[3 * i + 1] * b2[1] + b1[3 * i + 2] * b2[2];
+ v.setValue(out);
+}
+
// -----------------------------------------------------------------------------
/** @brief Vector-matrix product
-@param v the vector
-@param m the matrix */
+ @param v the vector
+ @param m the matrix */
template
__HOSTDEVICE__ static INLINE Vector3 operator*(const Vector3& v,
const Matrix3& m) noexcept
{
- T const* __RESTRICT__ bufferV = v.getBuffer();
- T const* __RESTRICT__ bufferM = m.getBuffer();
+ const T* __RESTRICT__ b1 = v.getBuffer();
+ const T* __RESTRICT__ b2 = m.getBuffer();
T __RESTRICT__ out[3];
for(uint i = 0; i < 3; ++i)
- out[i] = bufferM[i] * bufferV[0] + bufferM[i + 3] * bufferV[1]
- + bufferM[i + 6] * bufferV[2];
+ out[i] = b2[i] * b1[0] + b2[i + 3] * b1[1] + b2[i + 6] * b1[2];
return (Vector3(out));
}
+// -----------------------------------------------------------------------------
+/** @brief Vector-matrix product in-place. Note that this modifies the vector.
+ @param v the vector
+ @param m the matrix */
+template
+__HOSTDEVICE__ static INLINE void operator*=(Vector3& v,
+ const Matrix3& m) noexcept
+{
+ T* __RESTRICT__ b1 = const_cast(v.getBuffer());
+ T const* __RESTRICT__ b2 = m.getBuffer();
+ T out[3];
+ for(uint i = 0; i < 3; ++i)
+ out[i] = b1[0] * b2[i] + b1[1] * b2[i + 3] + b1[2] * b2[i + 6];
+ v.setValue(out);
+}
+
// -----------------------------------------------------------------------------
/** @brief Matrix-matrix product
@param m right matrix */
@@ -173,6 +374,115 @@ __HOSTDEVICE__ static INLINE Matrix3 operator*(const Matrix3& m1,
b1[ZX] * b2[XY] + b1[ZY] * b2[YY] + b1[ZZ] * b2[ZY],
b1[ZX] * b2[XZ] + b1[ZY] * b2[YZ] + b1[ZZ] * b2[ZZ]));
}
+
+// -----------------------------------------------------------------------------
+/** @brief Matrix-matrix product in-place
+ @param m1 left matrix
+ @param m2 right matrix */
+template
+__HOSTDEVICE__ static INLINE void operator*=(Matrix3& m1,
+ const Matrix3& m2) noexcept
+{
+ T* __RESTRICT__ b1 = const_cast(m1.getBuffer());
+ const T* __RESTRICT__ b2 = m2.getBuffer();
+ T __RESTRICT__ out[9];
+ out[XX] = b1[XX] * b2[XX] + b1[XY] * b2[YX] + b1[XZ] * b2[ZX];
+ out[XY] = b1[XX] * b2[XY] + b1[XY] * b2[YY] + b1[XZ] * b2[ZY];
+ out[XZ] = b1[XX] * b2[XZ] + b1[XY] * b2[YZ] + b1[XZ] * b2[ZZ];
+ out[YX] = b1[YX] * b2[XX] + b1[YY] * b2[YX] + b1[YZ] * b2[ZX];
+ out[YY] = b1[YX] * b2[XY] + b1[YY] * b2[YY] + b1[YZ] * b2[ZY];
+ out[YZ] = b1[YX] * b2[XZ] + b1[YY] * b2[YZ] + b1[YZ] * b2[ZZ];
+ out[ZX] = b1[ZX] * b2[XX] + b1[ZY] * b2[YX] + b1[ZZ] * b2[ZX];
+ out[ZY] = b1[ZX] * b2[XY] + b1[ZY] * b2[YY] + b1[ZZ] * b2[ZY];
+ out[ZZ] = b1[ZX] * b2[XZ] + b1[ZY] * b2[YZ] + b1[ZZ] * b2[ZZ];
+ m1.setValue(out);
+}
+
+// -----------------------------------------------------------------------------
+/** @brief Matrix sign flip
+ @param m the matrix */
+template
+__HOSTDEVICE__ static INLINE Matrix3 operator-(const Matrix3& m) noexcept
+{
+ const T* __RESTRICT__ b = m.getBuffer();
+ return (Matrix3(-b[XX],
+ -b[XY],
+ -b[XZ],
+ -b[YX],
+ -b[YY],
+ -b[YZ],
+ -b[ZX],
+ -b[ZY],
+ -b[ZZ]));
+}
+
+// -----------------------------------------------------------------------------
+/** @brief Matrix equality comparison
+ @param m1 first matrix
+ @param m2 second matrix */
+template
+__HOSTDEVICE__ static INLINE bool operator==(const Matrix3& m1,
+ const Matrix3& m2) noexcept
+{
+ const T* __RESTRICT__ b1 = m1.getBuffer();
+ const T* __RESTRICT__ b2 = m2.getBuffer();
+ for(int i = 0; i < 9; ++i)
+ {
+ if(fabs(b1[i] - b2[i]) > EPS)
+ return false;
+ }
+ return true;
+}
+
+// -----------------------------------------------------------------------------
+/** @brief Matrix inequality operator
+ @param m1 first matrix
+ @param m2 second matrix */
+template
+__HOSTDEVICE__ static INLINE bool operator!=(const Matrix3& m1,
+ const Matrix3& m2) noexcept
+{
+ const T* __RESTRICT__ b1 = m1.getBuffer();
+ const T* __RESTRICT__ b2 = m2.getBuffer();
+ for(int i = 0; i < 9; ++i)
+ {
+ if(fabs(b1[i] - b2[i]) > EPS)
+ return false;
+ }
+ return true;
+}
+
+// -----------------------------------------------------------------------------
+/** @brief Matrix check for rotation
+ @param m the matrix
+ @param tol tolerance for numerical checks */
+template
+__HOSTDEVICE__ static INLINE bool isRotation(const Matrix3& m,
+ const T tol = EPS) noexcept
+{
+ // Check if determinant is approximately 1
+ T det = determinant(m);
+ if(fabs(det - T(1)) > tol)
+ return false;
+
+ // Check if matrix * transpose(matrix) = identity
+ Matrix3 inv(transpose(m));
+ inv = m * inv;
+
+ // Check diagonal elements are approximately 1
+ // clang-format off
+ if(fabs(inv(XX) - T(1)) > tol ||
+ fabs(inv(YY) - T(1)) > tol ||
+ fabs(inv(ZZ) - T(1)) > tol)
+ return false;
+
+ // Check off-diagonal elements are approximately 0
+ if(fabs(inv(XY)) > tol || fabs(inv(XZ)) > tol || fabs(inv(YX)) > tol ||
+ fabs(inv(YZ)) > tol || fabs(inv(ZX)) > tol || fabs(inv(ZY)) > tol)
+ return false;
+ // clang-format on
+ return true;
+}
//@}
#endif
\ No newline at end of file
diff --git a/Grains/Base/include/Misc_Kernels.hh b/Grains/Base/include/Misc_Kernels.hh
deleted file mode 100644
index ec01557c..00000000
--- a/Grains/Base/include/Misc_Kernels.hh
+++ /dev/null
@@ -1,46 +0,0 @@
-#ifndef _MISC_KERNELS_HH_
-#define _MISC_KERNELS_HH_
-
-#include "Basic.hh"
-
-// =============================================================================
-/** @brief Miscellaneous kernels for Grains simulation.
-
- This header file contains miscellaneous kernels that are used in the Grains
- simulation. These kernels are used for various purposes such as initializing
- buffers, computing hashes, and other utility functions.
-
- @author A.Yazdani - 2025 - Construction */
-// =============================================================================
-/** @name Miscellaneous kernels */
-//@{
-/** @brief Fills a buffer to default
- @param buffer the buffer to be initialized
- @param size size of the buffer
- @param value the default value to be set (default is T()) */
-template
-__GLOBAL__ void fill_Kernel(T* buffer, const size_t size, const T& value = T())
-{
- size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
- if(idx >= size)
- return;
- buffer[idx] = value;
-}
-
-// -----------------------------------------------------------------------------
-/** @brief fills a buffer to incremental unsigned i
- @param cells pointer to the Cells object
- @param transforms buffer of transformations
- @param size size of the buffer
- @param particleHash output buffer for particle hashes */
-template
-__GLOBAL__ void fillIncremental_Kernel(T* buffer, const size_t size)
-{
- size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
- if(idx >= size)
- return;
- buffer[idx] = static_cast(idx);
-}
-//@}
-
-#endif
\ No newline at end of file
diff --git a/Grains/Base/include/QuaternionMath.hh b/Grains/Base/include/QuaternionMath.hh
index 6553ae6f..82ce7eb9 100644
--- a/Grains/Base/include/QuaternionMath.hh
+++ b/Grains/Base/include/QuaternionMath.hh
@@ -16,125 +16,525 @@
/** @name Quaternion math functions and operators */
//@{
/** @brief Returns the norm of the quaternion
-@param q the quaternion */
+ @param q the quaternion */
template
__HOSTDEVICE__ static INLINE T norm(const Quaternion& q) noexcept
{
- return (sqrt(norm2(q.getVector()) + q.getScalar() * q.getScalar()));
+ const T* __RESTRICT__ b = q.getBuffer();
+ return (sqrt(b[0] * b[0] + b[1] * b[1] + b[2] * b[2] + b[3] * b[3]));
}
// -----------------------------------------------------------------------------
/** @brief Returns the norm squared of the quaternion
-@param q the quaternion */
+ @param q the quaternion */
template
__HOSTDEVICE__ static INLINE T norm2(const Quaternion& q) noexcept
{
- return (norm2(q.getVector()) + q.getScalar() * q.getScalar());
+ const T* __RESTRICT__ b = q.getBuffer();
+ return (b[0] * b[0] + b[1] * b[1] + b[2] * b[2] + b[3] * b[3]);
}
// -----------------------------------------------------------------------------
-/** @brief Returns the conjugate of the quaternion
-@param q the quaternion */
+/** @brief Quaternion conjugate
+ @param q the quaternion */
template
__HOSTDEVICE__ static INLINE Quaternion
conjugate(const Quaternion& q) noexcept
{
- return (Quaternion(-q.getVector(), q.getScalar));
+ const T* __RESTRICT__ b = q.getBuffer();
+ T __RESTRICT__ out[4];
+ out[0] = -b[0];
+ out[1] = -b[1];
+ out[2] = -b[2];
+ out[3] = b[3];
+ return (Quaternion(out));
}
// -----------------------------------------------------------------------------
-/** @brief Returns the inverse of the quaternion
-@param q the quaternion */
+/** @brief Quaternion conjugate in-place
+ @param q the quaternion */
+template
+__HOSTDEVICE__ static INLINE void conjugate(Quaternion& q) noexcept
+{
+ T* __RESTRICT__ b = const_cast(q.getBuffer());
+ b[0] = -b[0];
+ b[1] = -b[1];
+ b[2] = -b[2];
+}
+
+// -----------------------------------------------------------------------------
+/** @brief Quaternion inverse
+ @param q the quaternion */
template
__HOSTDEVICE__ static INLINE Quaternion
inverse(const Quaternion& q) noexcept
{
- return ((T(1) / norm(q)) * conjugate(q));
+ const T* __RESTRICT__ b = q.getBuffer();
+ T __RESTRICT__ out[4];
+ T norm_inv = T(1) / norm(q);
+ out[0] = -norm_inv * b[0];
+ out[1] = -norm_inv * b[1];
+ out[2] = -norm_inv * b[2];
+ out[3] = norm_inv * b[3];
+ return (Quaternion(out));
}
// -----------------------------------------------------------------------------
-/** @brief Sum of 2 quaternions, i.e., q1 + q2
-@param q1 1st quaternion
-@param q2 2nd quaternion */
+/** @brief Quaternion inverse in-place
+ @param q the quaternion */
+template
+__HOSTDEVICE__ static INLINE void inverse(Quaternion& q) noexcept
+{
+ T* __RESTRICT__ b = const_cast(q.getBuffer());
+ T norm_inv = T(1) / norm(q);
+ b[0] = -norm_inv * b[0];
+ b[1] = -norm_inv * b[1];
+ b[2] = -norm_inv * b[2];
+ b[3] = norm_inv * b[3];
+}
+
+// -----------------------------------------------------------------------------
+/** @brief Quaternions addition
+ @param q1 1st quaternion
+ @param q2 2nd quaternion */
template
__HOSTDEVICE__ static INLINE Quaternion
operator+(const Quaternion& q1, const Quaternion& q2) noexcept
{
- return (Quaternion(q1[0] + q2[0],
- q1[1] + q2[1],
- q1[2] + q2[2],
- q1[3] + q2[3]));
+ const T* __RESTRICT__ b1 = q1.getBuffer();
+ const T* __RESTRICT__ b2 = q2.getBuffer();
+ T __RESTRICT__ out[4];
+ for(uint i = 0; i < 4; ++i)
+ out[i] = b1[i] + b2[i];
+ return (Quaternion(out));
+}
+
+// -----------------------------------------------------------------------------
+/** @brief Quaternions addition in-place
+ @param q1 1st quaternion
+ @param q2 2nd quaternion */
+template
+__HOSTDEVICE__ static INLINE void operator+=(Quaternion& q1,
+ const Quaternion& q2) noexcept
+{
+ T* __RESTRICT__ b1 = const_cast(q1.getBuffer());
+ const T* __RESTRICT__ b2 = q2.getBuffer();
+ for(uint i = 0; i < 4; ++i)
+ b1[i] += b2[i];
}
// -----------------------------------------------------------------------------
-/** @brief Subtraction of 2 quaternions, i.e., q1 - q2
-@param q1 1st quaternion
-@param q2 2nd quaternion */
+/** @brief Quaternions subtraction
+ @param q1 1st quaternion
+ @param q2 2nd quaternion */
template
__HOSTDEVICE__ static INLINE Quaternion
operator-(const Quaternion& q1, const Quaternion& q2) noexcept
{
- return (Quaternion(q1[0] - q2[0],
- q1[1] - q2[1],
- q1[2] - q2[2],
- q1[3] - q2[3]));
+ const T* __RESTRICT__ b1 = q1.getBuffer();
+ const T* __RESTRICT__ b2 = q2.getBuffer();
+ T __RESTRICT__ out[4];
+ for(uint i = 0; i < 4; ++i)
+ out[i] = b1[i] - b2[i];
+ return (Quaternion(out));
}
-// ----------------------------------------------------------------------------
-/** @brief Multiplication by a scalar
-@param d the multiplication factor
-@param q the quaternion */
+// -----------------------------------------------------------------------------
+/** @brief Quaternions subtraction in-place
+ @param q1 1st quaternion
+ @param q2 2nd quaternion */
+template
+__HOSTDEVICE__ static INLINE void operator-=(Quaternion& q1,
+ const Quaternion& q2) noexcept
+{
+ T* __RESTRICT__ b1 = const_cast(q1.getBuffer());
+ const T* __RESTRICT__ b2 = q2.getBuffer();
+ for(uint i = 0; i < 4; ++i)
+ b1[i] -= b2[i];
+}
+
+// -----------------------------------------------------------------------------
+/** @brief Scalar-quaternion multiplication
+ @param d the multiplication factor
+ @param q the quaternion */
template
__HOSTDEVICE__ static INLINE Quaternion
operator*(T d, const Quaternion& q) noexcept
{
- return (Quaternion(d * q[0], d * q[1], d * q[2], d * q[3]));
+ const T* __RESTRICT__ b = q.getBuffer();
+ T __RESTRICT__ out[4];
+ for(uint i = 0; i < 4; ++i)
+ out[i] = d * b[i];
+ return (Quaternion(out));
}
-// ----------------------------------------------------------------------------
-/** @brief double product q1 x q2 of 2 quaternions
-@param q1 1st quaternion
-@param q2 2nd quaternion */
+// -----------------------------------------------------------------------------
+/** @brief Scalar-quaternion multiplication in-place
+ @param d the multiplication factor
+ @param q the quaternion */
template
-__HOSTDEVICE__ static INLINE Quaternion
- operator*(const Quaternion& q1, const Quaternion& q2) noexcept
+__HOSTDEVICE__ static INLINE void operator*=(Quaternion& q, T d) noexcept
{
- T w1 = q1.getScalar();
- Vector3 v1 = q1.getVector();
- T w2 = q2.getScalar();
- Vector3 v2 = q2.getVector();
- T tmp = (w1 * w2) - (v1 * v2);
- Vector3 vtmp((v1 ^ v2) + (w1 * v2) + (w2 * v1));
- return (Quaternion(vtmp, tmp));
+ T* __RESTRICT__ b = const_cast(q.getBuffer());
+ for(uint i = 0; i < 4; ++i)
+ b[i] *= d;
}
-// ----------------------------------------------------------------------------
-/** @brief double product on the right of a quaternion by a vector [ 0, v ],
-i.e., q x [ 0, v ]
-@param v the vector
-@param q the quaternion */
+// -----------------------------------------------------------------------------
+/** @brief Quaternion-vector multiplication
+ @param q the quaternion
+ @param v the vector */
template
__HOSTDEVICE__ static INLINE Quaternion
operator*(const Quaternion& q, const Vector3& v) noexcept
{
- T tmp = -q.getVector() * v;
- Vector3 vtmp((q.getVector() ^ v) + (q.getScalar() * v));
- return (Quaternion(vtmp, tmp));
+ const T* __RESTRICT__ b1 = q.getBuffer();
+ const T* __RESTRICT__ b2 = v.getBuffer();
+ T __RESTRICT__ out[4];
+ out[0] = (b1[3] * b2[0]) + (b1[1] * b2[2]) - (b1[2] * b2[1]);
+ out[1] = (b1[3] * b2[1]) + (b1[2] * b2[0]) - (b1[0] * b2[2]);
+ out[2] = (b1[3] * b2[2]) + (b1[0] * b2[1]) - (b1[1] * b2[0]);
+ out[3] = -(b1[0] * b2[0]) - (b1[1] * b2[1]) - (b1[2] * b2[2]);
+ return (Quaternion(out));
+}
+
+// -----------------------------------------------------------------------------
+/** @brief Quaternion-vector multiplication in-place
+ @param q the quaternion
+ @param v the vector */
+template
+__HOSTDEVICE__ static INLINE void operator*=(Quaternion& q,
+ const Vector3& v) noexcept
+{
+ T* __RESTRICT__ b1 = const_cast(q.getBuffer());
+ const T* __RESTRICT__ b2 = v.getBuffer();
+ T out[3];
+ out[0] = (b1[3] * b2[0]) + (b1[1] * b2[2]) - (b1[2] * b2[1]);
+ out[1] = (b1[3] * b2[1]) + (b1[2] * b2[0]) - (b1[0] * b2[2]);
+ out[2] = (b1[3] * b2[2]) + (b1[0] * b2[1]) - (b1[1] * b2[0]);
+ b1[3] = -(b1[0] * b2[0]) - (b1[1] * b2[1]) - (b1[2] * b2[2]);
+ b1[0] = out[0];
+ b1[1] = out[1];
+ b1[2] = out[2];
}
-// ----------------------------------------------------------------------------
-/** @brief double product on the left of a quaternion by a vector [ 0, v ],
-i.e., [ 0, v ] x q
-@param q the quaternion
-@param v the vector */
+// -----------------------------------------------------------------------------
+/** @brief Vector-quaternion multiplication
+ @param v the vector
+ @param q the quaternion */
template
__HOSTDEVICE__ static INLINE Quaternion
operator*(const Vector3& v, const Quaternion& q) noexcept
{
- T tmp = -v * q.getVector();
- Vector3 vtmp((v ^ q.getVector()) + (q.getScalar() * v));
- return (Quaternion(vtmp, tmp));
+ const T* __RESTRICT__ b1 = v.getBuffer();
+ const T* __RESTRICT__ b2 = q.getBuffer();
+ T __RESTRICT__ out[4];
+ out[0] = (b1[0] * b2[3]) + (b1[1] * b2[2]) - (b1[2] * b2[1]);
+ out[1] = (b1[1] * b2[3]) + (b1[2] * b2[0]) - (b1[0] * b2[2]);
+ out[2] = (b1[2] * b2[3]) + (b1[0] * b2[1]) - (b1[1] * b2[0]);
+ out[3] = -(b1[0] * b2[0]) - (b1[1] * b2[1]) - (b1[2] * b2[2]);
+ return (Quaternion(out));
+}
+
+// -----------------------------------------------------------------------------
+/** @brief Vector-quaternion multiplication in-place. Note that this modifies
+ the quaternion, not the vector.
+ @param v the vector
+ @param q the quaternion */
+template
+__HOSTDEVICE__ static INLINE void operator*=(const Vector3& v,
+ Quaternion& q) noexcept
+{
+ const T* __RESTRICT__ b1 = v.getBuffer();
+ T* __RESTRICT__ b2 = const_cast(q.getBuffer());
+ T out[3];
+ out[0] = (b1[0] * b2[3]) + (b1[1] * b2[2]) - (b1[2] * b2[1]);
+ out[1] = (b1[1] * b2[3]) + (b1[2] * b2[0]) - (b1[0] * b2[2]);
+ out[2] = (b1[2] * b2[3]) + (b1[0] * b2[1]) - (b1[1] * b2[0]);
+ b2[3] = -(b1[0] * b2[0]) - (b1[1] * b2[1]) - (b1[2] * b2[2]);
+ b2[0] = out[0];
+ b2[1] = out[1];
+ b2[2] = out[2];
+}
+
+// -----------------------------------------------------------------------------
+/** @brief Rotates a vector by the inverse of a quaternion. Unit quaternion is
+ assumed.
+ @param q the quaternion
+ @param v the vector */
+template
+__HOSTDEVICE__ static INLINE Vector3 operator<<(const Quaternion& q,
+ const Vector3& v) noexcept
+{
+ const T* __RESTRICT__ b1 = q.getBuffer();
+ const T* __RESTRICT__ b2 = v.getBuffer();
+ T __RESTRICT__ out[3];
+ T tx = T(2) * (b1[2] * b2[1] - b1[1] * b2[2]);
+ T ty = T(2) * (b1[0] * b2[2] - b1[2] * b2[0]);
+ T tz = T(2) * (b1[1] * b2[0] - b1[0] * b2[1]);
+ out[0] = b2[0] + b1[3] * tx - (b1[1] * tz - b1[2] * ty);
+ out[1] = b2[1] + b1[3] * ty - (b1[2] * tx - b1[0] * tz);
+ out[2] = b2[2] + b1[3] * tz - (b1[0] * ty - b1[1] * tx);
+ return (Vector3(out));
+}
+
+// -----------------------------------------------------------------------------
+/** @brief Rotates a vector by the inverse of a quaternion in-place. Note that
+ this modifies the vector, not the quaternion.
+ @param v the vector
+ @param q the quaternion */
+template
+__HOSTDEVICE__ static INLINE void operator<<=(Vector3& v,
+ const Quaternion& q) noexcept
+{
+ T* __RESTRICT__ b1 = const_cast(v.getBuffer());
+ const T* __RESTRICT__ b2 = q.getBuffer();
+ T tx = T(2) * (b2[2] * b1[1] - b2[1] * b1[2]);
+ T ty = T(2) * (b2[0] * b1[2] - b2[2] * b1[0]);
+ T tz = T(2) * (b2[1] * b1[0] - b2[0] * b1[1]);
+ b1[0] += b2[3] * tx - (b2[1] * tz - b2[2] * ty);
+ b1[1] += b2[3] * ty - (b2[2] * tx - b2[0] * tz);
+ b1[2] += b2[3] * tz - (b2[0] * ty - b2[1] * tx);
+}
+
+// -----------------------------------------------------------------------------
+/** @brief Rotates a vector by a quaternion. Unit quaternion is assumed.
+ @param q the quaternion
+ @param v the vector */
+template
+__HOSTDEVICE__ static INLINE Vector3 operator>>(const Quaternion& q,
+ const Vector3& v) noexcept
+{
+ // Using the formula: v' = v + 2 * (q x v) * q + (q.w^2 - |q.v|^2) * v
+ const T* __RESTRICT__ b1 = q.getBuffer();
+ const T* __RESTRICT__ b2 = v.getBuffer();
+ T __RESTRICT__ out[3];
+ // Compute t = 2 * (q_vec x v)
+ T tx = T(2) * (b1[1] * b2[2] - b1[2] * b2[1]);
+ T ty = T(2) * (b1[2] * b2[0] - b1[0] * b2[2]);
+ T tz = T(2) * (b1[0] * b2[1] - b1[1] * b2[0]);
+ // Compute v' = v + w * t + cross(q_vec, t)
+ out[0] = b2[0] + b1[3] * tx + (b1[1] * tz - b1[2] * ty);
+ out[1] = b2[1] + b1[3] * ty + (b1[2] * tx - b1[0] * tz);
+ out[2] = b2[2] + b1[3] * tz + (b1[0] * ty - b1[1] * tx);
+ return (Vector3(out));
+}
+
+// -----------------------------------------------------------------------------
+/** @brief Rotates a vector by a quaternion in-place. Note that this modifies
+ the vector, not the quaternion. Also, Unit quaternion is assumed.
+ @param v the vector
+ @param q the quaternion */
+template
+__HOSTDEVICE__ static INLINE void operator>>=(Vector3& v,
+ const Quaternion& q) noexcept
+{
+ // Using the formula: v' = v + 2 * (q x v) * q + (q.w^2 - |q.v|^2) * v
+ T* __RESTRICT__ b1 = const_cast(v.getBuffer());
+ const T* __RESTRICT__ b2 = q.getBuffer();
+ // Compute t = 2 * (q_vec x v)
+ T tx = T(2) * (b2[1] * b1[2] - b2[2] * b1[1]);
+ T ty = T(2) * (b2[2] * b1[0] - b2[0] * b1[2]);
+ T tz = T(2) * (b2[0] * b1[1] - b2[1] * b1[0]);
+ // Compute v' = v + w * t + cross(q_vec, t)
+ b1[0] += b2[3] * tx + (b2[1] * tz - b2[2] * ty);
+ b1[1] += b2[3] * ty + (b2[2] * tx - b2[0] * tz);
+ b1[2] += b2[3] * tz + (b2[0] * ty - b2[1] * tx);
+}
+
+// -----------------------------------------------------------------------------
+/** @brief Quaternion-quaternion multiplication
+ @param q1 1st quaternion
+ @param q2 2nd quaternion */
+template
+__HOSTDEVICE__ static INLINE Quaternion
+ operator*(const Quaternion& q1, const Quaternion& q2) noexcept
+{
+ const T* __RESTRICT__ b1 = q1.getBuffer();
+ const T* __RESTRICT__ b2 = q2.getBuffer();
+ T __RESTRICT__ out[4];
+ out[0]
+ = (b1[3] * b2[0]) + (b1[0] * b2[3]) + (b1[1] * b2[2]) - (b1[2] * b2[1]);
+ out[1]
+ = (b1[3] * b2[1]) + (b1[1] * b2[3]) + (b1[2] * b2[0]) - (b1[0] * b2[2]);
+ out[2]
+ = (b1[3] * b2[2]) + (b1[2] * b2[3]) + (b1[0] * b2[1]) - (b1[1] * b2[0]);
+ out[3]
+ = (b1[3] * b2[3]) - (b1[0] * b2[0]) - (b1[1] * b2[1]) - (b1[2] * b2[2]);
+ return (Quaternion(out));
+}
+
+// -----------------------------------------------------------------------------
+/** @brief Quaternion-quaternion multiplication in-place
+ @param q1 1st quaternion
+ @param q2 2nd quaternion */
+template
+__HOSTDEVICE__ static INLINE void operator*=(Quaternion& q1,
+ const Quaternion& q2) noexcept
+{
+ T* __RESTRICT__ b1 = const_cast(q1.getBuffer());
+ const T* __RESTRICT__ b2 = q2.getBuffer();
+ T out[3];
+ out[0]
+ = (b1[3] * b2[0]) + (b1[0] * b2[3]) + (b1[1] * b2[2]) - (b1[2] * b2[1]);
+ out[1]
+ = (b1[3] * b2[1]) + (b1[1] * b2[3]) + (b1[2] * b2[0]) - (b1[0] * b2[2]);
+ out[2]
+ = (b1[3] * b2[2]) + (b1[2] * b2[3]) + (b1[0] * b2[1]) - (b1[1] * b2[0]);
+ b1[3]
+ = (b1[3] * b2[3]) - (b1[0] * b2[0]) - (b1[1] * b2[1]) - (b1[2] * b2[2]);
+ b1[0] = out[0];
+ b1[1] = out[1];
+ b1[2] = out[2];
+}
+
+// -----------------------------------------------------------------------------
+/** @brief Quaternion equality operator
+ @param q1 1st quaternion
+ @param q2 2nd quaternion */
+template
+__HOSTDEVICE__ static INLINE bool operator==(const Quaternion& q1,
+ const Quaternion& q2) noexcept
+{
+ const T* __RESTRICT__ b1 = q1.getBuffer();
+ const T* __RESTRICT__ b2 = q2.getBuffer();
+ for(int i = 0; i < 4; ++i)
+ {
+ if(fabs(b1[i] - b2[i]) > EPS)
+ return false;
+ }
+ return true;
+}
+
+// -----------------------------------------------------------------------------
+/** @brief Quaternion inequality operator
+ @param q1 1st quaternion
+ @param q2 2nd quaternion */
+template
+__HOSTDEVICE__ static INLINE bool operator!=(const Quaternion& q1,
+ const Quaternion& q2) noexcept
+{
+ const T* __RESTRICT__ b1 = q1.getBuffer();
+ const T* __RESTRICT__ b2 = q2.getBuffer();
+ for(int i = 0; i < 4; ++i)
+ {
+ if(fabs(b1[i] - b2[i]) > EPS)
+ return true;
+ }
+ return false;
}
+
+// -----------------------------------------------------------------------------
+/** @brief Quaternion sign flip
+ @param q the quaternion */
+template
+__HOSTDEVICE__ static INLINE Quaternion
+ operator-(const Quaternion