Add SIMD support via xsimd library #207

Copilot · 2026-01-26T18:16:29Z

The CPMAddPackage("gh:xtensor-stack/xsimd#14.0.0") call introduces a supply-chain risk by fetching and building third-party code from GitHub pinned only to a mutable tag without any additional integrity verification. If the xtensor-stack/xsimd repository or its tags are compromised, CI/builds that have access to secrets could execute attacker-controlled code. To mitigate this, pin the dependency to an immutable commit SHA (and, if supported by CPM, enable checksum/signature verification) rather than relying solely on a version tag.

-Original file line number
+Diff line change
@@ Expand Up / @@ -75,6 +75,7 @@ else() @@
     endif()
     option(IPC_TOOLKIT_WITH_CUDA                  "Enable CUDA CCD"                               OFF)
+    option(IPC_TOOLKIT_WITH_SIMD                  "Enable SIMD"                                    ON)
     option(IPC_TOOLKIT_WITH_RATIONAL_INTERSECTION "Use rational edge-triangle intersection check" OFF)
     option(IPC_TOOLKIT_WITH_ROBIN_MAP             "Use Tessil's robin-map rather than std maps"    ON)
     option(IPC_TOOLKIT_WITH_ABSEIL                "Use Abseil's hash functions"                    ON)
@@ Expand All @@
     option(IPC_TOOLKIT_WITH_PROFILER              "Enable performance profiler"                   OFF)
     # Advanced options
-    option(IPC_TOOLKIT_WITH_SIMD                  "Enable SIMD"                                   OFF)
     option(IPC_TOOLKIT_WITH_CODE_COVERAGE         "Enable coverage reporting"                     OFF)
-    mark_as_advanced(IPC_TOOLKIT_WITH_SIMD)          # This does not work reliably
     mark_as_advanced(IPC_TOOLKIT_WITH_CODE_COVERAGE) # This is used in GitHub Actions
     # Set default minimum C++ standard
@@ Expand All / @@ -112,9 +111,10 @@ include(ipc_toolkit_use_colors) @@
     set(CMAKE_POSITION_INDEPENDENT_CODE ON)
     ################################################################################
-    # CUDA
+    # Verify Options
     ################################################################################
+    # CUDA support
     if(IPC_TOOLKIT_WITH_CUDA)
       # If CMAKE_CUDA_ARCHITECTURES was not specified, set it to native.
       if(DEFINED CMAKE_CUDA_ARCHITECTURES)
@@ Expand All / @@ -129,6 +129,19 @@ if(IPC_TOOLKIT_WITH_CUDA) @@
       enable_language(CUDA)
     endif()
+    ## SIMD support
+    if(IPC_TOOLKIT_WITH_SIMD)
+      # Figure out SIMD support
+      message(STATUS "Testing SIMD capabilities...")
+      find_package(SIMD)
+      if (SIMD_CXX_FLAGS)
+        message(STATUS "SIMD support found: ${SIMD_CXX_FLAGS}")
+      else()
+        message(WARNING "SIMD support requested but not found. Continuing without SIMD.")
+        set(IPC_TOOLKIT_WITH_SIMD OFF CACHE BOOL "Enable SIMD" FORCE)
+      endif()
+    endif()
     ################################################################################
     # IPC Toolkit Library
     ################################################################################
@@ Expand Down Expand Up @@
     ## SIMD support
     if(IPC_TOOLKIT_WITH_SIMD)
-      # Figure out SIMD support
-      message(STATUS "Testing SIMD capabilities...")
-      find_package(SIMD)
       # Add SIMD flags to compiler flags
-      message(STATUS "Using SIMD flags: ${SIMD_FLAGS}")
-      target_compile_options(ipc_toolkit PRIVATE ${SIMD_FLAGS})
-    else()
-      message(STATUS "SIMD support disabled")
+      target_compile_options(ipc_toolkit PRIVATE ${SIMD_CXX_FLAGS})
+      # Link against cross-platform xsimd library
+      include(xsimd)
+      target_link_libraries(ipc_toolkit PRIVATE xsimd::xsimd)
+      # Disable vectorization in Eigen since I've found it to have alignment issues.
+      target_compile_definitions(Eigen3_Eigen INTERFACE EIGEN_DONT_VECTORIZE=1)
     endif()
     # For MSVC, do not use the min and max macros.
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up / @@ -51,16 +51,6 @@ @@
             "IPC_TOOLKIT_WITH_CUDA": "ON"
           }
         },
-        {
-          "name": "simd",
-          "inherits": "release",
-          "displayName": "SIMD Enabled",
-          "description": "Build with SIMD optimizations",
-          "binaryDir": "${sourceDir}/build/simd",
-          "cacheVariables": {
-            "IPC_TOOLKIT_WITH_SIMD": "ON"
-          }
-        },
         {
           "name": "test",
           "inherits": "debug",
@@ Expand All / @@ -82,7 +72,6 @@ @@
           "cacheVariables": {
             "IPC_TOOLKIT_BUILD_PYTHON": "ON",
             "IPC_TOOLKIT_BUILD_TESTS": "OFF",
-            "IPC_TOOLKIT_WITH_SIMD": "OFF",
             "IPC_TOOLKIT_WITH_CUDA": "OFF"
           }
         },
@@ Expand Down Expand Up / @@ -166,4 +155,4 @@ @@
           }
         }
       ]
-    }
+    }

-Original file line number
+Diff line change
@@ Expand Up / @@ -31,12 +31,12 @@ @@
     # option(IPC_TOOLKIT_BUILD_TESTS                "Build unit-tests"                               ON)
     # option(IPC_TOOLKIT_BUILD_PYTHON               "Build Python bindings"                         OFF)
     # option(IPC_TOOLKIT_WITH_CUDA                  "Enable CUDA CCD"                               OFF)
+    # option(IPC_TOOLKIT_WITH_SIMD                  "Enable SIMD"                                    ON)
     # option(IPC_TOOLKIT_WITH_RATIONAL_INTERSECTION "Use rational edge-triangle intersection check" OFF)
     # option(IPC_TOOLKIT_WITH_ROBIN_MAP             "Use Tessil's robin-map rather than std maps"    ON)
     # option(IPC_TOOLKIT_WITH_ABSEIL                "Use Abseil's hash functions"                    ON)
     # option(IPC_TOOLKIT_WITH_FILIB                 "Use filib for interval arithmetic"              ON)
     # option(IPC_TOOLKIT_WITH_INEXACT_CCD           "Use the original inexact CCD method of IPC"    OFF)
-    # option(IPC_TOOLKIT_WITH_SIMD                  "Enable SIMD"                                   OFF)
     # option(IPC_TOOLKIT_WITH_CODE_COVERAGE         "Enable coverage reporting"                     OFF)
     # option(IPC_TOOLKIT_TESTS_CCD_BENCHMARK        "Enable CCD benchmark test"                      ON)
     # set(IPC_TOOLKIT_TESTS_CCD_BENCHMARK_DIR     "" CACHE PATH "Path to the CCD benchmark directory")
@@ Expand Down @@

-Original file line number
+Diff line change
@@ -0,0 +1,15 @@
+    # xsimd (https://github.com/xtensor-stack/xsimd)
+    # License: BSD-3-Clause
+    if(TARGET xsimd::xsimd)
+      return()
+    endif()
+    message(STATUS "Third-party: creating target 'xsimd::xsimd'")
+    include(CPM)
+    CPMAddPackage("gh:xtensor-stack/xsimd#14.0.0")
+    add_library(xsimd::xsimd ALIAS xsimd)
+    # Folder name for IDE
+    set_target_properties(xsimd PROPERTIES FOLDER "ThirdParty")

-Original file line number
+Diff line change
@@ Expand Up / @@ -9,11 +9,16 @@ @@
     #include <tbb/parallel_for.h>
     #include <tbb/parallel_sort.h>
-    #ifdef __APPLE__
-    // We utilize SIMD registers to compare 1 Node against 4 Queries simultaneously.
-    #include <simd/simd.h>
+    #ifdef IPC_TOOLKIT_WITH_SIMD
+    // We utilize SIMD registers to compare one node against multiple queries
+    // simultaneously, with the number of queries determined by
+    // xs::batch<float>::size.
+    #include <xsimd/xsimd.hpp>
+    namespace xs = xsimd;
     #endif
+    #include <array>
     using namespace std::placeholders;
     namespace ipc {
@@ Expand Down Expand Up / @@ -448,9 +453,9 @@ namespace { @@
             } while (node_idx != LBVH::Node::INVALID_POINTER); // Same as root
         }
-    #ifdef __APPLE__
+    #ifdef IPC_TOOLKIT_WITH_SIMD
         // SIMD Traversal
-        // Traverses 4 queries simultaneously using SIMD.
+        // Traverses multiple queries simultaneously using SIMD.
         template <typename Candidate, bool swap_order, bool triangular>
         void traverse_lbvh_simd(
             const LBVH::Node* queries,
@@ Expand All / @@ -459,28 +464,37 @@ namespace { @@
             const std::function<bool(size_t, size_t)>& can_collide,
             std::vector<Candidate>& candidates)
         {
-            assert(n_queries >= 1 && n_queries <= 4);
-            // Load 4 queries into single registers (Structure of Arrays)
-            auto make_simd = [&](auto F) -> simd_float4 {
-                return simd_float4 {
-                    F(0),
-                    n_queries > 1 ? F(1) : 0.0f,
-                    n_queries > 2 ? F(2) : 0.0f,
-                    n_queries > 3 ? F(3) : 0.0f,
-                };
+            using batch_t = xs::batch<float>;
+            assert(n_queries >= 1 && n_queries <= batch_t::size);
+            // Load queries into single registers
+            auto make_simd = [&](auto F) -> batch_t {
+                // 1. Create a buffer of the correct architecture-dependent size
+                alignas(xs::default_arch::alignment())
+                    std::array<float, batch_t::size>
+                        buffer {};
+    #pragma unroll
+                // 2. Fill the buffer, respecting the actual number of queries
+                for (size_t i = 0; i < batch_t::size; ++i) {
+                    buffer[i] = (i < n_queries) ? F(static_cast<int>(i)) : 0.0f;
+                }
+                // 3. Load the buffer into the SIMD register
+                return batch_t::load_aligned(buffer.data());
             };
-            const simd_float4 q_min_x =
+            const auto q_min_x =
                 make_simd([&](int k) { return queries[k].aabb_min.x(); });
-            const simd_float4 q_min_y =
+            const auto q_min_y =
                 make_simd([&](int k) { return queries[k].aabb_min.y(); });
-            const simd_float4 q_min_z =
+            const auto q_min_z =
                 make_simd([&](int k) { return queries[k].aabb_min.z(); });
-            const simd_float4 q_max_x =
+            const auto q_max_x =
                 make_simd([&](int k) { return queries[k].aabb_max.x(); });
-            const simd_float4 q_max_y =
+            const auto q_max_y =
                 make_simd([&](int k) { return queries[k].aabb_max.y(); });
-            const simd_float4 q_max_z =
+            const auto q_max_z =
                 make_simd([&](int k) { return queries[k].aabb_max.z(); });
             // Use a fixed-size array as a stack to avoid dynamic allocations
@@ Expand All / @@ -505,31 +519,33 @@ namespace { @@
                 const LBVH::Node& child_l = lbvh[node.left];
                 const LBVH::Node& child_r = lbvh[node.right];
-                // 1. Intersect 4 queries at once
+                // 1. Intersect multiple queries at once
                 // (child_l.min <= query.max) && (query.min <= child_l.max)
-                const simd_int4 intersects_l = (child_l.aabb_min.x() <= q_max_x)
+                const xs::batch_bool<float> intersects_l =
+                    (child_l.aabb_min.x() <= q_max_x)
                     & (child_l.aabb_min.y() <= q_max_y)
                     & (child_l.aabb_min.z() <= q_max_z)
                     & (q_min_x <= child_l.aabb_max.x())
                     & (q_min_y <= child_l.aabb_max.y())
                     & (q_min_z <= child_l.aabb_max.z());
-                // 2. Intersect 4 queries at once
+                // 2. Intersect multiple queries at once
                 // (child_r.min <= query.max) && (query.min <= child_r.max)
-                const simd_int4 intersects_r = (child_r.aabb_min.x() <= q_max_x)
+                const xs::batch_bool<float> intersects_r =
+                    (child_r.aabb_min.x() <= q_max_x)
                     & (child_r.aabb_min.y() <= q_max_y)
                     & (child_r.aabb_min.z() <= q_max_z)
                     & (q_min_x <= child_r.aabb_max.x())
                     & (q_min_y <= child_r.aabb_max.y())
                     & (q_min_z <= child_r.aabb_max.z());
-                const bool any_intersects_l = simd_any(intersects_l);
-                const bool any_intersects_r = simd_any(intersects_r);
+                const bool any_intersects_l = xs::any(intersects_l);
+                const bool any_intersects_r = xs::any(intersects_r);
                 // Query overlaps a leaf node => report collision
                 if (any_intersects_l && child_l.is_leaf()) {
                     for (int k = 0; k < n_queries; ++k) {
-                        if (intersects_l[k]) {
+                        if (intersects_l.get(k)) {
                             attempt_add_candidate<
                                 Candidate, swap_order, triangular>(
                                 queries[k], child_l, can_collide, candidates);
@@ Expand All / @@ -538,7 +554,7 @@ namespace { @@
                 }
                 if (any_intersects_r && child_r.is_leaf()) {
                     for (int k = 0; k < n_queries; ++k) {
-                        if (intersects_r[k]) {
+                        if (intersects_r.get(k)) {
                             attempt_add_candidate<
                                 Candidate, swap_order, triangular>(
                                 queries[k], child_r, can_collide, candidates);
@@ Expand Down Expand Up / @@ -576,9 +592,12 @@ namespace { @@
             const std::function<bool(size_t, size_t)>& can_collide,
             tbb::enumerable_thread_specific<std::vector<Candidate>>& storage)
         {
-    #ifdef __APPLE__ // Only support SIMD on Apple platforms for now
-            constexpr size_t SIMD_SIZE = use_simd ? 4 : 1;
-            constexpr size_t GRAIN_SIZE = use_simd ? 16 : 1;
+    #ifdef IPC_TOOLKIT_WITH_SIMD // Enable SIMD acceleration when available
+            constexpr size_t SIMD_SIZE = use_simd ? xs::batch<float>::size : 1;
+            static_assert(
+                64 % xs::batch<float>::size == 0, "GRAIN_SIZE must be an integer");
+            constexpr size_t GRAIN_SIZE =
+                use_simd ? (64 / xs::batch<float>::size) : 1;
     #else
             constexpr size_t SIMD_SIZE = 1;
             constexpr size_t GRAIN_SIZE = 1;
@@ Expand All / @@ -595,11 +614,13 @@ namespace { @@
                 tbb::blocked_range<size_t>(size_t(0), n_tasks, GRAIN_SIZE),
                 [&](const tbb::blocked_range<size_t>& r) {
                     auto& local_candidates = storage.local();
+    #ifdef IPC_TOOLKIT_WITH_SIMD
                     const size_t actual_end = // Handle tail case
                         std::min(SIMD_SIZE * r.end(), n_source_leaves);
+    #endif
                     for (size_t i = r.begin(); i < r.end(); ++i) {
                         const size_t idx = SIMD_SIZE * i;
-    #ifdef __APPLE__
+    #ifdef IPC_TOOLKIT_WITH_SIMD
                         if constexpr (use_simd) {
                             assert(actual_end - idx >= 1);
                             traverse_lbvh_simd<Candidate, swap_order, triangular>(
@@ Expand All / @@ -611,7 +632,7 @@ namespace { @@
                             traverse_lbvh<Candidate, swap_order, triangular>(
                                 source[source_leaf_offset + idx], target,
                                 can_collide, local_candidates);
-    #ifdef __APPLE__
+    #ifdef IPC_TOOLKIT_WITH_SIMD
                         }
     #endif
                     }
@@ Expand Down @@

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Add SIMD support via xsimd library #207

Diff view

Diff view

There are no files selected for viewing

Copilot AI Jan 26, 2026

Uh oh!

Uh oh!

-Original file line number
+Diff line change
@@ Expand Up / @@ -13,6 +13,7 @@ @@
     #cmakedefine IPC_TOOLKIT_WITH_INEXACT_CCD
     #cmakedefine IPC_TOOLKIT_WITH_RATIONAL_INTERSECTION
     #cmakedefine IPC_TOOLKIT_WITH_CUDA
+    #cmakedefine IPC_TOOLKIT_WITH_SIMD
     #cmakedefine IPC_TOOLKIT_WITH_ROBIN_MAP
     #cmakedefine IPC_TOOLKIT_WITH_ABSEIL
     #cmakedefine IPC_TOOLKIT_WITH_FILIB
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up / @@ -5,6 +5,13 @@ @@
     #include <cassert>
+    #ifdef EIGEN_DONT_VECTORIZE
+    // NOTE: Avoid error about abs casting double to int. Eigen does this
+    // internally but seemingly only if EIGEN_DONT_VECTORIZE is not defined.
+    // TODO: We should always use std::abs to avoid this issue.
+    EIGEN_USING_STD(abs); // using std::abs;
+    #endif
     namespace Eigen {
     template <typename T> using RowRef = Ref<T, 0, Eigen::InnerStride<>>;
     template <typename T> using ConstRef = const Ref<const T>&;
@@ Expand Down @@

Add SIMD support via xsimd library #207

Add SIMD support via xsimd library #207

Uh oh!

Uh oh!

Diff view

Diff view

There are no files selected for viewing

Copilot AI Jan 26, 2026

Choose a reason for hiding this comment

Uh oh!

Uh oh!