From d2bdd0f52ba0305662731b35dd80a2eda51aeb96 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Fri, 20 Mar 2026 15:02:58 -0400 Subject: [PATCH] fix: harden fast-math reassociation barriers --- .github/workflows/windows.yml | 31 +++++++++++++++++++ .../arch/common/xsimd_common_details.hpp | 16 ++++++++++ .../xsimd/arch/common/xsimd_common_math.hpp | 10 +----- include/xsimd/arch/xsimd_avx2.hpp | 11 ++----- include/xsimd/arch/xsimd_common_fwd.hpp | 16 +++++++++- include/xsimd/arch/xsimd_sse2.hpp | 4 +++ include/xsimd/arch/xsimd_sse4_1.hpp | 11 ++----- include/xsimd/config/xsimd_config.hpp | 21 +++++++++++++ .../xsimd/config/xsimd_cpu_features_x86.hpp | 4 +-- 9 files changed, 96 insertions(+), 28 deletions(-) diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index e9e782aed..e907b4a0e 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -92,6 +92,37 @@ jobs: run: | cd _build && ./test/test_xsimd + build-windows-clang-cl-fast-math: + name: 'clang-cl 19 x64 /fp:fast' + defaults: + run: + shell: bash {0} + runs-on: windows-2025 + steps: + - name: Setup compiler + uses: ilammy/msvc-dev-cmd@v1 + with: + arch: amd64 + - name: Setup LLVM 19 + run: | + choco install llvm --version=19.1.7 -y --no-progress + - name: Setup Ninja + run: | + python3 -m pip install --upgrade pip setuptools wheel + python3 -m pip install ninja + - name: Checkout xsimd + uses: actions/checkout@v3 + - name: Setup + run: | + mkdir _build + cd _build && cmake .. -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DBUILD_BENCHMARK=OFF -DBUILD_EXAMPLES=OFF -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER="C:/Program Files/LLVM/bin/clang-cl.exe" -DCMAKE_CXX_COMPILER="C:/Program Files/LLVM/bin/clang-cl.exe" -DCMAKE_CXX_FLAGS="/fp:fast" -G Ninja + - name: Build + run: | + cd _build && cmake --build . + - name: Testing xsimd + run: | + cd _build && ./test/test_xsimd + build-windows-arm64: name: 'MSVC arm64' defaults: diff --git a/include/xsimd/arch/common/xsimd_common_details.hpp b/include/xsimd/arch/common/xsimd_common_details.hpp index efe01806b..4cb8c68c1 100644 --- a/include/xsimd/arch/common/xsimd_common_details.hpp +++ b/include/xsimd/arch/common/xsimd_common_details.hpp @@ -111,6 +111,22 @@ namespace xsimd namespace detail { + template + XSIMD_INLINE void reassociation_barrier(T& x, memory_barrier_tag) noexcept + { +#if XSIMD_WITH_INLINE_ASM + __asm__ volatile("" : : "r"(&x) : "memory"); +#else + (void)x; +#endif + } + + template + XSIMD_INLINE void reassociation_barrier(T& x, A const&) noexcept + { + detail::reassociation_barrier(x, memory_barrier_tag {}); + } + template XSIMD_INLINE batch apply(F&& func, batch const& self, batch const& other) noexcept { diff --git a/include/xsimd/arch/common/xsimd_common_math.hpp b/include/xsimd/arch/common/xsimd_common_math.hpp index f84883405..6fc06c1ea 100644 --- a/include/xsimd/arch/common/xsimd_common_math.hpp +++ b/include/xsimd/arch/common/xsimd_common_math.hpp @@ -1900,17 +1900,9 @@ namespace xsimd batch_type s = bitofsign(self); batch_type v = self ^ s; batch_type t2n = constants::twotonmb(); - // Under fast-math, reordering is possible and the compiler optimizes d - // to v. That's not what we want, so prevent compiler optimization here. - // FIXME: it may be better to emit a memory barrier here (?). -#ifdef __FAST_MATH__ batch_type d0 = v + t2n; - asm volatile("" ::"r"(&d0) : "memory"); + detail::reassociation_barrier(d0.data, A {}); batch_type d = d0 - t2n; -#else - batch_type d0 = v + t2n; - batch_type d = d0 - t2n; -#endif return s ^ select(v < t2n, d, v); } } diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp index 1eecabf7f..8045ff390 100644 --- a/include/xsimd/arch/xsimd_avx2.hpp +++ b/include/xsimd/arch/xsimd_avx2.hpp @@ -554,11 +554,8 @@ namespace xsimd __m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52 // With -ffast-math, the compiler may reassociate (xH-C)+xL into // xH+(xL-C). Since xL< class batch_bool; + namespace kernel + { + namespace detail + { + struct memory_barrier_tag + { + }; + } + } template struct batch_constant; template @@ -101,6 +110,12 @@ namespace xsimd // Forward declarations for pack-level helpers namespace detail { + template + XSIMD_INLINE void reassociation_barrier(T& x, memory_barrier_tag) noexcept; + + template + XSIMD_INLINE void reassociation_barrier(T& x, A const&) noexcept; + template XSIMD_INLINE constexpr bool is_identity() noexcept; template @@ -115,7 +130,6 @@ namespace xsimd XSIMD_INLINE constexpr bool is_only_from_lo(batch_constant) noexcept; template XSIMD_INLINE constexpr bool is_only_from_hi(batch_constant) noexcept; - } } } diff --git a/include/xsimd/arch/xsimd_sse2.hpp b/include/xsimd/arch/xsimd_sse2.hpp index e95cbbc83..3d2e8ff71 100644 --- a/include/xsimd/arch/xsimd_sse2.hpp +++ b/include/xsimd/arch/xsimd_sse2.hpp @@ -716,6 +716,8 @@ namespace xsimd __m128i mask = _mm_setr_epi16(0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000); __m128i xL = _mm_or_si128(_mm_and_si128(mask, x), _mm_andnot_si128(mask, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)))); // 2^52 __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52 + // Prevent -ffast-math from reassociating (xH-C)+xL into xH+(xL-C). + detail::reassociation_barrier(f, sse2 {}); return _mm_add_pd(f, _mm_castsi128_pd(xL)); } @@ -730,6 +732,8 @@ namespace xsimd __m128i mask = _mm_setr_epi16(0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000); __m128i xL = _mm_or_si128(_mm_and_si128(mask, x), _mm_andnot_si128(mask, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)))); // 2^52 __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52 + // Prevent -ffast-math from reassociating (xH-C)+xL into xH+(xL-C). + detail::reassociation_barrier(f, sse2 {}); return _mm_add_pd(f, _mm_castsi128_pd(xL)); } diff --git a/include/xsimd/arch/xsimd_sse4_1.hpp b/include/xsimd/arch/xsimd_sse4_1.hpp index 358a6b33b..c673a83a8 100644 --- a/include/xsimd/arch/xsimd_sse4_1.hpp +++ b/include/xsimd/arch/xsimd_sse4_1.hpp @@ -64,11 +64,8 @@ namespace xsimd __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52 // With -ffast-math, the compiler may reassociate (xH-C)+xL into // xH+(xL-C). Since xL<