Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions .github/workflows/windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,37 @@ jobs:
run: |
cd _build && ./test/test_xsimd

# CI job: configure, build and run the xsimd test suite on a windows-2025
# runner using clang-cl from LLVM 19.1.7 (installed through Chocolatey) with
# the Ninja generator. /fp:fast is passed through CMAKE_CXX_FLAGS so the tests
# run with the compiler's fast floating-point mode enabled.
build-windows-clang-cl-fast-math:
name: 'clang-cl 19 x64 /fp:fast'
defaults:
run:
shell: bash {0}
runs-on: windows-2025
steps:
- name: Setup compiler
uses: ilammy/msvc-dev-cmd@v1
with:
arch: amd64
- name: Setup LLVM 19
run: |
choco install llvm --version=19.1.7 -y --no-progress
- name: Setup Ninja
run: |
python3 -m pip install --upgrade pip setuptools wheel
python3 -m pip install ninja
- name: Checkout xsimd
uses: actions/checkout@v3
- name: Setup
run: |
mkdir _build
cd _build && cmake .. -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DBUILD_BENCHMARK=OFF -DBUILD_EXAMPLES=OFF -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER="C:/Program Files/LLVM/bin/clang-cl.exe" -DCMAKE_CXX_COMPILER="C:/Program Files/LLVM/bin/clang-cl.exe" -DCMAKE_CXX_FLAGS="/fp:fast" -G Ninja
- name: Build
run: |
cd _build && cmake --build .
- name: Testing xsimd
run: |
cd _build && ./test/test_xsimd

build-windows-arm64:
name: 'MSVC arm64'
defaults:
Expand Down
16 changes: 16 additions & 0 deletions include/xsimd/arch/common/xsimd_common_details.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,22 @@ namespace xsimd

namespace detail
{
// Optimization barrier on x, selected by passing a memory_barrier_tag.
//
// Call sites place it between two floating-point operations so that a
// fast-math compiler cannot reassociate them through the intermediate x.
//
// x: the intermediate value to protect; it is not modified by this function.
template <class T>
XSIMD_INLINE void reassociation_barrier(T& x, memory_barrier_tag) noexcept
{
#if XSIMD_WITH_INLINE_ASM
// Empty GNU-style asm statement: the address of x is an input operand and
// "memory" is clobbered, so the compiler has to assume the object x may be
// read or written here and cannot fold computations across this point.
__asm__ volatile("" : : "r"(&x) : "memory");
#else
// No GNU-style inline asm on this compiler: no barrier is emitted, the cast
// only marks x as used.
(void)x;
#endif
}

// Generic overload accepting any second argument type A (call sites pass an
// architecture object such as sse2 {} or avx2 {}). The second argument is
// ignored and the call is forwarded to the memory_barrier_tag overload.
// NOTE(review): presumably exists so that a specific architecture can supply
// a more specialised overload later -- confirm.
template <class T, class A>
XSIMD_INLINE void reassociation_barrier(T& x, A const&) noexcept
{
detail::reassociation_barrier(x, memory_barrier_tag {});
}

template <class F, class A, class T, class... Batches>
XSIMD_INLINE batch<T, A> apply(F&& func, batch<T, A> const& self, batch<T, A> const& other) noexcept
{
Expand Down
10 changes: 1 addition & 9 deletions include/xsimd/arch/common/xsimd_common_math.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1900,17 +1900,9 @@ namespace xsimd
batch_type s = bitofsign(self);
batch_type v = self ^ s;
batch_type t2n = constants::twotonmb<batch_type>();
// Under fast-math, reordering is possible and the compiler optimizes d
// to v. That's not what we want, so prevent compiler optimization here.
// FIXME: it may be better to emit a memory barrier here (?).
#ifdef __FAST_MATH__
batch_type d0 = v + t2n;
asm volatile("" ::"r"(&d0) : "memory");
detail::reassociation_barrier(d0.data, A {});
batch_type d = d0 - t2n;
#else
batch_type d0 = v + t2n;
batch_type d = d0 - t2n;
#endif
return s ^ select(v < t2n, d, v);
}
}
Expand Down
11 changes: 3 additions & 8 deletions include/xsimd/arch/xsimd_avx2.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -554,11 +554,8 @@ namespace xsimd
__m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
// With -ffast-math, the compiler may reassociate (xH-C)+xL into
// xH+(xL-C). Since xL<<C this causes catastrophic cancellation.
// The asm barrier forces f into a register before the add, blocking
// the reorder. It emits zero instructions.
#if defined(__GNUC__)
__asm__ volatile("" : "+x"(f));
#endif
// Barrier the intermediate before the final add.
detail::reassociation_barrier(f, avx2 {});
return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
}

Expand All @@ -575,9 +572,7 @@ namespace xsimd
__m256i xL = _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)))); // 2^52
__m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52
// See above: prevent -ffast-math from reassociating (xH-C)+xL.
#if defined(__GNUC__)
__asm__ volatile("" : "+x"(f));
#endif
detail::reassociation_barrier(f, avx2 {});
return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
}
}
Expand Down
16 changes: 15 additions & 1 deletion include/xsimd/arch/xsimd_common_fwd.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,15 @@ namespace xsimd
class batch;
template <class T, class A>
class batch_bool;
namespace kernel
{
namespace detail
{
// Empty tag type used purely for overload selection: passing an instance
// picks the memory_barrier_tag overload of detail::reassociation_barrier.
struct memory_barrier_tag
{
};
}
}
template <class T, class A, T... Vs>
struct batch_constant;
template <class T, class A, bool... Vs>
Expand Down Expand Up @@ -101,6 +110,12 @@ namespace xsimd
// Forward declarations for pack-level helpers
namespace detail
{
// Forward declaration of the barrier overload selected by memory_barrier_tag.
template <class T>
XSIMD_INLINE void reassociation_barrier(T& x, memory_barrier_tag) noexcept;

// Forward declaration of the generic overload taking any second argument
// type A (call sites pass an architecture object).
template <class T, class A>
XSIMD_INLINE void reassociation_barrier(T& x, A const&) noexcept;

template <typename T, T... Vs>
XSIMD_INLINE constexpr bool is_identity() noexcept;
template <typename T, class A, T... Vs>
Expand All @@ -115,7 +130,6 @@ namespace xsimd
XSIMD_INLINE constexpr bool is_only_from_lo(batch_constant<T, A, Vs...>) noexcept;
template <typename T, class A, T... Vs>
XSIMD_INLINE constexpr bool is_only_from_hi(batch_constant<T, A, Vs...>) noexcept;

}
}
}
Expand Down
4 changes: 4 additions & 0 deletions include/xsimd/arch/xsimd_sse2.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -716,6 +716,8 @@ namespace xsimd
__m128i mask = _mm_setr_epi16(0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000);
__m128i xL = _mm_or_si128(_mm_and_si128(mask, x), _mm_andnot_si128(mask, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)))); // 2^52
__m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
// Prevent -ffast-math from reassociating (xH-C)+xL into xH+(xL-C).
detail::reassociation_barrier(f, sse2 {});
return _mm_add_pd(f, _mm_castsi128_pd(xL));
}

Expand All @@ -730,6 +732,8 @@ namespace xsimd
__m128i mask = _mm_setr_epi16(0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000);
__m128i xL = _mm_or_si128(_mm_and_si128(mask, x), _mm_andnot_si128(mask, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)))); // 2^52
__m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52
// Prevent -ffast-math from reassociating (xH-C)+xL into xH+(xL-C).
detail::reassociation_barrier(f, sse2 {});
return _mm_add_pd(f, _mm_castsi128_pd(xL));
}

Expand Down
11 changes: 3 additions & 8 deletions include/xsimd/arch/xsimd_sse4_1.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,11 +64,8 @@ namespace xsimd
__m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52
// With -ffast-math, the compiler may reassociate (xH-C)+xL into
// xH+(xL-C). Since xL<<C this causes catastrophic cancellation.
// The asm barrier forces f into a register before the add, blocking
// the reorder. It emits zero instructions.
#if defined(__GNUC__)
__asm__ volatile("" : "+x"(f));
#endif
// Barrier the intermediate before the final add.
detail::reassociation_barrier(f, sse4_1 {});
return _mm_add_pd(f, _mm_castsi128_pd(xL));
}

Expand All @@ -81,9 +78,7 @@ namespace xsimd
__m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0xcc); // 2^52
__m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
// See above: prevent -ffast-math from reassociating (xH-C)+xL.
#if defined(__GNUC__)
__asm__ volatile("" : "+x"(f));
#endif
detail::reassociation_barrier(f, sse4_1 {});
return _mm_add_pd(f, _mm_castsi128_pd(xL));
}
}
Expand Down
21 changes: 21 additions & 0 deletions include/xsimd/config/xsimd_config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,27 @@
#define XSIMD_TARGET_X86 0
#endif

/**
* @ingroup xsimd_config_macro
*
* Set to 1 if GNU-style inline assembly is available, to 0 otherwise.
*/
/* Use __clang__ || __GNUC__ to detect GNU-style inline asm. clang-cl runs in
* MSVC-compatibility mode and does not define __GNUC__ by default, but it
* still defines __clang__. Clang documents asm/__asm__ support and broad
* GCC-extension compatibility:
* https://clang.llvm.org/docs/LanguageExtensions.html
* Clang only emits __GNUC__ when GNUCVersion != 0:
* https://raw.githubusercontent.com/llvm/llvm-project/main/clang/lib/Frontend/InitPreprocessor.cpp
* and GNUCVersion defaults to 0:
* https://raw.githubusercontent.com/llvm/llvm-project/main/clang/include/clang/Basic/LangOptions.def
*/
#if defined(__clang__) || defined(__GNUC__)
#define XSIMD_WITH_INLINE_ASM 1
#else
#define XSIMD_WITH_INLINE_ASM 0
#endif

/**
* @ingroup xsimd_config_macro
*
Expand Down
4 changes: 2 additions & 2 deletions include/xsimd/config/xsimd_cpu_features_x86.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -533,7 +533,7 @@ namespace xsimd
__cpuid(buf, leaf);
std::memcpy(reg.data(), buf, sizeof(buf));

#elif defined(__GNUC__) || defined(__clang__)
#elif XSIMD_WITH_INLINE_ASM

#if defined(__i386__) && defined(__PIC__)
// %ebx may be the PIC register
Expand Down Expand Up @@ -561,7 +561,7 @@ namespace xsimd
#error "_MSC_VER < 1400 is not supported"
#endif

#elif defined(__GNUC__)
#elif XSIMD_WITH_INLINE_ASM
x86_reg32_t xcr0 = {};
__asm__(
"xorl %%ecx, %%ecx\n"
Expand Down
Loading