diff --git a/crates/core_arch/src/x86/avx10_2.rs b/crates/core_arch/src/x86/avx10_2.rs
new file mode 100644
index 0000000000..b9d150dc8b
--- /dev/null
+++ b/crates/core_arch/src/x86/avx10_2.rs
@@ -0,0 +1,13926 @@
+use crate::core_arch::{simd::*, x86::*};
+use crate::intrinsics::simd::*;
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Copies the lower 32 bits of `a` to the lower 32 bits of `dst`, zeroing the upper bits.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vmovd))]
+pub fn _mm_move_epi32(a: __m128i) -> __m128i {
+    unsafe {
+        let b: u32x4 = simd_shuffle!(a.as_u32x4(), u32x4::ZERO, [0, 4, 4, 4]);
+        b.as_m128i()
+    }
+}
+
+/// Copies the lower 16 bits of `a` to the lower 16 bits of `dst`, zeroing the upper bits.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vmovw))]
+pub fn _mm_move_epi16(a: __m128i) -> __m128i {
+    unsafe {
+        let b: u16x8 = simd_shuffle!(a.as_u16x8(), u16x8::ZERO, [0, 8, 8, 8, 8, 8, 8, 8]);
+        b.as_m128i()
+    }
+}
+
+/// Computes the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in `a`
+/// compared to those in `b`, and stores the 16-bit results in dst using writemask `k` (elements are
+/// copied from src when the corresponding mask bit is not set). Eight SADs are performed using one
+/// quadruplet from `b` and eight quadruplets from `a`. One quadruplet is selected from `b` starting
+/// at the offset specified in `imm8`. Eight quadruplets are formed from sequential 8-bit integers
+/// selected from `a` starting at the offset specified in `imm8`.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vmpsadbw, IMM8 = 0)
+)]
+pub fn _mm_mask_mpsadbw_epu8<const IMM8: i32>(
+    src: __m128i,
+    k: __mmask8,
+    a: __m128i,
+    b: __m128i,
+) -> __m128i {
+    static_assert_uimm_bits!(IMM8, 8);
+    unsafe {
+        simd_select_bitmask(k, _mm_mpsadbw_epu8::<IMM8>(a, b).as_u16x8(), src.as_u16x8()).as_m128i()
+    }
+}
+
+/// Computes the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in `a`
+/// compared to those in `b`, and stores the 16-bit results in dst using zeromask `k` (elements are
+/// zeroed out when the corresponding mask bit is not set). Eight SADs are performed using one
+/// quadruplet from `b` and eight quadruplets from `a`. One quadruplet is selected from `b` starting
+/// at the offset specified in `imm8`. Eight quadruplets are formed from sequential 8-bit integers
+/// selected from `a` starting at the offset specified in `imm8`.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vmpsadbw, IMM8 = 0)
+)]
+pub fn _mm_maskz_mpsadbw_epu8<const IMM8: i32>(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    static_assert_uimm_bits!(IMM8, 8);
+    unsafe {
+        simd_select_bitmask(k, _mm_mpsadbw_epu8::<IMM8>(a, b).as_u16x8(), u16x8::ZERO).as_m128i()
+    }
+}
+
+/// Computes the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in `a`
+/// compared to those in `b`, and stores the 16-bit results in dst using writemask `k` (elements are
+/// copied from src when the corresponding mask bit is not set). Eight SADs are performed for each
+/// 128-bit lane using one quadruplet from `b` and eight quadruplets from `a`. One quadruplet is
+/// selected from `b` starting at the offset specified in `imm8`. Eight quadruplets are formed from
+/// sequential 8-bit integers selected from `a` starting at the offset specified in `imm8`.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vmpsadbw, IMM8 = 0)
+)]
+pub fn _mm256_mask_mpsadbw_epu8<const IMM8: i32>(
+    src: __m256i,
+    k: __mmask16,
+    a: __m256i,
+    b: __m256i,
+) -> __m256i {
+    static_assert_uimm_bits!(IMM8, 8);
+    unsafe {
+        simd_select_bitmask(
+            k,
+            _mm256_mpsadbw_epu8::<IMM8>(a, b).as_u16x16(),
+            src.as_u16x16(),
+        )
+        .as_m256i()
+    }
+}
+
+/// Computes the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in `a`
+/// compared to those in `b`, and stores the 16-bit results in dst using zeromask `k` (elements are
+/// zeroed out when the corresponding mask bit is not set). Eight SADs are performed for each 128-bit
+/// lane using one quadruplet from `b` and eight quadruplets from `a`. One quadruplet is selected from
+/// `b` starting at the offset specified in `imm8`. Eight quadruplets are formed from sequential
+/// 8-bit integers selected from `a` starting at the offset specified in `imm8`.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vmpsadbw, IMM8 = 0)
+)]
+pub fn _mm256_maskz_mpsadbw_epu8<const IMM8: i32>(
+    k: __mmask16,
+    a: __m256i,
+    b: __m256i,
+) -> __m256i {
+    static_assert_uimm_bits!(IMM8, 8);
+    unsafe {
+        simd_select_bitmask(
+            k,
+            _mm256_mpsadbw_epu8::<IMM8>(a, b).as_u16x16(),
+            u16x16::ZERO,
+        )
+        .as_m256i()
+    }
+}
+
+/// Computes the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in `a`
+/// compared to those in `b`, and stores the 16-bit results in dst. Eight SADs are performed for each
+/// 128-bit lane using one quadruplet from `b` and eight quadruplets from `a`. One quadruplet is
+/// selected from `b` starting at the offset specified in `imm8`. Eight quadruplets are formed from
+/// sequential 8-bit integers selected from `a` starting at the offset specified in `imm8`.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vmpsadbw, IMM8 = 0)
+)]
+pub fn _mm512_mpsadbw_epu8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i {
+    static_assert_uimm_bits!(IMM8, 8);
+    unsafe { vmpsadbw512(a.as_u8x64(), b.as_u8x64(), IMM8 as i8).as_m512i() }
+}
+
+/// Computes the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in `a`
+/// compared to those in `b`, and stores the 16-bit results in dst using writemask `k` (elements are
+/// copied from src when the corresponding mask bit is not set). Eight SADs are performed for each
+/// 128-bit lane using one quadruplet from `b` and eight quadruplets from `a`. One quadruplet is
+/// selected from `b` starting at the offset specified in `imm8`. Eight quadruplets are formed from
+/// sequential 8-bit integers selected from `a` starting at the offset specified in `imm8`.
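+///
+/// Illustrative usage (a hedged sketch; the inputs `src`, `a`, `b` and the mask value are
+/// hypothetical, and this assumes the intrinsic runs on an AVX10.2-enabled target):
+///
+/// ```ignore
+/// // With IMM8 = 0, compute all 32 SAD words, but keep only the even-indexed ones;
+/// // odd-indexed elements of the result are copied through from `src`.
+/// let r = _mm512_mask_mpsadbw_epu8::<0>(src, 0x5555_5555, a, b);
+/// ```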
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vmpsadbw, IMM8 = 0)
+)]
+pub fn _mm512_mask_mpsadbw_epu8<const IMM8: i32>(
+    src: __m512i,
+    k: __mmask32,
+    a: __m512i,
+    b: __m512i,
+) -> __m512i {
+    static_assert_uimm_bits!(IMM8, 8);
+    unsafe {
+        simd_select_bitmask(
+            k,
+            _mm512_mpsadbw_epu8::<IMM8>(a, b).as_u16x32(),
+            src.as_u16x32(),
+        )
+        .as_m512i()
+    }
+}
+
+/// Computes the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in `a`
+/// compared to those in `b`, and stores the 16-bit results in dst using zeromask `k` (elements are
+/// zeroed out when the corresponding mask bit is not set). Eight SADs are performed for each 128-bit
+/// lane using one quadruplet from `b` and eight quadruplets from `a`. One quadruplet is selected from
+/// `b` starting at the offset specified in `imm8`. Eight quadruplets are formed from sequential
+/// 8-bit integers selected from `a` starting at the offset specified in `imm8`.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vmpsadbw, IMM8 = 0)
+)]
+pub fn _mm512_maskz_mpsadbw_epu8<const IMM8: i32>(
+    k: __mmask32,
+    a: __m512i,
+    b: __m512i,
+) -> __m512i {
+    static_assert_uimm_bits!(IMM8, 8);
+    unsafe {
+        simd_select_bitmask(
+            k,
+            _mm512_mpsadbw_epu8::<IMM8>(a, b).as_u16x32(),
+            u16x32::ZERO,
+        )
+        .as_m512i()
+    }
+}
+
+/// Multiply groups of 2 adjacent pairs of half-precision (16-bit) floating-point numbers in a with
+/// corresponding half-precision (16-bit) floating-point numbers in b, producing 2 intermediate
+/// single-precision (32-bit) floating-point results. Sum these 2 results with the corresponding
+/// single-precision (32-bit) floating-point number in src, and store the packed 32-bit results in dst.
+///
+/// This neither consults nor updates MXCSR.RC, rather the rounding semantics are fixed to:
+///
+/// - Round to nearest, with ties to even
+/// - The two multiplications and two additions are fused into two FMA operations
+/// - Input denormals are treated as zero, and output denormals are flushed to zero
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vdpphps))]
+pub fn _mm_dpph_ps(src: __m128, a: __m128h, b: __m128h) -> __m128 {
+    unsafe { vdpphps128(src.as_f32x4(), a.as_f16x8(), b.as_f16x8()).as_m128() }
+}
+
+/// Multiply groups of 2 adjacent pairs of half-precision (16-bit) floating-point numbers in a with
+/// corresponding half-precision (16-bit) floating-point numbers in b, producing 2 intermediate
+/// single-precision (32-bit) floating-point results. Sum these 2 results with the corresponding
+/// single-precision (32-bit) floating-point number in src, and store the packed 32-bit results in dst
+/// using writemask `k` (elements are copied from src when the corresponding mask bit is not set).
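+///
+/// A minimal sketch of masked accumulation (hypothetical values; `a` and `b` are assumed to
+/// hold the f16 inputs):
+///
+/// ```ignore
+/// let acc = _mm_setzero_ps();
+/// // Lanes 0 and 1 receive the two-way f16 dot products; lanes 2 and 3 keep `acc`.
+/// let r = _mm_mask_dpph_ps(acc, 0b0011, a, b);
+/// ```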
+/// +/// This neither consults nor updates MXCSR.RC, rather the rounding semantics are fixed to: +/// +/// - Round to nearest, with ties to even +/// - The two multiplications and two additions are fused into two FMA operations +/// - Input denormals are treated as zero, and output denormals are flushed to zero +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vdpphps))] +pub fn _mm_mask_dpph_ps(src: __m128, k: __mmask8, a: __m128h, b: __m128h) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_dpph_ps(src, a, b), src) } +} + +/// Multiply groups of 2 adjacent pairs of half-precision (16-bit) floating-point numbers in a with +/// corresponding half-precision (16-bit) floating-point numbers in b, producing 2 intermediate +/// single-precision (32-bit) floating-point results. Sum these 2 results with the corresponding +/// single-precision (32-bit) floating-point number in src, and store the packed 32-bit results in dst +/// using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set). +/// +/// This neither consults nor updates MXCSR.RC, rather the rounding semantics are fixed to: +/// +/// - Round to nearest, with ties to even +/// - The two multiplications and two additions are fused into two FMA operations +/// - Input denormals are treated as zero, and output denormals are flushed to zero +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vdpphps))] +pub fn _mm_maskz_dpph_ps(k: __mmask8, src: __m128, a: __m128h, b: __m128h) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_dpph_ps(src, a, b), _mm_setzero_ps()) } +} + +/// Multiply groups of 2 adjacent pairs of half-precision (16-bit) floating-point numbers in a with +/// corresponding half-precision (16-bit) floating-point numbers in b, producing 2 intermediate +/// single-precision (32-bit) floating-point results. Sum these 2 results with the corresponding +/// single-precision (32-bit) floating-point number in src, and store the packed 32-bit results in dst. +/// +/// This neither consults nor updates MXCSR.RC, rather the rounding semantics are fixed to: +/// +/// - Round to nearest, with ties to even +/// - The two multiplications and two additions are fused into two FMA operations +/// - Input denormals are treated as zero, and output denormals are flushed to zero +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vdpphps))] +pub fn _mm256_dpph_ps(src: __m256, a: __m256h, b: __m256h) -> __m256 { + unsafe { vdpphps256(src.as_f32x8(), a.as_f16x16(), b.as_f16x16()).as_m256() } +} + +/// Multiply groups of 2 adjacent pairs of half-precision (16-bit) floating-point numbers in a with +/// corresponding half-precision (16-bit) floating-point numbers in b, producing 2 intermediate +/// single-precision (32-bit) floating-point results. Sum these 2 results with the corresponding +/// single-precision (32-bit) floating-point number in src, and store the packed 32-bit results in dst +/// using writemask `k` (elements are copied from src when the corresponding mask bit is not set). 
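+///
+/// Restated per element `i` (for mask bits that are set), following the description above:
+/// `dst[i] = src[i] + f32(a[2*i]) * f32(b[2*i]) + f32(a[2*i+1]) * f32(b[2*i+1])`,
+/// evaluated as two fused FMA operations.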
+/// +/// This neither consults nor updates MXCSR.RC, rather the rounding semantics are fixed to: +/// +/// - Round to nearest, with ties to even +/// - The two multiplications and two additions are fused into two FMA operations +/// - Input denormals are treated as zero, and output denormals are flushed to zero +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vdpphps))] +pub fn _mm256_mask_dpph_ps(src: __m256, k: __mmask8, a: __m256h, b: __m256h) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_dpph_ps(src, a, b), src) } +} + +/// Multiply groups of 2 adjacent pairs of half-precision (16-bit) floating-point numbers in a with +/// corresponding half-precision (16-bit) floating-point numbers in b, producing 2 intermediate +/// single-precision (32-bit) floating-point results. Sum these 2 results with the corresponding +/// single-precision (32-bit) floating-point number in src, and store the packed 32-bit results in dst +/// using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set). +/// +/// This neither consults nor updates MXCSR.RC, rather the rounding semantics are fixed to: +/// +/// - Round to nearest, with ties to even +/// - The two multiplications and two additions are fused into two FMA operations +/// - Input denormals are treated as zero, and output denormals are flushed to zero +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vdpphps))] +pub fn _mm256_maskz_dpph_ps(k: __mmask8, src: __m256, a: __m256h, b: __m256h) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_dpph_ps(src, a, b), _mm256_setzero_ps()) } +} + +/// Multiply groups of 2 adjacent pairs of half-precision (16-bit) floating-point numbers in a with +/// corresponding half-precision (16-bit) floating-point numbers in b, producing 2 intermediate +/// single-precision (32-bit) floating-point results. Sum these 2 results with the corresponding +/// single-precision (32-bit) floating-point number in src, and store the packed 32-bit results in dst. +/// +/// This neither consults nor updates MXCSR.RC, rather the rounding semantics are fixed to: +/// +/// - Round to nearest, with ties to even +/// - The two multiplications and two additions are fused into two FMA operations +/// - Input denormals are treated as zero, and output denormals are flushed to zero +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vdpphps))] +pub fn _mm512_dpph_ps(src: __m512, a: __m512h, b: __m512h) -> __m512 { + unsafe { vdpphps512(src.as_f32x16(), a.as_f16x32(), b.as_f16x32()).as_m512() } +} + +/// Multiply groups of 2 adjacent pairs of half-precision (16-bit) floating-point numbers in a with +/// corresponding half-precision (16-bit) floating-point numbers in b, producing 2 intermediate +/// single-precision (32-bit) floating-point results. Sum these 2 results with the corresponding +/// single-precision (32-bit) floating-point number in src, and store the packed 32-bit results in dst +/// using writemask `k` (elements are copied from src when the corresponding mask bit is not set). 
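+///
+/// Illustrative usage (a hedged sketch; `acc`, `a`, and `b` are hypothetical):
+///
+/// ```ignore
+/// // Update the low eight f32 accumulators; the high eight are copied from `acc`.
+/// let r = _mm512_mask_dpph_ps(acc, 0x00FF, a, b);
+/// ```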
+/// +/// This neither consults nor updates MXCSR.RC, rather the rounding semantics are fixed to: +/// +/// - Round to nearest, with ties to even +/// - The two multiplications and two additions are fused into two FMA operations +/// - Input denormals are treated as zero, and output denormals are flushed to zero +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vdpphps))] +pub fn _mm512_mask_dpph_ps(src: __m512, k: __mmask16, a: __m512h, b: __m512h) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_dpph_ps(src, a, b), src) } +} + +/// Multiply groups of 2 adjacent pairs of half-precision (16-bit) floating-point numbers in a with +/// corresponding half-precision (16-bit) floating-point numbers in b, producing 2 intermediate +/// single-precision (32-bit) floating-point results. Sum these 2 results with the corresponding +/// single-precision (32-bit) floating-point number in src, and store the packed 32-bit results in dst +/// using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set). +/// +/// This neither consults nor updates MXCSR.RC, rather the rounding semantics are fixed to: +/// +/// - Round to nearest, with ties to even +/// - The two multiplications and two additions are fused into two FMA operations +/// - Input denormals are treated as zero, and output denormals are flushed to zero +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vdpphps))] +pub fn _mm512_maskz_dpph_ps(k: __mmask16, src: __m512, a: __m512h, b: __m512h) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_dpph_ps(src, a, b), _mm512_setzero_ps()) } +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the +/// corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask `k` +/// (elements are copied from src when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2,avxvnniint8")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpbssd))] +pub fn _mm_mask_dpbssd_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + simd_select_bitmask(k, _mm_dpbssd_epi32(src, a, b).as_i32x4(), src.as_i32x4()).as_m128i() + } +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. 
Sum these 4 results with the +/// corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask `k` +/// (elements are zeroed out when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2,avxvnniint8")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpbssd))] +pub fn _mm_maskz_dpbssd_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { + simd_select_bitmask(k, _mm_dpbssd_epi32(src, a, b).as_i32x4(), i32x4::ZERO).as_m128i() + } +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the +/// corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask `k` +/// (elements are copied from src when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2,avxvnniint8")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpbssd))] +pub fn _mm256_mask_dpbssd_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + simd_select_bitmask(k, _mm256_dpbssd_epi32(src, a, b).as_i32x8(), src.as_i32x8()).as_m256i() + } +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the +/// corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask `k` +/// (elements are zeroed out when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2,avxvnniint8")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpbssd))] +pub fn _mm256_maskz_dpbssd_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { + simd_select_bitmask(k, _mm256_dpbssd_epi32(src, a, b).as_i32x8(), i32x8::ZERO).as_m256i() + } +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the +/// corresponding 32-bit integer in src, and store the packed 32-bit results in dst +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpbssd))] +pub fn _mm512_dpbssd_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i { + unsafe { vdpbssd512(src.as_i32x16(), a.as_i8x64(), b.as_i8x64()).as_m512i() } +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. 
Sum these 4 results with the +/// corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask `k` +/// (elements are copied from src when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpbssd))] +pub fn _mm512_mask_dpbssd_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + simd_select_bitmask( + k, + _mm512_dpbssd_epi32(src, a, b).as_i32x16(), + src.as_i32x16(), + ) + .as_m512i() + } +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the +/// corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask `k` +/// (elements are zeroed out when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpbssd))] +pub fn _mm512_maskz_dpbssd_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i { + unsafe { + simd_select_bitmask(k, _mm512_dpbssd_epi32(src, a, b).as_i32x16(), i32x16::ZERO).as_m512i() + } +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the +/// corresponding 32-bit integer in src with signed saturation, and store the packed 32-bit results +/// in dst using writemask `k` (elements are copied from src when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2,avxvnniint8")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpbssds))] +pub fn _mm_mask_dpbssds_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + simd_select_bitmask(k, _mm_dpbssds_epi32(src, a, b).as_i32x4(), src.as_i32x4()).as_m128i() + } +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the +/// corresponding 32-bit integer in src with signed saturation, and store the packed 32-bit results +/// in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2,avxvnniint8")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpbssds))] +pub fn _mm_maskz_dpbssds_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { + simd_select_bitmask(k, _mm_dpbssds_epi32(src, a, b).as_i32x4(), i32x4::ZERO).as_m128i() + } +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. 
Sum these 4 results with the +/// corresponding 32-bit integer in src with signed saturation, and store the packed 32-bit results +/// in dst using writemask `k` (elements are copied from src when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2,avxvnniint8")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpbssds))] +pub fn _mm256_mask_dpbssds_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + simd_select_bitmask( + k, + _mm256_dpbssds_epi32(src, a, b).as_i32x8(), + src.as_i32x8(), + ) + .as_m256i() + } +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the +/// corresponding 32-bit integer in src with signed saturation, and store the packed 32-bit results +/// in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2,avxvnniint8")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpbssds))] +pub fn _mm256_maskz_dpbssds_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { + simd_select_bitmask(k, _mm256_dpbssds_epi32(src, a, b).as_i32x8(), i32x8::ZERO).as_m256i() + } +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the +/// corresponding 32-bit integer in src with signed saturation, and store the packed 32-bit results +/// in dst +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpbssds))] +pub fn _mm512_dpbssds_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i { + unsafe { vdpbssds512(src.as_i32x16(), a.as_i8x64(), b.as_i8x64()).as_m512i() } +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the +/// corresponding 32-bit integer in src with signed saturation, and store the packed 32-bit results +/// in dst using writemask `k` (elements are copied from src when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpbssds))] +pub fn _mm512_mask_dpbssds_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + simd_select_bitmask( + k, + _mm512_dpbssds_epi32(src, a, b).as_i32x16(), + src.as_i32x16(), + ) + .as_m512i() + } +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. 
Sum these 4 results with the +/// corresponding 32-bit integer in src with signed saturation, and store the packed 32-bit results +/// in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpbssds))] +pub fn _mm512_maskz_dpbssds_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i { + unsafe { + simd_select_bitmask(k, _mm512_dpbssds_epi32(src, a, b).as_i32x16(), i32x16::ZERO).as_m512i() + } +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the +/// corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask `k` +/// (elements are copied from src when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2,avxvnniint8")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpbsud))] +pub fn _mm_mask_dpbsud_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + simd_select_bitmask(k, _mm_dpbsud_epi32(src, a, b).as_i32x4(), src.as_i32x4()).as_m128i() + } +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the +/// corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask `k` +/// (elements are zeroed out when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2,avxvnniint8")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpbsud))] +pub fn _mm_maskz_dpbsud_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { + simd_select_bitmask(k, _mm_dpbsud_epi32(src, a, b).as_i32x4(), i32x4::ZERO).as_m128i() + } +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the +/// corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask `k` +/// (elements are copied from src when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2,avxvnniint8")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpbsud))] +pub fn _mm256_mask_dpbsud_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + simd_select_bitmask(k, _mm256_dpbsud_epi32(src, a, b).as_i32x8(), src.as_i32x8()).as_m256i() + } +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. 
Sum these 4 results with the +/// corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask `k` +/// (elements are zeroed out when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2,avxvnniint8")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpbsud))] +pub fn _mm256_maskz_dpbsud_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { + simd_select_bitmask(k, _mm256_dpbsud_epi32(src, a, b).as_i32x8(), i32x8::ZERO).as_m256i() + } +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the +/// corresponding 32-bit integer in src, and store the packed 32-bit results in dst +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpbsud))] +pub fn _mm512_dpbsud_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i { + unsafe { vdpbsud512(src.as_i32x16(), a.as_i8x64(), b.as_u8x64()).as_m512i() } +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the +/// corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask `k` +/// (elements are copied from src when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpbsud))] +pub fn _mm512_mask_dpbsud_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + simd_select_bitmask( + k, + _mm512_dpbsud_epi32(src, a, b).as_i32x16(), + src.as_i32x16(), + ) + .as_m512i() + } +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the +/// corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask `k` +/// (elements are zeroed out when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpbsud))] +pub fn _mm512_maskz_dpbsud_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i { + unsafe { + simd_select_bitmask(k, _mm512_dpbsud_epi32(src, a, b).as_i32x16(), i32x16::ZERO).as_m512i() + } +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. 
Sum these 4 results with the +/// corresponding 32-bit integer in src with signed saturation, and store the packed 32-bit results +/// in dst using writemask `k` (elements are copied from src when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2,avxvnniint8")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpbsuds))] +pub fn _mm_mask_dpbsuds_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + simd_select_bitmask(k, _mm_dpbsuds_epi32(src, a, b).as_i32x4(), src.as_i32x4()).as_m128i() + } +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the +/// corresponding 32-bit integer in src with signed saturation, and store the packed 32-bit results +/// in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2,avxvnniint8")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpbsuds))] +pub fn _mm_maskz_dpbsuds_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { + simd_select_bitmask(k, _mm_dpbsuds_epi32(src, a, b).as_i32x4(), i32x4::ZERO).as_m128i() + } +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the +/// corresponding 32-bit integer in src with signed saturation, and store the packed 32-bit results +/// in dst using writemask `k` (elements are copied from src when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2,avxvnniint8")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpbsuds))] +pub fn _mm256_mask_dpbsuds_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + simd_select_bitmask( + k, + _mm256_dpbsuds_epi32(src, a, b).as_i32x8(), + src.as_i32x8(), + ) + .as_m256i() + } +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the +/// corresponding 32-bit integer in src with signed saturation, and store the packed 32-bit results +/// in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2,avxvnniint8")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpbsuds))] +pub fn _mm256_maskz_dpbsuds_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { + simd_select_bitmask(k, _mm256_dpbsuds_epi32(src, a, b).as_i32x8(), i32x8::ZERO).as_m256i() + } +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. 
Sum these 4 results with the +/// corresponding 32-bit integer in src with signed saturation, and store the packed 32-bit results +/// in dst +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpbsuds))] +pub fn _mm512_dpbsuds_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i { + unsafe { vdpbsuds512(src.as_i32x16(), a.as_i8x64(), b.as_u8x64()).as_m512i() } +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the +/// corresponding 32-bit integer in src with signed saturation, and store the packed 32-bit results +/// in dst using writemask `k` (elements are copied from src when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpbsuds))] +pub fn _mm512_mask_dpbsuds_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + simd_select_bitmask( + k, + _mm512_dpbsuds_epi32(src, a, b).as_i32x16(), + src.as_i32x16(), + ) + .as_m512i() + } +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the +/// corresponding 32-bit integer in src with signed saturation, and store the packed 32-bit results +/// in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpbsuds))] +pub fn _mm512_maskz_dpbsuds_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i { + unsafe { + simd_select_bitmask(k, _mm512_dpbsuds_epi32(src, a, b).as_i32x16(), i32x16::ZERO).as_m512i() + } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the +/// corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask `k` +/// (elements are copied from src when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2,avxvnniint8")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpbuud))] +pub fn _mm_mask_dpbuud_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + simd_select_bitmask(k, _mm_dpbuud_epi32(src, a, b).as_i32x4(), src.as_i32x4()).as_m128i() + } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. 
Sum these 4 results with the +/// corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask `k` +/// (elements are zeroed out when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2,avxvnniint8")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpbuud))] +pub fn _mm_maskz_dpbuud_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { + simd_select_bitmask(k, _mm_dpbuud_epi32(src, a, b).as_i32x4(), i32x4::ZERO).as_m128i() + } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the +/// corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask `k` +/// (elements are copied from src when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2,avxvnniint8")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpbuud))] +pub fn _mm256_mask_dpbuud_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + simd_select_bitmask(k, _mm256_dpbuud_epi32(src, a, b).as_i32x8(), src.as_i32x8()).as_m256i() + } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the +/// corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask `k` +/// (elements are zeroed out when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2,avxvnniint8")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpbuud))] +pub fn _mm256_maskz_dpbuud_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { + simd_select_bitmask(k, _mm256_dpbuud_epi32(src, a, b).as_i32x8(), i32x8::ZERO).as_m256i() + } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the +/// corresponding 32-bit integer in src, and store the packed 32-bit results in dst +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpbuud))] +pub fn _mm512_dpbuud_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i { + unsafe { vdpbuud512(src.as_i32x16(), a.as_u8x64(), b.as_u8x64()).as_m512i() } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. 
Sum these 4 results with the +/// corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask `k` +/// (elements are copied from src when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpbuud))] +pub fn _mm512_mask_dpbuud_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + simd_select_bitmask( + k, + _mm512_dpbuud_epi32(src, a, b).as_i32x16(), + src.as_i32x16(), + ) + .as_m512i() + } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the +/// corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask `k` +/// (elements are zeroed out when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpbuud))] +pub fn _mm512_maskz_dpbuud_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i { + unsafe { + simd_select_bitmask(k, _mm512_dpbuud_epi32(src, a, b).as_i32x16(), i32x16::ZERO).as_m512i() + } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the +/// corresponding 32-bit integer in src with signed saturation, and store the packed 32-bit results +/// in dst using writemask `k` (elements are copied from src when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2,avxvnniint8")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpbuuds))] +pub fn _mm_mask_dpbuuds_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + simd_select_bitmask(k, _mm_dpbuuds_epi32(src, a, b).as_i32x4(), src.as_i32x4()).as_m128i() + } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the +/// corresponding 32-bit integer in src with signed saturation, and store the packed 32-bit results +/// in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2,avxvnniint8")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpbuuds))] +pub fn _mm_maskz_dpbuuds_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { + simd_select_bitmask(k, _mm_dpbuuds_epi32(src, a, b).as_i32x4(), i32x4::ZERO).as_m128i() + } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. 
Sum these 4 results with the +/// corresponding 32-bit integer in src with signed saturation, and store the packed 32-bit results +/// in dst using writemask `k` (elements are copied from src when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2,avxvnniint8")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpbuuds))] +pub fn _mm256_mask_dpbuuds_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + simd_select_bitmask( + k, + _mm256_dpbuuds_epi32(src, a, b).as_i32x8(), + src.as_i32x8(), + ) + .as_m256i() + } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the +/// corresponding 32-bit integer in src with signed saturation, and store the packed 32-bit results +/// in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2,avxvnniint8")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpbuuds))] +pub fn _mm256_maskz_dpbuuds_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { + simd_select_bitmask(k, _mm256_dpbuuds_epi32(src, a, b).as_i32x8(), i32x8::ZERO).as_m256i() + } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the +/// corresponding 32-bit integer in src with signed saturation, and store the packed 32-bit results +/// in dst +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpbuuds))] +pub fn _mm512_dpbuuds_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i { + unsafe { vdpbuuds512(src.as_i32x16(), a.as_u8x64(), b.as_u8x64()).as_m512i() } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the +/// corresponding 32-bit integer in src with signed saturation, and store the packed 32-bit results +/// in dst using writemask `k` (elements are copied from src when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpbuuds))] +pub fn _mm512_mask_dpbuuds_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + simd_select_bitmask( + k, + _mm512_dpbuuds_epi32(src, a, b).as_i32x16(), + src.as_i32x16(), + ) + .as_m512i() + } +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. 
Sum these 4 results with the +/// corresponding 32-bit integer in src with signed saturation, and store the packed 32-bit results +/// in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpbuuds))] +pub fn _mm512_maskz_dpbuuds_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i { + unsafe { + simd_select_bitmask(k, _mm512_dpbuuds_epi32(src, a, b).as_i32x16(), i32x16::ZERO).as_m512i() + } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask `k` +/// (elements are copied from src when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2,avxvnniint16")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpwsud))] +pub fn _mm_mask_dpwsud_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + simd_select_bitmask(k, _mm_dpwsud_epi32(src, a, b).as_i32x4(), src.as_i32x4()).as_m128i() + } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask `k` +/// (elements are zeroed out when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2,avxvnniint16")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpwsud))] +pub fn _mm_maskz_dpwsud_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { + simd_select_bitmask(k, _mm_dpwsud_epi32(src, a, b).as_i32x4(), i32x4::ZERO).as_m128i() + } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask `k` +/// (elements are copied from src when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2,avxvnniint16")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpwsud))] +pub fn _mm256_mask_dpwsud_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + simd_select_bitmask(k, _mm256_dpwsud_epi32(src, a, b).as_i32x8(), src.as_i32x8()).as_m256i() + } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. 
Sum these 2 results with the +/// corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask `k` +/// (elements are zeroed out when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2,avxvnniint16")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpwsud))] +pub fn _mm256_maskz_dpwsud_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { + simd_select_bitmask(k, _mm256_dpwsud_epi32(src, a, b).as_i32x8(), i32x8::ZERO).as_m256i() + } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in src, and store the packed 32-bit results in dst +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpwsud))] +pub fn _mm512_dpwsud_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i { + unsafe { vdpwsud512(src.as_i32x16(), a.as_i16x32(), b.as_u16x32()).as_m512i() } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask `k` +/// (elements are copied from src when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpwsud))] +pub fn _mm512_mask_dpwsud_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + simd_select_bitmask( + k, + _mm512_dpwsud_epi32(src, a, b).as_i32x16(), + src.as_i32x16(), + ) + .as_m512i() + } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask `k` +/// (elements are zeroed out when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpwsud))] +pub fn _mm512_maskz_dpwsud_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i { + unsafe { + simd_select_bitmask(k, _mm512_dpwsud_epi32(src, a, b).as_i32x16(), i32x16::ZERO).as_m512i() + } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. 
Sum these 2 results with the +/// corresponding 32-bit integer in src with signed saturation, and store the packed 32-bit results +/// in dst using writemask `k` (elements are copied from src when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2,avxvnniint16")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpwsuds))] +pub fn _mm_mask_dpwsuds_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + simd_select_bitmask(k, _mm_dpwsuds_epi32(src, a, b).as_i32x4(), src.as_i32x4()).as_m128i() + } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in src with signed saturation, and store the packed 32-bit results +/// in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2,avxvnniint16")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpwsuds))] +pub fn _mm_maskz_dpwsuds_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { + simd_select_bitmask(k, _mm_dpwsuds_epi32(src, a, b).as_i32x4(), i32x4::ZERO).as_m128i() + } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in src with signed saturation, and store the packed 32-bit results +/// in dst using writemask `k` (elements are copied from src when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2,avxvnniint16")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpwsuds))] +pub fn _mm256_mask_dpwsuds_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + simd_select_bitmask( + k, + _mm256_dpwsuds_epi32(src, a, b).as_i32x8(), + src.as_i32x8(), + ) + .as_m256i() + } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in src with signed saturation, and store the packed 32-bit results +/// in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2,avxvnniint16")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpwsuds))] +pub fn _mm256_maskz_dpwsuds_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { + simd_select_bitmask(k, _mm256_dpwsuds_epi32(src, a, b).as_i32x8(), i32x8::ZERO).as_m256i() + } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. 
Sum these 2 results with the +/// corresponding 32-bit integer in src with signed saturation, and store the packed 32-bit results +/// in dst +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpwsuds))] +pub fn _mm512_dpwsuds_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i { + unsafe { vdpwsuds512(src.as_i32x16(), a.as_i16x32(), b.as_u16x32()).as_m512i() } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in src with signed saturation, and store the packed 32-bit results +/// in dst using writemask `k` (elements are copied from src when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpwsuds))] +pub fn _mm512_mask_dpwsuds_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + simd_select_bitmask( + k, + _mm512_dpwsuds_epi32(src, a, b).as_i32x16(), + src.as_i32x16(), + ) + .as_m512i() + } +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in src with signed saturation, and store the packed 32-bit results +/// in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpwsuds))] +pub fn _mm512_maskz_dpwsuds_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i { + unsafe { + simd_select_bitmask(k, _mm512_dpwsuds_epi32(src, a, b).as_i32x16(), i32x16::ZERO).as_m512i() + } +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask `k` +/// (elements are copied from src when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2,avxvnniint16")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpwusd))] +pub fn _mm_mask_dpwusd_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + simd_select_bitmask(k, _mm_dpwusd_epi32(src, a, b).as_i32x4(), src.as_i32x4()).as_m128i() + } +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. 
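+// NOTE (illustrative sketch, hypothetical helper): the `s` suffix of `vpdpwsuds`
+// only changes the final accumulate, which clamps to the signed 32-bit range
+// instead of wrapping. Computing in i64 lets the clamp see the exact sum:
+//
+//     fn dpwsuds_lane(src: i32, a: [i16; 2], b: [u16; 2]) -> i32 {
+//         let sum = src as i64
+//             + (a[0] as i64) * (b[0] as i64)
+//             + (a[1] as i64) * (b[1] as i64);
+//         sum.clamp(i32::MIN as i64, i32::MAX as i64) as i32
+//     }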
Sum these 2 results with the +/// corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask `k` +/// (elements are zeroed out when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2,avxvnniint16")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpwusd))] +pub fn _mm_maskz_dpwusd_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { + simd_select_bitmask(k, _mm_dpwusd_epi32(src, a, b).as_i32x4(), i32x4::ZERO).as_m128i() + } +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask `k` +/// (elements are copied from src when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2,avxvnniint16")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpwusd))] +pub fn _mm256_mask_dpwusd_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + simd_select_bitmask(k, _mm256_dpwusd_epi32(src, a, b).as_i32x8(), src.as_i32x8()).as_m256i() + } +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask `k` +/// (elements are zeroed out when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2,avxvnniint16")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpwusd))] +pub fn _mm256_maskz_dpwusd_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { + simd_select_bitmask(k, _mm256_dpwusd_epi32(src, a, b).as_i32x8(), i32x8::ZERO).as_m256i() + } +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in src, and store the packed 32-bit results in dst +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpwusd))] +pub fn _mm512_dpwusd_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i { + unsafe { vdpwusd512(src.as_i32x16(), a.as_u16x32(), b.as_i16x32()).as_m512i() } +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. 
Sum these 2 results with the +/// corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask `k` +/// (elements are copied from src when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpwusd))] +pub fn _mm512_mask_dpwusd_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + simd_select_bitmask( + k, + _mm512_dpwusd_epi32(src, a, b).as_i32x16(), + src.as_i32x16(), + ) + .as_m512i() + } +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask `k` +/// (elements are zeroed out when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpwusd))] +pub fn _mm512_maskz_dpwusd_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i { + unsafe { + simd_select_bitmask(k, _mm512_dpwusd_epi32(src, a, b).as_i32x16(), i32x16::ZERO).as_m512i() + } +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in src with signed saturation, and store the packed 32-bit results +/// in dst using writemask `k` (elements are copied from src when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2,avxvnniint16")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpwusds))] +pub fn _mm_mask_dpwusds_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + simd_select_bitmask(k, _mm_dpwusds_epi32(src, a, b).as_i32x4(), src.as_i32x4()).as_m128i() + } +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in src with signed saturation, and store the packed 32-bit results +/// in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2,avxvnniint16")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpwusds))] +pub fn _mm_maskz_dpwusds_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { + simd_select_bitmask(k, _mm_dpwusds_epi32(src, a, b).as_i32x4(), i32x4::ZERO).as_m128i() + } +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. 
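+// NOTE (illustrative sketch): `dpwusd` is `dpwsud` with the operand roles
+// swapped -- here `a` supplies the unsigned words and `b` the signed words --
+// so a hypothetical scalar lane model differs from `dpwsud_lane` above only in
+// which input gets which extension:
+//
+//     fn dpwusd_lane(src: i32, a: [u16; 2], b: [i16; 2]) -> i32 {
+//         let p0 = (a[0] as i32) * (b[0] as i32);
+//         let p1 = (a[1] as i32) * (b[1] as i32);
+//         src.wrapping_add(p0).wrapping_add(p1)
+//     }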
Sum these 2 results with the +/// corresponding 32-bit integer in src with signed saturation, and store the packed 32-bit results +/// in dst using writemask `k` (elements are copied from src when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2,avxvnniint16")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpwusds))] +pub fn _mm256_mask_dpwusds_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + simd_select_bitmask( + k, + _mm256_dpwusds_epi32(src, a, b).as_i32x8(), + src.as_i32x8(), + ) + .as_m256i() + } +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in src with signed saturation, and store the packed 32-bit results +/// in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2,avxvnniint16")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpwusds))] +pub fn _mm256_maskz_dpwusds_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { + simd_select_bitmask(k, _mm256_dpwusds_epi32(src, a, b).as_i32x8(), i32x8::ZERO).as_m256i() + } +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in src with signed saturation, and store the packed 32-bit results +/// in dst +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpwusds))] +pub fn _mm512_dpwusds_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i { + unsafe { vdpwusds512(src.as_i32x16(), a.as_u16x32(), b.as_i16x32()).as_m512i() } +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in src with signed saturation, and store the packed 32-bit results +/// in dst using writemask `k` (elements are copied from src when the corresponding mask bit is not set) +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpwusds))] +pub fn _mm512_mask_dpwusds_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + simd_select_bitmask( + k, + _mm512_dpwusds_epi32(src, a, b).as_i32x16(), + src.as_i32x16(), + ) + .as_m512i() + } +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. 
Sum these 2 results with the
+/// corresponding 32-bit integer in src with signed saturation, and store the packed 32-bit results
+/// in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set)
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpwusds))]
+pub fn _mm512_maskz_dpwusds_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        simd_select_bitmask(k, _mm512_dpwusds_epi32(src, a, b).as_i32x16(), i32x16::ZERO).as_m512i()
+    }
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit
+/// integers in b, producing 2 intermediate unsigned 32-bit results. Sum these 2 results with the
+/// corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask `k`
+/// (elements are copied from src when the corresponding mask bit is not set)
+#[inline]
+#[target_feature(enable = "avx10.2,avxvnniint16")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpwuud))]
+pub fn _mm_mask_dpwuud_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        simd_select_bitmask(k, _mm_dpwuud_epi32(src, a, b).as_i32x4(), src.as_i32x4()).as_m128i()
+    }
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit
+/// integers in b, producing 2 intermediate unsigned 32-bit results. Sum these 2 results with the
+/// corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask `k`
+/// (elements are zeroed out when the corresponding mask bit is not set)
+#[inline]
+#[target_feature(enable = "avx10.2,avxvnniint16")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpwuud))]
+pub fn _mm_maskz_dpwuud_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        simd_select_bitmask(k, _mm_dpwuud_epi32(src, a, b).as_i32x4(), i32x4::ZERO).as_m128i()
+    }
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit
+/// integers in b, producing 2 intermediate unsigned 32-bit results. Sum these 2 results with the
+/// corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask `k`
+/// (elements are copied from src when the corresponding mask bit is not set)
+#[inline]
+#[target_feature(enable = "avx10.2,avxvnniint16")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpwuud))]
+pub fn _mm256_mask_dpwuud_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        simd_select_bitmask(k, _mm256_dpwuud_epi32(src, a, b).as_i32x8(), src.as_i32x8()).as_m256i()
+    }
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit
+/// integers in b, producing 2 intermediate unsigned 32-bit results. 
Sum these 2 results with the
+/// corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask `k`
+/// (elements are zeroed out when the corresponding mask bit is not set)
+#[inline]
+#[target_feature(enable = "avx10.2,avxvnniint16")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpwuud))]
+pub fn _mm256_maskz_dpwuud_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        simd_select_bitmask(k, _mm256_dpwuud_epi32(src, a, b).as_i32x8(), i32x8::ZERO).as_m256i()
+    }
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit
+/// integers in b, producing 2 intermediate unsigned 32-bit results. Sum these 2 results with the
+/// corresponding 32-bit integer in src, and store the packed 32-bit results in dst
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpwuud))]
+pub fn _mm512_dpwuud_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i {
+    unsafe { vdpwuud512(src.as_i32x16(), a.as_u16x32(), b.as_u16x32()).as_m512i() }
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit
+/// integers in b, producing 2 intermediate unsigned 32-bit results. Sum these 2 results with the
+/// corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask `k`
+/// (elements are copied from src when the corresponding mask bit is not set)
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpwuud))]
+pub fn _mm512_mask_dpwuud_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        simd_select_bitmask(
+            k,
+            _mm512_dpwuud_epi32(src, a, b).as_i32x16(),
+            src.as_i32x16(),
+        )
+        .as_m512i()
+    }
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit
+/// integers in b, producing 2 intermediate unsigned 32-bit results. Sum these 2 results with the
+/// corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask `k`
+/// (elements are zeroed out when the corresponding mask bit is not set)
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpwuud))]
+pub fn _mm512_maskz_dpwuud_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        simd_select_bitmask(k, _mm512_dpwuud_epi32(src, a, b).as_i32x16(), i32x16::ZERO).as_m512i()
+    }
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit
+/// integers in b, producing 2 intermediate unsigned 32-bit results. 
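+// NOTE (illustrative sketch, hypothetical helper): in the `uu` variants both
+// inputs are zero-extended, so one lane of `vpdpwuud` is most naturally
+// modelled in unsigned arithmetic; the `__m512i`/`i32` lane types above merely
+// reinterpret those bits:
+//
+//     fn dpwuud_lane(src: u32, a: [u16; 2], b: [u16; 2]) -> u32 {
+//         let p0 = (a[0] as u32) * (b[0] as u32); // u16 * u16 always fits in u32
+//         let p1 = (a[1] as u32) * (b[1] as u32);
+//         src.wrapping_add(p0).wrapping_add(p1)
+//     }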
Sum these 2 results with the
+/// corresponding 32-bit integer in src with unsigned saturation, and store the packed 32-bit results
+/// in dst using writemask `k` (elements are copied from src when the corresponding mask bit is not set)
+#[inline]
+#[target_feature(enable = "avx10.2,avxvnniint16")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpwuuds))]
+pub fn _mm_mask_dpwuuds_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        simd_select_bitmask(k, _mm_dpwuuds_epi32(src, a, b).as_i32x4(), src.as_i32x4()).as_m128i()
+    }
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit
+/// integers in b, producing 2 intermediate unsigned 32-bit results. Sum these 2 results with the
+/// corresponding 32-bit integer in src with unsigned saturation, and store the packed 32-bit results
+/// in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set)
+#[inline]
+#[target_feature(enable = "avx10.2,avxvnniint16")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpwuuds))]
+pub fn _mm_maskz_dpwuuds_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        simd_select_bitmask(k, _mm_dpwuuds_epi32(src, a, b).as_i32x4(), i32x4::ZERO).as_m128i()
+    }
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit
+/// integers in b, producing 2 intermediate unsigned 32-bit results. Sum these 2 results with the
+/// corresponding 32-bit integer in src with unsigned saturation, and store the packed 32-bit results
+/// in dst using writemask `k` (elements are copied from src when the corresponding mask bit is not set)
+#[inline]
+#[target_feature(enable = "avx10.2,avxvnniint16")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpwuuds))]
+pub fn _mm256_mask_dpwuuds_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        simd_select_bitmask(
+            k,
+            _mm256_dpwuuds_epi32(src, a, b).as_i32x8(),
+            src.as_i32x8(),
+        )
+        .as_m256i()
+    }
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit
+/// integers in b, producing 2 intermediate unsigned 32-bit results. Sum these 2 results with the
+/// corresponding 32-bit integer in src with unsigned saturation, and store the packed 32-bit results
+/// in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set)
+#[inline]
+#[target_feature(enable = "avx10.2,avxvnniint16")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpwuuds))]
+pub fn _mm256_maskz_dpwuuds_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        simd_select_bitmask(k, _mm256_dpwuuds_epi32(src, a, b).as_i32x8(), i32x8::ZERO).as_m256i()
+    }
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit
+/// integers in b, producing 2 intermediate unsigned 32-bit results. 
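+// NOTE (illustrative sketch, hypothetical helper): for `vpdpwuuds` the final
+// accumulate saturates at `u32::MAX` rather than wrapping. Computing in u64
+// lets the clamp see the exact sum:
+//
+//     fn dpwuuds_lane(src: u32, a: [u16; 2], b: [u16; 2]) -> u32 {
+//         let sum = src as u64
+//             + (a[0] as u64) * (b[0] as u64)
+//             + (a[1] as u64) * (b[1] as u64);
+//         sum.min(u32::MAX as u64) as u32
+//     }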
Sum these 2 results with the
+/// corresponding 32-bit integer in src with unsigned saturation, and store the packed 32-bit results
+/// in dst
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpwuuds))]
+pub fn _mm512_dpwuuds_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i {
+    unsafe { vdpwuuds512(src.as_i32x16(), a.as_u16x32(), b.as_u16x32()).as_m512i() }
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit
+/// integers in b, producing 2 intermediate unsigned 32-bit results. Sum these 2 results with the
+/// corresponding 32-bit integer in src with unsigned saturation, and store the packed 32-bit results
+/// in dst using writemask `k` (elements are copied from src when the corresponding mask bit is not set)
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpwuuds))]
+pub fn _mm512_mask_dpwuuds_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        simd_select_bitmask(
+            k,
+            _mm512_dpwuuds_epi32(src, a, b).as_i32x16(),
+            src.as_i32x16(),
+        )
+        .as_m512i()
+    }
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit
+/// integers in b, producing 2 intermediate unsigned 32-bit results. Sum these 2 results with the
+/// corresponding 32-bit integer in src with unsigned saturation, and store the packed 32-bit results
+/// in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set)
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vpdpwuuds))]
+pub fn _mm512_maskz_dpwuuds_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        simd_select_bitmask(k, _mm512_dpwuuds_epi32(src, a, b).as_i32x16(), i32x16::ZERO).as_m512i()
+    }
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 32-bit
+/// integer with truncation and saturation.
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the SAE parameter.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvttsd2sis, SAE = 8)
+)]
+pub fn _mm_cvtts_roundsd_i32<const SAE: i32>(a: __m128d) -> i32 {
+    static_assert_sae!(SAE);
+    unsafe { vcvttsd2sis(a.as_f64x2(), SAE) }
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 32-bit
+/// integer with truncation and saturation.
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the SAE parameter.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+pub fn _mm_cvtts_roundsd_si32<const SAE: i32>(a: __m128d) -> i32 {
+    _mm_cvtts_roundsd_i32::<SAE>(a)
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 32-bit
+/// unsigned integer with truncation and saturation.
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the SAE parameter.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvttsd2usis, SAE = 8)
+)]
+pub fn _mm_cvtts_roundsd_u32<const SAE: i32>(a: __m128d) -> u32 {
+    static_assert_sae!(SAE);
+    unsafe { vcvttsd2usis(a.as_f64x2(), SAE) }
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in `a` to a 32-bit
+/// integer with truncation and saturation.
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the SAE parameter.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvttss2sis, SAE = 8)
+)]
+pub fn _mm_cvtts_roundss_i32<const SAE: i32>(a: __m128) -> i32 {
+    static_assert_sae!(SAE);
+    unsafe { vcvttss2sis(a.as_f32x4(), SAE) }
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in `a` to a 32-bit
+/// integer with truncation and saturation.
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the SAE parameter.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+pub fn _mm_cvtts_roundss_si32<const SAE: i32>(a: __m128) -> i32 {
+    _mm_cvtts_roundss_i32::<SAE>(a)
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in `a` to a 32-bit
+/// unsigned integer with truncation and saturation.
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the SAE parameter.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvttss2usis, SAE = 8)
+)]
+pub fn _mm_cvtts_roundss_u32<const SAE: i32>(a: __m128) -> u32 {
+    static_assert_sae!(SAE);
+    unsafe { vcvttss2usis(a.as_f32x4(), SAE) }
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed
+/// 32-bit integers with truncation and saturation, and store the results in `dst`.
+/// The upper 64 bits of `dst` are zeroed out.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttpd2dqs))]
+pub fn _mm_cvtts_pd_epi32(a: __m128d) -> __m128i {
+    _mm_mask_cvtts_pd_epi32(_mm_undefined_si128(), !0, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed
+/// 32-bit integers with truncation and saturation, and store the results in `dst` using
+/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
+/// The upper 64 bits of `dst` are zeroed out.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttpd2dqs))]
+pub fn _mm_mask_cvtts_pd_epi32(src: __m128i, k: __mmask8, a: __m128d) -> __m128i {
+    unsafe { vcvttpd2dqs_128(a.as_f64x2(), src.as_i32x4(), k).as_m128i() }
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed
+/// 32-bit integers with truncation and saturation, and store the results in `dst` using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+/// The upper 64 bits of `dst` are zeroed out.
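+// NOTE (illustrative, not a claim from this file): scalar Rust already exposes
+// truncate-with-saturation float-to-int semantics through `as` casts, which is
+// a convenient mental model for this convert family, assuming (as the docs
+// above state) truncation toward zero plus saturation at the target's bounds:
+//
+//     let x: f64 = 3.0e10;
+//     assert_eq!(x as i32, i32::MAX);    // positive overflow saturates
+//     assert_eq!((-x) as i32, i32::MIN); // negative overflow saturates
+//     assert_eq!(1.9f64 as i32, 1);      // truncation toward zero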
+#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttpd2dqs))] +pub fn _mm_maskz_cvtts_pd_epi32(k: __mmask8, a: __m128d) -> __m128i { + _mm_mask_cvtts_pd_epi32(_mm_setzero_si128(), k, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed +/// 32-bit integers with truncation and saturation, and store the results in `dst`. +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttpd2dqs))] +pub fn _mm256_cvtts_pd_epi32(a: __m256d) -> __m128i { + _mm256_mask_cvtts_pd_epi32(_mm_undefined_si128(), !0, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed +/// 32-bit integers with truncation and saturation, and store the results in `dst` using +/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttpd2dqs))] +pub fn _mm256_mask_cvtts_pd_epi32(src: __m128i, k: __mmask8, a: __m256d) -> __m128i { + unsafe { vcvttpd2dqs_256(a.as_f64x4(), src.as_i32x4(), k).as_m128i() } +} + +/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed +/// 32-bit integers with truncation and saturation, and store the results in `dst` using +/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttpd2dqs))] +pub fn _mm256_maskz_cvtts_pd_epi32(k: __mmask8, a: __m256d) -> __m128i { + _mm256_mask_cvtts_pd_epi32(_mm_setzero_si128(), k, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed +/// 32-bit integers with truncation and saturation, and store the results in `dst`. +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttpd2dqs))] +pub fn _mm512_cvtts_pd_epi32(a: __m512d) -> __m256i { + _mm512_mask_cvtts_pd_epi32(_mm256_undefined_si256(), !0, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed +/// 32-bit integers with truncation and saturation, and store the results in `dst` using +/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttpd2dqs))] +pub fn _mm512_mask_cvtts_pd_epi32(src: __m256i, k: __mmask8, a: __m512d) -> __m256i { + _mm512_mask_cvtts_roundpd_epi32::<_MM_FROUND_CUR_DIRECTION>(src, k, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed +/// 32-bit integers with truncation and saturation, and store the results in `dst` using +/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set). 
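+// NOTE (illustrative usage with hypothetical values): with the zeromask form,
+// a mask of 0b0101 keeps converted lanes 0 and 2 and zeroes lanes 1 and 3:
+//
+//     let v = _mm256_setr_pd(1.5, 2.5, -3.5, 4.5);
+//     let r = _mm256_maskz_cvtts_pd_epi32(0b0101, v);
+//     // 32-bit lanes of r: [1, 0, -3, 0]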
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttpd2dqs))]
+pub fn _mm512_maskz_cvtts_pd_epi32(k: __mmask8, a: __m512d) -> __m256i {
+    _mm512_mask_cvtts_pd_epi32(_mm256_setzero_si256(), k, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed
+/// 32-bit integers with truncation and saturation, and store the results in `dst`.
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the SAE parameter.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvttpd2dqs, SAE = 8)
+)]
+pub fn _mm512_cvtts_roundpd_epi32<const SAE: i32>(a: __m512d) -> __m256i {
+    _mm512_mask_cvtts_roundpd_epi32::<SAE>(_mm256_undefined_si256(), !0, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed
+/// 32-bit integers with truncation and saturation, and store the results in `dst` using
+/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the SAE parameter.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvttpd2dqs, SAE = 8)
+)]
+pub fn _mm512_mask_cvtts_roundpd_epi32<const SAE: i32>(
+    src: __m256i,
+    k: __mmask8,
+    a: __m512d,
+) -> __m256i {
+    static_assert_sae!(SAE);
+    unsafe { vcvttpd2dqs_512(a.as_f64x8(), src.as_i32x8(), k, SAE).as_m256i() }
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed
+/// 32-bit integers with truncation and saturation, and store the results in `dst` using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the SAE parameter.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvttpd2dqs, SAE = 8)
+)]
+pub fn _mm512_maskz_cvtts_roundpd_epi32<const SAE: i32>(k: __mmask8, a: __m512d) -> __m256i {
+    _mm512_mask_cvtts_roundpd_epi32::<SAE>(_mm256_setzero_si256(), k, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed
+/// 32-bit unsigned integers with truncation and saturation, and store the results in `dst`.
+/// The upper 64 bits of `dst` are zeroed out.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttpd2udqs))]
+pub fn _mm_cvtts_pd_epu32(a: __m128d) -> __m128i {
+    _mm_mask_cvtts_pd_epu32(_mm_undefined_si128(), !0, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed
+/// 32-bit unsigned integers with truncation and saturation, and store the results in `dst` using
+/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
+/// The upper 64 bits of `dst` are zeroed out.
+#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttpd2udqs))] +pub fn _mm_mask_cvtts_pd_epu32(src: __m128i, k: __mmask8, a: __m128d) -> __m128i { + unsafe { vcvttpd2udqs_128(a.as_f64x2(), src.as_u32x4(), k).as_m128i() } +} + +/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed +/// 32-bit unsigned integers with truncation and saturation, and store the results in `dst` using +/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set). +/// The upper 64 bits of `dst` are zeroed out. +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttpd2udqs))] +pub fn _mm_maskz_cvtts_pd_epu32(k: __mmask8, a: __m128d) -> __m128i { + _mm_mask_cvtts_pd_epu32(_mm_setzero_si128(), k, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed +/// 32-bit unsigned integers with truncation and saturation, and store the results in `dst`. +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttpd2udqs))] +pub fn _mm256_cvtts_pd_epu32(a: __m256d) -> __m128i { + _mm256_mask_cvtts_pd_epu32(_mm_undefined_si128(), !0, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed +/// 32-bit unsigned integers with truncation and saturation, and store the results in `dst` using +/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttpd2udqs))] +pub fn _mm256_mask_cvtts_pd_epu32(src: __m128i, k: __mmask8, a: __m256d) -> __m128i { + unsafe { vcvttpd2udqs_256(a.as_f64x4(), src.as_u32x4(), k).as_m128i() } +} + +/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed +/// 32-bit unsigned integers with truncation and saturation, and store the results in `dst` using +/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttpd2udqs))] +pub fn _mm256_maskz_cvtts_pd_epu32(k: __mmask8, a: __m256d) -> __m128i { + _mm256_mask_cvtts_pd_epu32(_mm_setzero_si128(), k, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed +/// 32-bit unsigned integers with truncation and saturation, and store the results in `dst`. +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttpd2udqs))] +pub fn _mm512_cvtts_pd_epu32(a: __m512d) -> __m256i { + _mm512_mask_cvtts_pd_epu32(_mm256_undefined_si256(), !0, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed +/// 32-bit unsigned integers with truncation and saturation, and store the results in `dst` using +/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set). 
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttpd2udqs))]
+pub fn _mm512_mask_cvtts_pd_epu32(src: __m256i, k: __mmask8, a: __m512d) -> __m256i {
+    _mm512_mask_cvtts_roundpd_epu32::<_MM_FROUND_CUR_DIRECTION>(src, k, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed
+/// 32-bit unsigned integers with truncation and saturation, and store the results in `dst` using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttpd2udqs))]
+pub fn _mm512_maskz_cvtts_pd_epu32(k: __mmask8, a: __m512d) -> __m256i {
+    _mm512_mask_cvtts_pd_epu32(_mm256_setzero_si256(), k, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed
+/// 32-bit unsigned integers with truncation and saturation, and store the results in `dst`.
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the SAE parameter.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvttpd2udqs, SAE = 8)
+)]
+pub fn _mm512_cvtts_roundpd_epu32<const SAE: i32>(a: __m512d) -> __m256i {
+    _mm512_mask_cvtts_roundpd_epu32::<SAE>(_mm256_undefined_si256(), !0, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed
+/// 32-bit unsigned integers with truncation and saturation, and store the results in `dst` using
+/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the SAE parameter.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvttpd2udqs, SAE = 8)
+)]
+pub fn _mm512_mask_cvtts_roundpd_epu32<const SAE: i32>(
+    src: __m256i,
+    k: __mmask8,
+    a: __m512d,
+) -> __m256i {
+    static_assert_sae!(SAE);
+    unsafe { vcvttpd2udqs_512(a.as_f64x8(), src.as_u32x8(), k, SAE).as_m256i() }
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed
+/// 32-bit unsigned integers with truncation and saturation, and store the results in `dst` using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the SAE parameter.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvttpd2udqs, SAE = 8)
+)]
+pub fn _mm512_maskz_cvtts_roundpd_epu32<const SAE: i32>(k: __mmask8, a: __m512d) -> __m256i {
+    _mm512_mask_cvtts_roundpd_epu32::<SAE>(_mm256_setzero_si256(), k, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed
+/// 64-bit integers with truncation and saturation, and store the results in `dst`.
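+// NOTE (illustrative, same modelling assumption as the scalar note above): for
+// the unsigned targets the saturation range is one-sided at zero, so each lane
+// behaves like the scalar cast in this hypothetical helper:
+//
+//     fn cvtts_pd_u32_model(x: f64) -> u32 { x as u32 }
+//     // (-1.0) as u32 == 0, 5.0e9 as u32 == u32::MAX, 2.9 as u32 == 2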
+#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttpd2qqs))] +pub fn _mm_cvtts_pd_epi64(a: __m128d) -> __m128i { + _mm_mask_cvtts_pd_epi64(_mm_undefined_si128(), !0, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed +/// 64-bit integers with truncation and saturation, and store the results in `dst` using +/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttpd2qqs))] +pub fn _mm_mask_cvtts_pd_epi64(src: __m128i, k: __mmask8, a: __m128d) -> __m128i { + unsafe { vcvttpd2qqs_128(a.as_f64x2(), src.as_i64x2(), k).as_m128i() } +} + +/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed +/// 64-bit integers with truncation and saturation, and store the results in `dst` using +/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttpd2qqs))] +pub fn _mm_maskz_cvtts_pd_epi64(k: __mmask8, a: __m128d) -> __m128i { + _mm_mask_cvtts_pd_epi64(_mm_setzero_si128(), k, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed +/// 64-bit integers with truncation and saturation, and store the results in `dst`. +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttpd2qqs))] +pub fn _mm256_cvtts_pd_epi64(a: __m256d) -> __m256i { + _mm256_mask_cvtts_pd_epi64(_mm256_undefined_si256(), !0, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed +/// 64-bit integers with truncation and saturation, and store the results in `dst` using +/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttpd2qqs))] +pub fn _mm256_mask_cvtts_pd_epi64(src: __m256i, k: __mmask8, a: __m256d) -> __m256i { + unsafe { vcvttpd2qqs_256(a.as_f64x4(), src.as_i64x4(), k).as_m256i() } +} + +/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed +/// 64-bit integers with truncation and saturation, and store the results in `dst` using +/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttpd2qqs))] +pub fn _mm256_maskz_cvtts_pd_epi64(k: __mmask8, a: __m256d) -> __m256i { + _mm256_mask_cvtts_pd_epi64(_mm256_setzero_si256(), k, a) +} + +/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed +/// 64-bit integers with truncation and saturation, and store the results in `dst`. 
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttpd2qqs))]
+pub fn _mm512_cvtts_pd_epi64(a: __m512d) -> __m512i {
+    _mm512_mask_cvtts_pd_epi64(_mm512_undefined_epi32(), !0, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed
+/// 64-bit integers with truncation and saturation, and store the results in `dst` using
+/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttpd2qqs))]
+pub fn _mm512_mask_cvtts_pd_epi64(src: __m512i, k: __mmask8, a: __m512d) -> __m512i {
+    _mm512_mask_cvtts_roundpd_epi64::<_MM_FROUND_CUR_DIRECTION>(src, k, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed
+/// 64-bit integers with truncation and saturation, and store the results in `dst` using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttpd2qqs))]
+pub fn _mm512_maskz_cvtts_pd_epi64(k: __mmask8, a: __m512d) -> __m512i {
+    _mm512_mask_cvtts_pd_epi64(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed
+/// 64-bit integers with truncation and saturation, and store the results in `dst`.
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the SAE parameter.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvttpd2qqs, SAE = 8)
+)]
+pub fn _mm512_cvtts_roundpd_epi64<const SAE: i32>(a: __m512d) -> __m512i {
+    _mm512_mask_cvtts_roundpd_epi64::<SAE>(_mm512_undefined_epi32(), !0, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed
+/// 64-bit integers with truncation and saturation, and store the results in `dst` using
+/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the SAE parameter.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvttpd2qqs, SAE = 8)
+)]
+pub fn _mm512_mask_cvtts_roundpd_epi64<const SAE: i32>(
+    src: __m512i,
+    k: __mmask8,
+    a: __m512d,
+) -> __m512i {
+    static_assert_sae!(SAE);
+    unsafe { vcvttpd2qqs_512(a.as_f64x8(), src.as_i64x8(), k, SAE).as_m512i() }
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed
+/// 64-bit integers with truncation and saturation, and store the results in `dst` using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the SAE parameter.
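+// NOTE (illustrative, same modelling assumption as above): the 64-bit target
+// makes the saturation bound easy to hit even for finite inputs, since f64 can
+// represent values at or beyond 2^63. A hypothetical scalar model:
+//
+//     fn cvtts_pd_i64_model(x: f64) -> i64 { x as i64 }
+//     // 9.3e18 as i64 == i64::MAX, (-9.3e18) as i64 == i64::MIN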
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvttpd2qqs, SAE = 8)
+)]
+pub fn _mm512_maskz_cvtts_roundpd_epi64<const SAE: i32>(k: __mmask8, a: __m512d) -> __m512i {
+    _mm512_mask_cvtts_roundpd_epi64::<SAE>(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed
+/// 64-bit unsigned integers with truncation and saturation, and store the results in `dst`.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttpd2uqqs))]
+pub fn _mm_cvtts_pd_epu64(a: __m128d) -> __m128i {
+    _mm_mask_cvtts_pd_epu64(_mm_undefined_si128(), !0, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed
+/// 64-bit unsigned integers with truncation and saturation, and store the results in `dst` using
+/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttpd2uqqs))]
+pub fn _mm_mask_cvtts_pd_epu64(src: __m128i, k: __mmask8, a: __m128d) -> __m128i {
+    unsafe { vcvttpd2uqqs_128(a.as_f64x2(), src.as_u64x2(), k).as_m128i() }
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed
+/// 64-bit unsigned integers with truncation and saturation, and store the results in `dst` using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttpd2uqqs))]
+pub fn _mm_maskz_cvtts_pd_epu64(k: __mmask8, a: __m128d) -> __m128i {
+    _mm_mask_cvtts_pd_epu64(_mm_setzero_si128(), k, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed
+/// 64-bit unsigned integers with truncation and saturation, and store the results in `dst`.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttpd2uqqs))]
+pub fn _mm256_cvtts_pd_epu64(a: __m256d) -> __m256i {
+    _mm256_mask_cvtts_pd_epu64(_mm256_undefined_si256(), !0, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed
+/// 64-bit unsigned integers with truncation and saturation, and store the results in `dst` using
+/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttpd2uqqs))]
+pub fn _mm256_mask_cvtts_pd_epu64(src: __m256i, k: __mmask8, a: __m256d) -> __m256i {
+    unsafe { vcvttpd2uqqs_256(a.as_f64x4(), src.as_u64x4(), k).as_m256i() }
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed
+/// 64-bit unsigned integers with truncation and saturation, and store the results in `dst` using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttpd2uqqs))]
+pub fn _mm256_maskz_cvtts_pd_epu64(k: __mmask8, a: __m256d) -> __m256i {
+    _mm256_mask_cvtts_pd_epu64(_mm256_setzero_si256(), k, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed
+/// 64-bit unsigned integers with truncation and saturation, and store the results in `dst`.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttpd2uqqs))]
+pub fn _mm512_cvtts_pd_epu64(a: __m512d) -> __m512i {
+    _mm512_mask_cvtts_pd_epu64(_mm512_undefined_epi32(), !0, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed
+/// 64-bit unsigned integers with truncation and saturation, and store the results in `dst` using
+/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttpd2uqqs))]
+pub fn _mm512_mask_cvtts_pd_epu64(src: __m512i, k: __mmask8, a: __m512d) -> __m512i {
+    _mm512_mask_cvtts_roundpd_epu64::<_MM_FROUND_CUR_DIRECTION>(src, k, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed
+/// 64-bit unsigned integers with truncation and saturation, and store the results in `dst` using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttpd2uqqs))]
+pub fn _mm512_maskz_cvtts_pd_epu64(k: __mmask8, a: __m512d) -> __m512i {
+    _mm512_mask_cvtts_pd_epu64(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed
+/// 64-bit unsigned integers with truncation and saturation, and store the results in `dst`.
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the SAE parameter.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvttpd2uqqs, SAE = 8)
+)]
+pub fn _mm512_cvtts_roundpd_epu64<const SAE: i32>(a: __m512d) -> __m512i {
+    _mm512_mask_cvtts_roundpd_epu64::<SAE>(_mm512_undefined_epi32(), !0, a)
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed
+/// 64-bit unsigned integers with truncation and saturation, and store the results in `dst` using
+/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the SAE parameter.
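+// NOTE (illustrative, same modelling assumption as above): unsigned 64-bit
+// targets clamp negatives to 0 and values at or above 2^64 to u64::MAX:
+//
+//     fn cvtts_pd_u64_model(x: f64) -> u64 { x as u64 }
+//     // (-0.5) as u64 == 0, 2.0e19 as u64 == u64::MAX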
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvttpd2uqqs, SAE = 8)
+)]
+pub fn _mm512_mask_cvtts_roundpd_epu64<const SAE: i32>(
+    src: __m512i,
+    k: __mmask8,
+    a: __m512d,
+) -> __m512i {
+    static_assert_sae!(SAE);
+    unsafe { vcvttpd2uqqs_512(a.as_f64x8(), src.as_u64x8(), k, SAE).as_m512i() }
+}
+
+/// Convert packed double-precision (64-bit) floating-point elements in `a` to packed
+/// 64-bit unsigned integers with truncation and saturation, and store the results in `dst` using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the SAE parameter.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvttpd2uqqs, SAE = 8)
+)]
+pub fn _mm512_maskz_cvtts_roundpd_epu64<const SAE: i32>(k: __mmask8, a: __m512d) -> __m512i {
+    _mm512_mask_cvtts_roundpd_epu64::<SAE>(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
+/// 32-bit integers with truncation and saturation, and store the results in `dst`.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2dqs))]
+pub fn _mm_cvtts_ps_epi32(a: __m128) -> __m128i {
+    _mm_mask_cvtts_ps_epi32(_mm_undefined_si128(), !0, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
+/// 32-bit integers with truncation and saturation, and store the results in `dst` using
+/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2dqs))]
+pub fn _mm_mask_cvtts_ps_epi32(src: __m128i, k: __mmask8, a: __m128) -> __m128i {
+    unsafe { vcvttps2dqs_128(a.as_f32x4(), src.as_i32x4(), k).as_m128i() }
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
+/// 32-bit integers with truncation and saturation, and store the results in `dst` using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2dqs))]
+pub fn _mm_maskz_cvtts_ps_epi32(k: __mmask8, a: __m128) -> __m128i {
+    _mm_mask_cvtts_ps_epi32(_mm_setzero_si128(), k, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
+/// 32-bit integers with truncation and saturation, and store the results in `dst`.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2dqs))]
+pub fn _mm256_cvtts_ps_epi32(a: __m256) -> __m256i {
+    _mm256_mask_cvtts_ps_epi32(_mm256_undefined_si256(), !0, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
+/// 32-bit integers with truncation and saturation, and store the results in `dst` using
+/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2dqs))]
+pub fn _mm256_mask_cvtts_ps_epi32(src: __m256i, k: __mmask8, a: __m256) -> __m256i {
+    unsafe { vcvttps2dqs_256(a.as_f32x8(), src.as_i32x8(), k).as_m256i() }
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
+/// 32-bit integers with truncation and saturation, and store the results in `dst` using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2dqs))]
+pub fn _mm256_maskz_cvtts_ps_epi32(k: __mmask8, a: __m256) -> __m256i {
+    _mm256_mask_cvtts_ps_epi32(_mm256_setzero_si256(), k, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
+/// 32-bit integers with truncation and saturation, and store the results in `dst`.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2dqs))]
+pub fn _mm512_cvtts_ps_epi32(a: __m512) -> __m512i {
+    _mm512_mask_cvtts_ps_epi32(_mm512_undefined_epi32(), !0, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
+/// 32-bit integers with truncation and saturation, and store the results in `dst` using
+/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2dqs))]
+pub fn _mm512_mask_cvtts_ps_epi32(src: __m512i, k: __mmask16, a: __m512) -> __m512i {
+    _mm512_mask_cvtts_roundps_epi32::<_MM_FROUND_CUR_DIRECTION>(src, k, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
+/// 32-bit integers with truncation and saturation, and store the results in `dst` using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2dqs))]
+pub fn _mm512_maskz_cvtts_ps_epi32(k: __mmask16, a: __m512) -> __m512i {
+    _mm512_mask_cvtts_ps_epi32(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
+/// 32-bit integers with truncation and saturation, and store the results in `dst`.
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the SAE parameter.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvttps2dqs, SAE = 8)
+)]
+pub fn _mm512_cvtts_roundps_epi32<const SAE: i32>(a: __m512) -> __m512i {
+    _mm512_mask_cvtts_roundps_epi32::<SAE>(_mm512_undefined_epi32(), !0, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
+/// 32-bit integers with truncation and saturation, and store the results in `dst` using
+/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the SAE parameter.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvttps2dqs, SAE = 8)
+)]
+pub fn _mm512_mask_cvtts_roundps_epi32<const SAE: i32>(
+    src: __m512i,
+    k: __mmask16,
+    a: __m512,
+) -> __m512i {
+    static_assert_sae!(SAE);
+    unsafe { vcvttps2dqs_512(a.as_f32x16(), src.as_i32x16(), k, SAE).as_m512i() }
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
+/// 32-bit integers with truncation and saturation, and store the results in `dst` using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the SAE parameter.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvttps2dqs, SAE = 8)
+)]
+pub fn _mm512_maskz_cvtts_roundps_epi32<const SAE: i32>(k: __mmask16, a: __m512) -> __m512i {
+    _mm512_mask_cvtts_roundps_epi32::<SAE>(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
+/// 32-bit unsigned integers with truncation and saturation, and store the results in `dst`.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2udqs))]
+pub fn _mm_cvtts_ps_epu32(a: __m128) -> __m128i {
+    _mm_mask_cvtts_ps_epu32(_mm_undefined_si128(), !0, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
+/// 32-bit unsigned integers with truncation and saturation, and store the results in `dst` using
+/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2udqs))]
+pub fn _mm_mask_cvtts_ps_epu32(src: __m128i, k: __mmask8, a: __m128) -> __m128i {
+    unsafe { vcvttps2udqs_128(a.as_f32x4(), src.as_u32x4(), k).as_m128i() }
+}
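+
+// A hedged sketch of the round/SAE forms above: the SAE policy is a const
+// generic, so a call site selects it with a turbofish. The wrapper function
+// here is an illustrative assumption, not upstream API.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx10.2")]
+fn cvtts_roundps_epi32_sketch(a: __m512) -> __m512i {
+    // _MM_FROUND_NO_EXC suppresses floating-point exception reporting for
+    // the conversion; the saturating result itself is unchanged.
+    _mm512_cvtts_roundps_epi32::<_MM_FROUND_NO_EXC>(a)
+}
+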
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
+/// 32-bit unsigned integers with truncation and saturation, and store the results in `dst` using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2udqs))]
+pub fn _mm_maskz_cvtts_ps_epu32(k: __mmask8, a: __m128) -> __m128i {
+    _mm_mask_cvtts_ps_epu32(_mm_setzero_si128(), k, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
+/// 32-bit unsigned integers with truncation and saturation, and store the results in `dst`.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2udqs))]
+pub fn _mm256_cvtts_ps_epu32(a: __m256) -> __m256i {
+    _mm256_mask_cvtts_ps_epu32(_mm256_undefined_si256(), !0, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
+/// 32-bit unsigned integers with truncation and saturation, and store the results in `dst` using
+/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2udqs))]
+pub fn _mm256_mask_cvtts_ps_epu32(src: __m256i, k: __mmask8, a: __m256) -> __m256i {
+    unsafe { vcvttps2udqs_256(a.as_f32x8(), src.as_u32x8(), k).as_m256i() }
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
+/// 32-bit unsigned integers with truncation and saturation, and store the results in `dst` using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2udqs))]
+pub fn _mm256_maskz_cvtts_ps_epu32(k: __mmask8, a: __m256) -> __m256i {
+    _mm256_mask_cvtts_ps_epu32(_mm256_setzero_si256(), k, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
+/// 32-bit unsigned integers with truncation and saturation, and store the results in `dst`.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2udqs))]
+pub fn _mm512_cvtts_ps_epu32(a: __m512) -> __m512i {
+    _mm512_mask_cvtts_ps_epu32(_mm512_undefined_epi32(), !0, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
+/// 32-bit unsigned integers with truncation and saturation, and store the results in `dst` using
+/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2udqs))]
+pub fn _mm512_mask_cvtts_ps_epu32(src: __m512i, k: __mmask16, a: __m512) -> __m512i {
+    _mm512_mask_cvtts_roundps_epu32::<_MM_FROUND_CUR_DIRECTION>(src, k, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
+/// 32-bit unsigned integers with truncation and saturation, and store the results in `dst` using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2udqs))]
+pub fn _mm512_maskz_cvtts_ps_epu32(k: __mmask16, a: __m512) -> __m512i {
+    _mm512_mask_cvtts_ps_epu32(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
+/// 32-bit unsigned integers with truncation and saturation, and store the results in `dst`.
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the SAE parameter.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvttps2udqs, SAE = 8)
+)]
+pub fn _mm512_cvtts_roundps_epu32<const SAE: i32>(a: __m512) -> __m512i {
+    _mm512_mask_cvtts_roundps_epu32::<SAE>(_mm512_undefined_epi32(), !0, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
+/// 32-bit unsigned integers with truncation and saturation, and store the results in `dst` using
+/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the SAE parameter.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvttps2udqs, SAE = 8)
+)]
+pub fn _mm512_mask_cvtts_roundps_epu32<const SAE: i32>(
+    src: __m512i,
+    k: __mmask16,
+    a: __m512,
+) -> __m512i {
+    static_assert_sae!(SAE);
+    unsafe { vcvttps2udqs_512(a.as_f32x16(), src.as_u32x16(), k, SAE).as_m512i() }
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
+/// 32-bit unsigned integers with truncation and saturation, and store the results in `dst` using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the SAE parameter.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvttps2udqs, SAE = 8)
+)]
+pub fn _mm512_maskz_cvtts_roundps_epu32<const SAE: i32>(k: __mmask16, a: __m512) -> __m512i {
+    _mm512_mask_cvtts_roundps_epu32::<SAE>(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements from the lower 64 bits of `a`
+/// to packed 64-bit integers with truncation and saturation, and store the results in `dst`.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2qqs))]
+pub fn _mm_cvtts_ps_epi64(a: __m128) -> __m128i {
+    _mm_mask_cvtts_ps_epi64(_mm_undefined_si128(), !0, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements from the lower 64 bits of `a`
+/// to packed 64-bit integers with truncation and saturation, and store the results in `dst` using
+/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2qqs))]
+pub fn _mm_mask_cvtts_ps_epi64(src: __m128i, k: __mmask8, a: __m128) -> __m128i {
+    unsafe { vcvttps2qqs_128(a.as_f32x4(), src.as_i64x2(), k).as_m128i() }
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements from the lower 64 bits of `a`
+/// to packed 64-bit integers with truncation and saturation, and store the results in `dst` using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2qqs))]
+pub fn _mm_maskz_cvtts_ps_epi64(k: __mmask8, a: __m128) -> __m128i {
+    _mm_mask_cvtts_ps_epi64(_mm_setzero_si128(), k, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
+/// 64-bit integers with truncation and saturation, and store the results in `dst`.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2qqs))]
+pub fn _mm256_cvtts_ps_epi64(a: __m128) -> __m256i {
+    _mm256_mask_cvtts_ps_epi64(_mm256_undefined_si256(), !0, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
+/// 64-bit integers with truncation and saturation, and store the results in `dst` using
+/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2qqs))]
+pub fn _mm256_mask_cvtts_ps_epi64(src: __m256i, k: __mmask8, a: __m128) -> __m256i {
+    unsafe { vcvttps2qqs_256(a.as_f32x4(), src.as_i64x4(), k).as_m256i() }
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
+/// 64-bit integers with truncation and saturation, and store the results in `dst` using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2qqs))]
+pub fn _mm256_maskz_cvtts_ps_epi64(k: __mmask8, a: __m128) -> __m256i {
+    _mm256_mask_cvtts_ps_epi64(_mm256_setzero_si256(), k, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
+/// 64-bit integers with truncation and saturation, and store the results in `dst`.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2qqs))]
+pub fn _mm512_cvtts_ps_epi64(a: __m256) -> __m512i {
+    _mm512_mask_cvtts_ps_epi64(_mm512_undefined_epi32(), !0, a)
+}
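+
+// Sketch of the widening conversions above: the 128-bit and 256-bit
+// destinations read their f32 inputs from a __m128. The values below are
+// illustrative assumptions showing truncation toward zero plus saturation.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx10.2")]
+fn cvtts_ps_epi64_sketch() -> __m256i {
+    // Lanes (low to high): 1.5 -> 1, -3.9 -> -3, 1.0e30 -> i64::MAX,
+    // -1.0e30 -> i64::MIN; out-of-range inputs saturate, no sentinel value.
+    let a = _mm_set_ps(-1.0e30, 1.0e30, -3.9, 1.5);
+    _mm256_cvtts_ps_epi64(a)
+}
+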
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
+/// 64-bit integers with truncation and saturation, and store the results in `dst` using
+/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2qqs))]
+pub fn _mm512_mask_cvtts_ps_epi64(src: __m512i, k: __mmask8, a: __m256) -> __m512i {
+    _mm512_mask_cvtts_roundps_epi64::<_MM_FROUND_CUR_DIRECTION>(src, k, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
+/// 64-bit integers with truncation and saturation, and store the results in `dst` using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2qqs))]
+pub fn _mm512_maskz_cvtts_ps_epi64(k: __mmask8, a: __m256) -> __m512i {
+    _mm512_mask_cvtts_ps_epi64(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
+/// 64-bit integers with truncation and saturation, and store the results in `dst`.
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the SAE parameter.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvttps2qqs, SAE = 8)
+)]
+pub fn _mm512_cvtts_roundps_epi64<const SAE: i32>(a: __m256) -> __m512i {
+    _mm512_mask_cvtts_roundps_epi64::<SAE>(_mm512_undefined_epi32(), !0, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
+/// 64-bit integers with truncation and saturation, and store the results in `dst` using
+/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the SAE parameter.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvttps2qqs, SAE = 8)
+)]
+pub fn _mm512_mask_cvtts_roundps_epi64<const SAE: i32>(
+    src: __m512i,
+    k: __mmask8,
+    a: __m256,
+) -> __m512i {
+    static_assert_sae!(SAE);
+    unsafe { vcvttps2qqs_512(a.as_f32x8(), src.as_i64x8(), k, SAE).as_m512i() }
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
+/// 64-bit integers with truncation and saturation, and store the results in `dst` using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the SAE parameter.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvttps2qqs, SAE = 8)
+)]
+pub fn _mm512_maskz_cvtts_roundps_epi64<const SAE: i32>(k: __mmask8, a: __m256) -> __m512i {
+    _mm512_mask_cvtts_roundps_epi64::<SAE>(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements from the lower 64 bits of `a`
+/// to packed 64-bit unsigned integers with truncation and saturation, and store the results in `dst`.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2uqqs))]
+pub fn _mm_cvtts_ps_epu64(a: __m128) -> __m128i {
+    _mm_mask_cvtts_ps_epu64(_mm_undefined_si128(), !0, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements from the lower 64 bits of `a`
+/// to packed 64-bit unsigned integers with truncation and saturation, and store the results in `dst`
+/// using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2uqqs))]
+pub fn _mm_mask_cvtts_ps_epu64(src: __m128i, k: __mmask8, a: __m128) -> __m128i {
+    unsafe { vcvttps2uqqs_128(a.as_f32x4(), src.as_u64x2(), k).as_m128i() }
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements from the lower 64 bits of `a`
+/// to packed 64-bit unsigned integers with truncation and saturation, and store the results in `dst`
+/// using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2uqqs))]
+pub fn _mm_maskz_cvtts_ps_epu64(k: __mmask8, a: __m128) -> __m128i {
+    _mm_mask_cvtts_ps_epu64(_mm_setzero_si128(), k, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
+/// 64-bit unsigned integers with truncation and saturation, and store the results in `dst`.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2uqqs))]
+pub fn _mm256_cvtts_ps_epu64(a: __m128) -> __m256i {
+    _mm256_mask_cvtts_ps_epu64(_mm256_undefined_si256(), !0, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
+/// 64-bit unsigned integers with truncation and saturation, and store the results in `dst` using
+/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2uqqs))]
+pub fn _mm256_mask_cvtts_ps_epu64(src: __m256i, k: __mmask8, a: __m128) -> __m256i {
+    unsafe { vcvttps2uqqs_256(a.as_f32x4(), src.as_u64x4(), k).as_m256i() }
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
+/// 64-bit unsigned integers with truncation and saturation, and store the results in `dst` using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2uqqs))]
+pub fn _mm256_maskz_cvtts_ps_epu64(k: __mmask8, a: __m128) -> __m256i {
+    _mm256_mask_cvtts_ps_epu64(_mm256_setzero_si256(), k, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
+/// 64-bit unsigned integers with truncation and saturation, and store the results in `dst`.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2uqqs))]
+pub fn _mm512_cvtts_ps_epu64(a: __m256) -> __m512i {
+    _mm512_mask_cvtts_ps_epu64(_mm512_undefined_epi32(), !0, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
+/// 64-bit unsigned integers with truncation and saturation, and store the results in `dst` using
+/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2uqqs))]
+pub fn _mm512_mask_cvtts_ps_epu64(src: __m512i, k: __mmask8, a: __m256) -> __m512i {
+    _mm512_mask_cvtts_roundps_epu64::<_MM_FROUND_CUR_DIRECTION>(src, k, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
+/// 64-bit unsigned integers with truncation and saturation, and store the results in `dst` using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2uqqs))]
+pub fn _mm512_maskz_cvtts_ps_epu64(k: __mmask8, a: __m256) -> __m512i {
+    _mm512_mask_cvtts_ps_epu64(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
+/// 64-bit unsigned integers with truncation and saturation, and store the results in `dst`.
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the SAE parameter.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvttps2uqqs, SAE = 8)
+)]
+pub fn _mm512_cvtts_roundps_epu64<const SAE: i32>(a: __m256) -> __m512i {
+    _mm512_mask_cvtts_roundps_epu64::<SAE>(_mm512_undefined_epi32(), !0, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
+/// 64-bit unsigned integers with truncation and saturation, and store the results in `dst` using
+/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the SAE parameter.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvttps2uqqs, SAE = 8)
+)]
+pub fn _mm512_mask_cvtts_roundps_epu64<const SAE: i32>(
+    src: __m512i,
+    k: __mmask8,
+    a: __m256,
+) -> __m512i {
+    static_assert_sae!(SAE);
+    unsafe { vcvttps2uqqs_512(a.as_f32x8(), src.as_u64x8(), k, SAE).as_m512i() }
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
+/// 64-bit unsigned integers with truncation and saturation, and store the results in `dst` using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the SAE parameter.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvttps2uqqs, SAE = 8)
+)]
+pub fn _mm512_maskz_cvtts_roundps_epu64<const SAE: i32>(k: __mmask8, a: __m256) -> __m512i {
+    _mm512_mask_cvtts_roundps_epu64::<SAE>(_mm512_setzero_si512(), k, a)
+}
+
+/// Performs a min/max comparison between packed double-precision (64-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`.
+///
+/// -------------------------------------------------------------------------------------------------------------------------------
+/// |`IMM8[4]`|`IMM8[1:0]`| Operation              | Description                                                                   |
+/// |:-------:|:---------:|:----------------------:|:------------------------------------------------------------------------------|
+/// |    0    |    00     | Minimum                | `a` if `a<=b`, `b` if `b<a`, and qNaN if either operand is NaN                |
+/// |    0    |    01     | Maximum                | `a` if `a>=b`, `b` if `b>a`, and qNaN if either operand is NaN                |
+/// |    0    |    10     | MinimumMagnitude       | `a` if `\|a\|<\|b\|`, `b` if `\|b\|<\|a\|`, otherwise `Minimum(a, b)`         |
+/// |    0    |    11     | MaximumMagnitude       | `a` if `\|a\|>\|b\|`, `b` if `\|b\|>\|a\|`, otherwise `Maximum(a, b)`         |
+/// |    1    |    00     | MinimumNumber          | `a` if `a<=b`, `b` if `b<a`. If only one operand is NaN, the other one is returned. If both operands are NaNs, a qNaN is returned |
+/// |    1    |    01     | MaximumNumber          | `a` if `a>=b`, `b` if `b>a`. If only one operand is NaN, the other one is returned. If both operands are NaNs, a qNaN is returned |
+/// |    1    |    10     | MinimumMagnitudeNumber | `a` if `\|a\|<\|b\|`, `b` if `\|b\|<\|a\|`, otherwise `MinimumNumber(a, b)`   |
+/// |    1    |    11     | MaximumMagnitudeNumber | `a` if `\|a\|>\|b\|`, `b` if `\|b\|>\|a\|`, otherwise `MaximumNumber(a, b)`   |
+/// -------------------------------------------------------------------------------------------------------------------------------
+///
+/// The sign of the output is decided using `IMM8[3:2]`:
+///
+/// ---------------------------------------------
+/// |`IMM8[3:2]`| Sign                          |
+/// |:---------:|:-----------------------------:|
+/// |    00     | Use sign of the first operand |
+/// |    01     | Preserve sign of the result   |
+/// |    10     | Set sign to positive          |
+/// |    11     | Set sign to negative          |
+/// ---------------------------------------------
+///
+/// For more details, including behaviour for NaNs and denormals, refer to the [AVX10.2 Spec].
+///
+/// [AVX10.2 Spec]: https://www.intel.com/content/www/us/en/content-details/913918/intel-advanced-vector-extensions-10-2-intel-avx10-2-architecture-specification.html
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxpd, IMM8 = 0)
+)]
+pub fn _mm_minmax_pd<const IMM8: i32>(a: __m128d, b: __m128d) -> __m128d {
+    _mm_mask_minmax_pd::<IMM8>(_mm_undefined_pd(), !0, a, b)
+}
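+
+// Sketch of the IMM8 encoding spelled out from the tables above (the
+// constant name is a local illustration, not one exported by this crate):
+// IMM8[4] selects the "Number" NaN handling, IMM8[3:2] the sign rule, and
+// IMM8[1:0] the base operation.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx10.2")]
+fn minmax_pd_sketch(a: __m128d, b: __m128d) -> __m128d {
+    // 0b1_00_00: MinimumNumber with the sign taken from the first operand;
+    // a NaN in exactly one operand yields the other operand.
+    const MINIMUM_NUMBER: i32 = 0b1_00_00;
+    _mm_minmax_pd::<MINIMUM_NUMBER>(a, b)
+}
+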
+/// Performs a min/max comparison between packed double-precision (64-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`, and stores the results in `dst` using
+/// writemask `k` (elements are copied from src when the corresponding mask bit is not set).
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxpd, IMM8 = 0)
+)]
+pub fn _mm_mask_minmax_pd<const IMM8: i32>(
+    src: __m128d,
+    k: __mmask8,
+    a: __m128d,
+    b: __m128d,
+) -> __m128d {
+    static_assert_uimm_bits!(IMM8, 8);
+    unsafe { vminmaxpd128(a.as_f64x2(), b.as_f64x2(), IMM8, src.as_f64x2(), k as u8).as_m128d() }
+}
+
+/// Performs a min/max comparison between packed double-precision (64-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`, and stores the results in `dst` using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxpd, IMM8 = 0)
+)]
+pub fn _mm_maskz_minmax_pd<const IMM8: i32>(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+    _mm_mask_minmax_pd::<IMM8>(_mm_setzero_pd(), k, a, b)
+}
+
+/// Performs a min/max comparison between packed double-precision (64-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`.
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxpd, IMM8 = 0)
+)]
+pub fn _mm256_minmax_pd<const IMM8: i32>(a: __m256d, b: __m256d) -> __m256d {
+    _mm256_mask_minmax_pd::<IMM8>(_mm256_undefined_pd(), !0, a, b)
+}
+
+/// Performs a min/max comparison between packed double-precision (64-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`, and stores the results in `dst` using
+/// writemask `k` (elements are copied from src when the corresponding mask bit is not set).
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxpd, IMM8 = 0)
+)]
+pub fn _mm256_mask_minmax_pd<const IMM8: i32>(
+    src: __m256d,
+    k: __mmask8,
+    a: __m256d,
+    b: __m256d,
+) -> __m256d {
+    static_assert_uimm_bits!(IMM8, 8);
+    unsafe { vminmaxpd256(a.as_f64x4(), b.as_f64x4(), IMM8, src.as_f64x4(), k as u8).as_m256d() }
+}
+
+/// Performs a min/max comparison between packed double-precision (64-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`, and stores the results in `dst` using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxpd, IMM8 = 0)
+)]
+pub fn _mm256_maskz_minmax_pd<const IMM8: i32>(k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
+    _mm256_mask_minmax_pd::<IMM8>(_mm256_setzero_pd(), k, a, b)
+}
+
+/// Performs a min/max comparison between packed double-precision (64-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`.
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxpd, IMM8 = 0)
+)]
+pub fn _mm512_minmax_pd<const IMM8: i32>(a: __m512d, b: __m512d) -> __m512d {
+    _mm512_mask_minmax_pd::<IMM8>(_mm512_undefined_pd(), !0, a, b)
+}
+
+/// Performs a min/max comparison between packed double-precision (64-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`, and stores the results in `dst` using
+/// writemask `k` (elements are copied from src when the corresponding mask bit is not set).
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxpd, IMM8 = 0)
+)]
+pub fn _mm512_mask_minmax_pd<const IMM8: i32>(
+    src: __m512d,
+    k: __mmask8,
+    a: __m512d,
+    b: __m512d,
+) -> __m512d {
+    _mm512_mask_minmax_round_pd::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a, b)
+}
+
+/// Performs a min/max comparison between packed double-precision (64-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`, and stores the results in `dst` using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxpd, IMM8 = 0)
+)]
+pub fn _mm512_maskz_minmax_pd<const IMM8: i32>(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
+    _mm512_mask_minmax_pd::<IMM8>(_mm512_setzero_pd(), k, a, b)
+}
+
+/// Performs a min/max comparison between packed double-precision (64-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`.
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the SAE parameter.
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxpd, IMM8 = 0, SAE = 8)
+)]
+pub fn _mm512_minmax_round_pd<const IMM8: i32, const SAE: i32>(a: __m512d, b: __m512d) -> __m512d {
+    _mm512_mask_minmax_round_pd::<IMM8, SAE>(_mm512_undefined_pd(), !0, a, b)
+}
+
+/// Performs a min/max comparison between packed double-precision (64-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`, and stores the results in `dst` using
+/// writemask `k` (elements are copied from src when the corresponding mask bit is not set).
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the SAE parameter.
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxpd, IMM8 = 0, SAE = 8)
+)]
+pub fn _mm512_mask_minmax_round_pd<const IMM8: i32, const SAE: i32>(
+    src: __m512d,
+    k: __mmask8,
+    a: __m512d,
+    b: __m512d,
+) -> __m512d {
+    static_assert_uimm_bits!(IMM8, 8);
+    static_assert_sae!(SAE);
+    unsafe {
+        vminmaxpd512(
+            a.as_f64x8(),
+            b.as_f64x8(),
+            IMM8,
+            src.as_f64x8(),
+            k as u8,
+            SAE,
+        )
+        .as_m512d()
+    }
+}
+
+/// Performs a min/max comparison between packed double-precision (64-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`, and stores the results in `dst` using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the SAE parameter.
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxpd, IMM8 = 0, SAE = 8)
+)]
+pub fn _mm512_maskz_minmax_round_pd<const IMM8: i32, const SAE: i32>(
+    k: __mmask8,
+    a: __m512d,
+    b: __m512d,
+) -> __m512d {
+    _mm512_mask_minmax_round_pd::<IMM8, SAE>(_mm512_setzero_pd(), k, a, b)
+}
+
+/// Performs a min/max comparison between packed single-precision (32-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`.
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxps, IMM8 = 0)
+)]
+pub fn _mm_minmax_ps<const IMM8: i32>(a: __m128, b: __m128) -> __m128 {
+    _mm_mask_minmax_ps::<IMM8>(_mm_undefined_ps(), !0, a, b)
+}
+
+/// Performs a min/max comparison between packed single-precision (32-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`, and stores the results in `dst` using
+/// writemask `k` (elements are copied from src when the corresponding mask bit is not set).
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxps, IMM8 = 0)
+)]
+pub fn _mm_mask_minmax_ps<const IMM8: i32>(
+    src: __m128,
+    k: __mmask8,
+    a: __m128,
+    b: __m128,
+) -> __m128 {
+    static_assert_uimm_bits!(IMM8, 8);
+    unsafe { vminmaxps128(a.as_f32x4(), b.as_f32x4(), IMM8, src.as_f32x4(), k as u8).as_m128() }
+}
+
+/// Performs a min/max comparison between packed single-precision (32-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`, and stores the results in `dst` using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxps, IMM8 = 0)
+)]
+pub fn _mm_maskz_minmax_ps<const IMM8: i32>(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+    _mm_mask_minmax_ps::<IMM8>(_mm_setzero_ps(), k, a, b)
+}
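+
+// Sketch of the 512-bit round form, which takes two const generics: the
+// comparison control first, then the SAE policy (the values here are
+// assumptions for illustration).
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx10.2")]
+fn minmax_round_pd_sketch(a: __m512d, b: __m512d) -> __m512d {
+    // 0b0_00_11 = MaximumMagnitude with the sign of the first operand;
+    // _MM_FROUND_NO_EXC suppresses exception reporting for the comparison.
+    _mm512_minmax_round_pd::<0b0_00_11, _MM_FROUND_NO_EXC>(a, b)
+}
+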
+/// Performs a min/max comparison between packed single-precision (32-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`.
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxps, IMM8 = 0)
+)]
+pub fn _mm256_minmax_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
+    _mm256_mask_minmax_ps::<IMM8>(_mm256_undefined_ps(), !0, a, b)
+}
+
+/// Performs a min/max comparison between packed single-precision (32-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`, and stores the results in `dst` using
+/// writemask `k` (elements are copied from src when the corresponding mask bit is not set).
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxps, IMM8 = 0)
+)]
+pub fn _mm256_mask_minmax_ps<const IMM8: i32>(
+    src: __m256,
+    k: __mmask8,
+    a: __m256,
+    b: __m256,
+) -> __m256 {
+    static_assert_uimm_bits!(IMM8, 8);
+    unsafe { vminmaxps256(a.as_f32x8(), b.as_f32x8(), IMM8, src.as_f32x8(), k as u8).as_m256() }
+}
+
+/// Performs a min/max comparison between packed single-precision (32-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`, and stores the results in `dst` using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxps, IMM8 = 0)
+)]
+pub fn _mm256_maskz_minmax_ps<const IMM8: i32>(k: __mmask8, a: __m256, b: __m256) -> __m256 {
+    _mm256_mask_minmax_ps::<IMM8>(_mm256_setzero_ps(), k, a, b)
+}
+
+/// Performs a min/max comparison between packed single-precision (32-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`.
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxps, IMM8 = 0)
+)]
+pub fn _mm512_minmax_ps<const IMM8: i32>(a: __m512, b: __m512) -> __m512 {
+    _mm512_mask_minmax_ps::<IMM8>(_mm512_undefined_ps(), !0, a, b)
+}
+
+/// Performs a min/max comparison between packed single-precision (32-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`, and stores the results in `dst` using
+/// writemask `k` (elements are copied from src when the corresponding mask bit is not set).
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxps, IMM8 = 0)
+)]
+pub fn _mm512_mask_minmax_ps<const IMM8: i32>(
+    src: __m512,
+    k: __mmask16,
+    a: __m512,
+    b: __m512,
+) -> __m512 {
+    _mm512_mask_minmax_round_ps::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a, b)
+}
+
+/// Performs a min/max comparison between packed single-precision (32-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`, and stores the results in `dst` using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxps, IMM8 = 0)
+)]
+pub fn _mm512_maskz_minmax_ps<const IMM8: i32>(k: __mmask16, a: __m512, b: __m512) -> __m512 {
+    _mm512_mask_minmax_ps::<IMM8>(_mm512_setzero_ps(), k, a, b)
+}
+
+/// Performs a min/max comparison between packed single-precision (32-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`.
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the SAE parameter.
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxps, IMM8 = 0, SAE = 8)
+)]
+pub fn _mm512_minmax_round_ps<const IMM8: i32, const SAE: i32>(a: __m512, b: __m512) -> __m512 {
+    _mm512_mask_minmax_round_ps::<IMM8, SAE>(_mm512_undefined_ps(), !0, a, b)
+}
+
+/// Performs a min/max comparison between packed single-precision (32-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`, and stores the results in `dst` using
+/// writemask `k` (elements are copied from src when the corresponding mask bit is not set).
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the SAE parameter.
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxps, IMM8 = 0, SAE = 8)
+)]
+pub fn _mm512_mask_minmax_round_ps<const IMM8: i32, const SAE: i32>(
+    src: __m512,
+    k: __mmask16,
+    a: __m512,
+    b: __m512,
+) -> __m512 {
+    static_assert_uimm_bits!(IMM8, 8);
+    static_assert_sae!(SAE);
+    unsafe {
+        vminmaxps512(
+            a.as_f32x16(),
+            b.as_f32x16(),
+            IMM8,
+            src.as_f32x16(),
+            k as u16,
+            SAE,
+        )
+        .as_m512()
+    }
+}
+
+/// Performs a min/max comparison between packed single-precision (32-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`, and stores the results in `dst` using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the SAE parameter.
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxps, IMM8 = 0, SAE = 8)
+)]
+pub fn _mm512_maskz_minmax_round_ps<const IMM8: i32, const SAE: i32>(
+    k: __mmask16,
+    a: __m512,
+    b: __m512,
+) -> __m512 {
+    _mm512_mask_minmax_round_ps::<IMM8, SAE>(_mm512_setzero_ps(), k, a, b)
+}
+
+/// Performs a min/max comparison between packed half-precision (16-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`.
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxph, IMM8 = 0)
+)]
+pub fn _mm_minmax_ph<const IMM8: i32>(a: __m128h, b: __m128h) -> __m128h {
+    _mm_mask_minmax_ph::<IMM8>(_mm_undefined_ph(), !0, a, b)
+}
+
+/// Performs a min/max comparison between packed half-precision (16-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`, and stores the results in `dst` using
+/// writemask `k` (elements are copied from src when the corresponding mask bit is not set).
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxph, IMM8 = 0)
+)]
+pub fn _mm_mask_minmax_ph<const IMM8: i32>(
+    src: __m128h,
+    k: __mmask8,
+    a: __m128h,
+    b: __m128h,
+) -> __m128h {
+    static_assert_uimm_bits!(IMM8, 8);
+    unsafe { vminmaxph128(a.as_f16x8(), b.as_f16x8(), IMM8, src.as_f16x8(), k as u8).as_m128h() }
+}
+
+/// Performs a min/max comparison between packed half-precision (16-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`, and stores the results in `dst` using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxph, IMM8 = 0)
+)]
+pub fn _mm_maskz_minmax_ph<const IMM8: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
+    _mm_mask_minmax_ph::<IMM8>(_mm_setzero_ph(), k, a, b)
+}
+
+/// Performs a min/max comparison between packed half-precision (16-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`.
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxph, IMM8 = 0)
+)]
+pub fn _mm256_minmax_ph<const IMM8: i32>(a: __m256h, b: __m256h) -> __m256h {
+    _mm256_mask_minmax_ph::<IMM8>(_mm256_undefined_ph(), !0, a, b)
+}
+
+/// Performs a min/max comparison between packed half-precision (16-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`, and stores the results in `dst` using
+/// writemask `k` (elements are copied from src when the corresponding mask bit is not set).
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxph, IMM8 = 0)
+)]
+pub fn _mm256_mask_minmax_ph<const IMM8: i32>(
+    src: __m256h,
+    k: __mmask16,
+    a: __m256h,
+    b: __m256h,
+) -> __m256h {
+    static_assert_uimm_bits!(IMM8, 8);
+    unsafe {
+        vminmaxph256(
+            a.as_f16x16(),
+            b.as_f16x16(),
+            IMM8,
+            src.as_f16x16(),
+            k as u16,
+        )
+        .as_m256h()
+    }
+}
+
+/// Performs a min/max comparison between packed half-precision (16-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`, and stores the results in `dst` using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxph, IMM8 = 0)
+)]
+pub fn _mm256_maskz_minmax_ph<const IMM8: i32>(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
+    _mm256_mask_minmax_ph::<IMM8>(_mm256_setzero_ph(), k, a, b)
+}
+
+/// Performs a min/max comparison between packed half-precision (16-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`.
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxph, IMM8 = 0)
+)]
+pub fn _mm512_minmax_ph<const IMM8: i32>(a: __m512h, b: __m512h) -> __m512h {
+    _mm512_mask_minmax_ph::<IMM8>(_mm512_undefined_ph(), !0, a, b)
+}
+
+/// Performs a min/max comparison between packed half-precision (16-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`, and stores the results in `dst` using
+/// writemask `k` (elements are copied from src when the corresponding mask bit is not set).
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxph, IMM8 = 0)
+)]
+pub fn _mm512_mask_minmax_ph<const IMM8: i32>(
+    src: __m512h,
+    k: __mmask32,
+    a: __m512h,
+    b: __m512h,
+) -> __m512h {
+    _mm512_mask_minmax_round_ph::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a, b)
+}
+
+/// Performs a min/max comparison between packed half-precision (16-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`, and stores the results in `dst` using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxph, IMM8 = 0)
+)]
+pub fn _mm512_maskz_minmax_ph<const IMM8: i32>(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
+    _mm512_mask_minmax_ph::<IMM8>(_mm512_setzero_ph(), k, a, b)
+}
+
+/// Performs a min/max comparison between packed half-precision (16-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`.
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the SAE parameter.
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxph, IMM8 = 0, SAE = 8)
+)]
+pub fn _mm512_minmax_round_ph<const IMM8: i32, const SAE: i32>(a: __m512h, b: __m512h) -> __m512h {
+    _mm512_mask_minmax_round_ph::<IMM8, SAE>(_mm512_undefined_ph(), !0, a, b)
+}
+
+/// Performs a min/max comparison between packed half-precision (16-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`, and stores the results in `dst` using
+/// writemask `k` (elements are copied from src when the corresponding mask bit is not set).
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the SAE parameter.
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxph, IMM8 = 0, SAE = 8)
+)]
+pub fn _mm512_mask_minmax_round_ph<const IMM8: i32, const SAE: i32>(
+    src: __m512h,
+    k: __mmask32,
+    a: __m512h,
+    b: __m512h,
+) -> __m512h {
+    static_assert_uimm_bits!(IMM8, 8);
+    static_assert_sae!(SAE);
+    unsafe {
+        vminmaxph512(
+            a.as_f16x32(),
+            b.as_f16x32(),
+            IMM8,
+            src.as_f16x32(),
+            k as u32,
+            SAE,
+        )
+        .as_m512h()
+    }
+}
+
+/// Performs a min/max comparison between packed half-precision (16-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`, and stores the results in `dst` using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the SAE parameter.
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxph, IMM8 = 0, SAE = 8)
+)]
+pub fn _mm512_maskz_minmax_round_ph<const IMM8: i32, const SAE: i32>(
+    k: __mmask32,
+    a: __m512h,
+    b: __m512h,
+) -> __m512h {
+    _mm512_mask_minmax_round_ph::<IMM8, SAE>(_mm512_setzero_ph(), k, a, b)
+}
+
+/// Performs a min/max comparison between the lower double-precision (64-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`, and copy the upper element from
+/// `a` to the upper element of `dst`.
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxsd, IMM8 = 0)
+)]
+pub fn _mm_minmax_sd<const IMM8: i32>(a: __m128d, b: __m128d) -> __m128d {
+    _mm_mask_minmax_sd::<IMM8>(_mm_undefined_pd(), !0, a, b)
+}
+
+/// Performs a min/max comparison between the lower double-precision (64-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`, and stores the results in `dst` using
+/// writemask `k` (elements are copied from src when the corresponding mask bit is not set), and
+/// copy the upper element from `a` to the upper element of `dst`.
+///
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxsd, IMM8 = 0)
+)]
+pub fn _mm_mask_minmax_sd<const IMM8: i32>(
+    src: __m128d,
+    k: __mmask8,
+    a: __m128d,
+    b: __m128d,
+) -> __m128d {
+    _mm_mask_minmax_round_sd::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a, b)
+}
+
+/// Performs a min/max comparison between the lower double-precision (64-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`, and stores the results in `dst`
+/// using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set), and
+/// copy the upper element from `a` to the upper element of `dst`.
+///
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxsd, IMM8 = 0)
+)]
+pub fn _mm_maskz_minmax_sd<const IMM8: i32>(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+    _mm_mask_minmax_sd::<IMM8>(_mm_setzero_pd(), k, a, b)
+}
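+
+// Sketch of the scalar form (input values are illustrative assumptions):
+// only lane 0 is compared, and lane 1 of the result is copied from `a`.
+#[cfg(test)]
+#[allow(dead_code)]
+#[target_feature(enable = "avx10.2")]
+fn minmax_sd_sketch() -> __m128d {
+    let a = _mm_set_pd(7.0, -2.0); // lanes: [-2.0, 7.0]
+    let b = _mm_set_pd(9.0, 5.0); // lanes: [5.0, 9.0]
+    // IMM8 = 0 selects Minimum: lane 0 becomes -2.0, lane 1 stays 7.0.
+    _mm_minmax_sd::<0>(a, b)
+}
+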
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxsd, IMM8 = 0, SAE = 8)
+)]
+pub fn _mm_minmax_round_sd<const IMM8: i32, const SAE: i32>(a: __m128d, b: __m128d) -> __m128d {
+    _mm_mask_minmax_round_sd::<IMM8, SAE>(_mm_undefined_pd(), !0, a, b)
+}
+
+/// Performs a min/max comparison between the lower double-precision (64-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`, and stores the results in `dst` using
+/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set), and
+/// copies the upper element from `a` to the upper element of `dst`.
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the `SAE` parameter.
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxsd, IMM8 = 0, SAE = 8)
+)]
+pub fn _mm_mask_minmax_round_sd<const IMM8: i32, const SAE: i32>(
+    src: __m128d,
+    k: __mmask8,
+    a: __m128d,
+    b: __m128d,
+) -> __m128d {
+    static_assert_uimm_bits!(IMM8, 8);
+    static_assert_sae!(SAE);
+    unsafe {
+        vminmaxsd(
+            a.as_f64x2(),
+            b.as_f64x2(),
+            IMM8,
+            src.as_f64x2(),
+            k as u8,
+            SAE,
+        )
+        .as_m128d()
+    }
+}
+
+/// Performs a min/max comparison between the lower double-precision (64-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`, and stores the results in `dst` using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set), and
+/// copies the upper element from `a` to the upper element of `dst`.
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the `SAE` parameter.
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxsd, IMM8 = 0, SAE = 8)
+)]
+pub fn _mm_maskz_minmax_round_sd<const IMM8: i32, const SAE: i32>(
+    k: __mmask8,
+    a: __m128d,
+    b: __m128d,
+) -> __m128d {
+    _mm_mask_minmax_round_sd::<IMM8, SAE>(_mm_setzero_pd(), k, a, b)
+}
+
+/// Performs a min/max comparison between the lower single-precision (32-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`, and copies the upper 3 packed elements
+/// from `a` to the upper elements of `dst`.
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxss, IMM8 = 0)
+)]
+pub fn _mm_minmax_ss<const IMM8: i32>(a: __m128, b: __m128) -> __m128 {
+    _mm_mask_minmax_ss::<IMM8>(_mm_undefined_ps(), !0, a, b)
+}
+
+/// Performs a min/max comparison between the lower single-precision (32-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`, and stores the results in `dst` using
+/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set), and
+/// copies the upper 3 packed elements from `a` to the upper elements of `dst`.
+/// For more details, see [`_mm_minmax_pd`].
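+///
+/// # Examples
+///
+/// An illustrative sketch, not a compiled doctest (assumes `IMM8 = 0` selects the
+/// `minimum` operation):
+///
+/// ```ignore
+/// // Bit 0 of the mask is clear, so the lower result lane is copied from
+/// // `src` instead of holding the min/max of `a` and `b`.
+/// let r = _mm_mask_minmax_ss::<0>(src, 0b0, a, b);
+/// ```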
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxss, IMM8 = 0)
+)]
+pub fn _mm_mask_minmax_ss<const IMM8: i32>(
+    src: __m128,
+    k: __mmask8,
+    a: __m128,
+    b: __m128,
+) -> __m128 {
+    _mm_mask_minmax_round_ss::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a, b)
+}
+
+/// Performs a min/max comparison between the lower single-precision (32-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`, and stores the results in `dst` using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set), and copies
+/// the upper 3 packed elements from `a` to the upper elements of `dst`.
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxss, IMM8 = 0)
+)]
+pub fn _mm_maskz_minmax_ss<const IMM8: i32>(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+    _mm_mask_minmax_ss::<IMM8>(_mm_setzero_ps(), k, a, b)
+}
+
+/// Performs a min/max comparison between the lower single-precision (32-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`, and copies the upper 3 packed elements
+/// from `a` to the upper elements of `dst`.
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the `SAE` parameter.
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxss, IMM8 = 0, SAE = 8)
+)]
+pub fn _mm_minmax_round_ss<const IMM8: i32, const SAE: i32>(a: __m128, b: __m128) -> __m128 {
+    _mm_mask_minmax_round_ss::<IMM8, SAE>(_mm_undefined_ps(), !0, a, b)
+}
+
+/// Performs a min/max comparison between the lower single-precision (32-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`, and stores the results in `dst` using
+/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set), and
+/// copies the upper 3 packed elements from `a` to the upper elements of `dst`.
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the `SAE` parameter.
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxss, IMM8 = 0, SAE = 8)
+)]
+pub fn _mm_mask_minmax_round_ss<const IMM8: i32, const SAE: i32>(
+    src: __m128,
+    k: __mmask8,
+    a: __m128,
+    b: __m128,
+) -> __m128 {
+    static_assert_uimm_bits!(IMM8, 8);
+    static_assert_sae!(SAE);
+    unsafe {
+        vminmaxss(
+            a.as_f32x4(),
+            b.as_f32x4(),
+            IMM8,
+            src.as_f32x4(),
+            k as u8,
+            SAE,
+        )
+        .as_m128()
+    }
+}
+
+/// Performs a min/max comparison between the lower single-precision (32-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`, and stores the results in `dst` using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set), and copies
+/// the upper 3 packed elements from `a` to the upper elements of `dst`.
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the `SAE` parameter.
+/// For more details, see [`_mm_minmax_pd`].
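+///
+/// # Examples
+///
+/// An illustrative sketch, not a compiled doctest (assumes `IMM8 = 0` selects the
+/// `minimum` operation):
+///
+/// ```ignore
+/// // Mask bit clear: the lower result lane is zeroed rather than taken from
+/// // any source vector; the upper three lanes still come from `a`.
+/// let r = _mm_maskz_minmax_round_ss::<0, _MM_FROUND_NO_EXC>(0b0, a, b);
+/// ```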
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxss, IMM8 = 0, SAE = 8)
+)]
+pub fn _mm_maskz_minmax_round_ss<const IMM8: i32, const SAE: i32>(
+    k: __mmask8,
+    a: __m128,
+    b: __m128,
+) -> __m128 {
+    _mm_mask_minmax_round_ss::<IMM8, SAE>(_mm_setzero_ps(), k, a, b)
+}
+
+/// Performs a min/max comparison between the lower half-precision (16-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`, and copies the upper 7 packed
+/// elements from `a` to the upper elements of `dst`.
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxsh, IMM8 = 0)
+)]
+pub fn _mm_minmax_sh<const IMM8: i32>(a: __m128h, b: __m128h) -> __m128h {
+    _mm_mask_minmax_sh::<IMM8>(_mm_undefined_ph(), !0, a, b)
+}
+
+/// Performs a min/max comparison between the lower half-precision (16-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`, and stores the results in `dst` using
+/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set), and
+/// copies the upper 7 packed elements from `a` to the upper elements of `dst`.
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxsh, IMM8 = 0)
+)]
+pub fn _mm_mask_minmax_sh<const IMM8: i32>(
+    src: __m128h,
+    k: __mmask8,
+    a: __m128h,
+    b: __m128h,
+) -> __m128h {
+    _mm_mask_minmax_round_sh::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a, b)
+}
+
+/// Performs a min/max comparison between the lower half-precision (16-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`, and stores the results in `dst` using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set), and copies
+/// the upper 7 packed elements from `a` to the upper elements of `dst`.
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxsh, IMM8 = 0)
+)]
+pub fn _mm_maskz_minmax_sh<const IMM8: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
+    _mm_mask_minmax_sh::<IMM8>(_mm_setzero_ph(), k, a, b)
+}
+
+/// Performs a min/max comparison between the lower half-precision (16-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`, and copies the upper 7 packed
+/// elements from `a` to the upper elements of `dst`.
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the `SAE` parameter.
+/// For more details, see [`_mm_minmax_pd`].
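+///
+/// # Examples
+///
+/// An illustrative sketch, not a compiled doctest (assumes `IMM8 = 0` selects the
+/// `minimum` operation):
+///
+/// ```ignore
+/// let a = _mm_set_sh(2.5);
+/// let b = _mm_set_sh(-2.5);
+/// let r = _mm_minmax_round_sh::<0, _MM_FROUND_NO_EXC>(a, b);
+/// // Lower lane: -2.5 under the assumed encoding; the upper 7 lanes come from `a`.
+/// ```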
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxsh, IMM8 = 0, SAE = 8)
+)]
+pub fn _mm_minmax_round_sh<const IMM8: i32, const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
+    _mm_mask_minmax_round_sh::<IMM8, SAE>(_mm_undefined_ph(), !0, a, b)
+}
+
+/// Performs a min/max comparison between the lower half-precision (16-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`, and stores the results in `dst` using
+/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set), and
+/// copies the upper 7 packed elements from `a` to the upper elements of `dst`.
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the `SAE` parameter.
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxsh, IMM8 = 0, SAE = 8)
+)]
+pub fn _mm_mask_minmax_round_sh<const IMM8: i32, const SAE: i32>(
+    src: __m128h,
+    k: __mmask8,
+    a: __m128h,
+    b: __m128h,
+) -> __m128h {
+    static_assert_uimm_bits!(IMM8, 8);
+    static_assert_sae!(SAE);
+    unsafe {
+        vminmaxsh(
+            a.as_f16x8(),
+            b.as_f16x8(),
+            IMM8,
+            src.as_f16x8(),
+            k as u8,
+            SAE,
+        )
+        .as_m128h()
+    }
+}
+
+/// Performs a min/max comparison between the lower half-precision (16-bit) floating-point
+/// elements in `a` and `b` based on the control in `IMM8`, and stores the results in `dst` using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set), and copies
+/// the upper 7 packed elements from `a` to the upper elements of `dst`.
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the `SAE` parameter.
+/// For more details, see [`_mm_minmax_pd`].
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vminmaxsh, IMM8 = 0, SAE = 8)
+)]
+pub fn _mm_maskz_minmax_round_sh<const IMM8: i32, const SAE: i32>(
+    k: __mmask8,
+    a: __m128h,
+    b: __m128h,
+) -> __m128h {
+    _mm_mask_minmax_round_sh::<IMM8, SAE>(_mm_setzero_ph(), k, a, b)
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 16-bit packed elements of `dst`, zeroing the upper 8 bits.
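+///
+/// # Examples
+///
+/// An illustrative sketch, not a compiled doctest (assumes an `avx10.2` target):
+///
+/// ```ignore
+/// // 300.0 is out of range for i8 and saturates to 127 (i8::MAX); the result
+/// // occupies the low 8 bits of each 16-bit lane, with the upper 8 bits zeroed.
+/// let r = _mm_ipcvts_ph_epi8(_mm_set1_ph(300.0));
+/// ```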
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2ibs))]
+pub fn _mm_ipcvts_ph_epi8(a: __m128h) -> __m128i {
+    _mm_mask_ipcvts_ph_epi8(_mm_undefined_si128(), !0, a)
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 16-bit packed elements of `dst`, zeroing the upper 8 bits, using writemask `k`
+/// (elements are copied from `src` when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2ibs))]
+pub fn _mm_mask_ipcvts_ph_epi8(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
+    unsafe { vcvtph2ibs128(a.as_f16x8(), src.as_i16x8(), k).as_m128i() }
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 16-bit packed elements of `dst`, zeroing the upper 8 bits, using zeromask `k`
+/// (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2ibs))]
+pub fn _mm_maskz_ipcvts_ph_epi8(k: __mmask8, a: __m128h) -> __m128i {
+    _mm_mask_ipcvts_ph_epi8(_mm_setzero_si128(), k, a)
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 16-bit packed elements of `dst`, zeroing the upper 8 bits.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2ibs))]
+pub fn _mm256_ipcvts_ph_epi8(a: __m256h) -> __m256i {
+    _mm256_mask_ipcvts_ph_epi8(_mm256_undefined_si256(), !0, a)
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 16-bit packed elements of `dst`, zeroing the upper 8 bits, using writemask `k`
+/// (elements are copied from `src` when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2ibs))]
+pub fn _mm256_mask_ipcvts_ph_epi8(src: __m256i, k: __mmask16, a: __m256h) -> __m256i {
+    unsafe { vcvtph2ibs256(a.as_f16x16(), src.as_i16x16(), k).as_m256i() }
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 16-bit packed elements of `dst`, zeroing the upper 8 bits, using zeromask `k`
+/// (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2ibs))]
+pub fn _mm256_maskz_ipcvts_ph_epi8(k: __mmask16, a: __m256h) -> __m256i {
+    _mm256_mask_ipcvts_ph_epi8(_mm256_setzero_si256(), k, a)
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 16-bit packed elements of `dst`, zeroing the upper 8 bits.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2ibs))]
+pub fn _mm512_ipcvts_ph_epi8(a: __m512h) -> __m512i {
+    _mm512_mask_ipcvts_ph_epi8(_mm512_undefined_epi32(), !0, a)
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 16-bit packed elements of `dst`, zeroing the upper 8 bits, using writemask `k`
+/// (elements are copied from `src` when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2ibs))]
+pub fn _mm512_mask_ipcvts_ph_epi8(src: __m512i, k: __mmask32, a: __m512h) -> __m512i {
+    _mm512_mask_ipcvts_roundph_epi8::<_MM_FROUND_CUR_DIRECTION>(src, k, a)
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 16-bit packed elements of `dst`, zeroing the upper 8 bits, using zeromask `k`
+/// (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2ibs))]
+pub fn _mm512_maskz_ipcvts_ph_epi8(k: __mmask32, a: __m512h) -> __m512i {
+    _mm512_mask_ipcvts_ph_epi8(_mm512_setzero_si512(), k, a)
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 16-bit packed elements of `dst`, zeroing the upper 8 bits.
+///
+/// Rounding is done according to the `ROUNDING` parameter, which can be one of:
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvtph2ibs, ROUNDING = 8)
+)]
+pub fn _mm512_ipcvts_roundph_epi8<const ROUNDING: i32>(a: __m512h) -> __m512i {
+    _mm512_mask_ipcvts_roundph_epi8::<ROUNDING>(_mm512_undefined_epi32(), !0, a)
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 16-bit packed elements of `dst`, zeroing the upper 8 bits, using writemask `k`
+/// (elements are copied from `src` when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the `ROUNDING` parameter, which can be one of:
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvtph2ibs, ROUNDING = 8)
+)]
+pub fn _mm512_mask_ipcvts_roundph_epi8<const ROUNDING: i32>(
+    src: __m512i,
+    k: __mmask32,
+    a: __m512h,
+) -> __m512i {
+    static_assert_rounding!(ROUNDING);
+    unsafe { vcvtph2ibs512(a.as_f16x32(), src.as_i16x32(), k, ROUNDING).as_m512i() }
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 16-bit packed elements of `dst`, zeroing the upper 8 bits, using zeromask `k`
+/// (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the `ROUNDING` parameter, which can be one of:
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvtph2ibs, ROUNDING = 8)
+)]
+pub fn _mm512_maskz_ipcvts_roundph_epi8<const ROUNDING: i32>(k: __mmask32, a: __m512h) -> __m512i {
+    _mm512_mask_ipcvts_roundph_epi8::<ROUNDING>(_mm512_setzero_si512(), k, a)
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to unsigned 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 16-bit packed elements of `dst`, zeroing the upper 8 bits.
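+///
+/// # Examples
+///
+/// An illustrative sketch, not a compiled doctest:
+///
+/// ```ignore
+/// // Unsigned saturation: -1.0 clamps to 0, and 300.0 would clamp to 255 (u8::MAX).
+/// let r = _mm_ipcvts_ph_epu8(_mm_set1_ph(-1.0));
+/// ```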
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2iubs))]
+pub fn _mm_ipcvts_ph_epu8(a: __m128h) -> __m128i {
+    _mm_mask_ipcvts_ph_epu8(_mm_undefined_si128(), !0, a)
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to unsigned 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 16-bit packed elements of `dst`, zeroing the upper 8 bits, using writemask `k`
+/// (elements are copied from `src` when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2iubs))]
+pub fn _mm_mask_ipcvts_ph_epu8(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
+    unsafe { vcvtph2iubs128(a.as_f16x8(), src.as_u16x8(), k).as_m128i() }
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to unsigned 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 16-bit packed elements of `dst`, zeroing the upper 8 bits, using zeromask `k`
+/// (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2iubs))]
+pub fn _mm_maskz_ipcvts_ph_epu8(k: __mmask8, a: __m128h) -> __m128i {
+    _mm_mask_ipcvts_ph_epu8(_mm_setzero_si128(), k, a)
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to unsigned 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 16-bit packed elements of `dst`, zeroing the upper 8 bits.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2iubs))]
+pub fn _mm256_ipcvts_ph_epu8(a: __m256h) -> __m256i {
+    _mm256_mask_ipcvts_ph_epu8(_mm256_undefined_si256(), !0, a)
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to unsigned 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 16-bit packed elements of `dst`, zeroing the upper 8 bits, using writemask `k`
+/// (elements are copied from `src` when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2iubs))]
+pub fn _mm256_mask_ipcvts_ph_epu8(src: __m256i, k: __mmask16, a: __m256h) -> __m256i {
+    unsafe { vcvtph2iubs256(a.as_f16x16(), src.as_u16x16(), k).as_m256i() }
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to unsigned 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 16-bit packed elements of `dst`, zeroing the upper 8 bits, using zeromask `k`
+/// (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2iubs))]
+pub fn _mm256_maskz_ipcvts_ph_epu8(k: __mmask16, a: __m256h) -> __m256i {
+    _mm256_mask_ipcvts_ph_epu8(_mm256_setzero_si256(), k, a)
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to unsigned 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 16-bit packed elements of `dst`, zeroing the upper 8 bits.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2iubs))]
+pub fn _mm512_ipcvts_ph_epu8(a: __m512h) -> __m512i {
+    _mm512_mask_ipcvts_ph_epu8(_mm512_undefined_epi32(), !0, a)
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to unsigned 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 16-bit packed elements of `dst`, zeroing the upper 8 bits, using writemask `k`
+/// (elements are copied from `src` when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2iubs))]
+pub fn _mm512_mask_ipcvts_ph_epu8(src: __m512i, k: __mmask32, a: __m512h) -> __m512i {
+    _mm512_mask_ipcvts_roundph_epu8::<_MM_FROUND_CUR_DIRECTION>(src, k, a)
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to unsigned 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 16-bit packed elements of `dst`, zeroing the upper 8 bits, using zeromask `k`
+/// (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2iubs))]
+pub fn _mm512_maskz_ipcvts_ph_epu8(k: __mmask32, a: __m512h) -> __m512i {
+    _mm512_mask_ipcvts_ph_epu8(_mm512_setzero_si512(), k, a)
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to unsigned 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 16-bit packed elements of `dst`, zeroing the upper 8 bits.
+///
+/// Rounding is done according to the `ROUNDING` parameter, which can be one of:
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvtph2iubs, ROUNDING = 8)
+)]
+pub fn _mm512_ipcvts_roundph_epu8<const ROUNDING: i32>(a: __m512h) -> __m512i {
+    _mm512_mask_ipcvts_roundph_epu8::<ROUNDING>(_mm512_undefined_epi32(), !0, a)
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to unsigned 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 16-bit packed elements of `dst`, zeroing the upper 8 bits, using writemask `k`
+/// (elements are copied from `src` when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the `ROUNDING` parameter, which can be one of:
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvtph2iubs, ROUNDING = 8)
+)]
+pub fn _mm512_mask_ipcvts_roundph_epu8<const ROUNDING: i32>(
+    src: __m512i,
+    k: __mmask32,
+    a: __m512h,
+) -> __m512i {
+    static_assert_rounding!(ROUNDING);
+    unsafe { vcvtph2iubs512(a.as_f16x32(), src.as_u16x32(), k, ROUNDING).as_m512i() }
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to unsigned 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 16-bit packed elements of `dst`, zeroing the upper 8 bits, using zeromask `k`
+/// (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the `ROUNDING` parameter, which can be one of:
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvtph2iubs, ROUNDING = 8)
+)]
+pub fn _mm512_maskz_ipcvts_roundph_epu8<const ROUNDING: i32>(k: __mmask32, a: __m512h) -> __m512i {
+    _mm512_mask_ipcvts_roundph_epu8::<ROUNDING>(_mm512_setzero_si512(), k, a)
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 32-bit packed elements of `dst`, zeroing the upper 24 bits.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtps2ibs))]
+pub fn _mm_ipcvts_ps_epi8(a: __m128) -> __m128i {
+    _mm_mask_ipcvts_ps_epi8(_mm_undefined_si128(), !0, a)
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 32-bit packed elements of `dst`, zeroing the upper 24 bits, using writemask `k`
+/// (elements are copied from `src` when the corresponding mask bit is not set).
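+///
+/// # Examples
+///
+/// An illustrative sketch, not a compiled doctest:
+///
+/// ```ignore
+/// // -200.0 saturates to -128 (i8::MIN) in lane 0; lanes whose mask bit is
+/// // clear are copied from `src`.
+/// let r = _mm_mask_ipcvts_ps_epi8(src, 0b0001, _mm_set1_ps(-200.0));
+/// ```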
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtps2ibs))]
+pub fn _mm_mask_ipcvts_ps_epi8(src: __m128i, k: __mmask8, a: __m128) -> __m128i {
+    unsafe { vcvtps2ibs128(a.as_f32x4(), src.as_i32x4(), k).as_m128i() }
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 32-bit packed elements of `dst`, zeroing the upper 24 bits, using zeromask `k`
+/// (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtps2ibs))]
+pub fn _mm_maskz_ipcvts_ps_epi8(k: __mmask8, a: __m128) -> __m128i {
+    _mm_mask_ipcvts_ps_epi8(_mm_setzero_si128(), k, a)
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 32-bit packed elements of `dst`, zeroing the upper 24 bits.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtps2ibs))]
+pub fn _mm256_ipcvts_ps_epi8(a: __m256) -> __m256i {
+    _mm256_mask_ipcvts_ps_epi8(_mm256_undefined_si256(), !0, a)
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 32-bit packed elements of `dst`, zeroing the upper 24 bits, using writemask `k`
+/// (elements are copied from `src` when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtps2ibs))]
+pub fn _mm256_mask_ipcvts_ps_epi8(src: __m256i, k: __mmask8, a: __m256) -> __m256i {
+    unsafe { vcvtps2ibs256(a.as_f32x8(), src.as_i32x8(), k).as_m256i() }
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 32-bit packed elements of `dst`, zeroing the upper 24 bits, using zeromask `k`
+/// (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtps2ibs))]
+pub fn _mm256_maskz_ipcvts_ps_epi8(k: __mmask8, a: __m256) -> __m256i {
+    _mm256_mask_ipcvts_ps_epi8(_mm256_setzero_si256(), k, a)
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 32-bit packed elements of `dst`, zeroing the upper 24 bits.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtps2ibs))]
+pub fn _mm512_ipcvts_ps_epi8(a: __m512) -> __m512i {
+    _mm512_mask_ipcvts_ps_epi8(_mm512_undefined_epi32(), !0, a)
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 32-bit packed elements of `dst`, zeroing the upper 24 bits, using writemask `k`
+/// (elements are copied from `src` when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtps2ibs))]
+pub fn _mm512_mask_ipcvts_ps_epi8(src: __m512i, k: __mmask16, a: __m512) -> __m512i {
+    _mm512_mask_ipcvts_roundps_epi8::<_MM_FROUND_CUR_DIRECTION>(src, k, a)
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 32-bit packed elements of `dst`, zeroing the upper 24 bits, using zeromask `k`
+/// (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtps2ibs))]
+pub fn _mm512_maskz_ipcvts_ps_epi8(k: __mmask16, a: __m512) -> __m512i {
+    _mm512_mask_ipcvts_ps_epi8(_mm512_setzero_si512(), k, a)
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 32-bit packed elements of `dst`, zeroing the upper 24 bits.
+///
+/// Rounding is done according to the `ROUNDING` parameter, which can be one of:
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvtps2ibs, ROUNDING = 8)
+)]
+pub fn _mm512_ipcvts_roundps_epi8<const ROUNDING: i32>(a: __m512) -> __m512i {
+    _mm512_mask_ipcvts_roundps_epi8::<ROUNDING>(_mm512_undefined_epi32(), !0, a)
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 32-bit packed elements of `dst`, zeroing the upper 24 bits, using writemask `k`
+/// (elements are copied from `src` when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the `ROUNDING` parameter, which can be one of:
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvtps2ibs, ROUNDING = 8)
+)]
+pub fn _mm512_mask_ipcvts_roundps_epi8<const ROUNDING: i32>(
+    src: __m512i,
+    k: __mmask16,
+    a: __m512,
+) -> __m512i {
+    static_assert_rounding!(ROUNDING);
+    unsafe { vcvtps2ibs512(a.as_f32x16(), src.as_i32x16(), k, ROUNDING).as_m512i() }
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 32-bit packed elements of `dst`, zeroing the upper 24 bits, using zeromask `k`
+/// (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the `ROUNDING` parameter, which can be one of:
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvtps2ibs, ROUNDING = 8)
+)]
+pub fn _mm512_maskz_ipcvts_roundps_epi8<const ROUNDING: i32>(k: __mmask16, a: __m512) -> __m512i {
+    _mm512_mask_ipcvts_roundps_epi8::<ROUNDING>(_mm512_setzero_si512(), k, a)
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to unsigned 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 32-bit packed elements of `dst`, zeroing the upper 24 bits.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtps2iubs))]
+pub fn _mm_ipcvts_ps_epu8(a: __m128) -> __m128i {
+    _mm_mask_ipcvts_ps_epu8(_mm_undefined_si128(), !0, a)
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to unsigned 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 32-bit packed elements of `dst`, zeroing the upper 24 bits, using writemask `k`
+/// (elements are copied from `src` when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtps2iubs))]
+pub fn _mm_mask_ipcvts_ps_epu8(src: __m128i, k: __mmask8, a: __m128) -> __m128i {
+    unsafe { vcvtps2iubs128(a.as_f32x4(), src.as_u32x4(), k).as_m128i() }
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to unsigned 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 32-bit packed elements of `dst`, zeroing the upper 24 bits, using zeromask `k`
+/// (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtps2iubs))]
+pub fn _mm_maskz_ipcvts_ps_epu8(k: __mmask8, a: __m128) -> __m128i {
+    _mm_mask_ipcvts_ps_epu8(_mm_setzero_si128(), k, a)
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to unsigned 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 32-bit packed elements of `dst`, zeroing the upper 24 bits.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtps2iubs))]
+pub fn _mm256_ipcvts_ps_epu8(a: __m256) -> __m256i {
+    _mm256_mask_ipcvts_ps_epu8(_mm256_undefined_si256(), !0, a)
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to unsigned 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 32-bit packed elements of `dst`, zeroing the upper 24 bits, using writemask `k`
+/// (elements are copied from `src` when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtps2iubs))]
+pub fn _mm256_mask_ipcvts_ps_epu8(src: __m256i, k: __mmask8, a: __m256) -> __m256i {
+    unsafe { vcvtps2iubs256(a.as_f32x8(), src.as_u32x8(), k).as_m256i() }
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to unsigned 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 32-bit packed elements of `dst`, zeroing the upper 24 bits, using zeromask `k`
+/// (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtps2iubs))]
+pub fn _mm256_maskz_ipcvts_ps_epu8(k: __mmask8, a: __m256) -> __m256i {
+    _mm256_mask_ipcvts_ps_epu8(_mm256_setzero_si256(), k, a)
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to unsigned 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 32-bit packed elements of `dst`, zeroing the upper 24 bits.
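+///
+/// # Examples
+///
+/// An illustrative sketch, not a compiled doctest:
+///
+/// ```ignore
+/// // Every lane: 1000.0 saturates to 255 (u8::MAX) in the low byte of each
+/// // 32-bit element; the upper 24 bits are zeroed.
+/// let r = _mm512_ipcvts_ps_epu8(_mm512_set1_ps(1000.0));
+/// ```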
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtps2iubs))]
+pub fn _mm512_ipcvts_ps_epu8(a: __m512) -> __m512i {
+    _mm512_mask_ipcvts_ps_epu8(_mm512_undefined_epi32(), !0, a)
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to unsigned 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 32-bit packed elements of `dst`, zeroing the upper 24 bits, using writemask `k`
+/// (elements are copied from `src` when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtps2iubs))]
+pub fn _mm512_mask_ipcvts_ps_epu8(src: __m512i, k: __mmask16, a: __m512) -> __m512i {
+    _mm512_mask_ipcvts_roundps_epu8::<_MM_FROUND_CUR_DIRECTION>(src, k, a)
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to unsigned 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 32-bit packed elements of `dst`, zeroing the upper 24 bits, using zeromask `k`
+/// (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtps2iubs))]
+pub fn _mm512_maskz_ipcvts_ps_epu8(k: __mmask16, a: __m512) -> __m512i {
+    _mm512_mask_ipcvts_ps_epu8(_mm512_setzero_si512(), k, a)
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to unsigned 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 32-bit packed elements of `dst`, zeroing the upper 24 bits.
+///
+/// Rounding is done according to the `ROUNDING` parameter, which can be one of:
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvtps2iubs, ROUNDING = 8)
+)]
+pub fn _mm512_ipcvts_roundps_epu8<const ROUNDING: i32>(a: __m512) -> __m512i {
+    _mm512_mask_ipcvts_roundps_epu8::<ROUNDING>(_mm512_undefined_epi32(), !0, a)
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to unsigned 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 32-bit packed elements of `dst`, zeroing the upper 24 bits, using writemask `k`
+/// (elements are copied from `src` when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the `ROUNDING` parameter, which can be one of:
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvtps2iubs, ROUNDING = 8)
+)]
+pub fn _mm512_mask_ipcvts_roundps_epu8<const ROUNDING: i32>(
+    src: __m512i,
+    k: __mmask16,
+    a: __m512,
+) -> __m512i {
+    static_assert_rounding!(ROUNDING);
+    unsafe { vcvtps2iubs512(a.as_f32x16(), src.as_u32x16(), k, ROUNDING).as_m512i() }
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to unsigned 8-bit integers
+/// using saturation (`NaN` is converted to zero), and stores them in the lower 8 bits of the
+/// corresponding 32-bit packed elements of `dst`, zeroing the upper 24 bits, using zeromask `k`
+/// (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the `ROUNDING` parameter, which can be one of:
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvtps2iubs, ROUNDING = 8)
+)]
+pub fn _mm512_maskz_ipcvts_roundps_epu8<const ROUNDING: i32>(k: __mmask16, a: __m512) -> __m512i {
+    _mm512_mask_ipcvts_roundps_epu8::<ROUNDING>(_mm512_setzero_si512(), k, a)
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation and truncation (`NaN` is converted to zero), and stores them in the lower 8 bits
+/// of the corresponding 16-bit packed elements of `dst`, zeroing the upper 8 bits.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttph2ibs))]
+pub fn _mm_ipcvtts_ph_epi8(a: __m128h) -> __m128i {
+    _mm_mask_ipcvtts_ph_epi8(_mm_undefined_si128(), !0, a)
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation and truncation (`NaN` is converted to zero), and stores them in the lower 8 bits
+/// of the corresponding 16-bit packed elements of `dst`, zeroing the upper 8 bits, using writemask
+/// `k` (elements are copied from `src` when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttph2ibs))]
+pub fn _mm_mask_ipcvtts_ph_epi8(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
+    unsafe { vcvttph2ibs128(a.as_f16x8(), src.as_i16x8(), k).as_m128i() }
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation and truncation (`NaN` is converted to zero), and stores them in the lower 8 bits
+/// of the corresponding 16-bit packed elements of `dst`, zeroing the upper 8 bits, using zeromask
+/// `k` (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttph2ibs))]
+pub fn _mm_maskz_ipcvtts_ph_epi8(k: __mmask8, a: __m128h) -> __m128i {
+    _mm_mask_ipcvtts_ph_epi8(_mm_setzero_si128(), k, a)
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation and truncation (`NaN` is converted to zero), and stores them in the lower 8 bits
+/// of the corresponding 16-bit packed elements of `dst`, zeroing the upper 8 bits.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttph2ibs))]
+pub fn _mm256_ipcvtts_ph_epi8(a: __m256h) -> __m256i {
+    _mm256_mask_ipcvtts_ph_epi8(_mm256_undefined_si256(), !0, a)
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation and truncation (`NaN` is converted to zero), and stores them in the lower 8 bits
+/// of the corresponding 16-bit packed elements of `dst`, zeroing the upper 8 bits, using writemask
+/// `k` (elements are copied from `src` when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttph2ibs))]
+pub fn _mm256_mask_ipcvtts_ph_epi8(src: __m256i, k: __mmask16, a: __m256h) -> __m256i {
+    unsafe { vcvttph2ibs256(a.as_f16x16(), src.as_i16x16(), k).as_m256i() }
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation and truncation (`NaN` is converted to zero), and stores them in the lower 8 bits
+/// of the corresponding 16-bit packed elements of `dst`, zeroing the upper 8 bits, using zeromask
+/// `k` (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttph2ibs))]
+pub fn _mm256_maskz_ipcvtts_ph_epi8(k: __mmask16, a: __m256h) -> __m256i {
+    _mm256_mask_ipcvtts_ph_epi8(_mm256_setzero_si256(), k, a)
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation and truncation (`NaN` is converted to zero), and stores them in the lower 8 bits
+/// of the corresponding 16-bit packed elements of `dst`, zeroing the upper 8 bits.
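+///
+/// # Examples
+///
+/// An illustrative sketch, not a compiled doctest:
+///
+/// ```ignore
+/// // Truncation converts toward zero regardless of MXCSR.RC: -1.9 becomes -1,
+/// // where round-to-nearest would give -2.
+/// let r = _mm512_ipcvtts_ph_epi8(_mm512_set1_ph(-1.9));
+/// ```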
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttph2ibs))]
+pub fn _mm512_ipcvtts_ph_epi8(a: __m512h) -> __m512i {
+    _mm512_mask_ipcvtts_ph_epi8(_mm512_undefined_epi32(), !0, a)
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation and truncation (`NaN` is converted to zero), and stores them in the lower 8 bits
+/// of the corresponding 16-bit packed elements of `dst`, zeroing the upper 8 bits, using writemask
+/// `k` (elements are copied from `src` when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttph2ibs))]
+pub fn _mm512_mask_ipcvtts_ph_epi8(src: __m512i, k: __mmask32, a: __m512h) -> __m512i {
+    _mm512_mask_ipcvtts_roundph_epi8::<_MM_FROUND_CUR_DIRECTION>(src, k, a)
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation and truncation (`NaN` is converted to zero), and stores them in the lower 8 bits
+/// of the corresponding 16-bit packed elements of `dst`, zeroing the upper 8 bits, using zeromask
+/// `k` (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttph2ibs))]
+pub fn _mm512_maskz_ipcvtts_ph_epi8(k: __mmask32, a: __m512h) -> __m512i {
+    _mm512_mask_ipcvtts_ph_epi8(_mm512_setzero_si512(), k, a)
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation and truncation (`NaN` is converted to zero), and stores them in the lower 8 bits
+/// of the corresponding 16-bit packed elements of `dst`, zeroing the upper 8 bits.
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the `SAE` parameter.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvttph2ibs, SAE = 8)
+)]
+pub fn _mm512_ipcvtts_roundph_epi8<const SAE: i32>(a: __m512h) -> __m512i {
+    _mm512_mask_ipcvtts_roundph_epi8::<SAE>(_mm512_undefined_epi32(), !0, a)
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation and truncation (`NaN` is converted to zero), and stores them in the lower 8 bits
+/// of the corresponding 16-bit packed elements of `dst`, zeroing the upper 8 bits, using writemask
+/// `k` (elements are copied from `src` when the corresponding mask bit is not set).
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the `SAE` parameter.
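+///
+/// # Examples
+///
+/// An illustrative sketch, not a compiled doctest:
+///
+/// ```ignore
+/// // Convert with truncation while suppressing floating-point exceptions;
+/// // lanes whose bit in `k` is clear are copied from `src`.
+/// let r = _mm512_mask_ipcvtts_roundph_epi8::<_MM_FROUND_NO_EXC>(src, k, a);
+/// ```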
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation and truncation (`NaN` is converted to zero), and stores them in the lower 8 bits
+/// of the corresponding 16-bit elements of `dst`, zeroing the upper 8 bits, using writemask `k`
+/// (elements are copied from `src` when the corresponding mask bit is not set).
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the `SAE` parameter.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvttph2ibs, SAE = 8)
+)]
+pub fn _mm512_mask_ipcvtts_roundph_epi8<const SAE: i32>(
+    src: __m512i,
+    k: __mmask32,
+    a: __m512h,
+) -> __m512i {
+    static_assert_sae!(SAE);
+    unsafe { vcvttph2ibs512(a.as_f16x32(), src.as_i16x32(), k, SAE).as_m512i() }
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation and truncation (`NaN` is converted to zero), and stores them in the lower 8 bits
+/// of the corresponding 16-bit elements of `dst`, zeroing the upper 8 bits, using zeromask `k`
+/// (elements are zeroed out when the corresponding mask bit is not set).
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the `SAE` parameter.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvttph2ibs, SAE = 8)
+)]
+pub fn _mm512_maskz_ipcvtts_roundph_epi8<const SAE: i32>(k: __mmask32, a: __m512h) -> __m512i {
+    _mm512_mask_ipcvtts_roundph_epi8::<SAE>(_mm512_setzero_si512(), k, a)
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to unsigned 8-bit integers
+/// using saturation and truncation (`NaN` is converted to zero), and stores them in the lower 8 bits
+/// of the corresponding 16-bit elements of `dst`, zeroing the upper 8 bits.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttph2iubs))]
+pub fn _mm_ipcvtts_ph_epu8(a: __m128h) -> __m128i {
+    _mm_mask_ipcvtts_ph_epu8(_mm_undefined_si128(), !0, a)
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to unsigned 8-bit integers
+/// using saturation and truncation (`NaN` is converted to zero), and stores them in the lower 8 bits
+/// of the corresponding 16-bit elements of `dst`, zeroing the upper 8 bits, using writemask `k`
+/// (elements are copied from `src` when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttph2iubs))]
+pub fn _mm_mask_ipcvtts_ph_epu8(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
+    unsafe { vcvttph2iubs128(a.as_f16x8(), src.as_u16x8(), k).as_m128i() }
+}
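The masked forms differ only in what happens to unselected lanes, as the sketch below shows (hypothetical mask and inputs; same nightly/AVX10.2 assumptions as above):

```rust
use core::arch::x86_64::*;

// Mask 0b0000_0101 selects lanes 0 and 2 of the eight f16 lanes.
#[target_feature(enable = "avx10.2")]
unsafe fn masked_forms(src: __m128i, a: __m128h) -> (__m128i, __m128i) {
    let merged = _mm_mask_ipcvtts_ph_epu8(src, 0b0000_0101, a); // unselected lanes copied from src
    let zeroed = _mm_maskz_ipcvtts_ph_epu8(0b0000_0101, a); // unselected lanes zeroed
    (merged, zeroed)
}
```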
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to unsigned 8-bit integers
+/// using saturation and truncation (`NaN` is converted to zero), and stores them in the lower 8 bits
+/// of the corresponding 16-bit elements of `dst`, zeroing the upper 8 bits, using zeromask `k`
+/// (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttph2iubs))]
+pub fn _mm_maskz_ipcvtts_ph_epu8(k: __mmask8, a: __m128h) -> __m128i {
+    _mm_mask_ipcvtts_ph_epu8(_mm_setzero_si128(), k, a)
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to unsigned 8-bit integers
+/// using saturation and truncation (`NaN` is converted to zero), and stores them in the lower 8 bits
+/// of the corresponding 16-bit elements of `dst`, zeroing the upper 8 bits.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttph2iubs))]
+pub fn _mm256_ipcvtts_ph_epu8(a: __m256h) -> __m256i {
+    _mm256_mask_ipcvtts_ph_epu8(_mm256_undefined_si256(), !0, a)
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to unsigned 8-bit integers
+/// using saturation and truncation (`NaN` is converted to zero), and stores them in the lower 8 bits
+/// of the corresponding 16-bit elements of `dst`, zeroing the upper 8 bits, using writemask `k`
+/// (elements are copied from `src` when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttph2iubs))]
+pub fn _mm256_mask_ipcvtts_ph_epu8(src: __m256i, k: __mmask16, a: __m256h) -> __m256i {
+    unsafe { vcvttph2iubs256(a.as_f16x16(), src.as_u16x16(), k).as_m256i() }
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to unsigned 8-bit integers
+/// using saturation and truncation (`NaN` is converted to zero), and stores them in the lower 8 bits
+/// of the corresponding 16-bit elements of `dst`, zeroing the upper 8 bits, using zeromask `k`
+/// (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttph2iubs))]
+pub fn _mm256_maskz_ipcvtts_ph_epu8(k: __mmask16, a: __m256h) -> __m256i {
+    _mm256_mask_ipcvtts_ph_epu8(_mm256_setzero_si256(), k, a)
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to unsigned 8-bit integers
+/// using saturation and truncation (`NaN` is converted to zero), and stores them in the lower 8 bits
+/// of the corresponding 16-bit elements of `dst`, zeroing the upper 8 bits.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttph2iubs))]
+pub fn _mm512_ipcvtts_ph_epu8(a: __m512h) -> __m512i {
+    _mm512_mask_ipcvtts_ph_epu8(_mm512_undefined_epi32(), !0, a)
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to unsigned 8-bit integers
+/// using saturation and truncation (`NaN` is converted to zero), and stores them in the lower 8 bits
+/// of the corresponding 16-bit elements of `dst`, zeroing the upper 8 bits, using writemask `k`
+/// (elements are copied from `src` when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttph2iubs))]
+pub fn _mm512_mask_ipcvtts_ph_epu8(src: __m512i, k: __mmask32, a: __m512h) -> __m512i {
+    _mm512_mask_ipcvtts_roundph_epu8::<_MM_FROUND_CUR_DIRECTION>(src, k, a)
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to unsigned 8-bit integers
+/// using saturation and truncation (`NaN` is converted to zero), and stores them in the lower 8 bits
+/// of the corresponding 16-bit elements of `dst`, zeroing the upper 8 bits, using zeromask `k`
+/// (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttph2iubs))]
+pub fn _mm512_maskz_ipcvtts_ph_epu8(k: __mmask32, a: __m512h) -> __m512i {
+    _mm512_mask_ipcvtts_ph_epu8(_mm512_setzero_si512(), k, a)
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to unsigned 8-bit integers
+/// using saturation and truncation (`NaN` is converted to zero), and stores them in the lower 8 bits
+/// of the corresponding 16-bit elements of `dst`, zeroing the upper 8 bits.
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the `SAE` parameter.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvttph2iubs, SAE = 8)
+)]
+pub fn _mm512_ipcvtts_roundph_epu8<const SAE: i32>(a: __m512h) -> __m512i {
+    _mm512_mask_ipcvtts_roundph_epu8::<SAE>(_mm512_undefined_epi32(), !0, a)
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to unsigned 8-bit integers
+/// using saturation and truncation (`NaN` is converted to zero), and stores them in the lower 8 bits
+/// of the corresponding 16-bit elements of `dst`, zeroing the upper 8 bits, using writemask `k`
+/// (elements are copied from `src` when the corresponding mask bit is not set).
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the `SAE` parameter.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvttph2iubs, SAE = 8)
+)]
+pub fn _mm512_mask_ipcvtts_roundph_epu8<const SAE: i32>(
+    src: __m512i,
+    k: __mmask32,
+    a: __m512h,
+) -> __m512i {
+    static_assert_sae!(SAE);
+    unsafe { vcvttph2iubs512(a.as_f16x32(), src.as_u16x32(), k, SAE).as_m512i() }
+}
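The unsigned variants follow the same lane model with a different clamp range; negative inputs saturate to zero. Again a stand-alone illustrative sketch, using `f32` in place of `f16`:

```rust
/// Models one lane of the `ipcvtts_*_epu8` family: truncate toward zero,
/// saturate to the `u8` range, and convert `NaN` to zero.
fn ipcvtts_lane_model_unsigned(x: f32) -> u16 {
    if x.is_nan() {
        return 0; // NaN -> 0
    }
    let t = x.trunc();
    let v = if t < 0.0 {
        0 // negative values saturate to zero
    } else if t > u8::MAX as f32 {
        u8::MAX // saturate high
    } else {
        t as u8
    };
    v as u16 // upper 8 bits of the 16-bit lane are zero
}

fn main() {
    assert_eq!(ipcvtts_lane_model_unsigned(300.5), 255);
    assert_eq!(ipcvtts_lane_model_unsigned(-7.0), 0);
    assert_eq!(ipcvtts_lane_model_unsigned(9.99), 9);
}
```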
+/// Converts packed half-precision (16-bit) floating-point elements in `a` to unsigned 8-bit integers
+/// using saturation and truncation (`NaN` is converted to zero), and stores them in the lower 8 bits
+/// of the corresponding 16-bit elements of `dst`, zeroing the upper 8 bits, using zeromask `k`
+/// (elements are zeroed out when the corresponding mask bit is not set).
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the `SAE` parameter.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvttph2iubs, SAE = 8)
+)]
+pub fn _mm512_maskz_ipcvtts_roundph_epu8<const SAE: i32>(k: __mmask32, a: __m512h) -> __m512i {
+    _mm512_mask_ipcvtts_roundph_epu8::<SAE>(_mm512_setzero_si512(), k, a)
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation and truncation (`NaN` is converted to zero), and stores them in the lower 8 bits
+/// of the corresponding 32-bit elements of `dst`, zeroing the upper 24 bits.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2ibs))]
+pub fn _mm_ipcvtts_ps_epi8(a: __m128) -> __m128i {
+    _mm_mask_ipcvtts_ps_epi8(_mm_undefined_si128(), !0, a)
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation and truncation (`NaN` is converted to zero), and stores them in the lower 8 bits
+/// of the corresponding 32-bit elements of `dst`, zeroing the upper 24 bits, using writemask `k`
+/// (elements are copied from `src` when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2ibs))]
+pub fn _mm_mask_ipcvtts_ps_epi8(src: __m128i, k: __mmask8, a: __m128) -> __m128i {
+    unsafe { vcvttps2ibs128(a.as_f32x4(), src.as_i32x4(), k).as_m128i() }
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation and truncation (`NaN` is converted to zero), and stores them in the lower 8 bits
+/// of the corresponding 32-bit elements of `dst`, zeroing the upper 24 bits, using zeromask `k`
+/// (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2ibs))]
+pub fn _mm_maskz_ipcvtts_ps_epi8(k: __mmask8, a: __m128) -> __m128i {
+    _mm_mask_ipcvtts_ps_epi8(_mm_setzero_si128(), k, a)
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation and truncation (`NaN` is converted to zero), and stores them in the lower 8 bits
+/// of the corresponding 32-bit elements of `dst`, zeroing the upper 24 bits.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2ibs))]
+pub fn _mm256_ipcvtts_ps_epi8(a: __m256) -> __m256i {
+    _mm256_mask_ipcvtts_ps_epi8(_mm256_undefined_si256(), !0, a)
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation and truncation (`NaN` is converted to zero), and stores them in the lower 8 bits
+/// of the corresponding 32-bit elements of `dst`, zeroing the upper 24 bits, using writemask `k`
+/// (elements are copied from `src` when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2ibs))]
+pub fn _mm256_mask_ipcvtts_ps_epi8(src: __m256i, k: __mmask8, a: __m256) -> __m256i {
+    unsafe { vcvttps2ibs256(a.as_f32x8(), src.as_i32x8(), k).as_m256i() }
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation and truncation (`NaN` is converted to zero), and stores them in the lower 8 bits
+/// of the corresponding 32-bit elements of `dst`, zeroing the upper 24 bits, using zeromask `k`
+/// (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2ibs))]
+pub fn _mm256_maskz_ipcvtts_ps_epi8(k: __mmask8, a: __m256) -> __m256i {
+    _mm256_mask_ipcvtts_ps_epi8(_mm256_setzero_si256(), k, a)
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation and truncation (`NaN` is converted to zero), and stores them in the lower 8 bits
+/// of the corresponding 32-bit elements of `dst`, zeroing the upper 24 bits.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2ibs))]
+pub fn _mm512_ipcvtts_ps_epi8(a: __m512) -> __m512i {
+    _mm512_mask_ipcvtts_ps_epi8(_mm512_undefined_epi32(), !0, a)
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation and truncation (`NaN` is converted to zero), and stores them in the lower 8 bits
+/// of the corresponding 32-bit elements of `dst`, zeroing the upper 24 bits, using writemask `k`
+/// (elements are copied from `src` when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2ibs))]
+pub fn _mm512_mask_ipcvtts_ps_epi8(src: __m512i, k: __mmask16, a: __m512) -> __m512i {
+    _mm512_mask_ipcvtts_roundps_epi8::<_MM_FROUND_CUR_DIRECTION>(src, k, a)
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation and truncation (`NaN` is converted to zero), and stores them in the lower 8 bits
+/// of the corresponding 32-bit elements of `dst`, zeroing the upper 24 bits, using zeromask `k`
+/// (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2ibs))]
+pub fn _mm512_maskz_ipcvtts_ps_epi8(k: __mmask16, a: __m512) -> __m512i {
+    _mm512_mask_ipcvtts_ps_epi8(_mm512_setzero_si512(), k, a)
+}
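Because each result occupies the low byte of a 32-bit lane with the upper 24 bits zeroed, the output of the `ps` forms can be read back directly as packed 32-bit integers. A sketch (hypothetical values; nightly/AVX10.2 assumed):

```rust
use core::arch::x86_64::*;

#[target_feature(enable = "avx10.2")]
unsafe fn to_small_ints(a: __m128) -> [i32; 4] {
    // For a = [1.9, -2.0, 300.0, f32::NAN] the documented semantics give
    // [0x01, 0xFE, 0x7F, 0x00] in the low bytes, i.e. [1, 254, 127, 0]
    // when each zero-extended lane is read back as an i32.
    let r = _mm_ipcvtts_ps_epi8(a);
    let mut out = [0i32; 4];
    _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, r);
    out
}
```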
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation and truncation (`NaN` is converted to zero), and stores them in the lower 8 bits
+/// of the corresponding 32-bit elements of `dst`, zeroing the upper 24 bits.
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the `SAE` parameter.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvttps2ibs, SAE = 8)
+)]
+pub fn _mm512_ipcvtts_roundps_epi8<const SAE: i32>(a: __m512) -> __m512i {
+    _mm512_mask_ipcvtts_roundps_epi8::<SAE>(_mm512_undefined_epi32(), !0, a)
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation and truncation (`NaN` is converted to zero), and stores them in the lower 8 bits
+/// of the corresponding 32-bit elements of `dst`, zeroing the upper 24 bits, using writemask `k`
+/// (elements are copied from `src` when the corresponding mask bit is not set).
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the `SAE` parameter.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvttps2ibs, SAE = 8)
+)]
+pub fn _mm512_mask_ipcvtts_roundps_epi8<const SAE: i32>(
+    src: __m512i,
+    k: __mmask16,
+    a: __m512,
+) -> __m512i {
+    static_assert_sae!(SAE);
+    unsafe { vcvttps2ibs512(a.as_f32x16(), src.as_i32x16(), k, SAE).as_m512i() }
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to signed 8-bit integers
+/// using saturation and truncation (`NaN` is converted to zero), and stores them in the lower 8 bits
+/// of the corresponding 32-bit elements of `dst`, zeroing the upper 24 bits, using zeromask `k`
+/// (elements are zeroed out when the corresponding mask bit is not set).
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the `SAE` parameter.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvttps2ibs, SAE = 8)
+)]
+pub fn _mm512_maskz_ipcvtts_roundps_epi8<const SAE: i32>(k: __mmask16, a: __m512) -> __m512i {
+    _mm512_mask_ipcvtts_roundps_epi8::<SAE>(_mm512_setzero_si512(), k, a)
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to unsigned 8-bit
+/// integers using saturation and truncation (`NaN` is converted to zero), and stores them in the
+/// lower 8 bits of the corresponding 32-bit elements of `dst`, zeroing the upper 24 bits.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2iubs))]
+pub fn _mm_ipcvtts_ps_epu8(a: __m128) -> __m128i {
+    _mm_mask_ipcvtts_ps_epu8(_mm_undefined_si128(), !0, a)
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to unsigned 8-bit
+/// integers using saturation and truncation (`NaN` is converted to zero), and stores them in the
+/// lower 8 bits of the corresponding 32-bit elements of `dst`, zeroing the upper 24 bits, using
+/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2iubs))]
+pub fn _mm_mask_ipcvtts_ps_epu8(src: __m128i, k: __mmask8, a: __m128) -> __m128i {
+    unsafe { vcvttps2iubs128(a.as_f32x4(), src.as_u32x4(), k).as_m128i() }
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to unsigned 8-bit
+/// integers using saturation and truncation (`NaN` is converted to zero), and stores them in the
+/// lower 8 bits of the corresponding 32-bit elements of `dst`, zeroing the upper 24 bits, using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2iubs))]
+pub fn _mm_maskz_ipcvtts_ps_epu8(k: __mmask8, a: __m128) -> __m128i {
+    _mm_mask_ipcvtts_ps_epu8(_mm_setzero_si128(), k, a)
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to unsigned 8-bit
+/// integers using saturation and truncation (`NaN` is converted to zero), and stores them in the
+/// lower 8 bits of the corresponding 32-bit elements of `dst`, zeroing the upper 24 bits.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2iubs))]
+pub fn _mm256_ipcvtts_ps_epu8(a: __m256) -> __m256i {
+    _mm256_mask_ipcvtts_ps_epu8(_mm256_undefined_si256(), !0, a)
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to unsigned 8-bit
+/// integers using saturation and truncation (`NaN` is converted to zero), and stores them in the
+/// lower 8 bits of the corresponding 32-bit elements of `dst`, zeroing the upper 24 bits, using
+/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2iubs))]
+pub fn _mm256_mask_ipcvtts_ps_epu8(src: __m256i, k: __mmask8, a: __m256) -> __m256i {
+    unsafe { vcvttps2iubs256(a.as_f32x8(), src.as_u32x8(), k).as_m256i() }
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to unsigned 8-bit
+/// integers using saturation and truncation (`NaN` is converted to zero), and stores them in the
+/// lower 8 bits of the corresponding 32-bit elements of `dst`, zeroing the upper 24 bits, using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2iubs))]
+pub fn _mm256_maskz_ipcvtts_ps_epu8(k: __mmask8, a: __m256) -> __m256i {
+    _mm256_mask_ipcvtts_ps_epu8(_mm256_setzero_si256(), k, a)
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to unsigned 8-bit
+/// integers using saturation and truncation (`NaN` is converted to zero), and stores them in the
+/// lower 8 bits of the corresponding 32-bit elements of `dst`, zeroing the upper 24 bits.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2iubs))]
+pub fn _mm512_ipcvtts_ps_epu8(a: __m512) -> __m512i {
+    _mm512_mask_ipcvtts_ps_epu8(_mm512_undefined_epi32(), !0, a)
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to unsigned 8-bit
+/// integers using saturation and truncation (`NaN` is converted to zero), and stores them in the
+/// lower 8 bits of the corresponding 32-bit elements of `dst`, zeroing the upper 24 bits, using
+/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2iubs))]
+pub fn _mm512_mask_ipcvtts_ps_epu8(src: __m512i, k: __mmask16, a: __m512) -> __m512i {
+    _mm512_mask_ipcvtts_roundps_epu8::<_MM_FROUND_CUR_DIRECTION>(src, k, a)
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to unsigned 8-bit
+/// integers using saturation and truncation (`NaN` is converted to zero), and stores them in the
+/// lower 8 bits of the corresponding 32-bit elements of `dst`, zeroing the upper 24 bits, using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvttps2iubs))]
+pub fn _mm512_maskz_ipcvtts_ps_epu8(k: __mmask16, a: __m512) -> __m512i {
+    _mm512_mask_ipcvtts_ps_epu8(_mm512_setzero_si512(), k, a)
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to unsigned 8-bit
+/// integers using saturation and truncation (`NaN` is converted to zero), and stores them in the
+/// lower 8 bits of the corresponding 32-bit elements of `dst`, zeroing the upper 24 bits.
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the `SAE` parameter.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvttps2iubs, SAE = 8)
+)]
+pub fn _mm512_ipcvtts_roundps_epu8<const SAE: i32>(a: __m512) -> __m512i {
+    _mm512_mask_ipcvtts_roundps_epu8::<SAE>(_mm512_undefined_epi32(), !0, a)
+}
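An end-to-end sketch of the 512-bit unsigned form, quantizing sixteen floats into byte-range values (illustrative only; nightly/AVX10.2 assumed, with `avx512f` enabled for the load/store intrinsics):

```rust
use core::arch::x86_64::*;

#[target_feature(enable = "avx10.2,avx512f")]
unsafe fn quantize_to_bytes(values: &[f32; 16]) -> [u32; 16] {
    let v = _mm512_loadu_ps(values.as_ptr());
    // Truncate and saturate every lane into 0..=255; _MM_FROUND_NO_EXC
    // suppresses floating-point exception reporting.
    let q = _mm512_ipcvtts_roundps_epu8::<_MM_FROUND_NO_EXC>(v);
    let mut out = [0u32; 16];
    _mm512_storeu_si512(out.as_mut_ptr() as *mut __m512i, q);
    out
}
```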
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to unsigned 8-bit
+/// integers using saturation and truncation (`NaN` is converted to zero), and stores them in the
+/// lower 8 bits of the corresponding 32-bit elements of `dst`, zeroing the upper 24 bits, using
+/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the `SAE` parameter.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvttps2iubs, SAE = 8)
+)]
+pub fn _mm512_mask_ipcvtts_roundps_epu8<const SAE: i32>(
+    src: __m512i,
+    k: __mmask16,
+    a: __m512,
+) -> __m512i {
+    static_assert_sae!(SAE);
+    unsafe { vcvttps2iubs512(a.as_f32x16(), src.as_u32x16(), k, SAE).as_m512i() }
+}
+
+/// Converts packed single-precision (32-bit) floating-point elements in `a` to unsigned 8-bit
+/// integers using saturation and truncation (`NaN` is converted to zero), and stores them in the
+/// lower 8 bits of the corresponding 32-bit elements of `dst`, zeroing the upper 24 bits, using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the `SAE` parameter.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvttps2iubs, SAE = 8)
+)]
+pub fn _mm512_maskz_ipcvtts_roundps_epu8<const SAE: i32>(k: __mmask16, a: __m512) -> __m512i {
+    _mm512_mask_ipcvtts_roundps_epu8::<SAE>(_mm512_setzero_si512(), k, a)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` and `b` to packed
+/// half-precision (16-bit) floating-point elements, and store the results in `dst`.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvt2ps2phx))]
+pub fn _mm_cvtx2ps_ph(a: __m128, b: __m128) -> __m128h {
+    _mm_mask_cvtx2ps_ph(_mm_undefined_ph(), !0, a, b)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` and `b` to packed
+/// half-precision (16-bit) floating-point elements, and store the results in `dst` using writemask
+/// `k` (elements are copied from `src` when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvt2ps2phx))]
+pub fn _mm_mask_cvtx2ps_ph(src: __m128h, k: __mmask8, a: __m128, b: __m128) -> __m128h {
+    unsafe { vcvt2ps2phx128(a.as_f32x4(), b.as_f32x4(), src.as_f16x8(), k).as_m128h() }
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` and `b` to packed
+/// half-precision (16-bit) floating-point elements, and store the results in `dst` using zeromask
+/// `k` (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvt2ps2phx))]
+pub fn _mm_maskz_cvtx2ps_ph(k: __mmask8, a: __m128, b: __m128) -> __m128h {
+    _mm_mask_cvtx2ps_ph(_mm_setzero_ph(), k, a, b)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` and `b` to packed
+/// half-precision (16-bit) floating-point elements, and store the results in `dst`.
+#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvt2ps2phx))] +pub fn _mm256_cvtx2ps_ph(a: __m256, b: __m256) -> __m256h { + _mm256_mask_cvtx2ps_ph(_mm256_undefined_ph(), !0, a, b) +} + +/// Convert packed single-precision (32-bit) floating-point elements in `a` and `b` to packed +/// half-precision (16-bit) floating-point elements, and store the results in `dst` using writemask +/// `k` (elements are copied from `src` when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvt2ps2phx))] +pub fn _mm256_mask_cvtx2ps_ph(src: __m256h, k: __mmask16, a: __m256, b: __m256) -> __m256h { + unsafe { vcvt2ps2phx256(a.as_f32x8(), b.as_f32x8(), src.as_f16x16(), k).as_m256h() } +} + +/// Convert packed single-precision (32-bit) floating-point elements in `a` and `b` to packed +/// half-precision (16-bit) floating-point elements, and store the results in `dst` using zeromask +/// `k` (elements are zeroed out when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvt2ps2phx))] +pub fn _mm256_maskz_cvtx2ps_ph(k: __mmask16, a: __m256, b: __m256) -> __m256h { + _mm256_mask_cvtx2ps_ph(_mm256_setzero_ph(), k, a, b) +} + +/// Convert packed single-precision (32-bit) floating-point elements in `a` and `b` to packed +/// half-precision (16-bit) floating-point elements, and store the results in `dst`. +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvt2ps2phx))] +pub fn _mm512_cvtx2ps_ph(a: __m512, b: __m512) -> __m512h { + _mm512_mask_cvtx2ps_ph(_mm512_undefined_ph(), !0, a, b) +} + +/// Convert packed single-precision (32-bit) floating-point elements in `a` and `b` to packed +/// half-precision (16-bit) floating-point elements, and store the results in `dst` using writemask +/// `k` (elements are copied from `src` when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvt2ps2phx))] +pub fn _mm512_mask_cvtx2ps_ph(src: __m512h, k: __mmask32, a: __m512, b: __m512) -> __m512h { + _mm512_mask_cvtx_round2ps_ph::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) +} + +/// Convert packed single-precision (32-bit) floating-point elements in `a` and `b` to packed +/// half-precision (16-bit) floating-point elements, and store the results in `dst` using zeromask +/// `k` (elements are zeroed out when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvt2ps2phx))] +pub fn _mm512_maskz_cvtx2ps_ph(k: __mmask32, a: __m512, b: __m512) -> __m512h { + _mm512_mask_cvtx2ps_ph(_mm512_setzero_ph(), k, a, b) +} + +/// Convert packed single-precision (32-bit) floating-point elements in `a` and `b` to packed +/// half-precision (16-bit) floating-point elements, and store the results in `dst`. 
+///
+/// Rounding is done according to the `ROUNDING` parameter, which can be one of:
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvt2ps2phx, ROUNDING = 8)
+)]
+pub fn _mm512_cvtx_round2ps_ph<const ROUNDING: i32>(a: __m512, b: __m512) -> __m512h {
+    _mm512_mask_cvtx_round2ps_ph::<ROUNDING>(_mm512_undefined_ph(), !0, a, b)
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` and `b` to packed
+/// half-precision (16-bit) floating-point elements, and store the results in `dst` using writemask
+/// `k` (elements are copied from `src` when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the `ROUNDING` parameter, which can be one of:
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvt2ps2phx, ROUNDING = 8)
+)]
+pub fn _mm512_mask_cvtx_round2ps_ph<const ROUNDING: i32>(
+    src: __m512h,
+    k: __mmask32,
+    a: __m512,
+    b: __m512,
+) -> __m512h {
+    static_assert_rounding!(ROUNDING);
+    unsafe { vcvt2ps2phx512(a.as_f32x16(), b.as_f32x16(), src.as_f16x32(), k, ROUNDING).as_m512h() }
+}
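A short sketch of picking a static rounding mode for the two-source narrowing conversion above (illustrative; nightly/AVX10.2 assumed). Note that the static modes must be combined with `_MM_FROUND_NO_EXC`:

```rust
use core::arch::x86_64::*;

// Narrows the 16 + 16 f32 elements of `a` and `b` into one vector of 32 f16
// values, rounding toward negative infinity instead of using MXCSR.RC.
#[target_feature(enable = "avx10.2")]
unsafe fn narrow_round_down(a: __m512, b: __m512) -> __m512h {
    _mm512_cvtx_round2ps_ph::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a, b)
}
```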
+/// Convert packed single-precision (32-bit) floating-point elements in `a` and `b` to packed
+/// half-precision (16-bit) floating-point elements, and store the results in `dst` using zeromask
+/// `k` (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the `ROUNDING` parameter, which can be one of:
+/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
+/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
+/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
+/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
+/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvt2ps2phx, ROUNDING = 8)
+)]
+pub fn _mm512_maskz_cvtx_round2ps_ph<const ROUNDING: i32>(
+    k: __mmask32,
+    a: __m512,
+    b: __m512,
+) -> __m512h {
+    _mm512_mask_cvtx_round2ps_ph::<ROUNDING>(_mm512_setzero_ph(), k, a, b)
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `b` to packed BF8 (8-bit
+/// E5M2) floating-point elements using unsigned bias values from the lower 8 bits of the
+/// corresponding packed 16-bit element of `a`, and stores the results in `dst`. The upper 64 bits
+/// of `dst` are zeroed out.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtbiasph2bf8))]
+pub fn _mm_cvtbiasph_bf8(a: __m128i, b: __m128h) -> __m128i {
+    _mm_mask_cvtbiasph_bf8(_mm_undefined_si128(), !0, a, b)
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `b` to packed BF8 (8-bit E5M2)
+/// floating-point elements using unsigned bias values from the lower 8 bits of the
+/// corresponding packed 16-bit element of `a`, and stores the results in `dst` using writemask `k`
+/// (elements are copied from `src` when the corresponding mask bit is not set). The upper 64 bits
+/// of `dst` are zeroed out.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtbiasph2bf8))]
+pub fn _mm_mask_cvtbiasph_bf8(src: __m128i, k: __mmask8, a: __m128i, b: __m128h) -> __m128i {
+    unsafe { vcvtbiasph2bf8128(a.as_u8x16(), b.as_f16x8(), src.as_u8x16(), k).as_m128i() }
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `b` to packed BF8 (8-bit
+/// E5M2) floating-point elements using unsigned bias values from the lower 8 bits of the
+/// corresponding packed 16-bit element of `a`, and stores the results in `dst` using zeromask `k`
+/// (elements are zeroed out when the corresponding mask bit is not set). The upper 64 bits of
+/// `dst` are zeroed out.
+#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtbiasph2bf8))] +pub fn _mm256_cvtbiasph_bf8(a: __m256i, b: __m256h) -> __m128i { + _mm256_mask_cvtbiasph_bf8(_mm_undefined_si128(), !0, a, b) +} + +/// Converts packed half-precision (16-bit) floating-point elements in `b` to packed BF8 (8-bit E5M2) +/// floating-point elements using unsigned bias values from the lower 8 bits of the +/// corresponding packed 16-bit element of `a`, and stores the results in `dst` using writemask `k` +/// (elements are copied from `src` when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtbiasph2bf8))] +pub fn _mm256_mask_cvtbiasph_bf8(src: __m128i, k: __mmask16, a: __m256i, b: __m256h) -> __m128i { + unsafe { vcvtbiasph2bf8256(a.as_u8x32(), b.as_f16x16(), src.as_u8x16(), k).as_m128i() } +} + +/// Converts packed half-precision (16-bit) floating-point elements in `b` to packed BF8 (8-bit +/// E5M2) floating-point elements using unsigned bias values from the lower 8 bits of the +/// corresponding packed 16-bit element of `a`, and stores the results in `dst` using zeromask `k` +/// (elements are zeroed out when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtbiasph2bf8))] +pub fn _mm256_maskz_cvtbiasph_bf8(k: __mmask16, a: __m256i, b: __m256h) -> __m128i { + _mm256_mask_cvtbiasph_bf8(_mm_setzero_si128(), k, a, b) +} + +/// Converts packed half-precision (16-bit) floating-point elements in `b` to packed BF8 (8-bit +/// E5M2) floating-point elements using unsigned bias values from the lower 8 bits of the +/// corresponding packed 16-bit element of `a`, and stores the results in `dst`. +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtbiasph2bf8))] +pub fn _mm512_cvtbiasph_bf8(a: __m512i, b: __m512h) -> __m256i { + _mm512_mask_cvtbiasph_bf8(_mm256_undefined_si256(), !0, a, b) +} + +/// Converts packed half-precision (16-bit) floating-point elements in `b` to packed BF8 (8-bit E5M2) +/// floating-point elements using unsigned bias values from the lower 8 bits of the +/// corresponding packed 16-bit element of `a`, and stores the results in `dst` using writemask `k` +/// (elements are copied from `src` when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtbiasph2bf8))] +pub fn _mm512_mask_cvtbiasph_bf8(src: __m256i, k: __mmask32, a: __m512i, b: __m512h) -> __m256i { + unsafe { vcvtbiasph2bf8512(a.as_u8x64(), b.as_f16x32(), src.as_u8x32(), k).as_m256i() } +} + +/// Converts packed half-precision (16-bit) floating-point elements in `b` to packed BF8 (8-bit +/// E5M2) floating-point elements using unsigned bias values from the lower 8 bits of the +/// corresponding packed 16-bit element of `a`, and stores the results in `dst` using zeromask `k` +/// (elements are zeroed out when the corresponding mask bit is not set). 
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtbiasph2bf8))]
+pub fn _mm512_maskz_cvtbiasph_bf8(k: __mmask32, a: __m512i, b: __m512h) -> __m256i {
+    _mm512_mask_cvtbiasph_bf8(_mm256_setzero_si256(), k, a, b)
+}
+
+/// Performs a saturating conversion from packed half-precision (16-bit) floating-point elements in
+/// `b` to packed BF8 (8-bit E5M2) floating-point elements, using unsigned bias values from the
+/// lower 8 bits of the corresponding packed 16-bit element of `a`, and stores the results in `dst`.
+/// The upper 64 bits of `dst` are zeroed out.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtbiasph2bf8s))]
+pub fn _mm_cvts_biasph_bf8(a: __m128i, b: __m128h) -> __m128i {
+    _mm_mask_cvts_biasph_bf8(_mm_undefined_si128(), !0, a, b)
+}
+
+/// Performs a saturating conversion from packed half-precision (16-bit) floating-point elements in
+/// `b` to packed BF8 (8-bit E5M2) floating-point elements, using unsigned bias values from the
+/// lower 8 bits of the corresponding packed 16-bit element of `a`, and stores the results in `dst`
+/// using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
+/// The upper 64 bits of `dst` are zeroed out.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtbiasph2bf8s))]
+pub fn _mm_mask_cvts_biasph_bf8(src: __m128i, k: __mmask8, a: __m128i, b: __m128h) -> __m128i {
+    unsafe { vcvtbiasph2bf8s128(a.as_u8x16(), b.as_f16x8(), src.as_u8x16(), k).as_m128i() }
+}
+
+/// Performs a saturating conversion from packed half-precision (16-bit) floating-point elements in
+/// `b` to packed BF8 (8-bit E5M2) floating-point elements, using unsigned bias values from the
+/// lower 8 bits of the corresponding packed 16-bit element of `a`, and stores the results in `dst`
+/// using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+/// The upper 64 bits of `dst` are zeroed out.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtbiasph2bf8s))]
+pub fn _mm_maskz_cvts_biasph_bf8(k: __mmask8, a: __m128i, b: __m128h) -> __m128i {
+    _mm_mask_cvts_biasph_bf8(_mm_setzero_si128(), k, a, b)
+}
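The bias operand gives software control over how the mantissa is rounded when narrowing to BF8: the low 8 bits of each 16-bit lane of `a` are added into the bits that truncation discards. The sketch below uses a constant half-ULP bias; treating `0x80` as a round-to-nearest-style recipe is our assumption here, not something the patch itself specifies (nightly/AVX10.2 assumed):

```rust
use core::arch::x86_64::*;

#[target_feature(enable = "avx10.2")]
unsafe fn f16_to_bf8_rounded(vals: __m128h) -> __m128i {
    // Assumed recipe: a bias of 0x80 in every lane adds half of the BF8
    // result's ULP before the low mantissa bits are truncated. Per-lane
    // random biases would give stochastic rounding instead.
    let bias = _mm_set1_epi16(0x80);
    // Saturating variant: out-of-range magnitudes clamp to the largest
    // finite BF8 value rather than overflowing.
    _mm_cvts_biasph_bf8(bias, vals)
}
```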
+/// Performs a saturating conversion from packed half-precision (16-bit) floating-point elements in
+/// `b` to packed BF8 (8-bit E5M2) floating-point elements, using unsigned bias values from the
+/// lower 8 bits of the corresponding packed 16-bit element of `a`, and stores the results in `dst`.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtbiasph2bf8s))]
+pub fn _mm256_cvts_biasph_bf8(a: __m256i, b: __m256h) -> __m128i {
+    _mm256_mask_cvts_biasph_bf8(_mm_undefined_si128(), !0, a, b)
+}
+
+/// Performs a saturating conversion from packed half-precision (16-bit) floating-point elements in
+/// `b` to packed BF8 (8-bit E5M2) floating-point elements, using unsigned bias values from the
+/// lower 8 bits of the corresponding packed 16-bit element of `a`, and stores the results in `dst`
+/// using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtbiasph2bf8s))]
+pub fn _mm256_mask_cvts_biasph_bf8(src: __m128i, k: __mmask16, a: __m256i, b: __m256h) -> __m128i {
+    unsafe { vcvtbiasph2bf8s256(a.as_u8x32(), b.as_f16x16(), src.as_u8x16(), k).as_m128i() }
+}
+
+/// Performs a saturating conversion from packed half-precision (16-bit) floating-point elements in
+/// `b` to packed BF8 (8-bit E5M2) floating-point elements, using unsigned bias values from the
+/// lower 8 bits of the corresponding packed 16-bit element of `a`, and stores the results in `dst`
+/// using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtbiasph2bf8s))]
+pub fn _mm256_maskz_cvts_biasph_bf8(k: __mmask16, a: __m256i, b: __m256h) -> __m128i {
+    _mm256_mask_cvts_biasph_bf8(_mm_setzero_si128(), k, a, b)
+}
+
+/// Performs a saturating conversion from packed half-precision (16-bit) floating-point elements in
+/// `b` to packed BF8 (8-bit E5M2) floating-point elements, using unsigned bias values from the
+/// lower 8 bits of the corresponding packed 16-bit element of `a`, and stores the results in `dst`.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtbiasph2bf8s))]
+pub fn _mm512_cvts_biasph_bf8(a: __m512i, b: __m512h) -> __m256i {
+    _mm512_mask_cvts_biasph_bf8(_mm256_undefined_si256(), !0, a, b)
+}
+
+/// Performs a saturating conversion from packed half-precision (16-bit) floating-point elements in
+/// `b` to packed BF8 (8-bit E5M2) floating-point elements, using unsigned bias values from the
+/// lower 8 bits of the corresponding packed 16-bit element of `a`, and stores the results in `dst`
+/// using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtbiasph2bf8s))]
+pub fn _mm512_mask_cvts_biasph_bf8(src: __m256i, k: __mmask32, a: __m512i, b: __m512h) -> __m256i {
+    unsafe { vcvtbiasph2bf8s512(a.as_u8x64(), b.as_f16x32(), src.as_u8x32(), k).as_m256i() }
+}
+
+/// Performs a saturating conversion from packed half-precision (16-bit) floating-point elements in
+/// `b` to packed BF8 (8-bit E5M2) floating-point elements, using unsigned bias values from the
+/// lower 8 bits of the corresponding packed 16-bit element of `a`, and stores the results in `dst`
+/// using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtbiasph2bf8s))]
+pub fn _mm512_maskz_cvts_biasph_bf8(k: __mmask32, a: __m512i, b: __m512h) -> __m256i {
+    _mm512_mask_cvts_biasph_bf8(_mm256_setzero_si256(), k, a, b)
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `b` to packed HF8 (8-bit
+/// E4M3) floating-point elements using unsigned bias values from the lower 8 bits of the
+/// corresponding packed 16-bit element of `a` (infinity is converted to `NaN`), and stores the
+/// results in `dst`. The upper 64 bits of `dst` are zeroed out.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtbiasph2hf8))]
+pub fn _mm_cvtbiasph_hf8(a: __m128i, b: __m128h) -> __m128i {
+    _mm_mask_cvtbiasph_hf8(_mm_undefined_si128(), !0, a, b)
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `b` to packed HF8 (8-bit
+/// E4M3) floating-point elements using unsigned bias values from the lower 8 bits of the
+/// corresponding packed 16-bit element of `a` (infinity is converted to `NaN`), and stores the
+/// results in `dst` using writemask `k` (elements are copied from `src` when the corresponding
+/// mask bit is not set). The upper 64 bits of `dst` are zeroed out.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtbiasph2hf8))]
+pub fn _mm_mask_cvtbiasph_hf8(src: __m128i, k: __mmask8, a: __m128i, b: __m128h) -> __m128i {
+    unsafe { vcvtbiasph2hf8128(a.as_u8x16(), b.as_f16x8(), src.as_u8x16(), k).as_m128i() }
+}
+
+/// Converts packed half-precision (16-bit) floating-point elements in `b` to packed HF8 (8-bit
+/// E4M3) floating-point elements using unsigned bias values from the lower 8 bits of the
+/// corresponding packed 16-bit element of `a` (infinity is converted to `NaN`), and stores the
+/// results in `dst` using zeromask `k` (elements are zeroed out when the corresponding mask bit
+/// is not set). The upper 64 bits of `dst` are zeroed out.
+#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtbiasph2hf8))] +pub fn _mm_maskz_cvtbiasph_hf8(k: __mmask8, a: __m128i, b: __m128h) -> __m128i { + _mm_mask_cvtbiasph_hf8(_mm_setzero_si128(), k, a, b) +} + +/// Converts packed half-precision (16-bit) floating-point elements in `b` to packed HF8 (8-bit +/// E4M3) floating-point elements using unsigned bias values from the lower 8 bits of the +/// corresponding packed 16-bit element of `a` (infinity is converted to `NaN`), and stores the +/// results in `dst`. +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtbiasph2hf8))] +pub fn _mm256_cvtbiasph_hf8(a: __m256i, b: __m256h) -> __m128i { + _mm256_mask_cvtbiasph_hf8(_mm_undefined_si128(), !0, a, b) +} + +/// Converts packed half-precision (16-bit) floating-point elements in `b` to packed HF8 (8-bit +/// E4M3) floating-point elements using unsigned bias values from the lower 8 bits of the +/// corresponding packed 16-bit element of `a` (infinity is converted to `NaN`), and stores the +/// results in `dst` using writemask `k` (elements are copied from `src` when the corresponding +/// mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtbiasph2hf8))] +pub fn _mm256_mask_cvtbiasph_hf8(src: __m128i, k: __mmask16, a: __m256i, b: __m256h) -> __m128i { + unsafe { vcvtbiasph2hf8256(a.as_u8x32(), b.as_f16x16(), src.as_u8x16(), k).as_m128i() } +} + +/// Converts packed half-precision (16-bit) floating-point elements in `b` to packed HF8 (8-bit +/// E4M3) floating-point elements using unsigned bias values from the lower 8 bits of the +/// corresponding packed 16-bit element of `a` (infinity is converted to `NaN`), and stores the +/// results in `dst` using zeromask `k` (elements are zeroed out when the corresponding mask bit +/// is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtbiasph2hf8))] +pub fn _mm256_maskz_cvtbiasph_hf8(k: __mmask16, a: __m256i, b: __m256h) -> __m128i { + _mm256_mask_cvtbiasph_hf8(_mm_setzero_si128(), k, a, b) +} + +/// Converts packed half-precision (16-bit) floating-point elements in `b` to packed HF8 (8-bit +/// E4M3) floating-point elements using unsigned bias values from the lower 8 bits of the +/// corresponding packed 16-bit element of `a` (infinity is converted to `NaN`), and stores the +/// results in `dst`. 
+#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtbiasph2hf8))] +pub fn _mm512_cvtbiasph_hf8(a: __m512i, b: __m512h) -> __m256i { + _mm512_mask_cvtbiasph_hf8(_mm256_undefined_si256(), !0, a, b) +} + +/// Converts packed half-precision (16-bit) floating-point elements in `b` to packed HF8 (8-bit +/// E4M3) floating-point elements using unsigned bias values from the lower 8 bits of the +/// corresponding packed 16-bit element of `a` (infinity is converted to `NaN`), and stores the +/// results in `dst` using writemask `k` (elements are copied from `src` when the corresponding +/// mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtbiasph2hf8))] +pub fn _mm512_mask_cvtbiasph_hf8(src: __m256i, k: __mmask32, a: __m512i, b: __m512h) -> __m256i { + unsafe { vcvtbiasph2hf8512(a.as_u8x64(), b.as_f16x32(), src.as_u8x32(), k).as_m256i() } +} + +/// Converts packed half-precision (16-bit) floating-point elements in `b` to packed HF8 (8-bit +/// E4M3) floating-point elements using unsigned bias values from the lower 8 bits of the +/// corresponding packed 16-bit element of `a` (infinity is converted to `NaN`), and stores the +/// results in `dst` using zeromask `k` (elements are zeroed out when the corresponding mask bit +/// is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtbiasph2hf8))] +pub fn _mm512_maskz_cvtbiasph_hf8(k: __mmask32, a: __m512i, b: __m512h) -> __m256i { + _mm512_mask_cvtbiasph_hf8(_mm256_setzero_si256(), k, a, b) +} + +/// Performs a saturating conversion from packed half-precision (16-bit) floating-point elements in +/// `b` to packed HF8 (8-bit E4M3) floating-point elements, using unsigned bias values from the +/// lower 8 bits of the corresponding packed 16-bit element of `a`, and stores the results in `dst`. +/// The upper 64 bits of `dst` are zeroed out. +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtbiasph2hf8s))] +pub fn _mm_cvts_biasph_hf8(a: __m128i, b: __m128h) -> __m128i { + _mm_mask_cvts_biasph_hf8(_mm_undefined_si128(), !0, a, b) +} + +/// Performs a saturating conversion from packed half-precision (16-bit) floating-point elements in +/// `b` to packed HF8 (8-bit E4M3) floating-point elements, using unsigned bias values from the +/// lower 8 bits of the corresponding packed 16-bit element of `a`, and stores the results in `dst` +/// using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set). +/// The upper 64 bits of `dst` are zeroed out. 
+#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtbiasph2hf8s))] +pub fn _mm_mask_cvts_biasph_hf8(src: __m128i, k: __mmask8, a: __m128i, b: __m128h) -> __m128i { + unsafe { vcvtbiasph2hf8s128(a.as_u8x16(), b.as_f16x8(), src.as_u8x16(), k).as_m128i() } +} + +/// Performs a saturating conversion from packed half-precision (16-bit) floating-point elements in +/// `b` to packed HF8 (8-bit E4M3) floating-point elements, using unsigned bias values from the +/// lower 8 bits of the corresponding packed 16-bit element of `a`, and stores the results in `dst` +/// using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set). +/// The upper 64 bits of `dst` are zeroed out. +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtbiasph2hf8s))] +pub fn _mm_maskz_cvts_biasph_hf8(k: __mmask8, a: __m128i, b: __m128h) -> __m128i { + _mm_mask_cvts_biasph_hf8(_mm_setzero_si128(), k, a, b) +} + +/// Performs a saturating conversion from packed half-precision (16-bit) floating-point elements in +/// `b` to packed HF8 (8-bit E4M3) floating-point elements, using unsigned bias values from the +/// lower 8 bits of the corresponding packed 16-bit element of `a`, and stores the results in `dst`. +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtbiasph2hf8s))] +pub fn _mm256_cvts_biasph_hf8(a: __m256i, b: __m256h) -> __m128i { + _mm256_mask_cvts_biasph_hf8(_mm_undefined_si128(), !0, a, b) +} + +/// Performs a saturating conversion from packed half-precision (16-bit) floating-point elements in +/// `b` to packed HF8 (8-bit E4M3) floating-point elements, using unsigned bias values from the +/// lower 8 bits of the corresponding packed 16-bit element of `a`, and stores the results in `dst` +/// using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtbiasph2hf8s))] +pub fn _mm256_mask_cvts_biasph_hf8(src: __m128i, k: __mmask16, a: __m256i, b: __m256h) -> __m128i { + unsafe { vcvtbiasph2hf8s256(a.as_u8x32(), b.as_f16x16(), src.as_u8x16(), k).as_m128i() } +} + +/// Performs a saturating conversion from packed half-precision (16-bit) floating-point elements in +/// `b` to packed HF8 (8-bit E4M3) floating-point elements, using unsigned bias values from the +/// lower 8 bits of the corresponding packed 16-bit element of `a`, and stores the results in `dst` +/// using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set). 
+#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtbiasph2hf8s))] +pub fn _mm256_maskz_cvts_biasph_hf8(k: __mmask16, a: __m256i, b: __m256h) -> __m128i { + _mm256_mask_cvts_biasph_hf8(_mm_setzero_si128(), k, a, b) +} + +/// Performs a saturating conversion from packed half-precision (16-bit) floating-point elements in +/// `b` to packed HF8 (8-bit E4M3) floating-point elements, using unsigned bias values from the +/// lower 8 bits of the corresponding packed 16-bit element of `a`, and stores the results in `dst`. +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtbiasph2hf8s))] +pub fn _mm512_cvts_biasph_hf8(a: __m512i, b: __m512h) -> __m256i { + _mm512_mask_cvts_biasph_hf8(_mm256_undefined_si256(), !0, a, b) +} + +/// Performs a saturating conversion from packed half-precision (16-bit) floating-point elements in +/// `b` to packed HF8 (8-bit E4M3) floating-point elements, using unsigned bias values from the +/// lower 8 bits of the corresponding packed 16-bit element of `a`, and stores the results in `dst` +/// using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtbiasph2hf8s))] +pub fn _mm512_mask_cvts_biasph_hf8(src: __m256i, k: __mmask32, a: __m512i, b: __m512h) -> __m256i { + unsafe { vcvtbiasph2hf8s512(a.as_u8x64(), b.as_f16x32(), src.as_u8x32(), k).as_m256i() } +} + +/// Performs a saturating conversion from packed half-precision (16-bit) floating-point elements in +/// `b` to packed HF8 (8-bit E4M3) floating-point elements, using unsigned bias values from the +/// lower 8 bits of the corresponding packed 16-bit element of `a`, and stores the results in `dst` +/// using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtbiasph2hf8s))] +pub fn _mm512_maskz_cvts_biasph_hf8(k: __mmask32, a: __m512i, b: __m512h) -> __m256i { + _mm512_mask_cvts_biasph_hf8(_mm256_setzero_si256(), k, a, b) +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` and `b` to packed BF8 +/// (8-bit E5M2) floating-point elements, and store the results in `dst`. +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvt2ph2bf8))] +pub fn _mm_cvt2ph_bf8(a: __m128h, b: __m128h) -> __m128i { + unsafe { vcvt2ph2bf8128(a.as_f16x8(), b.as_f16x8()).as_m128i() } +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` and `b` to packed BF8 +/// (8-bit E5M2) floating-point elements, and store the results in `dst` using writemask `k` +/// (elements are copied from `src` when the corresponding mask bit is not set). 
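+///
+/// The mask is 16 bits wide because `a` and `b` together contribute 16 result
+/// bytes. A shape-only packing sketch: which half receives `a` versus `b` is an
+/// assumption here (mirroring the VCVTNE2PS2BF16 convention of the second
+/// source filling the low half), and `cvt` stands in for the fp16-to-BF8
+/// rounding:
+///
+/// ```
+/// fn pack2(a: [u16; 8], b: [u16; 8], cvt: impl Fn(u16) -> u8) -> [u8; 16] {
+///     core::array::from_fn(|i| if i < 8 { cvt(b[i]) } else { cvt(a[i - 8]) })
+/// }
+/// // With a trivial `cvt`, the low eight bytes come from `b`:
+/// assert_eq!(pack2([1; 8], [2; 8], |x| x as u8)[0], 2);
+/// ```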
+#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvt2ph2bf8))] +pub fn _mm_mask_cvt2ph_bf8(src: __m128i, k: __mmask16, a: __m128h, b: __m128h) -> __m128i { + unsafe { simd_select_bitmask(k, _mm_cvt2ph_bf8(a, b).as_u8x16(), src.as_u8x16()).as_m128i() } +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` and `b` to packed BF8 +/// (8-bit E5M2) floating-point elements, and store the results in `dst` using zeromask `k` +/// (elements are zeroed out when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvt2ph2bf8))] +pub fn _mm_maskz_cvt2ph_bf8(k: __mmask16, a: __m128h, b: __m128h) -> __m128i { + _mm_mask_cvt2ph_bf8(_mm_setzero_si128(), k, a, b) +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` and `b` to packed BF8 +/// (8-bit E5M2) floating-point elements, and store the results in `dst`. +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvt2ph2bf8))] +pub fn _mm256_cvt2ph_bf8(a: __m256h, b: __m256h) -> __m256i { + unsafe { vcvt2ph2bf8256(a.as_f16x16(), b.as_f16x16()).as_m256i() } +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` and `b` to packed BF8 +/// (8-bit E5M2) floating-point elements, and store the results in `dst` using writemask `k` +/// (elements are copied from `src` when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvt2ph2bf8))] +pub fn _mm256_mask_cvt2ph_bf8(src: __m256i, k: __mmask32, a: __m256h, b: __m256h) -> __m256i { + unsafe { simd_select_bitmask(k, _mm256_cvt2ph_bf8(a, b).as_u8x32(), src.as_u8x32()).as_m256i() } +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` and `b` to packed BF8 +/// (8-bit E5M2) floating-point elements, and store the results in `dst` using zeromask `k` +/// (elements are zeroed out when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvt2ph2bf8))] +pub fn _mm256_maskz_cvt2ph_bf8(k: __mmask32, a: __m256h, b: __m256h) -> __m256i { + _mm256_mask_cvt2ph_bf8(_mm256_setzero_si256(), k, a, b) +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` and `b` to packed BF8 +/// (8-bit E5M2) floating-point elements, and store the results in `dst`. +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvt2ph2bf8))] +pub fn _mm512_cvt2ph_bf8(a: __m512h, b: __m512h) -> __m512i { + unsafe { vcvt2ph2bf8512(a.as_f16x32(), b.as_f16x32()).as_m512i() } +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` and `b` to packed BF8 +/// (8-bit E5M2) floating-point elements, and store the results in `dst` using writemask `k` +/// (elements are copied from `src` when the corresponding mask bit is not set). 
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvt2ph2bf8))]
+pub fn _mm512_mask_cvt2ph_bf8(src: __m512i, k: __mmask64, a: __m512h, b: __m512h) -> __m512i {
+    unsafe { simd_select_bitmask(k, _mm512_cvt2ph_bf8(a, b).as_u8x64(), src.as_u8x64()).as_m512i() }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in `a` and `b` to packed BF8
+/// (8-bit E5M2) floating-point elements, and store the results in `dst` using zeromask `k`
+/// (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvt2ph2bf8))]
+pub fn _mm512_maskz_cvt2ph_bf8(k: __mmask64, a: __m512h, b: __m512h) -> __m512i {
+    _mm512_mask_cvt2ph_bf8(_mm512_setzero_si512(), k, a, b)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in `a` and `b` to packed BF8
+/// (8-bit E5M2) floating-point elements using saturation, and store the results in `dst`.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvt2ph2bf8s))]
+pub fn _mm_cvts_2ph_bf8(a: __m128h, b: __m128h) -> __m128i {
+    unsafe { vcvt2ph2bf8s128(a.as_f16x8(), b.as_f16x8()).as_m128i() }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in `a` and `b` to packed BF8
+/// (8-bit E5M2) floating-point elements using saturation, and store the results in `dst` using
+/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvt2ph2bf8s))]
+pub fn _mm_mask_cvts_2ph_bf8(src: __m128i, k: __mmask16, a: __m128h, b: __m128h) -> __m128i {
+    unsafe { simd_select_bitmask(k, _mm_cvts_2ph_bf8(a, b).as_u8x16(), src.as_u8x16()).as_m128i() }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in `a` and `b` to packed BF8
+/// (8-bit E5M2) floating-point elements using saturation, and store the results in `dst` using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvt2ph2bf8s))]
+pub fn _mm_maskz_cvts_2ph_bf8(k: __mmask16, a: __m128h, b: __m128h) -> __m128i {
+    _mm_mask_cvts_2ph_bf8(_mm_setzero_si128(), k, a, b)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in `a` and `b` to packed BF8
+/// (8-bit E5M2) floating-point elements using saturation, and store the results in `dst`.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvt2ph2bf8s))]
+pub fn _mm256_cvts_2ph_bf8(a: __m256h, b: __m256h) -> __m256i {
+    unsafe { vcvt2ph2bf8s256(a.as_f16x16(), b.as_f16x16()).as_m256i() }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in `a` and `b` to packed BF8
+/// (8-bit E5M2) floating-point elements using saturation, and store the results in `dst` using
+/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvt2ph2bf8s))]
+pub fn _mm256_mask_cvts_2ph_bf8(src: __m256i, k: __mmask32, a: __m256h, b: __m256h) -> __m256i {
+    unsafe {
+        simd_select_bitmask(k, _mm256_cvts_2ph_bf8(a, b).as_u8x32(), src.as_u8x32()).as_m256i()
+    }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in `a` and `b` to packed BF8
+/// (8-bit E5M2) floating-point elements using saturation, and store the results in `dst` using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvt2ph2bf8s))]
+pub fn _mm256_maskz_cvts_2ph_bf8(k: __mmask32, a: __m256h, b: __m256h) -> __m256i {
+    _mm256_mask_cvts_2ph_bf8(_mm256_setzero_si256(), k, a, b)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in `a` and `b` to packed BF8
+/// (8-bit E5M2) floating-point elements using saturation, and store the results in `dst`.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvt2ph2bf8s))]
+pub fn _mm512_cvts_2ph_bf8(a: __m512h, b: __m512h) -> __m512i {
+    unsafe { vcvt2ph2bf8s512(a.as_f16x32(), b.as_f16x32()).as_m512i() }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in `a` and `b` to packed BF8
+/// (8-bit E5M2) floating-point elements using saturation, and store the results in `dst` using
+/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvt2ph2bf8s))]
+pub fn _mm512_mask_cvts_2ph_bf8(src: __m512i, k: __mmask64, a: __m512h, b: __m512h) -> __m512i {
+    unsafe {
+        simd_select_bitmask(k, _mm512_cvts_2ph_bf8(a, b).as_u8x64(), src.as_u8x64()).as_m512i()
+    }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in `a` and `b` to packed BF8
+/// (8-bit E5M2) floating-point elements using saturation, and store the results in `dst` using
+/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvt2ph2bf8s))]
+pub fn _mm512_maskz_cvts_2ph_bf8(k: __mmask64, a: __m512h, b: __m512h) -> __m512i {
+    _mm512_mask_cvts_2ph_bf8(_mm512_setzero_si512(), k, a, b)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in `a` and `b` to packed HF8
+/// (8-bit E4M3) floating-point elements (infinity is converted to `NaN`), and store the results in
+/// `dst`.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvt2ph2hf8))]
+pub fn _mm_cvt2ph_hf8(a: __m128h, b: __m128h) -> __m128i {
+    unsafe { vcvt2ph2hf8128(a.as_f16x8(), b.as_f16x8()).as_m128i() }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in `a` and `b` to packed HF8
+/// (8-bit E4M3) floating-point elements (infinity is converted to `NaN`), and store the results in
+/// `dst` using writemask `k` (elements are copied from `src` when the corresponding mask bit is not
+/// set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvt2ph2hf8))]
+pub fn _mm_mask_cvt2ph_hf8(src: __m128i, k: __mmask16, a: __m128h, b: __m128h) -> __m128i {
+    unsafe { simd_select_bitmask(k, _mm_cvt2ph_hf8(a, b).as_u8x16(), src.as_u8x16()).as_m128i() }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in `a` and `b` to packed HF8
+/// (8-bit E4M3) floating-point elements (infinity is converted to `NaN`), and store the results in
+/// `dst` using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvt2ph2hf8))]
+pub fn _mm_maskz_cvt2ph_hf8(k: __mmask16, a: __m128h, b: __m128h) -> __m128i {
+    _mm_mask_cvt2ph_hf8(_mm_setzero_si128(), k, a, b)
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in `a` and `b` to packed HF8
+/// (8-bit E4M3) floating-point elements (infinity is converted to `NaN`), and store the results in
+/// `dst`.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvt2ph2hf8))]
+pub fn _mm256_cvt2ph_hf8(a: __m256h, b: __m256h) -> __m256i {
+    unsafe { vcvt2ph2hf8256(a.as_f16x16(), b.as_f16x16()).as_m256i() }
+}
+
+/// Convert packed half-precision (16-bit) floating-point elements in `a` and `b` to packed HF8
+/// (8-bit E4M3) floating-point elements (infinity is converted to `NaN`), and store the results in
+/// `dst` using writemask `k` (elements are copied from `src` when the corresponding mask bit is not
+/// set).
+#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvt2ph2hf8))] +pub fn _mm256_mask_cvt2ph_hf8(src: __m256i, k: __mmask32, a: __m256h, b: __m256h) -> __m256i { + unsafe { simd_select_bitmask(k, _mm256_cvt2ph_hf8(a, b).as_u8x32(), src.as_u8x32()).as_m256i() } +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` and `b` to packed HF8 +/// (8-bit E4M3) floating-point elements (infinity is converted to `NaN`), and store the results in +/// `dst` using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvt2ph2hf8))] +pub fn _mm256_maskz_cvt2ph_hf8(k: __mmask32, a: __m256h, b: __m256h) -> __m256i { + _mm256_mask_cvt2ph_hf8(_mm256_setzero_si256(), k, a, b) +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` and `b` to packed HF8 +/// (8-bit E4M3) floating-point elements (infinity is converted to `NaN`), and store the results in +/// `dst`. +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvt2ph2hf8))] +pub fn _mm512_cvt2ph_hf8(a: __m512h, b: __m512h) -> __m512i { + unsafe { vcvt2ph2hf8512(a.as_f16x32(), b.as_f16x32()).as_m512i() } +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` and `b` to packed HF8 +/// (8-bit E4M3) floating-point elements (infinity is converted to `NaN`), and store the results in +/// `dst` using writemask `k` (elements are copied from `src` when the corresponding mask bit is not +/// set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvt2ph2hf8))] +pub fn _mm512_mask_cvt2ph_hf8(src: __m512i, k: __mmask64, a: __m512h, b: __m512h) -> __m512i { + unsafe { simd_select_bitmask(k, _mm512_cvt2ph_hf8(a, b).as_u8x64(), src.as_u8x64()).as_m512i() } +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` and `b` to packed HF8 +/// (8-bit E4M3) floating-point elements (infinity is converted to `NaN`), and store the results in +/// `dst` using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvt2ph2hf8))] +pub fn _mm512_maskz_cvt2ph_hf8(k: __mmask64, a: __m512h, b: __m512h) -> __m512i { + _mm512_mask_cvt2ph_hf8(_mm512_setzero_si512(), k, a, b) +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` and `b` to packed HF8 +/// (8-bit E4M3) floating-point elements using saturation, and store the results in `dst`. 
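+///
+/// "Using saturation" means out-of-range inputs clamp to the largest finite HF8
+/// magnitude instead of producing a `NaN`. A scalar sketch of the clamping step
+/// only, assuming the OCP FP8 E4M3 format (largest finite value 448.0):
+///
+/// ```
+/// fn saturate_to_e4m3_range(x: f32) -> f32 {
+///     // NaN passes through; everything else is clamped into the representable range.
+///     if x.is_nan() { x } else { x.clamp(-448.0, 448.0) }
+/// }
+/// assert_eq!(saturate_to_e4m3_range(1.0e6), 448.0);
+/// ```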
+#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvt2ph2hf8s))] +pub fn _mm_cvts_2ph_hf8(a: __m128h, b: __m128h) -> __m128i { + unsafe { vcvt2ph2hf8s128(a.as_f16x8(), b.as_f16x8()).as_m128i() } +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` and `b` to packed HF8 +/// (8-bit E4M3) floating-point elements using saturation, and store the results in `dst` using +/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvt2ph2hf8s))] +pub fn _mm_mask_cvts_2ph_hf8(src: __m128i, k: __mmask16, a: __m128h, b: __m128h) -> __m128i { + unsafe { simd_select_bitmask(k, _mm_cvts_2ph_hf8(a, b).as_u8x16(), src.as_u8x16()).as_m128i() } +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` and `b` to packed HF8 +/// (8-bit E4M3) floating-point elements using saturation, and store the results in `dst` using +/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvt2ph2hf8s))] +pub fn _mm_maskz_cvts_2ph_hf8(k: __mmask16, a: __m128h, b: __m128h) -> __m128i { + _mm_mask_cvts_2ph_hf8(_mm_setzero_si128(), k, a, b) +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` and `b` to packed HF8 +/// (8-bit E4M3) floating-point elements using saturation, and store the results in `dst`. +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvt2ph2hf8s))] +pub fn _mm256_cvts_2ph_hf8(a: __m256h, b: __m256h) -> __m256i { + unsafe { vcvt2ph2hf8s256(a.as_f16x16(), b.as_f16x16()).as_m256i() } +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` and `b` to packed HF8 +/// (8-bit E4M3) floating-point elements using saturation, and store the results in `dst` using +/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvt2ph2hf8s))] +pub fn _mm256_mask_cvts_2ph_hf8(src: __m256i, k: __mmask32, a: __m256h, b: __m256h) -> __m256i { + unsafe { + simd_select_bitmask(k, _mm256_cvts_2ph_hf8(a, b).as_u8x32(), src.as_u8x32()).as_m256i() + } +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` and `b` to packed HF8 +/// (8-bit E4M3) floating-point elements using saturation, and store the results in `dst` using +/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set). 
+#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvt2ph2hf8s))] +pub fn _mm256_maskz_cvts_2ph_hf8(k: __mmask32, a: __m256h, b: __m256h) -> __m256i { + _mm256_mask_cvts_2ph_hf8(_mm256_setzero_si256(), k, a, b) +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` and `b` to packed HF8 +/// (8-bit E4M3) floating-point elements using saturation, and store the results in `dst`. +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvt2ph2hf8s))] +pub fn _mm512_cvts_2ph_hf8(a: __m512h, b: __m512h) -> __m512i { + unsafe { vcvt2ph2hf8s512(a.as_f16x32(), b.as_f16x32()).as_m512i() } +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` and `b` to packed HF8 +/// (8-bit E4M3) floating-point elements using saturation, and store the results in `dst` using +/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvt2ph2hf8s))] +pub fn _mm512_mask_cvts_2ph_hf8(src: __m512i, k: __mmask64, a: __m512h, b: __m512h) -> __m512i { + unsafe { + simd_select_bitmask(k, _mm512_cvts_2ph_hf8(a, b).as_u8x64(), src.as_u8x64()).as_m512i() + } +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` and `b` to packed HF8 +/// (8-bit E4M3) floating-point elements using saturation, and store the results in `dst` using +/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvt2ph2hf8s))] +pub fn _mm512_maskz_cvts_2ph_hf8(k: __mmask64, a: __m512h, b: __m512h) -> __m512i { + _mm512_mask_cvts_2ph_hf8(_mm512_setzero_si512(), k, a, b) +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` to packed BF8 (8-bit E5M2) +/// floating-point elements, and store the results in `dst`. The upper 64 bits of `dst` are zeroed +/// out. +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2bf8))] +pub fn _mm_cvtph_bf8(a: __m128h) -> __m128i { + _mm_mask_cvtph_bf8(_mm_undefined_si128(), !0, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` to packed BF8 (8-bit E5M2) +/// floating-point elements, and store the results in `dst` using writemask `k` (elements are copied +/// from `src` when the corresponding mask bit is not set). The upper 64 bits of `dst` are zeroed out. 
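+///
+/// Only eight result bytes are produced from the eight fp16 inputs, which is why
+/// the high half of the 128-bit destination is always zero. A shape-only sketch
+/// (`cvt` stands in for the actual conversion):
+///
+/// ```
+/// fn narrow_to_low_half(a: [u16; 8], cvt: impl Fn(u16) -> u8) -> [u8; 16] {
+///     core::array::from_fn(|i| if i < 8 { cvt(a[i]) } else { 0 })
+/// }
+/// assert_eq!(&narrow_to_low_half([0x3C00; 8], |_| 0xAA)[8..], [0; 8]);
+/// ```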
+#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2bf8))] +pub fn _mm_mask_cvtph_bf8(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { + unsafe { vcvtph2bf8128(a.as_f16x8(), src.as_u8x16(), k).as_m128i() } +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` to packed BF8 (8-bit E5M2) +/// floating-point elements, and store the results in `dst` using zeromask `k` (elements are zeroed +/// out when the corresponding mask bit is not set). The upper 64 bits of `dst` are zeroed out. +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2bf8))] +pub fn _mm_maskz_cvtph_bf8(k: __mmask8, a: __m128h) -> __m128i { + _mm_mask_cvtph_bf8(_mm_setzero_si128(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` to packed BF8 (8-bit E5M2) +/// floating-point elements, and store the results in `dst`. +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2bf8))] +pub fn _mm256_cvtph_bf8(a: __m256h) -> __m128i { + _mm256_mask_cvtph_bf8(_mm_undefined_si128(), !0, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` to packed BF8 (8-bit E5M2) +/// floating-point elements, and store the results in `dst` using writemask `k` (elements are copied +/// from `src` when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2bf8))] +pub fn _mm256_mask_cvtph_bf8(src: __m128i, k: __mmask16, a: __m256h) -> __m128i { + unsafe { vcvtph2bf8256(a.as_f16x16(), src.as_u8x16(), k).as_m128i() } +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` to packed BF8 (8-bit E5M2) +/// floating-point elements, and store the results in `dst` using zeromask `k` (elements are zeroed +/// out when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2bf8))] +pub fn _mm256_maskz_cvtph_bf8(k: __mmask16, a: __m256h) -> __m128i { + _mm256_mask_cvtph_bf8(_mm_setzero_si128(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` to packed BF8 (8-bit E5M2) +/// floating-point elements, and store the results in `dst`. +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2bf8))] +pub fn _mm512_cvtph_bf8(a: __m512h) -> __m256i { + _mm512_mask_cvtph_bf8(_mm256_undefined_si256(), !0, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` to packed BF8 (8-bit E5M2) +/// floating-point elements, and store the results in `dst` using writemask `k` (elements are copied +/// from `src` when the corresponding mask bit is not set). 
+#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2bf8))] +pub fn _mm512_mask_cvtph_bf8(src: __m256i, k: __mmask32, a: __m512h) -> __m256i { + unsafe { vcvtph2bf8512(a.as_f16x32(), src.as_u8x32(), k).as_m256i() } +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` to packed BF8 (8-bit E5M2) +/// floating-point elements, and store the results in `dst` using zeromask `k` (elements are zeroed +/// out when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2bf8))] +pub fn _mm512_maskz_cvtph_bf8(k: __mmask32, a: __m512h) -> __m256i { + _mm512_mask_cvtph_bf8(_mm256_setzero_si256(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` to packed BF8 (8-bit E5M2) +/// floating-point elements using saturation, and store the results in `dst`. The upper 64 bits of +/// `dst` are zeroed out. +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2bf8s))] +pub fn _mm_cvts_ph_bf8(a: __m128h) -> __m128i { + _mm_mask_cvts_ph_bf8(_mm_undefined_si128(), !0, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` to packed BF8 (8-bit E5M2) +/// floating-point elements using saturation, and store the results in `dst` using writemask `k` +/// (elements are copied from `src` when the corresponding mask bit is not set). The upper 64 bits +/// of `dst` are zeroed out. +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2bf8s))] +pub fn _mm_mask_cvts_ph_bf8(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { + unsafe { vcvtph2bf8s128(a.as_f16x8(), src.as_u8x16(), k).as_m128i() } +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` to packed BF8 (8-bit E5M2) +/// floating-point elements using saturation, and store the results in `dst` using zeromask `k` +/// (elements are zeroed out when the corresponding mask bit is not set). The upper 64 bits of +/// `dst` are zeroed out. +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2bf8s))] +pub fn _mm_maskz_cvts_ph_bf8(k: __mmask8, a: __m128h) -> __m128i { + _mm_mask_cvts_ph_bf8(_mm_setzero_si128(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` to packed BF8 (8-bit E5M2) +/// floating-point elements using saturation, and store the results in `dst`. 
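+///
+/// As in the HF8 variants, saturation clamps out-of-range inputs; for BF8 the
+/// bound is the largest finite E5M2 magnitude, 57344.0 under the OCP FP8
+/// definition (an assumption of this sketch):
+///
+/// ```
+/// fn saturate_to_e5m2_range(x: f32) -> f32 {
+///     if x.is_nan() { x } else { x.clamp(-57344.0, 57344.0) }
+/// }
+/// assert_eq!(saturate_to_e5m2_range(-1.0e9), -57344.0);
+/// ```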
+#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2bf8s))] +pub fn _mm256_cvts_ph_bf8(a: __m256h) -> __m128i { + _mm256_mask_cvts_ph_bf8(_mm_undefined_si128(), !0, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` to packed BF8 (8-bit E5M2) +/// floating-point elements using saturation, and store the results in `dst` using writemask `k` +/// (elements are copied from `src` when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2bf8s))] +pub fn _mm256_mask_cvts_ph_bf8(src: __m128i, k: __mmask16, a: __m256h) -> __m128i { + unsafe { vcvtph2bf8s256(a.as_f16x16(), src.as_u8x16(), k).as_m128i() } +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` to packed BF8 (8-bit E5M2) +/// floating-point elements using saturation, and store the results in `dst` using zeromask `k` +/// (elements are zeroed out when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2bf8s))] +pub fn _mm256_maskz_cvts_ph_bf8(k: __mmask16, a: __m256h) -> __m128i { + _mm256_mask_cvts_ph_bf8(_mm_setzero_si128(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` to packed BF8 (8-bit E5M2) +/// floating-point elements using saturation, and store the results in `dst`. +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2bf8s))] +pub fn _mm512_cvts_ph_bf8(a: __m512h) -> __m256i { + _mm512_mask_cvts_ph_bf8(_mm256_undefined_si256(), !0, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` to packed BF8 (8-bit E5M2) +/// floating-point elements using saturation, and store the results in `dst` using writemask `k` +/// (elements are copied from `src` when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2bf8s))] +pub fn _mm512_mask_cvts_ph_bf8(src: __m256i, k: __mmask32, a: __m512h) -> __m256i { + unsafe { vcvtph2bf8s512(a.as_f16x32(), src.as_u8x32(), k).as_m256i() } +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` to packed BF8 (8-bit E5M2) +/// floating-point elements using saturation, and store the results in `dst` using zeromask `k` +/// (elements are zeroed out when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2bf8s))] +pub fn _mm512_maskz_cvts_ph_bf8(k: __mmask32, a: __m512h) -> __m256i { + _mm512_mask_cvts_ph_bf8(_mm256_setzero_si256(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` to packed HF8 (8-bit E4M3) +/// floating-point elements (infinity is converted to `NaN`), and store the results in `dst`. The +/// upper 64 bits of `dst` are zeroed out. 
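+///
+/// E4M3 reserves no encoding for infinity, which is why this family documents
+/// infinity as converting to `NaN`. A sketch of that special case alone; the
+/// `0x7F` NaN pattern is the OCP E4M3 encoding and is an assumption here:
+///
+/// ```
+/// fn hf8_special_case(x: f32) -> Option<u8> {
+///     // S.1111.111 is the E4M3 NaN pattern; the sign lands in bit 7.
+///     (x.is_infinite() || x.is_nan()).then(|| 0x7F | ((x.is_sign_negative() as u8) << 7))
+/// }
+/// assert_eq!(hf8_special_case(f32::NEG_INFINITY), Some(0xFF));
+/// ```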
+#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2hf8))] +pub fn _mm_cvtph_hf8(a: __m128h) -> __m128i { + _mm_mask_cvtph_hf8(_mm_undefined_si128(), !0, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` to packed HF8 (8-bit E4M3) +/// floating-point elements (infinity is converted to `NaN`), and store the results in `dst` using +/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set). The +/// upper 64 bits of `dst` are zeroed out. +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2hf8))] +pub fn _mm_mask_cvtph_hf8(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { + unsafe { vcvtph2hf8128(a.as_f16x8(), src.as_u8x16(), k).as_m128i() } +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` to packed HF8 (8-bit E4M3) +/// floating-point elements (infinity is converted to `NaN`), and store the results in `dst` using +/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set). The upper +/// 64 bits of `dst` are zeroed out. +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2hf8))] +pub fn _mm_maskz_cvtph_hf8(k: __mmask8, a: __m128h) -> __m128i { + _mm_mask_cvtph_hf8(_mm_setzero_si128(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` to packed HF8 (8-bit E4M3) +/// floating-point elements (infinity is converted to `NaN`), and store the results in `dst`. +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2hf8))] +pub fn _mm256_cvtph_hf8(a: __m256h) -> __m128i { + _mm256_mask_cvtph_hf8(_mm_undefined_si128(), !0, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` to packed HF8 (8-bit E4M3) +/// floating-point elements (infinity is converted to `NaN`), and store the results in `dst` using +/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2hf8))] +pub fn _mm256_mask_cvtph_hf8(src: __m128i, k: __mmask16, a: __m256h) -> __m128i { + unsafe { vcvtph2hf8256(a.as_f16x16(), src.as_u8x16(), k).as_m128i() } +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` to packed HF8 (8-bit E4M3) +/// floating-point elements (infinity is converted to `NaN`), and store the results in `dst` using +/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set). 
+#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2hf8))] +pub fn _mm256_maskz_cvtph_hf8(k: __mmask16, a: __m256h) -> __m128i { + _mm256_mask_cvtph_hf8(_mm_setzero_si128(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` to packed HF8 (8-bit E4M3) +/// floating-point elements (infinity is converted to `NaN`), and store the results in `dst`. +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2hf8))] +pub fn _mm512_cvtph_hf8(a: __m512h) -> __m256i { + _mm512_mask_cvtph_hf8(_mm256_undefined_si256(), !0, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` to packed HF8 (8-bit E4M3) +/// floating-point elements (infinity is converted to `NaN`), and store the results in `dst` using +/// writemask `k` (elements are copied from `src` when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2hf8))] +pub fn _mm512_mask_cvtph_hf8(src: __m256i, k: __mmask32, a: __m512h) -> __m256i { + unsafe { vcvtph2hf8512(a.as_f16x32(), src.as_u8x32(), k).as_m256i() } +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` to packed HF8 (8-bit E4M3) +/// floating-point elements (infinity is converted to `NaN`), and store the results in `dst` using +/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2hf8))] +pub fn _mm512_maskz_cvtph_hf8(k: __mmask32, a: __m512h) -> __m256i { + _mm512_mask_cvtph_hf8(_mm256_setzero_si256(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` to packed HF8 (8-bit E4M3) +/// floating-point elements using saturation, and store the results in `dst`. The upper 64 bits of +/// `dst` are zeroed out. +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2hf8s))] +pub fn _mm_cvts_ph_hf8(a: __m128h) -> __m128i { + _mm_mask_cvts_ph_hf8(_mm_undefined_si128(), !0, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` to packed HF8 (8-bit E4M3) +/// floating-point elements using saturation, and store the results in `dst` using writemask `k` +/// (elements are copied from `src` when the corresponding mask bit is not set). The upper 64 bits +/// of `dst` are zeroed out. 
+#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2hf8s))] +pub fn _mm_mask_cvts_ph_hf8(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { + unsafe { vcvtph2hf8s128(a.as_f16x8(), src.as_u8x16(), k).as_m128i() } +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` to packed HF8 (8-bit E4M3) +/// floating-point elements using saturation, and store the results in `dst` using zeromask `k` +/// (elements are zeroed out when the corresponding mask bit is not set). The upper 64 bits of +/// `dst` are zeroed out. +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2hf8s))] +pub fn _mm_maskz_cvts_ph_hf8(k: __mmask8, a: __m128h) -> __m128i { + _mm_mask_cvts_ph_hf8(_mm_setzero_si128(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` to packed HF8 (8-bit E4M3) +/// floating-point elements using saturation, and store the results in `dst`. +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2hf8s))] +pub fn _mm256_cvts_ph_hf8(a: __m256h) -> __m128i { + _mm256_mask_cvts_ph_hf8(_mm_undefined_si128(), !0, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` to packed HF8 (8-bit E4M3) +/// floating-point elements using saturation, and store the results in `dst` using writemask `k` +/// (elements are copied from `src` when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2hf8s))] +pub fn _mm256_mask_cvts_ph_hf8(src: __m128i, k: __mmask16, a: __m256h) -> __m128i { + unsafe { vcvtph2hf8s256(a.as_f16x16(), src.as_u8x16(), k).as_m128i() } +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` to packed HF8 (8-bit E4M3) +/// floating-point elements using saturation, and store the results in `dst` using zeromask `k` +/// (elements are zeroed out when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2hf8s))] +pub fn _mm256_maskz_cvts_ph_hf8(k: __mmask16, a: __m256h) -> __m128i { + _mm256_mask_cvts_ph_hf8(_mm_setzero_si128(), k, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` to packed HF8 (8-bit E4M3) +/// floating-point elements using saturation, and store the results in `dst`. +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2hf8s))] +pub fn _mm512_cvts_ph_hf8(a: __m512h) -> __m256i { + _mm512_mask_cvts_ph_hf8(_mm256_undefined_si256(), !0, a) +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` to packed HF8 (8-bit E4M3) +/// floating-point elements using saturation, and store the results in `dst` using writemask `k` +/// (elements are copied from `src` when the corresponding mask bit is not set). 
+#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2hf8s))] +pub fn _mm512_mask_cvts_ph_hf8(src: __m256i, k: __mmask32, a: __m512h) -> __m256i { + unsafe { vcvtph2hf8s512(a.as_f16x32(), src.as_u8x32(), k).as_m256i() } +} + +/// Convert packed half-precision (16-bit) floating-point elements in `a` to packed HF8 (8-bit E4M3) +/// floating-point elements using saturation, and store the results in `dst` using zeromask `k` +/// (elements are zeroed out when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvtph2hf8s))] +pub fn _mm512_maskz_cvts_ph_hf8(k: __mmask32, a: __m512h) -> __m256i { + _mm512_mask_cvts_ph_hf8(_mm256_setzero_si256(), k, a) +} + +/// Converts packed BF8 (8-bit E5M2) floating-point elements from the lower 64 bits of `a` to packed +/// half-precision (16-bit) floating-point elements, and stores the results in `dst`. +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(test, assert_instr(vpunpcklbw))] +pub fn _mm_cvtbf8_ph(a: __m128i) -> __m128h { + _mm_castsi128_ph(_mm_slli_epi16::<8>(_mm_cvtepi8_epi16(a))) +} + +/// Converts packed BF8 (8-bit E5M2) floating-point elements from the lower 64 bits of `a` to packed +/// half-precision (16-bit) floating-point elements, and stores the results in `dst` using writemask +/// `k` (elements are copied from `src` when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(test, assert_instr(vpunpcklbw))] +pub fn _mm_mask_cvtbf8_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h { + _mm_castsi128_ph(_mm_mask_slli_epi16::<8>( + _mm_castph_si128(src), + k, + _mm_cvtepi8_epi16(a), + )) +} + +/// Converts packed BF8 (8-bit E5M2) floating-point elements from the lower 64 bits of `a` to packed +/// half-precision (16-bit) floating-point elements, and stores the results in `dst` using zeromask +/// `k` (elements are zeroed out when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(test, assert_instr(vpunpcklbw))] +pub fn _mm_maskz_cvtbf8_ph(k: __mmask8, a: __m128i) -> __m128h { + _mm_castsi128_ph(_mm_maskz_slli_epi16::<8>(k, _mm_cvtepi8_epi16(a))) +} + +/// Converts packed BF8 (8-bit E5M2) floating-point elements in `a` to packed half-precision (16-bit) +/// floating-point elements, and stores the results in `dst`. +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(test, assert_instr(vpsllw))] +pub fn _mm256_cvtbf8_ph(a: __m128i) -> __m256h { + _mm256_castsi256_ph(_mm256_slli_epi16::<8>(_mm256_cvtepi8_epi16(a))) +} + +/// Converts packed BF8 (8-bit E5M2) floating-point elements in `a` to packed half-precision (16-bit) +/// floating-point elements, and stores the results in `dst` using writemask `k` (elements are copied +/// from `src` when the corresponding mask bit is not set). 
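+///
+/// The per-lane widening here is the plain byte shift used in the
+/// implementations above: a BF8 value is exactly the high byte of the fp16 bit
+/// pattern it represents, so placing it there and zeroing the low byte
+/// reconstructs the fp16 without loss:
+///
+/// ```
+/// fn bf8_to_f16_bits(b: u8) -> u16 {
+///     // Mirrors sign-extend-then-shift: the extension bits are shifted out,
+///     // leaving the BF8 byte in the high byte and zeros below it.
+///     ((b as i8) as i16 as u16) << 8
+/// }
+/// assert_eq!(bf8_to_f16_bits(0x3C), 0x3C00); // BF8 1.0 widens to fp16 1.0
+/// ```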
+#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(test, assert_instr(vpsllw))] +pub fn _mm256_mask_cvtbf8_ph(src: __m256h, k: __mmask16, a: __m128i) -> __m256h { + _mm256_castsi256_ph(_mm256_mask_slli_epi16::<8>( + _mm256_castph_si256(src), + k, + _mm256_cvtepi8_epi16(a), + )) +} + +/// Converts packed BF8 (8-bit E5M2) floating-point elements in `a` to packed half-precision (16-bit) +/// floating-point elements, and stores the results in `dst` using zeromask `k` (elements are zeroed +/// out when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(test, assert_instr(vpsllw))] +pub fn _mm256_maskz_cvtbf8_ph(k: __mmask16, a: __m128i) -> __m256h { + _mm256_castsi256_ph(_mm256_maskz_slli_epi16::<8>(k, _mm256_cvtepi8_epi16(a))) +} + +/// Converts packed BF8 (8-bit E5M2) floating-point elements in `a` to packed half-precision (16-bit) +/// floating-point elements, and stores the results in `dst`. +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(test, assert_instr(vpsllw))] +pub fn _mm512_cvtbf8_ph(a: __m256i) -> __m512h { + _mm512_castsi512_ph(_mm512_slli_epi16::<8>(_mm512_cvtepi8_epi16(a))) +} + +/// Converts packed BF8 (8-bit E5M2) floating-point elements in `a` to packed half-precision (16-bit) +/// floating-point elements, and stores the results in `dst` using writemask `k` (elements are copied +/// from `src` when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(test, assert_instr(vpsllw))] +pub fn _mm512_mask_cvtbf8_ph(src: __m512h, k: __mmask32, a: __m256i) -> __m512h { + _mm512_castsi512_ph(_mm512_mask_slli_epi16::<8>( + _mm512_castph_si512(src), + k, + _mm512_cvtepi8_epi16(a), + )) +} + +/// Converts packed BF8 (8-bit E5M2) floating-point elements in `a` to packed half-precision (16-bit) +/// floating-point elements, and stores the results in `dst` using zeromask `k` (elements are zeroed +/// out when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(test, assert_instr(vpsllw))] +pub fn _mm512_maskz_cvtbf8_ph(k: __mmask32, a: __m256i) -> __m512h { + _mm512_castsi512_ph(_mm512_maskz_slli_epi16::<8>(k, _mm512_cvtepi8_epi16(a))) +} + +/// Converts packed HF8 (8-bit E4M3) floating-point elements from the lower 64 bits of `a` to packed +/// half-precision (16-bit) floating-point elements, and stores the results in `dst`. +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvthf82ph))] +pub fn _mm_cvthf8_ph(a: __m128i) -> __m128h { + _mm_mask_cvthf8_ph(_mm_undefined_ph(), !0, a) +} + +/// Converts packed HF8 (8-bit E4M3) floating-point elements from the lower 64 bits of `a` to packed +/// half-precision (16-bit) floating-point elements, and stores the results in `dst` using writemask +/// `k` (elements are copied from `src` when the corresponding mask bit is not set). 
+#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvthf82ph))] +pub fn _mm_mask_cvthf8_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h { + unsafe { vcvthf82ph128(a.as_u8x16(), src.as_f16x8(), k).as_m128h() } +} + +/// Converts packed HF8 (8-bit E4M3) floating-point elements from the lower 64 bits of `a` to packed +/// half-precision (16-bit) floating-point elements, and stores the results in `dst` using zeromask +/// `k` (elements are zeroed out when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvthf82ph))] +pub fn _mm_maskz_cvthf8_ph(k: __mmask8, a: __m128i) -> __m128h { + _mm_mask_cvthf8_ph(_mm_setzero_ph(), k, a) +} + +/// Converts packed HF8 (8-bit E4M3) floating-point elements in `a` to packed half-precision (16-bit) +/// floating-point elements, and stores the results in `dst`. +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvthf82ph))] +pub fn _mm256_cvthf8_ph(a: __m128i) -> __m256h { + _mm256_mask_cvthf8_ph(_mm256_undefined_ph(), !0, a) +} + +/// Converts packed HF8 (8-bit E4M3) floating-point elements in `a` to packed half-precision (16-bit) +/// floating-point elements, and stores the results in `dst` using writemask `k` (elements are copied +/// from `src` when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvthf82ph))] +pub fn _mm256_mask_cvthf8_ph(src: __m256h, k: __mmask16, a: __m128i) -> __m256h { + unsafe { vcvthf82ph256(a.as_u8x16(), src.as_f16x16(), k).as_m256h() } +} + +/// Converts packed HF8 (8-bit E4M3) floating-point elements in `a` to packed half-precision (16-bit) +/// floating-point elements, and stores the results in `dst` using zeromask `k` (elements are zeroed +/// out when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvthf82ph))] +pub fn _mm256_maskz_cvthf8_ph(k: __mmask16, a: __m128i) -> __m256h { + _mm256_mask_cvthf8_ph(_mm256_setzero_ph(), k, a) +} + +/// Converts packed HF8 (8-bit E4M3) floating-point elements in `a` to packed half-precision (16-bit) +/// floating-point elements, and stores the results in `dst`. +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvthf82ph))] +pub fn _mm512_cvthf8_ph(a: __m256i) -> __m512h { + _mm512_mask_cvthf8_ph(_mm512_undefined_ph(), !0, a) +} + +/// Converts packed HF8 (8-bit E4M3) floating-point elements in `a` to packed half-precision (16-bit) +/// floating-point elements, and stores the results in `dst` using writemask `k` (elements are copied +/// from `src` when the corresponding mask bit is not set). 
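+///
+/// Unlike BF8, an HF8 value is not a truncated fp16 bit pattern (E4M3 uses a
+/// different exponent width and bias), so widening needs this dedicated
+/// instruction rather than a byte shift. A scalar decode sketch, assuming the
+/// OCP FP8 E4M3 layout (exponent bias 7; NaN handling omitted):
+///
+/// ```
+/// fn hf8_e4m3_to_f32(x: u8) -> f32 {
+///     let sign = if x & 0x80 != 0 { -1.0 } else { 1.0 };
+///     let exp = ((x >> 3) & 0xF) as i32;
+///     let man = (x & 0x7) as f32 / 8.0;
+///     if exp == 0 {
+///         sign * man * 2f32.powi(-6) // subnormal range
+///     } else {
+///         sign * (1.0 + man) * 2f32.powi(exp - 7)
+///     }
+/// }
+/// assert_eq!(hf8_e4m3_to_f32(0x38), 1.0); // sign 0, exp 0b0111, mantissa 0
+/// ```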
+#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvthf82ph))] +pub fn _mm512_mask_cvthf8_ph(src: __m512h, k: __mmask32, a: __m256i) -> __m512h { + unsafe { vcvthf82ph512(a.as_u8x32(), src.as_f16x32(), k).as_m512h() } +} + +/// Converts packed HF8 (8-bit E4M3) floating-point elements in `a` to packed half-precision (16-bit) +/// floating-point elements, and stores the results in `dst` using zeromask `k` (elements are zeroed +/// out when the corresponding mask bit is not set). +#[inline] +#[target_feature(enable = "avx10.2")] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vcvthf82ph))] +pub fn _mm512_maskz_cvthf8_ph(k: __mmask32, a: __m256i) -> __m512h { + _mm512_mask_cvthf8_ph(_mm512_setzero_ph(), k, a) +} + +#[allow(improper_ctypes)] +unsafe extern "unadjusted" { + #[link_name = "llvm.x86.avx10.vmpsadbw.512"] + fn vmpsadbw512(a: u8x64, b: u8x64, imm8: i8) -> u16x32; + + #[link_name = "llvm.x86.avx10.vdpphps.128"] + fn vdpphps128(src: f32x4, a: f16x8, b: f16x8) -> f32x4; + #[link_name = "llvm.x86.avx10.vdpphps.256"] + fn vdpphps256(src: f32x8, a: f16x16, b: f16x16) -> f32x8; + #[link_name = "llvm.x86.avx10.vdpphps.512"] + fn vdpphps512(src: f32x16, a: f16x32, b: f16x32) -> f32x16; + + #[link_name = "llvm.x86.avx10.vpdpbssd.512"] + fn vdpbssd512(src: i32x16, a: i8x64, b: i8x64) -> i32x16; + #[link_name = "llvm.x86.avx10.vpdpbssds.512"] + fn vdpbssds512(src: i32x16, a: i8x64, b: i8x64) -> i32x16; + #[link_name = "llvm.x86.avx10.vpdpbsud.512"] + fn vdpbsud512(src: i32x16, a: i8x64, b: u8x64) -> i32x16; + #[link_name = "llvm.x86.avx10.vpdpbsuds.512"] + fn vdpbsuds512(src: i32x16, a: i8x64, b: u8x64) -> i32x16; + #[link_name = "llvm.x86.avx10.vpdpbuud.512"] + fn vdpbuud512(src: i32x16, a: u8x64, b: u8x64) -> i32x16; + #[link_name = "llvm.x86.avx10.vpdpbuuds.512"] + fn vdpbuuds512(src: i32x16, a: u8x64, b: u8x64) -> i32x16; + + #[link_name = "llvm.x86.avx10.vpdpwsud.512"] + fn vdpwsud512(src: i32x16, a: i16x32, b: u16x32) -> i32x16; + #[link_name = "llvm.x86.avx10.vpdpwsuds.512"] + fn vdpwsuds512(src: i32x16, a: i16x32, b: u16x32) -> i32x16; + #[link_name = "llvm.x86.avx10.vpdpwusd.512"] + fn vdpwusd512(src: i32x16, a: u16x32, b: i16x32) -> i32x16; + #[link_name = "llvm.x86.avx10.vpdpwusds.512"] + fn vdpwusds512(src: i32x16, a: u16x32, b: i16x32) -> i32x16; + #[link_name = "llvm.x86.avx10.vpdpwuud.512"] + fn vdpwuud512(src: i32x16, a: u16x32, b: u16x32) -> i32x16; + #[link_name = "llvm.x86.avx10.vpdpwuuds.512"] + fn vdpwuuds512(src: i32x16, a: u16x32, b: u16x32) -> i32x16; + + #[link_name = "llvm.x86.avx10.vcvttss2sis"] + fn vcvttss2sis(a: f32x4, sae: i32) -> i32; + #[link_name = "llvm.x86.avx10.vcvttss2usis"] + fn vcvttss2usis(a: f32x4, sae: i32) -> u32; + + #[link_name = "llvm.x86.avx10.vcvttsd2sis"] + fn vcvttsd2sis(a: f64x2, sae: i32) -> i32; + #[link_name = "llvm.x86.avx10.vcvttsd2usis"] + fn vcvttsd2usis(a: f64x2, sae: i32) -> u32; + + #[link_name = "llvm.x86.avx10.mask.vcvttpd2dqs.128"] + fn vcvttpd2dqs_128(a: f64x2, src: i32x4, mask: u8) -> i32x4; + #[link_name = "llvm.x86.avx10.mask.vcvttpd2dqs.256"] + fn vcvttpd2dqs_256(a: f64x4, src: i32x4, mask: u8) -> i32x4; + #[link_name = "llvm.x86.avx10.mask.vcvttpd2dqs.round.512"] + fn vcvttpd2dqs_512(a: f64x8, src: i32x8, mask: u8, sae: i32) -> i32x8; + + #[link_name = "llvm.x86.avx10.mask.vcvttpd2udqs.128"] + fn vcvttpd2udqs_128(a: 
f64x2, src: u32x4, mask: u8) -> u32x4; + #[link_name = "llvm.x86.avx10.mask.vcvttpd2udqs.256"] + fn vcvttpd2udqs_256(a: f64x4, src: u32x4, mask: u8) -> u32x4; + #[link_name = "llvm.x86.avx10.mask.vcvttpd2udqs.round.512"] + fn vcvttpd2udqs_512(a: f64x8, src: u32x8, mask: u8, sae: i32) -> u32x8; + + #[link_name = "llvm.x86.avx10.mask.vcvttpd2qqs.128"] + fn vcvttpd2qqs_128(a: f64x2, src: i64x2, mask: u8) -> i64x2; + #[link_name = "llvm.x86.avx10.mask.vcvttpd2qqs.256"] + fn vcvttpd2qqs_256(a: f64x4, src: i64x4, mask: u8) -> i64x4; + #[link_name = "llvm.x86.avx10.mask.vcvttpd2qqs.round.512"] + fn vcvttpd2qqs_512(a: f64x8, src: i64x8, mask: u8, sae: i32) -> i64x8; + + #[link_name = "llvm.x86.avx10.mask.vcvttpd2uqqs.128"] + fn vcvttpd2uqqs_128(a: f64x2, src: u64x2, mask: u8) -> u64x2; + #[link_name = "llvm.x86.avx10.mask.vcvttpd2uqqs.256"] + fn vcvttpd2uqqs_256(a: f64x4, src: u64x4, mask: u8) -> u64x4; + #[link_name = "llvm.x86.avx10.mask.vcvttpd2uqqs.round.512"] + fn vcvttpd2uqqs_512(a: f64x8, src: u64x8, mask: u8, sae: i32) -> u64x8; + + #[link_name = "llvm.x86.avx10.mask.vcvttps2dqs.128"] + fn vcvttps2dqs_128(a: f32x4, src: i32x4, mask: u8) -> i32x4; + #[link_name = "llvm.x86.avx10.mask.vcvttps2dqs.256"] + fn vcvttps2dqs_256(a: f32x8, src: i32x8, mask: u8) -> i32x8; + #[link_name = "llvm.x86.avx10.mask.vcvttps2dqs.round.512"] + fn vcvttps2dqs_512(a: f32x16, src: i32x16, mask: u16, sae: i32) -> i32x16; + + #[link_name = "llvm.x86.avx10.mask.vcvttps2udqs.128"] + fn vcvttps2udqs_128(a: f32x4, src: u32x4, mask: u8) -> u32x4; + #[link_name = "llvm.x86.avx10.mask.vcvttps2udqs.256"] + fn vcvttps2udqs_256(a: f32x8, src: u32x8, mask: u8) -> u32x8; + #[link_name = "llvm.x86.avx10.mask.vcvttps2udqs.round.512"] + fn vcvttps2udqs_512(a: f32x16, src: u32x16, mask: u16, sae: i32) -> u32x16; + + #[link_name = "llvm.x86.avx10.mask.vcvttps2qqs.128"] + fn vcvttps2qqs_128(a: f32x4, src: i64x2, mask: u8) -> i64x2; + #[link_name = "llvm.x86.avx10.mask.vcvttps2qqs.256"] + fn vcvttps2qqs_256(a: f32x4, src: i64x4, mask: u8) -> i64x4; + #[link_name = "llvm.x86.avx10.mask.vcvttps2qqs.round.512"] + fn vcvttps2qqs_512(a: f32x8, src: i64x8, mask: u8, sae: i32) -> i64x8; + + #[link_name = "llvm.x86.avx10.mask.vcvttps2uqqs.128"] + fn vcvttps2uqqs_128(a: f32x4, src: u64x2, mask: u8) -> u64x2; + #[link_name = "llvm.x86.avx10.mask.vcvttps2uqqs.256"] + fn vcvttps2uqqs_256(a: f32x4, src: u64x4, mask: u8) -> u64x4; + #[link_name = "llvm.x86.avx10.mask.vcvttps2uqqs.round.512"] + fn vcvttps2uqqs_512(a: f32x8, src: u64x8, mask: u8, sae: i32) -> u64x8; + + #[link_name = "llvm.x86.avx10.mask.vminmaxpd128"] + fn vminmaxpd128(a: f64x2, b: f64x2, imm8: i32, src: f64x2, k: u8) -> f64x2; + #[link_name = "llvm.x86.avx10.mask.vminmaxpd256"] + fn vminmaxpd256(a: f64x4, b: f64x4, imm8: i32, src: f64x4, k: u8) -> f64x4; + #[link_name = "llvm.x86.avx10.mask.vminmaxpd.round"] + fn vminmaxpd512(a: f64x8, b: f64x8, imm8: i32, src: f64x8, k: u8, sae: i32) -> f64x8; + + #[link_name = "llvm.x86.avx10.mask.vminmaxps128"] + fn vminmaxps128(a: f32x4, b: f32x4, imm8: i32, src: f32x4, k: u8) -> f32x4; + #[link_name = "llvm.x86.avx10.mask.vminmaxps256"] + fn vminmaxps256(a: f32x8, b: f32x8, imm8: i32, src: f32x8, k: u8) -> f32x8; + #[link_name = "llvm.x86.avx10.mask.vminmaxps.round"] + fn vminmaxps512(a: f32x16, b: f32x16, imm8: i32, src: f32x16, k: u16, sae: i32) -> f32x16; + + #[link_name = "llvm.x86.avx10.mask.vminmaxph128"] + fn vminmaxph128(a: f16x8, b: f16x8, imm8: i32, src: f16x8, k: u8) -> f16x8; + #[link_name = "llvm.x86.avx10.mask.vminmaxph256"] + 
fn vminmaxph256(a: f16x16, b: f16x16, imm8: i32, src: f16x16, k: u16) -> f16x16; + #[link_name = "llvm.x86.avx10.mask.vminmaxph.round"] + fn vminmaxph512(a: f16x32, b: f16x32, imm8: i32, src: f16x32, k: u32, sae: i32) -> f16x32; + + #[link_name = "llvm.x86.avx10.mask.vminmaxsd.round"] + fn vminmaxsd(a: f64x2, b: f64x2, imm8: i32, src: f64x2, k: u8, sae: i32) -> f64x2; + #[link_name = "llvm.x86.avx10.mask.vminmaxss.round"] + fn vminmaxss(a: f32x4, b: f32x4, imm8: i32, src: f32x4, k: u8, sae: i32) -> f32x4; + #[link_name = "llvm.x86.avx10.mask.vminmaxsh.round"] + fn vminmaxsh(a: f16x8, b: f16x8, imm8: i32, src: f16x8, k: u8, sae: i32) -> f16x8; + + #[link_name = "llvm.x86.avx10.mask.vcvtph2ibs128"] + fn vcvtph2ibs128(a: f16x8, src: i16x8, mask: u8) -> i16x8; + #[link_name = "llvm.x86.avx10.mask.vcvtph2ibs256"] + fn vcvtph2ibs256(a: f16x16, src: i16x16, mask: u16) -> i16x16; + #[link_name = "llvm.x86.avx10.mask.vcvtph2ibs512"] + fn vcvtph2ibs512(a: f16x32, src: i16x32, mask: u32, rounding: i32) -> i16x32; + + #[link_name = "llvm.x86.avx10.mask.vcvtph2iubs128"] + fn vcvtph2iubs128(a: f16x8, src: u16x8, mask: u8) -> u16x8; + #[link_name = "llvm.x86.avx10.mask.vcvtph2iubs256"] + fn vcvtph2iubs256(a: f16x16, src: u16x16, mask: u16) -> u16x16; + #[link_name = "llvm.x86.avx10.mask.vcvtph2iubs512"] + fn vcvtph2iubs512(a: f16x32, src: u16x32, mask: u32, rounding: i32) -> u16x32; + + #[link_name = "llvm.x86.avx10.mask.vcvtps2ibs128"] + fn vcvtps2ibs128(a: f32x4, src: i32x4, mask: u8) -> i32x4; + #[link_name = "llvm.x86.avx10.mask.vcvtps2ibs256"] + fn vcvtps2ibs256(a: f32x8, src: i32x8, mask: u8) -> i32x8; + #[link_name = "llvm.x86.avx10.mask.vcvtps2ibs512"] + fn vcvtps2ibs512(a: f32x16, src: i32x16, mask: u16, rounding: i32) -> i32x16; + + #[link_name = "llvm.x86.avx10.mask.vcvtps2iubs128"] + fn vcvtps2iubs128(a: f32x4, src: u32x4, mask: u8) -> u32x4; + #[link_name = "llvm.x86.avx10.mask.vcvtps2iubs256"] + fn vcvtps2iubs256(a: f32x8, src: u32x8, mask: u8) -> u32x8; + #[link_name = "llvm.x86.avx10.mask.vcvtps2iubs512"] + fn vcvtps2iubs512(a: f32x16, src: u32x16, mask: u16, rounding: i32) -> u32x16; + + #[link_name = "llvm.x86.avx10.mask.vcvttph2ibs128"] + fn vcvttph2ibs128(a: f16x8, src: i16x8, mask: u8) -> i16x8; + #[link_name = "llvm.x86.avx10.mask.vcvttph2ibs256"] + fn vcvttph2ibs256(a: f16x16, src: i16x16, mask: u16) -> i16x16; + #[link_name = "llvm.x86.avx10.mask.vcvttph2ibs512"] + fn vcvttph2ibs512(a: f16x32, src: i16x32, mask: u32, sae: i32) -> i16x32; + + #[link_name = "llvm.x86.avx10.mask.vcvttph2iubs128"] + fn vcvttph2iubs128(a: f16x8, src: u16x8, mask: u8) -> u16x8; + #[link_name = "llvm.x86.avx10.mask.vcvttph2iubs256"] + fn vcvttph2iubs256(a: f16x16, src: u16x16, mask: u16) -> u16x16; + #[link_name = "llvm.x86.avx10.mask.vcvttph2iubs512"] + fn vcvttph2iubs512(a: f16x32, src: u16x32, mask: u32, sae: i32) -> u16x32; + + #[link_name = "llvm.x86.avx10.mask.vcvttps2ibs128"] + fn vcvttps2ibs128(a: f32x4, src: i32x4, mask: u8) -> i32x4; + #[link_name = "llvm.x86.avx10.mask.vcvttps2ibs256"] + fn vcvttps2ibs256(a: f32x8, src: i32x8, mask: u8) -> i32x8; + #[link_name = "llvm.x86.avx10.mask.vcvttps2ibs512"] + fn vcvttps2ibs512(a: f32x16, src: i32x16, mask: u16, sae: i32) -> i32x16; + + #[link_name = "llvm.x86.avx10.mask.vcvttps2iubs128"] + fn vcvttps2iubs128(a: f32x4, src: u32x4, mask: u8) -> u32x4; + #[link_name = "llvm.x86.avx10.mask.vcvttps2iubs256"] + fn vcvttps2iubs256(a: f32x8, src: u32x8, mask: u8) -> u32x8; + #[link_name = "llvm.x86.avx10.mask.vcvttps2iubs512"] + fn vcvttps2iubs512(a: f32x16, 
src: u32x16, mask: u16, sae: i32) -> u32x16; + + #[link_name = "llvm.x86.avx10.mask.vcvt2ps2phx.128"] + fn vcvt2ps2phx128(a: f32x4, b: f32x4, src: f16x8, mask: u8) -> f16x8; + #[link_name = "llvm.x86.avx10.mask.vcvt2ps2phx.256"] + fn vcvt2ps2phx256(a: f32x8, b: f32x8, src: f16x16, mask: u16) -> f16x16; + #[link_name = "llvm.x86.avx10.mask.vcvt2ps2phx.512"] + fn vcvt2ps2phx512(a: f32x16, b: f32x16, src: f16x32, mask: u32, rounding: i32) -> f16x32; + + #[link_name = "llvm.x86.avx10.mask.vcvtbiasph2bf8128"] + fn vcvtbiasph2bf8128(a: u8x16, b: f16x8, src: u8x16, mask: u8) -> u8x16; + #[link_name = "llvm.x86.avx10.mask.vcvtbiasph2bf8256"] + fn vcvtbiasph2bf8256(a: u8x32, b: f16x16, src: u8x16, mask: u16) -> u8x16; + #[link_name = "llvm.x86.avx10.mask.vcvtbiasph2bf8512"] + fn vcvtbiasph2bf8512(a: u8x64, b: f16x32, src: u8x32, mask: u32) -> u8x32; + + #[link_name = "llvm.x86.avx10.mask.vcvtbiasph2bf8s128"] + fn vcvtbiasph2bf8s128(a: u8x16, b: f16x8, src: u8x16, mask: u8) -> u8x16; + #[link_name = "llvm.x86.avx10.mask.vcvtbiasph2bf8s256"] + fn vcvtbiasph2bf8s256(a: u8x32, b: f16x16, src: u8x16, mask: u16) -> u8x16; + #[link_name = "llvm.x86.avx10.mask.vcvtbiasph2bf8s512"] + fn vcvtbiasph2bf8s512(a: u8x64, b: f16x32, src: u8x32, mask: u32) -> u8x32; + + #[link_name = "llvm.x86.avx10.mask.vcvtbiasph2hf8128"] + fn vcvtbiasph2hf8128(a: u8x16, b: f16x8, src: u8x16, mask: u8) -> u8x16; + #[link_name = "llvm.x86.avx10.mask.vcvtbiasph2hf8256"] + fn vcvtbiasph2hf8256(a: u8x32, b: f16x16, src: u8x16, mask: u16) -> u8x16; + #[link_name = "llvm.x86.avx10.mask.vcvtbiasph2hf8512"] + fn vcvtbiasph2hf8512(a: u8x64, b: f16x32, src: u8x32, mask: u32) -> u8x32; + + #[link_name = "llvm.x86.avx10.mask.vcvtbiasph2hf8s128"] + fn vcvtbiasph2hf8s128(a: u8x16, b: f16x8, src: u8x16, mask: u8) -> u8x16; + #[link_name = "llvm.x86.avx10.mask.vcvtbiasph2hf8s256"] + fn vcvtbiasph2hf8s256(a: u8x32, b: f16x16, src: u8x16, mask: u16) -> u8x16; + #[link_name = "llvm.x86.avx10.mask.vcvtbiasph2hf8s512"] + fn vcvtbiasph2hf8s512(a: u8x64, b: f16x32, src: u8x32, mask: u32) -> u8x32; + + #[link_name = "llvm.x86.avx10.vcvt2ph2bf8128"] + fn vcvt2ph2bf8128(a: f16x8, b: f16x8) -> u8x16; + #[link_name = "llvm.x86.avx10.vcvt2ph2bf8256"] + fn vcvt2ph2bf8256(a: f16x16, b: f16x16) -> u8x32; + #[link_name = "llvm.x86.avx10.vcvt2ph2bf8512"] + fn vcvt2ph2bf8512(a: f16x32, b: f16x32) -> u8x64; + + #[link_name = "llvm.x86.avx10.vcvt2ph2bf8s128"] + fn vcvt2ph2bf8s128(a: f16x8, b: f16x8) -> u8x16; + #[link_name = "llvm.x86.avx10.vcvt2ph2bf8s256"] + fn vcvt2ph2bf8s256(a: f16x16, b: f16x16) -> u8x32; + #[link_name = "llvm.x86.avx10.vcvt2ph2bf8s512"] + fn vcvt2ph2bf8s512(a: f16x32, b: f16x32) -> u8x64; + + #[link_name = "llvm.x86.avx10.vcvt2ph2hf8128"] + fn vcvt2ph2hf8128(a: f16x8, b: f16x8) -> u8x16; + #[link_name = "llvm.x86.avx10.vcvt2ph2hf8256"] + fn vcvt2ph2hf8256(a: f16x16, b: f16x16) -> u8x32; + #[link_name = "llvm.x86.avx10.vcvt2ph2hf8512"] + fn vcvt2ph2hf8512(a: f16x32, b: f16x32) -> u8x64; + + #[link_name = "llvm.x86.avx10.vcvt2ph2hf8s128"] + fn vcvt2ph2hf8s128(a: f16x8, b: f16x8) -> u8x16; + #[link_name = "llvm.x86.avx10.vcvt2ph2hf8s256"] + fn vcvt2ph2hf8s256(a: f16x16, b: f16x16) -> u8x32; + #[link_name = "llvm.x86.avx10.vcvt2ph2hf8s512"] + fn vcvt2ph2hf8s512(a: f16x32, b: f16x32) -> u8x64; + + #[link_name = "llvm.x86.avx10.mask.vcvtph2bf8128"] + fn vcvtph2bf8128(a: f16x8, src: u8x16, mask: u8) -> u8x16; + #[link_name = "llvm.x86.avx10.mask.vcvtph2bf8256"] + fn vcvtph2bf8256(a: f16x16, src: u8x16, mask: u16) -> u8x16; + #[link_name = 
"llvm.x86.avx10.mask.vcvtph2bf8512"] + fn vcvtph2bf8512(a: f16x32, src: u8x32, mask: u32) -> u8x32; + + #[link_name = "llvm.x86.avx10.mask.vcvtph2bf8s128"] + fn vcvtph2bf8s128(a: f16x8, src: u8x16, mask: u8) -> u8x16; + #[link_name = "llvm.x86.avx10.mask.vcvtph2bf8s256"] + fn vcvtph2bf8s256(a: f16x16, src: u8x16, mask: u16) -> u8x16; + #[link_name = "llvm.x86.avx10.mask.vcvtph2bf8s512"] + fn vcvtph2bf8s512(a: f16x32, src: u8x32, mask: u32) -> u8x32; + + #[link_name = "llvm.x86.avx10.mask.vcvtph2hf8128"] + fn vcvtph2hf8128(a: f16x8, src: u8x16, mask: u8) -> u8x16; + #[link_name = "llvm.x86.avx10.mask.vcvtph2hf8256"] + fn vcvtph2hf8256(a: f16x16, src: u8x16, mask: u16) -> u8x16; + #[link_name = "llvm.x86.avx10.mask.vcvtph2hf8512"] + fn vcvtph2hf8512(a: f16x32, src: u8x32, mask: u32) -> u8x32; + + #[link_name = "llvm.x86.avx10.mask.vcvtph2hf8s128"] + fn vcvtph2hf8s128(a: f16x8, src: u8x16, mask: u8) -> u8x16; + #[link_name = "llvm.x86.avx10.mask.vcvtph2hf8s256"] + fn vcvtph2hf8s256(a: f16x16, src: u8x16, mask: u16) -> u8x16; + #[link_name = "llvm.x86.avx10.mask.vcvtph2hf8s512"] + fn vcvtph2hf8s512(a: f16x32, src: u8x32, mask: u32) -> u8x32; + + #[link_name = "llvm.x86.avx10.mask.vcvthf82ph128"] + fn vcvthf82ph128(a: u8x16, src: f16x8, mask: u8) -> f16x8; + #[link_name = "llvm.x86.avx10.mask.vcvthf82ph256"] + fn vcvthf82ph256(a: u8x16, src: f16x16, mask: u16) -> f16x16; + #[link_name = "llvm.x86.avx10.mask.vcvthf82ph512"] + fn vcvthf82ph512(a: u8x32, src: f16x32, mask: u32) -> f16x32; +} + +#[cfg(test)] +mod tests { + use crate::core_arch::x86::*; + use stdarch_test::simd_test; + + #[simd_test(enable = "avx10.2")] + fn test_mm_move_epi32() { + let a = _mm_set_epi32(0x12345678, 0x7ABCDEF0, 0x11111111, 0x22222222); + let r = _mm_move_epi32(a); + let e = _mm_set_epi32(0, 0, 0, 0x22222222); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_move_epi16() { + let a = _mm_set_epi16( + 0x1234, 0x5678, 0x7ABC, 0x5EF0, 0x1111, 0x2222, 0x3333, 0x4444, + ); + let r = _mm_move_epi16(a); + let e = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, 0x4444); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_mask_mpsadbw_epu8() { + let src = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); + let a = _mm_set1_epi8(10); + let b = _mm_set1_epi8(5); + let k = 0b01010101; + let r = _mm_mask_mpsadbw_epu8::<0>(src, k, a, b); + // Each SAD result is: abs(10-5) * 4 = 20 + let e = _mm_setr_epi16(20, 2, 20, 4, 20, 6, 20, 8); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_maskz_mpsadbw_epu8() { + let a = _mm_set1_epi8(10); + let b = _mm_set1_epi8(5); + let k = 0b01010101; + let r = _mm_maskz_mpsadbw_epu8::<0>(k, a, b); + let e = _mm_setr_epi16(20, 0, 20, 0, 20, 0, 20, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_mask_mpsadbw_epu8() { + let src = _mm256_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm256_set1_epi8(10); + let b = _mm256_set1_epi8(5); + let k = 0b0101010101010101; + let r = _mm256_mask_mpsadbw_epu8::<0>(src, k, a, b); + let e = _mm256_setr_epi16(20, 2, 20, 4, 20, 6, 20, 8, 20, 10, 20, 12, 20, 14, 20, 16); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_maskz_mpsadbw_epu8() { + let a = _mm256_set1_epi8(10); + let b = _mm256_set1_epi8(5); + let k = 0b0101010101010101; + let r = _mm256_maskz_mpsadbw_epu8::<0>(k, a, b); + let e = _mm256_setr_epi16(20, 0, 20, 0, 20, 0, 20, 0, 20, 0, 20, 0, 20, 0, 20, 0); + assert_eq_m256i(r, e); + } + + 
#[simd_test(enable = "avx10.2")] + fn test_mm512_mpsadbw_epu8() { + let a = _mm512_set1_epi8(10); + let b = _mm512_set1_epi8(5); + let r = _mm512_mpsadbw_epu8::<0>(a, b); + let e = _mm512_set1_epi16(20); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_mpsadbw_epu8() { + let src = _mm512_set_epi16( + 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, + 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, + ); + let a = _mm512_set1_epi8(10); + let b = _mm512_set1_epi8(5); + let k = 0b01010101010101010101010101010101; + let r = _mm512_mask_mpsadbw_epu8::<0>(src, k, a, b); + let e = _mm512_set_epi16( + 32, 20, 30, 20, 28, 20, 26, 20, 24, 20, 22, 20, 20, 20, 18, 20, 16, 20, 14, 20, 12, 20, + 10, 20, 8, 20, 6, 20, 4, 20, 2, 20, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_mpsadbw_epu8() { + let a = _mm512_set1_epi8(10); + let b = _mm512_set1_epi8(5); + let k = 0b01010101010101010101010101010101; + let r = _mm512_maskz_mpsadbw_epu8::<0>(k, a, b); + let e = _mm512_set_epi16( + 0, 20, 0, 20, 0, 20, 0, 20, 0, 20, 0, 20, 0, 20, 0, 20, 0, 20, 0, 20, 0, 20, 0, 20, 0, + 20, 0, 20, 0, 20, 0, 20, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_dpph_ps() { + let src = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let a = _mm_set1_ph(10.0); + let b = _mm_set1_ph(20.0); + let r = _mm_dpph_ps(src, a, b); + // Each result is: src[i] + (a[2*i] * b[2*i] + a[2*i+1] * b[2*i+1]) + let e = _mm_setr_ps(401.0, 402.0, 403.0, 404.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_mask_dpph_ps() { + let src = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let a = _mm_set1_ph(10.0); + let b = _mm_set1_ph(20.0); + let k = 0b0101; + let r = _mm_mask_dpph_ps(src, k, a, b); + let e = _mm_setr_ps(401.0, 2.0, 403.0, 4.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_maskz_dpph_ps() { + let src = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let a = _mm_set1_ph(10.0); + let b = _mm_set1_ph(20.0); + let k = 0b0101; + let r = _mm_maskz_dpph_ps(k, src, a, b); + let e = _mm_setr_ps(401.0, 0.0, 403.0, 0.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_dpph_ps() { + let src = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let a = _mm256_set1_ph(10.0); + let b = _mm256_set1_ph(20.0); + let r = _mm256_dpph_ps(src, a, b); + let e = _mm256_setr_ps(401.0, 402.0, 403.0, 404.0, 405.0, 406.0, 407.0, 408.0); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_mask_dpph_ps() { + let src = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let a = _mm256_set1_ph(10.0); + let b = _mm256_set1_ph(20.0); + let k = 0b01010101; + let r = _mm256_mask_dpph_ps(src, k, a, b); + let e = _mm256_setr_ps(401.0, 2.0, 403.0, 4.0, 405.0, 6.0, 407.0, 8.0); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_maskz_dpph_ps() { + let src = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let a = _mm256_set1_ph(10.0); + let b = _mm256_set1_ph(20.0); + let k = 0b01010101; + let r = _mm256_maskz_dpph_ps(k, src, a, b); + let e = _mm256_setr_ps(401.0, 0.0, 403.0, 0.0, 405.0, 0.0, 407.0, 0.0); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_dpph_ps() { + let src = _mm512_setr_ps( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let a = _mm512_set1_ph(10.0); + let b = _mm512_set1_ph(20.0); + let r = 
_mm512_dpph_ps(src, a, b); + let e = _mm512_setr_ps( + 401.0, 402.0, 403.0, 404.0, 405.0, 406.0, 407.0, 408.0, 409.0, 410.0, 411.0, 412.0, + 413.0, 414.0, 415.0, 416.0, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_dpph_ps() { + let src = _mm512_setr_ps( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let a = _mm512_set1_ph(10.0); + let b = _mm512_set1_ph(20.0); + let k = 0b0101010101010101; + let r = _mm512_mask_dpph_ps(src, k, a, b); + let e = _mm512_setr_ps( + 401.0, 2.0, 403.0, 4.0, 405.0, 6.0, 407.0, 8.0, 409.0, 10.0, 411.0, 12.0, 413.0, 14.0, + 415.0, 16.0, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_dpph_ps() { + let src = _mm512_setr_ps( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let a = _mm512_set1_ph(10.0); + let b = _mm512_set1_ph(20.0); + let k = 0b0101010101010101; + let r = _mm512_maskz_dpph_ps(k, src, a, b); + let e = _mm512_setr_ps( + 401.0, 0.0, 403.0, 0.0, 405.0, 0.0, 407.0, 0.0, 409.0, 0.0, 411.0, 0.0, 413.0, 0.0, + 415.0, 0.0, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint8")] + fn test_mm_mask_dpbssd_epi32() { + let src = _mm_setr_epi32(1, 2, 3, 4); + let a = _mm_set1_epi8(10); + let b = _mm_set1_epi8(20); + let k = 0b0101; + let r = _mm_mask_dpbssd_epi32(src, k, a, b); + let e = _mm_setr_epi32(801, 2, 803, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint8")] + fn test_mm_maskz_dpbssd_epi32() { + let src = _mm_setr_epi32(1, 2, 3, 4); + let a = _mm_set1_epi8(10); + let b = _mm_set1_epi8(20); + let k = 0b0101; + let r = _mm_maskz_dpbssd_epi32(k, src, a, b); + let e = _mm_setr_epi32(801, 0, 803, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint8")] + fn test_mm256_mask_dpbssd_epi32() { + let src = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let a = _mm256_set1_epi8(10); + let b = _mm256_set1_epi8(20); + let k = 0b01010101; + let r = _mm256_mask_dpbssd_epi32(src, k, a, b); + let e = _mm256_setr_epi32(801, 2, 803, 4, 805, 6, 807, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint8")] + fn test_mm256_maskz_dpbssd_epi32() { + let src = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let a = _mm256_set1_epi8(10); + let b = _mm256_set1_epi8(20); + let k = 0b01010101; + let r = _mm256_maskz_dpbssd_epi32(k, src, a, b); + let e = _mm256_setr_epi32(801, 0, 803, 0, 805, 0, 807, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_dpbssd_epi32() { + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm512_set1_epi8(10); + let b = _mm512_set1_epi8(20); + let r = _mm512_dpbssd_epi32(src, a, b); + let e = _mm512_setr_epi32( + 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_dpbssd_epi32() { + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm512_set1_epi8(10); + let b = _mm512_set1_epi8(20); + let k = 0b0101010101010101; + let r = _mm512_mask_dpbssd_epi32(src, k, a, b); + let e = _mm512_setr_epi32( + 801, 2, 803, 4, 805, 6, 807, 8, 809, 10, 811, 12, 813, 14, 815, 16, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_dpbssd_epi32() { + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 
13, 14, 15, 16); + let a = _mm512_set1_epi8(10); + let b = _mm512_set1_epi8(20); + let k = 0b0101010101010101; + let r = _mm512_maskz_dpbssd_epi32(k, src, a, b); + let e = _mm512_setr_epi32( + 801, 0, 803, 0, 805, 0, 807, 0, 809, 0, 811, 0, 813, 0, 815, 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint8")] + fn test_mm_mask_dpbssds_epi32() { + let src = _mm_setr_epi32(1, 2, 3, 4); + let a = _mm_set1_epi8(10); + let b = _mm_set1_epi8(20); + let k = 0b0101; + let r = _mm_mask_dpbssds_epi32(src, k, a, b); + let e = _mm_setr_epi32(801, 2, 803, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint8")] + fn test_mm_maskz_dpbssds_epi32() { + let src = _mm_setr_epi32(1, 2, 3, 4); + let a = _mm_set1_epi8(10); + let b = _mm_set1_epi8(20); + let k = 0b0101; + let r = _mm_maskz_dpbssds_epi32(k, src, a, b); + let e = _mm_setr_epi32(801, 0, 803, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint8")] + fn test_mm256_mask_dpbssds_epi32() { + let src = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let a = _mm256_set1_epi8(10); + let b = _mm256_set1_epi8(20); + let k = 0b01010101; + let r = _mm256_mask_dpbssds_epi32(src, k, a, b); + let e = _mm256_setr_epi32(801, 2, 803, 4, 805, 6, 807, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint8")] + fn test_mm256_maskz_dpbssds_epi32() { + let src = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let a = _mm256_set1_epi8(10); + let b = _mm256_set1_epi8(20); + let k = 0b01010101; + let r = _mm256_maskz_dpbssds_epi32(k, src, a, b); + let e = _mm256_setr_epi32(801, 0, 803, 0, 805, 0, 807, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_dpbssds_epi32() { + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm512_set1_epi8(10); + let b = _mm512_set1_epi8(20); + let r = _mm512_dpbssds_epi32(src, a, b); + let e = _mm512_setr_epi32( + 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_dpbssds_epi32() { + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm512_set1_epi8(10); + let b = _mm512_set1_epi8(20); + let k = 0b0101010101010101; + let r = _mm512_mask_dpbssds_epi32(src, k, a, b); + let e = _mm512_setr_epi32( + 801, 2, 803, 4, 805, 6, 807, 8, 809, 10, 811, 12, 813, 14, 815, 16, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_dpbssds_epi32() { + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm512_set1_epi8(10); + let b = _mm512_set1_epi8(20); + let k = 0b0101010101010101; + let r = _mm512_maskz_dpbssds_epi32(k, src, a, b); + let e = _mm512_setr_epi32( + 801, 0, 803, 0, 805, 0, 807, 0, 809, 0, 811, 0, 813, 0, 815, 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint8")] + fn test_mm_mask_dpbsud_epi32() { + let src = _mm_setr_epi32(1, 2, 3, 4); + let a = _mm_set1_epi8(10); + let b = _mm_set1_epi8(20); + let k = 0b0101; + let r = _mm_mask_dpbsud_epi32(src, k, a, b); + let e = _mm_setr_epi32(801, 2, 803, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint8")] + fn test_mm_maskz_dpbsud_epi32() { + let src = _mm_setr_epi32(1, 2, 3, 4); + let a = _mm_set1_epi8(10); + let b = _mm_set1_epi8(20); + let k = 0b0101; + let r = _mm_maskz_dpbsud_epi32(k, src, a, b); + let e = 
_mm_setr_epi32(801, 0, 803, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint8")] + fn test_mm256_mask_dpbsud_epi32() { + let src = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let a = _mm256_set1_epi8(10); + let b = _mm256_set1_epi8(20); + let k = 0b01010101; + let r = _mm256_mask_dpbsud_epi32(src, k, a, b); + let e = _mm256_setr_epi32(801, 2, 803, 4, 805, 6, 807, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint8")] + fn test_mm256_maskz_dpbsud_epi32() { + let src = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let a = _mm256_set1_epi8(10); + let b = _mm256_set1_epi8(20); + let k = 0b01010101; + let r = _mm256_maskz_dpbsud_epi32(k, src, a, b); + let e = _mm256_setr_epi32(801, 0, 803, 0, 805, 0, 807, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_dpbsud_epi32() { + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm512_set1_epi8(10); + let b = _mm512_set1_epi8(20); + let r = _mm512_dpbsud_epi32(src, a, b); + let e = _mm512_setr_epi32( + 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_dpbsud_epi32() { + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm512_set1_epi8(10); + let b = _mm512_set1_epi8(20); + let k = 0b0101010101010101; + let r = _mm512_mask_dpbsud_epi32(src, k, a, b); + let e = _mm512_setr_epi32( + 801, 2, 803, 4, 805, 6, 807, 8, 809, 10, 811, 12, 813, 14, 815, 16, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_dpbsud_epi32() { + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm512_set1_epi8(10); + let b = _mm512_set1_epi8(20); + let k = 0b0101010101010101; + let r = _mm512_maskz_dpbsud_epi32(k, src, a, b); + let e = _mm512_setr_epi32( + 801, 0, 803, 0, 805, 0, 807, 0, 809, 0, 811, 0, 813, 0, 815, 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint8")] + fn test_mm_mask_dpbsuds_epi32() { + let src = _mm_setr_epi32(1, 2, 3, 4); + let a = _mm_set1_epi8(10); + let b = _mm_set1_epi8(20); + let k = 0b0101; + let r = _mm_mask_dpbsuds_epi32(src, k, a, b); + let e = _mm_setr_epi32(801, 2, 803, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint8")] + fn test_mm_maskz_dpbsuds_epi32() { + let src = _mm_setr_epi32(1, 2, 3, 4); + let a = _mm_set1_epi8(10); + let b = _mm_set1_epi8(20); + let k = 0b0101; + let r = _mm_maskz_dpbsuds_epi32(k, src, a, b); + let e = _mm_setr_epi32(801, 0, 803, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint8")] + fn test_mm256_mask_dpbsuds_epi32() { + let src = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let a = _mm256_set1_epi8(10); + let b = _mm256_set1_epi8(20); + let k = 0b01010101; + let r = _mm256_mask_dpbsuds_epi32(src, k, a, b); + let e = _mm256_setr_epi32(801, 2, 803, 4, 805, 6, 807, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint8")] + fn test_mm256_maskz_dpbsuds_epi32() { + let src = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let a = _mm256_set1_epi8(10); + let b = _mm256_set1_epi8(20); + let k = 0b01010101; + let r = _mm256_maskz_dpbsuds_epi32(k, src, a, b); + let e = _mm256_setr_epi32(801, 0, 803, 0, 805, 0, 807, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_dpbsuds_epi32() { + let src = 
_mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm512_set1_epi8(10); + let b = _mm512_set1_epi8(20); + let r = _mm512_dpbsuds_epi32(src, a, b); + let e = _mm512_setr_epi32( + 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_dpbsuds_epi32() { + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm512_set1_epi8(10); + let b = _mm512_set1_epi8(20); + let k = 0b0101010101010101; + let r = _mm512_mask_dpbsuds_epi32(src, k, a, b); + let e = _mm512_setr_epi32( + 801, 2, 803, 4, 805, 6, 807, 8, 809, 10, 811, 12, 813, 14, 815, 16, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_dpbsuds_epi32() { + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm512_set1_epi8(10); + let b = _mm512_set1_epi8(20); + let k = 0b0101010101010101; + let r = _mm512_maskz_dpbsuds_epi32(k, src, a, b); + let e = _mm512_setr_epi32( + 801, 0, 803, 0, 805, 0, 807, 0, 809, 0, 811, 0, 813, 0, 815, 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint8")] + fn test_mm_mask_dpbuud_epi32() { + let src = _mm_setr_epi32(1, 2, 3, 4); + let a = _mm_set1_epi8(10); + let b = _mm_set1_epi8(20); + let k = 0b0101; + let r = _mm_mask_dpbuud_epi32(src, k, a, b); + let e = _mm_setr_epi32(801, 2, 803, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint8")] + fn test_mm_maskz_dpbuud_epi32() { + let src = _mm_setr_epi32(1, 2, 3, 4); + let a = _mm_set1_epi8(10); + let b = _mm_set1_epi8(20); + let k = 0b0101; + let r = _mm_maskz_dpbuud_epi32(k, src, a, b); + let e = _mm_setr_epi32(801, 0, 803, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint8")] + fn test_mm256_mask_dpbuud_epi32() { + let src = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let a = _mm256_set1_epi8(10); + let b = _mm256_set1_epi8(20); + let k = 0b01010101; + let r = _mm256_mask_dpbuud_epi32(src, k, a, b); + let e = _mm256_setr_epi32(801, 2, 803, 4, 805, 6, 807, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint8")] + fn test_mm256_maskz_dpbuud_epi32() { + let src = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let a = _mm256_set1_epi8(10); + let b = _mm256_set1_epi8(20); + let k = 0b01010101; + let r = _mm256_maskz_dpbuud_epi32(k, src, a, b); + let e = _mm256_setr_epi32(801, 0, 803, 0, 805, 0, 807, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_dpbuud_epi32() { + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm512_set1_epi8(10); + let b = _mm512_set1_epi8(20); + let r = _mm512_dpbuud_epi32(src, a, b); + let e = _mm512_setr_epi32( + 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_dpbuud_epi32() { + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm512_set1_epi8(10); + let b = _mm512_set1_epi8(20); + let k = 0b0101010101010101; + let r = _mm512_mask_dpbuud_epi32(src, k, a, b); + let e = _mm512_setr_epi32( + 801, 2, 803, 4, 805, 6, 807, 8, 809, 10, 811, 12, 813, 14, 815, 16, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_dpbuud_epi32() { + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 
9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm512_set1_epi8(10); + let b = _mm512_set1_epi8(20); + let k = 0b0101010101010101; + let r = _mm512_maskz_dpbuud_epi32(k, src, a, b); + let e = _mm512_setr_epi32( + 801, 0, 803, 0, 805, 0, 807, 0, 809, 0, 811, 0, 813, 0, 815, 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint8")] + fn test_mm_mask_dpbuuds_epi32() { + let src = _mm_setr_epi32(1, 2, 3, 4); + let a = _mm_set1_epi8(10); + let b = _mm_set1_epi8(20); + let k = 0b0101; + let r = _mm_mask_dpbuuds_epi32(src, k, a, b); + let e = _mm_setr_epi32(801, 2, 803, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint8")] + fn test_mm_maskz_dpbuuds_epi32() { + let src = _mm_setr_epi32(1, 2, 3, 4); + let a = _mm_set1_epi8(10); + let b = _mm_set1_epi8(20); + let k = 0b0101; + let r = _mm_maskz_dpbuuds_epi32(k, src, a, b); + let e = _mm_setr_epi32(801, 0, 803, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint8")] + fn test_mm256_mask_dpbuuds_epi32() { + let src = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let a = _mm256_set1_epi8(10); + let b = _mm256_set1_epi8(20); + let k = 0b01010101; + let r = _mm256_mask_dpbuuds_epi32(src, k, a, b); + let e = _mm256_setr_epi32(801, 2, 803, 4, 805, 6, 807, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint8")] + fn test_mm256_maskz_dpbuuds_epi32() { + let src = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let a = _mm256_set1_epi8(10); + let b = _mm256_set1_epi8(20); + let k = 0b01010101; + let r = _mm256_maskz_dpbuuds_epi32(k, src, a, b); + let e = _mm256_setr_epi32(801, 0, 803, 0, 805, 0, 807, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_dpbuuds_epi32() { + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm512_set1_epi8(10); + let b = _mm512_set1_epi8(20); + let r = _mm512_dpbuuds_epi32(src, a, b); + let e = _mm512_setr_epi32( + 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_dpbuuds_epi32() { + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm512_set1_epi8(10); + let b = _mm512_set1_epi8(20); + let k = 0b0101010101010101; + let r = _mm512_mask_dpbuuds_epi32(src, k, a, b); + let e = _mm512_setr_epi32( + 801, 2, 803, 4, 805, 6, 807, 8, 809, 10, 811, 12, 813, 14, 815, 16, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_dpbuuds_epi32() { + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm512_set1_epi8(10); + let b = _mm512_set1_epi8(20); + let k = 0b0101010101010101; + let r = _mm512_maskz_dpbuuds_epi32(k, src, a, b); + let e = _mm512_setr_epi32( + 801, 0, 803, 0, 805, 0, 807, 0, 809, 0, 811, 0, 813, 0, 815, 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint16")] + fn test_mm_mask_dpwsud_epi32() { + let src = _mm_setr_epi32(1, 2, 3, 4); + let a = _mm_set1_epi16(10); + let b = _mm_set1_epi16(20); + let k = 0b0101; + let r = _mm_mask_dpwsud_epi32(src, k, a, b); + let e = _mm_setr_epi32(401, 2, 403, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint16")] + fn test_mm_maskz_dpwsud_epi32() { + let src = _mm_setr_epi32(1, 2, 3, 4); + let a = _mm_set1_epi16(10); + let b = _mm_set1_epi16(20); + let k = 0b0101; + let r = _mm_maskz_dpwsud_epi32(k, src, a, b); + 
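+        // Each i32 lane accumulates two word products: src + 2 * (10 * 20) = src + 400;
+        // lanes whose mask bit is clear are zeroed.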
let e = _mm_setr_epi32(401, 0, 403, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint16")] + fn test_mm256_mask_dpwsud_epi32() { + let src = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let a = _mm256_set1_epi16(10); + let b = _mm256_set1_epi16(20); + let k = 0b01010101; + let r = _mm256_mask_dpwsud_epi32(src, k, a, b); + let e = _mm256_setr_epi32(401, 2, 403, 4, 405, 6, 407, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint16")] + fn test_mm256_maskz_dpwsud_epi32() { + let src = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let a = _mm256_set1_epi16(10); + let b = _mm256_set1_epi16(20); + let k = 0b01010101; + let r = _mm256_maskz_dpwsud_epi32(k, src, a, b); + let e = _mm256_setr_epi32(401, 0, 403, 0, 405, 0, 407, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_dpwsud_epi32() { + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm512_set1_epi16(10); + let b = _mm512_set1_epi16(20); + let r = _mm512_dpwsud_epi32(src, a, b); + let e = _mm512_setr_epi32( + 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_dpwsud_epi32() { + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm512_set1_epi16(10); + let b = _mm512_set1_epi16(20); + let k = 0b0101010101010101; + let r = _mm512_mask_dpwsud_epi32(src, k, a, b); + let e = _mm512_setr_epi32( + 401, 2, 403, 4, 405, 6, 407, 8, 409, 10, 411, 12, 413, 14, 415, 16, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_dpwsud_epi32() { + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm512_set1_epi16(10); + let b = _mm512_set1_epi16(20); + let k = 0b0101010101010101; + let r = _mm512_maskz_dpwsud_epi32(k, src, a, b); + let e = _mm512_setr_epi32( + 401, 0, 403, 0, 405, 0, 407, 0, 409, 0, 411, 0, 413, 0, 415, 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint16")] + fn test_mm_mask_dpwsuds_epi32() { + let src = _mm_setr_epi32(1, 2, 3, 4); + let a = _mm_set1_epi16(10); + let b = _mm_set1_epi16(20); + let k = 0b0101; + let r = _mm_mask_dpwsuds_epi32(src, k, a, b); + let e = _mm_setr_epi32(401, 2, 403, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint16")] + fn test_mm_maskz_dpwsuds_epi32() { + let src = _mm_setr_epi32(1, 2, 3, 4); + let a = _mm_set1_epi16(10); + let b = _mm_set1_epi16(20); + let k = 0b0101; + let r = _mm_maskz_dpwsuds_epi32(k, src, a, b); + let e = _mm_setr_epi32(401, 0, 403, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint16")] + fn test_mm256_mask_dpwsuds_epi32() { + let src = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let a = _mm256_set1_epi16(10); + let b = _mm256_set1_epi16(20); + let k = 0b01010101; + let r = _mm256_mask_dpwsuds_epi32(src, k, a, b); + let e = _mm256_setr_epi32(401, 2, 403, 4, 405, 6, 407, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint16")] + fn test_mm256_maskz_dpwsuds_epi32() { + let src = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let a = _mm256_set1_epi16(10); + let b = _mm256_set1_epi16(20); + let k = 0b01010101; + let r = _mm256_maskz_dpwsuds_epi32(k, src, a, b); + let e = _mm256_setr_epi32(401, 0, 403, 0, 405, 0, 407, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn 
test_mm512_dpwsuds_epi32() { + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm512_set1_epi16(10); + let b = _mm512_set1_epi16(20); + let r = _mm512_dpwsuds_epi32(src, a, b); + let e = _mm512_setr_epi32( + 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_dpwsuds_epi32() { + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm512_set1_epi16(10); + let b = _mm512_set1_epi16(20); + let k = 0b0101010101010101; + let r = _mm512_mask_dpwsuds_epi32(src, k, a, b); + let e = _mm512_setr_epi32( + 401, 2, 403, 4, 405, 6, 407, 8, 409, 10, 411, 12, 413, 14, 415, 16, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_dpwsuds_epi32() { + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm512_set1_epi16(10); + let b = _mm512_set1_epi16(20); + let k = 0b0101010101010101; + let r = _mm512_maskz_dpwsuds_epi32(k, src, a, b); + let e = _mm512_setr_epi32( + 401, 0, 403, 0, 405, 0, 407, 0, 409, 0, 411, 0, 413, 0, 415, 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint16")] + fn test_mm_mask_dpwusd_epi32() { + let src = _mm_setr_epi32(1, 2, 3, 4); + let a = _mm_set1_epi16(10); + let b = _mm_set1_epi16(20); + let k = 0b0101; + let r = _mm_mask_dpwusd_epi32(src, k, a, b); + let e = _mm_setr_epi32(401, 2, 403, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint16")] + fn test_mm_maskz_dpwusd_epi32() { + let src = _mm_setr_epi32(1, 2, 3, 4); + let a = _mm_set1_epi16(10); + let b = _mm_set1_epi16(20); + let k = 0b0101; + let r = _mm_maskz_dpwusd_epi32(k, src, a, b); + let e = _mm_setr_epi32(401, 0, 403, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint16")] + fn test_mm256_mask_dpwusd_epi32() { + let src = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let a = _mm256_set1_epi16(10); + let b = _mm256_set1_epi16(20); + let k = 0b01010101; + let r = _mm256_mask_dpwusd_epi32(src, k, a, b); + let e = _mm256_setr_epi32(401, 2, 403, 4, 405, 6, 407, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint16")] + fn test_mm256_maskz_dpwusd_epi32() { + let src = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let a = _mm256_set1_epi16(10); + let b = _mm256_set1_epi16(20); + let k = 0b01010101; + let r = _mm256_maskz_dpwusd_epi32(k, src, a, b); + let e = _mm256_setr_epi32(401, 0, 403, 0, 405, 0, 407, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_dpwusd_epi32() { + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm512_set1_epi16(10); + let b = _mm512_set1_epi16(20); + let r = _mm512_dpwusd_epi32(src, a, b); + let e = _mm512_setr_epi32( + 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_dpwusd_epi32() { + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm512_set1_epi16(10); + let b = _mm512_set1_epi16(20); + let k = 0b0101010101010101; + let r = _mm512_mask_dpwusd_epi32(src, k, a, b); + let e = _mm512_setr_epi32( + 401, 2, 403, 4, 405, 6, 407, 8, 409, 10, 411, 12, 413, 14, 415, 16, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn 
test_mm512_maskz_dpwusd_epi32() { + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm512_set1_epi16(10); + let b = _mm512_set1_epi16(20); + let k = 0b0101010101010101; + let r = _mm512_maskz_dpwusd_epi32(k, src, a, b); + let e = _mm512_setr_epi32( + 401, 0, 403, 0, 405, 0, 407, 0, 409, 0, 411, 0, 413, 0, 415, 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint16")] + fn test_mm_mask_dpwusds_epi32() { + let src = _mm_setr_epi32(1, 2, 3, 4); + let a = _mm_set1_epi16(10); + let b = _mm_set1_epi16(20); + let k = 0b0101; + let r = _mm_mask_dpwusds_epi32(src, k, a, b); + let e = _mm_setr_epi32(401, 2, 403, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint16")] + fn test_mm_maskz_dpwusds_epi32() { + let src = _mm_setr_epi32(1, 2, 3, 4); + let a = _mm_set1_epi16(10); + let b = _mm_set1_epi16(20); + let k = 0b0101; + let r = _mm_maskz_dpwusds_epi32(k, src, a, b); + let e = _mm_setr_epi32(401, 0, 403, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint16")] + fn test_mm256_mask_dpwusds_epi32() { + let src = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let a = _mm256_set1_epi16(10); + let b = _mm256_set1_epi16(20); + let k = 0b01010101; + let r = _mm256_mask_dpwusds_epi32(src, k, a, b); + let e = _mm256_setr_epi32(401, 2, 403, 4, 405, 6, 407, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint16")] + fn test_mm256_maskz_dpwusds_epi32() { + let src = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let a = _mm256_set1_epi16(10); + let b = _mm256_set1_epi16(20); + let k = 0b01010101; + let r = _mm256_maskz_dpwusds_epi32(k, src, a, b); + let e = _mm256_setr_epi32(401, 0, 403, 0, 405, 0, 407, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_dpwusds_epi32() { + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm512_set1_epi16(10); + let b = _mm512_set1_epi16(20); + let r = _mm512_dpwusds_epi32(src, a, b); + let e = _mm512_setr_epi32( + 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_dpwusds_epi32() { + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm512_set1_epi16(10); + let b = _mm512_set1_epi16(20); + let k = 0b0101010101010101; + let r = _mm512_mask_dpwusds_epi32(src, k, a, b); + let e = _mm512_setr_epi32( + 401, 2, 403, 4, 405, 6, 407, 8, 409, 10, 411, 12, 413, 14, 415, 16, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_dpwusds_epi32() { + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm512_set1_epi16(10); + let b = _mm512_set1_epi16(20); + let k = 0b0101010101010101; + let r = _mm512_maskz_dpwusds_epi32(k, src, a, b); + let e = _mm512_setr_epi32( + 401, 0, 403, 0, 405, 0, 407, 0, 409, 0, 411, 0, 413, 0, 415, 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint16")] + fn test_mm_mask_dpwuud_epi32() { + let src = _mm_setr_epi32(1, 2, 3, 4); + let a = _mm_set1_epi16(10); + let b = _mm_set1_epi16(20); + let k = 0b0101; + let r = _mm_mask_dpwuud_epi32(src, k, a, b); + let e = _mm_setr_epi32(401, 2, 403, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint16")] + fn test_mm_maskz_dpwuud_epi32() { + let src = _mm_setr_epi32(1, 2, 3, 4); + let a = 
_mm_set1_epi16(10); + let b = _mm_set1_epi16(20); + let k = 0b0101; + let r = _mm_maskz_dpwuud_epi32(k, src, a, b); + let e = _mm_setr_epi32(401, 0, 403, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint16")] + fn test_mm256_mask_dpwuud_epi32() { + let src = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let a = _mm256_set1_epi16(10); + let b = _mm256_set1_epi16(20); + let k = 0b01010101; + let r = _mm256_mask_dpwuud_epi32(src, k, a, b); + let e = _mm256_setr_epi32(401, 2, 403, 4, 405, 6, 407, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint16")] + fn test_mm256_maskz_dpwuud_epi32() { + let src = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let a = _mm256_set1_epi16(10); + let b = _mm256_set1_epi16(20); + let k = 0b01010101; + let r = _mm256_maskz_dpwuud_epi32(k, src, a, b); + let e = _mm256_setr_epi32(401, 0, 403, 0, 405, 0, 407, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_dpwuud_epi32() { + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm512_set1_epi16(10); + let b = _mm512_set1_epi16(20); + let r = _mm512_dpwuud_epi32(src, a, b); + let e = _mm512_setr_epi32( + 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_dpwuud_epi32() { + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm512_set1_epi16(10); + let b = _mm512_set1_epi16(20); + let k = 0b0101010101010101; + let r = _mm512_mask_dpwuud_epi32(src, k, a, b); + let e = _mm512_setr_epi32( + 401, 2, 403, 4, 405, 6, 407, 8, 409, 10, 411, 12, 413, 14, 415, 16, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_dpwuud_epi32() { + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm512_set1_epi16(10); + let b = _mm512_set1_epi16(20); + let k = 0b0101010101010101; + let r = _mm512_maskz_dpwuud_epi32(k, src, a, b); + let e = _mm512_setr_epi32( + 401, 0, 403, 0, 405, 0, 407, 0, 409, 0, 411, 0, 413, 0, 415, 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint16")] + fn test_mm_mask_dpwuuds_epi32() { + let src = _mm_setr_epi32(1, 2, 3, 4); + let a = _mm_set1_epi16(10); + let b = _mm_set1_epi16(20); + let k = 0b0101; + let r = _mm_mask_dpwuuds_epi32(src, k, a, b); + let e = _mm_setr_epi32(401, 2, 403, 4); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint16")] + fn test_mm_maskz_dpwuuds_epi32() { + let src = _mm_setr_epi32(1, 2, 3, 4); + let a = _mm_set1_epi16(10); + let b = _mm_set1_epi16(20); + let k = 0b0101; + let r = _mm_maskz_dpwuuds_epi32(k, src, a, b); + let e = _mm_setr_epi32(401, 0, 403, 0); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint16")] + fn test_mm256_mask_dpwuuds_epi32() { + let src = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let a = _mm256_set1_epi16(10); + let b = _mm256_set1_epi16(20); + let k = 0b01010101; + let r = _mm256_mask_dpwuuds_epi32(src, k, a, b); + let e = _mm256_setr_epi32(401, 2, 403, 4, 405, 6, 407, 8); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2,avxvnniint16")] + fn test_mm256_maskz_dpwuuds_epi32() { + let src = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let a = _mm256_set1_epi16(10); + let b = _mm256_set1_epi16(20); + let k = 0b01010101; + let r = _mm256_maskz_dpwuuds_epi32(k, src, a, b); + let e = 
_mm256_setr_epi32(401, 0, 403, 0, 405, 0, 407, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_dpwuuds_epi32() { + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm512_set1_epi16(10); + let b = _mm512_set1_epi16(20); + let r = _mm512_dpwuuds_epi32(src, a, b); + let e = _mm512_setr_epi32( + 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_dpwuuds_epi32() { + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm512_set1_epi16(10); + let b = _mm512_set1_epi16(20); + let k = 0b0101010101010101; + let r = _mm512_mask_dpwuuds_epi32(src, k, a, b); + let e = _mm512_setr_epi32( + 401, 2, 403, 4, 405, 6, 407, 8, 409, 10, 411, 12, 413, 14, 415, 16, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_dpwuuds_epi32() { + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let a = _mm512_set1_epi16(10); + let b = _mm512_set1_epi16(20); + let k = 0b0101010101010101; + let r = _mm512_maskz_dpwuuds_epi32(k, src, a, b); + let e = _mm512_setr_epi32( + 401, 0, 403, 0, 405, 0, 407, 0, 409, 0, 411, 0, 413, 0, 415, 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_cvtts_roundsd_i32() { + let a = _mm_set_sd(2.0); + let r = _mm_cvtts_roundsd_i32::<_MM_FROUND_NO_EXC>(a); + assert_eq!(r, 2i32); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_cvtts_roundsd_si32() { + let a = _mm_set_sd(3.7); + let r = _mm_cvtts_roundsd_si32::<_MM_FROUND_NO_EXC>(a); + assert_eq!(r, 3i32); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_cvtts_roundsd_u32() { + let a = _mm_set_sd(5.9); + let r = _mm_cvtts_roundsd_u32::<_MM_FROUND_NO_EXC>(a); + assert_eq!(r, 5u32); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_cvtts_roundss_i32() { + let a = _mm_set_ss(4.2); + let r = _mm_cvtts_roundss_i32::<_MM_FROUND_NO_EXC>(a); + assert_eq!(r, 4i32); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_cvtts_roundss_si32() { + let a = _mm_set_ss(6.8); + let r = _mm_cvtts_roundss_si32::<_MM_FROUND_NO_EXC>(a); + assert_eq!(r, 6i32); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_cvtts_roundss_u32() { + let a = _mm_set_ss(7.1); + let r = _mm_cvtts_roundss_u32::<_MM_FROUND_NO_EXC>(a); + assert_eq!(r, 7u32); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_cvtts_pd_epi32() { + let a = _mm_setr_pd(14.2, 15.8); + let r = _mm_cvtts_pd_epi32(a); + let expected = _mm_setr_epi32(14, 15, 0, 0); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_mask_cvtts_pd_epi32() { + let a = _mm_setr_pd(16.9, 17.1); + let src = _mm_setr_epi32(100, 200, 0, 0); + let r = _mm_mask_cvtts_pd_epi32(src, 0b01, a); + let expected = _mm_setr_epi32(16, 200, 0, 0); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_maskz_cvtts_pd_epi32() { + let a = _mm_setr_pd(18.5, 19.3); + let r = _mm_maskz_cvtts_pd_epi32(0b10, a); + let expected = _mm_setr_epi32(0, 19, 0, 0); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_cvtts_pd_epi32() { + let a = _mm256_setr_pd(20.7, 21.4, 22.9, 23.1); + let r = _mm256_cvtts_pd_epi32(a); + let expected = _mm_setr_epi32(20, 21, 22, 23); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_mask_cvtts_pd_epi32() { + let a = 
_mm256_setr_pd(24.8, 25.2, 26.6, 27.3); + let src = _mm_setr_epi32(100, 200, 300, 400); + let r = _mm256_mask_cvtts_pd_epi32(src, 0b0101, a); + let expected = _mm_setr_epi32(24, 200, 26, 400); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_maskz_cvtts_pd_epi32() { + let a = _mm256_setr_pd(28.9, 29.1, 30.4, 31.7); + let r = _mm256_maskz_cvtts_pd_epi32(0b1010, a); + let expected = _mm_setr_epi32(0, 29, 0, 31); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_cvtts_pd_epi32() { + let a = _mm512_setr_pd(32.5, 33.2, 34.8, 35.3, 36.6, 37.9, 38.1, 39.4); + let r = _mm512_cvtts_pd_epi32(a); + let expected = _mm256_setr_epi32(32, 33, 34, 35, 36, 37, 38, 39); + assert_eq_m256i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_cvtts_pd_epi32() { + let a = _mm512_setr_pd(40.7, 41.3, 42.9, 43.2, 44.5, 45.8, 46.1, 47.6); + let src = _mm256_setr_epi32(100, 200, 300, 400, 500, 600, 700, 800); + let r = _mm512_mask_cvtts_pd_epi32(src, 0b10101010, a); + let expected = _mm256_setr_epi32(100, 41, 300, 43, 500, 45, 700, 47); + assert_eq_m256i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_cvtts_pd_epi32() { + let a = _mm512_setr_pd(48.4, 49.8, 50.3, 51.7, 52.1, 53.9, 54.2, 55.6); + let r = _mm512_maskz_cvtts_pd_epi32(0b11110000, a); + let expected = _mm256_setr_epi32(0, 0, 0, 0, 52, 53, 54, 55); + assert_eq_m256i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_cvtts_roundpd_epi32() { + let a = _mm512_setr_pd(56.5, 57.2, 58.8, 59.3, 60.6, 61.9, 62.1, 63.4); + let r = _mm512_cvtts_roundpd_epi32::<_MM_FROUND_NO_EXC>(a); + let expected = _mm256_setr_epi32(56, 57, 58, 59, 60, 61, 62, 63); + assert_eq_m256i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_cvtts_roundpd_epi32() { + let a = _mm512_setr_pd(64.7, 65.3, 66.9, 67.2, 68.5, 69.8, 70.1, 71.6); + let src = _mm256_setr_epi32(100, 200, 300, 400, 500, 600, 700, 800); + let r = _mm512_mask_cvtts_roundpd_epi32::<_MM_FROUND_NO_EXC>(src, 0b01010101, a); + let expected = _mm256_setr_epi32(64, 200, 66, 400, 68, 600, 70, 800); + assert_eq_m256i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_cvtts_roundpd_epi32() { + let a = _mm512_setr_pd(72.4, 73.8, 74.3, 75.7, 76.1, 77.9, 78.2, 79.6); + let r = _mm512_maskz_cvtts_roundpd_epi32::<_MM_FROUND_NO_EXC>(0b00001111, a); + let expected = _mm256_setr_epi32(72, 73, 74, 75, 0, 0, 0, 0); + assert_eq_m256i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_cvtts_pd_epu32() { + let a = _mm_setr_pd(14.2, 15.8); + let r = _mm_cvtts_pd_epu32(a); + let expected = _mm_setr_epi32(14, 15, 0, 0); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_mask_cvtts_pd_epu32() { + let a = _mm_setr_pd(16.9, 17.1); + let src = _mm_setr_epi32(100, 200, 0, 0); + let r = _mm_mask_cvtts_pd_epu32(src, 0b01, a); + let expected = _mm_setr_epi32(16, 200, 0, 0); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_maskz_cvtts_pd_epu32() { + let a = _mm_setr_pd(18.5, 19.3); + let r = _mm_maskz_cvtts_pd_epu32(0b10, a); + let expected = _mm_setr_epi32(0, 19, 0, 0); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_cvtts_pd_epu32() { + let a = _mm256_setr_pd(20.7, 21.4, 22.9, 23.1); + let r = _mm256_cvtts_pd_epu32(a); + let expected = _mm_setr_epi32(20, 21, 22, 23); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = 
"avx10.2")] + fn test_mm256_mask_cvtts_pd_epu32() { + let a = _mm256_setr_pd(24.8, 25.2, 26.6, 27.3); + let src = _mm_setr_epi32(100, 200, 300, 400); + let r = _mm256_mask_cvtts_pd_epu32(src, 0b0101, a); + let expected = _mm_setr_epi32(24, 200, 26, 400); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_maskz_cvtts_pd_epu32() { + let a = _mm256_setr_pd(28.9, 29.1, 30.4, 31.7); + let r = _mm256_maskz_cvtts_pd_epu32(0b1010, a); + let expected = _mm_setr_epi32(0, 29, 0, 31); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_cvtts_pd_epu32() { + let a = _mm512_setr_pd(32.5, 33.2, 34.8, 35.3, 36.6, 37.9, 38.1, 39.4); + let r = _mm512_cvtts_pd_epu32(a); + let expected = _mm256_setr_epi32(32, 33, 34, 35, 36, 37, 38, 39); + assert_eq_m256i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_cvtts_pd_epu32() { + let a = _mm512_setr_pd(40.7, 41.3, 42.9, 43.2, 44.5, 45.8, 46.1, 47.6); + let src = _mm256_setr_epi32(100, 200, 300, 400, 500, 600, 700, 800); + let r = _mm512_mask_cvtts_pd_epu32(src, 0b10101010, a); + let expected = _mm256_setr_epi32(100, 41, 300, 43, 500, 45, 700, 47); + assert_eq_m256i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_cvtts_pd_epu32() { + let a = _mm512_setr_pd(48.4, 49.8, 50.3, 51.7, 52.1, 53.9, 54.2, 55.6); + let r = _mm512_maskz_cvtts_pd_epu32(0b11110000, a); + let expected = _mm256_setr_epi32(0, 0, 0, 0, 52, 53, 54, 55); + assert_eq_m256i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_cvtts_roundpd_epu32() { + let a = _mm512_setr_pd(56.5, 57.2, 58.8, 59.3, 60.6, 61.9, 62.1, 63.4); + let r = _mm512_cvtts_roundpd_epu32::<_MM_FROUND_NO_EXC>(a); + let expected = _mm256_setr_epi32(56, 57, 58, 59, 60, 61, 62, 63); + assert_eq_m256i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_cvtts_roundpd_epu32() { + let a = _mm512_setr_pd(64.7, 65.3, 66.9, 67.2, 68.5, 69.8, 70.1, 71.6); + let src = _mm256_setr_epi32(100, 200, 300, 400, 500, 600, 700, 800); + let r = _mm512_mask_cvtts_roundpd_epu32::<_MM_FROUND_NO_EXC>(src, 0b01010101, a); + let expected = _mm256_setr_epi32(64, 200, 66, 400, 68, 600, 70, 800); + assert_eq_m256i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_cvtts_roundpd_epu32() { + let a = _mm512_setr_pd(72.4, 73.8, 74.3, 75.7, 76.1, 77.9, 78.2, 79.6); + let r = _mm512_maskz_cvtts_roundpd_epu32::<_MM_FROUND_NO_EXC>(0b00001111, a); + let expected = _mm256_setr_epi32(72, 73, 74, 75, 0, 0, 0, 0); + assert_eq_m256i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_cvtts_pd_epi64() { + let a = _mm_setr_pd(80.5, 81.9); + let r = _mm_cvtts_pd_epi64(a); + let expected = _mm_setr_epi64x(80, 81); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_mask_cvtts_pd_epi64() { + let a = _mm_setr_pd(82.3, 83.7); + let src = _mm_setr_epi64x(100, 200); + let r = _mm_mask_cvtts_pd_epi64(src, 0b01, a); + let expected = _mm_setr_epi64x(82, 200); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_maskz_cvtts_pd_epi64() { + let a = _mm_setr_pd(84.8, 85.2); + let r = _mm_maskz_cvtts_pd_epi64(0b10, a); + let expected = _mm_setr_epi64x(0, 85); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_cvtts_pd_epi64() { + let a = _mm256_setr_pd(86.4, 87.6, 88.1, 89.9); + let r = _mm256_cvtts_pd_epi64(a); + let expected = _mm256_setr_epi64x(86, 87, 88, 89); + 
assert_eq_m256i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_mask_cvtts_pd_epi64() { + let a = _mm256_setr_pd(90.7, 91.3, 92.8, 93.2); + let src = _mm256_setr_epi64x(100, 200, 300, 400); + let r = _mm256_mask_cvtts_pd_epi64(src, 0b0101, a); + let expected = _mm256_setr_epi64x(90, 200, 92, 400); + assert_eq_m256i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_maskz_cvtts_pd_epi64() { + let a = _mm256_setr_pd(94.5, 95.1, 96.9, 97.4); + let r = _mm256_maskz_cvtts_pd_epi64(0b1010, a); + let expected = _mm256_setr_epi64x(0, 95, 0, 97); + assert_eq_m256i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_cvtts_pd_epi64() { + let a = _mm512_setr_pd(98.6, 99.2, 100.8, 101.3, 102.7, 103.9, 104.1, 105.5); + let r = _mm512_cvtts_pd_epi64(a); + let expected = _mm512_setr_epi64(98, 99, 100, 101, 102, 103, 104, 105); + assert_eq_m512i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_cvtts_pd_epi64() { + let a = _mm512_setr_pd(106.4, 107.8, 108.2, 109.6, 110.3, 111.7, 112.9, 113.1); + let src = _mm512_setr_epi64(100, 200, 300, 400, 500, 600, 700, 800); + let r = _mm512_mask_cvtts_pd_epi64(src, 0b10101010, a); + let expected = _mm512_setr_epi64(100, 107, 300, 109, 500, 111, 700, 113); + assert_eq_m512i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_cvtts_pd_epi64() { + let a = _mm512_setr_pd(114.5, 115.8, 116.3, 117.7, 118.2, 119.9, 120.4, 121.6); + let r = _mm512_maskz_cvtts_pd_epi64(0b11110000, a); + let expected = _mm512_setr_epi64(0, 0, 0, 0, 118, 119, 120, 121); + assert_eq_m512i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_cvtts_roundpd_epi64() { + let a = _mm512_setr_pd(122.7, 123.3, 124.9, 125.1, 126.5, 127.8, 128.2, 129.6); + let r = _mm512_cvtts_roundpd_epi64::<_MM_FROUND_NO_EXC>(a); + let expected = _mm512_setr_epi64(122, 123, 124, 125, 126, 127, 128, 129); + assert_eq_m512i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_cvtts_roundpd_epi64() { + let a = _mm512_setr_pd(130.4, 131.8, 132.3, 133.7, 134.1, 135.9, 136.5, 137.2); + let src = _mm512_setr_epi64(100, 200, 300, 400, 500, 600, 700, 800); + let r = _mm512_mask_cvtts_roundpd_epi64::<_MM_FROUND_NO_EXC>(src, 0b01010101, a); + let expected = _mm512_setr_epi64(130, 200, 132, 400, 134, 600, 136, 800); + assert_eq_m512i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_cvtts_roundpd_epi64() { + let a = _mm512_setr_pd(138.6, 139.4, 140.8, 141.2, 142.7, 143.9, 144.3, 145.5); + let r = _mm512_maskz_cvtts_roundpd_epi64::<_MM_FROUND_NO_EXC>(0b00001111, a); + let expected = _mm512_setr_epi64(138, 139, 140, 141, 0, 0, 0, 0); + assert_eq_m512i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_cvtts_pd_epu64() { + let a = _mm_setr_pd(146.7, 147.9); + let r = _mm_cvtts_pd_epu64(a); + let expected = _mm_setr_epi64x(146, 147); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_mask_cvtts_pd_epu64() { + let a = _mm_setr_pd(148.3, 149.6); + let src = _mm_setr_epi64x(100, 200); + let r = _mm_mask_cvtts_pd_epu64(src, 0b01, a); + let expected = _mm_setr_epi64x(148, 200); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_maskz_cvtts_pd_epu64() { + let a = _mm_setr_pd(150.8, 151.2); + let r = _mm_maskz_cvtts_pd_epu64(0b10, a); + let expected = _mm_setr_epi64x(0, 151); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_cvtts_pd_epu64() 
{ + let a = _mm256_setr_pd(152.4, 153.7, 154.1, 155.9); + let r = _mm256_cvtts_pd_epu64(a); + let expected = _mm256_setr_epi64x(152, 153, 154, 155); + assert_eq_m256i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_mask_cvtts_pd_epu64() { + let a = _mm256_setr_pd(156.5, 157.3, 158.8, 159.2); + let src = _mm256_setr_epi64x(100, 200, 300, 400); + let r = _mm256_mask_cvtts_pd_epu64(src, 0b0101, a); + let expected = _mm256_setr_epi64x(156, 200, 158, 400); + assert_eq_m256i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_maskz_cvtts_pd_epu64() { + let a = _mm256_setr_pd(160.6, 161.1, 162.9, 163.4); + let r = _mm256_maskz_cvtts_pd_epu64(0b1010, a); + let expected = _mm256_setr_epi64x(0, 161, 0, 163); + assert_eq_m256i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_cvtts_pd_epu64() { + let a = _mm512_setr_pd(164.7, 165.2, 166.8, 167.3, 168.5, 169.9, 170.1, 171.6); + let r = _mm512_cvtts_pd_epu64(a); + let expected = _mm512_setr_epi64(164, 165, 166, 167, 168, 169, 170, 171); + assert_eq_m512i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_cvtts_pd_epu64() { + let a = _mm512_setr_pd(172.4, 173.8, 174.2, 175.6, 176.3, 177.7, 178.9, 179.1); + let src = _mm512_setr_epi64(100, 200, 300, 400, 500, 600, 700, 800); + let r = _mm512_mask_cvtts_pd_epu64(src, 0b10101010, a); + let expected = _mm512_setr_epi64(100, 173, 300, 175, 500, 177, 700, 179); + assert_eq_m512i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_cvtts_pd_epu64() { + let a = _mm512_setr_pd(180.5, 181.8, 182.3, 183.7, 184.2, 185.9, 186.4, 187.6); + let r = _mm512_maskz_cvtts_pd_epu64(0b11110000, a); + let expected = _mm512_setr_epi64(0, 0, 0, 0, 184, 185, 186, 187); + assert_eq_m512i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_cvtts_roundpd_epu64() { + let a = _mm512_setr_pd(188.7, 189.3, 190.9, 191.1, 192.5, 193.8, 194.2, 195.6); + let r = _mm512_cvtts_roundpd_epu64::<_MM_FROUND_NO_EXC>(a); + let expected = _mm512_setr_epi64(188, 189, 190, 191, 192, 193, 194, 195); + assert_eq_m512i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_cvtts_roundpd_epu64() { + let a = _mm512_setr_pd(196.4, 197.8, 198.3, 199.7, 200.1, 201.9, 202.5, 203.2); + let src = _mm512_setr_epi64(100, 200, 300, 400, 500, 600, 700, 800); + let r = _mm512_mask_cvtts_roundpd_epu64::<_MM_FROUND_NO_EXC>(src, 0b01010101, a); + let expected = _mm512_setr_epi64(196, 200, 198, 400, 200, 600, 202, 800); + assert_eq_m512i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_cvtts_roundpd_epu64() { + let a = _mm512_setr_pd(204.6, 205.4, 206.8, 207.2, 208.7, 209.9, 210.3, 211.5); + let r = _mm512_maskz_cvtts_roundpd_epu64::<_MM_FROUND_NO_EXC>(0b00001111, a); + let expected = _mm512_setr_epi64(204, 205, 206, 207, 0, 0, 0, 0); + assert_eq_m512i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_cvtts_ps_epi32() { + let a = _mm_setr_ps(212.3, 213.7, 214.9, 215.2); + let r = _mm_cvtts_ps_epi32(a); + let expected = _mm_setr_epi32(212, 213, 214, 215); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_mask_cvtts_ps_epi32() { + let a = _mm_setr_ps(216.5, 217.8, 218.1, 219.6); + let src = _mm_setr_epi32(100, 200, 300, 400); + let r = _mm_mask_cvtts_ps_epi32(src, 0b0101, a); + let expected = _mm_setr_epi32(216, 200, 218, 400); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_maskz_cvtts_ps_epi32() { + let 
a = _mm_setr_ps(220.4, 221.9, 222.3, 223.7); + let r = _mm_maskz_cvtts_ps_epi32(0b1010, a); + let expected = _mm_setr_epi32(0, 221, 0, 223); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_cvtts_ps_epi32() { + let a = _mm256_setr_ps(224.6, 225.2, 226.8, 227.3, 228.5, 229.9, 230.1, 231.7); + let r = _mm256_cvtts_ps_epi32(a); + let expected = _mm256_setr_epi32(224, 225, 226, 227, 228, 229, 230, 231); + assert_eq_m256i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_mask_cvtts_ps_epi32() { + let a = _mm256_setr_ps(232.4, 233.8, 234.2, 235.6, 236.3, 237.7, 238.9, 239.1); + let src = _mm256_setr_epi32(100, 200, 300, 400, 500, 600, 700, 800); + let r = _mm256_mask_cvtts_ps_epi32(src, 0b10101010, a); + let expected = _mm256_setr_epi32(100, 233, 300, 235, 500, 237, 700, 239); + assert_eq_m256i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_maskz_cvtts_ps_epi32() { + let a = _mm256_setr_ps(240.5, 241.8, 242.3, 243.7, 244.2, 245.9, 246.4, 247.6); + let r = _mm256_maskz_cvtts_ps_epi32(0b11110000, a); + let expected = _mm256_setr_epi32(0, 0, 0, 0, 244, 245, 246, 247); + assert_eq_m256i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_cvtts_ps_epi32() { + let a = _mm512_setr_ps( + 248.7, 249.3, 250.9, 251.1, 252.5, 253.8, 254.2, 255.6, 256.4, 257.8, 258.3, 259.7, + 260.1, 261.9, 262.5, 263.2, + ); + let r = _mm512_cvtts_ps_epi32(a); + let expected = _mm512_setr_epi32( + 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, + ); + assert_eq_m512i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_cvtts_ps_epi32() { + let a = _mm512_setr_ps( + 264.6, 265.4, 266.8, 267.2, 268.7, 269.9, 270.3, 271.5, 272.1, 273.6, 274.9, 275.3, + 276.8, 277.2, 278.5, 279.7, + ); + let src = _mm512_setr_epi32( + 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, + ); + let r = _mm512_mask_cvtts_ps_epi32(src, 0b1010101010101010, a); + let expected = _mm512_setr_epi32( + 100, 265, 300, 267, 500, 269, 700, 271, 900, 273, 1100, 275, 1300, 277, 1500, 279, + ); + assert_eq_m512i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_cvtts_ps_epi32() { + let a = _mm512_setr_ps( + 280.4, 281.9, 282.3, 283.7, 284.1, 285.8, 286.5, 287.2, 288.6, 289.4, 290.8, 291.3, + 292.7, 293.1, 294.9, 295.5, + ); + let r = _mm512_maskz_cvtts_ps_epi32(0b1111111100000000, a); + let expected = _mm512_setr_epi32( + 0, 0, 0, 0, 0, 0, 0, 0, 288, 289, 290, 291, 292, 293, 294, 295, + ); + assert_eq_m512i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_cvtts_roundps_epi32() { + let a = _mm512_setr_ps( + 296.7, 297.3, 298.9, 299.1, 300.5, 301.8, 302.2, 303.6, 304.4, 305.8, 306.3, 307.7, + 308.1, 309.9, 310.5, 311.2, + ); + let r = _mm512_cvtts_roundps_epi32::<_MM_FROUND_NO_EXC>(a); + let expected = _mm512_setr_epi32( + 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, + ); + assert_eq_m512i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_cvtts_roundps_epi32() { + let a = _mm512_setr_ps( + 312.6, 313.4, 314.8, 315.2, 316.7, 317.9, 318.3, 319.5, 320.1, 321.6, 322.9, 323.3, + 324.8, 325.2, 326.5, 327.7, + ); + let src = _mm512_setr_epi32( + 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, + ); + let r = _mm512_mask_cvtts_roundps_epi32::<_MM_FROUND_NO_EXC>(src, 0b0101010101010101, a); + let expected = _mm512_setr_epi32( + 312, 
200, 314, 400, 316, 600, 318, 800, 320, 1000, 322, 1200, 324, 1400, 326, 1600, + ); + assert_eq_m512i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_cvtts_roundps_epi32() { + let a = _mm512_setr_ps( + 328.4, 329.9, 330.3, 331.7, 332.1, 333.8, 334.5, 335.2, 336.6, 337.4, 338.8, 339.3, + 340.7, 341.1, 342.9, 343.5, + ); + let r = _mm512_maskz_cvtts_roundps_epi32::<_MM_FROUND_NO_EXC>(0b0000000011111111, a); + let expected = _mm512_setr_epi32( + 328, 329, 330, 331, 332, 333, 334, 335, 0, 0, 0, 0, 0, 0, 0, 0, + ); + assert_eq_m512i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_cvtts_ps_epu32() { + let a = _mm_setr_ps(344.7, 345.3, 346.9, 347.1); + let r = _mm_cvtts_ps_epu32(a); + let expected = _mm_setr_epi32(344, 345, 346, 347); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_mask_cvtts_ps_epu32() { + let a = _mm_setr_ps(348.5, 349.8, 350.2, 351.6); + let src = _mm_setr_epi32(100, 200, 300, 400); + let r = _mm_mask_cvtts_ps_epu32(src, 0b0101, a); + let expected = _mm_setr_epi32(348, 200, 350, 400); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_maskz_cvtts_ps_epu32() { + let a = _mm_setr_ps(352.4, 353.9, 354.3, 355.7); + let r = _mm_maskz_cvtts_ps_epu32(0b1010, a); + let expected = _mm_setr_epi32(0, 353, 0, 355); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_cvtts_ps_epu32() { + let a = _mm256_setr_ps(356.6, 357.2, 358.8, 359.3, 360.5, 361.9, 362.1, 363.7); + let r = _mm256_cvtts_ps_epu32(a); + let expected = _mm256_setr_epi32(356, 357, 358, 359, 360, 361, 362, 363); + assert_eq_m256i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_mask_cvtts_ps_epu32() { + let a = _mm256_setr_ps(364.4, 365.8, 366.2, 367.6, 368.3, 369.7, 370.9, 371.1); + let src = _mm256_setr_epi32(100, 200, 300, 400, 500, 600, 700, 800); + let r = _mm256_mask_cvtts_ps_epu32(src, 0b10101010, a); + let expected = _mm256_setr_epi32(100, 365, 300, 367, 500, 369, 700, 371); + assert_eq_m256i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_maskz_cvtts_ps_epu32() { + let a = _mm256_setr_ps(372.5, 373.8, 374.3, 375.7, 376.2, 377.9, 378.4, 379.6); + let r = _mm256_maskz_cvtts_ps_epu32(0b11110000, a); + let expected = _mm256_setr_epi32(0, 0, 0, 0, 376, 377, 378, 379); + assert_eq_m256i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_cvtts_ps_epu32() { + let a = _mm512_setr_ps( + 380.7, 381.3, 382.9, 383.1, 384.5, 385.8, 386.2, 387.6, 388.4, 389.8, 390.3, 391.7, + 392.1, 393.9, 394.5, 395.2, + ); + let r = _mm512_cvtts_ps_epu32(a); + let expected = _mm512_setr_epi32( + 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, + ); + assert_eq_m512i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_cvtts_ps_epu32() { + let a = _mm512_setr_ps( + 396.6, 397.4, 398.8, 399.2, 400.7, 401.9, 402.3, 403.5, 404.1, 405.6, 406.9, 407.3, + 408.8, 409.2, 410.5, 411.7, + ); + let src = _mm512_setr_epi32( + 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, + ); + let r = _mm512_mask_cvtts_ps_epu32(src, 0b1010101010101010, a); + let expected = _mm512_setr_epi32( + 100, 397, 300, 399, 500, 401, 700, 403, 900, 405, 1100, 407, 1300, 409, 1500, 411, + ); + assert_eq_m512i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_cvtts_ps_epu32() { + let a = _mm512_setr_ps( + 412.4, 413.9, 414.3, 415.7, 416.1, 417.8, 
418.5, 419.2, 420.6, 421.4, 422.8, 423.3, + 424.7, 425.1, 426.9, 427.5, + ); + let r = _mm512_maskz_cvtts_ps_epu32(0b1111111100000000, a); + let expected = _mm512_setr_epi32( + 0, 0, 0, 0, 0, 0, 0, 0, 420, 421, 422, 423, 424, 425, 426, 427, + ); + assert_eq_m512i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_cvtts_roundps_epu32() { + let a = _mm512_setr_ps( + 428.7, 429.3, 430.9, 431.1, 432.5, 433.8, 434.2, 435.6, 436.4, 437.8, 438.3, 439.7, + 440.1, 441.9, 442.5, 443.2, + ); + let r = _mm512_cvtts_roundps_epu32::<_MM_FROUND_NO_EXC>(a); + let expected = _mm512_setr_epi32( + 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, + ); + assert_eq_m512i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_cvtts_roundps_epu32() { + let a = _mm512_setr_ps( + 444.6, 445.4, 446.8, 447.2, 448.7, 449.9, 450.3, 451.5, 452.1, 453.6, 454.9, 455.3, + 456.8, 457.2, 458.5, 459.7, + ); + let src = _mm512_setr_epi32( + 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, + ); + let r = _mm512_mask_cvtts_roundps_epu32::<_MM_FROUND_NO_EXC>(src, 0b0101010101010101, a); + let expected = _mm512_setr_epi32( + 444, 200, 446, 400, 448, 600, 450, 800, 452, 1000, 454, 1200, 456, 1400, 458, 1600, + ); + assert_eq_m512i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_cvtts_roundps_epu32() { + let a = _mm512_setr_ps( + 460.4, 461.9, 462.3, 463.7, 464.1, 465.8, 466.5, 467.2, 468.6, 469.4, 470.8, 471.3, + 472.7, 473.1, 474.9, 475.5, + ); + let r = _mm512_maskz_cvtts_roundps_epu32::<_MM_FROUND_NO_EXC>(0b0000000011111111, a); + let expected = _mm512_setr_epi32( + 460, 461, 462, 463, 464, 465, 466, 467, 0, 0, 0, 0, 0, 0, 0, 0, + ); + assert_eq_m512i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_cvtts_ps_epi64() { + let a = _mm_setr_ps(476.7, 477.3, 478.9, 479.1); + let r = _mm_cvtts_ps_epi64(a); + let expected = _mm_setr_epi64x(476, 477); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_mask_cvtts_ps_epi64() { + let a = _mm_setr_ps(480.5, 481.8, 482.2, 483.6); + let src = _mm_setr_epi64x(100, 200); + let r = _mm_mask_cvtts_ps_epi64(src, 0b01, a); + let expected = _mm_setr_epi64x(480, 200); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_maskz_cvtts_ps_epi64() { + let a = _mm_setr_ps(484.4, 485.9, 486.3, 487.7); + let r = _mm_maskz_cvtts_ps_epi64(0b10, a); + let expected = _mm_setr_epi64x(0, 485); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_cvtts_ps_epi64() { + let a = _mm_setr_ps(488.6, 489.2, 490.8, 491.3); + let r = _mm256_cvtts_ps_epi64(a); + let expected = _mm256_setr_epi64x(488, 489, 490, 491); + assert_eq_m256i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_mask_cvtts_ps_epi64() { + let a = _mm_setr_ps(496.4, 497.8, 498.2, 499.6); + let src = _mm256_setr_epi64x(100, 200, 300, 400); + let r = _mm256_mask_cvtts_ps_epi64(src, 0b0101, a); + let expected = _mm256_setr_epi64x(496, 200, 498, 400); + assert_eq_m256i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_maskz_cvtts_ps_epi64() { + let a = _mm_setr_ps(504.5, 505.8, 506.3, 507.7); + let r = _mm256_maskz_cvtts_ps_epi64(0b1010, a); + let expected = _mm256_setr_epi64x(0, 505, 0, 507); + assert_eq_m256i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_cvtts_ps_epi64() { + let a = _mm256_setr_ps(512.7, 513.3, 514.9, 515.1, 516.5, 
517.8, 518.2, 519.6); + let r = _mm512_cvtts_ps_epi64(a); + let expected = _mm512_setr_epi64(512, 513, 514, 515, 516, 517, 518, 519); + assert_eq_m512i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_cvtts_ps_epi64() { + let a = _mm256_setr_ps(520.4, 521.8, 522.3, 523.7, 524.1, 525.9, 526.5, 527.2); + let src = _mm512_setr_epi64(100, 200, 300, 400, 500, 600, 700, 800); + let r = _mm512_mask_cvtts_ps_epi64(src, 0b10101010, a); + let expected = _mm512_setr_epi64(100, 521, 300, 523, 500, 525, 700, 527); + assert_eq_m512i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_cvtts_ps_epi64() { + let a = _mm256_setr_ps(528.6, 529.4, 530.8, 531.2, 532.7, 533.9, 534.3, 535.5); + let r = _mm512_maskz_cvtts_ps_epi64(0b11110000, a); + let expected = _mm512_setr_epi64(0, 0, 0, 0, 532, 533, 534, 535); + assert_eq_m512i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_cvtts_roundps_epi64() { + let a = _mm256_setr_ps(536.7, 537.3, 538.9, 539.1, 540.5, 541.8, 542.2, 543.6); + let r = _mm512_cvtts_roundps_epi64::<_MM_FROUND_NO_EXC>(a); + let expected = _mm512_setr_epi64(536, 537, 538, 539, 540, 541, 542, 543); + assert_eq_m512i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_cvtts_roundps_epi64() { + let a = _mm256_setr_ps(544.4, 545.8, 546.3, 547.7, 548.1, 549.9, 550.5, 551.2); + let src = _mm512_setr_epi64(100, 200, 300, 400, 500, 600, 700, 800); + let r = _mm512_mask_cvtts_roundps_epi64::<_MM_FROUND_NO_EXC>(src, 0b01010101, a); + let expected = _mm512_setr_epi64(544, 200, 546, 400, 548, 600, 550, 800); + assert_eq_m512i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_cvtts_roundps_epi64() { + let a = _mm256_setr_ps(552.6, 553.4, 554.8, 555.3, 556.7, 557.9, 558.2, 559.5); + let r = _mm512_maskz_cvtts_roundps_epi64::<_MM_FROUND_NO_EXC>(0b00001111, a); + let expected = _mm512_setr_epi64(552, 553, 554, 555, 0, 0, 0, 0); + assert_eq_m512i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_cvtts_ps_epu64() { + let a = _mm_setr_ps(560.7, 561.3, 562.9, 563.1); + let r = _mm_cvtts_ps_epu64(a); + let expected = _mm_setr_epi64x(560, 561); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_mask_cvtts_ps_epu64() { + let a = _mm_setr_ps(564.5, 565.8, 566.2, 567.6); + let src = _mm_setr_epi64x(100, 200); + let r = _mm_mask_cvtts_ps_epu64(src, 0b01, a); + let expected = _mm_setr_epi64x(564, 200); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_maskz_cvtts_ps_epu64() { + let a = _mm_setr_ps(568.4, 569.9, 570.3, 571.7); + let r = _mm_maskz_cvtts_ps_epu64(0b10, a); + let expected = _mm_setr_epi64x(0, 569); + assert_eq_m128i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_cvtts_ps_epu64() { + let a = _mm_setr_ps(572.6, 573.2, 574.8, 575.3); + let r = _mm256_cvtts_ps_epu64(a); + let expected = _mm256_setr_epi64x(572, 573, 574, 575); + assert_eq_m256i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_mask_cvtts_ps_epu64() { + let a = _mm_setr_ps(580.4, 581.8, 582.2, 583.6); + let src = _mm256_setr_epi64x(100, 200, 300, 400); + let r = _mm256_mask_cvtts_ps_epu64(src, 0b0101, a); + let expected = _mm256_setr_epi64x(580, 200, 582, 400); + assert_eq_m256i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_maskz_cvtts_ps_epu64() { + let a = _mm_setr_ps(588.5, 589.8, 590.3, 591.7); + let r = _mm256_maskz_cvtts_ps_epu64(0b1010, a); + let expected = 
_mm256_setr_epi64x(0, 589, 0, 591); + assert_eq_m256i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_cvtts_ps_epu64() { + let a = _mm256_setr_ps(596.7, 597.3, 598.9, 599.1, 600.5, 601.8, 602.2, 603.6); + let r = _mm512_cvtts_ps_epu64(a); + let expected = _mm512_setr_epi64(596, 597, 598, 599, 600, 601, 602, 603); + assert_eq_m512i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_cvtts_ps_epu64() { + let a = _mm256_setr_ps(604.4, 605.8, 606.3, 607.7, 608.1, 609.9, 610.5, 611.2); + let src = _mm512_setr_epi64(100, 200, 300, 400, 500, 600, 700, 800); + let r = _mm512_mask_cvtts_ps_epu64(src, 0b10101010, a); + let expected = _mm512_setr_epi64(100, 605, 300, 607, 500, 609, 700, 611); + assert_eq_m512i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_cvtts_ps_epu64() { + let a = _mm256_setr_ps(612.6, 613.4, 614.8, 615.2, 616.7, 617.9, 618.3, 619.5); + let r = _mm512_maskz_cvtts_ps_epu64(0b11110000, a); + let expected = _mm512_setr_epi64(0, 0, 0, 0, 616, 617, 618, 619); + assert_eq_m512i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_cvtts_roundps_epu64() { + let a = _mm256_setr_ps(620.7, 621.3, 622.9, 623.1, 624.5, 625.8, 626.2, 627.6); + let r = _mm512_cvtts_roundps_epu64::<_MM_FROUND_NO_EXC>(a); + let expected = _mm512_setr_epi64(620, 621, 622, 623, 624, 625, 626, 627); + assert_eq_m512i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_cvtts_roundps_epu64() { + let a = _mm256_setr_ps(628.4, 629.8, 630.3, 631.7, 632.1, 633.9, 634.5, 635.2); + let src = _mm512_setr_epi64(100, 200, 300, 400, 500, 600, 700, 800); + let r = _mm512_mask_cvtts_roundps_epu64::<_MM_FROUND_NO_EXC>(src, 0b01010101, a); + let expected = _mm512_setr_epi64(628, 200, 630, 400, 632, 600, 634, 800); + assert_eq_m512i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_cvtts_roundps_epu64() { + let a = _mm256_setr_ps(636.6, 637.4, 638.8, 639.3, 640.7, 641.9, 642.2, 643.5); + let r = _mm512_maskz_cvtts_roundps_epu64::<_MM_FROUND_NO_EXC>(0b00001111, a); + let expected = _mm512_setr_epi64(636, 637, 638, 639, 0, 0, 0, 0); + assert_eq_m512i(r, expected); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_minmax_pd() { + let a = _mm_setr_pd(1.0, 2.0); + let b = _mm_setr_pd(3.0, 0.0); + + let r = _mm_minmax_pd::<0>(a, b); + let e = _mm_setr_pd(1.0, 0.0); + assert_eq_m128d(r, e); + + let r = _mm_minmax_pd::<1>(a, b); + let e = _mm_setr_pd(3.0, 2.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_mask_minmax_pd() { + let a = _mm_setr_pd(1.0, 2.0); + let b = _mm_setr_pd(3.0, 0.0); + let src = _mm_setr_pd(20.0, 30.0); + + let r = _mm_mask_minmax_pd::<0>(src, 0b01, a, b); + let e = _mm_setr_pd(1.0, 30.0); + assert_eq_m128d(r, e); + + let r = _mm_mask_minmax_pd::<1>(src, 0b01, a, b); + let e = _mm_setr_pd(3.0, 30.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_maskz_minmax_pd() { + let a = _mm_setr_pd(1.0, 2.0); + let b = _mm_setr_pd(3.0, 0.0); + + let r = _mm_maskz_minmax_pd::<0>(0b01, a, b); + let e = _mm_setr_pd(1.0, 0.0); + assert_eq_m128d(r, e); + + let r = _mm_maskz_minmax_pd::<1>(0b01, a, b); + let e = _mm_setr_pd(3.0, 0.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_minmax_pd() { + let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0); + let b = _mm256_setr_pd(5.0, 6.0, 7.0, 8.0); + + let r = _mm256_minmax_pd::<0>(a, b); + let e = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0); + 
assert_eq_m256d(r, e); + + let r = _mm256_minmax_pd::<1>(a, b); + let e = _mm256_setr_pd(5.0, 6.0, 7.0, 8.0); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_mask_minmax_pd() { + let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0); + let b = _mm256_setr_pd(5.0, 6.0, 7.0, 8.0); + let src = _mm256_setr_pd(20.0, 30.0, 40.0, 50.0); + + let r = _mm256_mask_minmax_pd::<0>(src, 0b0101, a, b); + let e = _mm256_setr_pd(1.0, 30.0, 3.0, 50.0); + assert_eq_m256d(r, e); + + let r = _mm256_mask_minmax_pd::<1>(src, 0b0101, a, b); + let e = _mm256_setr_pd(5.0, 30.0, 7.0, 50.0); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_maskz_minmax_pd() { + let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0); + let b = _mm256_setr_pd(5.0, 6.0, 7.0, 8.0); + + let r = _mm256_maskz_minmax_pd::<0>(0b0101, a, b); + let e = _mm256_setr_pd(1.0, 0.0, 3.0, 0.0); + assert_eq_m256d(r, e); + + let r = _mm256_maskz_minmax_pd::<1>(0b0101, a, b); + let e = _mm256_setr_pd(5.0, 0.0, 7.0, 0.0); + assert_eq_m256d(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_minmax_pd() { + let a = _mm512_setr_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm512_setr_pd(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + + let r = _mm512_minmax_pd::<0>(a, b); + let e = _mm512_setr_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m512d(r, e); + + let r = _mm512_minmax_pd::<1>(a, b); + let e = _mm512_setr_pd(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_minmax_pd() { + let a = _mm512_setr_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm512_setr_pd(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let src = _mm512_setr_pd(20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0); + + let r = _mm512_mask_minmax_pd::<0>(src, 0b01010101, a, b); + let e = _mm512_setr_pd(1.0, 30.0, 3.0, 50.0, 5.0, 70.0, 7.0, 90.0); + assert_eq_m512d(r, e); + + let r = _mm512_mask_minmax_pd::<1>(src, 0b01010101, a, b); + let e = _mm512_setr_pd(9.0, 30.0, 11.0, 50.0, 13.0, 70.0, 15.0, 90.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_minmax_pd() { + let a = _mm512_setr_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm512_setr_pd(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + + let r = _mm512_maskz_minmax_pd::<0>(0b01010101, a, b); + let e = _mm512_setr_pd(1.0, 0.0, 3.0, 0.0, 5.0, 0.0, 7.0, 0.0); + assert_eq_m512d(r, e); + + let r = _mm512_maskz_minmax_pd::<1>(0b01010101, a, b); + let e = _mm512_setr_pd(9.0, 0.0, 11.0, 0.0, 13.0, 0.0, 15.0, 0.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_minmax_round_pd() { + let a = _mm512_setr_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm512_setr_pd(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + + let r = _mm512_minmax_round_pd::<0, _MM_FROUND_NO_EXC>(a, b); + let e = _mm512_setr_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m512d(r, e); + + let r = _mm512_minmax_round_pd::<1, _MM_FROUND_NO_EXC>(a, b); + let e = _mm512_setr_pd(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_minmax_round_pd() { + let a = _mm512_setr_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm512_setr_pd(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let src = _mm512_setr_pd(20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0); + + let r = _mm512_mask_minmax_round_pd::<0, _MM_FROUND_NO_EXC>(src, 
0b01010101, a, b); + let e = _mm512_setr_pd(1.0, 30.0, 3.0, 50.0, 5.0, 70.0, 7.0, 90.0); + assert_eq_m512d(r, e); + + let r = _mm512_mask_minmax_round_pd::<1, _MM_FROUND_NO_EXC>(src, 0b01010101, a, b); + let e = _mm512_setr_pd(9.0, 30.0, 11.0, 50.0, 13.0, 70.0, 15.0, 90.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_minmax_round_pd() { + let a = _mm512_setr_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm512_setr_pd(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + + let r = _mm512_maskz_minmax_round_pd::<0, _MM_FROUND_NO_EXC>(0b01010101, a, b); + let e = _mm512_setr_pd(1.0, 0.0, 3.0, 0.0, 5.0, 0.0, 7.0, 0.0); + assert_eq_m512d(r, e); + + let r = _mm512_maskz_minmax_round_pd::<1, _MM_FROUND_NO_EXC>(0b01010101, a, b); + let e = _mm512_setr_pd(9.0, 0.0, 11.0, 0.0, 13.0, 0.0, 15.0, 0.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_minmax_ps() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); + + let r = _mm_minmax_ps::<0>(a, b); + let e = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + assert_eq_m128(r, e); + + let r = _mm_minmax_ps::<1>(a, b); + let e = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_mask_minmax_ps() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); + let src = _mm_setr_ps(20.0, 30.0, 40.0, 50.0); + + let r = _mm_mask_minmax_ps::<0>(src, 0b0101, a, b); + let e = _mm_setr_ps(1.0, 30.0, 3.0, 50.0); + assert_eq_m128(r, e); + + let r = _mm_mask_minmax_ps::<1>(src, 0b0101, a, b); + let e = _mm_setr_ps(5.0, 30.0, 7.0, 50.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_maskz_minmax_ps() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); + + let r = _mm_maskz_minmax_ps::<0>(0b0101, a, b); + let e = _mm_setr_ps(1.0, 0.0, 3.0, 0.0); + assert_eq_m128(r, e); + + let r = _mm_maskz_minmax_ps::<1>(0b0101, a, b); + let e = _mm_setr_ps(5.0, 0.0, 7.0, 0.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_minmax_ps() { + let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm256_setr_ps(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + + let r = _mm256_minmax_ps::<0>(a, b); + let e = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m256(r, e); + + let r = _mm256_minmax_ps::<1>(a, b); + let e = _mm256_setr_ps(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_mask_minmax_ps() { + let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm256_setr_ps(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let src = _mm256_setr_ps(20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0); + + let r = _mm256_mask_minmax_ps::<0>(src, 0b01010101, a, b); + let e = _mm256_setr_ps(1.0, 30.0, 3.0, 50.0, 5.0, 70.0, 7.0, 90.0); + assert_eq_m256(r, e); + + let r = _mm256_mask_minmax_ps::<1>(src, 0b01010101, a, b); + let e = _mm256_setr_ps(9.0, 30.0, 11.0, 50.0, 13.0, 70.0, 15.0, 90.0); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_maskz_minmax_ps() { + let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm256_setr_ps(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + + let r = _mm256_maskz_minmax_ps::<0>(0b01010101, a, b); + let e = _mm256_setr_ps(1.0, 0.0, 3.0, 0.0, 5.0, 0.0, 7.0, 0.0); + assert_eq_m256(r, e); + + let 
r = _mm256_maskz_minmax_ps::<1>(0b01010101, a, b); + let e = _mm256_setr_ps(9.0, 0.0, 11.0, 0.0, 13.0, 0.0, 15.0, 0.0); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_minmax_ps() { + let a = _mm512_setr_ps( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm512_setr_ps( + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + + let r = _mm512_minmax_ps::<0>(a, b); + let e = _mm512_setr_ps( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + assert_eq_m512(r, e); + + let r = _mm512_minmax_ps::<1>(a, b); + let e = _mm512_setr_ps( + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_minmax_ps() { + let a = _mm512_setr_ps( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm512_setr_ps( + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let src = _mm512_setr_ps( + 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0, 110.0, 120.0, 130.0, 140.0, + 150.0, 160.0, 170.0, + ); + + let r = _mm512_mask_minmax_ps::<0>(src, 0b0101010101010101, a, b); + let e = _mm512_setr_ps( + 1.0, 30.0, 3.0, 50.0, 5.0, 70.0, 7.0, 90.0, 9.0, 110.0, 11.0, 130.0, 13.0, 150.0, 15.0, + 170.0, + ); + assert_eq_m512(r, e); + + let r = _mm512_mask_minmax_ps::<1>(src, 0b0101010101010101, a, b); + let e = _mm512_setr_ps( + 17.0, 30.0, 19.0, 50.0, 21.0, 70.0, 23.0, 90.0, 25.0, 110.0, 27.0, 130.0, 29.0, 150.0, + 31.0, 170.0, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_minmax_ps() { + let a = _mm512_setr_ps( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm512_setr_ps( + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + + let r = _mm512_maskz_minmax_ps::<0>(0b0101010101010101, a, b); + let e = _mm512_setr_ps( + 1.0, 0.0, 3.0, 0.0, 5.0, 0.0, 7.0, 0.0, 9.0, 0.0, 11.0, 0.0, 13.0, 0.0, 15.0, 0.0, + ); + assert_eq_m512(r, e); + + let r = _mm512_maskz_minmax_ps::<1>(0b0101010101010101, a, b); + let e = _mm512_setr_ps( + 17.0, 0.0, 19.0, 0.0, 21.0, 0.0, 23.0, 0.0, 25.0, 0.0, 27.0, 0.0, 29.0, 0.0, 31.0, 0.0, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_minmax_round_ps() { + let a = _mm512_setr_ps( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm512_setr_ps( + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + + let r = _mm512_minmax_round_ps::<0, _MM_FROUND_NO_EXC>(a, b); + let e = _mm512_setr_ps( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + assert_eq_m512(r, e); + + let r = _mm512_minmax_round_ps::<1, _MM_FROUND_NO_EXC>(a, b); + let e = _mm512_setr_ps( + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_minmax_round_ps() { + let a = _mm512_setr_ps( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm512_setr_ps( + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 
25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let src = _mm512_setr_ps( + 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0, 110.0, 120.0, 130.0, 140.0, + 150.0, 160.0, 170.0, + ); + + let r = _mm512_mask_minmax_round_ps::<0, _MM_FROUND_NO_EXC>(src, 0b0101010101010101, a, b); + let e = _mm512_setr_ps( + 1.0, 30.0, 3.0, 50.0, 5.0, 70.0, 7.0, 90.0, 9.0, 110.0, 11.0, 130.0, 13.0, 150.0, 15.0, + 170.0, + ); + assert_eq_m512(r, e); + + let r = _mm512_mask_minmax_round_ps::<1, _MM_FROUND_NO_EXC>(src, 0b0101010101010101, a, b); + let e = _mm512_setr_ps( + 17.0, 30.0, 19.0, 50.0, 21.0, 70.0, 23.0, 90.0, 25.0, 110.0, 27.0, 130.0, 29.0, 150.0, + 31.0, 170.0, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_minmax_round_ps() { + let a = _mm512_setr_ps( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm512_setr_ps( + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + + let r = _mm512_maskz_minmax_round_ps::<0, _MM_FROUND_NO_EXC>(0b0101010101010101, a, b); + let e = _mm512_setr_ps( + 1.0, 0.0, 3.0, 0.0, 5.0, 0.0, 7.0, 0.0, 9.0, 0.0, 11.0, 0.0, 13.0, 0.0, 15.0, 0.0, + ); + assert_eq_m512(r, e); + + let r = _mm512_maskz_minmax_round_ps::<1, _MM_FROUND_NO_EXC>(0b0101010101010101, a, b); + let e = _mm512_setr_ps( + 17.0, 0.0, 19.0, 0.0, 21.0, 0.0, 23.0, 0.0, 25.0, 0.0, 27.0, 0.0, 29.0, 0.0, 31.0, 0.0, + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_minmax_ph() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_setr_ph(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + + let r = _mm_minmax_ph::<0>(a, b); + let e = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + + let r = _mm_minmax_ph::<1>(a, b); + let e = _mm_setr_ph(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_mask_minmax_ph() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_setr_ph(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let src = _mm_setr_ph(20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0); + + let r = _mm_mask_minmax_ph::<0>(src, 0b01010101, a, b); + let e = _mm_setr_ph(1.0, 30.0, 3.0, 50.0, 5.0, 70.0, 7.0, 90.0); + assert_eq_m128h(r, e); + + let r = _mm_mask_minmax_ph::<1>(src, 0b01010101, a, b); + let e = _mm_setr_ph(9.0, 30.0, 11.0, 50.0, 13.0, 70.0, 15.0, 90.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_maskz_minmax_ph() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_setr_ph(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + + let r = _mm_maskz_minmax_ph::<0>(0b01010101, a, b); + let e = _mm_setr_ph(1.0, 0.0, 3.0, 0.0, 5.0, 0.0, 7.0, 0.0); + assert_eq_m128h(r, e); + + let r = _mm_maskz_minmax_ph::<1>(0b01010101, a, b); + let e = _mm_setr_ph(9.0, 0.0, 11.0, 0.0, 13.0, 0.0, 15.0, 0.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_minmax_ph() { + let a = _mm256_setr_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm256_setr_ph( + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + + let r = _mm256_minmax_ph::<0>(a, b); + let e = _mm256_setr_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + assert_eq_m256h(r, 
e); + + let r = _mm256_minmax_ph::<1>(a, b); + let e = _mm256_setr_ph( + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_mask_minmax_ph() { + let a = _mm256_setr_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm256_setr_ph( + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let src = _mm256_setr_ph( + 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0, 110.0, 120.0, 130.0, 140.0, + 150.0, 160.0, 170.0, + ); + + let r = _mm256_mask_minmax_ph::<0>(src, 0b0101010101010101, a, b); + let e = _mm256_setr_ph( + 1.0, 30.0, 3.0, 50.0, 5.0, 70.0, 7.0, 90.0, 9.0, 110.0, 11.0, 130.0, 13.0, 150.0, 15.0, + 170.0, + ); + assert_eq_m256h(r, e); + + let r = _mm256_mask_minmax_ph::<1>(src, 0b0101010101010101, a, b); + let e = _mm256_setr_ph( + 17.0, 30.0, 19.0, 50.0, 21.0, 70.0, 23.0, 90.0, 25.0, 110.0, 27.0, 130.0, 29.0, 150.0, + 31.0, 170.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_maskz_minmax_ph() { + let a = _mm256_setr_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm256_setr_ph( + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + + let r = _mm256_maskz_minmax_ph::<0>(0b0101010101010101, a, b); + let e = _mm256_setr_ph( + 1.0, 0.0, 3.0, 0.0, 5.0, 0.0, 7.0, 0.0, 9.0, 0.0, 11.0, 0.0, 13.0, 0.0, 15.0, 0.0, + ); + assert_eq_m256h(r, e); + + let r = _mm256_maskz_minmax_ph::<1>(0b0101010101010101, a, b); + let e = _mm256_setr_ph( + 17.0, 0.0, 19.0, 0.0, 21.0, 0.0, 23.0, 0.0, 25.0, 0.0, 27.0, 0.0, 29.0, 0.0, 31.0, 0.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_minmax_ph() { + let a = _mm512_setr_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_setr_ph( + 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, + 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, + 61.0, 62.0, 63.0, 64.0, + ); + + let r = _mm512_minmax_ph::<0>(a, b); + let e = _mm512_setr_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + assert_eq_m512h(r, e); + + let r = _mm512_minmax_ph::<1>(a, b); + let e = _mm512_setr_ph( + 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, + 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, + 61.0, 62.0, 63.0, 64.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_minmax_ph() { + let a = _mm512_setr_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_setr_ph( + 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, + 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, + 61.0, 62.0, 63.0, 64.0, + ); + let src = _mm512_setr_ph( + 65.0, 66.0, 67.0, 68.0, 
69.0, 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, + 79.0, 80.0, 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, 90.0, 91.0, 92.0, + 93.0, 94.0, 95.0, 96.0, + ); + + let r = _mm512_mask_minmax_ph::<0>(src, 0b01010101010101010101010101010101, a, b); + let e = _mm512_setr_ph( + 1.0, 66.0, 3.0, 68.0, 5.0, 70.0, 7.0, 72.0, 9.0, 74.0, 11.0, 76.0, 13.0, 78.0, 15.0, + 80.0, 17.0, 82.0, 19.0, 84.0, 21.0, 86.0, 23.0, 88.0, 25.0, 90.0, 27.0, 92.0, 29.0, + 94.0, 31.0, 96.0, + ); + assert_eq_m512h(r, e); + + let r = _mm512_mask_minmax_ph::<1>(src, 0b01010101010101010101010101010101, a, b); + let e = _mm512_setr_ph( + 33.0, 66.0, 35.0, 68.0, 37.0, 70.0, 39.0, 72.0, 41.0, 74.0, 43.0, 76.0, 45.0, 78.0, + 47.0, 80.0, 49.0, 82.0, 51.0, 84.0, 53.0, 86.0, 55.0, 88.0, 57.0, 90.0, 59.0, 92.0, + 61.0, 94.0, 63.0, 96.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_minmax_ph() { + let a = _mm512_setr_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_setr_ph( + 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, + 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, + 61.0, 62.0, 63.0, 64.0, + ); + + let r = _mm512_maskz_minmax_ph::<0>(0b01010101010101010101010101010101, a, b); + let e = _mm512_setr_ph( + 1.0, 0.0, 3.0, 0.0, 5.0, 0.0, 7.0, 0.0, 9.0, 0.0, 11.0, 0.0, 13.0, 0.0, 15.0, 0.0, + 17.0, 0.0, 19.0, 0.0, 21.0, 0.0, 23.0, 0.0, 25.0, 0.0, 27.0, 0.0, 29.0, 0.0, 31.0, 0.0, + ); + assert_eq_m512h(r, e); + + let r = _mm512_maskz_minmax_ph::<1>(0b01010101010101010101010101010101, a, b); + let e = _mm512_setr_ph( + 33.0, 0.0, 35.0, 0.0, 37.0, 0.0, 39.0, 0.0, 41.0, 0.0, 43.0, 0.0, 45.0, 0.0, 47.0, 0.0, + 49.0, 0.0, 51.0, 0.0, 53.0, 0.0, 55.0, 0.0, 57.0, 0.0, 59.0, 0.0, 61.0, 0.0, 63.0, 0.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_minmax_round_ph() { + let a = _mm512_setr_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_setr_ph( + 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, + 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, + 61.0, 62.0, 63.0, 64.0, + ); + + let r = _mm512_minmax_round_ph::<0, _MM_FROUND_NO_EXC>(a, b); + let e = _mm512_setr_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + assert_eq_m512h(r, e); + + let r = _mm512_minmax_round_ph::<1, _MM_FROUND_NO_EXC>(a, b); + let e = _mm512_setr_ph( + 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, + 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, + 61.0, 62.0, 63.0, 64.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_minmax_round_ph() { + let a = _mm512_setr_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_setr_ph( + 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 
42.0, 43.0, 44.0, 45.0, 46.0, + 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, + 61.0, 62.0, 63.0, 64.0, + ); + let src = _mm512_setr_ph( + 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, + 79.0, 80.0, 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, 90.0, 91.0, 92.0, + 93.0, 94.0, 95.0, 96.0, + ); + + let r = _mm512_mask_minmax_round_ph::<0, _MM_FROUND_NO_EXC>( + src, + 0b01010101010101010101010101010101, + a, + b, + ); + let e = _mm512_setr_ph( + 1.0, 66.0, 3.0, 68.0, 5.0, 70.0, 7.0, 72.0, 9.0, 74.0, 11.0, 76.0, 13.0, 78.0, 15.0, + 80.0, 17.0, 82.0, 19.0, 84.0, 21.0, 86.0, 23.0, 88.0, 25.0, 90.0, 27.0, 92.0, 29.0, + 94.0, 31.0, 96.0, + ); + assert_eq_m512h(r, e); + + let r = _mm512_mask_minmax_round_ph::<1, _MM_FROUND_NO_EXC>( + src, + 0b01010101010101010101010101010101, + a, + b, + ); + let e = _mm512_setr_ph( + 33.0, 66.0, 35.0, 68.0, 37.0, 70.0, 39.0, 72.0, 41.0, 74.0, 43.0, 76.0, 45.0, 78.0, + 47.0, 80.0, 49.0, 82.0, 51.0, 84.0, 53.0, 86.0, 55.0, 88.0, 57.0, 90.0, 59.0, 92.0, + 61.0, 94.0, 63.0, 96.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_minmax_round_ph() { + let a = _mm512_setr_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let b = _mm512_setr_ph( + 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, + 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, + 61.0, 62.0, 63.0, 64.0, + ); + + let r = _mm512_maskz_minmax_round_ph::<0, _MM_FROUND_NO_EXC>( + 0b01010101010101010101010101010101, + a, + b, + ); + let e = _mm512_setr_ph( + 1.0, 0.0, 3.0, 0.0, 5.0, 0.0, 7.0, 0.0, 9.0, 0.0, 11.0, 0.0, 13.0, 0.0, 15.0, 0.0, + 17.0, 0.0, 19.0, 0.0, 21.0, 0.0, 23.0, 0.0, 25.0, 0.0, 27.0, 0.0, 29.0, 0.0, 31.0, 0.0, + ); + assert_eq_m512h(r, e); + + let r = _mm512_maskz_minmax_round_ph::<1, _MM_FROUND_NO_EXC>( + 0b01010101010101010101010101010101, + a, + b, + ); + let e = _mm512_setr_ph( + 33.0, 0.0, 35.0, 0.0, 37.0, 0.0, 39.0, 0.0, 41.0, 0.0, 43.0, 0.0, 45.0, 0.0, 47.0, 0.0, + 49.0, 0.0, 51.0, 0.0, 53.0, 0.0, 55.0, 0.0, 57.0, 0.0, 59.0, 0.0, 61.0, 0.0, 63.0, 0.0, + ); + assert_eq_m512h(r, e); + } + + // FIXME: the masked scalar minmax tests below (the ones marked #[ignore]) do not pass due to an LLVM miscompilation bug.
See llvm/llvm-project#184245 + + #[simd_test(enable = "avx10.2")] + fn test_mm_minmax_sd() { + let a = _mm_setr_pd(1.0, 2.0); + let b = _mm_setr_pd(3.0, 4.0); + + let r = _mm_minmax_sd::<0>(a, b); + let e = _mm_setr_pd(1.0, 2.0); + assert_eq_m128d(r, e); + + let r = _mm_minmax_sd::<1>(a, b); + let e = _mm_setr_pd(3.0, 2.0); + assert_eq_m128d(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm_mask_minmax_sd() { + let a = _mm_setr_pd(1.0, 2.0); + let b = _mm_setr_pd(3.0, 4.0); + let src = _mm_setr_pd(20.0, 30.0); + + let r = _mm_mask_minmax_sd::<0>(src, 1, a, b); + let e = _mm_setr_pd(1.0, 2.0); + assert_eq_m128d(r, e); + + let r = _mm_mask_minmax_sd::<0>(src, 0, a, b); + let e = _mm_setr_pd(20.0, 2.0); + assert_eq_m128d(r, e); + + let r = _mm_mask_minmax_sd::<1>(src, 1, a, b); + let e = _mm_setr_pd(3.0, 2.0); + assert_eq_m128d(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm_maskz_minmax_sd() { + let a = _mm_setr_pd(1.0, 2.0); + let b = _mm_setr_pd(3.0, 4.0); + + let r = _mm_maskz_minmax_sd::<0>(1, a, b); + let e = _mm_setr_pd(1.0, 2.0); + assert_eq_m128d(r, e); + + let r = _mm_maskz_minmax_sd::<0>(0, a, b); + let e = _mm_setr_pd(0.0, 2.0); + assert_eq_m128d(r, e); + + let r = _mm_maskz_minmax_sd::<1>(1, a, b); + let e = _mm_setr_pd(3.0, 2.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_minmax_round_sd() { + let a = _mm_setr_pd(1.0, 2.0); + let b = _mm_setr_pd(3.0, 4.0); + + let r = _mm_minmax_round_sd::<0, _MM_FROUND_NO_EXC>(a, b); + let e = _mm_setr_pd(1.0, 2.0); + assert_eq_m128d(r, e); + + let r = _mm_minmax_round_sd::<1, _MM_FROUND_NO_EXC>(a, b); + let e = _mm_setr_pd(3.0, 2.0); + assert_eq_m128d(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm_mask_minmax_round_sd() { + let a = _mm_setr_pd(1.0, 2.0); + let b = _mm_setr_pd(3.0, 4.0); + let src = _mm_setr_pd(20.0, 30.0); + + let r = _mm_mask_minmax_round_sd::<0, _MM_FROUND_NO_EXC>(src, 1, a, b); + let e = _mm_setr_pd(1.0, 2.0); + assert_eq_m128d(r, e); + + let r = _mm_mask_minmax_round_sd::<0, _MM_FROUND_NO_EXC>(src, 0, a, b); + let e = _mm_setr_pd(20.0, 2.0); + assert_eq_m128d(r, e); + + let r = _mm_mask_minmax_round_sd::<1, _MM_FROUND_NO_EXC>(src, 1, a, b); + let e = _mm_setr_pd(3.0, 2.0); + assert_eq_m128d(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm_maskz_minmax_round_sd() { + let a = _mm_setr_pd(1.0, 2.0); + let b = _mm_setr_pd(3.0, 4.0); + + let r = _mm_maskz_minmax_round_sd::<0, _MM_FROUND_NO_EXC>(1, a, b); + let e = _mm_setr_pd(1.0, 2.0); + assert_eq_m128d(r, e); + + let r = _mm_maskz_minmax_round_sd::<0, _MM_FROUND_NO_EXC>(0, a, b); + let e = _mm_setr_pd(0.0, 2.0); + assert_eq_m128d(r, e); + + let r = _mm_maskz_minmax_round_sd::<1, _MM_FROUND_NO_EXC>(1, a, b); + let e = _mm_setr_pd(3.0, 2.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_minmax_ss() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); + + let r = _mm_minmax_ss::<0>(a, b); + let e = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + assert_eq_m128(r, e); + + let r = _mm_minmax_ss::<1>(a, b); + let e = _mm_setr_ps(5.0, 2.0, 3.0, 4.0); + assert_eq_m128(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm_mask_minmax_ss() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); + let src = _mm_setr_ps(20.0, 30.0, 40.0, 50.0); + + let r = _mm_mask_minmax_ss::<0>(src, 1, a, b); + let e = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + 
assert_eq_m128(r, e); + + let r = _mm_mask_minmax_ss::<0>(src, 0, a, b); + let e = _mm_setr_ps(20.0, 2.0, 3.0, 4.0); + assert_eq_m128(r, e); + + let r = _mm_mask_minmax_ss::<1>(src, 1, a, b); + let e = _mm_setr_ps(5.0, 2.0, 3.0, 4.0); + assert_eq_m128(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm_maskz_minmax_ss() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); + + let r = _mm_maskz_minmax_ss::<0>(1, a, b); + let e = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + assert_eq_m128(r, e); + + let r = _mm_maskz_minmax_ss::<0>(0, a, b); + let e = _mm_setr_ps(0.0, 2.0, 3.0, 4.0); + assert_eq_m128(r, e); + + let r = _mm_maskz_minmax_ss::<1>(1, a, b); + let e = _mm_setr_ps(5.0, 2.0, 3.0, 4.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_minmax_round_ss() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); + + let r = _mm_minmax_round_ss::<0, _MM_FROUND_NO_EXC>(a, b); + let e = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + assert_eq_m128(r, e); + + let r = _mm_minmax_round_ss::<1, _MM_FROUND_NO_EXC>(a, b); + let e = _mm_setr_ps(5.0, 2.0, 3.0, 4.0); + assert_eq_m128(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm_mask_minmax_round_ss() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); + let src = _mm_setr_ps(20.0, 30.0, 40.0, 50.0); + + let r = _mm_mask_minmax_round_ss::<0, _MM_FROUND_NO_EXC>(src, 1, a, b); + let e = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + assert_eq_m128(r, e); + + let r = _mm_mask_minmax_round_ss::<0, _MM_FROUND_NO_EXC>(src, 0, a, b); + let e = _mm_setr_ps(20.0, 2.0, 3.0, 4.0); + assert_eq_m128(r, e); + + let r = _mm_mask_minmax_round_ss::<1, _MM_FROUND_NO_EXC>(src, 1, a, b); + let e = _mm_setr_ps(5.0, 2.0, 3.0, 4.0); + assert_eq_m128(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm_maskz_minmax_round_ss() { + let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); + + let r = _mm_maskz_minmax_round_ss::<0, _MM_FROUND_NO_EXC>(1, a, b); + let e = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); + assert_eq_m128(r, e); + + let r = _mm_maskz_minmax_round_ss::<0, _MM_FROUND_NO_EXC>(0, a, b); + let e = _mm_setr_ps(0.0, 2.0, 3.0, 4.0); + assert_eq_m128(r, e); + + let r = _mm_maskz_minmax_round_ss::<1, _MM_FROUND_NO_EXC>(1, a, b); + let e = _mm_setr_ps(5.0, 2.0, 3.0, 4.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_minmax_sh() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_setr_ph(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + + let r = _mm_minmax_sh::<0>(a, b); + let e = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + + let r = _mm_minmax_sh::<1>(a, b); + let e = _mm_setr_ph(9.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm_mask_minmax_sh() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_setr_ph(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let src = _mm_setr_ph(20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0); + + let r = _mm_mask_minmax_sh::<0>(src, 1, a, b); + let e = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + + let r = _mm_mask_minmax_sh::<0>(src, 0, a, b); + let e = _mm_setr_ph(20.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + + let r = _mm_mask_minmax_sh::<1>(src, 1, a, b); + let e = _mm_setr_ph(9.0, 2.0, 3.0, 
4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm_maskz_minmax_sh() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_setr_ph(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + + let r = _mm_maskz_minmax_sh::<0>(1, a, b); + let e = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + + let r = _mm_maskz_minmax_sh::<0>(0, a, b); + let e = _mm_setr_ph(0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + + let r = _mm_maskz_minmax_sh::<1>(1, a, b); + let e = _mm_setr_ph(9.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_minmax_round_sh() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_setr_ph(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + + let r = _mm_minmax_round_sh::<0, _MM_FROUND_NO_EXC>(a, b); + let e = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + + let r = _mm_minmax_round_sh::<1, _MM_FROUND_NO_EXC>(a, b); + let e = _mm_setr_ph(9.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm_mask_minmax_round_sh() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_setr_ph(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let src = _mm_setr_ph(20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0); + + let r = _mm_mask_minmax_round_sh::<0, _MM_FROUND_NO_EXC>(src, 1, a, b); + let e = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + + let r = _mm_mask_minmax_round_sh::<0, _MM_FROUND_NO_EXC>(src, 0, a, b); + let e = _mm_setr_ph(20.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + + let r = _mm_mask_minmax_round_sh::<1, _MM_FROUND_NO_EXC>(src, 1, a, b); + let e = _mm_setr_ph(9.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm_maskz_minmax_round_sh() { + let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm_setr_ph(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + + let r = _mm_maskz_minmax_round_sh::<0, _MM_FROUND_NO_EXC>(1, a, b); + let e = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + + let r = _mm_maskz_minmax_round_sh::<0, _MM_FROUND_NO_EXC>(0, a, b); + let e = _mm_setr_ph(0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + + let r = _mm_maskz_minmax_round_sh::<1, _MM_FROUND_NO_EXC>(1, a, b); + let e = _mm_setr_ph(9.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + } + + // Small helpers to create expected vectors without having to cast every element in the test cases. 
+ + #[inline] + #[target_feature(enable = "sse2")] + fn _mm_setb_epi16(e7: i8, e6: i8, e5: i8, e4: i8, e3: i8, e2: i8, e1: i8, e0: i8) -> __m128i { + _mm_set_epi16( + e7 as u8 as i16, + e6 as u8 as i16, + e5 as u8 as i16, + e4 as u8 as i16, + e3 as u8 as i16, + e2 as u8 as i16, + e1 as u8 as i16, + e0 as u8 as i16, + ) + } + + #[inline] + #[target_feature(enable = "avx")] + fn _mm256_setb_epi16( + e00: i8, + e01: i8, + e02: i8, + e03: i8, + e04: i8, + e05: i8, + e06: i8, + e07: i8, + e08: i8, + e09: i8, + e10: i8, + e11: i8, + e12: i8, + e13: i8, + e14: i8, + e15: i8, + ) -> __m256i { + _mm256_set_epi16( + e00 as u8 as i16, + e01 as u8 as i16, + e02 as u8 as i16, + e03 as u8 as i16, + e04 as u8 as i16, + e05 as u8 as i16, + e06 as u8 as i16, + e07 as u8 as i16, + e08 as u8 as i16, + e09 as u8 as i16, + e10 as u8 as i16, + e11 as u8 as i16, + e12 as u8 as i16, + e13 as u8 as i16, + e14 as u8 as i16, + e15 as u8 as i16, + ) + } + + #[inline] + #[target_feature(enable = "avx512f")] + fn _mm512_setb_epi16( + e31: i8, + e30: i8, + e29: i8, + e28: i8, + e27: i8, + e26: i8, + e25: i8, + e24: i8, + e23: i8, + e22: i8, + e21: i8, + e20: i8, + e19: i8, + e18: i8, + e17: i8, + e16: i8, + e15: i8, + e14: i8, + e13: i8, + e12: i8, + e11: i8, + e10: i8, + e09: i8, + e08: i8, + e07: i8, + e06: i8, + e05: i8, + e04: i8, + e03: i8, + e02: i8, + e01: i8, + e00: i8, + ) -> __m512i { + _mm512_set_epi16( + e31 as u8 as i16, + e30 as u8 as i16, + e29 as u8 as i16, + e28 as u8 as i16, + e27 as u8 as i16, + e26 as u8 as i16, + e25 as u8 as i16, + e24 as u8 as i16, + e23 as u8 as i16, + e22 as u8 as i16, + e21 as u8 as i16, + e20 as u8 as i16, + e19 as u8 as i16, + e18 as u8 as i16, + e17 as u8 as i16, + e16 as u8 as i16, + e15 as u8 as i16, + e14 as u8 as i16, + e13 as u8 as i16, + e12 as u8 as i16, + e11 as u8 as i16, + e10 as u8 as i16, + e09 as u8 as i16, + e08 as u8 as i16, + e07 as u8 as i16, + e06 as u8 as i16, + e05 as u8 as i16, + e04 as u8 as i16, + e03 as u8 as i16, + e02 as u8 as i16, + e01 as u8 as i16, + e00 as u8 as i16, + ) + } + + #[inline] + #[target_feature(enable = "sse2")] + fn _mm_setb_epi32(e3: i8, e2: i8, e1: i8, e0: i8) -> __m128i { + _mm_set_epi32( + e3 as u8 as i32, + e2 as u8 as i32, + e1 as u8 as i32, + e0 as u8 as i32, + ) + } + + #[inline] + #[target_feature(enable = "avx")] + fn _mm256_setb_epi32( + e00: i8, + e01: i8, + e02: i8, + e03: i8, + e04: i8, + e05: i8, + e06: i8, + e07: i8, + ) -> __m256i { + _mm256_set_epi32( + e00 as u8 as i32, + e01 as u8 as i32, + e02 as u8 as i32, + e03 as u8 as i32, + e04 as u8 as i32, + e05 as u8 as i32, + e06 as u8 as i32, + e07 as u8 as i32, + ) + } + + #[inline] + #[target_feature(enable = "avx512f")] + fn _mm512_setb_epi32( + e15: i8, + e14: i8, + e13: i8, + e12: i8, + e11: i8, + e10: i8, + e09: i8, + e08: i8, + e07: i8, + e06: i8, + e05: i8, + e04: i8, + e03: i8, + e02: i8, + e01: i8, + e00: i8, + ) -> __m512i { + _mm512_set_epi32( + e15 as u8 as i32, + e14 as u8 as i32, + e13 as u8 as i32, + e12 as u8 as i32, + e11 as u8 as i32, + e10 as u8 as i32, + e09 as u8 as i32, + e08 as u8 as i32, + e07 as u8 as i32, + e06 as u8 as i32, + e05 as u8 as i32, + e04 as u8 as i32, + e03 as u8 as i32, + e02 as u8 as i32, + e01 as u8 as i32, + e00 as u8 as i32, + ) + } + + #[inline] + #[target_feature(enable = "avx512fp16")] + fn _mm256_set1_m128h(a: __m128h) -> __m256h { + unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7]) } + } + + #[inline] + #[target_feature(enable = "avx512fp16")] + fn _mm512_set1_m128h(a: __m128h) -> __m512h { 
+ unsafe {
+ simd_shuffle!(
+ a,
+ a,
+ [
+ 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1,
+ 2, 3, 4, 5, 6, 7
+ ]
+ )
+ }
+ }
+
+ #[inline]
+ #[target_feature(enable = "avx512f")]
+ fn _mm512_set1_m256(a: __m256) -> __m512 {
+ unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7]) }
+ }
+
+ #[target_feature(enable = "avx512fp16,avx512bw,avx512vl")]
+ fn _mm_mask_mov_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
+ _mm_castsi128_ph(_mm_mask_mov_epi16(
+ _mm_castph_si128(src),
+ k,
+ _mm_castph_si128(a),
+ ))
+ }
+
+ #[target_feature(enable = "avx512fp16,avx512bw,avx512vl")]
+ fn _mm_maskz_mov_ph(k: __mmask8, a: __m128h) -> __m128h {
+ _mm_castsi128_ph(_mm_maskz_mov_epi16(k, _mm_castph_si128(a)))
+ }
+
+ #[target_feature(enable = "avx512fp16,avx512bw,avx512vl")]
+ fn _mm256_mask_mov_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
+ _mm256_castsi256_ph(_mm256_mask_mov_epi16(
+ _mm256_castph_si256(src),
+ k,
+ _mm256_castph_si256(a),
+ ))
+ }
+
+ #[target_feature(enable = "avx512fp16,avx512bw,avx512vl")]
+ fn _mm256_maskz_mov_ph(k: __mmask16, a: __m256h) -> __m256h {
+ _mm256_castsi256_ph(_mm256_maskz_mov_epi16(k, _mm256_castph_si256(a)))
+ }
+
+ #[target_feature(enable = "avx512fp16,avx512bw")]
+ fn _mm512_mask_mov_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
+ _mm512_castsi512_ph(_mm512_mask_mov_epi16(
+ _mm512_castph_si512(src),
+ k,
+ _mm512_castph_si512(a),
+ ))
+ }
+
+ #[target_feature(enable = "avx512fp16,avx512bw")]
+ fn _mm512_maskz_mov_ph(k: __mmask32, a: __m512h) -> __m512h {
+ _mm512_castsi512_ph(_mm512_maskz_mov_epi16(k, _mm512_castph_si512(a)))
+ }
+
+ // FIXME: the following tests do not pass due to an LLVM miscompilation bug. See llvm/llvm-project#184251
+
+ // Input covering rounding (-2.5 -> -2), signed-byte saturation (128.0 and 255.0 clamp to
+ // 127, -128.0 stays -128) and NaN handling (converted to 0); see the expected vectors in
+ // the tests below.
+ const CVTPH2IBS_INPUT: __m128h =
+ unsafe { _mm_set_ph(1.0, -2.5, 127.0, 128.0, -128.0, 255.0, 0.0, f16::NAN) };
+
+ #[ignore]
+ #[simd_test(enable = "avx10.2")]
+ fn test_mm_ipcvts_ph_epi8() {
+ let a = CVTPH2IBS_INPUT;
+ let r = _mm_ipcvts_ph_epi8(a);
+ let e = _mm_setb_epi16(1, -2, 127, 127, -128, 127, 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[ignore]
+ #[simd_test(enable = "avx10.2")]
+ fn test_mm_mask_ipcvts_ph_epi8() {
+ let a = CVTPH2IBS_INPUT;
+ let src = _mm_set_epi16(10, 20, 30, 40, 50, 60, 70, 80);
+ let r = _mm_mask_ipcvts_ph_epi8(src, 0b01010101, a);
+ let e = _mm_setb_epi16(10, -2, 30, 127, 50, 127, 70, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[ignore]
+ #[simd_test(enable = "avx10.2")]
+ fn test_mm_maskz_ipcvts_ph_epi8() {
+ let a = CVTPH2IBS_INPUT;
+ let r = _mm_maskz_ipcvts_ph_epi8(0b01010101, a);
+ let e = _mm_setb_epi16(0, -2, 0, 127, 0, 127, 0, 0);
+ assert_eq_m128i(r, e);
+ }
+
+ #[ignore]
+ #[simd_test(enable = "avx10.2")]
+ fn test_mm256_ipcvts_ph_epi8() {
+ let a = _mm256_set1_m128h(CVTPH2IBS_INPUT);
+ let r = _mm256_ipcvts_ph_epi8(a);
+ let e = _mm256_setb_epi16(
+ 1, -2, 127, 127, -128, 127, 0, 0, 1, -2, 127, 127, -128, 127, 0, 0,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[ignore]
+ #[simd_test(enable = "avx10.2")]
+ fn test_mm256_mask_ipcvts_ph_epi8() {
+ let a = _mm256_set1_m128h(CVTPH2IBS_INPUT);
+ let src = _mm256_set_epi16(
+ 10, 20, 30, 40, 50, 60, 70, 80, 15, 25, 35, 45, 55, 65, 75, 85,
+ );
+ let r = _mm256_mask_ipcvts_ph_epi8(src, 0b0101010101010101, a);
+ let e = _mm256_setb_epi16(
+ 10, -2, 30, 127, 50, 127, 70, 0, 15, -2, 35, 127, 55, 127, 75, 0,
+ );
+ assert_eq_m256i(r, e);
+ }
+
+ #[ignore]
+ #[simd_test(enable = "avx10.2")]
+ fn test_mm256_maskz_ipcvts_ph_epi8() {
+ let a = _mm256_set1_m128h(CVTPH2IBS_INPUT);
+ let r = _mm256_maskz_ipcvts_ph_epi8(0b0101010101010101, a); + let e = _mm256_setb_epi16(0, -2, 0, 127, 0, 127, 0, 0, 0, -2, 0, 127, 0, 127, 0, 0); + assert_eq_m256i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm512_ipcvts_ph_epi8() { + let a = _mm512_set1_m128h(CVTPH2IBS_INPUT); + let r = _mm512_ipcvts_ph_epi8(a); + let e = _mm512_setb_epi16( + 1, -2, 127, 127, -128, 127, 0, 0, 1, -2, 127, 127, -128, 127, 0, 0, 1, -2, 127, 127, + -128, 127, 0, 0, 1, -2, 127, 127, -128, 127, 0, 0, + ); + assert_eq_m512i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_ipcvts_ph_epi8() { + let a = _mm512_set1_m128h(CVTPH2IBS_INPUT); + let src = _mm512_set_epi16( + 10, 20, 30, 40, 50, 60, 70, 80, 15, 25, 35, 45, 55, 65, 75, 85, 80, 70, 60, 50, 40, 30, + 20, 10, 85, 75, 65, 55, 45, 35, 25, 15, + ); + let r = _mm512_mask_ipcvts_ph_epi8(src, 0b01010101010101010101010101010101, a); + let e = _mm512_setb_epi16( + 10, -2, 30, 127, 50, 127, 70, 0, 15, -2, 35, 127, 55, 127, 75, 0, 80, -2, 60, 127, 40, + 127, 20, 0, 85, -2, 65, 127, 45, 127, 25, 0, + ); + assert_eq_m512i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_ipcvts_ph_epi8() { + let a = _mm512_set1_m128h(CVTPH2IBS_INPUT); + let r = _mm512_maskz_ipcvts_ph_epi8(0b01010101010101010101010101010101, a); + let e = _mm512_setb_epi16( + 0, -2, 0, 127, 0, 127, 0, 0, 0, -2, 0, 127, 0, 127, 0, 0, 0, -2, 0, 127, 0, 127, 0, 0, + 0, -2, 0, 127, 0, 127, 0, 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_ipcvts_roundph_epi8() { + let a = _mm512_set1_m128h(CVTPH2IBS_INPUT); + let r = _mm512_ipcvts_roundph_epi8::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_setb_epi16( + 1, -2, 127, 127, -128, 127, 0, 0, 1, -2, 127, 127, -128, 127, 0, 0, 1, -2, 127, 127, + -128, 127, 0, 0, 1, -2, 127, 127, -128, 127, 0, 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_ipcvts_roundph_epi8() { + let a = _mm512_set1_m128h(CVTPH2IBS_INPUT); + let src = _mm512_set_epi16( + 10, 20, 30, 40, 50, 60, 70, 80, 15, 25, 35, 45, 55, 65, 75, 85, 80, 70, 60, 50, 40, 30, + 20, 10, 85, 75, 65, 55, 45, 35, 25, 15, + ); + let r = _mm512_mask_ipcvts_roundph_epi8::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_setb_epi16( + 10, -2, 30, 127, 50, 127, 70, 0, 15, -2, 35, 127, 55, 127, 75, 0, 80, -2, 60, 127, 40, + 127, 20, 0, 85, -2, 65, 127, 45, 127, 25, 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_ipcvts_roundph_epi8() { + let a = _mm512_set1_m128h(CVTPH2IBS_INPUT); + let r = _mm512_maskz_ipcvts_roundph_epi8::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_setb_epi16( + 0, -2, 0, 127, 0, 127, 0, 0, 0, -2, 0, 127, 0, 127, 0, 0, 0, -2, 0, 127, 0, 127, 0, 0, + 0, -2, 0, 127, 0, 127, 0, 0, + ); + assert_eq_m512i(r, e); + } + + const CVTPH2IUBS_INPUT: __m128h = + unsafe { _mm_set_ph(1.0, -2.5, 255.0, 256.0, -256.0, 511.0, 0.0, f16::NAN) }; + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm_ipcvts_ph_epu8() { + let a = CVTPH2IUBS_INPUT; + let r = _mm_ipcvts_ph_epu8(a); + let e = _mm_setb_epi16(1, 0, -1, -1, 0, -1, 0, 0); + assert_eq_m128i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm_mask_ipcvts_ph_epu8() { + let a = CVTPH2IUBS_INPUT; + let src = _mm_set_epi16(10, 20, 30, 40, 50, 60, 
70, 80); + let r = _mm_mask_ipcvts_ph_epu8(src, 0b01010101, a); + let e = _mm_setb_epi16(10, 0, 30, -1, 50, -1, 70, 0); + assert_eq_m128i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm_maskz_ipcvts_ph_epu8() { + let a = CVTPH2IUBS_INPUT; + let r = _mm_maskz_ipcvts_ph_epu8(0b01010101, a); + let e = _mm_setb_epi16(0, 0, 0, -1, 0, -1, 0, 0); + assert_eq_m128i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm256_ipcvts_ph_epu8() { + let a = _mm256_set1_m128h(CVTPH2IUBS_INPUT); + let r = _mm256_ipcvts_ph_epu8(a); + let e = _mm256_setb_epi16(1, 0, -1, -1, 0, -1, 0, 0, 1, 0, -1, -1, 0, -1, 0, 0); + assert_eq_m256i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm256_mask_ipcvts_ph_epu8() { + let a = _mm256_set1_m128h(CVTPH2IUBS_INPUT); + let src = _mm256_set_epi16( + 10, 20, 30, 40, 50, 60, 70, 80, 15, 25, 35, 45, 55, 65, 75, 85, + ); + let r = _mm256_mask_ipcvts_ph_epu8(src, 0b0101010101010101, a); + let e = _mm256_setb_epi16(10, 0, 30, -1, 50, -1, 70, 0, 15, 0, 35, -1, 55, -1, 75, 0); + assert_eq_m256i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm256_maskz_ipcvts_ph_epu8() { + let a = _mm256_set1_m128h(CVTPH2IUBS_INPUT); + let r = _mm256_maskz_ipcvts_ph_epu8(0b0101010101010101, a); + let e = _mm256_setb_epi16(0, 0, 0, -1, 0, -1, 0, 0, 0, 0, 0, -1, 0, -1, 0, 0); + assert_eq_m256i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm512_ipcvts_ph_epu8() { + let a = _mm512_set1_m128h(CVTPH2IUBS_INPUT); + let r = _mm512_ipcvts_ph_epu8(a); + let e = _mm512_setb_epi16( + 1, 0, -1, -1, 0, -1, 0, 0, 1, 0, -1, -1, 0, -1, 0, 0, 1, 0, -1, -1, 0, -1, 0, 0, 1, 0, + -1, -1, 0, -1, 0, 0, + ); + assert_eq_m512i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_ipcvts_ph_epu8() { + let a = _mm512_set1_m128h(CVTPH2IUBS_INPUT); + let src = _mm512_set_epi16( + 10, 20, 30, 40, 50, 60, 70, 80, 15, 25, 35, 45, 55, 65, 75, 85, 80, 70, 60, 50, 40, 30, + 20, 10, 85, 75, 65, 55, 45, 35, 25, 15, + ); + let r = _mm512_mask_ipcvts_ph_epu8(src, 0b01010101010101010101010101010101, a); + let e = _mm512_setb_epi16( + 10, 0, 30, -1, 50, -1, 70, 0, 15, 0, 35, -1, 55, -1, 75, 0, 80, 0, 60, -1, 40, -1, 20, + 0, 85, 0, 65, -1, 45, -1, 25, 0, + ); + assert_eq_m512i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_ipcvts_ph_epu8() { + let a = _mm512_set1_m128h(CVTPH2IUBS_INPUT); + let r = _mm512_maskz_ipcvts_ph_epu8(0b01010101010101010101010101010101, a); + let e = _mm512_setb_epi16( + 0, 0, 0, -1, 0, -1, 0, 0, 0, 0, 0, -1, 0, -1, 0, 0, 0, 0, 0, -1, 0, -1, 0, 0, 0, 0, 0, + -1, 0, -1, 0, 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_ipcvts_roundph_epu8() { + let a = _mm512_set1_m128h(CVTPH2IUBS_INPUT); + let r = _mm512_ipcvts_roundph_epu8::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_setb_epi16( + 1, 0, -1, -1, 0, -1, 0, 0, 1, 0, -1, -1, 0, -1, 0, 0, 1, 0, -1, -1, 0, -1, 0, 0, 1, 0, + -1, -1, 0, -1, 0, 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_ipcvts_roundph_epu8() { + let a = _mm512_set1_m128h(CVTPH2IUBS_INPUT); + let src = _mm512_set_epi16( + 10, 20, 30, 40, 50, 60, 70, 80, 15, 25, 35, 45, 55, 65, 75, 85, 80, 70, 60, 50, 40, 30, + 20, 10, 85, 75, 65, 55, 45, 35, 25, 15, + ); + let r = _mm512_mask_ipcvts_roundph_epu8::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b01010101010101010101010101010101, + a, + ); + let e = 
_mm512_setb_epi16( + 10, 0, 30, -1, 50, -1, 70, 0, 15, 0, 35, -1, 55, -1, 75, 0, 80, 0, 60, -1, 40, -1, 20, + 0, 85, 0, 65, -1, 45, -1, 25, 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_ipcvts_roundph_epu8() { + let a = _mm512_set1_m128h(CVTPH2IUBS_INPUT); + let r = _mm512_maskz_ipcvts_roundph_epu8::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_setb_epi16( + 0, 0, 0, -1, 0, -1, 0, 0, 0, 0, 0, -1, 0, -1, 0, 0, 0, 0, 0, -1, 0, -1, 0, 0, 0, 0, 0, + -1, 0, -1, 0, 0, + ); + assert_eq_m512i(r, e); + } + + const CVTPS2IBS128_INPUT: __m128 = unsafe { _mm_set_ps(1.0, -2.5, 127.0, 128.0) }; + const CVTPS2IBS_INPUT: __m256 = + unsafe { _mm256_set_ps(1.0, -2.5, 127.0, 128.0, -128.0, 255.0, 0.0, f32::NAN) }; + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm_ipcvts_ps_epi8() { + let a = CVTPS2IBS128_INPUT; + let r = _mm_ipcvts_ps_epi8(a); + let e = _mm_setb_epi32(1, -2, 127, 127); + assert_eq_m128i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm_mask_ipcvts_ps_epi8() { + let a = CVTPS2IBS128_INPUT; + let src = _mm_set_epi32(10, 20, 30, 40); + let r = _mm_mask_ipcvts_ps_epi8(src, 0b0101, a); + let e = _mm_setb_epi32(10, -2, 30, 127); + assert_eq_m128i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm_maskz_ipcvts_ps_epi8() { + let a = CVTPS2IBS128_INPUT; + let r = _mm_maskz_ipcvts_ps_epi8(0b0101, a); + let e = _mm_setb_epi32(0, -2, 0, 127); + assert_eq_m128i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm256_ipcvts_ps_epi8() { + let a = CVTPS2IBS_INPUT; + let r = _mm256_ipcvts_ps_epi8(a); + let e = _mm256_setb_epi32(1, -2, 127, 127, -128, 127, 0, 0); + assert_eq_m256i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm256_mask_ipcvts_ps_epi8() { + let a = CVTPS2IBS_INPUT; + let src = _mm256_set_epi32(10, 20, 30, 40, 50, 60, 70, 80); + let r = _mm256_mask_ipcvts_ps_epi8(src, 0b01010101, a); + let e = _mm256_setb_epi32(10, -2, 30, 127, 50, 127, 70, 0); + assert_eq_m256i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm256_maskz_ipcvts_ps_epi8() { + let a = CVTPS2IBS_INPUT; + let r = _mm256_maskz_ipcvts_ps_epi8(0b01010101, a); + let e = _mm256_setb_epi32(0, -2, 0, 127, 0, 127, 0, 0); + assert_eq_m256i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm512_ipcvts_ps_epi8() { + let a = _mm512_set1_m256(CVTPS2IBS_INPUT); + let r = _mm512_ipcvts_ps_epi8(a); + let e = _mm512_setb_epi32( + 1, -2, 127, 127, -128, 127, 0, 0, 1, -2, 127, 127, -128, 127, 0, 0, + ); + assert_eq_m512i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_ipcvts_ps_epi8() { + let a = _mm512_set1_m256(CVTPS2IBS_INPUT); + let src = _mm512_set_epi32( + 10, 20, 30, 40, 50, 60, 70, 80, 15, 25, 35, 45, 55, 65, 75, 85, + ); + let r = _mm512_mask_ipcvts_ps_epi8(src, 0b0101010101010101, a); + let e = _mm512_setb_epi32( + 10, -2, 30, 127, 50, 127, 70, 0, 15, -2, 35, 127, 55, 127, 75, 0, + ); + assert_eq_m512i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_ipcvts_ps_epi8() { + let a = _mm512_set1_m256(CVTPS2IBS_INPUT); + let r = _mm512_maskz_ipcvts_ps_epi8(0b0101010101010101, a); + let e = _mm512_setb_epi32(0, -2, 0, 127, 0, 127, 0, 0, 0, -2, 0, 127, 0, 127, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_ipcvts_roundps_epi8() { + let a = _mm512_set1_m256(CVTPS2IBS_INPUT); + let 
r = _mm512_ipcvts_roundps_epi8::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_setb_epi32( + 1, -2, 127, 127, -128, 127, 0, 0, 1, -2, 127, 127, -128, 127, 0, 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_ipcvts_roundps_epi8() { + let a = _mm512_set1_m256(CVTPS2IBS_INPUT); + let src = _mm512_set_epi32( + 10, 20, 30, 40, 50, 60, 70, 80, 15, 25, 35, 45, 55, 65, 75, 85, + ); + let r = _mm512_mask_ipcvts_roundps_epi8::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b0101010101010101, + a, + ); + let e = _mm512_setb_epi32( + 10, -2, 30, 127, 50, 127, 70, 0, 15, -2, 35, 127, 55, 127, 75, 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_ipcvts_roundps_epi8() { + let a = _mm512_set1_m256(CVTPS2IBS_INPUT); + let r = _mm512_maskz_ipcvts_roundps_epi8::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b0101010101010101, + a, + ); + let e = _mm512_setb_epi32(0, -2, 0, 127, 0, 127, 0, 0, 0, -2, 0, 127, 0, 127, 0, 0); + assert_eq_m512i(r, e); + } + + const CVTPS2IUBS128_INPUT: __m128 = unsafe { _mm_set_ps(1.0, -2.5, 255.0, 256.0) }; + const CVTPS2IUBS_INPUT: __m256 = + unsafe { _mm256_set_ps(1.0, -2.5, 255.0, 256.0, -256.0, 511.0, 0.0, f32::NAN) }; + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm_ipcvts_ps_epu8() { + let a = CVTPS2IUBS128_INPUT; + let r = _mm_ipcvts_ps_epu8(a); + let e = _mm_setb_epi32(1, 0, -1, -1); + assert_eq_m128i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm_mask_ipcvts_ps_epu8() { + let a = CVTPS2IUBS128_INPUT; + let src = _mm_set_epi32(10, 20, 30, 40); + let r = _mm_mask_ipcvts_ps_epu8(src, 0b0101, a); + let e = _mm_setb_epi32(10, 0, 30, -1); + assert_eq_m128i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm_maskz_ipcvts_ps_epu8() { + let a = CVTPS2IUBS128_INPUT; + let r = _mm_maskz_ipcvts_ps_epu8(0b0101, a); + let e = _mm_setb_epi32(0, 0, 0, -1); + assert_eq_m128i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm256_ipcvts_ps_epu8() { + let a = CVTPS2IUBS_INPUT; + let r = _mm256_ipcvts_ps_epu8(a); + let e = _mm256_setb_epi32(1, 0, -1, -1, 0, -1, 0, 0); + assert_eq_m256i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm256_mask_ipcvts_ps_epu8() { + let a = CVTPS2IUBS_INPUT; + let src = _mm256_set_epi32(10, 20, 30, 40, 50, 60, 70, 80); + let r = _mm256_mask_ipcvts_ps_epu8(src, 0b01010101, a); + let e = _mm256_setb_epi32(10, 0, 30, -1, 50, -1, 70, 0); + assert_eq_m256i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm256_maskz_ipcvts_ps_epu8() { + let a = CVTPS2IUBS_INPUT; + let r = _mm256_maskz_ipcvts_ps_epu8(0b01010101, a); + let e = _mm256_setb_epi32(0, 0, 0, -1, 0, -1, 0, 0); + assert_eq_m256i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm512_ipcvts_ps_epu8() { + let a = _mm512_set1_m256(CVTPS2IUBS_INPUT); + let r = _mm512_ipcvts_ps_epu8(a); + let e = _mm512_setb_epi32(1, 0, -1, -1, 0, -1, 0, 0, 1, 0, -1, -1, 0, -1, 0, 0); + assert_eq_m512i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_ipcvts_ps_epu8() { + let a = _mm512_set1_m256(CVTPS2IUBS_INPUT); + let src = _mm512_set_epi32( + 10, 20, 30, 40, 50, 60, 70, 80, 15, 25, 35, 45, 55, 65, 75, 85, + ); + let r = _mm512_mask_ipcvts_ps_epu8(src, 0b0101010101010101, a); + let e = _mm512_setb_epi32(10, 0, 30, -1, 50, -1, 70, 0, 15, 0, 35, -1, 55, -1, 75, 0); + assert_eq_m512i(r, e); + } + + #[ignore] + 
#[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_ipcvts_ps_epu8() { + let a = _mm512_set1_m256(CVTPS2IUBS_INPUT); + let r = _mm512_maskz_ipcvts_ps_epu8(0b0101010101010101, a); + let e = _mm512_setb_epi32(0, 0, 0, -1, 0, -1, 0, 0, 0, 0, 0, -1, 0, -1, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_ipcvts_roundps_epu8() { + let a = _mm512_set1_m256(CVTPS2IUBS_INPUT); + let r = _mm512_ipcvts_roundps_epu8::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); + let e = _mm512_setb_epi32(1, 0, -1, -1, 0, -1, 0, 0, 1, 0, -1, -1, 0, -1, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_ipcvts_roundps_epu8() { + let a = _mm512_set1_m256(CVTPS2IUBS_INPUT); + let src = _mm512_set_epi32( + 10, 20, 30, 40, 50, 60, 70, 80, 15, 25, 35, 45, 55, 65, 75, 85, + ); + let r = _mm512_mask_ipcvts_roundps_epu8::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b0101010101010101, + a, + ); + let e = _mm512_setb_epi32(10, 0, 30, -1, 50, -1, 70, 0, 15, 0, 35, -1, 55, -1, 75, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_ipcvts_roundps_epu8() { + let a = _mm512_set1_m256(CVTPS2IUBS_INPUT); + let r = _mm512_maskz_ipcvts_roundps_epu8::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b0101010101010101, + a, + ); + let e = _mm512_setb_epi32(0, 0, 0, -1, 0, -1, 0, 0, 0, 0, 0, -1, 0, -1, 0, 0); + assert_eq_m512i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm_ipcvtts_ph_epi8() { + let a = CVTPH2IBS_INPUT; + let r = _mm_ipcvtts_ph_epi8(a); + let e = _mm_setb_epi16(1, -2, 127, 127, -128, 127, 0, 0); + assert_eq_m128i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm_mask_ipcvtts_ph_epi8() { + let a = CVTPH2IBS_INPUT; + let src = _mm_set_epi16(10, 20, 30, 40, 50, 60, 70, 80); + let r = _mm_mask_ipcvtts_ph_epi8(src, 0b01010101, a); + let e = _mm_setb_epi16(10, -2, 30, 127, 50, 127, 70, 0); + assert_eq_m128i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm_maskz_ipcvtts_ph_epi8() { + let a = CVTPH2IBS_INPUT; + let r = _mm_maskz_ipcvtts_ph_epi8(0b01010101, a); + let e = _mm_setb_epi16(0, -2, 0, 127, 0, 127, 0, 0); + assert_eq_m128i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm256_ipcvtts_ph_epi8() { + let a = _mm256_set1_m128h(CVTPH2IBS_INPUT); + let r = _mm256_ipcvtts_ph_epi8(a); + let e = _mm256_setb_epi16( + 1, -2, 127, 127, -128, 127, 0, 0, 1, -2, 127, 127, -128, 127, 0, 0, + ); + assert_eq_m256i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm256_mask_ipcvtts_ph_epi8() { + let a = _mm256_set1_m128h(CVTPH2IBS_INPUT); + let src = _mm256_set_epi16( + 10, 20, 30, 40, 50, 60, 70, 80, 15, 25, 35, 45, 55, 65, 75, 85, + ); + let r = _mm256_mask_ipcvtts_ph_epi8(src, 0b0101010101010101, a); + let e = _mm256_setb_epi16( + 10, -2, 30, 127, 50, 127, 70, 0, 15, -2, 35, 127, 55, 127, 75, 0, + ); + assert_eq_m256i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm256_maskz_ipcvtts_ph_epi8() { + let a = _mm256_set1_m128h(CVTPH2IBS_INPUT); + let r = _mm256_maskz_ipcvtts_ph_epi8(0b0101010101010101, a); + let e = _mm256_setb_epi16(0, -2, 0, 127, 0, 127, 0, 0, 0, -2, 0, 127, 0, 127, 0, 0); + assert_eq_m256i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm512_ipcvtts_ph_epi8() { + let a = _mm512_set1_m128h(CVTPH2IBS_INPUT); + let r = _mm512_ipcvtts_ph_epi8(a); + let e = _mm512_setb_epi16( + 1, -2, 127, 127, -128, 127, 0, 
0, 1, -2, 127, 127, -128, 127, 0, 0, 1, -2, 127, 127, + -128, 127, 0, 0, 1, -2, 127, 127, -128, 127, 0, 0, + ); + assert_eq_m512i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_ipcvtts_ph_epi8() { + let a = _mm512_set1_m128h(CVTPH2IBS_INPUT); + let src = _mm512_set_epi16( + 10, 20, 30, 40, 50, 60, 70, 80, 15, 25, 35, 45, 55, 65, 75, 85, 80, 70, 60, 50, 40, 30, + 20, 10, 85, 75, 65, 55, 45, 35, 25, 15, + ); + let r = _mm512_mask_ipcvtts_ph_epi8(src, 0b01010101010101010101010101010101, a); + let e = _mm512_setb_epi16( + 10, -2, 30, 127, 50, 127, 70, 0, 15, -2, 35, 127, 55, 127, 75, 0, 80, -2, 60, 127, 40, + 127, 20, 0, 85, -2, 65, 127, 45, 127, 25, 0, + ); + assert_eq_m512i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_ipcvtts_ph_epi8() { + let a = _mm512_set1_m128h(CVTPH2IBS_INPUT); + let r = _mm512_maskz_ipcvtts_ph_epi8(0b01010101010101010101010101010101, a); + let e = _mm512_setb_epi16( + 0, -2, 0, 127, 0, 127, 0, 0, 0, -2, 0, 127, 0, 127, 0, 0, 0, -2, 0, 127, 0, 127, 0, 0, + 0, -2, 0, 127, 0, 127, 0, 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_ipcvtts_roundph_epi8() { + let a = _mm512_set1_m128h(CVTPH2IBS_INPUT); + let r = _mm512_ipcvtts_roundph_epi8::<_MM_FROUND_NO_EXC>(a); + let e = _mm512_setb_epi16( + 1, -2, 127, 127, -128, 127, 0, 0, 1, -2, 127, 127, -128, 127, 0, 0, 1, -2, 127, 127, + -128, 127, 0, 0, 1, -2, 127, 127, -128, 127, 0, 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_ipcvtts_roundph_epi8() { + let a = _mm512_set1_m128h(CVTPH2IBS_INPUT); + let src = _mm512_set_epi16( + 10, 20, 30, 40, 50, 60, 70, 80, 15, 25, 35, 45, 55, 65, 75, 85, 80, 70, 60, 50, 40, 30, + 20, 10, 85, 75, 65, 55, 45, 35, 25, 15, + ); + let r = _mm512_mask_ipcvtts_roundph_epi8::<_MM_FROUND_NO_EXC>( + src, + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_setb_epi16( + 10, -2, 30, 127, 50, 127, 70, 0, 15, -2, 35, 127, 55, 127, 75, 0, 80, -2, 60, 127, 40, + 127, 20, 0, 85, -2, 65, 127, 45, 127, 25, 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_ipcvtts_roundph_epi8() { + let a = _mm512_set1_m128h(CVTPH2IBS_INPUT); + let r = _mm512_maskz_ipcvtts_roundph_epi8::<_MM_FROUND_NO_EXC>( + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_setb_epi16( + 0, -2, 0, 127, 0, 127, 0, 0, 0, -2, 0, 127, 0, 127, 0, 0, 0, -2, 0, 127, 0, 127, 0, 0, + 0, -2, 0, 127, 0, 127, 0, 0, + ); + assert_eq_m512i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm_ipcvtts_ph_epu8() { + let a = CVTPH2IUBS_INPUT; + let r = _mm_ipcvtts_ph_epu8(a); + let e = _mm_setb_epi16(1, 0, -1, -1, 0, -1, 0, 0); + assert_eq_m128i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm_mask_ipcvtts_ph_epu8() { + let a = CVTPH2IUBS_INPUT; + let src = _mm_set_epi16(10, 20, 30, 40, 50, 60, 70, 80); + let r = _mm_mask_ipcvtts_ph_epu8(src, 0b01010101, a); + let e = _mm_setb_epi16(10, 0, 30, -1, 50, -1, 70, 0); + assert_eq_m128i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm_maskz_ipcvtts_ph_epu8() { + let a = CVTPH2IUBS_INPUT; + let r = _mm_maskz_ipcvtts_ph_epu8(0b01010101, a); + let e = _mm_setb_epi16(0, 0, 0, -1, 0, -1, 0, 0); + assert_eq_m128i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm256_ipcvtts_ph_epu8() { + let a = _mm256_set1_m128h(CVTPH2IUBS_INPUT); + let r = _mm256_ipcvtts_ph_epu8(a); + let e = _mm256_setb_epi16(1, 0, -1, 
-1, 0, -1, 0, 0, 1, 0, -1, -1, 0, -1, 0, 0); + assert_eq_m256i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm256_mask_ipcvtts_ph_epu8() { + let a = _mm256_set1_m128h(CVTPH2IUBS_INPUT); + let src = _mm256_set_epi16( + 10, 20, 30, 40, 50, 60, 70, 80, 15, 25, 35, 45, 55, 65, 75, 85, + ); + let r = _mm256_mask_ipcvtts_ph_epu8(src, 0b0101010101010101, a); + let e = _mm256_setb_epi16(10, 0, 30, -1, 50, -1, 70, 0, 15, 0, 35, -1, 55, -1, 75, 0); + assert_eq_m256i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm256_maskz_ipcvtts_ph_epu8() { + let a = _mm256_set1_m128h(CVTPH2IUBS_INPUT); + let r = _mm256_maskz_ipcvtts_ph_epu8(0b0101010101010101, a); + let e = _mm256_setb_epi16(0, 0, 0, -1, 0, -1, 0, 0, 0, 0, 0, -1, 0, -1, 0, 0); + assert_eq_m256i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm512_ipcvtts_ph_epu8() { + let a = _mm512_set1_m128h(CVTPH2IUBS_INPUT); + let r = _mm512_ipcvtts_ph_epu8(a); + let e = _mm512_setb_epi16( + 1, 0, -1, -1, 0, -1, 0, 0, 1, 0, -1, -1, 0, -1, 0, 0, 1, 0, -1, -1, 0, -1, 0, 0, 1, 0, + -1, -1, 0, -1, 0, 0, + ); + assert_eq_m512i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_ipcvtts_ph_epu8() { + let a = _mm512_set1_m128h(CVTPH2IUBS_INPUT); + let src = _mm512_set_epi16( + 10, 20, 30, 40, 50, 60, 70, 80, 15, 25, 35, 45, 55, 65, 75, 85, 80, 70, 60, 50, 40, 30, + 20, 10, 85, 75, 65, 55, 45, 35, 25, 15, + ); + let r = _mm512_mask_ipcvtts_ph_epu8(src, 0b01010101010101010101010101010101, a); + let e = _mm512_setb_epi16( + 10, 0, 30, -1, 50, -1, 70, 0, 15, 0, 35, -1, 55, -1, 75, 0, 80, 0, 60, -1, 40, -1, 20, + 0, 85, 0, 65, -1, 45, -1, 25, 0, + ); + assert_eq_m512i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_ipcvtts_ph_epu8() { + let a = _mm512_set1_m128h(CVTPH2IUBS_INPUT); + let r = _mm512_maskz_ipcvtts_ph_epu8(0b01010101010101010101010101010101, a); + let e = _mm512_setb_epi16( + 0, 0, 0, -1, 0, -1, 0, 0, 0, 0, 0, -1, 0, -1, 0, 0, 0, 0, 0, -1, 0, -1, 0, 0, 0, 0, 0, + -1, 0, -1, 0, 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_ipcvtts_roundph_epu8() { + let a = _mm512_set1_m128h(CVTPH2IUBS_INPUT); + let r = _mm512_ipcvtts_roundph_epu8::<_MM_FROUND_NO_EXC>(a); + let e = _mm512_setb_epi16( + 1, 0, -1, -1, 0, -1, 0, 0, 1, 0, -1, -1, 0, -1, 0, 0, 1, 0, -1, -1, 0, -1, 0, 0, 1, 0, + -1, -1, 0, -1, 0, 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_ipcvtts_roundph_epu8() { + let a = _mm512_set1_m128h(CVTPH2IUBS_INPUT); + let src = _mm512_set_epi16( + 10, 20, 30, 40, 50, 60, 70, 80, 15, 25, 35, 45, 55, 65, 75, 85, 80, 70, 60, 50, 40, 30, + 20, 10, 85, 75, 65, 55, 45, 35, 25, 15, + ); + let r = _mm512_mask_ipcvtts_roundph_epu8::<_MM_FROUND_NO_EXC>( + src, + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_setb_epi16( + 10, 0, 30, -1, 50, -1, 70, 0, 15, 0, 35, -1, 55, -1, 75, 0, 80, 0, 60, -1, 40, -1, 20, + 0, 85, 0, 65, -1, 45, -1, 25, 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_ipcvtts_roundph_epu8() { + let a = _mm512_set1_m128h(CVTPH2IUBS_INPUT); + let r = _mm512_maskz_ipcvtts_roundph_epu8::<_MM_FROUND_NO_EXC>( + 0b01010101010101010101010101010101, + a, + ); + let e = _mm512_setb_epi16( + 0, 0, 0, -1, 0, -1, 0, 0, 0, 0, 0, -1, 0, -1, 0, 0, 0, 0, 0, -1, 0, -1, 0, 0, 0, 0, 0, + -1, 0, -1, 0, 0, + ); + assert_eq_m512i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn 
test_mm_ipcvtts_ps_epi8() { + let a = CVTPS2IBS128_INPUT; + let r = _mm_ipcvtts_ps_epi8(a); + let e = _mm_setb_epi32(1, -2, 127, 127); + assert_eq_m128i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm_mask_ipcvtts_ps_epi8() { + let a = CVTPS2IBS128_INPUT; + let src = _mm_set_epi32(10, 20, 30, 40); + let r = _mm_mask_ipcvtts_ps_epi8(src, 0b0101, a); + let e = _mm_setb_epi32(10, -2, 30, 127); + assert_eq_m128i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm_maskz_ipcvtts_ps_epi8() { + let a = CVTPS2IBS128_INPUT; + let r = _mm_maskz_ipcvtts_ps_epi8(0b0101, a); + let e = _mm_setb_epi32(0, -2, 0, 127); + assert_eq_m128i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm256_ipcvtts_ps_epi8() { + let a = CVTPS2IBS_INPUT; + let r = _mm256_ipcvtts_ps_epi8(a); + let e = _mm256_setb_epi32(1, -2, 127, 127, -128, 127, 0, 0); + assert_eq_m256i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm256_mask_ipcvtts_ps_epi8() { + let a = CVTPS2IBS_INPUT; + let src = _mm256_set_epi32(10, 20, 30, 40, 50, 60, 70, 80); + let r = _mm256_mask_ipcvtts_ps_epi8(src, 0b01010101, a); + let e = _mm256_setb_epi32(10, -2, 30, 127, 50, 127, 70, 0); + assert_eq_m256i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm256_maskz_ipcvtts_ps_epi8() { + let a = CVTPS2IBS_INPUT; + let r = _mm256_maskz_ipcvtts_ps_epi8(0b01010101, a); + let e = _mm256_setb_epi32(0, -2, 0, 127, 0, 127, 0, 0); + assert_eq_m256i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm512_ipcvtts_ps_epi8() { + let a = _mm512_set1_m256(CVTPS2IBS_INPUT); + let r = _mm512_ipcvtts_ps_epi8(a); + let e = _mm512_setb_epi32( + 1, -2, 127, 127, -128, 127, 0, 0, 1, -2, 127, 127, -128, 127, 0, 0, + ); + assert_eq_m512i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_ipcvtts_ps_epi8() { + let a = _mm512_set1_m256(CVTPS2IBS_INPUT); + let src = _mm512_set_epi32( + 10, 20, 30, 40, 50, 60, 70, 80, 15, 25, 35, 45, 55, 65, 75, 85, + ); + let r = _mm512_mask_ipcvtts_ps_epi8(src, 0b0101010101010101, a); + let e = _mm512_setb_epi32( + 10, -2, 30, 127, 50, 127, 70, 0, 15, -2, 35, 127, 55, 127, 75, 0, + ); + assert_eq_m512i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_ipcvtts_ps_epi8() { + let a = _mm512_set1_m256(CVTPS2IBS_INPUT); + let r = _mm512_maskz_ipcvtts_ps_epi8(0b0101010101010101, a); + let e = _mm512_setb_epi32(0, -2, 0, 127, 0, 127, 0, 0, 0, -2, 0, 127, 0, 127, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_ipcvtts_roundps_epi8() { + let a = _mm512_set1_m256(CVTPS2IBS_INPUT); + let r = _mm512_ipcvtts_roundps_epi8::<_MM_FROUND_NO_EXC>(a); + let e = _mm512_setb_epi32( + 1, -2, 127, 127, -128, 127, 0, 0, 1, -2, 127, 127, -128, 127, 0, 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_ipcvtts_roundps_epi8() { + let a = _mm512_set1_m256(CVTPS2IBS_INPUT); + let src = _mm512_set_epi32( + 10, 20, 30, 40, 50, 60, 70, 80, 15, 25, 35, 45, 55, 65, 75, 85, + ); + let r = _mm512_mask_ipcvtts_roundps_epi8::<_MM_FROUND_NO_EXC>(src, 0b0101010101010101, a); + let e = _mm512_setb_epi32( + 10, -2, 30, 127, 50, 127, 70, 0, 15, -2, 35, 127, 55, 127, 75, 0, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_ipcvtts_roundps_epi8() { + let a = _mm512_set1_m256(CVTPS2IBS_INPUT); + let r = _mm512_maskz_ipcvtts_roundps_epi8::<_MM_FROUND_NO_EXC>(0b0101010101010101, a); + 
let e = _mm512_setb_epi32(0, -2, 0, 127, 0, 127, 0, 0, 0, -2, 0, 127, 0, 127, 0, 0); + assert_eq_m512i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm_ipcvtts_ps_epu8() { + let a = CVTPS2IUBS128_INPUT; + let r = _mm_ipcvtts_ps_epu8(a); + let e = _mm_setb_epi32(1, 0, -1, -1); + assert_eq_m128i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm_mask_ipcvtts_ps_epu8() { + let a = CVTPS2IUBS128_INPUT; + let src = _mm_set_epi32(10, 20, 30, 40); + let r = _mm_mask_ipcvtts_ps_epu8(src, 0b0101, a); + let e = _mm_setb_epi32(10, 0, 30, -1); + assert_eq_m128i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm_maskz_ipcvtts_ps_epu8() { + let a = CVTPS2IUBS128_INPUT; + let r = _mm_maskz_ipcvtts_ps_epu8(0b0101, a); + let e = _mm_setb_epi32(0, 0, 0, -1); + assert_eq_m128i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm256_ipcvtts_ps_epu8() { + let a = CVTPS2IUBS_INPUT; + let r = _mm256_ipcvtts_ps_epu8(a); + let e = _mm256_setb_epi32(1, 0, -1, -1, 0, -1, 0, 0); + assert_eq_m256i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm256_mask_ipcvtts_ps_epu8() { + let a = CVTPS2IUBS_INPUT; + let src = _mm256_set_epi32(10, 20, 30, 40, 50, 60, 70, 80); + let r = _mm256_mask_ipcvtts_ps_epu8(src, 0b01010101, a); + let e = _mm256_setb_epi32(10, 0, 30, -1, 50, -1, 70, 0); + assert_eq_m256i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm256_maskz_ipcvtts_ps_epu8() { + let a = CVTPS2IUBS_INPUT; + let r = _mm256_maskz_ipcvtts_ps_epu8(0b01010101, a); + let e = _mm256_setb_epi32(0, 0, 0, -1, 0, -1, 0, 0); + assert_eq_m256i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm512_ipcvtts_ps_epu8() { + let a = _mm512_set1_m256(CVTPS2IUBS_INPUT); + let r = _mm512_ipcvtts_ps_epu8(a); + let e = _mm512_setb_epi32(1, 0, -1, -1, 0, -1, 0, 0, 1, 0, -1, -1, 0, -1, 0, 0); + assert_eq_m512i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_ipcvtts_ps_epu8() { + let a = _mm512_set1_m256(CVTPS2IUBS_INPUT); + let src = _mm512_set_epi32( + 10, 20, 30, 40, 50, 60, 70, 80, 15, 25, 35, 45, 55, 65, 75, 85, + ); + let r = _mm512_mask_ipcvtts_ps_epu8(src, 0b0101010101010101, a); + let e = _mm512_setb_epi32(10, 0, 30, -1, 50, -1, 70, 0, 15, 0, 35, -1, 55, -1, 75, 0); + assert_eq_m512i(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_ipcvtts_ps_epu8() { + let a = _mm512_set1_m256(CVTPS2IUBS_INPUT); + let r = _mm512_maskz_ipcvtts_ps_epu8(0b0101010101010101, a); + let e = _mm512_setb_epi32(0, 0, 0, -1, 0, -1, 0, 0, 0, 0, 0, -1, 0, -1, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_ipcvtts_roundps_epu8() { + let a = _mm512_set1_m256(CVTPS2IUBS_INPUT); + let r = _mm512_ipcvtts_roundps_epu8::<_MM_FROUND_NO_EXC>(a); + let e = _mm512_setb_epi32(1, 0, -1, -1, 0, -1, 0, 0, 1, 0, -1, -1, 0, -1, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_ipcvtts_roundps_epu8() { + let a = _mm512_set1_m256(CVTPS2IUBS_INPUT); + let src = _mm512_set_epi32( + 10, 20, 30, 40, 50, 60, 70, 80, 15, 25, 35, 45, 55, 65, 75, 85, + ); + let r = _mm512_mask_ipcvtts_roundps_epu8::<_MM_FROUND_NO_EXC>(src, 0b0101010101010101, a); + let e = _mm512_setb_epi32(10, 0, 30, -1, 50, -1, 70, 0, 15, 0, 35, -1, 55, -1, 75, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_ipcvtts_roundps_epu8() { + let a = _mm512_set1_m256(CVTPS2IUBS_INPUT); + 
let r = _mm512_maskz_ipcvtts_roundps_epu8::<_MM_FROUND_NO_EXC>(0b0101010101010101, a); + let e = _mm512_setb_epi32(0, 0, 0, -1, 0, -1, 0, 0, 0, 0, 0, -1, 0, -1, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_cvtx2ps_ph() { + let a = _mm_set_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_set_ps(5.0, 6.0, 7.0, 8.0); + let r = _mm_cvtx2ps_ph(a, b); + let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_mask_cvtx2ps_ph() { + let a = _mm_set_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_set_ps(5.0, 6.0, 7.0, 8.0); + let src = _mm_set_ph(50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0); + let r = _mm_mask_cvtx2ps_ph(src, 0b01010101, a, b); + let e = _mm_set_ph(50.0, 2.0, 52.0, 4.0, 54.0, 6.0, 56.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_maskz_cvtx2ps_ph() { + let a = _mm_set_ps(1.0, 2.0, 3.0, 4.0); + let b = _mm_set_ps(5.0, 6.0, 7.0, 8.0); + let r = _mm_maskz_cvtx2ps_ph(0b01010101, a, b); + let e = _mm_set_ph(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_cvtx2ps_ph() { + let a = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm256_set_ps(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let r = _mm256_cvtx2ps_ph(a, b); + let e = _mm256_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_mask_cvtx2ps_ph() { + let a = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm256_set_ps(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let src = _mm256_set_ph( + 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, + 64.0, 65.0, + ); + let r = _mm256_mask_cvtx2ps_ph(src, 0b0101010101010101, a, b); + let e = _mm256_set_ph( + 50.0, 2.0, 52.0, 4.0, 54.0, 6.0, 56.0, 8.0, 58.0, 10.0, 60.0, 12.0, 62.0, 14.0, 64.0, + 16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_maskz_cvtx2ps_ph() { + let a = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = _mm256_set_ps(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let r = _mm256_maskz_cvtx2ps_ph(0b0101010101010101, a, b); + let e = _mm256_set_ph( + 0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0, + ); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_cvtx2ps_ph() { + let a = _mm512_set_ps( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm512_set_ps( + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let r = _mm512_cvtx2ps_ph(a, b); + let e = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_cvtx2ps_ph() { + let a = _mm512_set_ps( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm512_set_ps( + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let src = _mm512_set_ph( + 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, + 64.0, 
65.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, + 78.0, 79.0, 80.0, 81.0, + ); + let r = _mm512_mask_cvtx2ps_ph(src, 0b01010101010101010101010101010101, a, b); + let e = _mm512_set_ph( + 50.0, 2.0, 52.0, 4.0, 54.0, 6.0, 56.0, 8.0, 58.0, 10.0, 60.0, 12.0, 62.0, 14.0, 64.0, + 16.0, 66.0, 18.0, 68.0, 20.0, 70.0, 22.0, 72.0, 24.0, 74.0, 26.0, 76.0, 28.0, 78.0, + 30.0, 80.0, 32.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_cvtx2ps_ph() { + let a = _mm512_set_ps( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm512_set_ps( + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let r = _mm512_maskz_cvtx2ps_ph(0b01010101010101010101010101010101, a, b); + let e = _mm512_set_ph( + 0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0, + 0.0, 18.0, 0.0, 20.0, 0.0, 22.0, 0.0, 24.0, 0.0, 26.0, 0.0, 28.0, 0.0, 30.0, 0.0, 32.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_cvtx_round2ps_ph() { + let a = _mm512_set_ps( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm512_set_ps( + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let r = _mm512_cvtx_round2ps_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); + let e = _mm512_set_ph( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_cvtx_round2ps_ph() { + let a = _mm512_set_ps( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm512_set_ps( + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let src = _mm512_set_ph( + 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, + 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, + 78.0, 79.0, 80.0, 81.0, + ); + let r = _mm512_mask_cvtx_round2ps_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, + 0b01010101010101010101010101010101, + a, + b, + ); + let e = _mm512_set_ph( + 50.0, 2.0, 52.0, 4.0, 54.0, 6.0, 56.0, 8.0, 58.0, 10.0, 60.0, 12.0, 62.0, 14.0, 64.0, + 16.0, 66.0, 18.0, 68.0, 20.0, 70.0, 22.0, 72.0, 24.0, 74.0, 26.0, 76.0, 28.0, 78.0, + 30.0, 80.0, 32.0, + ); + assert_eq_m512h(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_cvtx_round2ps_ph() { + let a = _mm512_set_ps( + 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + ); + let b = _mm512_set_ps( + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, + 31.0, 32.0, + ); + let r = _mm512_maskz_cvtx_round2ps_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b01010101010101010101010101010101, + a, + b, + ); + let e = _mm512_set_ph( + 0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0, + 0.0, 18.0, 0.0, 20.0, 0.0, 22.0, 0.0, 24.0, 0.0, 26.0, 0.0, 28.0, 0.0, 30.0, 0.0, 32.0, + ); + assert_eq_m512h(r, e); + } + + const BF8_M0: i8 = 0x80_u8 as i8; + const BF8_1: i8 = 0x3c; + const BF8_1P25: i8 = 0x3d; + const BF8_M1: i8 = 0xbc_u8 as i8; + const 
BF8_M1P25: i8 = 0xbd_u8 as i8; + const BF8_INF: i8 = 0x7c; + const BF8_MAX: i8 = 0x7b; + const BF8_MIN: i8 = 0x04; + + const BF8_INPUT: __m128h = + unsafe { _mm_set_ph(-0.0, 1.0, 1.2, -1.1, 1.125, -1.125, 61440.0, f16::INFINITY) }; + + #[simd_test(enable = "avx10.2")] + fn test_mm_cvtbiasph_bf8() { + let b = BF8_INPUT; + + let a = _mm_set1_epi16(0); + let r = _mm_cvtbiasph_bf8(a, b); + let e = _mm_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, BF8_M0, BF8_1, BF8_1, BF8_M1, BF8_1, BF8_M1, BF8_MAX, BF8_INF, + ); + assert_eq_m128i(r, e); + + let a = _mm_set1_epi16(255); + let r = _mm_cvtbiasph_bf8(a, b); + let e = _mm_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, BF8_M0, BF8_1, BF8_1P25, BF8_M1P25, BF8_1P25, BF8_M1P25, + BF8_INF, BF8_INF, + ); + assert_eq_m128i(r, e); + + let a = _mm_set1_epi16(127); + let r = _mm_cvtbiasph_bf8(a, b); + let e = _mm_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, BF8_M0, BF8_1, BF8_1P25, BF8_M1, BF8_1, BF8_M1, BF8_MAX, + BF8_INF, + ); + assert_eq_m128i(r, e); + + let a = _mm_set1_epi16(128); + let r = _mm_cvtbiasph_bf8(a, b); + let e = _mm_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, BF8_M0, BF8_1, BF8_1P25, BF8_M1, BF8_1P25, BF8_M1P25, BF8_INF, + BF8_INF, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_mask_cvtbiasph_bf8() { + let b = BF8_INPUT; + let src = _mm_set1_epi8(0x7f); + + let a = _mm_set1_epi16(0); + let r = _mm_mask_cvtbiasph_bf8(src, 0b01010101, a, b); + let e = _mm_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 0x7f, BF8_1, 0x7f, BF8_M1, 0x7f, BF8_M1, 0x7f, BF8_INF, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_maskz_cvtbiasph_bf8() { + let b = BF8_INPUT; + + let a = _mm_set1_epi16(0); + let r = _mm_maskz_cvtbiasph_bf8(0b01010101, a, b); + let e = _mm_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 0, BF8_1, 0, BF8_M1, 0, BF8_M1, 0, BF8_INF, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_cvtbiasph_bf8() { + let b = _mm256_set1_m128h(BF8_INPUT); + + let a = _mm256_set1_epi16(0); + let r = _mm256_cvtbiasph_bf8(a, b); + let e = _mm_set_epi8( + BF8_M0, BF8_1, BF8_1, BF8_M1, BF8_1, BF8_M1, BF8_MAX, BF8_INF, BF8_M0, BF8_1, BF8_1, + BF8_M1, BF8_1, BF8_M1, BF8_MAX, BF8_INF, + ); + assert_eq_m128i(r, e); + + let a = _mm256_set1_epi16(255); + let r = _mm256_cvtbiasph_bf8(a, b); + let e = _mm_set_epi8( + BF8_M0, BF8_1, BF8_1P25, BF8_M1P25, BF8_1P25, BF8_M1P25, BF8_INF, BF8_INF, BF8_M0, + BF8_1, BF8_1P25, BF8_M1P25, BF8_1P25, BF8_M1P25, BF8_INF, BF8_INF, + ); + assert_eq_m128i(r, e); + + let a = _mm256_set1_epi16(127); + let r = _mm256_cvtbiasph_bf8(a, b); + let e = _mm_set_epi8( + BF8_M0, BF8_1, BF8_1P25, BF8_M1, BF8_1, BF8_M1, BF8_MAX, BF8_INF, BF8_M0, BF8_1, + BF8_1P25, BF8_M1, BF8_1, BF8_M1, BF8_MAX, BF8_INF, + ); + assert_eq_m128i(r, e); + + let a = _mm256_set1_epi16(128); + let r = _mm256_cvtbiasph_bf8(a, b); + let e = _mm_set_epi8( + BF8_M0, BF8_1, BF8_1P25, BF8_M1, BF8_1P25, BF8_M1P25, BF8_INF, BF8_INF, BF8_M0, BF8_1, + BF8_1P25, BF8_M1, BF8_1P25, BF8_M1P25, BF8_INF, BF8_INF, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_mask_cvtbiasph_bf8() { + let b = _mm256_set1_m128h(BF8_INPUT); + let src = _mm_set1_epi8(0x7f); + + let a = _mm256_set1_epi16(0); + let r = _mm256_mask_cvtbiasph_bf8(src, 0b0101010101010101, a, b); + let e = _mm_set_epi8( + 0x7f, BF8_1, 0x7f, BF8_M1, 0x7f, BF8_M1, 0x7f, BF8_INF, 0x7f, BF8_1, 0x7f, BF8_M1, + 0x7f, BF8_M1, 0x7f, BF8_INF, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_maskz_cvtbiasph_bf8() { 
+ let b = _mm256_set1_m128h(BF8_INPUT); + + let a = _mm256_set1_epi16(0); + let r = _mm256_maskz_cvtbiasph_bf8(0b0101010101010101, a, b); + let e = _mm_set_epi8( + 0, BF8_1, 0, BF8_M1, 0, BF8_M1, 0, BF8_INF, 0, BF8_1, 0, BF8_M1, 0, BF8_M1, 0, BF8_INF, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_cvtbiasph_bf8() { + let b = _mm512_set1_m128h(BF8_INPUT); + + let a = _mm512_set1_epi16(0); + let r = _mm512_cvtbiasph_bf8(a, b); + let e = _mm256_set_epi8( + BF8_M0, BF8_1, BF8_1, BF8_M1, BF8_1, BF8_M1, BF8_MAX, BF8_INF, BF8_M0, BF8_1, BF8_1, + BF8_M1, BF8_1, BF8_M1, BF8_MAX, BF8_INF, BF8_M0, BF8_1, BF8_1, BF8_M1, BF8_1, BF8_M1, + BF8_MAX, BF8_INF, BF8_M0, BF8_1, BF8_1, BF8_M1, BF8_1, BF8_M1, BF8_MAX, BF8_INF, + ); + assert_eq_m256i(r, e); + + let a = _mm512_set1_epi16(255); + let r = _mm512_cvtbiasph_bf8(a, b); + let e = _mm256_set_epi8( + BF8_M0, BF8_1, BF8_1P25, BF8_M1P25, BF8_1P25, BF8_M1P25, BF8_INF, BF8_INF, BF8_M0, + BF8_1, BF8_1P25, BF8_M1P25, BF8_1P25, BF8_M1P25, BF8_INF, BF8_INF, BF8_M0, BF8_1, + BF8_1P25, BF8_M1P25, BF8_1P25, BF8_M1P25, BF8_INF, BF8_INF, BF8_M0, BF8_1, BF8_1P25, + BF8_M1P25, BF8_1P25, BF8_M1P25, BF8_INF, BF8_INF, + ); + assert_eq_m256i(r, e); + + let a = _mm512_set1_epi16(127); + let r = _mm512_cvtbiasph_bf8(a, b); + let e = _mm256_set_epi8( + BF8_M0, BF8_1, BF8_1P25, BF8_M1, BF8_1, BF8_M1, BF8_MAX, BF8_INF, BF8_M0, BF8_1, + BF8_1P25, BF8_M1, BF8_1, BF8_M1, BF8_MAX, BF8_INF, BF8_M0, BF8_1, BF8_1P25, BF8_M1, + BF8_1, BF8_M1, BF8_MAX, BF8_INF, BF8_M0, BF8_1, BF8_1P25, BF8_M1, BF8_1, BF8_M1, + BF8_MAX, BF8_INF, + ); + assert_eq_m256i(r, e); + + let a = _mm512_set1_epi16(128); + let r = _mm512_cvtbiasph_bf8(a, b); + let e = _mm256_set_epi8( + BF8_M0, BF8_1, BF8_1P25, BF8_M1, BF8_1P25, BF8_M1P25, BF8_INF, BF8_INF, BF8_M0, BF8_1, + BF8_1P25, BF8_M1, BF8_1P25, BF8_M1P25, BF8_INF, BF8_INF, BF8_M0, BF8_1, BF8_1P25, + BF8_M1, BF8_1P25, BF8_M1P25, BF8_INF, BF8_INF, BF8_M0, BF8_1, BF8_1P25, BF8_M1, + BF8_1P25, BF8_M1P25, BF8_INF, BF8_INF, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_cvtbiasph_bf8() { + let b = _mm512_set1_m128h(BF8_INPUT); + let src = _mm256_set1_epi8(0x7f); + + let a = _mm512_set1_epi16(0); + let r = _mm512_mask_cvtbiasph_bf8(src, 0b01010101010101010101010101010101, a, b); + let e = _mm256_set_epi8( + 0x7f, BF8_1, 0x7f, BF8_M1, 0x7f, BF8_M1, 0x7f, BF8_INF, 0x7f, BF8_1, 0x7f, BF8_M1, + 0x7f, BF8_M1, 0x7f, BF8_INF, 0x7f, BF8_1, 0x7f, BF8_M1, 0x7f, BF8_M1, 0x7f, BF8_INF, + 0x7f, BF8_1, 0x7f, BF8_M1, 0x7f, BF8_M1, 0x7f, BF8_INF, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_cvtbiasph_bf8() { + let b = _mm512_set1_m128h(BF8_INPUT); + + let a = _mm512_set1_epi16(0); + let r = _mm512_maskz_cvtbiasph_bf8(0b01010101010101010101010101010101, a, b); + let e = _mm256_set_epi8( + 0, BF8_1, 0, BF8_M1, 0, BF8_M1, 0, BF8_INF, 0, BF8_1, 0, BF8_M1, 0, BF8_M1, 0, BF8_INF, + 0, BF8_1, 0, BF8_M1, 0, BF8_M1, 0, BF8_INF, 0, BF8_1, 0, BF8_M1, 0, BF8_M1, 0, BF8_INF, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_cvts_biasph_bf8() { + let b = BF8_INPUT; + + let a = _mm_set1_epi16(0); + let r = _mm_cvts_biasph_bf8(a, b); + let e = _mm_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, BF8_M0, BF8_1, BF8_1, BF8_M1, BF8_1, BF8_M1, BF8_MAX, BF8_MAX, + ); + assert_eq_m128i(r, e); + + let a = _mm_set1_epi16(255); + let r = _mm_cvts_biasph_bf8(a, b); + let e = _mm_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, BF8_M0, BF8_1, BF8_1P25, BF8_M1P25, 
BF8_1P25, BF8_M1P25, + BF8_MAX, BF8_MAX, + ); + assert_eq_m128i(r, e); + + let a = _mm_set1_epi16(127); + let r = _mm_cvts_biasph_bf8(a, b); + let e = _mm_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, BF8_M0, BF8_1, BF8_1P25, BF8_M1, BF8_1, BF8_M1, BF8_MAX, + BF8_MAX, + ); + assert_eq_m128i(r, e); + + let a = _mm_set1_epi16(128); + let r = _mm_cvts_biasph_bf8(a, b); + let e = _mm_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, BF8_M0, BF8_1, BF8_1P25, BF8_M1, BF8_1P25, BF8_M1P25, BF8_MAX, + BF8_MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_mask_cvts_biasph_bf8() { + let b = BF8_INPUT; + let src = _mm_set1_epi8(0x7f); + + let a = _mm_set1_epi16(0); + let r = _mm_mask_cvts_biasph_bf8(src, 0b01010101, a, b); + let e = _mm_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 0x7f, BF8_1, 0x7f, BF8_M1, 0x7f, BF8_M1, 0x7f, BF8_MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_maskz_cvts_biasph_bf8() { + let b = BF8_INPUT; + + let a = _mm_set1_epi16(0); + let r = _mm_maskz_cvts_biasph_bf8(0b01010101, a, b); + let e = _mm_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 0, BF8_1, 0, BF8_M1, 0, BF8_M1, 0, BF8_MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_cvts_biasph_bf8() { + let b = _mm256_set1_m128h(BF8_INPUT); + + let a = _mm256_set1_epi16(0); + let r = _mm256_cvts_biasph_bf8(a, b); + let e = _mm_set_epi8( + BF8_M0, BF8_1, BF8_1, BF8_M1, BF8_1, BF8_M1, BF8_MAX, BF8_MAX, BF8_M0, BF8_1, BF8_1, + BF8_M1, BF8_1, BF8_M1, BF8_MAX, BF8_MAX, + ); + assert_eq_m128i(r, e); + + let a = _mm256_set1_epi16(255); + let r = _mm256_cvts_biasph_bf8(a, b); + let e = _mm_set_epi8( + BF8_M0, BF8_1, BF8_1P25, BF8_M1P25, BF8_1P25, BF8_M1P25, BF8_MAX, BF8_MAX, BF8_M0, + BF8_1, BF8_1P25, BF8_M1P25, BF8_1P25, BF8_M1P25, BF8_MAX, BF8_MAX, + ); + assert_eq_m128i(r, e); + + let a = _mm256_set1_epi16(127); + let r = _mm256_cvts_biasph_bf8(a, b); + let e = _mm_set_epi8( + BF8_M0, BF8_1, BF8_1P25, BF8_M1, BF8_1, BF8_M1, BF8_MAX, BF8_MAX, BF8_M0, BF8_1, + BF8_1P25, BF8_M1, BF8_1, BF8_M1, BF8_MAX, BF8_MAX, + ); + assert_eq_m128i(r, e); + + let a = _mm256_set1_epi16(128); + let r = _mm256_cvts_biasph_bf8(a, b); + let e = _mm_set_epi8( + BF8_M0, BF8_1, BF8_1P25, BF8_M1, BF8_1P25, BF8_M1P25, BF8_MAX, BF8_MAX, BF8_M0, BF8_1, + BF8_1P25, BF8_M1, BF8_1P25, BF8_M1P25, BF8_MAX, BF8_MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_mask_cvts_biasph_bf8() { + let b = _mm256_set1_m128h(BF8_INPUT); + let src = _mm_set1_epi8(0x7f); + + let a = _mm256_set1_epi16(0); + let r = _mm256_mask_cvts_biasph_bf8(src, 0b0101010101010101, a, b); + let e = _mm_set_epi8( + 0x7f, BF8_1, 0x7f, BF8_M1, 0x7f, BF8_M1, 0x7f, BF8_MAX, 0x7f, BF8_1, 0x7f, BF8_M1, + 0x7f, BF8_M1, 0x7f, BF8_MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_maskz_cvts_biasph_bf8() { + let b = _mm256_set1_m128h(BF8_INPUT); + + let a = _mm256_set1_epi16(0); + let r = _mm256_maskz_cvts_biasph_bf8(0b0101010101010101, a, b); + let e = _mm_set_epi8( + 0, BF8_1, 0, BF8_M1, 0, BF8_M1, 0, BF8_MAX, 0, BF8_1, 0, BF8_M1, 0, BF8_M1, 0, BF8_MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_cvts_biasph_bf8() { + let b = _mm512_set1_m128h(BF8_INPUT); + + let a = _mm512_set1_epi16(0); + let r = _mm512_cvts_biasph_bf8(a, b); + let e = _mm256_set_epi8( + BF8_M0, BF8_1, BF8_1, BF8_M1, BF8_1, BF8_M1, BF8_MAX, BF8_MAX, BF8_M0, BF8_1, BF8_1, + BF8_M1, BF8_1, BF8_M1, BF8_MAX, BF8_MAX, BF8_M0, BF8_1, BF8_1, 
BF8_M1, BF8_1, BF8_M1, + BF8_MAX, BF8_MAX, BF8_M0, BF8_1, BF8_1, BF8_M1, BF8_1, BF8_M1, BF8_MAX, BF8_MAX, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_cvts_biasph_bf8() { + let b = _mm512_set1_m128h(BF8_INPUT); + let src = _mm256_set1_epi8(0x7f); + + let a = _mm512_set1_epi16(0); + let r = _mm512_mask_cvts_biasph_bf8(src, 0b01010101010101010101010101010101, a, b); + let e = _mm256_set_epi8( + 0x7f, BF8_1, 0x7f, BF8_M1, 0x7f, BF8_M1, 0x7f, BF8_MAX, 0x7f, BF8_1, 0x7f, BF8_M1, + 0x7f, BF8_M1, 0x7f, BF8_MAX, 0x7f, BF8_1, 0x7f, BF8_M1, 0x7f, BF8_M1, 0x7f, BF8_MAX, + 0x7f, BF8_1, 0x7f, BF8_M1, 0x7f, BF8_M1, 0x7f, BF8_MAX, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_cvts_biasph_bf8() { + let b = _mm512_set1_m128h(BF8_INPUT); + + let a = _mm512_set1_epi16(0); + let r = _mm512_maskz_cvts_biasph_bf8(0b01010101010101010101010101010101, a, b); + let e = _mm256_set_epi8( + 0, BF8_1, 0, BF8_M1, 0, BF8_M1, 0, BF8_MAX, 0, BF8_1, 0, BF8_M1, 0, BF8_M1, 0, BF8_MAX, + 0, BF8_1, 0, BF8_M1, 0, BF8_M1, 0, BF8_MAX, 0, BF8_1, 0, BF8_M1, 0, BF8_M1, 0, BF8_MAX, + ); + assert_eq_m256i(r, e); + } + + const HF8_M0: i8 = 0x80_u8 as i8; + const HF8_1: i8 = 0x38; + const HF8_1P125: i8 = 0x39; + const HF8_M1: i8 = 0xb8_u8 as i8; + const HF8_M1P125: i8 = 0xb9_u8 as i8; + const HF8_NAN: i8 = 0x7f; + const HF8_MAX: i8 = 0x7e; + const HF8_MIN: i8 = 0x08; + const HF8_MIN_DEN: i8 = 0x01; + + const HF8_INPUT: __m128h = + unsafe { _mm_set_ph(-0.0, 1.0, 1.1, -1.05, 1.0625, -1.0625, 464.0, f16::INFINITY) }; + + #[simd_test(enable = "avx10.2")] + fn test_mm_cvtbiasph_hf8() { + let b = HF8_INPUT; + + let a = _mm_set1_epi16(0); + let r = _mm_cvtbiasph_hf8(a, b); + let e = _mm_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, HF8_M0, HF8_1, HF8_1, HF8_M1, HF8_1, HF8_M1, HF8_MAX, HF8_NAN, + ); + assert_eq_m128i(r, e); + + let a = _mm_set1_epi16(255); + let r = _mm_cvtbiasph_hf8(a, b); + let e = _mm_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, HF8_M0, HF8_1, HF8_1P125, HF8_M1P125, HF8_1P125, HF8_M1P125, + HF8_NAN, HF8_NAN, + ); + assert_eq_m128i(r, e); + + let a = _mm_set1_epi16(127); + let r = _mm_cvtbiasph_hf8(a, b); + let e = _mm_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, HF8_M0, HF8_1, HF8_1P125, HF8_M1, HF8_1, HF8_M1, HF8_MAX, + HF8_NAN, + ); + assert_eq_m128i(r, e); + + let a = _mm_set1_epi16(128); + let r = _mm_cvtbiasph_hf8(a, b); + let e = _mm_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, HF8_M0, HF8_1, HF8_1P125, HF8_M1, HF8_1P125, HF8_M1P125, + HF8_NAN, HF8_NAN, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_mask_cvtbiasph_hf8() { + let b = HF8_INPUT; + let src = _mm_set1_epi8(0x7f); + + let a = _mm_set1_epi16(0); + let r = _mm_mask_cvtbiasph_hf8(src, 0b01010101, a, b); + let e = _mm_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 0x7f, HF8_1, 0x7f, HF8_M1, 0x7f, HF8_M1, 0x7f, HF8_NAN, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_maskz_cvtbiasph_hf8() { + let b = HF8_INPUT; + + let a = _mm_set1_epi16(0); + let r = _mm_maskz_cvtbiasph_hf8(0b01010101, a, b); + let e = _mm_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 0, HF8_1, 0, HF8_M1, 0, HF8_M1, 0, HF8_NAN, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_cvtbiasph_hf8() { + let b = _mm256_set1_m128h(HF8_INPUT); + + let a = _mm256_set1_epi16(0); + let r = _mm256_cvtbiasph_hf8(a, b); + let e = _mm_set_epi8( + HF8_M0, HF8_1, HF8_1, HF8_M1, HF8_1, HF8_M1, HF8_MAX, HF8_NAN, HF8_M0, HF8_1, HF8_1, + HF8_M1, HF8_1, HF8_M1, HF8_MAX, 
HF8_NAN, + ); + assert_eq_m128i(r, e); + + let a = _mm256_set1_epi16(255); + let r = _mm256_cvtbiasph_hf8(a, b); + let e = _mm_set_epi8( + HF8_M0, HF8_1, HF8_1P125, HF8_M1P125, HF8_1P125, HF8_M1P125, HF8_NAN, HF8_NAN, HF8_M0, + HF8_1, HF8_1P125, HF8_M1P125, HF8_1P125, HF8_M1P125, HF8_NAN, HF8_NAN, + ); + assert_eq_m128i(r, e); + + let a = _mm256_set1_epi16(127); + let r = _mm256_cvtbiasph_hf8(a, b); + let e = _mm_set_epi8( + HF8_M0, HF8_1, HF8_1P125, HF8_M1, HF8_1, HF8_M1, HF8_MAX, HF8_NAN, HF8_M0, HF8_1, + HF8_1P125, HF8_M1, HF8_1, HF8_M1, HF8_MAX, HF8_NAN, + ); + assert_eq_m128i(r, e); + + let a = _mm256_set1_epi16(128); + let r = _mm256_cvtbiasph_hf8(a, b); + let e = _mm_set_epi8( + HF8_M0, HF8_1, HF8_1P125, HF8_M1, HF8_1P125, HF8_M1P125, HF8_NAN, HF8_NAN, HF8_M0, + HF8_1, HF8_1P125, HF8_M1, HF8_1P125, HF8_M1P125, HF8_NAN, HF8_NAN, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_mask_cvtbiasph_hf8() { + let b = _mm256_set1_m128h(HF8_INPUT); + let src = _mm_set1_epi8(0x7f); + + let a = _mm256_set1_epi16(0); + let r = _mm256_mask_cvtbiasph_hf8(src, 0b0101010101010101, a, b); + let e = _mm_set_epi8( + 0x7f, HF8_1, 0x7f, HF8_M1, 0x7f, HF8_M1, 0x7f, HF8_NAN, 0x7f, HF8_1, 0x7f, HF8_M1, + 0x7f, HF8_M1, 0x7f, HF8_NAN, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_maskz_cvtbiasph_hf8() { + let b = _mm256_set1_m128h(HF8_INPUT); + + let a = _mm256_set1_epi16(0); + let r = _mm256_maskz_cvtbiasph_hf8(0b0101010101010101, a, b); + let e = _mm_set_epi8( + 0, HF8_1, 0, HF8_M1, 0, HF8_M1, 0, HF8_NAN, 0, HF8_1, 0, HF8_M1, 0, HF8_M1, 0, HF8_NAN, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_cvtbiasph_hf8() { + let b = _mm512_set1_m128h(HF8_INPUT); + + let a = _mm512_set1_epi16(0); + let r = _mm512_cvtbiasph_hf8(a, b); + let e = _mm256_set_epi8( + HF8_M0, HF8_1, HF8_1, HF8_M1, HF8_1, HF8_M1, HF8_MAX, HF8_NAN, HF8_M0, HF8_1, HF8_1, + HF8_M1, HF8_1, HF8_M1, HF8_MAX, HF8_NAN, HF8_M0, HF8_1, HF8_1, HF8_M1, HF8_1, HF8_M1, + HF8_MAX, HF8_NAN, HF8_M0, HF8_1, HF8_1, HF8_M1, HF8_1, HF8_M1, HF8_MAX, HF8_NAN, + ); + assert_eq_m256i(r, e); + + let a = _mm512_set1_epi16(255); + let r = _mm512_cvtbiasph_hf8(a, b); + let e = _mm256_set_epi8( + HF8_M0, HF8_1, HF8_1P125, HF8_M1P125, HF8_1P125, HF8_M1P125, HF8_NAN, HF8_NAN, HF8_M0, + HF8_1, HF8_1P125, HF8_M1P125, HF8_1P125, HF8_M1P125, HF8_NAN, HF8_NAN, HF8_M0, HF8_1, + HF8_1P125, HF8_M1P125, HF8_1P125, HF8_M1P125, HF8_NAN, HF8_NAN, HF8_M0, HF8_1, + HF8_1P125, HF8_M1P125, HF8_1P125, HF8_M1P125, HF8_NAN, HF8_NAN, + ); + assert_eq_m256i(r, e); + + let a = _mm512_set1_epi16(127); + let r = _mm512_cvtbiasph_hf8(a, b); + let e = _mm256_set_epi8( + HF8_M0, HF8_1, HF8_1P125, HF8_M1, HF8_1, HF8_M1, HF8_MAX, HF8_NAN, HF8_M0, HF8_1, + HF8_1P125, HF8_M1, HF8_1, HF8_M1, HF8_MAX, HF8_NAN, HF8_M0, HF8_1, HF8_1P125, HF8_M1, + HF8_1, HF8_M1, HF8_MAX, HF8_NAN, HF8_M0, HF8_1, HF8_1P125, HF8_M1, HF8_1, HF8_M1, + HF8_MAX, HF8_NAN, + ); + assert_eq_m256i(r, e); + + let a = _mm512_set1_epi16(128); + let r = _mm512_cvtbiasph_hf8(a, b); + let e = _mm256_set_epi8( + HF8_M0, HF8_1, HF8_1P125, HF8_M1, HF8_1P125, HF8_M1P125, HF8_NAN, HF8_NAN, HF8_M0, + HF8_1, HF8_1P125, HF8_M1, HF8_1P125, HF8_M1P125, HF8_NAN, HF8_NAN, HF8_M0, HF8_1, + HF8_1P125, HF8_M1, HF8_1P125, HF8_M1P125, HF8_NAN, HF8_NAN, HF8_M0, HF8_1, HF8_1P125, + HF8_M1, HF8_1P125, HF8_M1P125, HF8_NAN, HF8_NAN, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_cvtbiasph_hf8() { + let 
b = _mm512_set1_m128h(HF8_INPUT); + let src = _mm256_set1_epi8(0x7f); + + let a = _mm512_set1_epi16(0); + let r = _mm512_mask_cvtbiasph_hf8(src, 0b01010101010101010101010101010101, a, b); + let e = _mm256_set_epi8( + 0x7f, HF8_1, 0x7f, HF8_M1, 0x7f, HF8_M1, 0x7f, HF8_NAN, 0x7f, HF8_1, 0x7f, HF8_M1, + 0x7f, HF8_M1, 0x7f, HF8_NAN, 0x7f, HF8_1, 0x7f, HF8_M1, 0x7f, HF8_M1, 0x7f, HF8_NAN, + 0x7f, HF8_1, 0x7f, HF8_M1, 0x7f, HF8_M1, 0x7f, HF8_NAN, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_cvtbiasph_hf8() { + let b = _mm512_set1_m128h(HF8_INPUT); + + let a = _mm512_set1_epi16(0); + let r = _mm512_maskz_cvtbiasph_hf8(0b01010101010101010101010101010101, a, b); + let e = _mm256_set_epi8( + 0, HF8_1, 0, HF8_M1, 0, HF8_M1, 0, HF8_NAN, 0, HF8_1, 0, HF8_M1, 0, HF8_M1, 0, HF8_NAN, + 0, HF8_1, 0, HF8_M1, 0, HF8_M1, 0, HF8_NAN, 0, HF8_1, 0, HF8_M1, 0, HF8_M1, 0, HF8_NAN, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_cvts_biasph_hf8() { + let b = HF8_INPUT; + + let a = _mm_set1_epi16(0); + let r = _mm_cvts_biasph_hf8(a, b); + let e = _mm_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, HF8_M0, HF8_1, HF8_1, HF8_M1, HF8_1, HF8_M1, HF8_MAX, HF8_MAX, + ); + assert_eq_m128i(r, e); + + let a = _mm_set1_epi16(255); + let r = _mm_cvts_biasph_hf8(a, b); + let e = _mm_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, HF8_M0, HF8_1, HF8_1P125, HF8_M1P125, HF8_1P125, HF8_M1P125, + HF8_MAX, HF8_MAX, + ); + assert_eq_m128i(r, e); + + let a = _mm_set1_epi16(127); + let r = _mm_cvts_biasph_hf8(a, b); + let e = _mm_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, HF8_M0, HF8_1, HF8_1P125, HF8_M1, HF8_1, HF8_M1, HF8_MAX, + HF8_MAX, + ); + assert_eq_m128i(r, e); + + let a = _mm_set1_epi16(128); + let r = _mm_cvts_biasph_hf8(a, b); + let e = _mm_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, HF8_M0, HF8_1, HF8_1P125, HF8_M1, HF8_1P125, HF8_M1P125, + HF8_MAX, HF8_MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_mask_cvts_biasph_hf8() { + let b = HF8_INPUT; + let src = _mm_set1_epi8(0x7f); + + let a = _mm_set1_epi16(0); + let r = _mm_mask_cvts_biasph_hf8(src, 0b01010101, a, b); + let e = _mm_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 0x7f, HF8_1, 0x7f, HF8_M1, 0x7f, HF8_M1, 0x7f, HF8_MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_maskz_cvts_biasph_hf8() { + let b = HF8_INPUT; + + let a = _mm_set1_epi16(0); + let r = _mm_maskz_cvts_biasph_hf8(0b01010101, a, b); + let e = _mm_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 0, HF8_1, 0, HF8_M1, 0, HF8_M1, 0, HF8_MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_cvts_biasph_hf8() { + let b = _mm256_set1_m128h(HF8_INPUT); + + let a = _mm256_set1_epi16(0); + let r = _mm256_cvts_biasph_hf8(a, b); + let e = _mm_set_epi8( + HF8_M0, HF8_1, HF8_1, HF8_M1, HF8_1, HF8_M1, HF8_MAX, HF8_MAX, HF8_M0, HF8_1, HF8_1, + HF8_M1, HF8_1, HF8_M1, HF8_MAX, HF8_MAX, + ); + assert_eq_m128i(r, e); + + let a = _mm256_set1_epi16(255); + let r = _mm256_cvts_biasph_hf8(a, b); + let e = _mm_set_epi8( + HF8_M0, HF8_1, HF8_1P125, HF8_M1P125, HF8_1P125, HF8_M1P125, HF8_MAX, HF8_MAX, HF8_M0, + HF8_1, HF8_1P125, HF8_M1P125, HF8_1P125, HF8_M1P125, HF8_MAX, HF8_MAX, + ); + assert_eq_m128i(r, e); + + let a = _mm256_set1_epi16(127); + let r = _mm256_cvts_biasph_hf8(a, b); + let e = _mm_set_epi8( + HF8_M0, HF8_1, HF8_1P125, HF8_M1, HF8_1, HF8_M1, HF8_MAX, HF8_MAX, HF8_M0, HF8_1, + HF8_1P125, HF8_M1, HF8_1, HF8_M1, HF8_MAX, HF8_MAX, + ); + assert_eq_m128i(r, e); + + let a = 
_mm256_set1_epi16(128); + let r = _mm256_cvts_biasph_hf8(a, b); + let e = _mm_set_epi8( + HF8_M0, HF8_1, HF8_1P125, HF8_M1, HF8_1P125, HF8_M1P125, HF8_MAX, HF8_MAX, HF8_M0, + HF8_1, HF8_1P125, HF8_M1, HF8_1P125, HF8_M1P125, HF8_MAX, HF8_MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_mask_cvts_biasph_hf8() { + let b = _mm256_set1_m128h(HF8_INPUT); + let src = _mm_set1_epi8(0x7f); + + let a = _mm256_set1_epi16(0); + let r = _mm256_mask_cvts_biasph_hf8(src, 0b0101010101010101, a, b); + let e = _mm_set_epi8( + 0x7f, HF8_1, 0x7f, HF8_M1, 0x7f, HF8_M1, 0x7f, HF8_MAX, 0x7f, HF8_1, 0x7f, HF8_M1, + 0x7f, HF8_M1, 0x7f, HF8_MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_maskz_cvts_biasph_hf8() { + let b = _mm256_set1_m128h(HF8_INPUT); + + let a = _mm256_set1_epi16(0); + let r = _mm256_maskz_cvts_biasph_hf8(0b0101010101010101, a, b); + let e = _mm_set_epi8( + 0, HF8_1, 0, HF8_M1, 0, HF8_M1, 0, HF8_MAX, 0, HF8_1, 0, HF8_M1, 0, HF8_M1, 0, HF8_MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_cvts_biasph_hf8() { + let b = _mm512_set1_m128h(HF8_INPUT); + + let a = _mm512_set1_epi16(0); + let r = _mm512_cvts_biasph_hf8(a, b); + let e = _mm256_set_epi8( + HF8_M0, HF8_1, HF8_1, HF8_M1, HF8_1, HF8_M1, HF8_MAX, HF8_MAX, HF8_M0, HF8_1, HF8_1, + HF8_M1, HF8_1, HF8_M1, HF8_MAX, HF8_MAX, HF8_M0, HF8_1, HF8_1, HF8_M1, HF8_1, HF8_M1, + HF8_MAX, HF8_MAX, HF8_M0, HF8_1, HF8_1, HF8_M1, HF8_1, HF8_M1, HF8_MAX, HF8_MAX, + ); + assert_eq_m256i(r, e); + + let a = _mm512_set1_epi16(255); + let r = _mm512_cvts_biasph_hf8(a, b); + let e = _mm256_set_epi8( + HF8_M0, HF8_1, HF8_1P125, HF8_M1P125, HF8_1P125, HF8_M1P125, HF8_MAX, HF8_MAX, HF8_M0, + HF8_1, HF8_1P125, HF8_M1P125, HF8_1P125, HF8_M1P125, HF8_MAX, HF8_MAX, HF8_M0, HF8_1, + HF8_1P125, HF8_M1P125, HF8_1P125, HF8_M1P125, HF8_MAX, HF8_MAX, HF8_M0, HF8_1, + HF8_1P125, HF8_M1P125, HF8_1P125, HF8_M1P125, HF8_MAX, HF8_MAX, + ); + assert_eq_m256i(r, e); + + let a = _mm512_set1_epi16(127); + let r = _mm512_cvts_biasph_hf8(a, b); + let e = _mm256_set_epi8( + HF8_M0, HF8_1, HF8_1P125, HF8_M1, HF8_1, HF8_M1, HF8_MAX, HF8_MAX, HF8_M0, HF8_1, + HF8_1P125, HF8_M1, HF8_1, HF8_M1, HF8_MAX, HF8_MAX, HF8_M0, HF8_1, HF8_1P125, HF8_M1, + HF8_1, HF8_M1, HF8_MAX, HF8_MAX, HF8_M0, HF8_1, HF8_1P125, HF8_M1, HF8_1, HF8_M1, + HF8_MAX, HF8_MAX, + ); + assert_eq_m256i(r, e); + + let a = _mm512_set1_epi16(128); + let r = _mm512_cvts_biasph_hf8(a, b); + let e = _mm256_set_epi8( + HF8_M0, HF8_1, HF8_1P125, HF8_M1, HF8_1P125, HF8_M1P125, HF8_MAX, HF8_MAX, HF8_M0, + HF8_1, HF8_1P125, HF8_M1, HF8_1P125, HF8_M1P125, HF8_MAX, HF8_MAX, HF8_M0, HF8_1, + HF8_1P125, HF8_M1, HF8_1P125, HF8_M1P125, HF8_MAX, HF8_MAX, HF8_M0, HF8_1, HF8_1P125, + HF8_M1, HF8_1P125, HF8_M1P125, HF8_MAX, HF8_MAX, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_cvts_biasph_hf8() { + let b = _mm512_set1_m128h(HF8_INPUT); + let src = _mm256_set1_epi8(0x7f); + + let a = _mm512_set1_epi16(0); + let r = _mm512_mask_cvts_biasph_hf8(src, 0b01010101010101010101010101010101, a, b); + let e = _mm256_set_epi8( + 0x7f, HF8_1, 0x7f, HF8_M1, 0x7f, HF8_M1, 0x7f, HF8_MAX, 0x7f, HF8_1, 0x7f, HF8_M1, + 0x7f, HF8_M1, 0x7f, HF8_MAX, 0x7f, HF8_1, 0x7f, HF8_M1, 0x7f, HF8_M1, 0x7f, HF8_MAX, + 0x7f, HF8_1, 0x7f, HF8_M1, 0x7f, HF8_M1, 0x7f, HF8_MAX, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_cvts_biasph_hf8() { + let b = 
_mm512_set1_m128h(HF8_INPUT); + + let a = _mm512_set1_epi16(0); + let r = _mm512_maskz_cvts_biasph_hf8(0b01010101010101010101010101010101, a, b); + let e = _mm256_set_epi8( + 0, HF8_1, 0, HF8_M1, 0, HF8_M1, 0, HF8_MAX, 0, HF8_1, 0, HF8_M1, 0, HF8_M1, 0, HF8_MAX, + 0, HF8_1, 0, HF8_M1, 0, HF8_M1, 0, HF8_MAX, 0, HF8_1, 0, HF8_M1, 0, HF8_M1, 0, HF8_MAX, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_cvt2ph_bf8() { + let r = _mm_cvt2ph_bf8(BF8_INPUT, BF8_INPUT); + let e = _mm_set_epi8( + BF8_M0, BF8_1, BF8_1P25, BF8_M1, BF8_1, BF8_M1, BF8_INF, BF8_INF, BF8_M0, BF8_1, + BF8_1P25, BF8_M1, BF8_1, BF8_M1, BF8_INF, BF8_INF, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_mask_cvt2ph_bf8() { + let src = _mm_set1_epi8(0x7f); + let r = _mm_mask_cvt2ph_bf8(src, 0b0101010101010101, BF8_INPUT, BF8_INPUT); + let e = _mm_set_epi8( + 0x7f, BF8_1, 0x7f, BF8_M1, 0x7f, BF8_M1, 0x7f, BF8_INF, 0x7f, BF8_1, 0x7f, BF8_M1, + 0x7f, BF8_M1, 0x7f, BF8_INF, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_maskz_cvt2ph_bf8() { + let r = _mm_maskz_cvt2ph_bf8(0b0101010101010101, BF8_INPUT, BF8_INPUT); + let e = _mm_set_epi8( + 0x00, BF8_1, 0x00, BF8_M1, 0x00, BF8_M1, 0x00, BF8_INF, 0x00, BF8_1, 0x00, BF8_M1, + 0x00, BF8_M1, 0x00, BF8_INF, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_cvt2ph_bf8() { + let a = _mm256_set1_m128h(BF8_INPUT); + let r = _mm256_cvt2ph_bf8(a, a); + let e = _mm256_set_epi8( + BF8_M0, BF8_1, BF8_1P25, BF8_M1, BF8_1, BF8_M1, BF8_INF, BF8_INF, BF8_M0, BF8_1, + BF8_1P25, BF8_M1, BF8_1, BF8_M1, BF8_INF, BF8_INF, BF8_M0, BF8_1, BF8_1P25, BF8_M1, + BF8_1, BF8_M1, BF8_INF, BF8_INF, BF8_M0, BF8_1, BF8_1P25, BF8_M1, BF8_1, BF8_M1, + BF8_INF, BF8_INF, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_mask_cvt2ph_bf8() { + let a = _mm256_set1_m128h(BF8_INPUT); + let src = _mm256_set1_epi8(0x7f); + let r = _mm256_mask_cvt2ph_bf8(src, 0b01010101010101010101010101010101, a, a); + let e = _mm256_set_epi8( + 0x7f, BF8_1, 0x7f, BF8_M1, 0x7f, BF8_M1, 0x7f, BF8_INF, 0x7f, BF8_1, 0x7f, BF8_M1, + 0x7f, BF8_M1, 0x7f, BF8_INF, 0x7f, BF8_1, 0x7f, BF8_M1, 0x7f, BF8_M1, 0x7f, BF8_INF, + 0x7f, BF8_1, 0x7f, BF8_M1, 0x7f, BF8_M1, 0x7f, BF8_INF, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_maskz_cvt2ph_bf8() { + let a = _mm256_set1_m128h(BF8_INPUT); + let r = _mm256_maskz_cvt2ph_bf8(0b01010101010101010101010101010101, a, a); + let e = _mm256_set_epi8( + 0x00, BF8_1, 0x00, BF8_M1, 0x00, BF8_M1, 0x00, BF8_INF, 0x00, BF8_1, 0x00, BF8_M1, + 0x00, BF8_M1, 0x00, BF8_INF, 0x00, BF8_1, 0x00, BF8_M1, 0x00, BF8_M1, 0x00, BF8_INF, + 0x00, BF8_1, 0x00, BF8_M1, 0x00, BF8_M1, 0x00, BF8_INF, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_cvt2ph_bf8() { + let a = _mm512_set1_m128h(BF8_INPUT); + let r = _mm512_cvt2ph_bf8(a, a); + let e = _mm512_set_epi8( + BF8_M0, BF8_1, BF8_1P25, BF8_M1, BF8_1, BF8_M1, BF8_INF, BF8_INF, BF8_M0, BF8_1, + BF8_1P25, BF8_M1, BF8_1, BF8_M1, BF8_INF, BF8_INF, BF8_M0, BF8_1, BF8_1P25, BF8_M1, + BF8_1, BF8_M1, BF8_INF, BF8_INF, BF8_M0, BF8_1, BF8_1P25, BF8_M1, BF8_1, BF8_M1, + BF8_INF, BF8_INF, BF8_M0, BF8_1, BF8_1P25, BF8_M1, BF8_1, BF8_M1, BF8_INF, BF8_INF, + BF8_M0, BF8_1, BF8_1P25, BF8_M1, BF8_1, BF8_M1, BF8_INF, BF8_INF, BF8_M0, BF8_1, + BF8_1P25, BF8_M1, BF8_1, BF8_M1, BF8_INF, BF8_INF, BF8_M0, BF8_1, BF8_1P25, BF8_M1, + BF8_1, BF8_M1, BF8_INF, 
BF8_INF, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_cvt2ph_bf8() { + let a = _mm512_set1_m128h(BF8_INPUT); + let src = _mm512_set1_epi8(0x7f); + let r = _mm512_mask_cvt2ph_bf8( + src, + 0b0101010101010101010101010101010101010101010101010101010101010101, + a, + a, + ); + let e = _mm512_set_epi8( + 0x7f, BF8_1, 0x7f, BF8_M1, 0x7f, BF8_M1, 0x7f, BF8_INF, 0x7f, BF8_1, 0x7f, BF8_M1, + 0x7f, BF8_M1, 0x7f, BF8_INF, 0x7f, BF8_1, 0x7f, BF8_M1, 0x7f, BF8_M1, 0x7f, BF8_INF, + 0x7f, BF8_1, 0x7f, BF8_M1, 0x7f, BF8_M1, 0x7f, BF8_INF, 0x7f, BF8_1, 0x7f, BF8_M1, + 0x7f, BF8_M1, 0x7f, BF8_INF, 0x7f, BF8_1, 0x7f, BF8_M1, 0x7f, BF8_M1, 0x7f, BF8_INF, + 0x7f, BF8_1, 0x7f, BF8_M1, 0x7f, BF8_M1, 0x7f, BF8_INF, 0x7f, BF8_1, 0x7f, BF8_M1, + 0x7f, BF8_M1, 0x7f, BF8_INF, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_cvt2ph_bf8() { + let a = _mm512_set1_m128h(BF8_INPUT); + let r = _mm512_maskz_cvt2ph_bf8( + 0b0101010101010101010101010101010101010101010101010101010101010101, + a, + a, + ); + let e = _mm512_set_epi8( + 0x00, BF8_1, 0x00, BF8_M1, 0x00, BF8_M1, 0x00, BF8_INF, 0x00, BF8_1, 0x00, BF8_M1, + 0x00, BF8_M1, 0x00, BF8_INF, 0x00, BF8_1, 0x00, BF8_M1, 0x00, BF8_M1, 0x00, BF8_INF, + 0x00, BF8_1, 0x00, BF8_M1, 0x00, BF8_M1, 0x00, BF8_INF, 0x00, BF8_1, 0x00, BF8_M1, + 0x00, BF8_M1, 0x00, BF8_INF, 0x00, BF8_1, 0x00, BF8_M1, 0x00, BF8_M1, 0x00, BF8_INF, + 0x00, BF8_1, 0x00, BF8_M1, 0x00, BF8_M1, 0x00, BF8_INF, 0x00, BF8_1, 0x00, BF8_M1, + 0x00, BF8_M1, 0x00, BF8_INF, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_cvts_2ph_bf8() { + let r = _mm_cvts_2ph_bf8(BF8_INPUT, BF8_INPUT); + let e = _mm_set_epi8( + BF8_M0, BF8_1, BF8_1P25, BF8_M1, BF8_1, BF8_M1, BF8_MAX, BF8_MAX, BF8_M0, BF8_1, + BF8_1P25, BF8_M1, BF8_1, BF8_M1, BF8_MAX, BF8_MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_mask_cvts_2ph_bf8() { + let src = _mm_set1_epi8(0x7f); + let r = _mm_mask_cvts_2ph_bf8(src, 0b0101010101010101, BF8_INPUT, BF8_INPUT); + let e = _mm_set_epi8( + 0x7f, BF8_1, 0x7f, BF8_M1, 0x7f, BF8_M1, 0x7f, BF8_MAX, 0x7f, BF8_1, 0x7f, BF8_M1, + 0x7f, BF8_M1, 0x7f, BF8_MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_maskz_cvts_2ph_bf8() { + let r = _mm_maskz_cvts_2ph_bf8(0b0101010101010101, BF8_INPUT, BF8_INPUT); + let e = _mm_set_epi8( + 0x00, BF8_1, 0x00, BF8_M1, 0x00, BF8_M1, 0x00, BF8_MAX, 0x00, BF8_1, 0x00, BF8_M1, + 0x00, BF8_M1, 0x00, BF8_MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_cvts_2ph_bf8() { + let a = _mm256_set1_m128h(BF8_INPUT); + let r = _mm256_cvts_2ph_bf8(a, a); + let e = _mm256_set_epi8( + BF8_M0, BF8_1, BF8_1P25, BF8_M1, BF8_1, BF8_M1, BF8_MAX, BF8_MAX, BF8_M0, BF8_1, + BF8_1P25, BF8_M1, BF8_1, BF8_M1, BF8_MAX, BF8_MAX, BF8_M0, BF8_1, BF8_1P25, BF8_M1, + BF8_1, BF8_M1, BF8_MAX, BF8_MAX, BF8_M0, BF8_1, BF8_1P25, BF8_M1, BF8_1, BF8_M1, + BF8_MAX, BF8_MAX, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_mask_cvts_2ph_bf8() { + let a = _mm256_set1_m128h(BF8_INPUT); + let src = _mm256_set1_epi8(0x7f); + let r = _mm256_mask_cvts_2ph_bf8(src, 0b01010101010101010101010101010101, a, a); + let e = _mm256_set_epi8( + 0x7f, BF8_1, 0x7f, BF8_M1, 0x7f, BF8_M1, 0x7f, BF8_MAX, 0x7f, BF8_1, 0x7f, BF8_M1, + 0x7f, BF8_M1, 0x7f, BF8_MAX, 0x7f, BF8_1, 0x7f, BF8_M1, 0x7f, BF8_M1, 0x7f, BF8_MAX, + 0x7f, BF8_1, 0x7f, BF8_M1, 0x7f, BF8_M1, 0x7f, 
BF8_MAX, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_maskz_cvts_2ph_bf8() { + let a = _mm256_set1_m128h(BF8_INPUT); + let r = _mm256_maskz_cvts_2ph_bf8(0b01010101010101010101010101010101, a, a); + let e = _mm256_set_epi8( + 0x00, BF8_1, 0x00, BF8_M1, 0x00, BF8_M1, 0x00, BF8_MAX, 0x00, BF8_1, 0x00, BF8_M1, + 0x00, BF8_M1, 0x00, BF8_MAX, 0x00, BF8_1, 0x00, BF8_M1, 0x00, BF8_M1, 0x00, BF8_MAX, + 0x00, BF8_1, 0x00, BF8_M1, 0x00, BF8_M1, 0x00, BF8_MAX, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_cvts_2ph_bf8() { + let a = _mm512_set1_m128h(BF8_INPUT); + let r = _mm512_cvts_2ph_bf8(a, a); + let e = _mm512_set_epi8( + BF8_M0, BF8_1, BF8_1P25, BF8_M1, BF8_1, BF8_M1, BF8_MAX, BF8_MAX, BF8_M0, BF8_1, + BF8_1P25, BF8_M1, BF8_1, BF8_M1, BF8_MAX, BF8_MAX, BF8_M0, BF8_1, BF8_1P25, BF8_M1, + BF8_1, BF8_M1, BF8_MAX, BF8_MAX, BF8_M0, BF8_1, BF8_1P25, BF8_M1, BF8_1, BF8_M1, + BF8_MAX, BF8_MAX, BF8_M0, BF8_1, BF8_1P25, BF8_M1, BF8_1, BF8_M1, BF8_MAX, BF8_MAX, + BF8_M0, BF8_1, BF8_1P25, BF8_M1, BF8_1, BF8_M1, BF8_MAX, BF8_MAX, BF8_M0, BF8_1, + BF8_1P25, BF8_M1, BF8_1, BF8_M1, BF8_MAX, BF8_MAX, BF8_M0, BF8_1, BF8_1P25, BF8_M1, + BF8_1, BF8_M1, BF8_MAX, BF8_MAX, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_cvts_2ph_bf8() { + let a = _mm512_set1_m128h(BF8_INPUT); + let src = _mm512_set1_epi8(0x7f); + let r = _mm512_mask_cvts_2ph_bf8( + src, + 0b0101010101010101010101010101010101010101010101010101010101010101, + a, + a, + ); + let e = _mm512_set_epi8( + 0x7f, BF8_1, 0x7f, BF8_M1, 0x7f, BF8_M1, 0x7f, BF8_MAX, 0x7f, BF8_1, 0x7f, BF8_M1, + 0x7f, BF8_M1, 0x7f, BF8_MAX, 0x7f, BF8_1, 0x7f, BF8_M1, 0x7f, BF8_M1, 0x7f, BF8_MAX, + 0x7f, BF8_1, 0x7f, BF8_M1, 0x7f, BF8_M1, 0x7f, BF8_MAX, 0x7f, BF8_1, 0x7f, BF8_M1, + 0x7f, BF8_M1, 0x7f, BF8_MAX, 0x7f, BF8_1, 0x7f, BF8_M1, 0x7f, BF8_M1, 0x7f, BF8_MAX, + 0x7f, BF8_1, 0x7f, BF8_M1, 0x7f, BF8_M1, 0x7f, BF8_MAX, 0x7f, BF8_1, 0x7f, BF8_M1, + 0x7f, BF8_M1, 0x7f, BF8_MAX, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_cvts_2ph_bf8() { + let a = _mm512_set1_m128h(BF8_INPUT); + let r = _mm512_maskz_cvts_2ph_bf8( + 0b0101010101010101010101010101010101010101010101010101010101010101, + a, + a, + ); + let e = _mm512_set_epi8( + 0x00, BF8_1, 0x00, BF8_M1, 0x00, BF8_M1, 0x00, BF8_MAX, 0x00, BF8_1, 0x00, BF8_M1, + 0x00, BF8_M1, 0x00, BF8_MAX, 0x00, BF8_1, 0x00, BF8_M1, 0x00, BF8_M1, 0x00, BF8_MAX, + 0x00, BF8_1, 0x00, BF8_M1, 0x00, BF8_M1, 0x00, BF8_MAX, 0x00, BF8_1, 0x00, BF8_M1, + 0x00, BF8_M1, 0x00, BF8_MAX, 0x00, BF8_1, 0x00, BF8_M1, 0x00, BF8_M1, 0x00, BF8_MAX, + 0x00, BF8_1, 0x00, BF8_M1, 0x00, BF8_M1, 0x00, BF8_MAX, 0x00, BF8_1, 0x00, BF8_M1, + 0x00, BF8_M1, 0x00, BF8_MAX, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_cvt2ph_hf8() { + let r = _mm_cvt2ph_hf8(HF8_INPUT, HF8_INPUT); + let e = _mm_set_epi8( + HF8_M0, HF8_1, HF8_1P125, HF8_M1, HF8_1, HF8_M1, HF8_MAX, HF8_NAN, HF8_M0, HF8_1, + HF8_1P125, HF8_M1, HF8_1, HF8_M1, HF8_MAX, HF8_NAN, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_mask_cvt2ph_hf8() { + let src = _mm_set1_epi8(0x7f); + let r = _mm_mask_cvt2ph_hf8(src, 0b0101010101010101, HF8_INPUT, HF8_INPUT); + let e = _mm_set_epi8( + 0x7f, HF8_1, 0x7f, HF8_M1, 0x7f, HF8_M1, 0x7f, HF8_NAN, 0x7f, HF8_1, 0x7f, HF8_M1, + 0x7f, HF8_M1, 0x7f, HF8_NAN, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn 
test_mm_maskz_cvt2ph_hf8() { + let r = _mm_maskz_cvt2ph_hf8(0b0101010101010101, HF8_INPUT, HF8_INPUT); + let e = _mm_set_epi8( + 0x00, HF8_1, 0x00, HF8_M1, 0x00, HF8_M1, 0x00, HF8_NAN, 0x00, HF8_1, 0x00, HF8_M1, + 0x00, HF8_M1, 0x00, HF8_NAN, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_cvt2ph_hf8() { + let a = _mm256_set1_m128h(HF8_INPUT); + let r = _mm256_cvt2ph_hf8(a, a); + let e = _mm256_set_epi8( + HF8_M0, HF8_1, HF8_1P125, HF8_M1, HF8_1, HF8_M1, HF8_MAX, HF8_NAN, HF8_M0, HF8_1, + HF8_1P125, HF8_M1, HF8_1, HF8_M1, HF8_MAX, HF8_NAN, HF8_M0, HF8_1, HF8_1P125, HF8_M1, + HF8_1, HF8_M1, HF8_MAX, HF8_NAN, HF8_M0, HF8_1, HF8_1P125, HF8_M1, HF8_1, HF8_M1, + HF8_MAX, HF8_NAN, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_mask_cvt2ph_hf8() { + let a = _mm256_set1_m128h(HF8_INPUT); + let src = _mm256_set1_epi8(0x7f); + let r = _mm256_mask_cvt2ph_hf8(src, 0b01010101010101010101010101010101, a, a); + let e = _mm256_set_epi8( + 0x7f, HF8_1, 0x7f, HF8_M1, 0x7f, HF8_M1, 0x7f, HF8_NAN, 0x7f, HF8_1, 0x7f, HF8_M1, + 0x7f, HF8_M1, 0x7f, HF8_NAN, 0x7f, HF8_1, 0x7f, HF8_M1, 0x7f, HF8_M1, 0x7f, HF8_NAN, + 0x7f, HF8_1, 0x7f, HF8_M1, 0x7f, HF8_M1, 0x7f, HF8_NAN, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_maskz_cvt2ph_hf8() { + let a = _mm256_set1_m128h(HF8_INPUT); + let r = _mm256_maskz_cvt2ph_hf8(0b01010101010101010101010101010101, a, a); + let e = _mm256_set_epi8( + 0x00, HF8_1, 0x00, HF8_M1, 0x00, HF8_M1, 0x00, HF8_NAN, 0x00, HF8_1, 0x00, HF8_M1, + 0x00, HF8_M1, 0x00, HF8_NAN, 0x00, HF8_1, 0x00, HF8_M1, 0x00, HF8_M1, 0x00, HF8_NAN, + 0x00, HF8_1, 0x00, HF8_M1, 0x00, HF8_M1, 0x00, HF8_NAN, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_cvt2ph_hf8() { + let a = _mm512_set1_m128h(HF8_INPUT); + let r = _mm512_cvt2ph_hf8(a, a); + let e = _mm512_set_epi8( + HF8_M0, HF8_1, HF8_1P125, HF8_M1, HF8_1, HF8_M1, HF8_MAX, HF8_NAN, HF8_M0, HF8_1, + HF8_1P125, HF8_M1, HF8_1, HF8_M1, HF8_MAX, HF8_NAN, HF8_M0, HF8_1, HF8_1P125, HF8_M1, + HF8_1, HF8_M1, HF8_MAX, HF8_NAN, HF8_M0, HF8_1, HF8_1P125, HF8_M1, HF8_1, HF8_M1, + HF8_MAX, HF8_NAN, HF8_M0, HF8_1, HF8_1P125, HF8_M1, HF8_1, HF8_M1, HF8_MAX, HF8_NAN, + HF8_M0, HF8_1, HF8_1P125, HF8_M1, HF8_1, HF8_M1, HF8_MAX, HF8_NAN, HF8_M0, HF8_1, + HF8_1P125, HF8_M1, HF8_1, HF8_M1, HF8_MAX, HF8_NAN, HF8_M0, HF8_1, HF8_1P125, HF8_M1, + HF8_1, HF8_M1, HF8_MAX, HF8_NAN, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_cvt2ph_hf8() { + let a = _mm512_set1_m128h(HF8_INPUT); + let src = _mm512_set1_epi8(0x7f); + let r = _mm512_mask_cvt2ph_hf8( + src, + 0b0101010101010101010101010101010101010101010101010101010101010101, + a, + a, + ); + let e = _mm512_set_epi8( + 0x7f, HF8_1, 0x7f, HF8_M1, 0x7f, HF8_M1, 0x7f, HF8_NAN, 0x7f, HF8_1, 0x7f, HF8_M1, + 0x7f, HF8_M1, 0x7f, HF8_NAN, 0x7f, HF8_1, 0x7f, HF8_M1, 0x7f, HF8_M1, 0x7f, HF8_NAN, + 0x7f, HF8_1, 0x7f, HF8_M1, 0x7f, HF8_M1, 0x7f, HF8_NAN, 0x7f, HF8_1, 0x7f, HF8_M1, + 0x7f, HF8_M1, 0x7f, HF8_NAN, 0x7f, HF8_1, 0x7f, HF8_M1, 0x7f, HF8_M1, 0x7f, HF8_NAN, + 0x7f, HF8_1, 0x7f, HF8_M1, 0x7f, HF8_M1, 0x7f, HF8_NAN, 0x7f, HF8_1, 0x7f, HF8_M1, + 0x7f, HF8_M1, 0x7f, HF8_NAN, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_cvt2ph_hf8() { + let a = _mm512_set1_m128h(HF8_INPUT); + let r = _mm512_maskz_cvt2ph_hf8( + 0b0101010101010101010101010101010101010101010101010101010101010101, + a, + a, + ); + let e = 
_mm512_set_epi8( + 0x00, HF8_1, 0x00, HF8_M1, 0x00, HF8_M1, 0x00, HF8_NAN, 0x00, HF8_1, 0x00, HF8_M1, + 0x00, HF8_M1, 0x00, HF8_NAN, 0x00, HF8_1, 0x00, HF8_M1, 0x00, HF8_M1, 0x00, HF8_NAN, + 0x00, HF8_1, 0x00, HF8_M1, 0x00, HF8_M1, 0x00, HF8_NAN, 0x00, HF8_1, 0x00, HF8_M1, + 0x00, HF8_M1, 0x00, HF8_NAN, 0x00, HF8_1, 0x00, HF8_M1, 0x00, HF8_M1, 0x00, HF8_NAN, + 0x00, HF8_1, 0x00, HF8_M1, 0x00, HF8_M1, 0x00, HF8_NAN, 0x00, HF8_1, 0x00, HF8_M1, + 0x00, HF8_M1, 0x00, HF8_NAN, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_cvts_2ph_hf8() { + let r = _mm_cvts_2ph_hf8(HF8_INPUT, HF8_INPUT); + let e = _mm_set_epi8( + HF8_M0, HF8_1, HF8_1P125, HF8_M1, HF8_1, HF8_M1, HF8_MAX, HF8_MAX, HF8_M0, HF8_1, + HF8_1P125, HF8_M1, HF8_1, HF8_M1, HF8_MAX, HF8_MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_mask_cvts_2ph_hf8() { + let src = _mm_set1_epi8(0x7f); + let r = _mm_mask_cvts_2ph_hf8(src, 0b0101010101010101, HF8_INPUT, HF8_INPUT); + let e = _mm_set_epi8( + 0x7f, HF8_1, 0x7f, HF8_M1, 0x7f, HF8_M1, 0x7f, HF8_MAX, 0x7f, HF8_1, 0x7f, HF8_M1, + 0x7f, HF8_M1, 0x7f, HF8_MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_maskz_cvts_2ph_hf8() { + let r = _mm_maskz_cvts_2ph_hf8(0b0101010101010101, HF8_INPUT, HF8_INPUT); + let e = _mm_set_epi8( + 0x00, HF8_1, 0x00, HF8_M1, 0x00, HF8_M1, 0x00, HF8_MAX, 0x00, HF8_1, 0x00, HF8_M1, + 0x00, HF8_M1, 0x00, HF8_MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_cvts_2ph_hf8() { + let a = _mm256_set1_m128h(HF8_INPUT); + let r = _mm256_cvts_2ph_hf8(a, a); + let e = _mm256_set_epi8( + HF8_M0, HF8_1, HF8_1P125, HF8_M1, HF8_1, HF8_M1, HF8_MAX, HF8_MAX, HF8_M0, HF8_1, + HF8_1P125, HF8_M1, HF8_1, HF8_M1, HF8_MAX, HF8_MAX, HF8_M0, HF8_1, HF8_1P125, HF8_M1, + HF8_1, HF8_M1, HF8_MAX, HF8_MAX, HF8_M0, HF8_1, HF8_1P125, HF8_M1, HF8_1, HF8_M1, + HF8_MAX, HF8_MAX, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_mask_cvts_2ph_hf8() { + let a = _mm256_set1_m128h(HF8_INPUT); + let src = _mm256_set1_epi8(0x7f); + let r = _mm256_mask_cvts_2ph_hf8(src, 0b01010101010101010101010101010101, a, a); + let e = _mm256_set_epi8( + 0x7f, HF8_1, 0x7f, HF8_M1, 0x7f, HF8_M1, 0x7f, HF8_MAX, 0x7f, HF8_1, 0x7f, HF8_M1, + 0x7f, HF8_M1, 0x7f, HF8_MAX, 0x7f, HF8_1, 0x7f, HF8_M1, 0x7f, HF8_M1, 0x7f, HF8_MAX, + 0x7f, HF8_1, 0x7f, HF8_M1, 0x7f, HF8_M1, 0x7f, HF8_MAX, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_maskz_cvts_2ph_hf8() { + let a = _mm256_set1_m128h(HF8_INPUT); + let r = _mm256_maskz_cvts_2ph_hf8(0b01010101010101010101010101010101, a, a); + let e = _mm256_set_epi8( + 0x00, HF8_1, 0x00, HF8_M1, 0x00, HF8_M1, 0x00, HF8_MAX, 0x00, HF8_1, 0x00, HF8_M1, + 0x00, HF8_M1, 0x00, HF8_MAX, 0x00, HF8_1, 0x00, HF8_M1, 0x00, HF8_M1, 0x00, HF8_MAX, + 0x00, HF8_1, 0x00, HF8_M1, 0x00, HF8_M1, 0x00, HF8_MAX, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_cvts_2ph_hf8() { + let a = _mm512_set1_m128h(HF8_INPUT); + let r = _mm512_cvts_2ph_hf8(a, a); + let e = _mm512_set_epi8( + HF8_M0, HF8_1, HF8_1P125, HF8_M1, HF8_1, HF8_M1, HF8_MAX, HF8_MAX, HF8_M0, HF8_1, + HF8_1P125, HF8_M1, HF8_1, HF8_M1, HF8_MAX, HF8_MAX, HF8_M0, HF8_1, HF8_1P125, HF8_M1, + HF8_1, HF8_M1, HF8_MAX, HF8_MAX, HF8_M0, HF8_1, HF8_1P125, HF8_M1, HF8_1, HF8_M1, + HF8_MAX, HF8_MAX, HF8_M0, HF8_1, HF8_1P125, HF8_M1, HF8_1, HF8_M1, HF8_MAX, HF8_MAX, + HF8_M0, HF8_1, HF8_1P125, HF8_M1, 
HF8_1, HF8_M1, HF8_MAX, HF8_MAX, HF8_M0, HF8_1, + HF8_1P125, HF8_M1, HF8_1, HF8_M1, HF8_MAX, HF8_MAX, HF8_M0, HF8_1, HF8_1P125, HF8_M1, + HF8_1, HF8_M1, HF8_MAX, HF8_MAX, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_cvts_2ph_hf8() { + let a = _mm512_set1_m128h(HF8_INPUT); + let src = _mm512_set1_epi8(0x7f); + let r = _mm512_mask_cvts_2ph_hf8( + src, + 0b0101010101010101010101010101010101010101010101010101010101010101, + a, + a, + ); + let e = _mm512_set_epi8( + 0x7f, HF8_1, 0x7f, HF8_M1, 0x7f, HF8_M1, 0x7f, HF8_MAX, 0x7f, HF8_1, 0x7f, HF8_M1, + 0x7f, HF8_M1, 0x7f, HF8_MAX, 0x7f, HF8_1, 0x7f, HF8_M1, 0x7f, HF8_M1, 0x7f, HF8_MAX, + 0x7f, HF8_1, 0x7f, HF8_M1, 0x7f, HF8_M1, 0x7f, HF8_MAX, 0x7f, HF8_1, 0x7f, HF8_M1, + 0x7f, HF8_M1, 0x7f, HF8_MAX, 0x7f, HF8_1, 0x7f, HF8_M1, 0x7f, HF8_M1, 0x7f, HF8_MAX, + 0x7f, HF8_1, 0x7f, HF8_M1, 0x7f, HF8_M1, 0x7f, HF8_MAX, 0x7f, HF8_1, 0x7f, HF8_M1, + 0x7f, HF8_M1, 0x7f, HF8_MAX, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_cvts_2ph_hf8() { + let a = _mm512_set1_m128h(HF8_INPUT); + let r = _mm512_maskz_cvts_2ph_hf8( + 0b0101010101010101010101010101010101010101010101010101010101010101, + a, + a, + ); + let e = _mm512_set_epi8( + 0x00, HF8_1, 0x00, HF8_M1, 0x00, HF8_M1, 0x00, HF8_MAX, 0x00, HF8_1, 0x00, HF8_M1, + 0x00, HF8_M1, 0x00, HF8_MAX, 0x00, HF8_1, 0x00, HF8_M1, 0x00, HF8_M1, 0x00, HF8_MAX, + 0x00, HF8_1, 0x00, HF8_M1, 0x00, HF8_M1, 0x00, HF8_MAX, 0x00, HF8_1, 0x00, HF8_M1, + 0x00, HF8_M1, 0x00, HF8_MAX, 0x00, HF8_1, 0x00, HF8_M1, 0x00, HF8_M1, 0x00, HF8_MAX, + 0x00, HF8_1, 0x00, HF8_M1, 0x00, HF8_M1, 0x00, HF8_MAX, 0x00, HF8_1, 0x00, HF8_M1, + 0x00, HF8_M1, 0x00, HF8_MAX, + ); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_cvtph_bf8() { + let r = _mm_cvtph_bf8(BF8_INPUT); + let e = _mm_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, BF8_M0, BF8_1, BF8_1P25, BF8_M1, BF8_1, BF8_M1, BF8_INF, + BF8_INF, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_mask_cvtph_bf8() { + let src = _mm_set1_epi8(0x7f); + let r = _mm_mask_cvtph_bf8(src, 0b01010101, BF8_INPUT); + let e = _mm_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 0x7f, BF8_1, 0x7f, BF8_M1, 0x7f, BF8_M1, 0x7f, BF8_INF, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_maskz_cvtph_bf8() { + let r = _mm_maskz_cvtph_bf8(0b01010101, BF8_INPUT); + let e = _mm_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 0, BF8_1, 0, BF8_M1, 0, BF8_M1, 0, BF8_INF, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_cvtph_bf8() { + let a = _mm256_set1_m128h(BF8_INPUT); + let r = _mm256_cvtph_bf8(a); + let e = _mm_set_epi8( + BF8_M0, BF8_1, BF8_1P25, BF8_M1, BF8_1, BF8_M1, BF8_INF, BF8_INF, BF8_M0, BF8_1, + BF8_1P25, BF8_M1, BF8_1, BF8_M1, BF8_INF, BF8_INF, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_mask_cvtph_bf8() { + let a = _mm256_set1_m128h(BF8_INPUT); + let src = _mm_set1_epi8(0x7f); + let r = _mm256_mask_cvtph_bf8(src, 0b0101010101010101, a); + let e = _mm_set_epi8( + 0x7f, BF8_1, 0x7f, BF8_M1, 0x7f, BF8_M1, 0x7f, BF8_INF, 0x7f, BF8_1, 0x7f, BF8_M1, + 0x7f, BF8_M1, 0x7f, BF8_INF, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_maskz_cvtph_bf8() { + let a = _mm256_set1_m128h(BF8_INPUT); + let r = _mm256_maskz_cvtph_bf8(0b0101010101010101, a); + let e = _mm_set_epi8( + 0, BF8_1, 0, BF8_M1, 0, BF8_M1, 0, BF8_INF, 0, BF8_1, 0, BF8_M1, 0, BF8_M1, 0, 
BF8_INF, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_cvtph_bf8() { + let a = _mm512_set1_m128h(BF8_INPUT); + let r = _mm512_cvtph_bf8(a); + let e = _mm256_set_epi8( + BF8_M0, BF8_1, BF8_1P25, BF8_M1, BF8_1, BF8_M1, BF8_INF, BF8_INF, BF8_M0, BF8_1, + BF8_1P25, BF8_M1, BF8_1, BF8_M1, BF8_INF, BF8_INF, BF8_M0, BF8_1, BF8_1P25, BF8_M1, + BF8_1, BF8_M1, BF8_INF, BF8_INF, BF8_M0, BF8_1, BF8_1P25, BF8_M1, BF8_1, BF8_M1, + BF8_INF, BF8_INF, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_cvtph_bf8() { + let a = _mm512_set1_m128h(BF8_INPUT); + let src = _mm256_set1_epi8(0x7f); + let r = _mm512_mask_cvtph_bf8(src, 0b01010101010101010101010101010101, a); + let e = _mm256_set_epi8( + 0x7f, BF8_1, 0x7f, BF8_M1, 0x7f, BF8_M1, 0x7f, BF8_INF, 0x7f, BF8_1, 0x7f, BF8_M1, + 0x7f, BF8_M1, 0x7f, BF8_INF, 0x7f, BF8_1, 0x7f, BF8_M1, 0x7f, BF8_M1, 0x7f, BF8_INF, + 0x7f, BF8_1, 0x7f, BF8_M1, 0x7f, BF8_M1, 0x7f, BF8_INF, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_cvtph_bf8() { + let a = _mm512_set1_m128h(BF8_INPUT); + let r = _mm512_maskz_cvtph_bf8(0b01010101010101010101010101010101, a); + let e = _mm256_set_epi8( + 0, BF8_1, 0, BF8_M1, 0, BF8_M1, 0, BF8_INF, 0, BF8_1, 0, BF8_M1, 0, BF8_M1, 0, BF8_INF, + 0, BF8_1, 0, BF8_M1, 0, BF8_M1, 0, BF8_INF, 0, BF8_1, 0, BF8_M1, 0, BF8_M1, 0, BF8_INF, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_cvts_ph_bf8() { + let r = _mm_cvts_ph_bf8(BF8_INPUT); + let e = _mm_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, BF8_M0, BF8_1, BF8_1P25, BF8_M1, BF8_1, BF8_M1, BF8_MAX, + BF8_MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_mask_cvts_ph_bf8() { + let src = _mm_set1_epi8(0x7f); + let r = _mm_mask_cvts_ph_bf8(src, 0b01010101, BF8_INPUT); + let e = _mm_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 0x7f, BF8_1, 0x7f, BF8_M1, 0x7f, BF8_M1, 0x7f, BF8_MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_maskz_cvts_ph_bf8() { + let r = _mm_maskz_cvts_ph_bf8(0b01010101, BF8_INPUT); + let e = _mm_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 0, BF8_1, 0, BF8_M1, 0, BF8_M1, 0, BF8_MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_cvts_ph_bf8() { + let a = _mm256_set1_m128h(BF8_INPUT); + let r = _mm256_cvts_ph_bf8(a); + let e = _mm_set_epi8( + BF8_M0, BF8_1, BF8_1P25, BF8_M1, BF8_1, BF8_M1, BF8_MAX, BF8_MAX, BF8_M0, BF8_1, + BF8_1P25, BF8_M1, BF8_1, BF8_M1, BF8_MAX, BF8_MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_mask_cvts_ph_bf8() { + let a = _mm256_set1_m128h(BF8_INPUT); + let src = _mm_set1_epi8(0x7f); + let r = _mm256_mask_cvts_ph_bf8(src, 0b0101010101010101, a); + let e = _mm_set_epi8( + 0x7f, BF8_1, 0x7f, BF8_M1, 0x7f, BF8_M1, 0x7f, BF8_MAX, 0x7f, BF8_1, 0x7f, BF8_M1, + 0x7f, BF8_M1, 0x7f, BF8_MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_maskz_cvts_ph_bf8() { + let a = _mm256_set1_m128h(BF8_INPUT); + let r = _mm256_maskz_cvts_ph_bf8(0b0101010101010101, a); + let e = _mm_set_epi8( + 0, BF8_1, 0, BF8_M1, 0, BF8_M1, 0, BF8_MAX, 0, BF8_1, 0, BF8_M1, 0, BF8_M1, 0, BF8_MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_cvts_ph_bf8() { + let a = _mm512_set1_m128h(BF8_INPUT); + let r = _mm512_cvts_ph_bf8(a); + let e = _mm256_set_epi8( + BF8_M0, BF8_1, BF8_1P25, BF8_M1, BF8_1, BF8_M1, BF8_MAX, BF8_MAX, 
BF8_M0, BF8_1, + BF8_1P25, BF8_M1, BF8_1, BF8_M1, BF8_MAX, BF8_MAX, BF8_M0, BF8_1, BF8_1P25, BF8_M1, + BF8_1, BF8_M1, BF8_MAX, BF8_MAX, BF8_M0, BF8_1, BF8_1P25, BF8_M1, BF8_1, BF8_M1, + BF8_MAX, BF8_MAX, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_cvts_ph_bf8() { + let a = _mm512_set1_m128h(BF8_INPUT); + let src = _mm256_set1_epi8(0x7f); + let r = _mm512_mask_cvts_ph_bf8(src, 0b01010101010101010101010101010101, a); + let e = _mm256_set_epi8( + 0x7f, BF8_1, 0x7f, BF8_M1, 0x7f, BF8_M1, 0x7f, BF8_MAX, 0x7f, BF8_1, 0x7f, BF8_M1, + 0x7f, BF8_M1, 0x7f, BF8_MAX, 0x7f, BF8_1, 0x7f, BF8_M1, 0x7f, BF8_M1, 0x7f, BF8_MAX, + 0x7f, BF8_1, 0x7f, BF8_M1, 0x7f, BF8_M1, 0x7f, BF8_MAX, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_cvts_ph_bf8() { + let a = _mm512_set1_m128h(BF8_INPUT); + let r = _mm512_maskz_cvts_ph_bf8(0b01010101010101010101010101010101, a); + let e = _mm256_set_epi8( + 0, BF8_1, 0, BF8_M1, 0, BF8_M1, 0, BF8_MAX, 0, BF8_1, 0, BF8_M1, 0, BF8_M1, 0, BF8_MAX, + 0, BF8_1, 0, BF8_M1, 0, BF8_M1, 0, BF8_MAX, 0, BF8_1, 0, BF8_M1, 0, BF8_M1, 0, BF8_MAX, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_cvtph_hf8() { + let r = _mm_cvtph_hf8(HF8_INPUT); + let e = _mm_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, HF8_M0, HF8_1, HF8_1P125, HF8_M1, HF8_1, HF8_M1, HF8_MAX, + HF8_NAN, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_mask_cvtph_hf8() { + let src = _mm_set1_epi8(0x7f); + let r = _mm_mask_cvtph_hf8(src, 0b01010101, HF8_INPUT); + let e = _mm_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 0x7f, HF8_1, 0x7f, HF8_M1, 0x7f, HF8_M1, 0x7f, HF8_NAN, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_maskz_cvtph_hf8() { + let r = _mm_maskz_cvtph_hf8(0b01010101, HF8_INPUT); + let e = _mm_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 0, HF8_1, 0, HF8_M1, 0, HF8_M1, 0, HF8_NAN, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_cvtph_hf8() { + let a = _mm256_set1_m128h(HF8_INPUT); + let r = _mm256_cvtph_hf8(a); + let e = _mm_set_epi8( + HF8_M0, HF8_1, HF8_1P125, HF8_M1, HF8_1, HF8_M1, HF8_MAX, HF8_NAN, HF8_M0, HF8_1, + HF8_1P125, HF8_M1, HF8_1, HF8_M1, HF8_MAX, HF8_NAN, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_mask_cvtph_hf8() { + let a = _mm256_set1_m128h(HF8_INPUT); + let src = _mm_set1_epi8(0x7f); + let r = _mm256_mask_cvtph_hf8(src, 0b0101010101010101, a); + let e = _mm_set_epi8( + 0x7f, HF8_1, 0x7f, HF8_M1, 0x7f, HF8_M1, 0x7f, HF8_NAN, 0x7f, HF8_1, 0x7f, HF8_M1, + 0x7f, HF8_M1, 0x7f, HF8_NAN, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_maskz_cvtph_hf8() { + let a = _mm256_set1_m128h(HF8_INPUT); + let r = _mm256_maskz_cvtph_hf8(0b0101010101010101, a); + let e = _mm_set_epi8( + 0, HF8_1, 0, HF8_M1, 0, HF8_M1, 0, HF8_NAN, 0, HF8_1, 0, HF8_M1, 0, HF8_M1, 0, HF8_NAN, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_cvtph_hf8() { + let a = _mm512_set1_m128h(HF8_INPUT); + let r = _mm512_cvtph_hf8(a); + let e = _mm256_set_epi8( + HF8_M0, HF8_1, HF8_1P125, HF8_M1, HF8_1, HF8_M1, HF8_MAX, HF8_NAN, HF8_M0, HF8_1, + HF8_1P125, HF8_M1, HF8_1, HF8_M1, HF8_MAX, HF8_NAN, HF8_M0, HF8_1, HF8_1P125, HF8_M1, + HF8_1, HF8_M1, HF8_MAX, HF8_NAN, HF8_M0, HF8_1, HF8_1P125, HF8_M1, HF8_1, HF8_M1, + HF8_MAX, HF8_NAN, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn 
test_mm512_mask_cvtph_hf8() { + let a = _mm512_set1_m128h(HF8_INPUT); + let src = _mm256_set1_epi8(0x7f); + let r = _mm512_mask_cvtph_hf8(src, 0b01010101010101010101010101010101, a); + let e = _mm256_set_epi8( + 0x7f, HF8_1, 0x7f, HF8_M1, 0x7f, HF8_M1, 0x7f, HF8_NAN, 0x7f, HF8_1, 0x7f, HF8_M1, + 0x7f, HF8_M1, 0x7f, HF8_NAN, 0x7f, HF8_1, 0x7f, HF8_M1, 0x7f, HF8_M1, 0x7f, HF8_NAN, + 0x7f, HF8_1, 0x7f, HF8_M1, 0x7f, HF8_M1, 0x7f, HF8_NAN, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_cvtph_hf8() { + let a = _mm512_set1_m128h(HF8_INPUT); + let r = _mm512_maskz_cvtph_hf8(0b01010101010101010101010101010101, a); + let e = _mm256_set_epi8( + 0, HF8_1, 0, HF8_M1, 0, HF8_M1, 0, HF8_NAN, 0, HF8_1, 0, HF8_M1, 0, HF8_M1, 0, HF8_NAN, + 0, HF8_1, 0, HF8_M1, 0, HF8_M1, 0, HF8_NAN, 0, HF8_1, 0, HF8_M1, 0, HF8_M1, 0, HF8_NAN, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_cvts_ph_hf8() { + let r = _mm_cvts_ph_hf8(HF8_INPUT); + let e = _mm_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, HF8_M0, HF8_1, HF8_1P125, HF8_M1, HF8_1, HF8_M1, HF8_MAX, + HF8_MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_mask_cvts_ph_hf8() { + let src = _mm_set1_epi8(0x7f); + let r = _mm_mask_cvts_ph_hf8(src, 0b01010101, HF8_INPUT); + let e = _mm_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 0x7f, HF8_1, 0x7f, HF8_M1, 0x7f, HF8_M1, 0x7f, HF8_MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_maskz_cvts_ph_hf8() { + let r = _mm_maskz_cvts_ph_hf8(0b01010101, HF8_INPUT); + let e = _mm_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 0, HF8_1, 0, HF8_M1, 0, HF8_M1, 0, HF8_MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_cvts_ph_hf8() { + let a = _mm256_set1_m128h(HF8_INPUT); + let r = _mm256_cvts_ph_hf8(a); + let e = _mm_set_epi8( + HF8_M0, HF8_1, HF8_1P125, HF8_M1, HF8_1, HF8_M1, HF8_MAX, HF8_MAX, HF8_M0, HF8_1, + HF8_1P125, HF8_M1, HF8_1, HF8_M1, HF8_MAX, HF8_MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_mask_cvts_ph_hf8() { + let a = _mm256_set1_m128h(HF8_INPUT); + let src = _mm_set1_epi8(0x7f); + let r = _mm256_mask_cvts_ph_hf8(src, 0b0101010101010101, a); + let e = _mm_set_epi8( + 0x7f, HF8_1, 0x7f, HF8_M1, 0x7f, HF8_M1, 0x7f, HF8_MAX, 0x7f, HF8_1, 0x7f, HF8_M1, + 0x7f, HF8_M1, 0x7f, HF8_MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_maskz_cvts_ph_hf8() { + let a = _mm256_set1_m128h(HF8_INPUT); + let r = _mm256_maskz_cvts_ph_hf8(0b0101010101010101, a); + let e = _mm_set_epi8( + 0, HF8_1, 0, HF8_M1, 0, HF8_M1, 0, HF8_MAX, 0, HF8_1, 0, HF8_M1, 0, HF8_M1, 0, HF8_MAX, + ); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_cvts_ph_hf8() { + let a = _mm512_set1_m128h(HF8_INPUT); + let r = _mm512_cvts_ph_hf8(a); + let e = _mm256_set_epi8( + HF8_M0, HF8_1, HF8_1P125, HF8_M1, HF8_1, HF8_M1, HF8_MAX, HF8_MAX, HF8_M0, HF8_1, + HF8_1P125, HF8_M1, HF8_1, HF8_M1, HF8_MAX, HF8_MAX, HF8_M0, HF8_1, HF8_1P125, HF8_M1, + HF8_1, HF8_M1, HF8_MAX, HF8_MAX, HF8_M0, HF8_1, HF8_1P125, HF8_M1, HF8_1, HF8_M1, + HF8_MAX, HF8_MAX, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_cvts_ph_hf8() { + let a = _mm512_set1_m128h(HF8_INPUT); + let src = _mm256_set1_epi8(0x7f); + let r = _mm512_mask_cvts_ph_hf8(src, 0b01010101010101010101010101010101, a); + let e = _mm256_set_epi8( + 0x7f, HF8_1, 0x7f, HF8_M1, 0x7f, HF8_M1, 0x7f, HF8_MAX, 
0x7f, HF8_1, 0x7f, HF8_M1, + 0x7f, HF8_M1, 0x7f, HF8_MAX, 0x7f, HF8_1, 0x7f, HF8_M1, 0x7f, HF8_M1, 0x7f, HF8_MAX, + 0x7f, HF8_1, 0x7f, HF8_M1, 0x7f, HF8_M1, 0x7f, HF8_MAX, + ); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_cvts_ph_hf8() { + let a = _mm512_set1_m128h(HF8_INPUT); + let r = _mm512_maskz_cvts_ph_hf8(0b01010101010101010101010101010101, a); + let e = _mm256_set_epi8( + 0, HF8_1, 0, HF8_M1, 0, HF8_M1, 0, HF8_MAX, 0, HF8_1, 0, HF8_M1, 0, HF8_M1, 0, HF8_MAX, + 0, HF8_1, 0, HF8_M1, 0, HF8_M1, 0, HF8_MAX, 0, HF8_1, 0, HF8_M1, 0, HF8_M1, 0, HF8_MAX, + ); + assert_eq_m256i(r, e); + } + + const BF8_OUTPUT: __m128h = unsafe { + _mm_set_ph( + -0.0, + 1.0, + 1.25, + -1.0, + -1.25, + f16::INFINITY, + 57344.0, + 0.00006103515625, + ) + }; + + const HF8_OUTPUT: __m128h = + unsafe { _mm_set_ph(-0.0, 1.0, 1.125, -1.0, -1.125, 0.001953125, 448.0, 0.015625) }; + + #[simd_test(enable = "avx10.2")] + fn test_mm_cvtbf8_ph() { + let a = _mm_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, BF8_M0, BF8_1, BF8_1P25, BF8_M1, BF8_M1P25, BF8_INF, BF8_MAX, + BF8_MIN, + ); + let r = _mm_cvtbf8_ph(a); + let e = BF8_OUTPUT; + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_mask_cvtbf8_ph() { + let a = _mm_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, BF8_M0, BF8_1, BF8_1P25, BF8_M1, BF8_M1P25, BF8_INF, BF8_MAX, + BF8_MIN, + ); + let src = _mm_set1_ph(42.0); + let k = 0b01010101; + let r = _mm_mask_cvtbf8_ph(src, k, a); + let e = _mm_mask_mov_ph(src, k, BF8_OUTPUT); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm_maskz_cvtbf8_ph() { + let a = _mm_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, BF8_M0, BF8_1, BF8_1P25, BF8_M1, BF8_M1P25, BF8_INF, BF8_MAX, + BF8_MIN, + ); + let k = 0b01010101; + let r = _mm_maskz_cvtbf8_ph(k, a); + let e = _mm_maskz_mov_ph(k, BF8_OUTPUT); + assert_eq_m128h(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_cvtbf8_ph() { + let a = _mm_set_epi8( + BF8_M0, BF8_1, BF8_1P25, BF8_M1, BF8_M1P25, BF8_INF, BF8_MAX, BF8_MIN, BF8_M0, BF8_1, + BF8_1P25, BF8_M1, BF8_M1P25, BF8_INF, BF8_MAX, BF8_MIN, + ); + let r = _mm256_cvtbf8_ph(a); + let e = _mm256_set1_m128h(BF8_OUTPUT); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_mask_cvtbf8_ph() { + let a = _mm_set_epi8( + BF8_M0, BF8_1, BF8_1P25, BF8_M1, BF8_M1P25, BF8_INF, BF8_MAX, BF8_MIN, BF8_M0, BF8_1, + BF8_1P25, BF8_M1, BF8_M1P25, BF8_INF, BF8_MAX, BF8_MIN, + ); + let src = _mm256_set1_ph(42.0); + let k = 0b0101010101010101; + let r = _mm256_mask_cvtbf8_ph(src, k, a); + let e = _mm256_mask_mov_ph(src, k, _mm256_set1_m128h(BF8_OUTPUT)); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm256_maskz_cvtbf8_ph() { + let a = _mm_set_epi8( + BF8_M0, BF8_1, BF8_1P25, BF8_M1, BF8_M1P25, BF8_INF, BF8_MAX, BF8_MIN, BF8_M0, BF8_1, + BF8_1P25, BF8_M1, BF8_M1P25, BF8_INF, BF8_MAX, BF8_MIN, + ); + let k = 0b0101010101010101; + let r = _mm256_maskz_cvtbf8_ph(k, a); + let e = _mm256_maskz_mov_ph(k, _mm256_set1_m128h(BF8_OUTPUT)); + assert_eq_m256h(r, e); + } + + #[simd_test(enable = "avx10.2")] + fn test_mm512_cvtbf8_ph() { + let a = _mm256_set_epi8( + BF8_M0, BF8_1, BF8_1P25, BF8_M1, BF8_M1P25, BF8_INF, BF8_MAX, BF8_MIN, BF8_M0, BF8_1, + BF8_1P25, BF8_M1, BF8_M1P25, BF8_INF, BF8_MAX, BF8_MIN, BF8_M0, BF8_1, BF8_1P25, + BF8_M1, BF8_M1P25, BF8_INF, BF8_MAX, BF8_MIN, BF8_M0, BF8_1, BF8_1P25, BF8_M1, + BF8_M1P25, BF8_INF, BF8_MAX, BF8_MIN, + ); + let r = _mm512_cvtbf8_ph(a); + let e = _mm512_set1_m128h(BF8_OUTPUT); + 
assert_eq_m512h(r, e);
+    }
+
+    #[simd_test(enable = "avx10.2")]
+    fn test_mm512_mask_cvtbf8_ph() {
+        let a = _mm256_set_epi8(
+            BF8_M0, BF8_1, BF8_1P25, BF8_M1, BF8_M1P25, BF8_INF, BF8_MAX, BF8_MIN, BF8_M0, BF8_1,
+            BF8_1P25, BF8_M1, BF8_M1P25, BF8_INF, BF8_MAX, BF8_MIN, BF8_M0, BF8_1, BF8_1P25,
+            BF8_M1, BF8_M1P25, BF8_INF, BF8_MAX, BF8_MIN, BF8_M0, BF8_1, BF8_1P25, BF8_M1,
+            BF8_M1P25, BF8_INF, BF8_MAX, BF8_MIN,
+        );
+        let src = _mm512_set1_ph(42.0);
+        let k = 0b01010101010101010101010101010101;
+        let r = _mm512_mask_cvtbf8_ph(src, k, a);
+        let e = _mm512_mask_mov_ph(src, k, _mm512_set1_m128h(BF8_OUTPUT));
+        assert_eq_m512h(r, e);
+    }
+
+    #[simd_test(enable = "avx10.2")]
+    fn test_mm512_maskz_cvtbf8_ph() {
+        let a = _mm256_set_epi8(
+            BF8_M0, BF8_1, BF8_1P25, BF8_M1, BF8_M1P25, BF8_INF, BF8_MAX, BF8_MIN, BF8_M0, BF8_1,
+            BF8_1P25, BF8_M1, BF8_M1P25, BF8_INF, BF8_MAX, BF8_MIN, BF8_M0, BF8_1, BF8_1P25,
+            BF8_M1, BF8_M1P25, BF8_INF, BF8_MAX, BF8_MIN, BF8_M0, BF8_1, BF8_1P25, BF8_M1,
+            BF8_M1P25, BF8_INF, BF8_MAX, BF8_MIN,
+        );
+        let k = 0b01010101010101010101010101010101;
+        let r = _mm512_maskz_cvtbf8_ph(k, a);
+        let e = _mm512_maskz_mov_ph(k, _mm512_set1_m128h(BF8_OUTPUT));
+        assert_eq_m512h(r, e);
+    }
+
+    // FIXME: the following tests do not pass due to an LLVM miscompilation bug. See llvm/llvm-project#184651
+
+    #[ignore]
+    #[simd_test(enable = "avx10.2")]
+    fn test_mm_cvthf8_ph() {
+        let a = _mm_set_epi8(
+            0,
+            0,
+            0,
+            0,
+            0,
+            0,
+            0,
+            0,
+            HF8_M0,
+            HF8_1,
+            HF8_1P125,
+            HF8_M1,
+            HF8_M1P125,
+            HF8_MIN_DEN,
+            HF8_MAX,
+            HF8_MIN,
+        );
+        let r = _mm_cvthf8_ph(a);
+        let e = HF8_OUTPUT;
+        assert_eq_m128h(r, e);
+    }
+
+    #[ignore]
+    #[simd_test(enable = "avx10.2")]
+    fn test_mm_mask_cvthf8_ph() {
+        let a = _mm_set_epi8(
+            0,
+            0,
+            0,
+            0,
+            0,
+            0,
+            0,
+            0,
+            HF8_M0,
+            HF8_1,
+            HF8_1P125,
+            HF8_M1,
+            HF8_M1P125,
+            HF8_MIN_DEN,
+            HF8_MAX,
+            HF8_MIN,
+        );
+        let src = _mm_set1_ph(42.0);
+        let k = 0b01010101;
+        let r = _mm_mask_cvthf8_ph(src, k, a);
+        let e = _mm_mask_mov_ph(src, k, HF8_OUTPUT);
+        assert_eq_m128h(r, e);
+    }
+
+    #[ignore]
+    #[simd_test(enable = "avx10.2")]
+    fn test_mm_maskz_cvthf8_ph() {
+        let a = _mm_set_epi8(
+            0,
+            0,
+            0,
+            0,
+            0,
+            0,
+            0,
+            0,
+            HF8_M0,
+            HF8_1,
+            HF8_1P125,
+            HF8_M1,
+            HF8_M1P125,
+            HF8_MIN_DEN,
+            HF8_MAX,
+            HF8_MIN,
+        );
+        let k = 0b01010101;
+        let r = _mm_maskz_cvthf8_ph(k, a);
+        let e = _mm_maskz_mov_ph(k, HF8_OUTPUT);
+        assert_eq_m128h(r, e);
+    }
+
+    #[ignore]
+    #[simd_test(enable = "avx10.2")]
+    fn test_mm256_cvthf8_ph() {
+        let a = _mm_set_epi8(
+            HF8_M0,
+            HF8_1,
+            HF8_1P125,
+            HF8_M1,
+            HF8_M1P125,
+            HF8_MIN_DEN,
+            HF8_MAX,
+            HF8_MIN,
+            HF8_M0,
+            HF8_1,
+            HF8_1P125,
+            HF8_M1,
+            HF8_M1P125,
+            HF8_MIN_DEN,
+            HF8_MAX,
+            HF8_MIN,
+        );
+        let r = _mm256_cvthf8_ph(a);
+        let e = _mm256_set1_m128h(HF8_OUTPUT);
+        assert_eq_m256h(r, e);
+    }
+
+    #[ignore]
+    #[simd_test(enable = "avx10.2")]
+    fn test_mm256_mask_cvthf8_ph() {
+        let a = _mm_set_epi8(
+            HF8_M0,
+            HF8_1,
+            HF8_1P125,
+            HF8_M1,
+            HF8_M1P125,
+            HF8_MIN_DEN,
+            HF8_MAX,
+            HF8_MIN,
+            HF8_M0,
+            HF8_1,
+            HF8_1P125,
+            HF8_M1,
+            HF8_M1P125,
+            HF8_MIN_DEN,
+            HF8_MAX,
+            HF8_MIN,
+        );
+        let src = _mm256_set1_ph(42.0);
+        let k = 0b0101010101010101;
+        let r = _mm256_mask_cvthf8_ph(src, k, a);
+        let e = _mm256_mask_mov_ph(src, k, _mm256_set1_m128h(HF8_OUTPUT));
+        assert_eq_m256h(r, e);
+    }
+
+    #[ignore]
+    #[simd_test(enable = "avx10.2")]
+    fn test_mm256_maskz_cvthf8_ph() {
+        let a = _mm_set_epi8(
+            HF8_M0,
+
HF8_1, + HF8_1P125, + HF8_M1, + HF8_M1P125, + HF8_MIN_DEN, + HF8_MAX, + HF8_MIN, + ); + let k = 0b0101010101010101; + let r = _mm256_maskz_cvthf8_ph(k, a); + let e = _mm256_maskz_mov_ph(k, _mm256_set1_m128h(HF8_OUTPUT)); + assert_eq_m256h(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm512_cvthf8_ph() { + let a = _mm256_set_epi8( + HF8_M0, + HF8_1, + HF8_1P125, + HF8_M1, + HF8_M1P125, + HF8_MIN_DEN, + HF8_MAX, + HF8_MIN, + HF8_M0, + HF8_1, + HF8_1P125, + HF8_M1, + HF8_M1P125, + HF8_MIN_DEN, + HF8_MAX, + HF8_MIN, + HF8_M0, + HF8_1, + HF8_1P125, + HF8_M1, + HF8_M1P125, + HF8_MIN_DEN, + HF8_MAX, + HF8_MIN, + HF8_M0, + HF8_1, + HF8_1P125, + HF8_M1, + HF8_M1P125, + HF8_MIN_DEN, + HF8_MAX, + HF8_MIN, + ); + let r = _mm512_cvthf8_ph(a); + let e = _mm512_set1_m128h(HF8_OUTPUT); + assert_eq_m512h(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm512_mask_cvthf8_ph() { + let a = _mm256_set_epi8( + HF8_M0, + HF8_1, + HF8_1P125, + HF8_M1, + HF8_M1P125, + HF8_MIN_DEN, + HF8_MAX, + HF8_MIN, + HF8_M0, + HF8_1, + HF8_1P125, + HF8_M1, + HF8_M1P125, + HF8_MIN_DEN, + HF8_MAX, + HF8_MIN, + HF8_M0, + HF8_1, + HF8_1P125, + HF8_M1, + HF8_M1P125, + HF8_MIN_DEN, + HF8_MAX, + HF8_MIN, + HF8_M0, + HF8_1, + HF8_1P125, + HF8_M1, + HF8_M1P125, + HF8_MIN_DEN, + HF8_MAX, + HF8_MIN, + ); + let src = _mm512_set1_ph(42.0); + let k = 0b01010101010101010101010101010101; + let r = _mm512_mask_cvthf8_ph(src, k, a); + let e = _mm512_mask_mov_ph(src, k, _mm512_set1_m128h(HF8_OUTPUT)); + assert_eq_m512h(r, e); + } + + #[ignore] + #[simd_test(enable = "avx10.2")] + fn test_mm512_maskz_cvthf8_ph() { + let a = _mm256_set_epi8( + HF8_M0, + HF8_1, + HF8_1P125, + HF8_M1, + HF8_M1P125, + HF8_MIN_DEN, + HF8_MAX, + HF8_MIN, + HF8_M0, + HF8_1, + HF8_1P125, + HF8_M1, + HF8_M1P125, + HF8_MIN_DEN, + HF8_MAX, + HF8_MIN, + HF8_M0, + HF8_1, + HF8_1P125, + HF8_M1, + HF8_M1P125, + HF8_MIN_DEN, + HF8_MAX, + HF8_MIN, + HF8_M0, + HF8_1, + HF8_1P125, + HF8_M1, + HF8_M1P125, + HF8_MIN_DEN, + HF8_MAX, + HF8_MIN, + ); + let k = 0b01010101010101010101010101010101; + let r = _mm512_maskz_cvthf8_ph(k, a); + let e = _mm512_maskz_mov_ph(k, _mm512_set1_m128h(HF8_OUTPUT)); + assert_eq_m512h(r, e); + } +} diff --git a/crates/core_arch/src/x86/mod.rs b/crates/core_arch/src/x86/mod.rs index 9396507f08..9a5d18e59e 100644 --- a/crates/core_arch/src/x86/mod.rs +++ b/crates/core_arch/src/x86/mod.rs @@ -774,3 +774,7 @@ pub use self::avx512fp16::*; mod kl; #[stable(feature = "keylocker_x86", since = "1.89.0")] pub use self::kl::*; + +mod avx10_2; +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +pub use self::avx10_2::*; diff --git a/crates/core_arch/src/x86/sha.rs b/crates/core_arch/src/x86/sha.rs index f8a3295d19..f30bd6f230 100644 --- a/crates/core_arch/src/x86/sha.rs +++ b/crates/core_arch/src/x86/sha.rs @@ -32,10 +32,14 @@ unsafe extern "C" { fn vsm4key4128(a: i32x4, b: i32x4) -> i32x4; #[link_name = "llvm.x86.vsm4key4256"] fn vsm4key4256(a: i32x8, b: i32x8) -> i32x8; + #[link_name = "llvm.x86.vsm4key4512"] + fn vsm4key4512(a: i32x16, b: i32x16) -> i32x16; #[link_name = "llvm.x86.vsm4rnds4128"] fn vsm4rnds4128(a: i32x4, b: i32x4) -> i32x4; #[link_name = "llvm.x86.vsm4rnds4256"] fn vsm4rnds4256(a: i32x8, b: i32x8) -> i32x8; + #[link_name = "llvm.x86.vsm4rnds4512"] + fn vsm4rnds4512(a: i32x16, b: i32x16) -> i32x16; } #[cfg(test)] @@ -252,6 +256,16 @@ pub fn _mm256_sm4key4_epi32(a: __m256i, b: __m256i) -> __m256i { unsafe { transmute(vsm4key4256(a.as_i32x8(), b.as_i32x8())) } } +/// This intrinsic performs 
four rounds of SM4 key expansion. The intrinsic operates on independent +/// 128-bit lanes. The calculated results are stored in dst. +#[inline] +#[target_feature(enable = "sm4,avx10.2")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vsm4key4))] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +pub fn _mm512_sm4key4_epi32(a: __m512i, b: __m512i) -> __m512i { + unsafe { vsm4key4512(a.as_i32x16(), b.as_i32x16()).as_m512i() } +} + /// This intrinsic performs four rounds of SM4 encryption. The intrinsic operates on independent /// 128-bit lanes. The calculated results are stored in dst. /// @@ -276,6 +290,16 @@ pub fn _mm256_sm4rnds4_epi32(a: __m256i, b: __m256i) -> __m256i { unsafe { transmute(vsm4rnds4256(a.as_i32x8(), b.as_i32x8())) } } +/// This intrinsic performs four rounds of SM4 encryption. The intrinsic operates on independent +/// 128-bit lanes. The calculated results are stored in dst. +#[inline] +#[target_feature(enable = "sm4,avx10.2")] +#[cfg_attr(all(test, not(target_vendor = "apple")), assert_instr(vsm4rnds4))] +#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")] +pub fn _mm512_sm4rnds4_epi32(a: __m512i, b: __m512i) -> __m512i { + unsafe { vsm4rnds4512(a.as_i32x16(), b.as_i32x16()).as_m512i() } +} + #[cfg(test)] mod tests { use crate::{ @@ -475,10 +499,12 @@ mod tests { assert_eq_m256i(r, e); } - static DATA_32: [u32; 16] = [ + static DATA_32: [u32; 32] = [ 0x00112233, 0x44556677, 0x8899aabb, 0xccddeeff, 0xffeeddcc, 0xbbaa9988, 0x77665544, 0x33221100, 0x01234567, 0x89abcdef, 0xfedcba98, 0x76543210, 0x02468ace, 0x13579bdf, - 0xfdb97531, 0xeca86420, + 0xfdb97531, 0xeca86420, 0x048c159d, 0x26ae37bf, 0xfb73ea62, 0xd951c840, 0xabcdef01, + 0x23456789, 0x0fedcba9, 0x87654321, 0x10fedcba, 0x98765432, 0x1fdb9753, 0xeca86420, + 0x048c159d, 0x26ae37bf, 0xfb73ea62, 0xd951c840, ]; #[simd_test(enable = "sm3,avx")] @@ -685,6 +711,31 @@ mod tests { assert_eq_m256i(r, e); } + #[inline] + #[target_feature(enable = "avx512f")] + fn _mm512_set_m256i(lo: __m256i, hi: __m256i) -> __m512i { + unsafe { simd_shuffle!(lo, hi, [0, 1, 2, 3, 4, 5, 6, 7]) } + } + + #[simd_test(enable = "sm4,avx10.2")] + fn test_mm512_sm4key4_epi32() { + let a_low = unsafe { _mm256_loadu_si256(DATA_32.as_ptr().cast()) }; + let a_high = unsafe { _mm256_loadu_si256(DATA_32[8..].as_ptr().cast()) }; + let b_low = unsafe { _mm256_loadu_si256(DATA_32[16..].as_ptr().cast()) }; + let b_high = unsafe { _mm256_loadu_si256(DATA_32[24..].as_ptr().cast()) }; + + let a = _mm512_set_m256i(a_high, a_low); + let b = _mm512_set_m256i(b_high, b_low); + + let r = _mm512_sm4key4_epi32(a, b); + + let e_low = _mm256_sm4key4_epi32(a_low, b_low); + let e_high = _mm256_sm4key4_epi32(a_high, b_high); + let e = _mm512_set_m256i(e_high, e_low); + + assert_eq_m512i(r, e); + } + #[simd_test(enable = "sm4,avx")] fn test_mm_sm4rnds4_epi32() { fn l_rnd(x: u32) -> u32 { @@ -729,4 +780,23 @@ mod tests { assert_eq_m256i(r, e); } + + #[simd_test(enable = "sm4,avx10.2")] + fn test_mm512_sm4rnds4_epi32() { + let a_low = unsafe { _mm256_loadu_si256(DATA_32.as_ptr().cast()) }; + let a_high = unsafe { _mm256_loadu_si256(DATA_32[8..].as_ptr().cast()) }; + let b_low = unsafe { _mm256_loadu_si256(DATA_32[16..].as_ptr().cast()) }; + let b_high = unsafe { _mm256_loadu_si256(DATA_32[24..].as_ptr().cast()) }; + + let a = _mm512_set_m256i(a_high, a_low); + let b = _mm512_set_m256i(b_high, b_low); + + let r = _mm512_sm4rnds4_epi32(a, b); + + let e_low = _mm256_sm4rnds4_epi32(a_low, b_low); + let e_high = 
_mm256_sm4rnds4_epi32(a_high, b_high);
+        let e = _mm512_set_m256i(e_high, e_low);
+
+        assert_eq_m512i(r, e);
+    }
 }
diff --git a/crates/core_arch/src/x86_64/avx10_2.rs b/crates/core_arch/src/x86_64/avx10_2.rs
new file mode 100644
index 0000000000..b2f9efe974
--- /dev/null
+++ b/crates/core_arch/src/x86_64/avx10_2.rs
@@ -0,0 +1,145 @@
+use crate::core_arch::{simd::*, x86::*};
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit
+/// integer with truncation and saturation.
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the SAE parameter.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvttsd2sis, SAE = 8)
+)]
+pub fn _mm_cvtts_roundsd_i64<const SAE: i32>(a: __m128d) -> i64 {
+    static_assert_sae!(SAE);
+    unsafe { vcvttsd2sis64(a.as_f64x2(), SAE) }
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit
+/// integer with truncation and saturation.
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the SAE parameter.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+pub fn _mm_cvtts_roundsd_si64<const SAE: i32>(a: __m128d) -> i64 {
+    _mm_cvtts_roundsd_i64::<SAE>(a)
+}
+
+/// Convert the lower double-precision (64-bit) floating-point element in `a` to a 64-bit
+/// unsigned integer with truncation and saturation.
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the SAE parameter.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvttsd2usis, SAE = 8)
+)]
+pub fn _mm_cvtts_roundsd_u64<const SAE: i32>(a: __m128d) -> u64 {
+    static_assert_sae!(SAE);
+    unsafe { vcvttsd2usis64(a.as_f64x2(), SAE) }
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit
+/// integer with truncation and saturation.
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the SAE parameter.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvttss2sis, SAE = 8)
+)]
+pub fn _mm_cvtts_roundss_i64<const SAE: i32>(a: __m128) -> i64 {
+    static_assert_sae!(SAE);
+    unsafe { vcvttss2sis64(a.as_f32x4(), SAE) }
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit
+/// integer with truncation and saturation.
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the SAE parameter.
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+pub fn _mm_cvtts_roundss_si64<const SAE: i32>(a: __m128) -> i64 {
+    _mm_cvtts_roundss_i64::<SAE>(a)
+}
+
+/// Convert the lower single-precision (32-bit) floating-point element in `a` to a 64-bit
+/// unsigned integer with truncation and saturation.
+/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the SAE parameter.
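+///
+/// A minimal usage sketch (illustrative only, not a doctest; it assumes the code is
+/// compiled with the `avx10.2` feature enabled, and the clamp-to-zero result for a
+/// negative input is an assumption based on the saturating semantics of `vcvttss2usis`,
+/// not something verified here):
+///
+/// ```ignore
+/// // Truncation drops the fractional part of the lower element.
+/// let a = _mm_set_ss(13.6);
+/// assert_eq!(_mm_cvtts_roundss_u64::<_MM_FROUND_NO_EXC>(a), 13);
+/// // Saturation clamps out-of-range inputs, so a negative input is expected
+/// // to yield 0 rather than an indefinite value.
+/// let neg = _mm_set_ss(-3.5);
+/// assert_eq!(_mm_cvtts_roundss_u64::<_MM_FROUND_NO_EXC>(neg), 0);
+/// ```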
+#[inline]
+#[target_feature(enable = "avx10.2")]
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+#[cfg_attr(
+    all(test, not(target_vendor = "apple")),
+    assert_instr(vcvttss2usis, SAE = 8)
+)]
+pub fn _mm_cvtts_roundss_u64<const SAE: i32>(a: __m128) -> u64 {
+    static_assert_sae!(SAE);
+    unsafe { vcvttss2usis64(a.as_f32x4(), SAE) }
+}
+
+#[allow(improper_ctypes)]
+unsafe extern "unadjusted" {
+    #[link_name = "llvm.x86.avx10.vcvttss2sis64"]
+    fn vcvttss2sis64(a: f32x4, sae: i32) -> i64;
+    #[link_name = "llvm.x86.avx10.vcvttss2usis64"]
+    fn vcvttss2usis64(a: f32x4, sae: i32) -> u64;
+
+    #[link_name = "llvm.x86.avx10.vcvttsd2sis64"]
+    fn vcvttsd2sis64(a: f64x2, sae: i32) -> i64;
+    #[link_name = "llvm.x86.avx10.vcvttsd2usis64"]
+    fn vcvttsd2usis64(a: f64x2, sae: i32) -> u64;
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use stdarch_test::simd_test;
+
+    #[simd_test(enable = "avx10.2")]
+    fn test_mm_cvtts_roundsd_i64() {
+        let a = _mm_set_sd(8.5);
+        let r = _mm_cvtts_roundsd_i64::<_MM_FROUND_NO_EXC>(a);
+        assert_eq!(r, 8i64);
+    }
+
+    #[simd_test(enable = "avx10.2")]
+    fn test_mm_cvtts_roundsd_si64() {
+        let a = _mm_set_sd(9.3);
+        let r = _mm_cvtts_roundsd_si64::<_MM_FROUND_NO_EXC>(a);
+        assert_eq!(r, 9i64);
+    }
+
+    #[simd_test(enable = "avx10.2")]
+    fn test_mm_cvtts_roundsd_u64() {
+        let a = _mm_set_sd(10.7);
+        let r = _mm_cvtts_roundsd_u64::<_MM_FROUND_NO_EXC>(a);
+        assert_eq!(r, 10u64);
+    }
+
+    #[simd_test(enable = "avx10.2")]
+    fn test_mm_cvtts_roundss_i64() {
+        let a = _mm_set_ss(11.4);
+        let r = _mm_cvtts_roundss_i64::<_MM_FROUND_NO_EXC>(a);
+        assert_eq!(r, 11i64);
+    }
+
+    #[simd_test(enable = "avx10.2")]
+    fn test_mm_cvtts_roundss_si64() {
+        let a = _mm_set_ss(12.9);
+        let r = _mm_cvtts_roundss_si64::<_MM_FROUND_NO_EXC>(a);
+        assert_eq!(r, 12i64);
+    }
+
+    #[simd_test(enable = "avx10.2")]
+    fn test_mm_cvtts_roundss_u64() {
+        let a = _mm_set_ss(13.6);
+        let r = _mm_cvtts_roundss_u64::<_MM_FROUND_NO_EXC>(a);
+        assert_eq!(r, 13u64);
+    }
+}
diff --git a/crates/core_arch/src/x86_64/mod.rs b/crates/core_arch/src/x86_64/mod.rs
index 9caab44e46..10b608f2ab 100644
--- a/crates/core_arch/src/x86_64/mod.rs
+++ b/crates/core_arch/src/x86_64/mod.rs
@@ -81,3 +81,7 @@ pub use self::avx512fp16::*;
 mod amx;
 #[unstable(feature = "x86_amx_intrinsics", issue = "126622")]
 pub use self::amx::*;
+
+mod avx10_2;
+#[unstable(feature = "stdarch_x86_avx10_2", issue = "153417")]
+pub use self::avx10_2::*;
diff --git a/crates/stdarch-verify/tests/x86-intel.rs b/crates/stdarch-verify/tests/x86-intel.rs
index 2ac05e28cb..eee387337c 100644
--- a/crates/stdarch-verify/tests/x86-intel.rs
+++ b/crates/stdarch-verify/tests/x86-intel.rs
@@ -305,7 +305,7 @@ fn verify_all_signatures() {
         }
 
         // FIXME: these have not been added to Intrinsics Guide yet
-        if ["amx-avx512", "amx-fp8", "amx-movrs", "amx-tf32"]
+        if ["amx-avx512", "amx-fp8", "amx-movrs", "amx-tf32", "avx10.2"]
            .iter()
            .any(|f| feature.contains(f))
        {