From 2e86321219d33b7f9a7a0cf323cf55740a876ebe Mon Sep 17 00:00:00 2001 From: jironglin Date: Wed, 3 Mar 2021 01:28:36 +0000 Subject: [PATCH 001/123] cvt_roundps,pd_epi32,epu32; cvt_roundepi32,epu32_ps; cvt_roundpd_ps; mm_add,sub,mul,div_round_ss,sd; mm_sqrt_round_ss,sd; mm_scalf_round_ss,sd; mm_fmadd,fmsub,fnmadd,fnmsub_round_ss,sd; mm_cvt_roundss_i32,u32; mm_cvt_roundsd_i32,u32; mm_cvt_roundi32,u32_ss; mm_cvt_roundsd_ss --- crates/core_arch/src/x86/avx512f.rs | 2186 ++++++++++-------------- crates/core_arch/src/x86_64/avx512f.rs | 30 +- 2 files changed, 878 insertions(+), 1338 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 7bf8bdeae9..7911157eb2 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -13393,17 +13393,13 @@ pub unsafe fn _mm_maskz_cvtusepi64_epi8(k: __mmask8, a: __m128i) -> __m128i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundps_epi32&expand=1335) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtps2dq, rounding = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm512_cvt_roundps_epi32(a: __m512, rounding: i32) -> __m512i { +#[cfg_attr(test, assert_instr(vcvtps2dq, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_cvt_roundps_epi32(a: __m512) -> __m512i { + static_assert_rounding!(ROUNDING); let a = a.as_f32x16(); let zero = _mm512_setzero_si512().as_i32x16(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtps2dq(a, zero, 0b11111111_11111111, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtps2dq(a, zero, 0b11111111_11111111, ROUNDING); transmute(r) } @@ -13419,22 +13415,17 @@ pub unsafe fn _mm512_cvt_roundps_epi32(a: __m512, rounding: i32) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundps_epi32&expand=1336) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtps2dq, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_cvt_roundps_epi32( +#[cfg_attr(test, assert_instr(vcvtps2dq, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_cvt_roundps_epi32( src: __m512i, k: __mmask16, a: __m512, - rounding: i32, ) -> __m512i { + static_assert_rounding!(ROUNDING); let a = a.as_f32x16(); let src = src.as_i32x16(); - macro_rules! call { - ($imm4:expr) => { - vcvtps2dq(a, src, k, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtps2dq(a, src, k, ROUNDING); transmute(r) } @@ -13450,17 +13441,16 @@ pub unsafe fn _mm512_mask_cvt_roundps_epi32( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundps_epi32&expand=1337) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtps2dq, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_cvt_roundps_epi32(k: __mmask16, a: __m512, rounding: i32) -> __m512i { +#[cfg_attr(test, assert_instr(vcvtps2dq, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_cvt_roundps_epi32( + k: __mmask16, + a: __m512, +) -> __m512i { + static_assert_rounding!(ROUNDING); let a = a.as_f32x16(); let zero = _mm512_setzero_si512().as_i32x16(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtps2dq(a, zero, k, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtps2dq(a, zero, k, ROUNDING); transmute(r) } @@ -13476,17 +13466,13 @@ pub unsafe fn _mm512_maskz_cvt_roundps_epi32(k: __mmask16, a: __m512, rounding: /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundps_epu32&expand=1341) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtps2udq, rounding = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm512_cvt_roundps_epu32(a: __m512, rounding: i32) -> __m512i { +#[cfg_attr(test, assert_instr(vcvtps2udq, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_cvt_roundps_epu32(a: __m512) -> __m512i { + static_assert_rounding!(ROUNDING); let a = a.as_f32x16(); let zero = _mm512_setzero_si512().as_u32x16(); - macro_rules! call { - ($imm4:expr) => { - vcvtps2udq(a, zero, 0b11111111_11111111, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtps2udq(a, zero, 0b11111111_11111111, ROUNDING); transmute(r) } @@ -13502,22 +13488,17 @@ pub unsafe fn _mm512_cvt_roundps_epu32(a: __m512, rounding: i32) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundps_epu32&expand=1342) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtps2udq, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_cvt_roundps_epu32( +#[cfg_attr(test, assert_instr(vcvtps2udq, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_cvt_roundps_epu32( src: __m512i, k: __mmask16, a: __m512, - rounding: i32, ) -> __m512i { + static_assert_rounding!(ROUNDING); let a = a.as_f32x16(); let src = src.as_u32x16(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtps2udq(a, src, k, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtps2udq(a, src, k, ROUNDING); transmute(r) } @@ -13533,17 +13514,16 @@ pub unsafe fn _mm512_mask_cvt_roundps_epu32( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundps_epu32&expand=1343) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtps2udq, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_cvt_roundps_epu32(k: __mmask16, a: __m512, rounding: i32) -> __m512i { +#[cfg_attr(test, assert_instr(vcvtps2udq, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_cvt_roundps_epu32( + k: __mmask16, + a: __m512, +) -> __m512i { + static_assert_rounding!(ROUNDING); let a = a.as_f32x16(); let zero = _mm512_setzero_si512().as_u32x16(); - macro_rules! call { - ($imm4:expr) => { - vcvtps2udq(a, zero, k, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtps2udq(a, zero, k, ROUNDING); transmute(r) } @@ -13624,17 +13604,13 @@ pub unsafe fn _mm512_maskz_cvt_roundps_pd(k: __mmask8, a: __m256, sae: i32) -> _ /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundpd_epi32&expand=1315) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtpd2dq, rounding = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm512_cvt_roundpd_epi32(a: __m512d, rounding: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vcvtpd2dq, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_cvt_roundpd_epi32(a: __m512d) -> __m256i { + static_assert_rounding!(ROUNDING); let a = a.as_f64x8(); let zero = _mm256_setzero_si256().as_i32x8(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtpd2dq(a, zero, 0b11111111, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtpd2dq(a, zero, 0b11111111, ROUNDING); transmute(r) } @@ -13650,22 +13626,17 @@ pub unsafe fn _mm512_cvt_roundpd_epi32(a: __m512d, rounding: i32) -> __m256i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundpd_epi32&expand=1316) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtpd2dq, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_cvt_roundpd_epi32( +#[cfg_attr(test, assert_instr(vcvtpd2dq, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_cvt_roundpd_epi32( src: __m256i, k: __mmask8, a: __m512d, - rounding: i32, ) -> __m256i { + static_assert_rounding!(ROUNDING); let a = a.as_f64x8(); let src = src.as_i32x8(); - macro_rules! call { - ($imm4:expr) => { - vcvtpd2dq(a, src, k, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtpd2dq(a, src, k, ROUNDING); transmute(r) } @@ -13681,17 +13652,16 @@ pub unsafe fn _mm512_mask_cvt_roundpd_epi32( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundpd_epi32&expand=1317) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtpd2dq, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_cvt_roundpd_epi32(k: __mmask8, a: __m512d, rounding: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vcvtpd2dq, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_cvt_roundpd_epi32( + k: __mmask8, + a: __m512d, +) -> __m256i { + static_assert_rounding!(ROUNDING); let a = a.as_f64x8(); let zero = _mm256_setzero_si256().as_i32x8(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtpd2dq(a, zero, k, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtpd2dq(a, zero, k, ROUNDING); transmute(r) } @@ -13707,17 +13677,13 @@ pub unsafe fn _mm512_maskz_cvt_roundpd_epi32(k: __mmask8, a: __m512d, rounding: /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundpd_epu32&expand=1321) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtpd2udq, rounding = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm512_cvt_roundpd_epu32(a: __m512d, rounding: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vcvtpd2udq, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_cvt_roundpd_epu32(a: __m512d) -> __m256i { + static_assert_rounding!(ROUNDING); let a = a.as_f64x8(); let zero = _mm256_setzero_si256().as_u32x8(); - macro_rules! call { - ($imm4:expr) => { - vcvtpd2udq(a, zero, 0b11111111, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtpd2udq(a, zero, 0b11111111, ROUNDING); transmute(r) } @@ -13733,22 +13699,17 @@ pub unsafe fn _mm512_cvt_roundpd_epu32(a: __m512d, rounding: i32) -> __m256i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundpd_epu32&expand=1322) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtpd2udq, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_cvt_roundpd_epu32( +#[cfg_attr(test, assert_instr(vcvtpd2udq, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_cvt_roundpd_epu32( src: __m256i, k: __mmask8, a: __m512d, - rounding: i32, ) -> __m256i { + static_assert_rounding!(ROUNDING); let a = a.as_f64x8(); let src = src.as_u32x8(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtpd2udq(a, src, k, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtpd2udq(a, src, k, ROUNDING); transmute(r) } @@ -13764,17 +13725,16 @@ pub unsafe fn _mm512_mask_cvt_roundpd_epu32( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundpd_epu32&expand=1323) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtpd2udq, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_cvt_roundpd_epu32(k: __mmask8, a: __m512d, rounding: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vcvtpd2udq, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_cvt_roundpd_epu32( + k: __mmask8, + a: __m512d, +) -> __m256i { + static_assert_rounding!(ROUNDING); let a = a.as_f64x8(); let zero = _mm256_setzero_si256().as_u32x8(); - macro_rules! call { - ($imm4:expr) => { - vcvtpd2udq(a, zero, k, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtpd2udq(a, zero, k, ROUNDING); transmute(r) } @@ -13790,17 +13750,13 @@ pub unsafe fn _mm512_maskz_cvt_roundpd_epu32(k: __mmask8, a: __m512d, rounding: /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundpd_ps&expand=1327) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtpd2ps, rounding = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm512_cvt_roundpd_ps(a: __m512d, rounding: i32) -> __m256 { +#[cfg_attr(test, assert_instr(vcvtpd2ps, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_cvt_roundpd_ps(a: __m512d) -> __m256 { + static_assert_rounding!(ROUNDING); let a = a.as_f64x8(); let zero = _mm256_setzero_ps().as_f32x8(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtpd2ps(a, zero, 0b11111111, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtpd2ps(a, zero, 0b11111111, ROUNDING); transmute(r) } @@ -13816,22 +13772,17 @@ pub unsafe fn _mm512_cvt_roundpd_ps(a: __m512d, rounding: i32) -> __m256 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundpd_ps&expand=1328) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtpd2ps, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_cvt_roundpd_ps( +#[cfg_attr(test, assert_instr(vcvtpd2ps, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_cvt_roundpd_ps( src: __m256, k: __mmask8, a: __m512d, - rounding: i32, ) -> __m256 { + static_assert_rounding!(ROUNDING); let a = a.as_f64x8(); let src = src.as_f32x8(); - macro_rules! call { - ($imm4:expr) => { - vcvtpd2ps(a, src, k, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtpd2ps(a, src, k, ROUNDING); transmute(r) } @@ -13847,17 +13798,13 @@ pub unsafe fn _mm512_mask_cvt_roundpd_ps( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundpd_ps&expand=1329) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtpd2ps, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_cvt_roundpd_ps(k: __mmask8, a: __m512d, rounding: i32) -> __m256 { +#[cfg_attr(test, assert_instr(vcvtpd2ps, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_cvt_roundpd_ps(k: __mmask8, a: __m512d) -> __m256 { + static_assert_rounding!(ROUNDING); let a = a.as_f64x8(); let zero = _mm256_setzero_ps().as_f32x8(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtpd2ps(a, zero, k, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtpd2ps(a, zero, k, ROUNDING); transmute(r) } @@ -13873,16 +13820,12 @@ pub unsafe fn _mm512_maskz_cvt_roundpd_ps(k: __mmask8, a: __m512d, rounding: i32 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepi32_ps&expand=1294) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtdq2ps, rounding = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm512_cvt_roundepi32_ps(a: __m512i, rounding: i32) -> __m512 { +#[cfg_attr(test, assert_instr(vcvtdq2ps, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_cvt_roundepi32_ps(a: __m512i) -> __m512 { + static_assert_rounding!(ROUNDING); let a = a.as_i32x16(); - macro_rules! call { - ($imm4:expr) => { - vcvtdq2ps(a, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtdq2ps(a, ROUNDING); transmute(r) } @@ -13898,21 +13841,16 @@ pub unsafe fn _mm512_cvt_roundepi32_ps(a: __m512i, rounding: i32) -> __m512 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepi32_ps&expand=1295) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtdq2ps, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_cvt_roundepi32_ps( +#[cfg_attr(test, assert_instr(vcvtdq2ps, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_cvt_roundepi32_ps( src: __m512, k: __mmask16, a: __m512i, - rounding: i32, ) -> __m512 { + static_assert_rounding!(ROUNDING); let a = a.as_i32x16(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtdq2ps(a, $imm4) - }; - } - let r: f32x16 = constify_imm4_round!(rounding, call); + let r = vcvtdq2ps(a, ROUNDING); transmute(simd_select_bitmask(k, r, src.as_f32x16())) } @@ -13928,16 +13866,15 @@ pub unsafe fn _mm512_mask_cvt_roundepi32_ps( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepi32_ps&expand=1296) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtdq2ps, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_cvt_roundepi32_ps(k: __mmask16, a: __m512i, rounding: i32) -> __m512 { +#[cfg_attr(test, assert_instr(vcvtdq2ps, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_cvt_roundepi32_ps( + k: __mmask16, + a: __m512i, +) -> __m512 { + static_assert_rounding!(ROUNDING); let a = a.as_i32x16(); - macro_rules! call { - ($imm4:expr) => { - vcvtdq2ps(a, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtdq2ps(a, ROUNDING); let zero = _mm512_setzero_ps().as_f32x16(); transmute(simd_select_bitmask(k, r, zero)) } @@ -13954,16 +13891,12 @@ pub unsafe fn _mm512_maskz_cvt_roundepi32_ps(k: __mmask16, a: __m512i, rounding: /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepu32_ps&expand=1303) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtudq2ps, rounding = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm512_cvt_roundepu32_ps(a: __m512i, rounding: i32) -> __m512 { +#[cfg_attr(test, assert_instr(vcvtudq2ps, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_cvt_roundepu32_ps(a: __m512i) -> __m512 { + static_assert_rounding!(ROUNDING); let a = a.as_u32x16(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtudq2ps(a, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtudq2ps(a, ROUNDING); transmute(r) } @@ -13979,21 +13912,16 @@ pub unsafe fn _mm512_cvt_roundepu32_ps(a: __m512i, rounding: i32) -> __m512 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepu32_ps&expand=1304) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtudq2ps, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_cvt_roundepu32_ps( +#[cfg_attr(test, assert_instr(vcvtudq2ps, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_cvt_roundepu32_ps( src: __m512, k: __mmask16, a: __m512i, - rounding: i32, ) -> __m512 { + static_assert_rounding!(ROUNDING); let a = a.as_u32x16(); - macro_rules! call { - ($imm4:expr) => { - vcvtudq2ps(a, $imm4) - }; - } - let r: f32x16 = constify_imm4_round!(rounding, call); + let r = vcvtudq2ps(a, ROUNDING); transmute(simd_select_bitmask(k, r, src.as_f32x16())) } @@ -14009,16 +13937,15 @@ pub unsafe fn _mm512_mask_cvt_roundepu32_ps( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepu32_ps&expand=1305) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtudq2ps, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_cvt_roundepu32_ps(k: __mmask16, a: __m512i, rounding: i32) -> __m512 { +#[cfg_attr(test, assert_instr(vcvtudq2ps, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_cvt_roundepu32_ps( + k: __mmask16, + a: __m512i, +) -> __m512 { + static_assert_rounding!(ROUNDING); let a = a.as_u32x16(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtudq2ps(a, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtudq2ps(a, ROUNDING); let zero = _mm512_setzero_ps().as_f32x16(); transmute(simd_select_bitmask(k, r, zero)) } @@ -33519,18 +33446,15 @@ pub unsafe fn _mm_mask3_fnmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_ss&expand=151) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vaddss, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_add_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vaddss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_add_round_ss(a: __m128, b: __m128) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let b = b.as_f32x4(); let zero = _mm_setzero_ps().as_f32x4(); - macro_rules! call { - ($imm4:expr) => { - vaddss(a, b, zero, 0b1, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vaddss(a, b, zero, 0b1, ROUNDING); + transmute(r) } /// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -33545,24 +33469,20 @@ pub unsafe fn _mm_add_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_round_ss&expand=152) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vaddss, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_add_round_ss( +#[cfg_attr(test, assert_instr(vaddss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_add_round_ss( src: __m128, k: __mmask8, a: 
__m128, b: __m128, - rounding: i32, ) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let b = b.as_f32x4(); let src = src.as_f32x4(); - macro_rules! call { - ($imm4:expr) => { - vaddss(a, b, src, k, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vaddss(a, b, src, k, ROUNDING); + transmute(r) } /// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -33577,18 +33497,19 @@ pub unsafe fn _mm_mask_add_round_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_ss&expand=153) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vaddss, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_add_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vaddss, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_add_round_ss( + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let b = b.as_f32x4(); let zero = _mm_setzero_ps().as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vaddss(a, b, zero, k, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vaddss(a, b, zero, k, ROUNDING); + transmute(r) } /// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ @@ -33603,18 +33524,15 @@ pub unsafe fn _mm_maskz_add_round_ss(k: __mmask8, a: __m128, b: __m128, rounding /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_sd&expand=148) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vaddsd, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_add_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { +#[cfg_attr(test, assert_instr(vaddsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_add_round_sd(a: __m128d, b: __m128d) -> __m128d { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let b = b.as_f64x2(); let zero = _mm_setzero_pd().as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vaddsd(a, b, zero, 0b1, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vaddsd(a, b, zero, 0b1, ROUNDING); + transmute(r) } /// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -33629,24 +33547,20 @@ pub unsafe fn _mm_add_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_round_Sd&expand=149) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vaddsd, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_add_round_sd( +#[cfg_attr(test, assert_instr(vaddsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_add_round_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let b = b.as_f64x2(); let src = src.as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vaddsd(a, b, src, k, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vaddsd(a, b, src, k, ROUNDING); + transmute(r) } /// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -33661,23 +33575,19 @@ pub unsafe fn _mm_mask_add_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_sd&expand=150) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vaddsd, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_add_round_sd( +#[cfg_attr(test, assert_instr(vaddsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_add_round_sd( k: __mmask8, a: __m128d, b: __m128d, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let b = b.as_f64x2(); let zero = _mm_setzero_pd().as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vaddsd(a, b, zero, k, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vaddsd(a, b, zero, k, ROUNDING); + transmute(r) } /// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -33692,18 +33602,15 @@ pub unsafe fn _mm_maskz_add_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_round_ss&expand=5745) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsubss, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_sub_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vsubss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_sub_round_ss(a: __m128, b: __m128) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let b = b.as_f32x4(); let zero = _mm_setzero_ps().as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vsubss(a, b, zero, 0b1, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vsubss(a, b, zero, 0b1, ROUNDING); + transmute(r) } /// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -33718,24 +33625,20 @@ pub unsafe fn _mm_sub_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_round_ss&expand=5743) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsubss, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_sub_round_ss( +#[cfg_attr(test, assert_instr(vsubss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_sub_round_ss( src: __m128, k: __mmask8, a: __m128, b: __m128, - rounding: i32, ) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let b = b.as_f32x4(); let src = src.as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vsubss(a, b, src, k, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vsubss(a, b, src, k, ROUNDING); + transmute(r) } /// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -33750,18 +33653,19 @@ pub unsafe fn _mm_mask_sub_round_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_round_ss&expand=5744) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsubss, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_sub_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vsubss, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_sub_round_ss( + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let b = b.as_f32x4(); let zero = _mm_setzero_ps().as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vsubss(a, b, zero, k, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vsubss(a, b, zero, k, ROUNDING); + transmute(r) } /// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ @@ -33776,18 +33680,15 @@ pub unsafe fn _mm_maskz_sub_round_ss(k: __mmask8, a: __m128, b: __m128, rounding /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_round_sd&expand=5742) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsubsd, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_sub_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { +#[cfg_attr(test, assert_instr(vsubsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_sub_round_sd(a: __m128d, b: __m128d) -> __m128d { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let b = b.as_f64x2(); let zero = _mm_setzero_pd().as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vsubsd(a, b, zero, 0b1, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vsubsd(a, b, zero, 0b1, ROUNDING); + transmute(r) } /// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -33802,24 +33703,20 @@ pub unsafe fn _mm_sub_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_round_sd&expand=5740) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsubsd, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_sub_round_sd( +#[cfg_attr(test, assert_instr(vsubsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_sub_round_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let b = b.as_f64x2(); let src = src.as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vsubsd(a, b, src, k, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vsubsd(a, b, src, k, ROUNDING); + transmute(r) } /// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -33834,23 +33731,19 @@ pub unsafe fn _mm_mask_sub_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_round_sd&expand=5741) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsubsd, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_sub_round_sd( +#[cfg_attr(test, assert_instr(vsubsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_sub_round_sd( k: __mmask8, a: __m128d, b: __m128d, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let b = b.as_f64x2(); let zero = _mm_setzero_pd().as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vsubsd(a, b, zero, k, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vsubsd(a, b, zero, k, ROUNDING); + transmute(r) } /// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -33865,18 +33758,15 @@ pub unsafe fn _mm_maskz_sub_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_round_ss&expand=3946) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmulss, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_mul_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vmulss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_mul_round_ss(a: __m128, b: __m128) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let b = b.as_f32x4(); let zero = _mm_setzero_ps().as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vmulss(a, b, zero, 0b1, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vmulss(a, b, zero, 0b1, ROUNDING); + transmute(r) } /// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -33891,24 +33781,20 @@ pub unsafe fn _mm_mul_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_round_ss&expand=3944) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmulss, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_mul_round_ss( +#[cfg_attr(test, assert_instr(vmulss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_mul_round_ss( src: __m128, k: __mmask8, a: __m128, b: __m128, - rounding: i32, ) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let b = b.as_f32x4(); let src = src.as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vmulss(a, b, src, k, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vmulss(a, b, src, k, ROUNDING); + transmute(r) } /// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -33923,18 +33809,19 @@ pub unsafe fn _mm_mask_mul_round_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_round_ss&expand=3945) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmulss, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_mul_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vmulss, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_mul_round_ss( + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let b = b.as_f32x4(); let zero = _mm_setzero_ps().as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vmulss(a, b, zero, k, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vmulss(a, b, zero, k, ROUNDING); + transmute(r) } /// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ @@ -33949,18 +33836,15 @@ pub unsafe fn _mm_maskz_mul_round_ss(k: __mmask8, a: __m128, b: __m128, rounding /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_round_sd&expand=3943) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmulsd, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_mul_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { +#[cfg_attr(test, assert_instr(vmulsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_mul_round_sd(a: __m128d, b: __m128d) -> __m128d { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let b = b.as_f64x2(); let zero = _mm_setzero_pd().as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vmulsd(a, b, zero, 0b1, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vmulsd(a, b, zero, 0b1, ROUNDING); + transmute(r) } /// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -33975,24 +33859,20 @@ pub unsafe fn _mm_mul_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_round_sd&expand=3941) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmulsd, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_mul_round_sd( +#[cfg_attr(test, assert_instr(vmulsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_mul_round_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let b = b.as_f64x2(); let src = src.as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vmulsd(a, b, src, k, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vmulsd(a, b, src, k, ROUNDING); + transmute(r) } /// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -34007,23 +33887,19 @@ pub unsafe fn _mm_mask_mul_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_round_sd&expand=3942) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmulsd, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_mul_round_sd( +#[cfg_attr(test, assert_instr(vmulsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_mul_round_sd( k: __mmask8, a: __m128d, b: __m128d, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let b = b.as_f64x2(); let zero = _mm_setzero_pd().as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vmulsd(a, b, zero, k, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vmulsd(a, b, zero, k, ROUNDING); + transmute(r) } /// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -34038,18 +33914,15 @@ pub unsafe fn _mm_maskz_mul_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_div_round_ss&expand=2174) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vdivss, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_div_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vdivss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_div_round_ss(a: __m128, b: __m128) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let b = b.as_f32x4(); let zero = _mm_setzero_ps().as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vdivss(a, b, zero, 0b1, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vdivss(a, b, zero, 0b1, ROUNDING); + transmute(r) } /// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -34064,24 +33937,20 @@ pub unsafe fn _mm_div_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_round_ss&expand=2175) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vdivss, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_div_round_ss( +#[cfg_attr(test, assert_instr(vdivss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_div_round_ss( src: __m128, k: __mmask8, a: __m128, b: __m128, - rounding: i32, ) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let b = b.as_f32x4(); let src = src.as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vdivss(a, b, src, k, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vdivss(a, b, src, k, ROUNDING); + transmute(r) } /// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -34096,18 +33965,19 @@ pub unsafe fn _mm_mask_div_round_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_round_ss&expand=2176) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vdivss, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_div_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vdivss, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_div_round_ss( + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let b = b.as_f32x4(); let zero = _mm_setzero_ps().as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vdivss(a, b, zero, k, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vdivss(a, b, zero, k, ROUNDING); + transmute(r) } /// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ @@ -34122,18 +33992,15 @@ pub unsafe fn _mm_maskz_div_round_ss(k: __mmask8, a: __m128, b: __m128, rounding /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_div_round_sd&expand=2171) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vdivsd, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_div_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { +#[cfg_attr(test, assert_instr(vdivsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_div_round_sd(a: __m128d, b: __m128d) -> __m128d { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let b = b.as_f64x2(); let zero = _mm_setzero_pd().as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vdivsd(a, b, zero, 0b1, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vdivsd(a, b, zero, 0b1, ROUNDING); + transmute(r) } /// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -34148,24 +34015,20 @@ pub unsafe fn _mm_div_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_round_sd&expand=2172) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vdivsd, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_div_round_sd( +#[cfg_attr(test, assert_instr(vdivsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_div_round_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let b = b.as_f64x2(); let src = src.as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vdivsd(a, b, src, k, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vdivsd(a, b, src, k, ROUNDING); + transmute(r) } /// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -34180,23 +34043,19 @@ pub unsafe fn _mm_mask_div_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_round_sd&expand=2173) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vdivsd, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_div_round_sd( +#[cfg_attr(test, assert_instr(vdivsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_div_round_sd( k: __mmask8, a: __m128d, b: __m128d, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let b = b.as_f64x2(); let zero = _mm_setzero_pd().as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vdivsd(a, b, zero, k, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vdivsd(a, b, zero, k, ROUNDING); + transmute(r) } /// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -34475,18 +34334,15 @@ pub unsafe fn _mm_maskz_min_round_sd(k: __mmask8, a: __m128d, b: __m128d, sae: i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sqrt_round_ss&expand=5383) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsqrtss, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_sqrt_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vsqrtss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_sqrt_round_ss(a: __m128, b: __m128) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let b = b.as_f32x4(); let zero = _mm_setzero_ps().as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vsqrtss(a, b, zero, 0b1, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vsqrtss(a, b, zero, 0b1, ROUNDING); + transmute(r) } /// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -34501,24 +34357,20 @@ pub unsafe fn _mm_sqrt_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_round_ss&expand=5381) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsqrtss, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_sqrt_round_ss( +#[cfg_attr(test, assert_instr(vsqrtss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_sqrt_round_ss( src: __m128, k: __mmask8, a: __m128, b: __m128, - rounding: i32, ) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let b = b.as_f32x4(); let src = src.as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vsqrtss(a, b, src, k, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vsqrtss(a, b, src, k, ROUNDING); + transmute(r) } /// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -34533,18 +34385,19 @@ pub unsafe fn _mm_mask_sqrt_round_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_round_ss&expand=5382) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsqrtss, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_sqrt_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vsqrtss, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_sqrt_round_ss( + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let b = b.as_f32x4(); let zero = _mm_setzero_ps().as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vsqrtss(a, b, zero, k, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vsqrtss(a, b, zero, k, ROUNDING); + transmute(r) } /// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ @@ -34559,18 +34412,15 @@ pub unsafe fn _mm_maskz_sqrt_round_ss(k: __mmask8, a: __m128, b: __m128, roundin /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sqrt_round_sd&expand=5380) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsqrtsd, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_sqrt_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { +#[cfg_attr(test, assert_instr(vsqrtsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_sqrt_round_sd(a: __m128d, b: __m128d) -> __m128d { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let b = b.as_f64x2(); let zero = _mm_setzero_pd().as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vsqrtsd(a, b, zero, 0b1, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vsqrtsd(a, b, zero, 0b1, ROUNDING); + transmute(r) } /// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -34585,24 +34435,20 @@ pub unsafe fn _mm_sqrt_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_round_sd&expand=5378) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsqrtsd, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_sqrt_round_sd( +#[cfg_attr(test, assert_instr(vsqrtsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_sqrt_round_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let b = b.as_f64x2(); let src = src.as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vsqrtsd(a, b, src, k, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vsqrtsd(a, b, src, k, ROUNDING); + transmute(r) } /// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -34617,23 +34463,19 @@ pub unsafe fn _mm_mask_sqrt_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_round_sd&expand=5379) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsqrtsd, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_sqrt_round_sd( +#[cfg_attr(test, assert_instr(vsqrtsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_sqrt_round_sd( k: __mmask8, a: __m128d, b: __m128d, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let b = b.as_f64x2(); let zero = _mm_setzero_pd().as_f64x2(); - macro_rules! call { - ($imm4:expr) => { - vsqrtsd(a, b, zero, k, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vsqrtsd(a, b, zero, k, ROUNDING); + transmute(r) } /// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 
This intrinsic essentially calculates floor(log2(x)) for the lower element.\ @@ -35196,18 +35038,14 @@ pub unsafe fn _mm_maskz_roundscale_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_scalef_round_ss&expand=4895) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vscalefss, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_scalef_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vscalefss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_scalef_round_ss(a: __m128, b: __m128) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let b = b.as_f32x4(); let zero = _mm_setzero_ps().as_f32x4(); - macro_rules! call { - ($imm4:expr) => { - vscalefss(a, b, zero, 0b11111111, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vscalefss(a, b, zero, 0b11111111, ROUNDING); transmute(r) } @@ -35223,24 +35061,19 @@ pub unsafe fn _mm_scalef_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_scalef_round_ss&expand=4893) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vscalefss, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_scalef_round_ss( +#[cfg_attr(test, assert_instr(vscalefss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_scalef_round_ss( src: __m128, k: __mmask8, a: __m128, b: __m128, - rounding: i32, ) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let b = b.as_f32x4(); let src = src.as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vscalefss(a, b, src, k, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vscalefss(a, b, src, k, ROUNDING); transmute(r) } @@ -35256,23 +35089,18 @@ pub unsafe fn _mm_mask_scalef_round_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_scalef_round_ss&expand=4894) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vscalefss, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_scalef_round_ss( +#[cfg_attr(test, assert_instr(vscalefss, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_scalef_round_ss( k: __mmask8, a: __m128, b: __m128, - rounding: i32, ) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let b = b.as_f32x4(); let zero = _mm_setzero_ps().as_f32x4(); - macro_rules! call { - ($imm4:expr) => { - vscalefss(a, b, zero, k, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vscalefss(a, b, zero, k, ROUNDING); transmute(r) } @@ -35288,18 +35116,14 @@ pub unsafe fn _mm_maskz_scalef_round_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_scalef_round_sd&expand=4892) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vscalefsd, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_scalef_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { +#[cfg_attr(test, assert_instr(vscalefsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_scalef_round_sd(a: __m128d, b: __m128d) -> __m128d { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let b = b.as_f64x2(); let zero = _mm_setzero_pd().as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vscalefsd(a, b, zero, 0b11111111, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vscalefsd(a, b, zero, 0b11111111, ROUNDING); transmute(r) } @@ -35315,24 +35139,18 @@ pub unsafe fn _mm_scalef_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m1 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_scalef_round_sd&expand=4890) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vscalefsd, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_scalef_round_sd( +#[cfg_attr(test, assert_instr(vscalefsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_scalef_round_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, - rounding: i32, ) -> __m128d { let a = a.as_f64x2(); let b = b.as_f64x2(); let src = src.as_f64x2(); - macro_rules! call { - ($imm4:expr) => { - vscalefsd(a, b, src, k, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vscalefsd(a, b, src, k, ROUNDING); transmute(r) } @@ -35348,23 +35166,18 @@ pub unsafe fn _mm_mask_scalef_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_scalef_round_sd&expand=4891) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vscalefsd, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_scalef_round_sd( +#[cfg_attr(test, assert_instr(vscalefsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_scalef_round_sd( k: __mmask8, a: __m128d, b: __m128d, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let b = b.as_f64x2(); let zero = _mm_setzero_pd().as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vscalefsd(a, b, zero, k, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vscalefsd(a, b, zero, k, ROUNDING); transmute(r) } @@ -35380,19 +35193,15 @@ pub unsafe fn _mm_maskz_scalef_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fmadd_round_ss&expand=2573) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmadd213ss, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_fmadd_round_ss(a: __m128, b: __m128, c: __m128, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vfmadd213ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_fmadd_round_ss(a: __m128, b: __m128, c: __m128) -> __m128 { + static_assert_rounding!(ROUNDING); let extracta: f32 = simd_extract(a, 0); let extractb: f32 = simd_extract(b, 0); let extractc: f32 = simd_extract(c, 0); - macro_rules! call { - ($imm4:expr) => { - vfmadd132ss(extracta, extractb, extractc, $imm4) - }; - } - let fmadd = constify_imm4_round!(rounding, call); - let r = simd_insert(a, 0, fmadd); + let r = vfmadd132ss(extracta, extractb, extractc, ROUNDING); + let r = simd_insert(a, 0, r); transmute(r) } @@ -35408,25 +35217,20 @@ pub unsafe fn _mm_fmadd_round_ss(a: __m128, b: __m128, c: __m128, rounding: i32) /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmadd_round_ss&expand=2574) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmadd213ss, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_fmadd_round_ss( +#[cfg_attr(test, assert_instr(vfmadd213ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_fmadd_round_ss( a: __m128, k: __mmask8, b: __m128, c: __m128, - rounding: i32, ) -> __m128 { + static_assert_rounding!(ROUNDING); let mut fmadd: f32 = simd_extract(a, 0); if (k & 0b00000001) != 0 { let extractb: f32 = 
simd_extract(b, 0); let extractc: f32 = simd_extract(c, 0); - macro_rules! call { - ($imm4:expr) => { - vfmadd132ss(fmadd, extractb, extractc, $imm4) - }; - } - fmadd = constify_imm4_round!(rounding, call); + fmadd = vfmadd132ss(fmadd, extractb, extractc, ROUNDING); } let r = simd_insert(a, 0, fmadd); transmute(r) @@ -35444,26 +35248,21 @@ pub unsafe fn _mm_mask_fmadd_round_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmadd_round_ss&expand=2576) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmadd213ss, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_maskz_fmadd_round_ss( +#[cfg_attr(test, assert_instr(vfmadd213ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_maskz_fmadd_round_ss( k: __mmask8, a: __m128, b: __m128, c: __m128, - rounding: i32, ) -> __m128 { + static_assert_rounding!(ROUNDING); let mut fmadd: f32 = 0.; if (k & 0b00000001) != 0 { let extracta: f32 = simd_extract(a, 0); let extractb: f32 = simd_extract(b, 0); let extractc: f32 = simd_extract(c, 0); - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132ss(extracta, extractb, extractc, $imm4) - }; - } - fmadd = constify_imm4_round!(rounding, call); + fmadd = vfmadd132ss(extracta, extractb, extractc, ROUNDING); } let r = simd_insert(a, 0, fmadd); transmute(r) @@ -35481,25 +35280,20 @@ pub unsafe fn _mm_maskz_fmadd_round_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmadd_round_ss&expand=2575) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmadd213ss, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask3_fmadd_round_ss( +#[cfg_attr(test, assert_instr(vfmadd213ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask3_fmadd_round_ss( a: __m128, b: __m128, c: __m128, k: __mmask8, - rounding: i32, ) -> __m128 { + static_assert_rounding!(ROUNDING); let mut fmadd: f32 = simd_extract(c, 0); if (k & 0b00000001) != 0 { let extracta: f32 = simd_extract(a, 0); let extractb: f32 = simd_extract(b, 0); - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132ss(extracta, extractb, fmadd, $imm4) - }; - } - fmadd = constify_imm4_round!(rounding, call); + fmadd = vfmadd132ss(extracta, extractb, fmadd, ROUNDING); } let r = simd_insert(c, 0, fmadd); transmute(r) @@ -35517,18 +35311,18 @@ pub unsafe fn _mm_mask3_fmadd_round_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fmadd_round_sd&expand=2569) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmadd213sd, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_fmadd_round_sd(a: __m128d, b: __m128d, c: __m128d, rounding: i32) -> __m128d { +#[cfg_attr(test, assert_instr(vfmadd213sd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_fmadd_round_sd( + a: __m128d, + b: __m128d, + c: __m128d, +) -> __m128d { + static_assert_rounding!(ROUNDING); let extracta: f64 = simd_extract(a, 0); let extractb: f64 = simd_extract(b, 0); let extractc: f64 = simd_extract(c, 0); - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132sd(extracta, extractb, extractc, $imm4) - }; - } - let fmadd = constify_imm4_round!(rounding, call); + let fmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING); let r = simd_insert(a, 0, fmadd); transmute(r) } @@ -35545,25 +35339,20 @@ pub unsafe fn _mm_fmadd_round_sd(a: __m128d, b: __m128d, c: __m128d, rounding: i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmadd_round_sd&expand=2570) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmadd213sd, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_fmadd_round_sd( +#[cfg_attr(test, assert_instr(vfmadd213sd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_fmadd_round_sd( a: __m128d, k: __mmask8, b: __m128d, c: __m128d, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let mut fmadd: f64 = simd_extract(a, 0); if (k & 0b00000001) != 0 { let extractb: f64 = simd_extract(b, 0); let extractc: f64 = simd_extract(c, 0); - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132sd(fmadd, extractb, extractc, $imm4) - }; - } - fmadd = constify_imm4_round!(rounding, call); + fmadd = vfmadd132sd(fmadd, extractb, extractc, ROUNDING); } let r = simd_insert(a, 0, fmadd); transmute(r) @@ -35581,26 +35370,21 @@ pub unsafe fn _mm_mask_fmadd_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmadd_round_sd&expand=2572) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmadd213sd, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_maskz_fmadd_round_sd( +#[cfg_attr(test, assert_instr(vfmadd213sd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_maskz_fmadd_round_sd( k: __mmask8, a: __m128d, b: __m128d, c: __m128d, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let mut fmadd: f64 = 0.; if (k & 0b00000001) != 0 { let extracta: f64 = simd_extract(a, 0); let extractb: f64 = simd_extract(b, 0); let extractc: f64 = simd_extract(c, 0); - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132sd(extracta, extractb, extractc, $imm4) - }; - } - fmadd = constify_imm4_round!(rounding, call); + fmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING); } let r = simd_insert(a, 0, fmadd); transmute(r) @@ -35618,25 +35402,20 @@ pub unsafe fn _mm_maskz_fmadd_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmadd_round_Sd&expand=2571) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmadd213sd, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask3_fmadd_round_sd( +#[cfg_attr(test, assert_instr(vfmadd213sd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask3_fmadd_round_sd( a: __m128d, b: __m128d, c: __m128d, k: __mmask8, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let mut fmadd: f64 = simd_extract(c, 0); if (k & 0b00000001) != 0 { let extracta: f64 = simd_extract(a, 0); let extractb: f64 = simd_extract(b, 0); - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132sd(extracta, extractb, fmadd, $imm4) - }; - } - fmadd = constify_imm4_round!(rounding, call); + fmadd = vfmadd132sd(extracta, extractb, fmadd, ROUNDING); } let r = simd_insert(c, 0, fmadd); transmute(r) @@ -35654,19 +35433,15 @@ pub unsafe fn _mm_mask3_fmadd_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fmsub_round_ss&expand=2659) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmsub213ss, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_fmsub_round_ss(a: __m128, b: __m128, c: __m128, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vfmsub213ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_fmsub_round_ss(a: __m128, b: __m128, c: __m128) -> __m128 { + static_assert_rounding!(ROUNDING); let extracta: f32 = simd_extract(a, 0); let extractb: f32 = simd_extract(b, 0); let extractc: f32 = simd_extract(c, 0); let extractc = -extractc; - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132ss(extracta, extractb, extractc, $imm4) - }; - } - let fmsub = constify_imm4_round!(rounding, call); + let fmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); let r = simd_insert(a, 0, fmsub); transmute(r) } @@ -35683,26 +35458,21 @@ pub unsafe fn _mm_fmsub_round_ss(a: __m128, b: __m128, c: __m128, rounding: i32) /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmsub_round_ss&expand=2660) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmsub213ss, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_fmsub_round_ss( +#[cfg_attr(test, assert_instr(vfmsub213ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_fmsub_round_ss( a: __m128, k: __mmask8, b: __m128, c: __m128, - rounding: i32, ) -> __m128 { + static_assert_rounding!(ROUNDING); let mut fmsub: f32 = simd_extract(a, 0); if (k & 0b00000001) != 0 { let extractb: f32 = simd_extract(b, 0); let extractc: f32 = simd_extract(c, 0); let extractc = -extractc; - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132ss(fmsub, extractb, extractc, $imm4) - }; - } - fmsub = constify_imm4_round!(rounding, call); + fmsub = vfmadd132ss(fmsub, extractb, extractc, ROUNDING); } let r = simd_insert(a, 0, fmsub); transmute(r) @@ -35720,27 +35490,22 @@ pub unsafe fn _mm_mask_fmsub_round_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmsub_round_ss&expand=2662) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmsub213ss, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_maskz_fmsub_round_ss( +#[cfg_attr(test, assert_instr(vfmsub213ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_maskz_fmsub_round_ss( k: __mmask8, a: __m128, b: __m128, c: __m128, - rounding: i32, ) -> __m128 { + static_assert_rounding!(ROUNDING); let mut fmsub: f32 = 0.; if (k & 0b00000001) != 0 { let extracta: f32 = simd_extract(a, 0); let extractb: f32 = simd_extract(b, 0); let extractc: f32 = simd_extract(c, 0); let extractc = -extractc; - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132ss(extracta, extractb, extractc, $imm4) - }; - } - fmsub = constify_imm4_round!(rounding, call); + fmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); } let r = simd_insert(a, 0, fmsub); transmute(r) @@ -35758,26 +35523,21 @@ pub unsafe fn _mm_maskz_fmsub_round_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmsub_round_ss&expand=2661) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmsub213ss, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask3_fmsub_round_ss( +#[cfg_attr(test, assert_instr(vfmsub213ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask3_fmsub_round_ss( a: __m128, b: __m128, c: __m128, k: __mmask8, - rounding: i32, ) -> __m128 { + static_assert_rounding!(ROUNDING); let mut fmsub: f32 = simd_extract(c, 0); if (k & 0b00000001) != 0 { let extracta: f32 = simd_extract(a, 0); let extractb: f32 = simd_extract(b, 0); let extractc = -fmsub; - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132ss(extracta, extractb, extractc, $imm4) - }; - } - fmsub = constify_imm4_round!(rounding, call); + fmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); } let r = simd_insert(c, 0, fmsub); transmute(r) @@ -35795,19 +35555,19 @@ pub unsafe fn _mm_mask3_fmsub_round_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fmsub_round_sd&expand=2655) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmsub213sd, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_fmsub_round_sd(a: __m128d, b: __m128d, c: __m128d, rounding: i32) -> __m128d { +#[cfg_attr(test, assert_instr(vfmsub213sd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_fmsub_round_sd( + a: __m128d, + b: __m128d, + c: __m128d, +) -> __m128d { + static_assert_rounding!(ROUNDING); let extracta: f64 = simd_extract(a, 0); let extractb: f64 = simd_extract(b, 0); let extractc: f64 = simd_extract(c, 0); let extractc = -extractc; - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132sd(extracta, extractb, extractc, $imm4) - }; - } - let fmsub = constify_imm4_round!(rounding, call); + let fmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); let r = simd_insert(a, 0, fmsub); transmute(r) } @@ -35824,26 +35584,21 @@ pub unsafe fn _mm_fmsub_round_sd(a: __m128d, b: __m128d, c: __m128d, rounding: i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmsub_round_sd&expand=2656) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmsub213sd, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_fmsub_round_sd( +#[cfg_attr(test, assert_instr(vfmsub213sd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_fmsub_round_sd( a: __m128d, k: __mmask8, b: __m128d, c: __m128d, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let mut fmsub: f64 = simd_extract(a, 0); if (k & 0b00000001) != 0 { let extractb: f64 = simd_extract(b, 0); let extractc: f64 = simd_extract(c, 0); let extractc = -extractc; - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132sd(fmsub, extractb, extractc, $imm4) - }; - } - fmsub = constify_imm4_round!(rounding, call); + fmsub = vfmadd132sd(fmsub, extractb, extractc, ROUNDING); } let r = simd_insert(a, 0, fmsub); transmute(r) @@ -35861,27 +35616,22 @@ pub unsafe fn _mm_mask_fmsub_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmsub_round_sd&expand=2658) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmsub213sd, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_maskz_fmsub_round_sd( +#[cfg_attr(test, assert_instr(vfmsub213sd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_maskz_fmsub_round_sd( k: __mmask8, a: __m128d, b: __m128d, c: __m128d, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let mut fmsub: f64 = 0.; if (k & 0b00000001) != 0 { let extracta: f64 = simd_extract(a, 0); let extractb: f64 = simd_extract(b, 0); let extractc: f64 = simd_extract(c, 0); let extractc = -extractc; - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132sd(extracta, extractb, extractc, $imm4) - }; - } - fmsub = constify_imm4_round!(rounding, call); + fmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); } let r = simd_insert(a, 0, fmsub); transmute(r) @@ -35899,26 +35649,21 @@ pub unsafe fn _mm_maskz_fmsub_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmsub_round_sd&expand=2657) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmsub213sd, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask3_fmsub_round_sd( +#[cfg_attr(test, assert_instr(vfmsub213sd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask3_fmsub_round_sd( a: __m128d, b: __m128d, c: __m128d, k: __mmask8, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let mut fmsub: f64 = simd_extract(c, 0); if (k & 0b00000001) != 0 { let extracta: f64 = simd_extract(a, 0); let extractb: f64 = simd_extract(b, 0); let extractc = -fmsub; - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132sd(extracta, extractb, extractc, $imm4) - }; - } - fmsub = constify_imm4_round!(rounding, call); + fmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); } let r = simd_insert(c, 0, fmsub); transmute(r) @@ -35936,19 +35681,15 @@ pub unsafe fn _mm_mask3_fmsub_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fnmadd_round_ss&expand=2739) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmadd213ss, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_fnmadd_round_ss(a: __m128, b: __m128, c: __m128, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vfnmadd213ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_fnmadd_round_ss(a: __m128, b: __m128, c: __m128) -> __m128 { + static_assert_rounding!(ROUNDING); let extracta: f32 = simd_extract(a, 0); let extracta = -extracta; let extractb: f32 = simd_extract(b, 0); let extractc: f32 = simd_extract(c, 0); - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132ss(extracta, extractb, extractc, $imm4) - }; - } - let fnmadd = constify_imm4_round!(rounding, call); + let fnmadd = vfmadd132ss(extracta, extractb, extractc, ROUNDING); let r = simd_insert(a, 0, fnmadd); transmute(r) } @@ -35965,26 +35706,21 @@ pub unsafe fn _mm_fnmadd_round_ss(a: __m128, b: __m128, c: __m128, rounding: i32 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmadd_round_ss&expand=2740) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmadd213ss, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_fnmadd_round_ss( +#[cfg_attr(test, assert_instr(vfnmadd213ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_fnmadd_round_ss( a: __m128, k: __mmask8, b: __m128, c: __m128, - rounding: i32, ) -> __m128 { + static_assert_rounding!(ROUNDING); let mut fnmadd: f32 = simd_extract(a, 0); if (k & 0b00000001) != 0 { let extracta = -fnmadd; let extractb: f32 = simd_extract(b, 0); let extractc: f32 = simd_extract(c, 0); - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132ss(extracta, extractb, extractc, $imm4) - }; - } - fnmadd = constify_imm4_round!(rounding, call); + fnmadd = vfmadd132ss(extracta, extractb, extractc, ROUNDING); } let r = simd_insert(a, 0, fnmadd); transmute(r) @@ -36002,27 +35738,22 @@ pub unsafe fn _mm_mask_fnmadd_round_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmadd_round_ss&expand=2742) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmadd213ss, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_maskz_fnmadd_round_ss( +#[cfg_attr(test, assert_instr(vfnmadd213ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_maskz_fnmadd_round_ss( k: __mmask8, a: __m128, b: __m128, c: __m128, - rounding: i32, ) -> __m128 { + static_assert_rounding!(ROUNDING); let mut fnmadd: f32 = 0.; if (k & 0b00000001) != 0 { let extracta: f32 = simd_extract(a, 0); let extracta = -extracta; let extractb: f32 = simd_extract(b, 0); let extractc: f32 = simd_extract(c, 0); - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132ss(extracta, extractb, extractc, $imm4) - }; - } - fnmadd = constify_imm4_round!(rounding, call); + fnmadd = vfmadd132ss(extracta, extractb, extractc, ROUNDING); } let r = simd_insert(a, 0, fnmadd); transmute(r) @@ -36040,26 +35771,21 @@ pub unsafe fn _mm_maskz_fnmadd_round_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmadd_round_ss&expand=2741) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmadd213ss, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask3_fnmadd_round_ss( +#[cfg_attr(test, assert_instr(vfnmadd213ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask3_fnmadd_round_ss( a: __m128, b: __m128, c: __m128, k: __mmask8, - rounding: i32, ) -> __m128 { + static_assert_rounding!(ROUNDING); let mut fnmadd: f32 = simd_extract(c, 0); if (k & 0b00000001) != 0 { let extracta: f32 = simd_extract(a, 0); let extracta = -extracta; let extractb: f32 = simd_extract(b, 0); - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132ss(extracta, extractb, fnmadd, $imm4) - }; - } - fnmadd = constify_imm4_round!(rounding, call); + fnmadd = vfmadd132ss(extracta, extractb, fnmadd, ROUNDING); } let r = simd_insert(c, 0, fnmadd); transmute(r) @@ -36077,19 +35803,19 @@ pub unsafe fn _mm_mask3_fnmadd_round_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fnmadd_round_sd&expand=2735) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmadd213sd, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_fnmadd_round_sd(a: __m128d, b: __m128d, c: __m128d, rounding: i32) -> __m128d { +#[cfg_attr(test, assert_instr(vfnmadd213sd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_fnmadd_round_sd( + a: __m128d, + b: __m128d, + c: __m128d, +) -> __m128d { + static_assert_rounding!(ROUNDING); let extracta: f64 = simd_extract(a, 0); let extracta = -extracta; let extractb: f64 = simd_extract(b, 0); let extractc: f64 = simd_extract(c, 0); - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132sd(extracta, extractb, extractc, $imm4) - }; - } - let fnmadd = constify_imm4_round!(rounding, call); + let fnmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING); let r = simd_insert(a, 0, fnmadd); transmute(r) } @@ -36106,26 +35832,21 @@ pub unsafe fn _mm_fnmadd_round_sd(a: __m128d, b: __m128d, c: __m128d, rounding: /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmadd_round_sd&expand=2736) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmadd213sd, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_fnmadd_round_sd( +#[cfg_attr(test, assert_instr(vfnmadd213sd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_fnmadd_round_sd( a: __m128d, k: __mmask8, b: __m128d, c: __m128d, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let mut fnmadd: f64 = simd_extract(a, 0); if (k & 0b00000001) != 0 { let extracta = -fnmadd; let extractb: f64 = simd_extract(b, 0); let extractc: f64 = simd_extract(c, 0); - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132sd(extracta, extractb, extractc, $imm4) - }; - } - fnmadd = constify_imm4_round!(rounding, call); + fnmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING); } let r = simd_insert(a, 0, fnmadd); transmute(r) @@ -36143,27 +35864,22 @@ pub unsafe fn _mm_mask_fnmadd_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmadd_round_sd&expand=2738) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmadd213sd, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_maskz_fnmadd_round_sd( +#[cfg_attr(test, assert_instr(vfnmadd213sd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_maskz_fnmadd_round_sd( k: __mmask8, a: __m128d, b: __m128d, c: __m128d, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let mut fnmadd: f64 = 0.; if (k & 0b00000001) != 0 { let extracta: f64 = simd_extract(a, 0); let extracta = -extracta; let extractb: f64 = simd_extract(b, 0); let extractc: f64 = simd_extract(c, 0); - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132sd(extracta, extractb, extractc, $imm4) - }; - } - fnmadd = constify_imm4_round!(rounding, call); + fnmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING); } let r = simd_insert(a, 0, fnmadd); transmute(r) @@ -36181,26 +35897,21 @@ pub unsafe fn _mm_maskz_fnmadd_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmadd_round_Sd&expand=2737) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmadd213sd, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask3_fnmadd_round_sd( +#[cfg_attr(test, assert_instr(vfnmadd213sd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask3_fnmadd_round_sd( a: __m128d, b: __m128d, c: __m128d, k: __mmask8, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let mut fnmadd: f64 = simd_extract(c, 0); if (k & 0b00000001) != 0 { let extracta: f64 = simd_extract(a, 0); let extracta = -extracta; let extractb: f64 = simd_extract(b, 0); - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132sd(extracta, extractb, fnmadd, $imm4) - }; - } - fnmadd = constify_imm4_round!(rounding, call); + fnmadd = vfmadd132sd(extracta, extractb, fnmadd, ROUNDING); } let r = simd_insert(c, 0, fnmadd); transmute(r) @@ -36218,20 +35929,16 @@ pub unsafe fn _mm_mask3_fnmadd_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fnmsub_round_ss&expand=2787) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmsub213ss, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_fnmsub_round_ss(a: __m128, b: __m128, c: __m128, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vfnmsub213ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_fnmsub_round_ss(a: __m128, b: __m128, c: __m128) -> __m128 { + static_assert_rounding!(ROUNDING); let extracta: f32 = simd_extract(a, 0); let extracta = -extracta; let extractb: f32 = simd_extract(b, 0); let extractc: f32 = simd_extract(c, 0); let extractc = -extractc; - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132ss(extracta, extractb, extractc, $imm4) - }; - } - let fnmsub = constify_imm4_round!(rounding, call); + let fnmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); let r = simd_insert(a, 0, fnmsub); transmute(r) } @@ -36248,27 +35955,22 @@ pub unsafe fn _mm_fnmsub_round_ss(a: __m128, b: __m128, c: __m128, rounding: i32 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmsub_round_ss&expand=2788) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmsub213ss, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_fnmsub_round_ss( +#[cfg_attr(test, assert_instr(vfnmsub213ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_fnmsub_round_ss( a: __m128, k: __mmask8, b: __m128, c: __m128, - rounding: i32, ) -> __m128 { + static_assert_rounding!(ROUNDING); let mut fnmsub: f32 = simd_extract(a, 0); if (k & 0b00000001) != 0 { let extracta = -fnmsub; let extractb: f32 = simd_extract(b, 0); let extractc: f32 = simd_extract(c, 0); let extractc = -extractc; - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132ss(extracta, extractb, extractc, $imm4) - }; - } - fnmsub = constify_imm4_round!(rounding, call); + fnmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); } let r = simd_insert(a, 0, fnmsub); transmute(r) @@ -36286,15 +35988,15 @@ pub unsafe fn _mm_mask_fnmsub_round_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmsub_round_ss&expand=2790) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmsub213ss, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_maskz_fnmsub_round_ss( +#[cfg_attr(test, assert_instr(vfnmsub213ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_maskz_fnmsub_round_ss( k: __mmask8, a: __m128, b: __m128, c: __m128, - rounding: i32, ) -> __m128 { + static_assert_rounding!(ROUNDING); let mut fnmsub: f32 = 0.; if (k & 0b00000001) != 0 { let extracta: f32 = simd_extract(a, 0); @@ -36302,12 +36004,7 @@ pub unsafe fn _mm_maskz_fnmsub_round_ss( let extractb: f32 = simd_extract(b, 0); let extractc: f32 = simd_extract(c, 0); let extractc = -extractc; - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132ss(extracta, extractb, extractc, $imm4) - }; - } - fnmsub = constify_imm4_round!(rounding, call); + fnmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); } let r = simd_insert(a, 0, fnmsub); transmute(r) @@ -36325,27 +36022,22 @@ pub unsafe fn _mm_maskz_fnmsub_round_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmsub_round_ss&expand=2789) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmsub213ss, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask3_fnmsub_round_ss( +#[cfg_attr(test, assert_instr(vfnmsub213ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask3_fnmsub_round_ss( a: __m128, b: __m128, c: __m128, k: __mmask8, - rounding: i32, ) -> __m128 { + static_assert_rounding!(ROUNDING); let mut fnmsub: f32 = simd_extract(c, 0); if (k & 0b00000001) != 0 { let extracta: f32 = simd_extract(a, 0); let extracta = -extracta; let extractb: f32 = simd_extract(b, 0); let extractc = -fnmsub; - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132ss(extracta, extractb, extractc, $imm4) - }; - } - fnmsub = constify_imm4_round!(rounding, call); + fnmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); } let r = simd_insert(c, 0, fnmsub); transmute(r) @@ -36363,20 +36055,20 @@ pub unsafe fn _mm_mask3_fnmsub_round_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fnmsub_round_sd&expand=2783) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmsub213sd, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_fnmsub_round_sd(a: __m128d, b: __m128d, c: __m128d, rounding: i32) -> __m128d { +#[cfg_attr(test, assert_instr(vfnmsub213sd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_fnmsub_round_sd( + a: __m128d, + b: __m128d, + c: __m128d, +) -> __m128d { + static_assert_rounding!(ROUNDING); let extracta: f64 = simd_extract(a, 0); let extracta = -extracta; let extractb: f64 = simd_extract(b, 0); let extractc: f64 = simd_extract(c, 0); let extractc = -extractc; - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132sd(extracta, extractb, extractc, $imm4) - }; - } - let fnmsub = constify_imm4_round!(rounding, call); + let fnmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); let r = simd_insert(a, 0, fnmsub); transmute(r) } @@ -36393,27 +36085,22 @@ pub unsafe fn _mm_fnmsub_round_sd(a: __m128d, b: __m128d, c: __m128d, rounding: /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmsub_round_sd&expand=2784) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmsub213sd, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_fnmsub_round_sd( +#[cfg_attr(test, assert_instr(vfnmsub213sd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_fnmsub_round_sd( a: __m128d, k: __mmask8, b: __m128d, c: __m128d, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let mut fnmsub: f64 = simd_extract(a, 0); if (k & 0b00000001) != 0 { let extracta = -fnmsub; let extractb: f64 = simd_extract(b, 0); let extractc: f64 = simd_extract(c, 0); let extractc = -extractc; - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132sd(extracta, extractb, extractc, $imm4) - }; - } - fnmsub = constify_imm4_round!(rounding, call); + fnmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); } let r = simd_insert(a, 0, fnmsub); transmute(r) @@ -36431,15 +36118,15 @@ pub unsafe fn _mm_mask_fnmsub_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmsub_round_sd&expand=2786) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmsub213sd, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_maskz_fnmsub_round_sd( +#[cfg_attr(test, assert_instr(vfnmsub213sd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_maskz_fnmsub_round_sd( k: __mmask8, a: __m128d, b: __m128d, c: __m128d, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let mut fnmsub: f64 = 0.; if (k & 0b00000001) != 0 { let extracta: f64 = simd_extract(a, 0); @@ -36447,12 +36134,7 @@ pub unsafe fn _mm_maskz_fnmsub_round_sd( let extractb: f64 = simd_extract(b, 0); let extractc: f64 = simd_extract(c, 0); let extractc = -extractc; - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132sd(extracta, extractb, extractc, $imm4) - }; - } - fnmsub = constify_imm4_round!(rounding, call); + fnmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); } let r = simd_insert(a, 0, fnmsub); transmute(r) @@ -36470,27 +36152,22 @@ pub unsafe fn _mm_maskz_fnmsub_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmsub_round_sd&expand=2785) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmsub213sd, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask3_fnmsub_round_sd( +#[cfg_attr(test, assert_instr(vfnmsub213sd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask3_fnmsub_round_sd( a: __m128d, b: __m128d, c: __m128d, k: __mmask8, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let mut fnmsub: f64 = simd_extract(c, 0); if (k & 0b00000001) != 0 { let extracta: f64 = simd_extract(a, 0); let extracta = -extracta; let extractb: f64 = simd_extract(b, 0); let extractc = -fnmsub; - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132sd(extracta, extractb, extractc, $imm4) - }; - } - fnmsub = constify_imm4_round!(rounding, call); + fnmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); } let r = simd_insert(c, 0, fnmsub); transmute(r) @@ -36977,18 +36654,14 @@ pub unsafe fn _mm_maskz_cvt_roundss_sd(k: __mmask8, a: __m128d, b: __m128, sae: /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundsd_ss&expand=1361) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsd2ss, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_cvt_roundsd_ss(a: __m128, b: __m128d, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vcvtsd2ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_cvt_roundsd_ss(a: __m128, b: __m128d) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let b = b.as_f64x2(); let zero = _mm_setzero_ps().as_f32x4(); - macro_rules! call { - ($imm4:expr) => { - vcvtsd2ss(a, b, zero, 0b11111111, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtsd2ss(a, b, zero, 0b11111111, ROUNDING); transmute(r) } @@ -37003,24 +36676,19 @@ pub unsafe fn _mm_cvt_roundsd_ss(a: __m128, b: __m128d, rounding: i32) -> __m128 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_cvt_roundsd_ss&expand=1362) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsd2ss, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_cvt_roundsd_ss( +#[cfg_attr(test, assert_instr(vcvtsd2ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_cvt_roundsd_ss( src: __m128, k: __mmask8, a: __m128, b: __m128d, - rounding: i32, ) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let b = b.as_f64x2(); let src = src.as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtsd2ss(a, b, src, k, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtsd2ss(a, b, src, k, ROUNDING); transmute(r) } @@ -37035,23 +36703,18 @@ pub unsafe fn _mm_mask_cvt_roundsd_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_cvt_roundsd_ss&expand=1363) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsd2ss, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_cvt_roundsd_ss( +#[cfg_attr(test, assert_instr(vcvtsd2ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_cvt_roundsd_ss( k: __mmask8, a: __m128, b: __m128d, - rounding: i32, ) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let b = b.as_f64x2(); let zero = _mm_setzero_ps().as_f32x4(); - macro_rules! call { - ($imm4:expr) => { - vcvtsd2ss(a, b, zero, k, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtsd2ss(a, b, zero, k, ROUNDING); transmute(r) } @@ -37066,16 +36729,12 @@ pub unsafe fn _mm_maskz_cvt_roundsd_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundss_si32&expand=1374) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtss2si, rounding = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvt_roundss_si32(a: __m128, rounding: i32) -> i32 { +#[cfg_attr(test, assert_instr(vcvtss2si, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvt_roundss_si32(a: __m128) -> i32 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtss2si(a, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtss2si(a, ROUNDING); transmute(r) } @@ -37090,16 +36749,12 @@ pub unsafe fn _mm_cvt_roundss_si32(a: __m128, rounding: i32) -> i32 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundss_i32&expand=1369) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtss2si, rounding = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvt_roundss_i32(a: __m128, rounding: i32) -> i32 { +#[cfg_attr(test, assert_instr(vcvtss2si, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvt_roundss_i32(a: __m128) -> i32 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); - macro_rules! call { - ($imm4:expr) => { - vcvtss2si(a, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtss2si(a, ROUNDING); transmute(r) } @@ -37114,16 +36769,12 @@ pub unsafe fn _mm_cvt_roundss_i32(a: __m128, rounding: i32) -> i32 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundss_u32&expand=1376) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtss2usi, rounding = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvt_roundss_u32(a: __m128, rounding: i32) -> u32 { +#[cfg_attr(test, assert_instr(vcvtss2usi, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvt_roundss_u32(a: __m128) -> u32 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtss2usi(a, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtss2usi(a, ROUNDING); transmute(r) } @@ -37158,16 +36809,12 @@ pub unsafe fn _mm_cvtss_u32(a: __m128) -> u32 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundsd_si32&expand=1359) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsd2si, rounding = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvt_roundsd_si32(a: __m128d, rounding: i32) -> i32 { +#[cfg_attr(test, assert_instr(vcvtsd2si, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvt_roundsd_si32(a: __m128d) -> i32 { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); - macro_rules! call { - ($imm4:expr) => { - vcvtsd2si(a, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtsd2si(a, ROUNDING); transmute(r) } @@ -37182,16 +36829,12 @@ pub unsafe fn _mm_cvt_roundsd_si32(a: __m128d, rounding: i32) -> i32 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundsd_i32&expand=1357) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsd2si, rounding = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvt_roundsd_i32(a: __m128d, rounding: i32) -> i32 { +#[cfg_attr(test, assert_instr(vcvtsd2si, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvt_roundsd_i32(a: __m128d) -> i32 { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtsd2si(a, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtsd2si(a, ROUNDING); transmute(r) } @@ -37206,16 +36849,12 @@ pub unsafe fn _mm_cvt_roundsd_i32(a: __m128d, rounding: i32) -> i32 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=cvt_roundsd_u32&expand=1364) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsd2usi, rounding = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvt_roundsd_u32(a: __m128d, rounding: i32) -> u32 { +#[cfg_attr(test, assert_instr(vcvtsd2usi, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvt_roundsd_u32(a: __m128d) -> u32 { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); - macro_rules! call { - ($imm4:expr) => { - vcvtsd2usi(a, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtsd2usi(a, ROUNDING); transmute(r) } @@ -37251,16 +36890,12 @@ pub unsafe fn _mm_cvtsd_u32(a: __m128d) -> u32 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundi32_ss&expand=1312) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsi2ss, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_cvt_roundi32_ss(a: __m128, b: i32, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vcvtsi2ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_cvt_roundi32_ss(a: __m128, b: i32) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtsi2ss(a, b, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtsi2ss(a, b, ROUNDING); transmute(r) } @@ -37276,16 +36911,12 @@ pub unsafe fn _mm_cvt_roundi32_ss(a: __m128, b: i32, rounding: i32) -> __m128 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundsi32_ss&expand=1366) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsi2ss, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_cvt_roundsi32_ss(a: __m128, b: i32, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vcvtsi2ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_cvt_roundsi32_ss(a: __m128, b: i32) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); - macro_rules! call { - ($imm4:expr) => { - vcvtsi2ss(a, b, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtsi2ss(a, b, ROUNDING); transmute(r) } @@ -37300,16 +36931,12 @@ pub unsafe fn _mm_cvt_roundsi32_ss(a: __m128, b: i32, rounding: i32) -> __m128 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundu32_ss&expand=1378) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtusi2ss, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_cvt_roundu32_ss(a: __m128, b: u32, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vcvtusi2ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_cvt_roundu32_ss(a: __m128, b: u32) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtusi2ss(a, b, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtusi2ss(a, b, ROUNDING); transmute(r) } @@ -44343,10 +43970,10 @@ mod tests { let a = _mm512_setr_ps( 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, ); - let r = _mm512_cvt_roundps_epi32(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm512_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); assert_eq_m512i(r, e); - let r = _mm512_cvt_roundps_epi32(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); + let r = _mm512_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a); let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 9, 10, 11, 12, 13, 14, 15); assert_eq_m512i(r, e); } @@ -44357,14 +43984,14 @@ mod tests { 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, ); let src = _mm512_set1_epi32(0); - let r = - _mm512_mask_cvt_roundps_epi32(src, 0, a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm512_mask_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, + ); assert_eq_m512i(r, src); - let r = _mm512_mask_cvt_roundps_epi32( + let r = _mm512_mask_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( src, 0b00000000_11111111, a, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, ); let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 0, 0, 0, 0, 0, 0, 0, 0); assert_eq_m512i(r, e); @@ -44375,12 +44002,13 @@ mod tests { let a = _mm512_setr_ps( 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, ); - let r = _mm512_maskz_cvt_roundps_epi32(0, a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm512_maskz_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, + ); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = 
_mm512_maskz_cvt_roundps_epi32( + let r = _mm512_maskz_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( 0b00000000_11111111, a, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, ); let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 0, 0, 0, 0, 0, 0, 0, 0); assert_eq_m512i(r, e); @@ -44391,10 +44019,10 @@ mod tests { let a = _mm512_setr_ps( 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, ); - let r = _mm512_cvt_roundps_epu32(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm512_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 10, 10, 12, 12, 14, 14, 16); assert_eq_m512i(r, e); - let r = _mm512_cvt_roundps_epu32(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); + let r = _mm512_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a); let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 9, 10, 11, 12, 13, 14, 15); assert_eq_m512i(r, e); } @@ -44405,14 +44033,14 @@ mod tests { 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, ); let src = _mm512_set1_epi32(0); - let r = - _mm512_mask_cvt_roundps_epu32(src, 0, a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm512_mask_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, + ); assert_eq_m512i(r, src); - let r = _mm512_mask_cvt_roundps_epu32( + let r = _mm512_mask_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( src, 0b00000000_11111111, a, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, ); let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); assert_eq_m512i(r, e); @@ -44423,12 +44051,13 @@ mod tests { let a = _mm512_setr_ps( 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, ); - let r = _mm512_maskz_cvt_roundps_epu32(0, a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = 
_mm512_maskz_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, + ); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_cvt_roundps_epu32( + let r = _mm512_maskz_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( 0b00000000_11111111, a, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, ); let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); assert_eq_m512i(r, e); @@ -44437,7 +44066,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvt_roundepi32_ps() { let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); - let r = _mm512_cvt_roundepi32_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm512_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); let e = _mm512_setr_ps( 0., -2., 2., -4., 4., -6., 6., -8., 8., 10., 10., 12., 12., 14., 14., 16., ); @@ -44448,14 +44077,14 @@ mod tests { unsafe fn test_mm512_mask_cvt_roundepi32_ps() { let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); let src = _mm512_set1_ps(0.); - let r = - _mm512_mask_cvt_roundepi32_ps(src, 0, a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm512_mask_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, + ); assert_eq_m512(r, src); - let r = _mm512_mask_cvt_roundepi32_ps( + let r = _mm512_mask_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( src, 0b00000000_11111111, a, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, ); let e = _mm512_setr_ps( 0., -2., 2., -4., 4., -6., 6., -8., 0., 0., 0., 0., 0., 0., 0., 0., @@ -44466,12 +44095,13 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_cvt_roundepi32_ps() { let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); - let r = _mm512_maskz_cvt_roundepi32_ps(0, a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = 
_mm512_maskz_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, + ); assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_cvt_roundepi32_ps( + let r = _mm512_maskz_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( 0b00000000_11111111, a, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, ); let e = _mm512_setr_ps( 0., -2., 2., -4., 4., -6., 6., -8., 0., 0., 0., 0., 0., 0., 0., 0., @@ -44482,7 +44112,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvt_roundepu32_ps() { let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); - let r = _mm512_cvt_roundepu32_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm512_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); #[rustfmt::skip] let e = _mm512_setr_ps( 0., 4294967300., 2., 4294967300., @@ -44497,14 +44127,14 @@ mod tests { unsafe fn test_mm512_mask_cvt_roundepu32_ps() { let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); let src = _mm512_set1_ps(0.); - let r = - _mm512_mask_cvt_roundepu32_ps(src, 0, a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm512_mask_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, + ); assert_eq_m512(r, src); - let r = _mm512_mask_cvt_roundepu32_ps( + let r = _mm512_mask_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( src, 0b00000000_11111111, a, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, ); #[rustfmt::skip] let e = _mm512_setr_ps( @@ -44519,12 +44149,13 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_cvt_roundepu32_ps() { let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); - let r = _mm512_maskz_cvt_roundepu32_ps(0, a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm512_maskz_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, + ); 
assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_cvt_roundepu32_ps( + let r = _mm512_maskz_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( 0b00000000_11111111, a, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, ); #[rustfmt::skip] let e = _mm512_setr_ps( @@ -52671,7 +52302,7 @@ mod tests { unsafe fn test_mm_add_round_ss() { let a = _mm_set_ps(1., 2., 10., 20.); let b = _mm_set_ps(3., 4., 30., 40.); - let r = _mm_add_round_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_add_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); let e = _mm_set_ps(1., 2., 10., 60.); assert_eq_m128(r, e); } @@ -52681,15 +52312,11 @@ mod tests { let src = _mm_set_ps(10., 11., 100., 110.); let a = _mm_set_ps(1., 2., 10., 20.); let b = _mm_set_ps(3., 4., 30., 40.); - let r = _mm_mask_add_round_ss(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_mask_add_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); let e = _mm_set_ps(1., 2., 10., 110.); assert_eq_m128(r, e); - let r = _mm_mask_add_round_ss( - src, - 0b11111111, - a, - b, - _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + let r = _mm_mask_add_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + src, 0b11111111, a, b, ); let e = _mm_set_ps(1., 2., 10., 60.); assert_eq_m128(r, e); @@ -52699,10 +52326,11 @@ mod tests { unsafe fn test_mm_maskz_add_round_ss() { let a = _mm_set_ps(1., 2., 10., 20.); let b = _mm_set_ps(3., 4., 30., 40.); - let r = _mm_maskz_add_round_ss(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_maskz_add_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); let e = _mm_set_ps(1., 2., 10., 0.); assert_eq_m128(r, e); - let r = _mm_maskz_add_round_ss(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = + _mm_maskz_add_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); let e = _mm_set_ps(1., 2., 10., 60.); assert_eq_m128(r, e); } @@ -52711,7 +52339,7 @@ mod 
tests { unsafe fn test_mm_add_round_sd() { let a = _mm_set_pd(1., 2.); let b = _mm_set_pd(3., 4.); - let r = _mm_add_round_sd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_add_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); let e = _mm_set_pd(1., 6.); assert_eq_m128d(r, e); } @@ -52721,15 +52349,11 @@ mod tests { let src = _mm_set_pd(10., 11.); let a = _mm_set_pd(1., 2.); let b = _mm_set_pd(3., 4.); - let r = _mm_mask_add_round_sd(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_mask_add_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); let e = _mm_set_pd(1., 11.); assert_eq_m128d(r, e); - let r = _mm_mask_add_round_sd( - src, - 0b11111111, - a, - b, - _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + let r = _mm_mask_add_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + src, 0b11111111, a, b, ); let e = _mm_set_pd(1., 6.); assert_eq_m128d(r, e); @@ -52739,10 +52363,11 @@ mod tests { unsafe fn test_mm_maskz_add_round_sd() { let a = _mm_set_pd(1., 2.); let b = _mm_set_pd(3., 4.); - let r = _mm_maskz_add_round_sd(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_maskz_add_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); let e = _mm_set_pd(1., 0.); assert_eq_m128d(r, e); - let r = _mm_maskz_add_round_sd(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = + _mm_maskz_add_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); let e = _mm_set_pd(1., 6.); assert_eq_m128d(r, e); } @@ -52751,7 +52376,7 @@ mod tests { unsafe fn test_mm_sub_round_ss() { let a = _mm_set_ps(1., 2., 10., 20.); let b = _mm_set_ps(3., 4., 30., 40.); - let r = _mm_sub_round_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_sub_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); let e = _mm_set_ps(1., 2., 10., -20.); assert_eq_m128(r, e); } @@ -52761,15 +52386,11 @@ mod tests { let src = _mm_set_ps(10., 11., 100., 110.); let a = _mm_set_ps(1., 
2., 10., 20.); let b = _mm_set_ps(3., 4., 30., 40.); - let r = _mm_mask_sub_round_ss(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_mask_sub_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); let e = _mm_set_ps(1., 2., 10., 110.); assert_eq_m128(r, e); - let r = _mm_mask_sub_round_ss( - src, - 0b11111111, - a, - b, - _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + let r = _mm_mask_sub_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + src, 0b11111111, a, b, ); let e = _mm_set_ps(1., 2., 10., -20.); assert_eq_m128(r, e); @@ -52779,10 +52400,11 @@ mod tests { unsafe fn test_mm_maskz_sub_round_ss() { let a = _mm_set_ps(1., 2., 10., 20.); let b = _mm_set_ps(3., 4., 30., 40.); - let r = _mm_maskz_sub_round_ss(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_maskz_sub_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); let e = _mm_set_ps(1., 2., 10., 0.); assert_eq_m128(r, e); - let r = _mm_maskz_sub_round_ss(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = + _mm_maskz_sub_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); let e = _mm_set_ps(1., 2., 10., -20.); assert_eq_m128(r, e); } @@ -52791,7 +52413,7 @@ mod tests { unsafe fn test_mm_sub_round_sd() { let a = _mm_set_pd(1., 2.); let b = _mm_set_pd(3., 4.); - let r = _mm_sub_round_sd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_sub_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); let e = _mm_set_pd(1., -2.); assert_eq_m128d(r, e); } @@ -52801,15 +52423,11 @@ mod tests { let src = _mm_set_pd(10., 11.); let a = _mm_set_pd(1., 2.); let b = _mm_set_pd(3., 4.); - let r = _mm_mask_sub_round_sd(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_mask_sub_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); let e = _mm_set_pd(1., 11.); assert_eq_m128d(r, e); - let r = _mm_mask_sub_round_sd( - src, - 0b11111111, - a, - b, - _MM_FROUND_TO_ZERO | 
_MM_FROUND_NO_EXC, + let r = _mm_mask_sub_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + src, 0b11111111, a, b, ); let e = _mm_set_pd(1., -2.); assert_eq_m128d(r, e); @@ -52819,10 +52437,11 @@ mod tests { unsafe fn test_mm_maskz_sub_round_sd() { let a = _mm_set_pd(1., 2.); let b = _mm_set_pd(3., 4.); - let r = _mm_maskz_sub_round_sd(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_maskz_sub_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); let e = _mm_set_pd(1., 0.); assert_eq_m128d(r, e); - let r = _mm_maskz_sub_round_sd(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = + _mm_maskz_sub_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); let e = _mm_set_pd(1., -2.); assert_eq_m128d(r, e); } @@ -52831,7 +52450,7 @@ mod tests { unsafe fn test_mm_mul_round_ss() { let a = _mm_set_ps(1., 2., 10., 20.); let b = _mm_set_ps(3., 4., 30., 40.); - let r = _mm_mul_round_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_mul_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); let e = _mm_set_ps(1., 2., 10., 800.); assert_eq_m128(r, e); } @@ -52841,15 +52460,11 @@ mod tests { let src = _mm_set_ps(10., 11., 100., 110.); let a = _mm_set_ps(1., 2., 10., 20.); let b = _mm_set_ps(3., 4., 30., 40.); - let r = _mm_mask_mul_round_ss(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_mask_mul_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); let e = _mm_set_ps(1., 2., 10., 110.); assert_eq_m128(r, e); - let r = _mm_mask_mul_round_ss( - src, - 0b11111111, - a, - b, - _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + let r = _mm_mask_mul_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + src, 0b11111111, a, b, ); let e = _mm_set_ps(1., 2., 10., 800.); assert_eq_m128(r, e); @@ -52859,10 +52474,11 @@ mod tests { unsafe fn test_mm_maskz_mul_round_ss() { let a = _mm_set_ps(1., 2., 10., 20.); let b = _mm_set_ps(3., 4., 30., 40.); - let r = 
_mm_maskz_mul_round_ss(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_maskz_mul_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); let e = _mm_set_ps(1., 2., 10., 0.); assert_eq_m128(r, e); - let r = _mm_maskz_mul_round_ss(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = + _mm_maskz_mul_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); let e = _mm_set_ps(1., 2., 10., 800.); assert_eq_m128(r, e); } @@ -52871,7 +52487,7 @@ mod tests { unsafe fn test_mm_mul_round_sd() { let a = _mm_set_pd(1., 2.); let b = _mm_set_pd(3., 4.); - let r = _mm_mul_round_sd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_mul_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); let e = _mm_set_pd(1., 8.); assert_eq_m128d(r, e); } @@ -52881,15 +52497,11 @@ mod tests { let src = _mm_set_pd(10., 11.); let a = _mm_set_pd(1., 2.); let b = _mm_set_pd(3., 4.); - let r = _mm_mask_mul_round_sd(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_mask_mul_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); let e = _mm_set_pd(1., 11.); assert_eq_m128d(r, e); - let r = _mm_mask_mul_round_sd( - src, - 0b11111111, - a, - b, - _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + let r = _mm_mask_mul_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + src, 0b11111111, a, b, ); let e = _mm_set_pd(1., 8.); assert_eq_m128d(r, e); @@ -52899,10 +52511,11 @@ mod tests { unsafe fn test_mm_maskz_mul_round_sd() { let a = _mm_set_pd(1., 2.); let b = _mm_set_pd(3., 4.); - let r = _mm_maskz_mul_round_sd(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_maskz_mul_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); let e = _mm_set_pd(1., 0.); assert_eq_m128d(r, e); - let r = _mm_maskz_mul_round_sd(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = + _mm_maskz_mul_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); let e = 
_mm_set_pd(1., 8.); assert_eq_m128d(r, e); } @@ -52911,7 +52524,7 @@ mod tests { unsafe fn test_mm_div_round_ss() { let a = _mm_set_ps(1., 2., 10., 20.); let b = _mm_set_ps(3., 4., 30., 40.); - let r = _mm_div_round_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_div_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); let e = _mm_set_ps(1., 2., 10., 0.5); assert_eq_m128(r, e); } @@ -52921,15 +52534,11 @@ mod tests { let src = _mm_set_ps(10., 11., 100., 110.); let a = _mm_set_ps(1., 2., 10., 20.); let b = _mm_set_ps(3., 4., 30., 40.); - let r = _mm_mask_div_round_ss(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_mask_div_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); let e = _mm_set_ps(1., 2., 10., 110.); assert_eq_m128(r, e); - let r = _mm_mask_div_round_ss( - src, - 0b11111111, - a, - b, - _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + let r = _mm_mask_div_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + src, 0b11111111, a, b, ); let e = _mm_set_ps(1., 2., 10., 0.5); assert_eq_m128(r, e); @@ -52939,10 +52548,11 @@ mod tests { unsafe fn test_mm_maskz_div_round_ss() { let a = _mm_set_ps(1., 2., 10., 20.); let b = _mm_set_ps(3., 4., 30., 40.); - let r = _mm_maskz_div_round_ss(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_maskz_div_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); let e = _mm_set_ps(1., 2., 10., 0.); assert_eq_m128(r, e); - let r = _mm_maskz_div_round_ss(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = + _mm_maskz_div_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); let e = _mm_set_ps(1., 2., 10., 0.5); assert_eq_m128(r, e); } @@ -52951,7 +52561,7 @@ mod tests { unsafe fn test_mm_div_round_sd() { let a = _mm_set_pd(1., 2.); let b = _mm_set_pd(3., 4.); - let r = _mm_div_round_sd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_div_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, 
b); let e = _mm_set_pd(1., 0.5); assert_eq_m128d(r, e); } @@ -52961,15 +52571,11 @@ mod tests { let src = _mm_set_pd(10., 11.); let a = _mm_set_pd(1., 2.); let b = _mm_set_pd(3., 4.); - let r = _mm_mask_div_round_sd(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_mask_div_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); let e = _mm_set_pd(1., 11.); assert_eq_m128d(r, e); - let r = _mm_mask_div_round_sd( - src, - 0b11111111, - a, - b, - _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + let r = _mm_mask_div_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + src, 0b11111111, a, b, ); let e = _mm_set_pd(1., 0.5); assert_eq_m128d(r, e); @@ -52979,10 +52585,11 @@ mod tests { unsafe fn test_mm_maskz_div_round_sd() { let a = _mm_set_pd(1., 2.); let b = _mm_set_pd(3., 4.); - let r = _mm_maskz_div_round_sd(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_maskz_div_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); let e = _mm_set_pd(1., 0.); assert_eq_m128d(r, e); - let r = _mm_maskz_div_round_sd(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = + _mm_maskz_div_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); let e = _mm_set_pd(1., 0.5); assert_eq_m128d(r, e); } @@ -53123,7 +52730,7 @@ mod tests { unsafe fn test_mm_sqrt_round_ss() { let a = _mm_set_ps(1., 2., 10., 20.); let b = _mm_set_ps(3., 4., 30., 4.); - let r = _mm_sqrt_round_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_sqrt_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); let e = _mm_set_ps(1., 2., 10., 2.); assert_eq_m128(r, e); } @@ -53133,15 +52740,11 @@ mod tests { let src = _mm_set_ps(10., 11., 100., 110.); let a = _mm_set_ps(1., 2., 10., 20.); let b = _mm_set_ps(3., 4., 30., 4.); - let r = _mm_mask_sqrt_round_ss(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_mask_sqrt_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); let e = 
_mm_set_ps(1., 2., 10., 110.); assert_eq_m128(r, e); - let r = _mm_mask_sqrt_round_ss( - src, - 0b11111111, - a, - b, - _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + let r = _mm_mask_sqrt_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + src, 0b11111111, a, b, ); let e = _mm_set_ps(1., 2., 10., 2.); assert_eq_m128(r, e); @@ -53151,10 +52754,11 @@ mod tests { unsafe fn test_mm_maskz_sqrt_round_ss() { let a = _mm_set_ps(1., 2., 10., 20.); let b = _mm_set_ps(3., 4., 30., 4.); - let r = _mm_maskz_sqrt_round_ss(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_maskz_sqrt_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); let e = _mm_set_ps(1., 2., 10., 0.); assert_eq_m128(r, e); - let r = _mm_maskz_sqrt_round_ss(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = + _mm_maskz_sqrt_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); let e = _mm_set_ps(1., 2., 10., 2.); assert_eq_m128(r, e); } @@ -53163,7 +52767,7 @@ mod tests { unsafe fn test_mm_sqrt_round_sd() { let a = _mm_set_pd(1., 2.); let b = _mm_set_pd(3., 4.); - let r = _mm_sqrt_round_sd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_sqrt_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); let e = _mm_set_pd(1., 2.); assert_eq_m128d(r, e); } @@ -53173,15 +52777,11 @@ mod tests { let src = _mm_set_pd(10., 11.); let a = _mm_set_pd(1., 2.); let b = _mm_set_pd(3., 4.); - let r = _mm_mask_sqrt_round_sd(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_mask_sqrt_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); let e = _mm_set_pd(1., 11.); assert_eq_m128d(r, e); - let r = _mm_mask_sqrt_round_sd( - src, - 0b11111111, - a, - b, - _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + let r = _mm_mask_sqrt_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + src, 0b11111111, a, b, ); let e = _mm_set_pd(1., 2.); assert_eq_m128d(r, e); @@ -53191,10 +52791,11 @@ mod tests { unsafe fn 
test_mm_maskz_sqrt_round_sd() { let a = _mm_set_pd(1., 2.); let b = _mm_set_pd(3., 4.); - let r = _mm_maskz_sqrt_round_sd(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_maskz_sqrt_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); let e = _mm_set_pd(1., 0.); assert_eq_m128d(r, e); - let r = _mm_maskz_sqrt_round_sd(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = + _mm_maskz_sqrt_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); let e = _mm_set_pd(1., 2.); assert_eq_m128d(r, e); } @@ -53473,7 +53074,7 @@ mod tests { unsafe fn test_mm_scalef_round_ss() { let a = _mm_set1_ps(1.); let b = _mm_set1_ps(3.); - let r = _mm_scalef_round_ss(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_scalef_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); let e = _mm_set_ps(1., 1., 1., 8.); assert_eq_m128(r, e); } @@ -53482,15 +53083,13 @@ mod tests { unsafe fn test_mm_mask_scalef_round_ss() { let a = _mm_set1_ps(1.); let b = _mm_set1_ps(3.); - let r = _mm_mask_scalef_round_ss(a, 0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_mask_scalef_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, a, b, + ); let e = _mm_set_ps(1., 1., 1., 1.); assert_eq_m128(r, e); - let r = _mm_mask_scalef_round_ss( - a, - 0b11111111, - a, - b, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_mask_scalef_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b11111111, a, b, ); let e = _mm_set_ps(1., 1., 1., 8.); assert_eq_m128(r, e); @@ -53500,14 +53099,12 @@ mod tests { unsafe fn test_mm_maskz_scalef_round_ss() { let a = _mm_set1_ps(1.); let b = _mm_set1_ps(3.); - let r = _mm_maskz_scalef_round_ss(0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = + _mm_maskz_scalef_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); let e = _mm_set_ps(1., 1., 1., 0.); assert_eq_m128(r, e); - let r = 
_mm_maskz_scalef_round_ss( - 0b11111111, - a, - b, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_maskz_scalef_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111, a, b, ); let e = _mm_set_ps(1., 1., 1., 8.); assert_eq_m128(r, e); @@ -53517,7 +53114,7 @@ mod tests { unsafe fn test_mm_scalef_round_sd() { let a = _mm_set1_pd(1.); let b = _mm_set1_pd(3.); - let r = _mm_scalef_round_sd(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_scalef_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); let e = _mm_set_pd(1., 8.); assert_eq_m128d(r, e); } @@ -53526,15 +53123,13 @@ mod tests { unsafe fn test_mm_mask_scalef_round_sd() { let a = _mm_set1_pd(1.); let b = _mm_set1_pd(3.); - let r = _mm_mask_scalef_round_sd(a, 0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_mask_scalef_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, a, b, + ); let e = _mm_set_pd(1., 1.); assert_eq_m128d(r, e); - let r = _mm_mask_scalef_round_sd( - a, - 0b11111111, - a, - b, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_mask_scalef_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b11111111, a, b, ); let e = _mm_set_pd(1., 8.); assert_eq_m128d(r, e); @@ -53544,14 +53139,12 @@ mod tests { unsafe fn test_mm_maskz_scalef_round_sd() { let a = _mm_set1_pd(1.); let b = _mm_set1_pd(3.); - let r = _mm_maskz_scalef_round_sd(0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = + _mm_maskz_scalef_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); let e = _mm_set_pd(1., 0.); assert_eq_m128d(r, e); - let r = _mm_maskz_scalef_round_sd( - 0b11111111, - a, - b, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_maskz_scalef_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111, a, b, ); let e = _mm_set_pd(1., 8.); assert_eq_m128d(r, e); @@ -53562,7 +53155,7 @@ mod tests { let a = _mm_set1_ps(1.); 
let b = _mm_set1_ps(2.); let c = _mm_set1_ps(3.); - let r = _mm_fmadd_round_ss(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_fmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); let e = _mm_set_ps(1., 1., 1., 5.); assert_eq_m128(r, e); } @@ -53572,14 +53165,12 @@ mod tests { let a = _mm_set1_ps(1.); let b = _mm_set1_ps(2.); let c = _mm_set1_ps(3.); - let r = _mm_mask_fmadd_round_ss(a, 0, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_mask_fmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); assert_eq_m128(r, a); - let r = _mm_mask_fmadd_round_ss( - a, - 0b11111111, - b, - c, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_mask_fmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b11111111, b, c, ); let e = _mm_set_ps(1., 1., 1., 5.); assert_eq_m128(r, e); @@ -53590,15 +53181,13 @@ mod tests { let a = _mm_set1_ps(1.); let b = _mm_set1_ps(2.); let c = _mm_set1_ps(3.); - let r = _mm_maskz_fmadd_round_ss(0, a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_maskz_fmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); let e = _mm_set_ps(1., 1., 1., 0.); assert_eq_m128(r, e); - let r = _mm_maskz_fmadd_round_ss( - 0b11111111, - a, - b, - c, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_maskz_fmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111, a, b, c, ); let e = _mm_set_ps(1., 1., 1., 5.); assert_eq_m128(r, e); @@ -53609,14 +53198,12 @@ mod tests { let a = _mm_set1_ps(1.); let b = _mm_set1_ps(2.); let c = _mm_set1_ps(3.); - let r = _mm_mask3_fmadd_round_ss(a, b, c, 0, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_mask3_fmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); assert_eq_m128(r, c); - let r = _mm_mask3_fmadd_round_ss( - a, - b, - c, - 0b11111111, - 
_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_mask3_fmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0b11111111, ); let e = _mm_set_ps(3., 3., 3., 5.); assert_eq_m128(r, e); @@ -53627,7 +53214,7 @@ mod tests { let a = _mm_set1_pd(1.); let b = _mm_set1_pd(2.); let c = _mm_set1_pd(3.); - let r = _mm_fmadd_round_sd(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_fmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); let e = _mm_set_pd(1., 5.); assert_eq_m128d(r, e); } @@ -53637,14 +53224,12 @@ mod tests { let a = _mm_set1_pd(1.); let b = _mm_set1_pd(2.); let c = _mm_set1_pd(3.); - let r = _mm_mask_fmadd_round_sd(a, 0, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_mask_fmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); assert_eq_m128d(r, a); - let r = _mm_mask_fmadd_round_sd( - a, - 0b11111111, - b, - c, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_mask_fmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b11111111, b, c, ); let e = _mm_set_pd(1., 5.); assert_eq_m128d(r, e); @@ -53655,15 +53240,13 @@ mod tests { let a = _mm_set1_pd(1.); let b = _mm_set1_pd(2.); let c = _mm_set1_pd(3.); - let r = _mm_maskz_fmadd_round_sd(0, a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_maskz_fmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); let e = _mm_set_pd(1., 0.); assert_eq_m128d(r, e); - let r = _mm_maskz_fmadd_round_sd( - 0b11111111, - a, - b, - c, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_maskz_fmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111, a, b, c, ); let e = _mm_set_pd(1., 5.); assert_eq_m128d(r, e); @@ -53674,14 +53257,12 @@ mod tests { let a = _mm_set1_pd(1.); let b = _mm_set1_pd(2.); let c = _mm_set1_pd(3.); - let r = _mm_mask3_fmadd_round_sd(a, b, c, 0, 
_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_mask3_fmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); assert_eq_m128d(r, c); - let r = _mm_mask3_fmadd_round_sd( - a, - b, - c, - 0b11111111, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_mask3_fmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0b11111111, ); let e = _mm_set_pd(3., 5.); assert_eq_m128d(r, e); @@ -53692,7 +53273,7 @@ mod tests { let a = _mm_set1_ps(1.); let b = _mm_set1_ps(2.); let c = _mm_set1_ps(3.); - let r = _mm_fmsub_round_ss(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_fmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); let e = _mm_set_ps(1., 1., 1., -1.); assert_eq_m128(r, e); } @@ -53702,14 +53283,12 @@ mod tests { let a = _mm_set1_ps(1.); let b = _mm_set1_ps(2.); let c = _mm_set1_ps(3.); - let r = _mm_mask_fmsub_round_ss(a, 0, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_mask_fmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); assert_eq_m128(r, a); - let r = _mm_mask_fmsub_round_ss( - a, - 0b11111111, - b, - c, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_mask_fmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b11111111, b, c, ); let e = _mm_set_ps(1., 1., 1., -1.); assert_eq_m128(r, e); @@ -53720,15 +53299,13 @@ mod tests { let a = _mm_set1_ps(1.); let b = _mm_set1_ps(2.); let c = _mm_set1_ps(3.); - let r = _mm_maskz_fmsub_round_ss(0, a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_maskz_fmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); let e = _mm_set_ps(1., 1., 1., 0.); assert_eq_m128(r, e); - let r = _mm_maskz_fmsub_round_ss( - 0b11111111, - a, - b, - c, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_maskz_fmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | 
_MM_FROUND_NO_EXC }>( + 0b11111111, a, b, c, ); let e = _mm_set_ps(1., 1., 1., -1.); assert_eq_m128(r, e); @@ -53739,14 +53316,12 @@ mod tests { let a = _mm_set1_ps(1.); let b = _mm_set1_ps(2.); let c = _mm_set1_ps(3.); - let r = _mm_mask3_fmsub_round_ss(a, b, c, 0, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_mask3_fmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); assert_eq_m128(r, c); - let r = _mm_mask3_fmsub_round_ss( - a, - b, - c, - 0b11111111, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_mask3_fmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0b11111111, ); let e = _mm_set_ps(3., 3., 3., -1.); assert_eq_m128(r, e); @@ -53757,7 +53332,7 @@ mod tests { let a = _mm_set1_pd(1.); let b = _mm_set1_pd(2.); let c = _mm_set1_pd(3.); - let r = _mm_fmsub_round_sd(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_fmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); let e = _mm_set_pd(1., -1.); assert_eq_m128d(r, e); } @@ -53767,14 +53342,12 @@ mod tests { let a = _mm_set1_pd(1.); let b = _mm_set1_pd(2.); let c = _mm_set1_pd(3.); - let r = _mm_mask_fmsub_round_sd(a, 0, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_mask_fmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); assert_eq_m128d(r, a); - let r = _mm_mask_fmsub_round_sd( - a, - 0b11111111, - b, - c, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_mask_fmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b11111111, b, c, ); let e = _mm_set_pd(1., -1.); assert_eq_m128d(r, e); @@ -53785,15 +53358,13 @@ mod tests { let a = _mm_set1_pd(1.); let b = _mm_set1_pd(2.); let c = _mm_set1_pd(3.); - let r = _mm_maskz_fmsub_round_sd(0, a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_maskz_fmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, 
c, + ); let e = _mm_set_pd(1., 0.); assert_eq_m128d(r, e); - let r = _mm_maskz_fmsub_round_sd( - 0b11111111, - a, - b, - c, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_maskz_fmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111, a, b, c, ); let e = _mm_set_pd(1., -1.); assert_eq_m128d(r, e); @@ -53804,14 +53375,12 @@ mod tests { let a = _mm_set1_pd(1.); let b = _mm_set1_pd(2.); let c = _mm_set1_pd(3.); - let r = _mm_mask3_fmsub_round_sd(a, b, c, 0, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_mask3_fmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); assert_eq_m128d(r, c); - let r = _mm_mask3_fmsub_round_sd( - a, - b, - c, - 0b11111111, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_mask3_fmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0b11111111, ); let e = _mm_set_pd(3., -1.); assert_eq_m128d(r, e); @@ -53822,7 +53391,7 @@ mod tests { let a = _mm_set1_ps(1.); let b = _mm_set1_ps(2.); let c = _mm_set1_ps(3.); - let r = _mm_fnmadd_round_ss(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_fnmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); let e = _mm_set_ps(1., 1., 1., 1.); assert_eq_m128(r, e); } @@ -53832,14 +53401,12 @@ mod tests { let a = _mm_set1_ps(1.); let b = _mm_set1_ps(2.); let c = _mm_set1_ps(3.); - let r = _mm_mask_fnmadd_round_ss(a, 0, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_mask_fnmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); assert_eq_m128(r, a); - let r = _mm_mask_fnmadd_round_ss( - a, - 0b11111111, - b, - c, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_mask_fnmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b11111111, b, c, ); let e = _mm_set_ps(1., 1., 1., 1.); assert_eq_m128(r, e); @@ -53850,16 +53417,13 @@ mod tests { let a = _mm_set1_ps(1.); 
let b = _mm_set1_ps(2.); let c = _mm_set1_ps(3.); - let r = - _mm_maskz_fnmadd_round_ss(0, a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_maskz_fnmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); let e = _mm_set_ps(1., 1., 1., 0.); assert_eq_m128(r, e); - let r = _mm_maskz_fnmadd_round_ss( - 0b11111111, - a, - b, - c, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_maskz_fnmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111, a, b, c, ); let e = _mm_set_ps(1., 1., 1., 1.); assert_eq_m128(r, e); @@ -53870,15 +53434,12 @@ mod tests { let a = _mm_set1_ps(1.); let b = _mm_set1_ps(2.); let c = _mm_set1_ps(3.); - let r = - _mm_mask3_fnmadd_round_ss(a, b, c, 0, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_mask3_fnmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); assert_eq_m128(r, c); - let r = _mm_mask3_fnmadd_round_ss( - a, - b, - c, - 0b11111111, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_mask3_fnmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0b11111111, ); let e = _mm_set_ps(3., 3., 3., 1.); assert_eq_m128(r, e); @@ -53889,7 +53450,7 @@ mod tests { let a = _mm_set1_pd(1.); let b = _mm_set1_pd(2.); let c = _mm_set1_pd(3.); - let r = _mm_fnmadd_round_sd(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_fnmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); let e = _mm_set_pd(1., 1.); assert_eq_m128d(r, e); } @@ -53899,14 +53460,12 @@ mod tests { let a = _mm_set1_pd(1.); let b = _mm_set1_pd(2.); let c = _mm_set1_pd(3.); - let r = _mm_mask_fnmadd_round_sd(a, 0, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_mask_fnmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); assert_eq_m128d(r, a); - let r = _mm_mask_fnmadd_round_sd( - a, - 0b11111111, - b, - c, - 
_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_mask_fnmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b11111111, b, c, ); let e = _mm_set_pd(1., 1.); assert_eq_m128d(r, e); @@ -53917,16 +53476,13 @@ mod tests { let a = _mm_set1_pd(1.); let b = _mm_set1_pd(2.); let c = _mm_set1_pd(3.); - let r = - _mm_maskz_fnmadd_round_sd(0, a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_maskz_fnmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); let e = _mm_set_pd(1., 0.); assert_eq_m128d(r, e); - let r = _mm_maskz_fnmadd_round_sd( - 0b11111111, - a, - b, - c, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_maskz_fnmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111, a, b, c, ); let e = _mm_set_pd(1., 1.); assert_eq_m128d(r, e); @@ -53937,15 +53493,12 @@ mod tests { let a = _mm_set1_pd(1.); let b = _mm_set1_pd(2.); let c = _mm_set1_pd(3.); - let r = - _mm_mask3_fnmadd_round_sd(a, b, c, 0, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_mask3_fnmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); assert_eq_m128d(r, c); - let r = _mm_mask3_fnmadd_round_sd( - a, - b, - c, - 0b11111111, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_mask3_fnmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0b11111111, ); let e = _mm_set_pd(3., 1.); assert_eq_m128d(r, e); @@ -53956,7 +53509,7 @@ mod tests { let a = _mm_set1_ps(1.); let b = _mm_set1_ps(2.); let c = _mm_set1_ps(3.); - let r = _mm_fnmsub_round_ss(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_fnmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); let e = _mm_set_ps(1., 1., 1., -5.); assert_eq_m128(r, e); } @@ -53966,14 +53519,12 @@ mod tests { let a = _mm_set1_ps(1.); let b = _mm_set1_ps(2.); let c = _mm_set1_ps(3.); - let r = _mm_mask_fnmsub_round_ss(a, 0, b, 
c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_mask_fnmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); assert_eq_m128(r, a); - let r = _mm_mask_fnmsub_round_ss( - a, - 0b11111111, - b, - c, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_mask_fnmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b11111111, b, c, ); let e = _mm_set_ps(1., 1., 1., -5.); assert_eq_m128(r, e); @@ -53984,16 +53535,13 @@ mod tests { let a = _mm_set1_ps(1.); let b = _mm_set1_ps(2.); let c = _mm_set1_ps(3.); - let r = - _mm_maskz_fnmsub_round_ss(0, a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_maskz_fnmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); let e = _mm_set_ps(1., 1., 1., 0.); assert_eq_m128(r, e); - let r = _mm_maskz_fnmsub_round_ss( - 0b11111111, - a, - b, - c, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_maskz_fnmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111, a, b, c, ); let e = _mm_set_ps(1., 1., 1., -5.); assert_eq_m128(r, e); @@ -54004,15 +53552,12 @@ mod tests { let a = _mm_set1_ps(1.); let b = _mm_set1_ps(2.); let c = _mm_set1_ps(3.); - let r = - _mm_mask3_fnmsub_round_ss(a, b, c, 0, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_mask3_fnmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); assert_eq_m128(r, c); - let r = _mm_mask3_fnmsub_round_ss( - a, - b, - c, - 0b11111111, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_mask3_fnmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0b11111111, ); let e = _mm_set_ps(3., 3., 3., -5.); assert_eq_m128(r, e); @@ -54023,7 +53568,7 @@ mod tests { let a = _mm_set1_pd(1.); let b = _mm_set1_pd(2.); let c = _mm_set1_pd(3.); - let r = _mm_fnmsub_round_sd(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = 
_mm_fnmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); let e = _mm_set_pd(1., -5.); assert_eq_m128d(r, e); } @@ -54033,14 +53578,12 @@ mod tests { let a = _mm_set1_pd(1.); let b = _mm_set1_pd(2.); let c = _mm_set1_pd(3.); - let r = _mm_mask_fnmsub_round_sd(a, 0, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_mask_fnmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); assert_eq_m128d(r, a); - let r = _mm_mask_fnmsub_round_sd( - a, - 0b11111111, - b, - c, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_mask_fnmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b11111111, b, c, ); let e = _mm_set_pd(1., -5.); assert_eq_m128d(r, e); @@ -54051,16 +53594,13 @@ mod tests { let a = _mm_set1_pd(1.); let b = _mm_set1_pd(2.); let c = _mm_set1_pd(3.); - let r = - _mm_maskz_fnmsub_round_sd(0, a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_maskz_fnmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); let e = _mm_set_pd(1., 0.); assert_eq_m128d(r, e); - let r = _mm_maskz_fnmsub_round_sd( - 0b11111111, - a, - b, - c, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_maskz_fnmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111, a, b, c, ); let e = _mm_set_pd(1., -5.); assert_eq_m128d(r, e); @@ -54071,15 +53611,12 @@ mod tests { let a = _mm_set1_pd(1.); let b = _mm_set1_pd(2.); let c = _mm_set1_pd(3.); - let r = - _mm_mask3_fnmsub_round_sd(a, b, c, 0, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_mask3_fnmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); assert_eq_m128d(r, c); - let r = _mm_mask3_fnmsub_round_sd( - a, - b, - c, - 0b11111111, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_mask3_fnmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0b11111111, ); let e = 
_mm_set_pd(3., -5.); assert_eq_m128d(r, e); @@ -54299,7 +53836,7 @@ mod tests { unsafe fn test_mm_cvt_roundsd_ss() { let a = _mm_set_ps(0., -0.5, 1., -1.5); let b = _mm_set_pd(6., -7.5); - let r = _mm_cvt_roundsd_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_cvt_roundsd_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); let e = _mm_set_ps(0., -0.5, 1., -7.5); assert_eq_m128(r, e); } @@ -54308,10 +53845,11 @@ mod tests { unsafe fn test_mm_mask_cvt_roundsd_ss() { let a = _mm_set_ps(0., -0.5, 1., -1.5); let b = _mm_set_pd(6., -7.5); - let r = _mm_mask_cvt_roundsd_ss(a, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_mask_cvt_roundsd_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, 0, a, b); assert_eq_m128(r, a); - let r = - _mm_mask_cvt_roundsd_ss(a, 0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_mask_cvt_roundsd_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + a, 0b11111111, a, b, + ); let e = _mm_set_ps(0., -0.5, 1., -7.5); assert_eq_m128(r, e); } @@ -54320,10 +53858,12 @@ mod tests { unsafe fn test_mm_maskz_cvt_roundsd_ss() { let a = _mm_set_ps(0., -0.5, 1., -1.5); let b = _mm_set_pd(6., -7.5); - let r = _mm_maskz_cvt_roundsd_ss(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_maskz_cvt_roundsd_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); let e = _mm_set_ps(0., -0.5, 1., 0.); assert_eq_m128(r, e); - let r = _mm_maskz_cvt_roundsd_ss(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_maskz_cvt_roundsd_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + 0b11111111, a, b, + ); let e = _mm_set_ps(0., -0.5, 1., -7.5); assert_eq_m128(r, e); } @@ -54331,7 +53871,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvt_roundss_si32() { let a = _mm_set_ps(0., -0.5, 1., -1.5); - let r = _mm_cvt_roundss_si32(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_cvt_roundss_si32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); let e: i32 
= -1; assert_eq!(r, e); } @@ -54339,7 +53879,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvt_roundss_i32() { let a = _mm_set_ps(0., -0.5, 1., -1.5); - let r = _mm_cvt_roundss_i32(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_cvt_roundss_i32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); let e: i32 = -1; assert_eq!(r, e); } @@ -54347,7 +53887,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvt_roundss_u32() { let a = _mm_set_ps(0., -0.5, 1., -1.5); - let r = _mm_cvt_roundss_u32(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_cvt_roundss_u32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); let e: u32 = u32::MAX; assert_eq!(r, e); } @@ -54371,7 +53911,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvt_roundsd_si32() { let a = _mm_set_pd(1., -1.5); - let r = _mm_cvt_roundsd_si32(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_cvt_roundsd_si32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); let e: i32 = -1; assert_eq!(r, e); } @@ -54379,7 +53919,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvt_roundsd_i32() { let a = _mm_set_pd(1., -1.5); - let r = _mm_cvt_roundsd_i32(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_cvt_roundsd_i32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); let e: i32 = -1; assert_eq!(r, e); } @@ -54387,7 +53927,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvt_roundsd_u32() { let a = _mm_set_pd(1., -1.5); - let r = _mm_cvt_roundsd_u32(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_cvt_roundsd_u32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); let e: u32 = u32::MAX; assert_eq!(r, e); } @@ -54412,7 +53952,7 @@ mod tests { unsafe fn test_mm_cvt_roundi32_ss() { let a = _mm_set_ps(0., -0.5, 1., -1.5); let b: i32 = 9; - let r = _mm_cvt_roundi32_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_cvt_roundi32_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); let e = 
_mm_set_ps(0., -0.5, 1., 9.); assert_eq_m128(r, e); } @@ -54421,7 +53961,7 @@ mod tests { unsafe fn test_mm_cvt_roundsi32_ss() { let a = _mm_set_ps(0., -0.5, 1., -1.5); let b: i32 = 9; - let r = _mm_cvt_roundsi32_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_cvt_roundsi32_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); let e = _mm_set_ps(0., -0.5, 1., 9.); assert_eq_m128(r, e); } @@ -54430,7 +53970,7 @@ mod tests { unsafe fn test_mm_cvt_roundu32_ss() { let a = _mm_set_ps(0., -0.5, 1., -1.5); let b: u32 = 9; - let r = _mm_cvt_roundu32_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_cvt_roundu32_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); let e = _mm_set_ps(0., -0.5, 1., 9.); assert_eq_m128(r, e); } diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index caaf3e6d73..ae6202bc73 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -6288,7 +6288,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvt_roundpd_ps() { let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); - let r = _mm512_cvt_roundpd_ps(a, _MM_FROUND_CUR_DIRECTION); + let r = _mm512_cvt_roundpd_ps::<_MM_FROUND_CUR_DIRECTION>(a); let e = _mm256_setr_ps(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); assert_eq_m256(r, e); } @@ -6297,9 +6297,9 @@ mod tests { unsafe fn test_mm512_mask_cvt_roundpd_ps() { let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); let src = _mm256_set1_ps(0.); - let r = _mm512_mask_cvt_roundpd_ps(src, 0, a, _MM_FROUND_CUR_DIRECTION); + let r = _mm512_mask_cvt_roundpd_ps::<_MM_FROUND_CUR_DIRECTION>(src, 0, a); assert_eq_m256(r, src); - let r = _mm512_mask_cvt_roundpd_ps(src, 0b00001111, a, _MM_FROUND_CUR_DIRECTION); + let r = _mm512_mask_cvt_roundpd_ps::<_MM_FROUND_CUR_DIRECTION>(src, 0b00001111, a); let e = _mm256_setr_ps(0., -1.5, 2., -3.5, 0., 0., 0., 0.); assert_eq_m256(r, e); } @@ -6307,9 +6307,9 @@ mod tests 
{ #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_cvt_roundpd_ps() { let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); - let r = _mm512_maskz_cvt_roundpd_ps(0, a, _MM_FROUND_CUR_DIRECTION); + let r = _mm512_maskz_cvt_roundpd_ps::<_MM_FROUND_CUR_DIRECTION>(0, a); assert_eq_m256(r, _mm256_setzero_ps()); - let r = _mm512_maskz_cvt_roundpd_ps(0b00001111, a, _MM_FROUND_CUR_DIRECTION); + let r = _mm512_maskz_cvt_roundpd_ps::<_MM_FROUND_CUR_DIRECTION>(0b00001111, a); let e = _mm256_setr_ps(0., -1.5, 2., -3.5, 0., 0., 0., 0.); assert_eq_m256(r, e); } @@ -6317,7 +6317,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvt_roundpd_epi32() { let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); - let r = _mm512_cvt_roundpd_epi32(a, _MM_FROUND_CUR_DIRECTION); + let r = _mm512_cvt_roundpd_epi32::<_MM_FROUND_CUR_DIRECTION>(a); let e = _mm256_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8); assert_eq_m256i(r, e); } @@ -6326,9 +6326,9 @@ mod tests { unsafe fn test_mm512_mask_cvt_roundpd_epi32() { let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); let src = _mm256_set1_epi32(0); - let r = _mm512_mask_cvt_roundpd_epi32(src, 0, a, _MM_FROUND_CUR_DIRECTION); + let r = _mm512_mask_cvt_roundpd_epi32::<_MM_FROUND_CUR_DIRECTION>(src, 0, a); assert_eq_m256i(r, src); - let r = _mm512_mask_cvt_roundpd_epi32(src, 0b00001111, a, _MM_FROUND_CUR_DIRECTION); + let r = _mm512_mask_cvt_roundpd_epi32::<_MM_FROUND_CUR_DIRECTION>(src, 0b00001111, a); let e = _mm256_setr_epi32(0, -2, 2, -4, 0, 0, 0, 0); assert_eq_m256i(r, e); } @@ -6336,9 +6336,9 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_cvt_roundpd_epi32() { let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); - let r = _mm512_maskz_cvt_roundpd_epi32(0, a, _MM_FROUND_CUR_DIRECTION); + let r = _mm512_maskz_cvt_roundpd_epi32::<_MM_FROUND_CUR_DIRECTION>(0, a); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = 
_mm512_maskz_cvt_roundpd_epi32(0b00001111, a, _MM_FROUND_CUR_DIRECTION); + let r = _mm512_maskz_cvt_roundpd_epi32::<_MM_FROUND_CUR_DIRECTION>(0b00001111, a); let e = _mm256_setr_epi32(0, -2, 2, -4, 0, 0, 0, 0); assert_eq_m256i(r, e); } @@ -6346,7 +6346,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvt_roundpd_epu32() { let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); - let r = _mm512_cvt_roundpd_epu32(a, _MM_FROUND_CUR_DIRECTION); + let r = _mm512_cvt_roundpd_epu32::<_MM_FROUND_CUR_DIRECTION>(a); let e = _mm256_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1); assert_eq_m256i(r, e); } @@ -6355,9 +6355,9 @@ mod tests { unsafe fn test_mm512_mask_cvt_roundpd_epu32() { let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); let src = _mm256_set1_epi32(0); - let r = _mm512_mask_cvt_roundpd_epu32(src, 0, a, _MM_FROUND_CUR_DIRECTION); + let r = _mm512_mask_cvt_roundpd_epu32::<_MM_FROUND_CUR_DIRECTION>(src, 0, a); assert_eq_m256i(r, src); - let r = _mm512_mask_cvt_roundpd_epu32(src, 0b00001111, a, _MM_FROUND_CUR_DIRECTION); + let r = _mm512_mask_cvt_roundpd_epu32::<_MM_FROUND_CUR_DIRECTION>(src, 0b00001111, a); let e = _mm256_setr_epi32(0, -1, 2, -1, 0, 0, 0, 0); assert_eq_m256i(r, e); } @@ -6365,9 +6365,9 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_cvt_roundpd_epu32() { let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); - let r = _mm512_maskz_cvt_roundpd_epu32(0, a, _MM_FROUND_CUR_DIRECTION); + let r = _mm512_maskz_cvt_roundpd_epu32::<_MM_FROUND_CUR_DIRECTION>(0, a); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm512_maskz_cvt_roundpd_epu32(0b00001111, a, _MM_FROUND_CUR_DIRECTION); + let r = _mm512_maskz_cvt_roundpd_epu32::<_MM_FROUND_CUR_DIRECTION>(0b00001111, a); let e = _mm256_setr_epi32(0, -1, 2, -1, 0, 0, 0, 0); assert_eq_m256i(r, e); } From a4a01fec8e75f04542b9158d69316c7df73bd3c5 Mon Sep 17 00:00:00 2001 From: jironglin Date: Wed, 3 Mar 2021 15:46:38 +0000 Subject: [PATCH 
002/123] cvt_roundps_pd; cvt_roundps_ph; cvt_roundph_ps; cvtps_ph; cvtt_roundps,pd_epi32,epu32; mm_max,min_round_ss,sd; mm_getexp_ss,sd; mm_cvt_roundss_sd; cvt_roundss_si32,i32,u32; mm_cvtt_roundsd_si32,i32,u32 --- crates/core_arch/src/x86/avx512f.rs | 905 ++++++++++--------------- crates/core_arch/src/x86_64/avx512f.rs | 30 +- 2 files changed, 375 insertions(+), 560 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 7911157eb2..bcd826d700 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -13533,17 +13533,13 @@ pub unsafe fn _mm512_maskz_cvt_roundps_epu32( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundps_pd&expand=1347) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtps2pd, sae = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm512_cvt_roundps_pd(a: __m256, sae: i32) -> __m512d { +#[cfg_attr(test, assert_instr(vcvtps2pd, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_cvt_roundps_pd(a: __m256) -> __m512d { + static_assert_sae!(SAE); let a = a.as_f32x8(); let zero = _mm512_setzero_pd().as_f64x8(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtps2pd(a, zero, 0b11111111, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtps2pd(a, zero, 0b11111111, SAE); transmute(r) } @@ -13553,22 +13549,17 @@ pub unsafe fn _mm512_cvt_roundps_pd(a: __m256, sae: i32) -> __m512d { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundps_epi32&expand=1336) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtps2pd, sae = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_cvt_roundps_pd( +#[cfg_attr(test, assert_instr(vcvtps2pd, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_cvt_roundps_pd( src: __m512d, k: __mmask8, a: __m256, - sae: i32, ) -> __m512d { + static_assert_sae!(SAE); let a = a.as_f32x8(); let src = src.as_f64x8(); - macro_rules! call { - ($imm4:expr) => { - vcvtps2pd(a, src, k, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtps2pd(a, src, k, SAE); transmute(r) } @@ -13578,17 +13569,13 @@ pub unsafe fn _mm512_mask_cvt_roundps_pd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundps_epi32&expand=1337) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtps2pd, sae = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_cvt_roundps_pd(k: __mmask8, a: __m256, sae: i32) -> __m512d { +#[cfg_attr(test, assert_instr(vcvtps2pd, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_cvt_roundps_pd(k: __mmask8, a: __m256) -> __m512d { + static_assert_sae!(SAE); let a = a.as_f32x8(); let zero = _mm512_setzero_pd().as_f64x8(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtps2pd(a, zero, k, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtps2pd(a, zero, k, SAE); transmute(r) } @@ -13956,17 +13943,13 @@ pub unsafe fn _mm512_maskz_cvt_roundepu32_ps( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundps_ph&expand=1354) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtps2ph, sae = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm512_cvt_roundps_ph(a: __m512, sae: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vcvtps2ph, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_cvt_roundps_ph(a: __m512) -> __m256i { + static_assert_sae!(SAE); let a = a.as_f32x16(); let zero = _mm256_setzero_si256().as_i16x16(); - macro_rules! call { - ($imm4:expr) => { - vcvtps2ph(a, $imm4, zero, 0b11111111_11111111) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtps2ph(a, SAE, zero, 0b11111111_11111111); transmute(r) } @@ -13976,22 +13959,17 @@ pub unsafe fn _mm512_cvt_roundps_ph(a: __m512, sae: i32) -> __m256i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundps_ph&expand=1355) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtps2ph, sae = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_cvt_roundps_ph( +#[cfg_attr(test, assert_instr(vcvtps2ph, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_cvt_roundps_ph( src: __m256i, k: __mmask16, a: __m512, - sae: i32, ) -> __m256i { + static_assert_sae!(SAE); let a = a.as_f32x16(); let src = src.as_i16x16(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtps2ph(a, $imm4, src, k) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtps2ph(a, SAE, src, k); transmute(r) } @@ -14001,17 +13979,13 @@ pub unsafe fn _mm512_mask_cvt_roundps_ph( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundps_ph&expand=1356) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtps2ph, sae = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_cvt_roundps_ph(k: __mmask16, a: __m512, sae: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vcvtps2ph, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_cvt_roundps_ph(k: __mmask16, a: __m512) -> __m256i { + static_assert_sae!(SAE); let a = a.as_f32x16(); let zero = _mm256_setzero_si256().as_i16x16(); - macro_rules! call { - ($imm4:expr) => { - vcvtps2ph(a, $imm4, zero, k) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtps2ph(a, SAE, zero, k); transmute(r) } @@ -14126,17 +14100,13 @@ pub unsafe fn _mm_maskz_cvt_roundps_ph(k: __mmask8, a: __m128, imm8: i32) -> __m /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtps_ph&expand=1778) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtps2ph, sae = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm512_cvtps_ph(a: __m512, sae: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vcvtps2ph, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_cvtps_ph(a: __m512) -> __m256i { + static_assert_sae!(SAE); let a = a.as_f32x16(); let zero = _mm256_setzero_si256().as_i16x16(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtps2ph(a, $imm4, zero, 0b11111111_11111111) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtps2ph(a, SAE, zero, 0b11111111_11111111); transmute(r) } @@ -14146,17 +14116,17 @@ pub unsafe fn _mm512_cvtps_ph(a: __m512, sae: i32) -> __m256i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtps_ph&expand=1779) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtps2ph, sae = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_cvtps_ph(src: __m256i, k: __mmask16, a: __m512, sae: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vcvtps2ph, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_cvtps_ph( + src: __m256i, + k: __mmask16, + a: __m512, +) -> __m256i { + static_assert_sae!(SAE); let a = a.as_f32x16(); let src = src.as_i16x16(); - macro_rules! call { - ($imm4:expr) => { - vcvtps2ph(a, $imm4, src, k) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtps2ph(a, SAE, src, k); transmute(r) } @@ -14166,17 +14136,13 @@ pub unsafe fn _mm512_mask_cvtps_ph(src: __m256i, k: __mmask16, a: __m512, sae: i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtps_ph&expand=1780) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtps2ph, sae = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_cvtps_ph(k: __mmask16, a: __m512, sae: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vcvtps2ph, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_cvtps_ph(k: __mmask16, a: __m512) -> __m256i { + static_assert_sae!(SAE); let a = a.as_f32x16(); let zero = _mm256_setzero_si256().as_i16x16(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtps2ph(a, $imm4, zero, k) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtps2ph(a, SAE, zero, k); transmute(r) } @@ -14286,17 +14252,13 @@ pub unsafe fn _mm_maskz_cvtps_ph(k: __mmask8, a: __m128, imm8: i32) -> __m128i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_ps&expand=1332) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtph2ps, sae = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm512_cvt_roundph_ps(a: __m256i, sae: i32) -> __m512 { +#[cfg_attr(test, assert_instr(vcvtph2ps, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_cvt_roundph_ps(a: __m256i) -> __m512 { + static_assert_sae!(SAE); let a = a.as_i16x16(); let zero = _mm512_setzero_ps().as_f32x16(); - macro_rules! call { - ($imm4:expr) => { - vcvtph2ps(a, zero, 0b11111111_11111111, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtph2ps(a, zero, 0b11111111_11111111, SAE); transmute(r) } @@ -14306,22 +14268,17 @@ pub unsafe fn _mm512_cvt_roundph_ps(a: __m256i, sae: i32) -> __m512 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_ps&expand=1333) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtph2ps, sae = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_cvt_roundph_ps( +#[cfg_attr(test, assert_instr(vcvtph2ps, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_cvt_roundph_ps( src: __m512, k: __mmask16, a: __m256i, - sae: i32, ) -> __m512 { + static_assert_sae!(SAE); let a = a.as_i16x16(); let src = src.as_f32x16(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtph2ps(a, src, k, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtph2ps(a, src, k, SAE); transmute(r) } @@ -14331,17 +14288,13 @@ pub unsafe fn _mm512_mask_cvt_roundph_ps( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_ps&expand=1334) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtph2ps, sae = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_cvt_roundph_ps(k: __mmask16, a: __m256i, sae: i32) -> __m512 { +#[cfg_attr(test, assert_instr(vcvtph2ps, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_cvt_roundph_ps(k: __mmask16, a: __m256i) -> __m512 { + static_assert_sae!(SAE); let a = a.as_i16x16(); let zero = _mm512_setzero_ps().as_f32x16(); - macro_rules! call { - ($imm4:expr) => { - vcvtph2ps(a, zero, k, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtph2ps(a, zero, k, SAE); transmute(r) } @@ -14442,17 +14395,13 @@ pub unsafe fn _mm_maskz_cvtph_ps(k: __mmask8, a: __m128i) -> __m128 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundps_epi32&expand=1916) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvttps2dq, sae = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm512_cvtt_roundps_epi32(a: __m512, sae: i32) -> __m512i { +#[cfg_attr(test, assert_instr(vcvttps2dq, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_cvtt_roundps_epi32(a: __m512) -> __m512i { + static_assert_sae!(SAE); let a = a.as_f32x16(); let zero = _mm512_setzero_si512().as_i32x16(); - macro_rules! 
call { - ($imm4:expr) => { - vcvttps2dq(a, zero, 0b11111111_11111111, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvttps2dq(a, zero, 0b11111111_11111111, SAE); transmute(r) } @@ -14462,22 +14411,17 @@ pub unsafe fn _mm512_cvtt_roundps_epi32(a: __m512, sae: i32) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundps_epi32&expand=1917) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvttps2dq, sae = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_cvtt_roundps_epi32( +#[cfg_attr(test, assert_instr(vcvttps2dq, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_cvtt_roundps_epi32( src: __m512i, k: __mmask16, a: __m512, - sae: i32, ) -> __m512i { + static_assert_sae!(SAE); let a = a.as_f32x16(); let src = src.as_i32x16(); - macro_rules! call { - ($imm4:expr) => { - vcvttps2dq(a, src, k, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvttps2dq(a, src, k, SAE); transmute(r) } @@ -14487,17 +14431,13 @@ pub unsafe fn _mm512_mask_cvtt_roundps_epi32( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundps_epi32&expand=1918) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvttps2dq, sae = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_cvtt_roundps_epi32(k: __mmask16, a: __m512, sae: i32) -> __m512i { +#[cfg_attr(test, assert_instr(vcvttps2dq, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_cvtt_roundps_epi32(k: __mmask16, a: __m512) -> __m512i { + static_assert_sae!(SAE); let a = a.as_f32x16(); let zero = _mm512_setzero_si512().as_i32x16(); - macro_rules! 
call { - ($imm4:expr) => { - vcvttps2dq(a, zero, k, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvttps2dq(a, zero, k, SAE); transmute(r) } @@ -14507,17 +14447,13 @@ pub unsafe fn _mm512_maskz_cvtt_roundps_epi32(k: __mmask16, a: __m512, sae: i32) /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundps_epu32&expand=1922) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvttps2udq, sae = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm512_cvtt_roundps_epu32(a: __m512, sae: i32) -> __m512i { +#[cfg_attr(test, assert_instr(vcvttps2udq, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_cvtt_roundps_epu32(a: __m512) -> __m512i { + static_assert_sae!(SAE); let a = a.as_f32x16(); let zero = _mm512_setzero_si512().as_u32x16(); - macro_rules! call { - ($imm4:expr) => { - vcvttps2udq(a, zero, 0b11111111_11111111, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvttps2udq(a, zero, 0b11111111_11111111, SAE); transmute(r) } @@ -14527,22 +14463,17 @@ pub unsafe fn _mm512_cvtt_roundps_epu32(a: __m512, sae: i32) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundps_epu32&expand=1923) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvttps2udq, sae = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_cvtt_roundps_epu32( +#[cfg_attr(test, assert_instr(vcvttps2udq, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_cvtt_roundps_epu32( src: __m512i, k: __mmask16, a: __m512, - sae: i32, ) -> __m512i { + static_assert_sae!(SAE); let a = a.as_f32x16(); let src = src.as_u32x16(); - macro_rules! 
call { - ($imm4:expr) => { - vcvttps2udq(a, src, k, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvttps2udq(a, src, k, SAE); transmute(r) } @@ -14552,17 +14483,13 @@ pub unsafe fn _mm512_mask_cvtt_roundps_epu32( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundps_epu32&expand=1924) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvttps2udq, sae = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_cvtt_roundps_epu32(k: __mmask16, a: __m512, sae: i32) -> __m512i { +#[cfg_attr(test, assert_instr(vcvttps2udq, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_cvtt_roundps_epu32(k: __mmask16, a: __m512) -> __m512i { + static_assert_sae!(SAE); let a = a.as_f32x16(); let zero = _mm512_setzero_si512().as_u32x16(); - macro_rules! call { - ($imm4:expr) => { - vcvttps2udq(a, zero, k, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvttps2udq(a, zero, k, SAE); transmute(r) } @@ -14572,17 +14499,13 @@ pub unsafe fn _mm512_maskz_cvtt_roundps_epu32(k: __mmask16, a: __m512, sae: i32) /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundpd_epi32&expand=1904) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvttpd2dq, sae = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm512_cvtt_roundpd_epi32(a: __m512d, sae: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vcvttpd2dq, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_cvtt_roundpd_epi32(a: __m512d) -> __m256i { + static_assert_sae!(SAE); let a = a.as_f64x8(); let zero = _mm256_setzero_si256().as_i32x8(); - macro_rules! 
call { - ($imm4:expr) => { - vcvttpd2dq(a, zero, 0b11111111, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvttpd2dq(a, zero, 0b11111111, SAE); transmute(r) } @@ -14592,22 +14515,17 @@ pub unsafe fn _mm512_cvtt_roundpd_epi32(a: __m512d, sae: i32) -> __m256i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundpd_epi32&expand=1905) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvttpd2dq, sae = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_cvtt_roundpd_epi32( +#[cfg_attr(test, assert_instr(vcvttpd2dq, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_cvtt_roundpd_epi32( src: __m256i, k: __mmask8, a: __m512d, - sae: i32, ) -> __m256i { + static_assert_sae!(SAE); let a = a.as_f64x8(); let src = src.as_i32x8(); - macro_rules! call { - ($imm4:expr) => { - vcvttpd2dq(a, src, k, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvttpd2dq(a, src, k, SAE); transmute(r) } @@ -14617,17 +14535,13 @@ pub unsafe fn _mm512_mask_cvtt_roundpd_epi32( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundps_epi32&expand=1918) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvttpd2dq, sae = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_cvtt_roundpd_epi32(k: __mmask8, a: __m512d, sae: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vcvttpd2dq, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_cvtt_roundpd_epi32(k: __mmask8, a: __m512d) -> __m256i { + static_assert_sae!(SAE); let a = a.as_f64x8(); let zero = _mm256_setzero_si256().as_i32x8(); - macro_rules! 
call { - ($imm4:expr) => { - vcvttpd2dq(a, zero, k, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvttpd2dq(a, zero, k, SAE); transmute(r) } @@ -14637,17 +14551,13 @@ pub unsafe fn _mm512_maskz_cvtt_roundpd_epi32(k: __mmask8, a: __m512d, sae: i32) /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundpd_epu32&expand=1910) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvttpd2udq, sae = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm512_cvtt_roundpd_epu32(a: __m512d, sae: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vcvttpd2udq, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_cvtt_roundpd_epu32(a: __m512d) -> __m256i { + static_assert_sae!(SAE); let a = a.as_f64x8(); let zero = _mm256_setzero_si256().as_i32x8(); - macro_rules! call { - ($imm4:expr) => { - vcvttpd2udq(a, zero, 0b11111111, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvttpd2udq(a, zero, 0b11111111, SAE); transmute(r) } @@ -14657,22 +14567,17 @@ pub unsafe fn _mm512_cvtt_roundpd_epu32(a: __m512d, sae: i32) -> __m256i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundpd_epu32&expand=1911) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvttpd2udq, sae = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_cvtt_roundpd_epu32( +#[cfg_attr(test, assert_instr(vcvttpd2udq, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_cvtt_roundpd_epu32( src: __m256i, k: __mmask8, a: __m512d, - sae: i32, ) -> __m256i { + static_assert_sae!(SAE); let a = a.as_f64x8(); let src = src.as_i32x8(); - macro_rules! 
call { - ($imm4:expr) => { - vcvttpd2udq(a, src, k, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvttpd2udq(a, src, k, SAE); transmute(r) } @@ -14896,17 +14801,13 @@ pub unsafe fn _mm_maskz_cvttps_epu32(k: __mmask8, a: __m128) -> __m128i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundpd_epu32&expand=1912) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvttpd2udq, sae = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_cvtt_roundpd_epu32(k: __mmask8, a: __m512d, sae: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vcvttpd2udq, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_cvtt_roundpd_epu32(k: __mmask8, a: __m512d) -> __m256i { + static_assert_sae!(SAE); let a = a.as_f64x8(); let zero = _mm256_setzero_si256().as_i32x8(); - macro_rules! call { - ($imm4:expr) => { - vcvttpd2udq(a, zero, k, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvttpd2udq(a, zero, k, SAE); transmute(r) } @@ -34064,18 +33965,15 @@ pub unsafe fn _mm_maskz_div_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_max_round_ss&expand=3668) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmaxss, sae = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_max_round_ss(a: __m128, b: __m128, sae: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vmaxss, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_max_round_ss(a: __m128, b: __m128) -> __m128 { + static_assert_sae!(SAE); let a = a.as_f32x4(); let b = b.as_f32x4(); let zero = _mm_setzero_ps().as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vmaxss(a, b, zero, 0b1, $imm4) - }; - } - transmute(constify_imm4_sae!(sae, call)) + let r = vmaxss(a, b, zero, 0b1, SAE); + transmute(r) } /// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -34084,24 +33982,20 @@ pub unsafe fn _mm_max_round_ss(a: __m128, b: __m128, sae: i32) -> __m128 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_ss&expand=3672) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmaxss, sae = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_max_round_ss( +#[cfg_attr(test, assert_instr(vmaxss, SAE = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_max_round_ss( src: __m128, k: __mmask8, a: __m128, b: __m128, - sae: i32, ) -> __m128 { + static_assert_sae!(SAE); let a = a.as_f32x4(); let b = b.as_f32x4(); let src = src.as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vmaxss(a, b, src, k, $imm4) - }; - } - transmute(constify_imm4_sae!(sae, call)) + let r = vmaxss(a, b, src, k, SAE); + transmute(r) } /// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -34110,18 +34004,15 @@ pub unsafe fn _mm_mask_max_round_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_round_ss&expand=3667) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmaxss, sae = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_max_round_ss(k: __mmask8, a: __m128, b: __m128, sae: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vmaxss, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_max_round_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + static_assert_sae!(SAE); let a = a.as_f32x4(); let b = b.as_f32x4(); let zero = _mm_setzero_ps().as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vmaxss(a, b, zero, k, $imm4) - }; - } - transmute(constify_imm4_sae!(sae, call)) + let r = vmaxss(a, b, zero, k, SAE); + transmute(r) } /// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper element from a to the upper element of dst.\ @@ -34130,18 +34021,15 @@ pub unsafe fn _mm_maskz_max_round_ss(k: __mmask8, a: __m128, b: __m128, sae: i32 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_max_round_sd&expand=3665) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmaxsd, sae = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_max_round_sd(a: __m128d, b: __m128d, sae: i32) -> __m128d { +#[cfg_attr(test, assert_instr(vmaxsd, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_max_round_sd(a: __m128d, b: __m128d) -> __m128d { + static_assert_sae!(SAE); let a = a.as_f64x2(); let b = b.as_f64x2(); let zero = _mm_setzero_pd().as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vmaxsd(a, b, zero, 0b1, $imm4) - }; - } - transmute(constify_imm4_sae!(sae, call)) + let r = vmaxsd(a, b, zero, 0b1, SAE); + transmute(r) } /// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -34150,24 +34038,20 @@ pub unsafe fn _mm_max_round_sd(a: __m128d, b: __m128d, sae: i32) -> __m128d { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_round_sd&expand=3663) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmaxsd, sae = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_max_round_sd( +#[cfg_attr(test, assert_instr(vmaxsd, SAE = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_max_round_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, - sae: i32, ) -> __m128d { + static_assert_sae!(SAE); let a = a.as_f64x2(); let b = b.as_f64x2(); let src = src.as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vmaxsd(a, b, src, k, $imm4) - }; - } - transmute(constify_imm4_sae!(sae, call)) + let r = vmaxsd(a, b, src, k, SAE); + transmute(r) } /// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -34176,18 +34060,19 @@ pub unsafe fn _mm_mask_max_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_sd&expand=3670) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmaxsd, sae = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_max_round_sd(k: __mmask8, a: __m128d, b: __m128d, sae: i32) -> __m128d { +#[cfg_attr(test, assert_instr(vmaxsd, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_max_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + static_assert_sae!(SAE); let a = a.as_f64x2(); let b = b.as_f64x2(); let zero = _mm_setzero_pd().as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vmaxsd(a, b, zero, k, $imm4) - }; - } - transmute(constify_imm4_sae!(sae, call)) + let r = vmaxsd(a, b, zero, k, SAE); + transmute(r) } /// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -34196,18 +34081,15 @@ pub unsafe fn _mm_maskz_max_round_sd(k: __mmask8, a: __m128d, b: __m128d, sae: i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_min_round_ss&expand=3782) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vminss, sae = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_min_round_ss(a: __m128, b: __m128, sae: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vminss, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_min_round_ss(a: __m128, b: __m128) -> __m128 { + static_assert_sae!(SAE); let a = a.as_f32x4(); let b = b.as_f32x4(); let zero = _mm_setzero_ps().as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vminss(a, b, zero, 0b1, $imm4) - }; - } - transmute(constify_imm4_sae!(sae, call)) + let r = vminss(a, b, zero, 0b1, SAE); + transmute(r) } /// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -34216,24 +34098,20 @@ pub unsafe fn _mm_min_round_ss(a: __m128, b: __m128, sae: i32) -> __m128 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_round_Ss&expand=3780) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vminss, sae = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_min_round_ss( +#[cfg_attr(test, assert_instr(vminss, SAE = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_min_round_ss( src: __m128, k: __mmask8, a: __m128, b: __m128, - sae: i32, ) -> __m128 { + static_assert_sae!(SAE); let a = a.as_f32x4(); let b = b.as_f32x4(); let src = src.as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vminss(a, b, src, k, $imm4) - }; - } - transmute(constify_imm4_sae!(sae, call)) + let r = vminss(a, b, src, k, SAE); + transmute(r) } /// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -34242,18 +34120,15 @@ pub unsafe fn _mm_mask_min_round_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_round_ss&expand=3781) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vminss, sae = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_min_round_ss(k: __mmask8, a: __m128, b: __m128, sae: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vminss, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_min_round_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + static_assert_sae!(SAE); let a = a.as_f32x4(); let b = b.as_f32x4(); let zero = _mm_setzero_ps().as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vminss(a, b, zero, k, $imm4) - }; - } - transmute(constify_imm4_sae!(sae, call)) + let r = vminss(a, b, zero, k, SAE); + transmute(r) } /// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst , and copy the upper element from a to the upper element of dst.\ @@ -34262,18 +34137,15 @@ pub unsafe fn _mm_maskz_min_round_ss(k: __mmask8, a: __m128, b: __m128, sae: i32 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_min_round_sd&expand=3779) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vminsd, sae = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_min_round_sd(a: __m128d, b: __m128d, sae: i32) -> __m128d { +#[cfg_attr(test, assert_instr(vminsd, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_min_round_sd(a: __m128d, b: __m128d) -> __m128d { + static_assert_sae!(SAE); let a = a.as_f64x2(); let b = b.as_f64x2(); let zero = _mm_setzero_pd().as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vminsd(a, b, zero, 0b1, $imm4) - }; - } - transmute(constify_imm4_sae!(sae, call)) + let r = vminsd(a, b, zero, 0b1, SAE); + transmute(r) } /// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -34282,24 +34154,20 @@ pub unsafe fn _mm_min_round_sd(a: __m128d, b: __m128d, sae: i32) -> __m128d { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_round_sd&expand=3777) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vminsd, sae = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_min_round_sd( +#[cfg_attr(test, assert_instr(vminsd, SAE = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_min_round_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, - sae: i32, ) -> __m128d { + static_assert_sae!(SAE); let a = a.as_f64x2(); let b = b.as_f64x2(); let src = src.as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vminsd(a, b, src, k, $imm4) - }; - } - transmute(constify_imm4_sae!(sae, call)) + let r = vminsd(a, b, src, k, SAE); + transmute(r) } /// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -34308,18 +34176,19 @@ pub unsafe fn _mm_mask_min_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_round_Sd&expand=3778) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vminsd, sae = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_min_round_sd(k: __mmask8, a: __m128d, b: __m128d, sae: i32) -> __m128d { +#[cfg_attr(test, assert_instr(vminsd, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_min_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + static_assert_sae!(SAE); let a = a.as_f64x2(); let b = b.as_f64x2(); let zero = _mm_setzero_pd().as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vminsd(a, b, zero, k, $imm4) - }; - } - transmute(constify_imm4_sae!(sae, call)) + let r = vminsd(a, b, zero, k, SAE); + transmute(r) } /// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -34484,18 +34353,14 @@ pub unsafe fn _mm_maskz_sqrt_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getexp_round_ss&expand=2856) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetexpss, sae = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_getexp_round_ss(a: __m128, b: __m128, sae: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vgetexpss, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_getexp_round_ss(a: __m128, b: __m128) -> __m128 { + static_assert_sae!(SAE); let a = a.as_f32x4(); let b = b.as_f32x4(); let zero = _mm_setzero_ps().as_f32x4(); - macro_rules! call { - ($imm4:expr) => { - vgetexpss(a, b, zero, 0b1, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vgetexpss(a, b, zero, 0b1, SAE); transmute(r) } @@ -34505,24 +34370,19 @@ pub unsafe fn _mm_getexp_round_ss(a: __m128, b: __m128, sae: i32) -> __m128 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getexp_round_ss&expand=2857) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetexpss, sae = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_getexp_round_ss( +#[cfg_attr(test, assert_instr(vgetexpss, SAE = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_getexp_round_ss( src: __m128, k: __mmask8, a: __m128, b: __m128, - sae: i32, ) -> __m128 { + static_assert_sae!(SAE); let a = a.as_f32x4(); let b = b.as_f32x4(); let src = src.as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vgetexpss(a, b, src, k, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vgetexpss(a, b, src, k, SAE); transmute(r) } @@ -34532,18 +34392,18 @@ pub unsafe fn _mm_mask_getexp_round_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getexp_round_ss&expand=2858) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetexpss, sae = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_getexp_round_ss(k: __mmask8, a: __m128, b: __m128, sae: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vgetexpss, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_getexp_round_ss( + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + static_assert_sae!(SAE); let a = a.as_f32x4(); let b = b.as_f32x4(); let zero = _mm_setzero_ps().as_f32x4(); - macro_rules! call { - ($imm4:expr) => { - vgetexpss(a, b, zero, k, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vgetexpss(a, b, zero, k, SAE); transmute(r) } @@ -34553,18 +34413,14 @@ pub unsafe fn _mm_maskz_getexp_round_ss(k: __mmask8, a: __m128, b: __m128, sae: /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getexp_round_sd&expand=2853) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetexpsd, sae = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_getexp_round_sd(a: __m128d, b: __m128d, sae: i32) -> __m128d { +#[cfg_attr(test, assert_instr(vgetexpsd, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_getexp_round_sd(a: __m128d, b: __m128d) -> __m128d { + static_assert_sae!(SAE); let a = a.as_f64x2(); let b = b.as_f64x2(); let zero = _mm_setzero_pd().as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vgetexpsd(a, b, zero, 0b1, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vgetexpsd(a, b, zero, 0b1, SAE); transmute(r) } @@ -34574,24 +34430,19 @@ pub unsafe fn _mm_getexp_round_sd(a: __m128d, b: __m128d, sae: i32) -> __m128d { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getexp_round_sd&expand=2854) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetexpsd, sae = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_getexp_round_sd( +#[cfg_attr(test, assert_instr(vgetexpsd, SAE = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_getexp_round_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, - sae: i32, ) -> __m128d { + static_assert_sae!(SAE); let a = a.as_f64x2(); let b = b.as_f64x2(); let src = src.as_f64x2(); - macro_rules! call { - ($imm4:expr) => { - vgetexpsd(a, b, src, k, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vgetexpsd(a, b, src, k, SAE); transmute(r) } @@ -34601,18 +34452,18 @@ pub unsafe fn _mm_mask_getexp_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getexp_round_sd&expand=2855) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetexpsd, sae = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_getexp_round_sd(k: __mmask8, a: __m128d, b: __m128d, sae: i32) -> __m128d { +#[cfg_attr(test, assert_instr(vgetexpsd, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_getexp_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + static_assert_sae!(SAE); let a = a.as_f64x2(); let b = b.as_f64x2(); let zero = _mm_setzero_pd().as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vgetexpsd(a, b, zero, k, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vgetexpsd(a, b, zero, k, SAE); transmute(r) } @@ -36577,21 +36428,14 @@ pub unsafe fn _mm_maskz_cvtsd_ss(k: __mmask8, a: __m128, b: __m128d) -> __m128 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundss_sd&expand=1371) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtss2sd, sae = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_cvt_roundss_sd(a: __m128d, b: __m128, sae: i32) -> __m128d { - macro_rules! call { - ($imm4:expr) => { - vcvtss2sd( - a.as_f64x2(), - b.as_f32x4(), - _mm_setzero_pd().as_f64x2(), - 0b11111111, - $imm4, - ) - }; - } - let r = constify_imm4_sae!(sae, call); +#[cfg_attr(test, assert_instr(vcvtss2sd, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_cvt_roundss_sd(a: __m128d, b: __m128) -> __m128d { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f32x4(); + let zero = _mm_setzero_pd().as_f64x2(); + let r = vcvtss2sd(a, b, zero, 0b11111111, SAE); transmute(r) } @@ -36601,24 +36445,19 @@ pub unsafe fn _mm_cvt_roundss_sd(a: __m128d, b: __m128, sae: i32) -> __m128d { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_cvt_roundss_sd&expand=1372) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtss2sd, sae = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_cvt_roundss_sd( +#[cfg_attr(test, assert_instr(vcvtss2sd, SAE = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_cvt_roundss_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128, - sae: i32, ) -> __m128d { + static_assert_sae!(SAE); let a = a.as_f64x2(); let b = b.as_f32x4(); let src = src.as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtss2sd(a, b, src, k, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtss2sd(a, b, src, k, SAE); transmute(r) } @@ -36628,18 +36467,18 @@ pub unsafe fn _mm_mask_cvt_roundss_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_cvt_roundss_sd&expand=1373) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtss2sd, sae = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_cvt_roundss_sd(k: __mmask8, a: __m128d, b: __m128, sae: i32) -> __m128d { +#[cfg_attr(test, assert_instr(vcvtss2sd, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_cvt_roundss_sd( + k: __mmask8, + a: __m128d, + b: __m128, +) -> __m128d { + static_assert_sae!(SAE); let a = a.as_f64x2(); let b = b.as_f32x4(); let zero = _mm_setzero_pd().as_f64x2(); - macro_rules! call { - ($imm4:expr) => { - vcvtss2sd(a, b, zero, k, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtss2sd(a, b, zero, k, SAE); transmute(r) } @@ -36970,16 +36809,12 @@ pub unsafe fn _mm_cvti32_sd(a: __m128d, b: i32) -> __m128d { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtt_roundss_Si32&expand=1936) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtss2si, sae = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvtt_roundss_si32(a: __m128, sae: i32) -> i32 { +#[cfg_attr(test, assert_instr(vcvtss2si, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvtt_roundss_si32(a: __m128) -> i32 { + static_assert_sae!(SAE); let a = a.as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtss2si(a, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtss2si(a, SAE); transmute(r) } @@ -36989,16 +36824,12 @@ pub unsafe fn _mm_cvtt_roundss_si32(a: __m128, sae: i32) -> i32 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtt_roundss_i32&expand=1934) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtss2si, sae = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvtt_roundss_i32(a: __m128, sae: i32) -> i32 { +#[cfg_attr(test, assert_instr(vcvtss2si, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvtt_roundss_i32(a: __m128) -> i32 { + static_assert_sae!(SAE); let a = a.as_f32x4(); - macro_rules! call { - ($imm4:expr) => { - vcvtss2si(a, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtss2si(a, SAE); transmute(r) } @@ -37008,16 +36839,12 @@ pub unsafe fn _mm_cvtt_roundss_i32(a: __m128, sae: i32) -> i32 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtt_roundss_u32&expand=1938) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtss2usi, sae = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvtt_roundss_u32(a: __m128, sae: i32) -> u32 { +#[cfg_attr(test, assert_instr(vcvtss2usi, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvtt_roundss_u32(a: __m128) -> u32 { + static_assert_sae!(SAE); let a = a.as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtss2usi(a, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtss2usi(a, SAE); transmute(r) } @@ -37047,16 +36874,12 @@ pub unsafe fn _mm_cvttss_u32(a: __m128) -> u32 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsd_si32&expand=1930) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsd2si, sae = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvtt_roundsd_si32(a: __m128d, sae: i32) -> i32 { +#[cfg_attr(test, assert_instr(vcvtsd2si, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvtt_roundsd_si32(a: __m128d) -> i32 { + static_assert_sae!(SAE); let a = a.as_f64x2(); - macro_rules! call { - ($imm4:expr) => { - vcvtsd2si(a, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtsd2si(a, SAE); transmute(r) } @@ -37066,16 +36889,12 @@ pub unsafe fn _mm_cvtt_roundsd_si32(a: __m128d, sae: i32) -> i32 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsd_i32&expand=1928) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsd2si, sae = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvtt_roundsd_i32(a: __m128d, sae: i32) -> i32 { +#[cfg_attr(test, assert_instr(vcvtsd2si, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvtt_roundsd_i32(a: __m128d) -> i32 { + static_assert_sae!(SAE); let a = a.as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtsd2si(a, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtsd2si(a, SAE); transmute(r) } @@ -37085,16 +36904,12 @@ pub unsafe fn _mm_cvtt_roundsd_i32(a: __m128d, sae: i32) -> i32 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtt_roundsd_u32&expand=1932) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsd2usi, sae = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvtt_roundsd_u32(a: __m128d, sae: i32) -> u32 { +#[cfg_attr(test, assert_instr(vcvtsd2usi, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvtt_roundsd_u32(a: __m128d) -> u32 { + static_assert_sae!(SAE); let a = a.as_f64x2(); - macro_rules! call { - ($imm4:expr) => { - vcvtsd2usi(a, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtsd2usi(a, SAE); transmute(r) } @@ -44170,7 +43985,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvt_roundps_ph() { let a = _mm512_set1_ps(1.); - let r = _mm512_cvt_roundps_ph(a, _MM_FROUND_NO_EXC); + let r = _mm512_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(a); let e = _mm256_setr_epi64x( 4323521613979991040, 4323521613979991040, @@ -44184,9 +43999,9 @@ mod tests { unsafe fn test_mm512_mask_cvt_roundps_ph() { let a = _mm512_set1_ps(1.); let src = _mm256_set1_epi16(0); - let r = _mm512_mask_cvt_roundps_ph(src, 0, a, _MM_FROUND_NO_EXC); + let r = _mm512_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0, a); assert_eq_m256i(r, src); - let r = _mm512_mask_cvt_roundps_ph(src, 0b00000000_11111111, a, _MM_FROUND_NO_EXC); + let r = _mm512_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a); let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0); assert_eq_m256i(r, e); } @@ -44194,9 +44009,9 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_cvt_roundps_ph() { let a = _mm512_set1_ps(1.); - let r = 
_mm512_maskz_cvt_roundps_ph(0, a, _MM_FROUND_NO_EXC); + let r = _mm512_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0, a); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm512_maskz_cvt_roundps_ph(0b00000000_11111111, a, _MM_FROUND_NO_EXC); + let r = _mm512_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a); let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0); assert_eq_m256i(r, e); } @@ -44246,7 +44061,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvtps_ph() { let a = _mm512_set1_ps(1.); - let r = _mm512_cvtps_ph(a, _MM_FROUND_NO_EXC); + let r = _mm512_cvtps_ph::<_MM_FROUND_NO_EXC>(a); let e = _mm256_setr_epi64x( 4323521613979991040, 4323521613979991040, @@ -44260,9 +44075,9 @@ mod tests { unsafe fn test_mm512_mask_cvtps_ph() { let a = _mm512_set1_ps(1.); let src = _mm256_set1_epi16(0); - let r = _mm512_mask_cvtps_ph(src, 0, a, _MM_FROUND_NO_EXC); + let r = _mm512_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0, a); assert_eq_m256i(r, src); - let r = _mm512_mask_cvtps_ph(src, 0b00000000_11111111, a, _MM_FROUND_NO_EXC); + let r = _mm512_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a); let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0); assert_eq_m256i(r, e); } @@ -44270,9 +44085,9 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_cvtps_ph() { let a = _mm512_set1_ps(1.); - let r = _mm512_maskz_cvtps_ph(0, a, _MM_FROUND_NO_EXC); + let r = _mm512_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0, a); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm512_maskz_cvtps_ph(0b00000000_11111111, a, _MM_FROUND_NO_EXC); + let r = _mm512_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a); let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0); assert_eq_m256i(r, e); } @@ -44327,7 +44142,7 @@ mod tests { 4323521613979991040, 4323521613979991040, ); - let r = _mm512_cvt_roundph_ps(a, _MM_FROUND_NO_EXC); + let r = 
_mm512_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(a); let e = _mm512_set1_ps(1.); assert_eq_m512(r, e); } @@ -44341,9 +44156,9 @@ mod tests { 4323521613979991040, ); let src = _mm512_set1_ps(0.); - let r = _mm512_mask_cvt_roundph_ps(src, 0, a, _MM_FROUND_NO_EXC); + let r = _mm512_mask_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(src, 0, a); assert_eq_m512(r, src); - let r = _mm512_mask_cvt_roundph_ps(src, 0b00000000_11111111, a, _MM_FROUND_NO_EXC); + let r = _mm512_mask_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a); let e = _mm512_setr_ps( 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., ); @@ -44358,9 +44173,9 @@ mod tests { 4323521613979991040, 4323521613979991040, ); - let r = _mm512_maskz_cvt_roundph_ps(0, a, _MM_FROUND_NO_EXC); + let r = _mm512_maskz_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(0, a); assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_cvt_roundph_ps(0b00000000_11111111, a, _MM_FROUND_NO_EXC); + let r = _mm512_maskz_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a); let e = _mm512_setr_ps( 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., ); @@ -44462,7 +44277,7 @@ mod tests { let a = _mm512_setr_ps( 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, ); - let r = _mm512_cvtt_roundps_epi32(a, _MM_FROUND_NO_EXC); + let r = _mm512_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(a); let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 8, 9, 10, 11, 12, 13, 14, 15); assert_eq_m512i(r, e); } @@ -44473,9 +44288,9 @@ mod tests { 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, ); let src = _mm512_set1_epi32(0); - let r = _mm512_mask_cvtt_roundps_epi32(src, 0, a, _MM_FROUND_NO_EXC); + let r = _mm512_mask_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(src, 0, a); assert_eq_m512i(r, src); - let r = _mm512_mask_cvtt_roundps_epi32(src, 0b00000000_11111111, a, _MM_FROUND_NO_EXC); + let r = _mm512_mask_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(src, 
0b00000000_11111111, a); let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 0, 0, 0, 0); assert_eq_m512i(r, e); } @@ -44485,9 +44300,9 @@ mod tests { let a = _mm512_setr_ps( 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, ); - let r = _mm512_maskz_cvtt_roundps_epi32(0, a, _MM_FROUND_NO_EXC); + let r = _mm512_maskz_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(0, a); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_cvtt_roundps_epi32(0b00000000_11111111, a, _MM_FROUND_NO_EXC); + let r = _mm512_maskz_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a); let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 0, 0, 0, 0); assert_eq_m512i(r, e); } @@ -44497,7 +44312,7 @@ mod tests { let a = _mm512_setr_ps( 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, ); - let r = _mm512_cvtt_roundps_epu32(a, _MM_FROUND_NO_EXC); + let r = _mm512_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(a); let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 9, 10, 11, 12, 13, 14, 15); assert_eq_m512i(r, e); } @@ -44508,9 +44323,9 @@ mod tests { 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, ); let src = _mm512_set1_epi32(0); - let r = _mm512_mask_cvtt_roundps_epu32(src, 0, a, _MM_FROUND_NO_EXC); + let r = _mm512_mask_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(src, 0, a); assert_eq_m512i(r, src); - let r = _mm512_mask_cvtt_roundps_epu32(src, 0b00000000_11111111, a, _MM_FROUND_NO_EXC); + let r = _mm512_mask_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a); let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); assert_eq_m512i(r, e); } @@ -44520,9 +44335,9 @@ mod tests { let a = _mm512_setr_ps( 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, ); - let r = _mm512_maskz_cvtt_roundps_epu32(0, a, _MM_FROUND_NO_EXC); + let r = _mm512_maskz_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(0, 
a); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_cvtt_roundps_epu32(0b00000000_11111111, a, _MM_FROUND_NO_EXC); + let r = _mm512_maskz_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a); let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); assert_eq_m512i(r, e); } @@ -52598,7 +52413,7 @@ mod tests { unsafe fn test_mm_max_round_ss() { let a = _mm_set_ps(0., 1., 2., 3.); let b = _mm_set_ps(4., 5., 6., 7.); - let r = _mm_max_round_ss(a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_max_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, b); let e = _mm_set_ps(0., 1., 2., 7.); assert_eq_m128(r, e); } @@ -52607,10 +52422,10 @@ mod tests { unsafe fn test_mm_mask_max_round_ss() { let a = _mm_set_ps(0., 1., 2., 3.); let b = _mm_set_ps(4., 5., 6., 7.); - let r = _mm_mask_max_round_ss(a, 0, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_mask_max_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b); let e = _mm_set_ps(0., 1., 2., 3.); assert_eq_m128(r, e); - let r = _mm_mask_max_round_ss(a, 0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_mask_max_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b); let e = _mm_set_ps(0., 1., 2., 7.); assert_eq_m128(r, e); } @@ -52619,10 +52434,10 @@ mod tests { unsafe fn test_mm_maskz_max_round_ss() { let a = _mm_set_ps(0., 1., 2., 3.); let b = _mm_set_ps(4., 5., 6., 7.); - let r = _mm_maskz_max_round_ss(0, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_maskz_max_round_ss::<_MM_FROUND_CUR_DIRECTION>(0, a, b); let e = _mm_set_ps(0., 1., 2., 0.); assert_eq_m128(r, e); - let r = _mm_maskz_max_round_ss(0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_maskz_max_round_ss::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b); let e = _mm_set_ps(0., 1., 2., 7.); assert_eq_m128(r, e); } @@ -52631,7 +52446,7 @@ mod tests { unsafe fn test_mm_max_round_sd() { let a = _mm_set_pd(0., 1.); let b = _mm_set_pd(2., 3.); - let r = _mm_max_round_sd(a, b, _MM_FROUND_CUR_DIRECTION); + let r = 
_mm_max_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, b); let e = _mm_set_pd(0., 3.); assert_eq_m128d(r, e); } @@ -52640,10 +52455,10 @@ mod tests { unsafe fn test_mm_mask_max_round_sd() { let a = _mm_set_pd(0., 1.); let b = _mm_set_pd(2., 3.); - let r = _mm_mask_max_round_sd(a, 0, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_mask_max_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b); let e = _mm_set_pd(0., 1.); assert_eq_m128d(r, e); - let r = _mm_mask_max_round_sd(a, 0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_mask_max_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b); let e = _mm_set_pd(0., 3.); assert_eq_m128d(r, e); } @@ -52652,10 +52467,10 @@ mod tests { unsafe fn test_mm_maskz_max_round_sd() { let a = _mm_set_pd(0., 1.); let b = _mm_set_pd(2., 3.); - let r = _mm_maskz_max_round_sd(0, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_maskz_max_round_sd::<_MM_FROUND_CUR_DIRECTION>(0, a, b); let e = _mm_set_pd(0., 0.); assert_eq_m128d(r, e); - let r = _mm_maskz_max_round_sd(0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_maskz_max_round_sd::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b); let e = _mm_set_pd(0., 3.); assert_eq_m128d(r, e); } @@ -52664,7 +52479,7 @@ mod tests { unsafe fn test_mm_min_round_ss() { let a = _mm_set_ps(0., 1., 2., 3.); let b = _mm_set_ps(4., 5., 6., 7.); - let r = _mm_min_round_ss(a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_min_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, b); let e = _mm_set_ps(0., 1., 2., 3.); assert_eq_m128(r, e); } @@ -52673,10 +52488,10 @@ mod tests { unsafe fn test_mm_mask_min_round_ss() { let a = _mm_set_ps(0., 1., 2., 3.); let b = _mm_set_ps(4., 5., 6., 7.); - let r = _mm_mask_min_round_ss(a, 0, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_mask_min_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b); let e = _mm_set_ps(0., 1., 2., 3.); assert_eq_m128(r, e); - let r = _mm_mask_min_round_ss(a, 0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let r = 
_mm_mask_min_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b); let e = _mm_set_ps(0., 1., 2., 3.); assert_eq_m128(r, e); } @@ -52685,10 +52500,10 @@ mod tests { unsafe fn test_mm_maskz_min_round_ss() { let a = _mm_set_ps(0., 1., 2., 3.); let b = _mm_set_ps(4., 5., 6., 7.); - let r = _mm_maskz_min_round_ss(0, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_maskz_min_round_ss::<_MM_FROUND_CUR_DIRECTION>(0, a, b); let e = _mm_set_ps(0., 1., 2., 0.); assert_eq_m128(r, e); - let r = _mm_maskz_min_round_ss(0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_maskz_min_round_ss::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b); let e = _mm_set_ps(0., 1., 2., 3.); assert_eq_m128(r, e); } @@ -52697,7 +52512,7 @@ mod tests { unsafe fn test_mm_min_round_sd() { let a = _mm_set_pd(0., 1.); let b = _mm_set_pd(2., 3.); - let r = _mm_min_round_sd(a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_min_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, b); let e = _mm_set_pd(0., 1.); assert_eq_m128d(r, e); } @@ -52706,10 +52521,10 @@ mod tests { unsafe fn test_mm_mask_min_round_sd() { let a = _mm_set_pd(0., 1.); let b = _mm_set_pd(2., 3.); - let r = _mm_mask_min_round_sd(a, 0, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_mask_min_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b); let e = _mm_set_pd(0., 1.); assert_eq_m128d(r, e); - let r = _mm_mask_min_round_sd(a, 0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_mask_min_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b); let e = _mm_set_pd(0., 1.); assert_eq_m128d(r, e); } @@ -52718,10 +52533,10 @@ mod tests { unsafe fn test_mm_maskz_min_round_sd() { let a = _mm_set_pd(0., 1.); let b = _mm_set_pd(2., 3.); - let r = _mm_maskz_min_round_sd(0, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_maskz_min_round_sd::<_MM_FROUND_CUR_DIRECTION>(0, a, b); let e = _mm_set_pd(0., 0.); assert_eq_m128d(r, e); - let r = _mm_maskz_min_round_sd(0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let r = 
_mm_maskz_min_round_sd::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b); let e = _mm_set_pd(0., 1.); assert_eq_m128d(r, e); } @@ -52804,7 +52619,7 @@ mod tests { unsafe fn test_mm_getexp_round_ss() { let a = _mm_set1_ps(2.); let b = _mm_set1_ps(3.); - let r = _mm_getexp_round_ss(a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_getexp_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, b); let e = _mm_set_ps(2., 2., 2., 1.); assert_eq_m128(r, e); } @@ -52813,10 +52628,10 @@ mod tests { unsafe fn test_mm_mask_getexp_round_ss() { let a = _mm_set1_ps(2.); let b = _mm_set1_ps(3.); - let r = _mm_mask_getexp_round_ss(a, 0, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_mask_getexp_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b); let e = _mm_set_ps(2., 2., 2., 2.); assert_eq_m128(r, e); - let r = _mm_mask_getexp_round_ss(a, 0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_mask_getexp_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b); let e = _mm_set_ps(2., 2., 2., 1.); assert_eq_m128(r, e); } @@ -52825,10 +52640,10 @@ mod tests { unsafe fn test_mm_maskz_getexp_round_ss() { let a = _mm_set1_ps(2.); let b = _mm_set1_ps(3.); - let r = _mm_maskz_getexp_round_ss(0, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_maskz_getexp_round_ss::<_MM_FROUND_CUR_DIRECTION>(0, a, b); let e = _mm_set_ps(2., 2., 2., 0.); assert_eq_m128(r, e); - let r = _mm_maskz_getexp_round_ss(0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_maskz_getexp_round_ss::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b); let e = _mm_set_ps(2., 2., 2., 1.); assert_eq_m128(r, e); } @@ -52837,7 +52652,7 @@ mod tests { unsafe fn test_mm_getexp_round_sd() { let a = _mm_set1_pd(2.); let b = _mm_set1_pd(3.); - let r = _mm_getexp_round_sd(a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_getexp_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, b); let e = _mm_set_pd(2., 1.); assert_eq_m128d(r, e); } @@ -52846,10 +52661,10 @@ mod tests { unsafe fn test_mm_mask_getexp_round_sd() { let a = _mm_set1_pd(2.); let b = _mm_set1_pd(3.); - 
let r = _mm_mask_getexp_round_sd(a, 0, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_mask_getexp_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b); let e = _mm_set_pd(2., 2.); assert_eq_m128d(r, e); - let r = _mm_mask_getexp_round_sd(a, 0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_mask_getexp_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b); let e = _mm_set_pd(2., 1.); assert_eq_m128d(r, e); } @@ -52858,10 +52673,10 @@ mod tests { unsafe fn test_mm_maskz_getexp_round_sd() { let a = _mm_set1_pd(2.); let b = _mm_set1_pd(3.); - let r = _mm_maskz_getexp_round_sd(0, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_maskz_getexp_round_sd::<_MM_FROUND_CUR_DIRECTION>(0, a, b); let e = _mm_set_pd(2., 0.); assert_eq_m128d(r, e); - let r = _mm_maskz_getexp_round_sd(0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_maskz_getexp_round_sd::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b); let e = _mm_set_pd(2., 1.); assert_eq_m128d(r, e); } @@ -53804,7 +53619,7 @@ mod tests { unsafe fn test_mm_cvt_roundss_sd() { let a = _mm_set_pd(6., -7.5); let b = _mm_set_ps(0., -0.5, 1., -1.5); - let r = _mm_cvt_roundss_sd(a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_cvt_roundss_sd::<_MM_FROUND_CUR_DIRECTION>(a, b); let e = _mm_set_pd(6., -1.5); assert_eq_m128d(r, e); } @@ -53813,9 +53628,9 @@ mod tests { unsafe fn test_mm_mask_cvt_roundss_sd() { let a = _mm_set_pd(6., -7.5); let b = _mm_set_ps(0., -0.5, 1., -1.5); - let r = _mm_mask_cvt_roundss_sd(a, 0, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_mask_cvt_roundss_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b); assert_eq_m128d(r, a); - let r = _mm_mask_cvt_roundss_sd(a, 0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_mask_cvt_roundss_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b); let e = _mm_set_pd(6., -1.5); assert_eq_m128d(r, e); } @@ -53824,10 +53639,10 @@ mod tests { unsafe fn test_mm_maskz_cvt_roundss_sd() { let a = _mm_set_pd(6., -7.5); let b = _mm_set_ps(0., -0.5, 1., -1.5); - let r = 
_mm_maskz_cvt_roundss_sd(0, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_maskz_cvt_roundss_sd::<_MM_FROUND_CUR_DIRECTION>(0, a, b); let e = _mm_set_pd(6., 0.); assert_eq_m128d(r, e); - let r = _mm_maskz_cvt_roundss_sd(0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_maskz_cvt_roundss_sd::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b); let e = _mm_set_pd(6., -1.5); assert_eq_m128d(r, e); } @@ -53996,7 +53811,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvtt_roundss_si32() { let a = _mm_set_ps(0., -0.5, 1., -1.5); - let r = _mm_cvtt_roundss_si32(a, _MM_FROUND_CUR_DIRECTION); + let r = _mm_cvtt_roundss_si32::<_MM_FROUND_CUR_DIRECTION>(a); let e: i32 = -2; assert_eq!(r, e); } @@ -54004,7 +53819,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvtt_roundss_i32() { let a = _mm_set_ps(0., -0.5, 1., -1.5); - let r = _mm_cvtt_roundss_i32(a, _MM_FROUND_CUR_DIRECTION); + let r = _mm_cvtt_roundss_i32::<_MM_FROUND_CUR_DIRECTION>(a); let e: i32 = -2; assert_eq!(r, e); } @@ -54012,7 +53827,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvtt_roundss_u32() { let a = _mm_set_ps(0., -0.5, 1., -1.5); - let r = _mm_cvtt_roundss_u32(a, _MM_FROUND_CUR_DIRECTION); + let r = _mm_cvtt_roundss_u32::<_MM_FROUND_CUR_DIRECTION>(a); let e: u32 = u32::MAX; assert_eq!(r, e); } @@ -54036,7 +53851,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvtt_roundsd_si32() { let a = _mm_set_pd(1., -1.5); - let r = _mm_cvtt_roundsd_si32(a, _MM_FROUND_CUR_DIRECTION); + let r = _mm_cvtt_roundsd_si32::<_MM_FROUND_CUR_DIRECTION>(a); let e: i32 = -2; assert_eq!(r, e); } @@ -54044,7 +53859,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvtt_roundsd_i32() { let a = _mm_set_pd(1., -1.5); - let r = _mm_cvtt_roundsd_i32(a, _MM_FROUND_CUR_DIRECTION); + let r = _mm_cvtt_roundsd_i32::<_MM_FROUND_CUR_DIRECTION>(a); let e: i32 = -2; assert_eq!(r, e); } @@ -54052,7 +53867,7 @@ mod tests { #[simd_test(enable = 
"avx512f")] unsafe fn test_mm_cvtt_roundsd_u32() { let a = _mm_set_pd(1., -1.5); - let r = _mm_cvtt_roundsd_u32(a, _MM_FROUND_CUR_DIRECTION); + let r = _mm_cvtt_roundsd_u32::<_MM_FROUND_CUR_DIRECTION>(a); let e: u32 = u32::MAX; assert_eq!(r, e); } diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index ae6202bc73..2db8a430d4 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -5090,7 +5090,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvtt_roundpd_epi32() { let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); - let r = _mm512_cvtt_roundpd_epi32(a, _MM_FROUND_NO_EXC); + let r = _mm512_cvtt_roundpd_epi32::<_MM_FROUND_NO_EXC>(a); let e = _mm256_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7); assert_eq_m256i(r, e); } @@ -5099,9 +5099,9 @@ mod tests { unsafe fn test_mm512_mask_cvtt_roundpd_epi32() { let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); let src = _mm256_set1_epi32(0); - let r = _mm512_mask_cvtt_roundpd_epi32(src, 0, a, _MM_FROUND_NO_EXC); + let r = _mm512_mask_cvtt_roundpd_epi32::<_MM_FROUND_NO_EXC>(src, 0, a); assert_eq_m256i(r, src); - let r = _mm512_mask_cvtt_roundpd_epi32(src, 0b00001111, a, _MM_FROUND_NO_EXC); + let r = _mm512_mask_cvtt_roundpd_epi32::<_MM_FROUND_NO_EXC>(src, 0b00001111, a); let e = _mm256_setr_epi32(0, -1, 2, -3, 0, 0, 0, 0); assert_eq_m256i(r, e); } @@ -5109,9 +5109,9 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_cvtt_roundpd_epi32() { let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); - let r = _mm512_maskz_cvtt_roundpd_epi32(0, a, _MM_FROUND_NO_EXC); + let r = _mm512_maskz_cvtt_roundpd_epi32::<_MM_FROUND_NO_EXC>(0, a); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm512_maskz_cvtt_roundpd_epi32(0b00001111, a, _MM_FROUND_NO_EXC); + let r = _mm512_maskz_cvtt_roundpd_epi32::<_MM_FROUND_NO_EXC>(0b00001111, a); let e = _mm256_setr_epi32(0, -1, 2, -3, 0, 0, 
0, 0); assert_eq_m256i(r, e); } @@ -5119,7 +5119,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvtt_roundpd_epu32() { let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); - let r = _mm512_cvtt_roundpd_epu32(a, _MM_FROUND_NO_EXC); + let r = _mm512_cvtt_roundpd_epu32::<_MM_FROUND_NO_EXC>(a); let e = _mm256_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1); assert_eq_m256i(r, e); } @@ -5128,9 +5128,9 @@ mod tests { unsafe fn test_mm512_mask_cvtt_roundpd_epu32() { let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); let src = _mm256_set1_epi32(0); - let r = _mm512_mask_cvtt_roundpd_epu32(src, 0, a, _MM_FROUND_NO_EXC); + let r = _mm512_mask_cvtt_roundpd_epu32::<_MM_FROUND_NO_EXC>(src, 0, a); assert_eq_m256i(r, src); - let r = _mm512_mask_cvtt_roundpd_epu32(src, 0b00001111, a, _MM_FROUND_NO_EXC); + let r = _mm512_mask_cvtt_roundpd_epu32::<_MM_FROUND_NO_EXC>(src, 0b00001111, a); let e = _mm256_setr_epi32(0, -1, 2, -1, 0, 0, 0, 0); assert_eq_m256i(r, e); } @@ -5138,9 +5138,9 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_cvtt_roundpd_epu32() { let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); - let r = _mm512_maskz_cvtt_roundpd_epu32(0, a, _MM_FROUND_NO_EXC); + let r = _mm512_maskz_cvtt_roundpd_epu32::<_MM_FROUND_NO_EXC>(0, a); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm512_maskz_cvtt_roundpd_epu32(0b00001111, a, _MM_FROUND_NO_EXC); + let r = _mm512_maskz_cvtt_roundpd_epu32::<_MM_FROUND_NO_EXC>(0b00001111, a); let e = _mm256_setr_epi32(0, -1, 2, -1, 0, 0, 0, 0); assert_eq_m256i(r, e); } @@ -6259,7 +6259,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvt_roundps_pd() { let a = _mm256_setr_ps(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); - let r = _mm512_cvt_roundps_pd(a, _MM_FROUND_CUR_DIRECTION); + let r = _mm512_cvt_roundps_pd::<_MM_FROUND_CUR_DIRECTION>(a); let e = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); assert_eq_m512d(r, e); } @@ -6268,9 
+6268,9 @@ mod tests { unsafe fn test_mm512_mask_cvt_roundps_pd() { let a = _mm256_setr_ps(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); let src = _mm512_set1_pd(0.); - let r = _mm512_mask_cvt_roundps_pd(src, 0, a, _MM_FROUND_CUR_DIRECTION); + let r = _mm512_mask_cvt_roundps_pd::<_MM_FROUND_CUR_DIRECTION>(src, 0, a); assert_eq_m512d(r, src); - let r = _mm512_mask_cvt_roundps_pd(src, 0b00001111, a, _MM_FROUND_CUR_DIRECTION); + let r = _mm512_mask_cvt_roundps_pd::<_MM_FROUND_CUR_DIRECTION>(src, 0b00001111, a); let e = _mm512_setr_pd(0., -1.5, 2., -3.5, 0., 0., 0., 0.); assert_eq_m512d(r, e); } @@ -6278,9 +6278,9 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_cvt_roundps_pd() { let a = _mm256_setr_ps(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); - let r = _mm512_maskz_cvt_roundps_pd(0, a, _MM_FROUND_CUR_DIRECTION); + let r = _mm512_maskz_cvt_roundps_pd::<_MM_FROUND_CUR_DIRECTION>(0, a); assert_eq_m512d(r, _mm512_setzero_pd()); - let r = _mm512_maskz_cvt_roundps_pd(0b00001111, a, _MM_FROUND_CUR_DIRECTION); + let r = _mm512_maskz_cvt_roundps_pd::<_MM_FROUND_CUR_DIRECTION>(0b00001111, a); let e = _mm512_setr_pd(0., -1.5, 2., -3.5, 0., 0., 0., 0.); assert_eq_m512d(r, e); } From 01945c1f74691fd5e6f38c04a3e8acf942a909a5 Mon Sep 17 00:00:00 2001 From: jironglin Date: Wed, 3 Mar 2021 17:02:33 +0000 Subject: [PATCH 003/123] shuffle_epi32 --- crates/core_arch/src/x86/avx512f.rs | 137 +++++++++------------------- 1 file changed, 44 insertions(+), 93 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index bcd826d700..c50bd73360 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -21523,75 +21523,32 @@ pub unsafe fn _mm_mask2_permutex2var_pd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_shuffle_epi32&expand=5150) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpermilps, imm8 = 9))] 
//should be vpshufd, but generate vpermilps -#[rustc_args_required_const(1)] -pub unsafe fn _mm512_shuffle_epi32(a: __m512i, imm8: _MM_PERM_ENUM) -> __m512i { - let imm8 = (imm8 & 0xFF) as u8; - let a = a.as_i32x16(); - macro_rules! shuffle4 { - ( - $a:expr, - $b:expr, - $c:expr, - $d:expr, - $e:expr, - $f:expr, - $g:expr, - $h:expr, - $i:expr, - $j:expr, - $k:expr, - $l:expr, - $m:expr, - $n:expr, - $o:expr, - $p:expr - ) => { - simd_shuffle16( - a, - a, - [ - $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p, - ], - ) - }; - } - macro_rules! shuffle3 { - ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr, $i:expr, $j:expr, $k:expr, $m:expr, $n:expr, $o:expr) => { - match (imm8 >> 6) & 0x3 { - 0 => shuffle4!($a, $b, $c, 16, $e, $f, $g, 20, $i, $j, $k, 24, $m, $n, $o, 28), - 1 => shuffle4!($a, $b, $c, 17, $e, $f, $g, 21, $i, $j, $k, 25, $m, $n, $o, 29), - 2 => shuffle4!($a, $b, $c, 18, $e, $f, $g, 22, $i, $j, $k, 26, $m, $n, $o, 30), - _ => shuffle4!($a, $b, $c, 19, $e, $f, $g, 23, $i, $j, $k, 27, $m, $n, $o, 31), - } - }; - } - macro_rules! shuffle2 { - ($a:expr, $b:expr, $e:expr, $f:expr, $i:expr, $j:expr, $m:expr, $n:expr) => { - match (imm8 >> 4) & 0x3 { - 0 => shuffle3!($a, $b, 16, $e, $f, 20, $i, $j, 24, $m, $n, 28), - 1 => shuffle3!($a, $b, 17, $e, $f, 21, $i, $j, 25, $m, $n, 29), - 2 => shuffle3!($a, $b, 18, $e, $f, 22, $i, $j, 26, $m, $n, 30), - _ => shuffle3!($a, $b, 19, $e, $f, 23, $i, $j, 27, $m, $n, 31), - } - }; - } - macro_rules! 
shuffle1 { - ($a:expr, $e:expr, $i: expr, $m: expr) => { - match (imm8 >> 2) & 0x3 { - 0 => shuffle2!($a, 0, $e, 4, $i, 8, $m, 12), - 1 => shuffle2!($a, 1, $e, 5, $i, 9, $m, 13), - 2 => shuffle2!($a, 2, $e, 6, $i, 10, $m, 14), - _ => shuffle2!($a, 3, $e, 7, $i, 11, $m, 15), - } - }; - } - let r: i32x16 = match imm8 & 0x3 { - 0 => shuffle1!(0, 4, 8, 12), - 1 => shuffle1!(1, 5, 9, 13), - 2 => shuffle1!(2, 6, 10, 14), - _ => shuffle1!(3, 7, 11, 15), - }; +#[cfg_attr(test, assert_instr(vpermilps, MASK = 9))] //should be vpshufd +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_shuffle_epi32(a: __m512i) -> __m512i { + static_assert_imm8!(MASK); + let r: i32x16 = simd_shuffle16( + a.as_i32x16(), + a.as_i32x16(), + [ + MASK as u32 & 0b11, + (MASK as u32 >> 2) & 0b11, + (MASK as u32 >> 4) & 0b11, + (MASK as u32 >> 6) & 0b11, + (MASK as u32 & 0b11) + 4, + ((MASK as u32 >> 2) & 0b11) + 4, + ((MASK as u32 >> 4) & 0b11) + 4, + ((MASK as u32 >> 6) & 0b11) + 4, + (MASK as u32 & 0b11) + 8, + ((MASK as u32 >> 2) & 0b11) + 8, + ((MASK as u32 >> 4) & 0b11) + 8, + ((MASK as u32 >> 6) & 0b11) + 8, + (MASK as u32 & 0b11) + 12, + ((MASK as u32 >> 2) & 0b11) + 12, + ((MASK as u32 >> 4) & 0b11) + 12, + ((MASK as u32 >> 6) & 0b11) + 12, + ], + ); transmute(r) } @@ -21600,20 +21557,15 @@ pub unsafe fn _mm512_shuffle_epi32(a: __m512i, imm8: _MM_PERM_ENUM) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_epi32&expand=5148) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpshufd, imm8 = 9))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_shuffle_epi32( +#[cfg_attr(test, assert_instr(vpshufd, MASK = 9))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_shuffle_epi32( src: __m512i, k: __mmask16, a: __m512i, - imm8: _MM_PERM_ENUM, ) -> __m512i { - macro_rules! 
call { - ($imm8:expr) => { - _mm512_shuffle_epi32(a, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + static_assert_imm8!(MASK); + let r = _mm512_shuffle_epi32::(a); transmute(simd_select_bitmask(k, r.as_i32x16(), src.as_i32x16())) } @@ -21622,15 +21574,14 @@ pub unsafe fn _mm512_mask_shuffle_epi32( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_epi32&expand=5149) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpshufd, imm8 = 9))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_shuffle_epi32(k: __mmask16, a: __m512i, imm8: _MM_PERM_ENUM) -> __m512i { - macro_rules! call { - ($imm8:expr) => { - _mm512_shuffle_epi32(a, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); +#[cfg_attr(test, assert_instr(vpshufd, MASK = 9))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_shuffle_epi32( + k: __mmask16, + a: __m512i, +) -> __m512i { + static_assert_imm8!(MASK); + let r = _mm512_shuffle_epi32::(a); let zero = _mm512_setzero_si512().as_i32x16(); transmute(simd_select_bitmask(k, r.as_i32x16(), zero)) } @@ -47705,7 +47656,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_shuffle_epi32() { let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16); - let r = _mm512_shuffle_epi32(a, _MM_PERM_AADD); + let r = _mm512_shuffle_epi32::<_MM_PERM_AADD>(a); let e = _mm512_setr_epi32(8, 8, 1, 1, 16, 16, 9, 9, 8, 8, 1, 1, 16, 16, 9, 9); assert_eq_m512i(r, e); } @@ -47713,9 +47664,9 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_shuffle_epi32() { let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16); - let r = _mm512_mask_shuffle_epi32(a, 0, a, _MM_PERM_AADD); + let r = _mm512_mask_shuffle_epi32::<_MM_PERM_AADD>(a, 0, a); assert_eq_m512i(r, a); - let r = _mm512_mask_shuffle_epi32(a, 0b11111111_11111111, a, _MM_PERM_AADD); + let r = 
_mm512_mask_shuffle_epi32::<_MM_PERM_AADD>(a, 0b11111111_11111111, a); let e = _mm512_setr_epi32(8, 8, 1, 1, 16, 16, 9, 9, 8, 8, 1, 1, 16, 16, 9, 9); assert_eq_m512i(r, e); } @@ -47723,9 +47674,9 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_shuffle_epi32() { let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16); - let r = _mm512_maskz_shuffle_epi32(0, a, _MM_PERM_AADD); + let r = _mm512_maskz_shuffle_epi32::<_MM_PERM_AADD>(0, a); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_shuffle_epi32(0b00000000_11111111, a, _MM_PERM_AADD); + let r = _mm512_maskz_shuffle_epi32::<_MM_PERM_AADD>(0b00000000_11111111, a); let e = _mm512_setr_epi32(8, 8, 1, 1, 16, 16, 9, 9, 0, 0, 0, 0, 0, 0, 0, 0); assert_eq_m512i(r, e); } From bfdba0503b465a62430d5330c3a511b9c67c11e5 Mon Sep 17 00:00:00 2001 From: jironglin Date: Wed, 3 Mar 2021 18:07:40 +0000 Subject: [PATCH 004/123] mm256_shuffle_epi32 --- crates/core_arch/src/x86/avx2.rs | 85 ++++++----------------------- crates/core_arch/src/x86/avx512f.rs | 40 ++++++-------- 2 files changed, 35 insertions(+), 90 deletions(-) diff --git a/crates/core_arch/src/x86/avx2.rs b/crates/core_arch/src/x86/avx2.rs index e1fa8bc9b9..ae15fc6db6 100644 --- a/crates/core_arch/src/x86/avx2.rs +++ b/crates/core_arch/src/x86/avx2.rs @@ -2642,74 +2642,25 @@ pub unsafe fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_epi32) #[inline] #[target_feature(enable = "avx2")] -#[cfg_attr(test, assert_instr(vpermilps, imm8 = 9))] -#[rustc_args_required_const(1)] +#[cfg_attr(test, assert_instr(vpermilps, MASK = 9))] +#[rustc_legacy_const_generics(1)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_shuffle_epi32(a: __m256i, imm8: i32) -> __m256i { - // simd_shuffleX requires that its selector parameter be made up of - // constant values, but we can't 
enforce that here. In spirit, we need - // to write a `match` on all possible values of a byte, and for each value, - // hard-code the correct `simd_shuffleX` call using only constants. We - // then hope for LLVM to do the rest. - // - // Of course, that's... awful. So we try to use macros to do it for us. - let imm8 = (imm8 & 0xFF) as u8; - - let a = a.as_i32x8(); - macro_rules! shuffle_done { - ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => { - simd_shuffle8( - a, - a, - [ - $x01, - $x23, - $x45, - $x67, - 4 + $x01, - 4 + $x23, - 4 + $x45, - 4 + $x67, - ], - ) - }; - } - macro_rules! shuffle_x67 { - ($x01:expr, $x23:expr, $x45:expr) => { - match (imm8 >> 6) & 0b11 { - 0b00 => shuffle_done!($x01, $x23, $x45, 0), - 0b01 => shuffle_done!($x01, $x23, $x45, 1), - 0b10 => shuffle_done!($x01, $x23, $x45, 2), - _ => shuffle_done!($x01, $x23, $x45, 3), - } - }; - } - macro_rules! shuffle_x45 { - ($x01:expr, $x23:expr) => { - match (imm8 >> 4) & 0b11 { - 0b00 => shuffle_x67!($x01, $x23, 0), - 0b01 => shuffle_x67!($x01, $x23, 1), - 0b10 => shuffle_x67!($x01, $x23, 2), - _ => shuffle_x67!($x01, $x23, 3), - } - }; - } - macro_rules! 
shuffle_x23 { - ($x01:expr) => { - match (imm8 >> 2) & 0b11 { - 0b00 => shuffle_x45!($x01, 0), - 0b01 => shuffle_x45!($x01, 1), - 0b10 => shuffle_x45!($x01, 2), - _ => shuffle_x45!($x01, 3), - } - }; - } - let r: i32x8 = match imm8 & 0b11 { - 0b00 => shuffle_x23!(0), - 0b01 => shuffle_x23!(1), - 0b10 => shuffle_x23!(2), - _ => shuffle_x23!(3), - }; +pub unsafe fn _mm256_shuffle_epi32(a: __m256i) -> __m256i { + static_assert_imm8!(MASK); + let r: i32x8 = simd_shuffle8( + a.as_i32x8(), + a.as_i32x8(), + [ + MASK as u32 & 0b11, + (MASK as u32 >> 2) & 0b11, + (MASK as u32 >> 4) & 0b11, + (MASK as u32 >> 6) & 0b11, + (MASK as u32 & 0b11) + 4, + ((MASK as u32 >> 2) & 0b11) + 4, + ((MASK as u32 >> 4) & 0b11) + 4, + ((MASK as u32 >> 6) & 0b11) + 4, + ], + ); transmute(r) } diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index c50bd73360..befa3047c6 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -21591,20 +21591,15 @@ pub unsafe fn _mm512_maskz_shuffle_epi32( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shuffle_epi32&expand=5145) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpshufd, imm8 = 9))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm256_mask_shuffle_epi32( +#[cfg_attr(test, assert_instr(vpshufd, MASK = 9))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm256_mask_shuffle_epi32( src: __m256i, k: __mmask8, a: __m256i, - imm8: _MM_PERM_ENUM, ) -> __m256i { - macro_rules! 
call { - ($imm8:expr) => { - _mm256_shuffle_epi32(a, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + static_assert_imm8!(MASK); + let r = _mm256_shuffle_epi32::(a); transmute(simd_select_bitmask(k, r.as_i32x8(), src.as_i32x8())) } @@ -21613,15 +21608,14 @@ pub unsafe fn _mm256_mask_shuffle_epi32( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shuffle_epi32&expand=5146) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpshufd, imm8 = 9))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm256_maskz_shuffle_epi32(k: __mmask8, a: __m256i, imm8: _MM_PERM_ENUM) -> __m256i { - macro_rules! call { - ($imm8:expr) => { - _mm256_shuffle_epi32(a, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); +#[cfg_attr(test, assert_instr(vpshufd, MASK = 9))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm256_maskz_shuffle_epi32( + k: __mmask8, + a: __m256i, +) -> __m256i { + static_assert_imm8!(MASK); + let r = _mm256_shuffle_epi32::(a); let zero = _mm256_setzero_si256().as_i32x8(); transmute(simd_select_bitmask(k, r.as_i32x8(), zero)) } @@ -47684,9 +47678,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_mask_shuffle_epi32() { let a = _mm256_set_epi32(1, 4, 5, 8, 9, 12, 13, 16); - let r = _mm256_mask_shuffle_epi32(a, 0, a, _MM_PERM_AADD); + let r = _mm256_mask_shuffle_epi32::<_MM_PERM_AADD>(a, 0, a); assert_eq_m256i(r, a); - let r = _mm256_mask_shuffle_epi32(a, 0b11111111, a, _MM_PERM_AADD); + let r = _mm256_mask_shuffle_epi32::<_MM_PERM_AADD>(a, 0b11111111, a); let e = _mm256_set_epi32(8, 8, 1, 1, 16, 16, 9, 9); assert_eq_m256i(r, e); } @@ -47694,9 +47688,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_maskz_shuffle_epi32() { let a = _mm256_set_epi32(1, 4, 5, 8, 9, 12, 13, 16); - let r = _mm256_maskz_shuffle_epi32(0, a, _MM_PERM_AADD); + let r = _mm256_maskz_shuffle_epi32::<_MM_PERM_AADD>(0, a); 
assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_shuffle_epi32(0b11111111, a, _MM_PERM_AADD); + let r = _mm256_maskz_shuffle_epi32::<_MM_PERM_AADD>(0b11111111, a); let e = _mm256_set_epi32(8, 8, 1, 1, 16, 16, 9, 9); assert_eq_m256i(r, e); } From 9f45c278806aeed6d04fa8159d45d62791752f35 Mon Sep 17 00:00:00 2001 From: jironglin Date: Wed, 3 Mar 2021 20:48:07 +0000 Subject: [PATCH 005/123] mm512_srai_epi32 --- crates/core_arch/src/x86/avx512f.rs | 100 ++++++++++++---------------- crates/core_arch/src/x86/macros.rs | 16 +++++ 2 files changed, 59 insertions(+), 57 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index befa3047c6..8f3c80e113 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -18338,16 +18338,12 @@ pub unsafe fn _mm_maskz_sra_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __ /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_srai_epi32&expand=5436) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpsrad, imm8 = 1))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm512_srai_epi32(a: __m512i, imm8: u32) -> __m512i { +#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_srai_epi32(a: __m512i) -> __m512i { + static_assert_imm8u!(IMM8); let a = a.as_i32x16(); - macro_rules! 
call { - ($imm8:expr) => { - vpsraid(a, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = vpsraid(a, IMM8); transmute(r) } @@ -18356,17 +18352,17 @@ pub unsafe fn _mm512_srai_epi32(a: __m512i, imm8: u32) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_srai_epi32&expand=5434) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpsrad, imm8 = 1))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_srai_epi32(src: __m512i, k: __mmask16, a: __m512i, imm8: u32) -> __m512i { +#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_srai_epi32( + src: __m512i, + k: __mmask16, + a: __m512i, +) -> __m512i { + static_assert_imm8u!(IMM8); let a = a.as_i32x16(); - macro_rules! call { - ($imm8:expr) => { - vpsraid(a, $imm8) - }; - } - let shf = constify_imm8_sae!(imm8, call); - transmute(simd_select_bitmask(k, shf, src.as_i32x16())) + let r = vpsraid(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i32x16())) } /// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -18374,18 +18370,14 @@ pub unsafe fn _mm512_mask_srai_epi32(src: __m512i, k: __mmask16, a: __m512i, imm /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_srai_epi32&expand=5435) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpsrad, imm8 = 1))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_srai_epi32(k: __mmask16, a: __m512i, imm8: u32) -> __m512i { +#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_srai_epi32(k: __mmask16, a: __m512i) -> __m512i { + static_assert_imm8u!(IMM8); let a = a.as_i32x16(); - macro_rules! 
call { - ($imm8:expr) => { - vpsraid(a, $imm8) - }; - } - let shf = constify_imm8_sae!(imm8, call); + let r = vpsraid(a, IMM8); let zero = _mm512_setzero_si512().as_i32x16(); - transmute(simd_select_bitmask(k, shf, zero)) + transmute(simd_select_bitmask(k, r, zero)) } /// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -21625,20 +21617,15 @@ pub unsafe fn _mm256_maskz_shuffle_epi32( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shuffle_epi32&expand=5142) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpshufd, imm8 = 9))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_mask_shuffle_epi32( +#[cfg_attr(test, assert_instr(vpshufd, MASK = 9))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_mask_shuffle_epi32( src: __m128i, k: __mmask8, a: __m128i, - imm8: _MM_PERM_ENUM, ) -> __m128i { - macro_rules! call { - ($imm8:expr) => { - _mm_shuffle_epi32::<$imm8>(a) - }; - } - let r = constify_imm8_sae!(imm8, call); + static_assert_imm8!(MASK); + let r = _mm_shuffle_epi32::(a); transmute(simd_select_bitmask(k, r.as_i32x4(), src.as_i32x4())) } @@ -21647,15 +21634,14 @@ pub unsafe fn _mm_mask_shuffle_epi32( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shuffle_epi32&expand=5143) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpshufd, imm8 = 9))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_maskz_shuffle_epi32(k: __mmask8, a: __m128i, imm8: _MM_PERM_ENUM) -> __m128i { - macro_rules! 
call { - ($imm8:expr) => { - _mm_shuffle_epi32::<$imm8>(a) - }; - } - let r = constify_imm8_sae!(imm8, call); +#[cfg_attr(test, assert_instr(vpshufd, MASK = 9))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_maskz_shuffle_epi32( + k: __mmask8, + a: __m128i, +) -> __m128i { + static_assert_imm8!(MASK); + let r = _mm_shuffle_epi32::(a); let zero = _mm_setzero_si128().as_i32x4(); transmute(simd_select_bitmask(k, r.as_i32x4(), zero)) } @@ -46913,7 +46899,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_srai_epi32() { let a = _mm512_set_epi32(8, -8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, -15); - let r = _mm512_srai_epi32(a, 2); + let r = _mm512_srai_epi32::<2>(a); let e = _mm512_set_epi32(2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, -4); assert_eq_m512i(r, e); } @@ -46921,9 +46907,9 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_srai_epi32() { let a = _mm512_set_epi32(8, -8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, -15); - let r = _mm512_mask_srai_epi32(a, 0, a, 2); + let r = _mm512_mask_srai_epi32::<2>(a, 0, a); assert_eq_m512i(r, a); - let r = _mm512_mask_srai_epi32(a, 0b11111111_11111111, a, 2); + let r = _mm512_mask_srai_epi32::<2>(a, 0b11111111_11111111, a); let e = _mm512_set_epi32(2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, -4); assert_eq_m512i(r, e); } @@ -46931,9 +46917,9 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_srai_epi32() { let a = _mm512_set_epi32(8, -8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, -15); - let r = _mm512_maskz_srai_epi32(0, a, 2); + let r = _mm512_maskz_srai_epi32::<2>(0, a); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_srai_epi32(0b00000000_11111111, a, 2); + let r = _mm512_maskz_srai_epi32::<2>(0b00000000_11111111, a); let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, -4); assert_eq_m512i(r, e); } @@ -47698,9 +47684,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn 
test_mm_mask_shuffle_epi32() { let a = _mm_set_epi32(1, 4, 5, 8); - let r = _mm_mask_shuffle_epi32(a, 0, a, _MM_PERM_AADD); + let r = _mm_mask_shuffle_epi32::<_MM_PERM_AADD>(a, 0, a); assert_eq_m128i(r, a); - let r = _mm_mask_shuffle_epi32(a, 0b00001111, a, _MM_PERM_AADD); + let r = _mm_mask_shuffle_epi32::<_MM_PERM_AADD>(a, 0b00001111, a); let e = _mm_set_epi32(8, 8, 1, 1); assert_eq_m128i(r, e); } @@ -47708,9 +47694,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm_maskz_shuffle_epi32() { let a = _mm_set_epi32(1, 4, 5, 8); - let r = _mm_maskz_shuffle_epi32(0, a, _MM_PERM_AADD); + let r = _mm_maskz_shuffle_epi32::<_MM_PERM_AADD>(0, a); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_shuffle_epi32(0b00001111, a, _MM_PERM_AADD); + let r = _mm_maskz_shuffle_epi32::<_MM_PERM_AADD>(0b00001111, a); let e = _mm_set_epi32(8, 8, 1, 1); assert_eq_m128i(r, e); } diff --git a/crates/core_arch/src/x86/macros.rs b/crates/core_arch/src/x86/macros.rs index e659ac3da8..ecb7085d18 100644 --- a/crates/core_arch/src/x86/macros.rs +++ b/crates/core_arch/src/x86/macros.rs @@ -32,6 +32,22 @@ macro_rules! static_assert_sae { }; } +// Helper struct used to trigger const eval errors when the const generic immediate value `imm` is +// out of `bits`-bit range. +pub(crate) struct ValidateConstImmU; +impl ValidateConstImmU { + pub(crate) const VALID: () = { + let _ = 1 / ((IMM < (1 << BITS)) as usize); + }; +} + +#[allow(unused)] +macro_rules! static_assert_imm8u { + ($imm:ident) => { + let _ = $crate::core_arch::x86::macros::ValidateConstImmU::<$imm, 8>::VALID; + }; +} + macro_rules! 
constify_imm6 { ($imm8:expr, $expand:ident) => { #[allow(overflowing_literals)] From bc193cdae13d63e375c81f73377b4b18dcdfd8c9 Mon Sep 17 00:00:00 2001 From: jironglin Date: Wed, 3 Mar 2021 21:01:18 +0000 Subject: [PATCH 006/123] fix macro --- crates/core_arch/src/x86/macros.rs | 40 ------------------------------ 1 file changed, 40 deletions(-) diff --git a/crates/core_arch/src/x86/macros.rs b/crates/core_arch/src/x86/macros.rs index ecb7085d18..1c02de24a7 100644 --- a/crates/core_arch/src/x86/macros.rs +++ b/crates/core_arch/src/x86/macros.rs @@ -48,46 +48,6 @@ macro_rules! static_assert_imm8u { }; } -macro_rules! constify_imm6 { - ($imm8:expr, $expand:ident) => { - #[allow(overflowing_literals)] - match ($imm8) & 0b1_1111 { - 0 => $expand!(0), - 1 => $expand!(1), - 2 => $expand!(2), - 3 => $expand!(3), - 4 => $expand!(4), - 5 => $expand!(5), - 6 => $expand!(6), - 7 => $expand!(7), - 8 => $expand!(8), - 9 => $expand!(9), - 10 => $expand!(10), - 11 => $expand!(11), - 12 => $expand!(12), - 13 => $expand!(13), - 14 => $expand!(14), - 15 => $expand!(15), - 16 => $expand!(16), - 17 => $expand!(17), - 18 => $expand!(18), - 19 => $expand!(19), - 20 => $expand!(20), - 21 => $expand!(21), - 22 => $expand!(22), - 23 => $expand!(23), - 24 => $expand!(24), - 25 => $expand!(25), - 26 => $expand!(26), - 27 => $expand!(27), - 28 => $expand!(28), - 29 => $expand!(29), - 30 => $expand!(30), - _ => $expand!(31), - } - }; -} - #[allow(unused_macros)] macro_rules! 
constify_imm4 { ($imm8:expr, $expand:ident) => { From 29debd5622d4c8f7b4331471e6ef4f73387e70d9 Mon Sep 17 00:00:00 2001 From: jironglin Date: Wed, 3 Mar 2021 23:09:01 +0000 Subject: [PATCH 007/123] shuffle_i32x4 --- crates/core_arch/src/x86/avx512f.rs | 223 +++++++++------------------- 1 file changed, 72 insertions(+), 151 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 8f3c80e113..cdc3f2d003 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -21909,78 +21909,34 @@ pub unsafe fn _mm_maskz_shuffle_pd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_i32&expand=5177) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vshufi64x2, imm8 = 0b10010101))] //should be vshufi32x4 -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_shuffle_i32x4(a: __m512i, b: __m512i, imm8: i32) -> __m512i { - assert!(imm8 >= 0 && imm8 <= 255); - let imm8 = (imm8 & 0xFF) as u8; +#[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b10010101))] //should be vshufi32x4 +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_shuffle_i32x4(a: __m512i, b: __m512i) -> __m512i { + static_assert_imm8!(MASK); let a = a.as_i32x16(); let b = b.as_i32x16(); - macro_rules! shuffle4 { - ( - $a:expr, - $b:expr, - $c:expr, - $d:expr, - $e:expr, - $f:expr, - $g:expr, - $h:expr, - $i:expr, - $j:expr, - $k:expr, - $l:expr, - $m:expr, - $n:expr, - $o:expr, - $p:expr - ) => { - simd_shuffle16( - a, - b, - [ - $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p, - ], - ) - }; - } - macro_rules! 
shuffle3 { - ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr, $i:expr, $j:expr, $k:expr, $m:expr, $n:expr, $o:expr) => { - match (imm8 >> 6) & 0x3 { - 0 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 16, 17, 18, 19), - 1 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 20, 21, 22, 23), - 2 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 24, 25, 26, 27), - _ => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 28, 29, 30, 31), - } - }; - } - macro_rules! shuffle2 { - ($a:expr, $b:expr, $e:expr, $f:expr, $i:expr, $j:expr, $m:expr, $n:expr) => { - match (imm8 >> 4) & 0x3 { - 0 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 16, 17, 18, 19), - 1 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 20, 21, 22, 23), - 2 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 24, 25, 26, 27), - _ => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 28, 29, 30, 31), - } - }; - } - macro_rules! shuffle1 { - ($a:expr, $e:expr, $i: expr, $m: expr) => { - match (imm8 >> 2) & 0x3 { - 0 => shuffle2!($a, $e, $i, $m, 0, 1, 2, 3), - 1 => shuffle2!($a, $e, $i, $m, 4, 5, 6, 7), - 2 => shuffle2!($a, $e, $i, $m, 8, 9, 10, 11), - _ => shuffle2!($a, $e, $i, $m, 12, 13, 14, 15), - } - }; - } - let r: i32x16 = match imm8 & 0x3 { - 0 => shuffle1!(0, 1, 2, 3), - 1 => shuffle1!(4, 5, 6, 7), - 2 => shuffle1!(8, 9, 10, 11), - _ => shuffle1!(12, 13, 14, 15), - }; - + let r: i32x16 = simd_shuffle16( + a, + b, + [ + (MASK as u32 & 0b11) * 4 + 0, + (MASK as u32 & 0b11) * 4 + 1, + (MASK as u32 & 0b11) * 4 + 2, + (MASK as u32 & 0b11) * 4 + 3, + ((MASK as u32 >> 2) & 0b11) * 4 + 0, + ((MASK as u32 >> 2) & 0b11) * 4 + 1, + ((MASK as u32 >> 2) & 0b11) * 4 + 2, + ((MASK as u32 >> 2) & 0b11) * 4 + 3, + ((MASK as u32 >> 4) & 0b11) * 4 + 0 + 16, + ((MASK as u32 >> 4) & 0b11) * 4 + 1 + 16, + ((MASK as u32 >> 4) & 0b11) * 4 + 2 + 16, + ((MASK as u32 >> 4) & 0b11) * 4 + 3 + 16, + ((MASK as u32 >> 6) & 0b11) * 4 + 0 + 16, + ((MASK as u32 >> 6) & 0b11) * 4 + 1 + 16, + ((MASK as u32 >> 6) & 
0b11) * 4 + 2 + 16, + ((MASK as u32 >> 6) & 0b11) * 4 + 3 + 16, + ], + ); transmute(r) } @@ -21989,21 +21945,15 @@ pub unsafe fn _mm512_shuffle_i32x4(a: __m512i, b: __m512i, imm8: i32) -> __m512i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_i32x&expand=5175) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vshufi32x4, imm8 = 0b10110101))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm512_mask_shuffle_i32x4( +#[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b10110101))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_shuffle_i32x4( src: __m512i, k: __mmask16, a: __m512i, b: __m512i, - imm8: i32, ) -> __m512i { - macro_rules! call { - ($imm8:expr) => { - _mm512_shuffle_i32x4(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = _mm512_shuffle_i32x4::(a, b); transmute(simd_select_bitmask(k, r.as_i32x16(), src.as_i32x16())) } @@ -22012,20 +21962,14 @@ pub unsafe fn _mm512_mask_shuffle_i32x4( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_i32&expand=5176) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vshufi32x4, imm8 = 0b10110101))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_maskz_shuffle_i32x4( +#[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b10110101))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_maskz_shuffle_i32x4( k: __mmask16, a: __m512i, b: __m512i, - imm8: i32, ) -> __m512i { - macro_rules! 
call { - ($imm8:expr) => { - _mm512_shuffle_i32x4(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = _mm512_shuffle_i32x4::(a, b); let zero = _mm512_setzero_si512().as_i32x16(); transmute(simd_select_bitmask(k, r.as_i32x16(), zero)) } @@ -22035,39 +21979,26 @@ pub unsafe fn _mm512_maskz_shuffle_i32x4( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_i32x4&expand=5174) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vperm, imm8 = 0b01))] //should be vshufi32x4 -#[rustc_args_required_const(2)] -pub unsafe fn _mm256_shuffle_i32x4(a: __m256i, b: __m256i, imm8: i32) -> __m256i { - assert!(imm8 >= 0 && imm8 <= 255); - let imm8 = (imm8 & 0xFF) as u8; +#[cfg_attr(test, assert_instr(vperm, MASK = 0b1001))] //should be vshufi32x4 +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm256_shuffle_i32x4(a: __m256i, b: __m256i) -> __m256i { + static_assert_imm8!(MASK); let a = a.as_i32x8(); let b = b.as_i32x8(); - macro_rules! shuffle2 { - ( - $a:expr, - $b:expr, - $c:expr, - $d:expr, - $e:expr, - $f:expr, - $g:expr, - $h:expr - ) => { - simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h]) - }; - } - macro_rules! 
shuffle1 { - ($a:expr, $b:expr, $c: expr, $d: expr) => { - match (imm8 >> 1) & 0x1 { - 0 => shuffle2!($a, $b, $c, $d, 8, 9, 10, 11), - _ => shuffle2!($a, $b, $c, $d, 12, 13, 14, 15), - } - }; - } - let r: i32x8 = match imm8 & 0x1 { - 0 => shuffle1!(0, 1, 2, 3), - _ => shuffle1!(4, 5, 6, 7), - }; + let r: i32x8 = simd_shuffle8( + a, + b, + [ + (MASK as u32 & 0b1) * 4 + 0, + (MASK as u32 & 0b1) * 4 + 1, + (MASK as u32 & 0b1) * 4 + 2, + (MASK as u32 & 0b1) * 4 + 3, + ((MASK as u32 >> 1) & 0b1) * 4 + 0 + 8, + ((MASK as u32 >> 1) & 0b1) * 4 + 1 + 8, + ((MASK as u32 >> 1) & 0b1) * 4 + 2 + 8, + ((MASK as u32 >> 1) & 0b1) * 4 + 3 + 8, + ], + ); transmute(r) } @@ -22076,21 +22007,16 @@ pub unsafe fn _mm256_shuffle_i32x4(a: __m256i, b: __m256i, imm8: i32) -> __m256i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shuffle_i32x4&expand=5172) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vshufi32x4, imm8 = 0b11))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm256_mask_shuffle_i32x4( +#[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b1101))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm256_mask_shuffle_i32x4( src: __m256i, k: __mmask8, a: __m256i, b: __m256i, - imm8: i32, ) -> __m256i { - macro_rules! 
call { - ($imm8:expr) => { - _mm256_shuffle_i32x4(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + static_assert_imm8!(MASK); + let r = _mm256_shuffle_i32x4::(a, b); transmute(simd_select_bitmask(k, r.as_i32x8(), src.as_i32x8())) } @@ -22099,20 +22025,15 @@ pub unsafe fn _mm256_mask_shuffle_i32x4( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shuffle_i32x4&expand=5173) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vshufi32x4, imm8 = 0b11))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm256_maskz_shuffle_i32x4( +#[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b1101))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm256_maskz_shuffle_i32x4( k: __mmask8, a: __m256i, b: __m256i, - imm8: i32, ) -> __m256i { - macro_rules! call { - ($imm8:expr) => { - _mm256_shuffle_i32x4(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + static_assert_imm8!(MASK); + let r = _mm256_shuffle_i32x4::(a, b); let zero = _mm256_setzero_si256().as_i32x8(); transmute(simd_select_bitmask(k, r.as_i32x8(), zero)) } @@ -47798,7 +47719,7 @@ mod tests { unsafe fn test_mm512_shuffle_i32x4() { let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16); let b = _mm512_setr_epi32(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15); - let r = _mm512_shuffle_i32x4(a, b, 0b0000); + let r = _mm512_shuffle_i32x4::<0b0000>(a, b); let e = _mm512_setr_epi32(1, 4, 5, 8, 1, 4, 5, 8, 2, 3, 6, 7, 2, 3, 6, 7); assert_eq_m512i(r, e); } @@ -47807,9 +47728,9 @@ mod tests { unsafe fn test_mm512_mask_shuffle_i32x4() { let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16); let b = _mm512_setr_epi32(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15); - let r = _mm512_mask_shuffle_i32x4(a, 0, a, b, 0b0000); + let r = _mm512_mask_shuffle_i32x4::<0b0000>(a, 0, a, b); assert_eq_m512i(r, a); - let r = _mm512_mask_shuffle_i32x4(a, 
0b11111111_11111111, a, b, 0b0000); + let r = _mm512_mask_shuffle_i32x4::<0b0000>(a, 0b11111111_11111111, a, b); let e = _mm512_setr_epi32(1, 4, 5, 8, 1, 4, 5, 8, 2, 3, 6, 7, 2, 3, 6, 7); assert_eq_m512i(r, e); } @@ -47818,9 +47739,9 @@ mod tests { unsafe fn test_mm512_maskz_shuffle_i32x4() { let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16); let b = _mm512_setr_epi32(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15); - let r = _mm512_maskz_shuffle_i32x4(0, a, b, 0b0000); + let r = _mm512_maskz_shuffle_i32x4::<0b0000>(0, a, b); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_shuffle_i32x4(0b00000000_11111111, a, b, 0b0000); + let r = _mm512_maskz_shuffle_i32x4::<0b0000>(0b00000000_11111111, a, b); let e = _mm512_setr_epi32(1, 4, 5, 8, 1, 4, 5, 8, 0, 0, 0, 0, 0, 0, 0, 0); assert_eq_m512i(r, e); } @@ -47829,7 +47750,7 @@ mod tests { unsafe fn test_mm256_shuffle_i32x4() { let a = _mm256_set_epi32(1, 4, 5, 8, 9, 12, 13, 16); let b = _mm256_set_epi32(2, 3, 6, 7, 10, 11, 14, 15); - let r = _mm256_shuffle_i32x4(a, b, 0b00); + let r = _mm256_shuffle_i32x4::<0b00>(a, b); let e = _mm256_set_epi32(10, 11, 14, 15, 9, 12, 13, 16); assert_eq_m256i(r, e); } @@ -47838,9 +47759,9 @@ mod tests { unsafe fn test_mm256_mask_shuffle_i32x4() { let a = _mm256_set_epi32(1, 4, 5, 8, 9, 12, 13, 16); let b = _mm256_set_epi32(2, 3, 6, 7, 10, 11, 14, 15); - let r = _mm256_mask_shuffle_i32x4(a, 0, a, b, 0b00); + let r = _mm256_mask_shuffle_i32x4::<0b00>(a, 0, a, b); assert_eq_m256i(r, a); - let r = _mm256_mask_shuffle_i32x4(a, 0b11111111, a, b, 0b00); + let r = _mm256_mask_shuffle_i32x4::<0b00>(a, 0b11111111, a, b); let e = _mm256_set_epi32(10, 11, 14, 15, 9, 12, 13, 16); assert_eq_m256i(r, e); } @@ -47849,9 +47770,9 @@ mod tests { unsafe fn test_mm256_maskz_shuffle_i32x4() { let a = _mm256_set_epi32(1, 4, 5, 8, 9, 12, 13, 16); let b = _mm256_set_epi32(2, 3, 6, 7, 10, 11, 14, 15); - let r = _mm256_maskz_shuffle_i32x4(0, a, b, 0b00); + let r 
= _mm256_maskz_shuffle_i32x4::<0b00>(0, a, b); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_shuffle_i32x4(0b11111111, a, b, 0b00); + let r = _mm256_maskz_shuffle_i32x4::<0b00>(0b11111111, a, b); let e = _mm256_set_epi32(10, 11, 14, 15, 9, 12, 13, 16); assert_eq_m256i(r, e); } From 2bef43e07032f0b2d6b5c2ea14e068676a856492 Mon Sep 17 00:00:00 2001 From: jironglin Date: Wed, 3 Mar 2021 23:44:49 +0000 Subject: [PATCH 008/123] shuffle_f32x4 --- crates/core_arch/src/x86/avx512f.rs | 254 ++++++++++------------------ 1 file changed, 94 insertions(+), 160 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index cdc3f2d003..b5d49b8677 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -21909,7 +21909,7 @@ pub unsafe fn _mm_maskz_shuffle_pd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_i32&expand=5177) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b10010101))] //should be vshufi32x4 +#[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b10_01_01_01))] //should be vshufi32x4 #[rustc_legacy_const_generics(2)] pub unsafe fn _mm512_shuffle_i32x4(a: __m512i, b: __m512i) -> __m512i { static_assert_imm8!(MASK); @@ -21945,7 +21945,7 @@ pub unsafe fn _mm512_shuffle_i32x4(a: __m512i, b: __m512i) -> _ /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_i32x&expand=5175) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b10110101))] +#[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b10_11_01_01))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask_shuffle_i32x4( src: __m512i, @@ -21962,7 +21962,7 @@ pub unsafe fn _mm512_mask_shuffle_i32x4( /// [Intel's 
documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_i32&expand=5176) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b10110101))] +#[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b10_11_01_01))] #[rustc_legacy_const_generics(3)] pub unsafe fn _mm512_maskz_shuffle_i32x4( k: __mmask16, @@ -21979,7 +21979,7 @@ pub unsafe fn _mm512_maskz_shuffle_i32x4( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_i32x4&expand=5174) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vperm, MASK = 0b1001))] //should be vshufi32x4 +#[cfg_attr(test, assert_instr(vperm, MASK = 0b11))] //should be vshufi32x4 #[rustc_legacy_const_generics(2)] pub unsafe fn _mm256_shuffle_i32x4(a: __m256i, b: __m256i) -> __m256i { static_assert_imm8!(MASK); @@ -22007,7 +22007,7 @@ pub unsafe fn _mm256_shuffle_i32x4(a: __m256i, b: __m256i) -> _ /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shuffle_i32x4&expand=5172) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b1101))] +#[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b11))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm256_mask_shuffle_i32x4( src: __m256i, @@ -22025,7 +22025,7 @@ pub unsafe fn _mm256_mask_shuffle_i32x4( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shuffle_i32x4&expand=5173) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b1101))] +#[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b11))] #[rustc_legacy_const_generics(3)] pub unsafe fn _mm256_maskz_shuffle_i32x4( k: __mmask8, @@ -22234,75 +22234,35 @@ pub unsafe fn _mm256_maskz_shuffle_i64x2( /// [Intel's 
documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_f32x4&expand=5165) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vshuff64x2, imm8 = 0b10111111))] //should be vshuff32x4, but generate vshuff64x2 -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_shuffle_f32x4(a: __m512, b: __m512, imm8: i32) -> __m512 { - assert!(imm8 >= 0 && imm8 <= 255); - let imm8 = (imm8 & 0xFF) as u8; - macro_rules! shuffle4 { - ( - $a:expr, - $b:expr, - $c:expr, - $d:expr, - $e:expr, - $f:expr, - $g:expr, - $h:expr, - $i:expr, - $j:expr, - $k:expr, - $l:expr, - $m:expr, - $n:expr, - $o:expr, - $p:expr - ) => { - simd_shuffle16( - a, - b, - [ - $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p, - ], - ) - }; - } - macro_rules! shuffle3 { - ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr, $i:expr, $j:expr, $k:expr, $m:expr, $n:expr, $o:expr) => { - match (imm8 >> 6) & 0x3 { - 0 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 16, 17, 18, 19), - 1 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 20, 21, 22, 23), - 2 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 24, 25, 26, 27), - _ => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 28, 29, 30, 31), - } - }; - } - macro_rules! shuffle2 { - ($a:expr, $b:expr, $e:expr, $f:expr, $i:expr, $j:expr, $m:expr, $n:expr) => { - match (imm8 >> 4) & 0x3 { - 0 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 16, 17, 18, 19), - 1 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 20, 21, 22, 23), - 2 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 24, 25, 26, 27), - _ => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 28, 29, 30, 31), - } - }; - } - macro_rules! 
shuffle1 { - ($a:expr, $e:expr, $i: expr, $m: expr) => { - match (imm8 >> 2) & 0x3 { - 0 => shuffle2!($a, $e, $i, $m, 0, 1, 2, 3), - 1 => shuffle2!($a, $e, $i, $m, 4, 5, 6, 7), - 2 => shuffle2!($a, $e, $i, $m, 8, 9, 10, 11), - _ => shuffle2!($a, $e, $i, $m, 12, 13, 14, 15), - } - }; - } - match imm8 & 0x3 { - 0 => shuffle1!(0, 1, 2, 3), - 1 => shuffle1!(4, 5, 6, 7), - 2 => shuffle1!(8, 9, 10, 11), - _ => shuffle1!(12, 13, 14, 15), - } +#[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b1011))] //should be vshuff32x4, but generate vshuff64x2 +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_shuffle_f32x4(a: __m512, b: __m512) -> __m512 { + static_assert_imm8!(MASK); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r: f32x16 = simd_shuffle16( + a, + b, + [ + (MASK as u32 & 0b11) * 4 + 0, + (MASK as u32 & 0b11) * 4 + 1, + (MASK as u32 & 0b11) * 4 + 2, + (MASK as u32 & 0b11) * 4 + 3, + ((MASK as u32 >> 2) & 0b11) * 4 + 0, + ((MASK as u32 >> 2) & 0b11) * 4 + 1, + ((MASK as u32 >> 2) & 0b11) * 4 + 2, + ((MASK as u32 >> 2) & 0b11) * 4 + 3, + ((MASK as u32 >> 4) & 0b11) * 4 + 0 + 16, + ((MASK as u32 >> 4) & 0b11) * 4 + 1 + 16, + ((MASK as u32 >> 4) & 0b11) * 4 + 2 + 16, + ((MASK as u32 >> 4) & 0b11) * 4 + 3 + 16, + ((MASK as u32 >> 6) & 0b11) * 4 + 0 + 16, + ((MASK as u32 >> 6) & 0b11) * 4 + 1 + 16, + ((MASK as u32 >> 6) & 0b11) * 4 + 2 + 16, + ((MASK as u32 >> 6) & 0b11) * 4 + 3 + 16, + ], + ); + transmute(r) } /// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -22310,21 +22270,16 @@ pub unsafe fn _mm512_shuffle_f32x4(a: __m512, b: __m512, imm8: i32) -> __m512 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_f32&expand=5163) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vshuff32x4, imm8 = 0b10111111))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm512_mask_shuffle_f32x4( +#[cfg_attr(test, assert_instr(vshuff32x4, MASK = 0b1011))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_shuffle_f32x4( src: __m512, k: __mmask16, a: __m512, b: __m512, - imm8: i32, ) -> __m512 { - macro_rules! call { - ($imm8:expr) => { - _mm512_shuffle_f32x4(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + static_assert_imm8!(MASK); + let r = _mm512_shuffle_f32x4::(a, b); transmute(simd_select_bitmask(k, r.as_f32x16(), src.as_f32x16())) } @@ -22333,15 +22288,15 @@ pub unsafe fn _mm512_mask_shuffle_f32x4( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_f32&expand=5164) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vshuff32x4, imm8 = 0b10111111))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_maskz_shuffle_f32x4(k: __mmask16, a: __m512, b: __m512, imm8: i32) -> __m512 { - macro_rules! 
call { - ($imm8:expr) => { - _mm512_shuffle_f32x4(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); +#[cfg_attr(test, assert_instr(vshuff32x4, MASK = 0b1011))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_maskz_shuffle_f32x4( + k: __mmask16, + a: __m512, + b: __m512, +) -> __m512 { + static_assert_imm8!(MASK); + let r = _mm512_shuffle_f32x4::(a, b); let zero = _mm512_setzero_ps().as_f32x16(); transmute(simd_select_bitmask(k, r.as_f32x16(), zero)) } @@ -22351,40 +22306,26 @@ pub unsafe fn _mm512_maskz_shuffle_f32x4(k: __mmask16, a: __m512, b: __m512, imm /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_f32x4&expand=5162) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vperm, imm8 = 0b01))] //should be vshuff32x4 -#[rustc_args_required_const(2)] -pub unsafe fn _mm256_shuffle_f32x4(a: __m256, b: __m256, imm8: i32) -> __m256 { - assert!(imm8 >= 0 && imm8 <= 255); - let imm8 = (imm8 & 0xFF) as u8; +#[cfg_attr(test, assert_instr(vperm, MASK = 0b01))] //should be vshuff32x4 +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm256_shuffle_f32x4(a: __m256, b: __m256) -> __m256 { + static_assert_imm8!(MASK); let a = a.as_f32x8(); let b = b.as_f32x8(); - macro_rules! shuffle2 { - ( - $a:expr, - $b:expr, - $c:expr, - $d:expr, - $e:expr, - $f:expr, - $g:expr, - $h:expr - ) => { - simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h]) - }; - } - macro_rules! 
shuffle1 { - ($a:expr, $b:expr, $c: expr, $d: expr) => { - match (imm8 >> 1) & 0x1 { - 0 => shuffle2!($a, $b, $c, $d, 8, 9, 10, 11), - _ => shuffle2!($a, $b, $c, $d, 12, 13, 14, 15), - } - }; - } - let r: f32x8 = match imm8 & 0x1 { - 0 => shuffle1!(0, 1, 2, 3), - _ => shuffle1!(4, 5, 6, 7), - }; - + let r: f32x8 = simd_shuffle8( + a, + b, + [ + (MASK as u32 & 0b1) * 4 + 0, + (MASK as u32 & 0b1) * 4 + 1, + (MASK as u32 & 0b1) * 4 + 2, + (MASK as u32 & 0b1) * 4 + 3, + ((MASK as u32 >> 1) & 0b1) * 4 + 0 + 8, + ((MASK as u32 >> 1) & 0b1) * 4 + 1 + 8, + ((MASK as u32 >> 1) & 0b1) * 4 + 2 + 8, + ((MASK as u32 >> 1) & 0b1) * 4 + 3 + 8, + ], + ); transmute(r) } @@ -22393,21 +22334,15 @@ pub unsafe fn _mm256_shuffle_f32x4(a: __m256, b: __m256, imm8: i32) -> __m256 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shuffle_f32x4&expand=5160) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vshuff32x4, imm8 = 0b11))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm256_mask_shuffle_f32x4( +#[cfg_attr(test, assert_instr(vshuff32x4, MASK = 0b11))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm256_mask_shuffle_f32x4( src: __m256, k: __mmask8, a: __m256, b: __m256, - imm8: i32, ) -> __m256 { - macro_rules! 
call { - ($imm8:expr) => { - _mm256_shuffle_f32x4(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = _mm256_shuffle_f32x4::(a, b); transmute(simd_select_bitmask(k, r.as_f32x8(), src.as_f32x8())) } @@ -22416,15 +22351,14 @@ pub unsafe fn _mm256_mask_shuffle_f32x4( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shuffle_f32x4&expand=5161) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vshuff32x4, imm8 = 0b11))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm256_maskz_shuffle_f32x4(k: __mmask8, a: __m256, b: __m256, imm8: i32) -> __m256 { - macro_rules! call { - ($imm8:expr) => { - _mm256_shuffle_f32x4(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); +#[cfg_attr(test, assert_instr(vshuff32x4, MASK = 0b11))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm256_maskz_shuffle_f32x4( + k: __mmask8, + a: __m256, + b: __m256, +) -> __m256 { + let r = _mm256_shuffle_f32x4::(a, b); let zero = _mm256_setzero_ps().as_f32x8(); transmute(simd_select_bitmask(k, r.as_f32x8(), zero)) } @@ -47719,7 +47653,7 @@ mod tests { unsafe fn test_mm512_shuffle_i32x4() { let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16); let b = _mm512_setr_epi32(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15); - let r = _mm512_shuffle_i32x4::<0b0000>(a, b); + let r = _mm512_shuffle_i32x4::<0b00_00_00_00>(a, b); let e = _mm512_setr_epi32(1, 4, 5, 8, 1, 4, 5, 8, 2, 3, 6, 7, 2, 3, 6, 7); assert_eq_m512i(r, e); } @@ -47728,9 +47662,9 @@ mod tests { unsafe fn test_mm512_mask_shuffle_i32x4() { let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16); let b = _mm512_setr_epi32(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15); - let r = _mm512_mask_shuffle_i32x4::<0b0000>(a, 0, a, b); + let r = _mm512_mask_shuffle_i32x4::<0b00_00_00_00>(a, 0, a, b); assert_eq_m512i(r, a); - let r = 
_mm512_mask_shuffle_i32x4::<0b0000>(a, 0b11111111_11111111, a, b); + let r = _mm512_mask_shuffle_i32x4::<0b00_00_00_00>(a, 0b11111111_11111111, a, b); let e = _mm512_setr_epi32(1, 4, 5, 8, 1, 4, 5, 8, 2, 3, 6, 7, 2, 3, 6, 7); assert_eq_m512i(r, e); } @@ -47739,9 +47673,9 @@ mod tests { unsafe fn test_mm512_maskz_shuffle_i32x4() { let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16); let b = _mm512_setr_epi32(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15); - let r = _mm512_maskz_shuffle_i32x4::<0b0000>(0, a, b); + let r = _mm512_maskz_shuffle_i32x4::<0b00_00_00_00>(0, a, b); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_shuffle_i32x4::<0b0000>(0b00000000_11111111, a, b); + let r = _mm512_maskz_shuffle_i32x4::<0b00_00_00_00>(0b00000000_11111111, a, b); let e = _mm512_setr_epi32(1, 4, 5, 8, 1, 4, 5, 8, 0, 0, 0, 0, 0, 0, 0, 0); assert_eq_m512i(r, e); } @@ -47785,7 +47719,7 @@ mod tests { let b = _mm512_setr_ps( 2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15., ); - let r = _mm512_shuffle_f32x4(a, b, 0b00000000); + let r = _mm512_shuffle_f32x4::<0b00_00_00_00>(a, b); let e = _mm512_setr_ps( 1., 4., 5., 8., 1., 4., 5., 8., 2., 3., 6., 7., 2., 3., 6., 7., ); @@ -47800,9 +47734,9 @@ mod tests { let b = _mm512_setr_ps( 2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15., ); - let r = _mm512_mask_shuffle_f32x4(a, 0, a, b, 0b00000000); + let r = _mm512_mask_shuffle_f32x4::<0b00_00_00_00>(a, 0, a, b); assert_eq_m512(r, a); - let r = _mm512_mask_shuffle_f32x4(a, 0b11111111_11111111, a, b, 0b00000000); + let r = _mm512_mask_shuffle_f32x4::<0b00_00_00_00>(a, 0b11111111_11111111, a, b); let e = _mm512_setr_ps( 1., 4., 5., 8., 1., 4., 5., 8., 2., 3., 6., 7., 2., 3., 6., 7., ); @@ -47817,9 +47751,9 @@ mod tests { let b = _mm512_setr_ps( 2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15., ); - let r = _mm512_maskz_shuffle_f32x4(0, a, b, 0b00000000); + let r = 
_mm512_maskz_shuffle_f32x4::<0b00_00_00_00>(0, a, b); assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_shuffle_f32x4(0b00000000_11111111, a, b, 0b00000000); + let r = _mm512_maskz_shuffle_f32x4::<0b00_00_00_00>(0b00000000_11111111, a, b); let e = _mm512_setr_ps( 1., 4., 5., 8., 1., 4., 5., 8., 0., 0., 0., 0., 0., 0., 0., 0., ); @@ -47830,7 +47764,7 @@ mod tests { unsafe fn test_mm256_shuffle_f32x4() { let a = _mm256_set_ps(1., 4., 5., 8., 9., 12., 13., 16.); let b = _mm256_set_ps(2., 3., 6., 7., 10., 11., 14., 15.); - let r = _mm256_shuffle_f32x4(a, b, 0b00); + let r = _mm256_shuffle_f32x4::<0b00>(a, b); let e = _mm256_set_ps(10., 11., 14., 15., 9., 12., 13., 16.); assert_eq_m256(r, e); } @@ -47839,9 +47773,9 @@ mod tests { unsafe fn test_mm256_mask_shuffle_f32x4() { let a = _mm256_set_ps(1., 4., 5., 8., 9., 12., 13., 16.); let b = _mm256_set_ps(2., 3., 6., 7., 10., 11., 14., 15.); - let r = _mm256_mask_shuffle_f32x4(a, 0, a, b, 0b00); + let r = _mm256_mask_shuffle_f32x4::<0b00>(a, 0, a, b); assert_eq_m256(r, a); - let r = _mm256_mask_shuffle_f32x4(a, 0b11111111, a, b, 0b00); + let r = _mm256_mask_shuffle_f32x4::<0b00>(a, 0b11111111, a, b); let e = _mm256_set_ps(10., 11., 14., 15., 9., 12., 13., 16.); assert_eq_m256(r, e); } @@ -47850,9 +47784,9 @@ mod tests { unsafe fn test_mm256_maskz_shuffle_f32x4() { let a = _mm256_set_ps(1., 4., 5., 8., 9., 12., 13., 16.); let b = _mm256_set_ps(2., 3., 6., 7., 10., 11., 14., 15.); - let r = _mm256_maskz_shuffle_f32x4(0, a, b, 0b00); + let r = _mm256_maskz_shuffle_f32x4::<0b00>(0, a, b); assert_eq_m256(r, _mm256_setzero_ps()); - let r = _mm256_maskz_shuffle_f32x4(0b11111111, a, b, 0b00); + let r = _mm256_maskz_shuffle_f32x4::<0b00>(0b11111111, a, b); let e = _mm256_set_ps(10., 11., 14., 15., 9., 12., 13., 16.); assert_eq_m256(r, e); } From fa7c938465ae49b8479e959073be38b0496bb465 Mon Sep 17 00:00:00 2001 From: jironglin Date: Thu, 4 Mar 2021 00:18:12 +0000 Subject: [PATCH 009/123] shuffle_i64x2 --- 
crates/core_arch/src/x86/avx512f.rs | 173 ++++++++----------------- crates/core_arch/src/x86_64/avx512f.rs | 20 +-- 2 files changed, 61 insertions(+), 132 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index b5d49b8677..c00dbaea21 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -22043,61 +22043,27 @@ pub unsafe fn _mm256_maskz_shuffle_i32x4( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_i64x2&expand=5183) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vshufi64x2, imm8 = 0b10111111))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_shuffle_i64x2(a: __m512i, b: __m512i, imm8: i32) -> __m512i { - assert!(imm8 >= 0 && imm8 <= 255); - let imm8 = (imm8 & 0xFF) as u8; - macro_rules! shuffle4 { - ( - $a:expr, - $b:expr, - $c:expr, - $d:expr, - $e:expr, - $f:expr, - $g:expr, - $h:expr - ) => { - simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h]) - }; - } - macro_rules! shuffle3 { - ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr) => { - match (imm8 >> 6) & 0x3 { - 0 => shuffle4!($a, $b, $c, $e, $f, $g, 8, 9), - 1 => shuffle4!($a, $b, $c, $e, $f, $g, 10, 11), - 2 => shuffle4!($a, $b, $c, $e, $f, $g, 12, 13), - _ => shuffle4!($a, $b, $c, $e, $f, $g, 14, 15), - } - }; - } - macro_rules! shuffle2 { - ($a:expr, $b:expr, $e:expr, $f:expr) => { - match (imm8 >> 4) & 0x3 { - 0 => shuffle3!($a, $b, $e, $f, 8, 9), - 1 => shuffle3!($a, $b, $e, $f, 10, 11), - 2 => shuffle3!($a, $b, $e, $f, 12, 13), - _ => shuffle3!($a, $b, $e, $f, 14, 15), - } - }; - } - macro_rules! 
shuffle1 { - ($a:expr, $e:expr) => { - match (imm8 >> 2) & 0x3 { - 0 => shuffle2!($a, $e, 0, 1), - 1 => shuffle2!($a, $e, 2, 3), - 2 => shuffle2!($a, $e, 4, 5), - _ => shuffle2!($a, $e, 6, 7), - } - }; - } - match imm8 & 0x3 { - 0 => shuffle1!(0, 1), - 1 => shuffle1!(2, 3), - 2 => shuffle1!(4, 5), - _ => shuffle1!(6, 7), - } +#[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b10_11_11_11))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_shuffle_i64x2(a: __m512i, b: __m512i) -> __m512i { + static_assert_imm8!(MASK); + let a = a.as_i64x8(); + let b = b.as_i64x8(); + let r: i64x8 = simd_shuffle8( + a, + b, + [ + (MASK as u32 & 0b11) * 2 + 0, + (MASK as u32 & 0b11) * 2 + 1, + ((MASK as u32 >> 2) & 0b11) * 2 + 0, + ((MASK as u32 >> 2) & 0b11) * 2 + 1, + ((MASK as u32 >> 4) & 0b11) * 2 + 0 + 8, + ((MASK as u32 >> 4) & 0b11) * 2 + 1 + 8, + ((MASK as u32 >> 6) & 0b11) * 2 + 0 + 8, + ((MASK as u32 >> 6) & 0b11) * 2 + 1 + 8, + ], + ); + transmute(r) } /// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -22105,21 +22071,15 @@ pub unsafe fn _mm512_shuffle_i64x2(a: __m512i, b: __m512i, imm8: i32) -> __m512i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_i64x&expand=5181) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vshufi64x2, imm8 = 0b10111111))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm512_mask_shuffle_i64x2( +#[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b10_11_11_11))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_shuffle_i64x2( src: __m512i, k: __mmask8, a: __m512i, b: __m512i, - imm8: i32, ) -> __m512i { - macro_rules! 
call { - ($imm8:expr) => { - _mm512_shuffle_i64x2(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = _mm512_shuffle_i64x2::(a, b); transmute(simd_select_bitmask(k, r.as_i64x8(), src.as_i64x8())) } @@ -22128,20 +22088,14 @@ pub unsafe fn _mm512_mask_shuffle_i64x2( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_i64&expand=5182) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vshufi64x2, imm8 = 0b10111111))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_maskz_shuffle_i64x2( +#[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b10_11_11_11))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_maskz_shuffle_i64x2( k: __mmask8, a: __m512i, b: __m512i, - imm8: i32, ) -> __m512i { - macro_rules! call { - ($imm8:expr) => { - _mm512_shuffle_i64x2(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = _mm512_shuffle_i64x2::(a, b); let zero = _mm512_setzero_si512().as_i64x8(); transmute(simd_select_bitmask(k, r.as_i64x8(), zero)) } @@ -22151,35 +22105,22 @@ pub unsafe fn _mm512_maskz_shuffle_i64x2( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_i64x2&expand=5180) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vperm, imm8 = 0b01))] //should be vshufi64x2 -#[rustc_args_required_const(2)] -pub unsafe fn _mm256_shuffle_i64x2(a: __m256i, b: __m256i, imm8: i32) -> __m256i { - assert!(imm8 >= 0 && imm8 <= 255); - let imm8 = (imm8 & 0xFF) as u8; +#[cfg_attr(test, assert_instr(vperm, MASK = 0b01))] //should be vshufi64x2 +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm256_shuffle_i64x2(a: __m256i, b: __m256i) -> __m256i { + static_assert_imm8!(MASK); let a = a.as_i64x4(); let b = b.as_i64x4(); - macro_rules! 
shuffle2 { - ( - $a:expr, - $b:expr, - $c:expr, - $d:expr - ) => { - simd_shuffle4(a, b, [$a, $b, $c, $d]) - }; - } - macro_rules! shuffle1 { - ($a:expr, $b:expr) => { - match (imm8 >> 1) & 0x1 { - 0 => shuffle2!($a, $b, 4, 5), - _ => shuffle2!($a, $b, 6, 7), - } - }; - } - let r: i64x4 = match imm8 & 0x1 { - 0 => shuffle1!(0, 1), - _ => shuffle1!(2, 3), - }; + let r: i64x4 = simd_shuffle4( + a, + b, + [ + (MASK as u32 & 0b1) * 2 + 0, + (MASK as u32 & 0b1) * 2 + 1, + ((MASK as u32 >> 1) & 0b1) * 2 + 0 + 4, + ((MASK as u32 >> 1) & 0b1) * 2 + 1 + 4, + ], + ); transmute(r) } @@ -22188,21 +22129,15 @@ pub unsafe fn _mm256_shuffle_i64x2(a: __m256i, b: __m256i, imm8: i32) -> __m256i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shuffle_i64x2&expand=5178) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vshufi64x2, imm8 = 0b11))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm256_mask_shuffle_i64x2( +#[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b11))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm256_mask_shuffle_i64x2( src: __m256i, k: __mmask8, a: __m256i, b: __m256i, - imm8: i32, ) -> __m256i { - macro_rules! 
call { - ($imm8:expr) => { - _mm256_shuffle_i64x2(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = _mm256_shuffle_i64x2::(a, b); transmute(simd_select_bitmask(k, r.as_i64x4(), src.as_i64x4())) } @@ -22211,20 +22146,14 @@ pub unsafe fn _mm256_mask_shuffle_i64x2( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shuffle_i64x2&expand=5179) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vshufi64x2, imm8 = 0b11))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm256_maskz_shuffle_i64x2( +#[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b11))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm256_maskz_shuffle_i64x2( k: __mmask8, a: __m256i, b: __m256i, - imm8: i32, ) -> __m256i { - macro_rules! call { - ($imm8:expr) => { - _mm256_shuffle_i64x2(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = _mm256_shuffle_i64x2::(a, b); let zero = _mm256_setzero_si256().as_i64x4(); transmute(simd_select_bitmask(k, r.as_i64x4(), zero)) } diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 2db8a430d4..6d816b86c3 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -9658,7 +9658,7 @@ mod tests { unsafe fn test_mm512_shuffle_i64x2() { let a = _mm512_setr_epi64(1, 4, 5, 8, 9, 12, 13, 16); let b = _mm512_setr_epi64(2, 3, 6, 7, 10, 11, 14, 15); - let r = _mm512_shuffle_i64x2(a, b, 0b00000000); + let r = _mm512_shuffle_i64x2::<0b00_00_00_00>(a, b); let e = _mm512_setr_epi64(1, 4, 1, 4, 2, 3, 2, 3); assert_eq_m512i(r, e); } @@ -9667,9 +9667,9 @@ mod tests { unsafe fn test_mm512_mask_shuffle_i64x2() { let a = _mm512_setr_epi64(1, 4, 5, 8, 9, 12, 13, 16); let b = _mm512_setr_epi64(2, 3, 6, 7, 10, 11, 14, 15); - let r = _mm512_mask_shuffle_i64x2(a, 0, a, b, 0b00000000); + let r = _mm512_mask_shuffle_i64x2::<0b00_00_00_00>(a, 0, a, b); assert_eq_m512i(r, 
a); - let r = _mm512_mask_shuffle_i64x2(a, 0b11111111, a, b, 0b00000000); + let r = _mm512_mask_shuffle_i64x2::<0b00_00_00_00>(a, 0b11111111, a, b); let e = _mm512_setr_epi64(1, 4, 1, 4, 2, 3, 2, 3); assert_eq_m512i(r, e); } @@ -9678,9 +9678,9 @@ mod tests { unsafe fn test_mm512_maskz_shuffle_i64x2() { let a = _mm512_setr_epi64(1, 4, 5, 8, 9, 12, 13, 16); let b = _mm512_setr_epi64(2, 3, 6, 7, 10, 11, 14, 15); - let r = _mm512_maskz_shuffle_i64x2(0, a, b, 0b00000000); + let r = _mm512_maskz_shuffle_i64x2::<0b00_00_00_00>(0, a, b); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_shuffle_i64x2(0b00001111, a, b, 0b00000000); + let r = _mm512_maskz_shuffle_i64x2::<0b00_00_00_00>(0b00001111, a, b); let e = _mm512_setr_epi64(1, 4, 1, 4, 0, 0, 0, 0); assert_eq_m512i(r, e); } @@ -9689,7 +9689,7 @@ mod tests { unsafe fn test_mm256_shuffle_i64x2() { let a = _mm256_set_epi64x(1, 4, 5, 8); let b = _mm256_set_epi64x(2, 3, 6, 7); - let r = _mm256_shuffle_i64x2(a, b, 0b00); + let r = _mm256_shuffle_i64x2::<0b00>(a, b); let e = _mm256_set_epi64x(6, 7, 5, 8); assert_eq_m256i(r, e); } @@ -9698,9 +9698,9 @@ mod tests { unsafe fn test_mm256_mask_shuffle_i64x2() { let a = _mm256_set_epi64x(1, 4, 5, 8); let b = _mm256_set_epi64x(2, 3, 6, 7); - let r = _mm256_mask_shuffle_i64x2(a, 0, a, b, 0b00); + let r = _mm256_mask_shuffle_i64x2::<0b00>(a, 0, a, b); assert_eq_m256i(r, a); - let r = _mm256_mask_shuffle_i64x2(a, 0b00001111, a, b, 0b00); + let r = _mm256_mask_shuffle_i64x2::<0b00>(a, 0b00001111, a, b); let e = _mm256_set_epi64x(6, 7, 5, 8); assert_eq_m256i(r, e); } @@ -9709,9 +9709,9 @@ mod tests { unsafe fn test_mm256_maskz_shuffle_i64x2() { let a = _mm256_set_epi64x(1, 4, 5, 8); let b = _mm256_set_epi64x(2, 3, 6, 7); - let r = _mm256_maskz_shuffle_i64x2(0, a, b, 0b00); + let r = _mm256_maskz_shuffle_i64x2::<0b00>(0, a, b); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_shuffle_i64x2(0b00001111, a, b, 0b00); + let r = 
_mm256_maskz_shuffle_i64x2::<0b00>(0b00001111, a, b); let e = _mm256_set_epi64x(6, 7, 5, 8); assert_eq_m256i(r, e); } From b8c1bd7cedfd211333f8eb02a55d737bae7261bf Mon Sep 17 00:00:00 2001 From: jironglin Date: Thu, 4 Mar 2021 00:41:22 +0000 Subject: [PATCH 010/123] shuffle_f64x2 --- crates/core_arch/src/x86/avx512f.rs | 197 ++++++++++--------------- crates/core_arch/src/x86_64/avx512f.rs | 20 +-- 2 files changed, 85 insertions(+), 132 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index c00dbaea21..5abe23e093 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -21692,6 +21692,7 @@ pub unsafe fn _mm512_mask_shuffle_ps( a: __m512, b: __m512, ) -> __m512 { + static_assert_imm8!(MASK); let r = _mm512_shuffle_ps::(a, b); transmute(simd_select_bitmask(k, r.as_f32x16(), src.as_f32x16())) } @@ -21708,6 +21709,7 @@ pub unsafe fn _mm512_maskz_shuffle_ps( a: __m512, b: __m512, ) -> __m512 { + static_assert_imm8!(MASK); let r = _mm512_shuffle_ps::(a, b); let zero = _mm512_setzero_ps().as_f32x16(); transmute(simd_select_bitmask(k, r.as_f32x16(), zero)) @@ -21726,6 +21728,7 @@ pub unsafe fn _mm256_mask_shuffle_ps( a: __m256, b: __m256, ) -> __m256 { + static_assert_imm8!(MASK); let r = _mm256_shuffle_ps::(a, b); transmute(simd_select_bitmask(k, r.as_f32x8(), src.as_f32x8())) } @@ -21742,6 +21745,7 @@ pub unsafe fn _mm256_maskz_shuffle_ps( a: __m256, b: __m256, ) -> __m256 { + static_assert_imm8!(MASK); let r = _mm256_shuffle_ps::(a, b); let zero = _mm256_setzero_ps().as_f32x8(); transmute(simd_select_bitmask(k, r.as_f32x8(), zero)) @@ -21760,6 +21764,7 @@ pub unsafe fn _mm_mask_shuffle_ps( a: __m128, b: __m128, ) -> __m128 { + static_assert_imm8!(MASK); let r = _mm_shuffle_ps::(a, b); transmute(simd_select_bitmask(k, r.as_f32x4(), src.as_f32x4())) } @@ -21772,6 +21777,7 @@ pub unsafe fn _mm_mask_shuffle_ps( #[cfg_attr(test, assert_instr(vshufps, MASK = 3))] 
#[rustc_legacy_const_generics(3)] pub unsafe fn _mm_maskz_shuffle_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + static_assert_imm8!(MASK); let r = _mm_shuffle_ps::(a, b); let zero = _mm_setzero_ps().as_f32x4(); transmute(simd_select_bitmask(k, r.as_f32x4(), zero)) @@ -21815,6 +21821,7 @@ pub unsafe fn _mm512_mask_shuffle_pd( a: __m512d, b: __m512d, ) -> __m512d { + static_assert_imm8!(MASK); let r = _mm512_shuffle_pd::(a, b); transmute(simd_select_bitmask(k, r.as_f64x8(), src.as_f64x8())) } @@ -21831,6 +21838,7 @@ pub unsafe fn _mm512_maskz_shuffle_pd( a: __m512d, b: __m512d, ) -> __m512d { + static_assert_imm8!(MASK); let r = _mm512_shuffle_pd::(a, b); let zero = _mm512_setzero_pd().as_f64x8(); transmute(simd_select_bitmask(k, r.as_f64x8(), zero)) @@ -21849,6 +21857,7 @@ pub unsafe fn _mm256_mask_shuffle_pd( a: __m256d, b: __m256d, ) -> __m256d { + static_assert_imm8!(MASK); let r = _mm256_shuffle_pd::(a, b); transmute(simd_select_bitmask(k, r.as_f64x4(), src.as_f64x4())) } @@ -21865,6 +21874,7 @@ pub unsafe fn _mm256_maskz_shuffle_pd( a: __m256d, b: __m256d, ) -> __m256d { + static_assert_imm8!(MASK); let r = _mm256_shuffle_pd::(a, b); let zero = _mm256_setzero_pd().as_f64x4(); transmute(simd_select_bitmask(k, r.as_f64x4(), zero)) @@ -21883,6 +21893,7 @@ pub unsafe fn _mm_mask_shuffle_pd( a: __m128d, b: __m128d, ) -> __m128d { + static_assert_imm8!(MASK); let r = _mm_shuffle_pd::(a, b); transmute(simd_select_bitmask(k, r.as_f64x2(), src.as_f64x2())) } @@ -21899,6 +21910,7 @@ pub unsafe fn _mm_maskz_shuffle_pd( a: __m128d, b: __m128d, ) -> __m128d { + static_assert_imm8!(MASK); let r = _mm_shuffle_pd::(a, b); let zero = _mm_setzero_pd().as_f64x2(); transmute(simd_select_bitmask(k, r.as_f64x2(), zero)) @@ -21953,6 +21965,7 @@ pub unsafe fn _mm512_mask_shuffle_i32x4( a: __m512i, b: __m512i, ) -> __m512i { + static_assert_imm8!(MASK); let r = _mm512_shuffle_i32x4::(a, b); transmute(simd_select_bitmask(k, r.as_i32x16(), src.as_i32x16())) } @@ -21969,6 +21982,7 
@@ pub unsafe fn _mm512_maskz_shuffle_i32x4( a: __m512i, b: __m512i, ) -> __m512i { + static_assert_imm8!(MASK); let r = _mm512_shuffle_i32x4::(a, b); let zero = _mm512_setzero_si512().as_i32x16(); transmute(simd_select_bitmask(k, r.as_i32x16(), zero)) @@ -22079,6 +22093,7 @@ pub unsafe fn _mm512_mask_shuffle_i64x2( a: __m512i, b: __m512i, ) -> __m512i { + static_assert_imm8!(MASK); let r = _mm512_shuffle_i64x2::(a, b); transmute(simd_select_bitmask(k, r.as_i64x8(), src.as_i64x8())) } @@ -22095,6 +22110,7 @@ pub unsafe fn _mm512_maskz_shuffle_i64x2( a: __m512i, b: __m512i, ) -> __m512i { + static_assert_imm8!(MASK); let r = _mm512_shuffle_i64x2::(a, b); let zero = _mm512_setzero_si512().as_i64x8(); transmute(simd_select_bitmask(k, r.as_i64x8(), zero)) @@ -22137,6 +22153,7 @@ pub unsafe fn _mm256_mask_shuffle_i64x2( a: __m256i, b: __m256i, ) -> __m256i { + static_assert_imm8!(MASK); let r = _mm256_shuffle_i64x2::(a, b); transmute(simd_select_bitmask(k, r.as_i64x4(), src.as_i64x4())) } @@ -22153,6 +22170,7 @@ pub unsafe fn _mm256_maskz_shuffle_i64x2( a: __m256i, b: __m256i, ) -> __m256i { + static_assert_imm8!(MASK); let r = _mm256_shuffle_i64x2::(a, b); let zero = _mm256_setzero_si256().as_i64x4(); transmute(simd_select_bitmask(k, r.as_i64x4(), zero)) @@ -22271,6 +22289,7 @@ pub unsafe fn _mm256_mask_shuffle_f32x4( a: __m256, b: __m256, ) -> __m256 { + static_assert_imm8!(MASK); let r = _mm256_shuffle_f32x4::(a, b); transmute(simd_select_bitmask(k, r.as_f32x8(), src.as_f32x8())) } @@ -22287,6 +22306,7 @@ pub unsafe fn _mm256_maskz_shuffle_f32x4( a: __m256, b: __m256, ) -> __m256 { + static_assert_imm8!(MASK); let r = _mm256_shuffle_f32x4::(a, b); let zero = _mm256_setzero_ps().as_f32x8(); transmute(simd_select_bitmask(k, r.as_f32x8(), zero)) @@ -22297,61 +22317,27 @@ pub unsafe fn _mm256_maskz_shuffle_f32x4( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_f64x2&expand=5171) #[inline] 
#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vshuff64x2, imm8 = 0b10111111))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_shuffle_f64x2(a: __m512d, b: __m512d, imm8: i32) -> __m512d { - assert!(imm8 >= 0 && imm8 <= 255); - let imm8 = (imm8 & 0xFF) as u8; - macro_rules! shuffle4 { - ( - $a:expr, - $b:expr, - $c:expr, - $d:expr, - $e:expr, - $f:expr, - $g:expr, - $h:expr - ) => { - simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h]) - }; - } - macro_rules! shuffle3 { - ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr) => { - match (imm8 >> 6) & 0x3 { - 0 => shuffle4!($a, $b, $c, $e, $f, $g, 8, 9), - 1 => shuffle4!($a, $b, $c, $e, $f, $g, 10, 11), - 2 => shuffle4!($a, $b, $c, $e, $f, $g, 12, 13), - _ => shuffle4!($a, $b, $c, $e, $f, $g, 14, 15), - } - }; - } - macro_rules! shuffle2 { - ($a:expr, $b:expr, $e:expr, $f:expr) => { - match (imm8 >> 4) & 0x3 { - 0 => shuffle3!($a, $b, $e, $f, 8, 9), - 1 => shuffle3!($a, $b, $e, $f, 10, 11), - 2 => shuffle3!($a, $b, $e, $f, 12, 13), - _ => shuffle3!($a, $b, $e, $f, 14, 15), - } - }; - } - macro_rules! 
shuffle1 { - ($a:expr, $e:expr) => { - match (imm8 >> 2) & 0x3 { - 0 => shuffle2!($a, $e, 0, 1), - 1 => shuffle2!($a, $e, 2, 3), - 2 => shuffle2!($a, $e, 4, 5), - _ => shuffle2!($a, $e, 6, 7), - } - }; - } - match imm8 & 0x3 { - 0 => shuffle1!(0, 1), - 1 => shuffle1!(2, 3), - 2 => shuffle1!(4, 5), - _ => shuffle1!(6, 7), - } +#[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b10_11_11_11))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_shuffle_f64x2(a: __m512d, b: __m512d) -> __m512d { + static_assert_imm8!(MASK); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r: f64x8 = simd_shuffle8( + a, + b, + [ + (MASK as u32 & 0b11) * 2 + 0, + (MASK as u32 & 0b11) * 2 + 1, + ((MASK as u32 >> 2) & 0b11) * 2 + 0, + ((MASK as u32 >> 2) & 0b11) * 2 + 1, + ((MASK as u32 >> 4) & 0b11) * 2 + 0 + 8, + ((MASK as u32 >> 4) & 0b11) * 2 + 1 + 8, + ((MASK as u32 >> 6) & 0b11) * 2 + 0 + 8, + ((MASK as u32 >> 6) & 0b11) * 2 + 1 + 8, + ], + ); + transmute(r) } /// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -22359,21 +22345,16 @@ pub unsafe fn _mm512_shuffle_f64x2(a: __m512d, b: __m512d, imm8: i32) -> __m512d /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_f64x2&expand=5169) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vshuff64x2, imm8 = 0b10111111))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm512_mask_shuffle_f64x2( +#[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b10_11_11_11))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_shuffle_f64x2( src: __m512d, k: __mmask8, a: __m512d, b: __m512d, - imm8: i32, ) -> __m512d { - macro_rules! 
call { - ($imm8:expr) => { - _mm512_shuffle_f64x2(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + static_assert_imm8!(MASK); + let r = _mm512_shuffle_f64x2::(a, b); transmute(simd_select_bitmask(k, r.as_f64x8(), src.as_f64x8())) } @@ -22382,20 +22363,15 @@ pub unsafe fn _mm512_mask_shuffle_f64x2( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_f64x2&expand=5170) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vshuff64x2, imm8 = 0b10111111))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_maskz_shuffle_f64x2( +#[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b10_11_11_11))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_maskz_shuffle_f64x2( k: __mmask8, a: __m512d, b: __m512d, - imm8: i32, ) -> __m512d { - macro_rules! call { - ($imm8:expr) => { - _mm512_shuffle_f64x2(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + static_assert_imm8!(MASK); + let r = _mm512_shuffle_f64x2::(a, b); let zero = _mm512_setzero_pd().as_f64x8(); transmute(simd_select_bitmask(k, r.as_f64x8(), zero)) } @@ -22405,35 +22381,22 @@ pub unsafe fn _mm512_maskz_shuffle_f64x2( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_f64x2&expand=5168) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vperm, imm8 = 0b01))] //should be vshuff64x2 -#[rustc_args_required_const(2)] -pub unsafe fn _mm256_shuffle_f64x2(a: __m256d, b: __m256d, imm8: i32) -> __m256d { - assert!(imm8 >= 0 && imm8 <= 255); - let imm8 = (imm8 & 0xFF) as u8; +#[cfg_attr(test, assert_instr(vperm, MASK = 0b01))] //should be vshuff64x2 +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm256_shuffle_f64x2(a: __m256d, b: __m256d) -> __m256d { + static_assert_imm8!(MASK); let a = a.as_f64x4(); let b = b.as_f64x4(); - macro_rules! 
shuffle2 { - ( - $a:expr, - $b:expr, - $c:expr, - $d:expr - ) => { - simd_shuffle4(a, b, [$a, $b, $c, $d]) - }; - } - macro_rules! shuffle1 { - ($a:expr, $b:expr) => { - match (imm8 >> 1) & 0x1 { - 0 => shuffle2!($a, $b, 4, 5), - _ => shuffle2!($a, $b, 6, 7), - } - }; - } - let r: f64x4 = match imm8 & 0x1 { - 0 => shuffle1!(0, 1), - _ => shuffle1!(2, 3), - }; + let r: f64x4 = simd_shuffle4( + a, + b, + [ + (MASK as u32 & 0b1) * 2 + 0, + (MASK as u32 & 0b1) * 2 + 1, + ((MASK as u32 >> 1) & 0b1) * 2 + 0 + 4, + ((MASK as u32 >> 1) & 0b1) * 2 + 1 + 4, + ], + ); transmute(r) } @@ -22442,21 +22405,16 @@ pub unsafe fn _mm256_shuffle_f64x2(a: __m256d, b: __m256d, imm8: i32) -> __m256d /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shuffle_f64x2&expand=5166) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vshuff64x2, imm8 = 0b11))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm256_mask_shuffle_f64x2( +#[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b11))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm256_mask_shuffle_f64x2( src: __m256d, k: __mmask8, a: __m256d, b: __m256d, - imm8: i32, ) -> __m256d { - macro_rules! 
call { - ($imm8:expr) => { - _mm256_shuffle_f64x2(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + static_assert_imm8!(MASK); + let r = _mm256_shuffle_f64x2::(a, b); transmute(simd_select_bitmask(k, r.as_f64x4(), src.as_f64x4())) } @@ -22465,20 +22423,15 @@ pub unsafe fn _mm256_mask_shuffle_f64x2( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shuffle_f64x2&expand=5167) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vshuff64x2, imm8 = 0b11))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm256_maskz_shuffle_f64x2( +#[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b11))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm256_maskz_shuffle_f64x2( k: __mmask8, a: __m256d, b: __m256d, - imm8: i32, ) -> __m256d { - macro_rules! call { - ($imm8:expr) => { - _mm256_shuffle_f64x2(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + static_assert_imm8!(MASK); + let r = _mm256_shuffle_f64x2::(a, b); let zero = _mm256_setzero_pd().as_f64x4(); transmute(simd_select_bitmask(k, r.as_f64x4(), zero)) } diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 6d816b86c3..9ad35f7166 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -9720,7 +9720,7 @@ mod tests { unsafe fn test_mm512_shuffle_f64x2() { let a = _mm512_setr_pd(1., 4., 5., 8., 9., 12., 13., 16.); let b = _mm512_setr_pd(2., 3., 6., 7., 10., 11., 14., 15.); - let r = _mm512_shuffle_f64x2(a, b, 0b00000000); + let r = _mm512_shuffle_f64x2::<0b00_00_00_00>(a, b); let e = _mm512_setr_pd(1., 4., 1., 4., 2., 3., 2., 3.); assert_eq_m512d(r, e); } @@ -9729,9 +9729,9 @@ mod tests { unsafe fn test_mm512_mask_shuffle_f64x2() { let a = _mm512_setr_pd(1., 4., 5., 8., 9., 12., 13., 16.); let b = _mm512_setr_pd(2., 3., 6., 7., 10., 11., 14., 15.); - let r = _mm512_mask_shuffle_f64x2(a, 0, a, b, 0b00000000); + 
let r = _mm512_mask_shuffle_f64x2::<0b00_00_00_00>(a, 0, a, b); assert_eq_m512d(r, a); - let r = _mm512_mask_shuffle_f64x2(a, 0b11111111, a, b, 0b00000000); + let r = _mm512_mask_shuffle_f64x2::<0b00_00_00_00>(a, 0b11111111, a, b); let e = _mm512_setr_pd(1., 4., 1., 4., 2., 3., 2., 3.); assert_eq_m512d(r, e); } @@ -9740,9 +9740,9 @@ mod tests { unsafe fn test_mm512_maskz_shuffle_f64x2() { let a = _mm512_setr_pd(1., 4., 5., 8., 9., 12., 13., 16.); let b = _mm512_setr_pd(2., 3., 6., 7., 10., 11., 14., 15.); - let r = _mm512_maskz_shuffle_f64x2(0, a, b, 0b00000000); + let r = _mm512_maskz_shuffle_f64x2::<0b00_00_00_00>(0, a, b); assert_eq_m512d(r, _mm512_setzero_pd()); - let r = _mm512_maskz_shuffle_f64x2(0b00001111, a, b, 0b00000000); + let r = _mm512_maskz_shuffle_f64x2::<0b00_00_00_00>(0b00001111, a, b); let e = _mm512_setr_pd(1., 4., 1., 4., 0., 0., 0., 0.); assert_eq_m512d(r, e); } @@ -9751,7 +9751,7 @@ mod tests { unsafe fn test_mm256_shuffle_f64x2() { let a = _mm256_set_pd(1., 4., 5., 8.); let b = _mm256_set_pd(2., 3., 6., 7.); - let r = _mm256_shuffle_f64x2(a, b, 0b00); + let r = _mm256_shuffle_f64x2::<0b00>(a, b); let e = _mm256_set_pd(6., 7., 5., 8.); assert_eq_m256d(r, e); } @@ -9760,9 +9760,9 @@ mod tests { unsafe fn test_mm256_mask_shuffle_f64x2() { let a = _mm256_set_pd(1., 4., 5., 8.); let b = _mm256_set_pd(2., 3., 6., 7.); - let r = _mm256_mask_shuffle_f64x2(a, 0, a, b, 0b00); + let r = _mm256_mask_shuffle_f64x2::<0b00>(a, 0, a, b); assert_eq_m256d(r, a); - let r = _mm256_mask_shuffle_f64x2(a, 0b00001111, a, b, 0b00); + let r = _mm256_mask_shuffle_f64x2::<0b00>(a, 0b00001111, a, b); let e = _mm256_set_pd(6., 7., 5., 8.); assert_eq_m256d(r, e); } @@ -9771,9 +9771,9 @@ mod tests { unsafe fn test_mm256_maskz_shuffle_f64x2() { let a = _mm256_set_pd(1., 4., 5., 8.); let b = _mm256_set_pd(2., 3., 6., 7.); - let r = _mm256_maskz_shuffle_f64x2(0, a, b, 0b00); + let r = _mm256_maskz_shuffle_f64x2::<0b00>(0, a, b); assert_eq_m256d(r, _mm256_setzero_pd()); - let 
r = _mm256_maskz_shuffle_f64x2(0b00001111, a, b, 0b00); + let r = _mm256_maskz_shuffle_f64x2::<0b00>(0b00001111, a, b); let e = _mm256_set_pd(6., 7., 5., 8.); assert_eq_m256d(r, e); } From bc0e28b24eece7c8571585a26d068242c054026c Mon Sep 17 00:00:00 2001 From: jironglin Date: Thu, 4 Mar 2021 11:33:56 +0000 Subject: [PATCH 011/123] move x86/constify_imm4 macro --- crates/core_arch/src/x86/macros.rs | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/crates/core_arch/src/x86/macros.rs b/crates/core_arch/src/x86/macros.rs index 1c02de24a7..bf734974af 100644 --- a/crates/core_arch/src/x86/macros.rs +++ b/crates/core_arch/src/x86/macros.rs @@ -48,31 +48,6 @@ macro_rules! static_assert_imm8u { }; } -#[allow(unused_macros)] -macro_rules! constify_imm4 { - ($imm8:expr, $expand:ident) => { - #[allow(overflowing_literals)] - match ($imm8) & 0b1111 { - 0 => $expand!(0), - 1 => $expand!(1), - 2 => $expand!(2), - 3 => $expand!(3), - 4 => $expand!(4), - 5 => $expand!(5), - 6 => $expand!(6), - 7 => $expand!(7), - 8 => $expand!(8), - 9 => $expand!(9), - 10 => $expand!(10), - 11 => $expand!(11), - 12 => $expand!(12), - 13 => $expand!(13), - 14 => $expand!(14), - _ => $expand!(15), - } - }; -} - macro_rules! 
constify_imm3 { ($imm8:expr, $expand:ident) => { #[allow(overflowing_literals)] From 3239cea7982663fea2606387e51952f07efe0c97 Mon Sep 17 00:00:00 2001 From: jironglin Date: Thu, 4 Mar 2021 14:21:46 +0000 Subject: [PATCH 012/123] mm_cvtt_roundss,sd_u64,i64,si64; mm_cvt_roundss,sd_u64,i64,si64; mm_cvt_roundu64,i64,si64_ss,sd --- crates/core_arch/src/x86_64/avx512f.rs | 270 +++++++++---------------- 1 file changed, 90 insertions(+), 180 deletions(-) diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 9ad35f7166..43906f7714 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -145,16 +145,11 @@ pub unsafe fn _mm_cvttss_u64(a: __m128) -> u64 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundi64_sd&expand=1313) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsi2sd, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_cvt_roundi64_sd(a: __m128d, b: i64, rounding: i32) -> __m128d { +#[cfg_attr(test, assert_instr(vcvtsi2sd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_cvt_roundi64_sd(a: __m128d, b: i64) -> __m128d { let a = a.as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtsi2sd64(a, b, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtsi2sd64(a, b, ROUNDING); transmute(r) } @@ -169,16 +164,11 @@ pub unsafe fn _mm_cvt_roundi64_sd(a: __m128d, b: i64, rounding: i32) -> __m128d /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsi64_sd&expand=1367) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsi2sd, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_cvt_roundsi64_sd(a: __m128d, b: i64, rounding: i32) -> __m128d { +#[cfg_attr(test, assert_instr(vcvtsi2sd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_cvt_roundsi64_sd(a: __m128d, b: i64) -> __m128d { let a = a.as_f64x2(); - macro_rules! call { - ($imm4:expr) => { - vcvtsi2sd64(a, b, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtsi2sd64(a, b, ROUNDING); transmute(r) } @@ -193,16 +183,11 @@ pub unsafe fn _mm_cvt_roundsi64_sd(a: __m128d, b: i64, rounding: i32) -> __m128d /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundi64_ss&expand=1314) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsi2ss, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_cvt_roundi64_ss(a: __m128, b: i64, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vcvtsi2ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_cvt_roundi64_ss(a: __m128, b: i64) -> __m128 { let a = a.as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtsi2ss64(a, b, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtsi2ss64(a, b, ROUNDING); transmute(r) } @@ -217,16 +202,11 @@ pub unsafe fn _mm_cvt_roundi64_ss(a: __m128, b: i64, rounding: i32) -> __m128 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundu64_sd&expand=1379) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtusi2sd, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_cvt_roundu64_sd(a: __m128d, b: u64, rounding: i32) -> __m128d { +#[cfg_attr(test, assert_instr(vcvtusi2sd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_cvt_roundu64_sd(a: __m128d, b: u64) -> __m128d { let a = a.as_f64x2(); - macro_rules! call { - ($imm4:expr) => { - vcvtusi2sd64(a, b, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtusi2sd64(a, b, ROUNDING); transmute(r) } @@ -241,16 +221,11 @@ pub unsafe fn _mm_cvt_roundu64_sd(a: __m128d, b: u64, rounding: i32) -> __m128d /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsi64_ss&expand=1368) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsi2ss, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_cvt_roundsi64_ss(a: __m128, b: i64, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vcvtsi2ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_cvt_roundsi64_ss(a: __m128, b: i64) -> __m128 { let a = a.as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtsi2ss64(a, b, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtsi2ss64(a, b, ROUNDING); transmute(r) } @@ -265,16 +240,11 @@ pub unsafe fn _mm_cvt_roundsi64_ss(a: __m128, b: i64, rounding: i32) -> __m128 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundu64_ss&expand=1380) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtusi2ss, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_cvt_roundu64_ss(a: __m128, b: u64, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vcvtusi2ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_cvt_roundu64_ss(a: __m128, b: u64) -> __m128 { let a = a.as_f32x4(); - macro_rules! call { - ($imm4:expr) => { - vcvtusi2ss64(a, b, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtusi2ss64(a, b, ROUNDING); transmute(r) } @@ -289,16 +259,11 @@ pub unsafe fn _mm_cvt_roundu64_ss(a: __m128, b: u64, rounding: i32) -> __m128 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsd_si64&expand=1360) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsd2si, rounding = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvt_roundsd_si64(a: __m128d, rounding: i32) -> i64 { +#[cfg_attr(test, assert_instr(vcvtsd2si, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvt_roundsd_si64(a: __m128d) -> i64 { let a = a.as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtsd2si64(a, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtsd2si64(a, ROUNDING); transmute(r) } @@ -313,16 +278,11 @@ pub unsafe fn _mm_cvt_roundsd_si64(a: __m128d, rounding: i32) -> i64 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsd_i64&expand=1358) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsd2si, rounding = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvt_roundsd_i64(a: __m128d, rounding: i32) -> i64 { +#[cfg_attr(test, assert_instr(vcvtsd2si, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvt_roundsd_i64(a: __m128d) -> i64 { let a = a.as_f64x2(); - macro_rules! call { - ($imm4:expr) => { - vcvtsd2si64(a, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtsd2si64(a, ROUNDING); transmute(r) } @@ -337,16 +297,11 @@ pub unsafe fn _mm_cvt_roundsd_i64(a: __m128d, rounding: i32) -> i64 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsd_u64&expand=1365) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsd2usi, rounding = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvt_roundsd_u64(a: __m128d, rounding: i32) -> u64 { +#[cfg_attr(test, assert_instr(vcvtsd2usi, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvt_roundsd_u64(a: __m128d) -> u64 { let a = a.as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtsd2usi64(a, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtsd2usi64(a, ROUNDING); transmute(r) } @@ -361,16 +316,11 @@ pub unsafe fn _mm_cvt_roundsd_u64(a: __m128d, rounding: i32) -> u64 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundss_si64&expand=1375) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtss2si, rounding = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvt_roundss_si64(a: __m128, rounding: i32) -> i64 { +#[cfg_attr(test, assert_instr(vcvtss2si, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvt_roundss_si64(a: __m128) -> i64 { let a = a.as_f32x4(); - macro_rules! call { - ($imm4:expr) => { - vcvtss2si64(a, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtss2si64(a, ROUNDING); transmute(r) } @@ -385,16 +335,11 @@ pub unsafe fn _mm_cvt_roundss_si64(a: __m128, rounding: i32) -> i64 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundss_i64&expand=1370) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtss2si, rounding = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvt_roundss_i64(a: __m128, rounding: i32) -> i64 { +#[cfg_attr(test, assert_instr(vcvtss2si, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvt_roundss_i64(a: __m128) -> i64 { let a = a.as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtss2si64(a, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtss2si64(a, ROUNDING); transmute(r) } @@ -409,16 +354,11 @@ pub unsafe fn _mm_cvt_roundss_i64(a: __m128, rounding: i32) -> i64 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundss_u64&expand=1377) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtss2usi, rounding = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvt_roundss_u64(a: __m128, rounding: i32) -> u64 { +#[cfg_attr(test, assert_instr(vcvtss2usi, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvt_roundss_u64(a: __m128) -> u64 { let a = a.as_f32x4(); - macro_rules! call { - ($imm4:expr) => { - vcvtss2usi64(a, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtss2usi64(a, ROUNDING); transmute(r) } @@ -428,16 +368,11 @@ pub unsafe fn _mm_cvt_roundss_u64(a: __m128, rounding: i32) -> u64 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsd_si64&expand=1931) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsd2si, sae = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvtt_roundsd_si64(a: __m128d, sae: i32) -> i64 { +#[cfg_attr(test, assert_instr(vcvtsd2si, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvtt_roundsd_si64(a: __m128d) -> i64 { let a = a.as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtsd2si64(a, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtsd2si64(a, SAE); transmute(r) } @@ -447,16 +382,11 @@ pub unsafe fn _mm_cvtt_roundsd_si64(a: __m128d, sae: i32) -> i64 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsd_i64&expand=1929) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsd2si, sae = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvtt_roundsd_i64(a: __m128d, sae: i32) -> i64 { +#[cfg_attr(test, assert_instr(vcvtsd2si, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvtt_roundsd_i64(a: __m128d) -> i64 { let a = a.as_f64x2(); - macro_rules! call { - ($imm4:expr) => { - vcvtsd2si64(a, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtsd2si64(a, SAE); transmute(r) } @@ -466,16 +396,11 @@ pub unsafe fn _mm_cvtt_roundsd_i64(a: __m128d, sae: i32) -> i64 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsd_u64&expand=1933) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsd2usi, sae = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvtt_roundsd_u64(a: __m128d, sae: i32) -> u64 { +#[cfg_attr(test, assert_instr(vcvtsd2usi, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvtt_roundsd_u64(a: __m128d) -> u64 { let a = a.as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtsd2usi64(a, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtsd2usi64(a, SAE); transmute(r) } @@ -485,16 +410,11 @@ pub unsafe fn _mm_cvtt_roundsd_u64(a: __m128d, sae: i32) -> u64 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundss_i64&expand=1935) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtss2si, sae = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvtt_roundss_i64(a: __m128, sae: i32) -> i64 { +#[cfg_attr(test, assert_instr(vcvtss2si, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvtt_roundss_i64(a: __m128) -> i64 { let a = a.as_f32x4(); - macro_rules! call { - ($imm4:expr) => { - vcvtss2si64(a, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtss2si64(a, SAE); transmute(r) } @@ -504,16 +424,11 @@ pub unsafe fn _mm_cvtt_roundss_i64(a: __m128, sae: i32) -> i64 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundss_si64&expand=1937) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtss2si, sae = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvtt_roundss_si64(a: __m128, sae: i32) -> i64 { +#[cfg_attr(test, assert_instr(vcvtss2si, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvtt_roundss_si64(a: __m128) -> i64 { let a = a.as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtss2si64(a, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtss2si64(a, SAE); transmute(r) } @@ -523,16 +438,11 @@ pub unsafe fn _mm_cvtt_roundss_si64(a: __m128, sae: i32) -> i64 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundss_u64&expand=1939) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtss2usi, sae = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvtt_roundss_u64(a: __m128, sae: i32) -> u64 { +#[cfg_attr(test, assert_instr(vcvtss2usi, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvtt_roundss_u64(a: __m128) -> u64 { let a = a.as_f32x4(); - macro_rules! call { - ($imm4:expr) => { - vcvtss2usi64(a, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtss2usi64(a, SAE); transmute(r) } @@ -12197,7 +12107,7 @@ mod tests { unsafe fn test_mm_cvt_roundi64_ss() { let a = _mm_set_ps(0., -0.5, 1., -1.5); let b: i64 = 9; - let r = _mm_cvt_roundi64_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_cvt_roundi64_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); let e = _mm_set_ps(0., -0.5, 1., 9.); assert_eq_m128(r, e); } @@ -12206,7 +12116,7 @@ mod tests { unsafe fn test_mm_cvt_roundsi64_ss() { let a = _mm_set_ps(0., -0.5, 1., -1.5); let b: i64 = 9; - let r = _mm_cvt_roundsi64_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_cvt_roundsi64_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); let e = _mm_set_ps(0., -0.5, 1., 9.); assert_eq_m128(r, e); } @@ -12232,7 +12142,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvt_roundsd_si64() { let a = _mm_set_pd(1., -1.5); - let r = _mm_cvt_roundsd_si64(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_cvt_roundsd_si64::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); let e: i64 = -1; assert_eq!(r, e); } @@ -12240,7 +12150,7 @@ mod tests { #[simd_test(enable = 
"avx512f")] unsafe fn test_mm_cvt_roundsd_i64() { let a = _mm_set_pd(1., -1.5); - let r = _mm_cvt_roundsd_i64(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_cvt_roundsd_i64::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); let e: i64 = -1; assert_eq!(r, e); } @@ -12248,7 +12158,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvt_roundsd_u64() { let a = _mm_set_pd(1., f64::MAX); - let r = _mm_cvt_roundsd_u64(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_cvt_roundsd_u64::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); let e: u64 = u64::MAX; assert_eq!(r, e); } @@ -12264,7 +12174,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvt_roundss_i64() { let a = _mm_set_ps(0., -0.5, 1., -1.5); - let r = _mm_cvt_roundss_i64(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_cvt_roundss_i64::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); let e: i64 = -1; assert_eq!(r, e); } @@ -12272,7 +12182,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvt_roundss_si64() { let a = _mm_set_ps(0., -0.5, 1., -1.5); - let r = _mm_cvt_roundss_si64(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_cvt_roundss_si64::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); let e: i64 = -1; assert_eq!(r, e); } @@ -12280,7 +12190,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvt_roundss_u64() { let a = _mm_set_ps(0., -0.5, 1., -1.5); - let r = _mm_cvt_roundss_u64(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_cvt_roundss_u64::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); let e: u64 = u64::MAX; assert_eq!(r, e); } @@ -12304,7 +12214,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvtt_roundsd_i64() { let a = _mm_set_pd(1., -1.5); - let r = _mm_cvtt_roundsd_i64(a, _MM_FROUND_CUR_DIRECTION); + let r = _mm_cvtt_roundsd_i64::<_MM_FROUND_CUR_DIRECTION>(a); let e: i64 = -2; assert_eq!(r, e); } @@ -12312,7 +12222,7 @@ mod tests { #[simd_test(enable = "avx512f")] 
unsafe fn test_mm_cvtt_roundsd_si64() { let a = _mm_set_pd(1., -1.5); - let r = _mm_cvtt_roundsd_si64(a, _MM_FROUND_CUR_DIRECTION); + let r = _mm_cvtt_roundsd_si64::<_MM_FROUND_CUR_DIRECTION>(a); let e: i64 = -2; assert_eq!(r, e); } @@ -12320,7 +12230,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvtt_roundsd_u64() { let a = _mm_set_pd(1., -1.5); - let r = _mm_cvtt_roundsd_u64(a, _MM_FROUND_CUR_DIRECTION); + let r = _mm_cvtt_roundsd_u64::<_MM_FROUND_CUR_DIRECTION>(a); let e: u64 = u64::MAX; assert_eq!(r, e); } @@ -12344,7 +12254,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvtt_roundss_i64() { let a = _mm_set_ps(0., -0.5, 1., -1.5); - let r = _mm_cvtt_roundss_i64(a, _MM_FROUND_CUR_DIRECTION); + let r = _mm_cvtt_roundss_i64::<_MM_FROUND_CUR_DIRECTION>(a); let e: i64 = -2; assert_eq!(r, e); } @@ -12352,7 +12262,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvtt_roundss_si64() { let a = _mm_set_ps(0., -0.5, 1., -1.5); - let r = _mm_cvtt_roundss_si64(a, _MM_FROUND_CUR_DIRECTION); + let r = _mm_cvtt_roundss_si64::<_MM_FROUND_CUR_DIRECTION>(a); let e: i64 = -2; assert_eq!(r, e); } @@ -12360,7 +12270,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvtt_roundss_u64() { let a = _mm_set_ps(0., -0.5, 1., -1.5); - let r = _mm_cvtt_roundss_u64(a, _MM_FROUND_CUR_DIRECTION); + let r = _mm_cvtt_roundss_u64::<_MM_FROUND_CUR_DIRECTION>(a); let e: u64 = u64::MAX; assert_eq!(r, e); } @@ -12395,7 +12305,7 @@ mod tests { unsafe fn test_mm_cvt_roundu64_ss() { let a = _mm_set_ps(0., -0.5, 1., -1.5); let b: u64 = 9; - let r = _mm_cvt_roundu64_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_cvt_roundu64_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); let e = _mm_set_ps(0., -0.5, 1., 9.); assert_eq_m128(r, e); } @@ -12404,7 +12314,7 @@ mod tests { unsafe fn test_mm_cvt_roundu64_sd() { let a = _mm_set_pd(1., -1.5); let b: u64 = 9; - let r = _mm_cvt_roundu64_sd(a, b, 
_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_cvt_roundu64_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); let e = _mm_set_pd(1., 9.); assert_eq_m128d(r, e); } @@ -12413,7 +12323,7 @@ mod tests { unsafe fn test_mm_cvt_roundi64_sd() { let a = _mm_set_pd(1., -1.5); let b: i64 = 9; - let r = _mm_cvt_roundi64_sd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_cvt_roundi64_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); let e = _mm_set_pd(1., 9.); assert_eq_m128d(r, e); } @@ -12422,7 +12332,7 @@ mod tests { unsafe fn test_mm_cvt_roundsi64_sd() { let a = _mm_set_pd(1., -1.5); let b: i64 = 9; - let r = _mm_cvt_roundsi64_sd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_cvt_roundsi64_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); let e = _mm_set_pd(1., 9.); assert_eq_m128d(r, e); } From 5a7e50ac6e7c91998eaf52e1bff7956e596e9d93 Mon Sep 17 00:00:00 2001 From: jironglin Date: Thu, 4 Mar 2021 14:41:58 +0000 Subject: [PATCH 013/123] add static_assert --- crates/core_arch/src/x86_64/avx512f.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 43906f7714..af62b2112c 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -148,6 +148,7 @@ pub unsafe fn _mm_cvttss_u64(a: __m128) -> u64 { #[cfg_attr(test, assert_instr(vcvtsi2sd, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] pub unsafe fn _mm_cvt_roundi64_sd(a: __m128d, b: i64) -> __m128d { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let r = vcvtsi2sd64(a, b, ROUNDING); transmute(r) @@ -167,6 +168,7 @@ pub unsafe fn _mm_cvt_roundi64_sd(a: __m128d, b: i64) -> __ #[cfg_attr(test, assert_instr(vcvtsi2sd, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] pub unsafe fn _mm_cvt_roundsi64_sd(a: __m128d, b: i64) -> __m128d { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let r = vcvtsi2sd64(a, b, 
ROUNDING); transmute(r) @@ -186,6 +188,7 @@ pub unsafe fn _mm_cvt_roundsi64_sd(a: __m128d, b: i64) -> _ #[cfg_attr(test, assert_instr(vcvtsi2ss, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] pub unsafe fn _mm_cvt_roundi64_ss(a: __m128, b: i64) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let r = vcvtsi2ss64(a, b, ROUNDING); transmute(r) @@ -205,6 +208,7 @@ pub unsafe fn _mm_cvt_roundi64_ss(a: __m128, b: i64) -> __m #[cfg_attr(test, assert_instr(vcvtusi2sd, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] pub unsafe fn _mm_cvt_roundu64_sd(a: __m128d, b: u64) -> __m128d { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let r = vcvtusi2sd64(a, b, ROUNDING); transmute(r) @@ -224,6 +228,7 @@ pub unsafe fn _mm_cvt_roundu64_sd(a: __m128d, b: u64) -> __ #[cfg_attr(test, assert_instr(vcvtsi2ss, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] pub unsafe fn _mm_cvt_roundsi64_ss(a: __m128, b: i64) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let r = vcvtsi2ss64(a, b, ROUNDING); transmute(r) @@ -243,6 +248,7 @@ pub unsafe fn _mm_cvt_roundsi64_ss(a: __m128, b: i64) -> __ #[cfg_attr(test, assert_instr(vcvtusi2ss, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] pub unsafe fn _mm_cvt_roundu64_ss(a: __m128, b: u64) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let r = vcvtusi2ss64(a, b, ROUNDING); transmute(r) @@ -262,6 +268,7 @@ pub unsafe fn _mm_cvt_roundu64_ss(a: __m128, b: u64) -> __m #[cfg_attr(test, assert_instr(vcvtsd2si, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] pub unsafe fn _mm_cvt_roundsd_si64(a: __m128d) -> i64 { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let r = vcvtsd2si64(a, ROUNDING); transmute(r) @@ -281,6 +288,7 @@ pub unsafe fn _mm_cvt_roundsd_si64(a: __m128d) -> i64 { #[cfg_attr(test, assert_instr(vcvtsd2si, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] pub unsafe fn _mm_cvt_roundsd_i64(a: __m128d) -> i64 { + 
static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let r = vcvtsd2si64(a, ROUNDING); transmute(r) @@ -300,6 +308,7 @@ pub unsafe fn _mm_cvt_roundsd_i64(a: __m128d) -> i64 { #[cfg_attr(test, assert_instr(vcvtsd2usi, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] pub unsafe fn _mm_cvt_roundsd_u64(a: __m128d) -> u64 { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let r = vcvtsd2usi64(a, ROUNDING); transmute(r) @@ -319,6 +328,7 @@ pub unsafe fn _mm_cvt_roundsd_u64(a: __m128d) -> u64 { #[cfg_attr(test, assert_instr(vcvtss2si, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] pub unsafe fn _mm_cvt_roundss_si64(a: __m128) -> i64 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let r = vcvtss2si64(a, ROUNDING); transmute(r) @@ -338,6 +348,7 @@ pub unsafe fn _mm_cvt_roundss_si64(a: __m128) -> i64 { #[cfg_attr(test, assert_instr(vcvtss2si, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] pub unsafe fn _mm_cvt_roundss_i64(a: __m128) -> i64 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let r = vcvtss2si64(a, ROUNDING); transmute(r) @@ -357,6 +368,7 @@ pub unsafe fn _mm_cvt_roundss_i64(a: __m128) -> i64 { #[cfg_attr(test, assert_instr(vcvtss2usi, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] pub unsafe fn _mm_cvt_roundss_u64(a: __m128) -> u64 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let r = vcvtss2usi64(a, ROUNDING); transmute(r) @@ -371,6 +383,7 @@ pub unsafe fn _mm_cvt_roundss_u64(a: __m128) -> u64 { #[cfg_attr(test, assert_instr(vcvtsd2si, SAE = 8))] #[rustc_legacy_const_generics(1)] pub unsafe fn _mm_cvtt_roundsd_si64(a: __m128d) -> i64 { + static_assert_sae!(SAE); let a = a.as_f64x2(); let r = vcvtsd2si64(a, SAE); transmute(r) @@ -385,6 +398,7 @@ pub unsafe fn _mm_cvtt_roundsd_si64(a: __m128d) -> i64 { #[cfg_attr(test, assert_instr(vcvtsd2si, SAE = 8))] #[rustc_legacy_const_generics(1)] pub unsafe fn _mm_cvtt_roundsd_i64(a: __m128d) -> i64 { + static_assert_sae!(SAE); let a = a.as_f64x2(); let r = 
vcvtsd2si64(a, SAE); transmute(r) @@ -399,6 +413,7 @@ pub unsafe fn _mm_cvtt_roundsd_i64(a: __m128d) -> i64 { #[cfg_attr(test, assert_instr(vcvtsd2usi, SAE = 8))] #[rustc_legacy_const_generics(1)] pub unsafe fn _mm_cvtt_roundsd_u64(a: __m128d) -> u64 { + static_assert_sae!(SAE); let a = a.as_f64x2(); let r = vcvtsd2usi64(a, SAE); transmute(r) @@ -413,6 +428,7 @@ pub unsafe fn _mm_cvtt_roundsd_u64(a: __m128d) -> u64 { #[cfg_attr(test, assert_instr(vcvtss2si, SAE = 8))] #[rustc_legacy_const_generics(1)] pub unsafe fn _mm_cvtt_roundss_i64(a: __m128) -> i64 { + static_assert_sae!(SAE); let a = a.as_f32x4(); let r = vcvtss2si64(a, SAE); transmute(r) @@ -427,6 +443,7 @@ pub unsafe fn _mm_cvtt_roundss_i64(a: __m128) -> i64 { #[cfg_attr(test, assert_instr(vcvtss2si, SAE = 8))] #[rustc_legacy_const_generics(1)] pub unsafe fn _mm_cvtt_roundss_si64(a: __m128) -> i64 { + static_assert_sae!(SAE); let a = a.as_f32x4(); let r = vcvtss2si64(a, SAE); transmute(r) @@ -441,6 +458,7 @@ pub unsafe fn _mm_cvtt_roundss_si64(a: __m128) -> i64 { #[cfg_attr(test, assert_instr(vcvtss2usi, SAE = 8))] #[rustc_legacy_const_generics(1)] pub unsafe fn _mm_cvtt_roundss_u64(a: __m128) -> u64 { + static_assert_sae!(SAE); let a = a.as_f32x4(); let r = vcvtss2usi64(a, SAE); transmute(r) From 18b98104c785e9fd56bec2e6203c733f42eddc28 Mon Sep 17 00:00:00 2001 From: jironglin Date: Thu, 4 Mar 2021 15:00:58 +0000 Subject: [PATCH 014/123] fix x86_64/macro --- crates/core_arch/src/x86_64/macros.rs | 47 ++++++++++++++------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/crates/core_arch/src/x86_64/macros.rs b/crates/core_arch/src/x86_64/macros.rs index e3682d40fe..cafa37dd6f 100644 --- a/crates/core_arch/src/x86_64/macros.rs +++ b/crates/core_arch/src/x86_64/macros.rs @@ -1,32 +1,33 @@ //! Utility macros. -// For round instructions, the only valid values for rounding are 4, 8, 9, 10 and 11. -// This macro enforces that. 
+// Helper struct used to trigger const eval errors when the const generic immediate value `imm` is +// not a round number. +pub(crate) struct ValidateConstRound; +impl ValidateConstRound { + pub(crate) const VALID: () = { + let _ = 1 / ((IMM == 4 || IMM == 8 || IMM == 9 || IMM == 10 || IMM == 11) as usize); + }; +} + #[allow(unused)] -macro_rules! constify_imm4_round { - ($imm8:expr, $expand:ident) => { - #[allow(overflowing_literals)] - match ($imm8) & 0b1111 { - 4 => $expand!(4), - 8 => $expand!(8), - 9 => $expand!(9), - 10 => $expand!(10), - 11 => $expand!(11), - _ => panic!("Invalid round value"), - } +macro_rules! static_assert_rounding { + ($imm:ident) => { + let _ = $crate::core_arch::x86_64::macros::ValidateConstRound::<$imm>::VALID; + }; +} + +// Helper struct used to trigger const eval errors when the const generic immediate value `imm` is +// not a sae number. +pub(crate) struct ValidateConstSae; +impl ValidateConstSae { + pub(crate) const VALID: () = { + let _ = 1 / ((IMM == 4 || IMM == 8) as usize); }; } -// For sae instructions, the only valid values for sae are 4 and 8. -// This macro enforces that. #[allow(unused)] -macro_rules! constify_imm4_sae { - ($imm8:expr, $expand:ident) => { - #[allow(overflowing_literals)] - match ($imm8) & 0b1111 { - 4 => $expand!(4), - 8 => $expand!(8), - _ => panic!("Invalid sae value"), - } +macro_rules! 
static_assert_sae { + ($imm:ident) => { + let _ = $crate::core_arch::x86_64::macros::ValidateConstSae::<$imm>::VALID; }; } From fabb653dde886527a729d784b06aa814758c94ac Mon Sep 17 00:00:00 2001 From: jironglin Date: Thu, 4 Mar 2021 15:31:00 +0000 Subject: [PATCH 015/123] remove x86/macro imm4_sae,imm4_rounding --- crates/core_arch/src/x86/macros.rs | 31 ------------------------------ 1 file changed, 31 deletions(-) diff --git a/crates/core_arch/src/x86/macros.rs b/crates/core_arch/src/x86/macros.rs index bf734974af..76b87b40f4 100644 --- a/crates/core_arch/src/x86/macros.rs +++ b/crates/core_arch/src/x86/macros.rs @@ -200,37 +200,6 @@ macro_rules! constify_imm8_gather { }; } -// For round instructions, the only valid values for rounding are 4, 8, 9, 10 and 11. -// This macro enforces that. -#[allow(unused)] -macro_rules! constify_imm4_round { - ($imm8:expr, $expand:ident) => { - #[allow(overflowing_literals)] - match ($imm8) & 0b1111 { - 4 => $expand!(4), - 8 => $expand!(8), - 9 => $expand!(9), - 10 => $expand!(10), - 11 => $expand!(11), - _ => panic!("Invalid round value"), - } - }; -} - -// For sae instructions, the only valid values for sae are 4 and 8. -// This macro enforces that. -#[allow(unused)] -macro_rules! constify_imm4_sae { - ($imm8:expr, $expand:ident) => { - #[allow(overflowing_literals)] - match ($imm8) & 0b1111 { - 4 => $expand!(4), - 8 => $expand!(8), - _ => panic!("Invalid sae value"), - } - }; -} - // Two mantissas parameters. // This macro enforces that. 
#[allow(unused)] From 0ce558fe0cc88e4eff30833dfee937fe80393905 Mon Sep 17 00:00:00 2001 From: jironglin Date: Thu, 4 Mar 2021 16:56:42 +0000 Subject: [PATCH 016/123] shldi,shrdi_epi64,epi32,epi16 --- crates/core_arch/src/x86/avx512vbmi2.rs | 865 +++++++++++++----------- 1 file changed, 468 insertions(+), 397 deletions(-) diff --git a/crates/core_arch/src/x86/avx512vbmi2.rs b/crates/core_arch/src/x86/avx512vbmi2.rs index 032bce9176..b7a385dd97 100644 --- a/crates/core_arch/src/x86/avx512vbmi2.rs +++ b/crates/core_arch/src/x86/avx512vbmi2.rs @@ -920,14 +920,15 @@ pub unsafe fn _mm_maskz_shrdv_epi16(k: __mmask8, a: __m128i, b: __m128i, c: __m1 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shldi_epi64&expand=5060) #[inline] #[target_feature(enable = "avx512vbmi2")] -#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_shldi_epi64(a: __m512i, b: __m512i, imm8: i32) -> __m512i { - assert!(imm8 >= 0 && imm8 <= 255); +#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_shldi_epi64(a: __m512i, b: __m512i) -> __m512i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i64; transmute(vpshldvq( a.as_i64x8(), b.as_i64x8(), - _mm512_set1_epi64(imm8 as i64).as_i64x8(), + _mm512_set1_epi64(imm8).as_i64x8(), )) } @@ -936,20 +937,20 @@ pub unsafe fn _mm512_shldi_epi64(a: __m512i, b: __m512i, imm8: i32) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shldi_epi64&expand=5058) #[inline] #[target_feature(enable = "avx512vbmi2")] -#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm512_mask_shldi_epi64( +#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_shldi_epi64( src: __m512i, k: __mmask8, a: __m512i, b: __m512i, - imm8: i32, ) -> __m512i { 
- assert!(imm8 >= 0 && imm8 <= 255); + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i64; let shf: i64x8 = vpshldvq( a.as_i64x8(), b.as_i64x8(), - _mm512_set1_epi64(imm8 as i64).as_i64x8(), + _mm512_set1_epi64(imm8).as_i64x8(), ); transmute(simd_select_bitmask(k, shf, src.as_i64x8())) } @@ -959,14 +960,19 @@ pub unsafe fn _mm512_mask_shldi_epi64( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shldi_epi64&expand=5059) #[inline] #[target_feature(enable = "avx512vbmi2")] -#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_maskz_shldi_epi64(k: __mmask8, a: __m512i, b: __m512i, imm8: i32) -> __m512i { - assert!(imm8 >= 0 && imm8 <= 255); +#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_maskz_shldi_epi64( + k: __mmask8, + a: __m512i, + b: __m512i, +) -> __m512i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i64; let shf: i64x8 = vpshldvq( a.as_i64x8(), b.as_i64x8(), - _mm512_set1_epi64(imm8 as i64).as_i64x8(), + _mm512_set1_epi64(imm8).as_i64x8(), ); let zero = _mm512_setzero_si512().as_i64x8(); transmute(simd_select_bitmask(k, shf, zero)) @@ -977,14 +983,15 @@ pub unsafe fn _mm512_maskz_shldi_epi64(k: __mmask8, a: __m512i, b: __m512i, imm8 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shldi_epi64&expand=5057) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm256_shldi_epi64(a: __m256i, b: __m256i, imm8: i32) -> __m256i { - assert!(imm8 >= 0 && imm8 <= 255); +#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm256_shldi_epi64(a: __m256i, b: __m256i) -> __m256i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i64; transmute(vpshldvq256( a.as_i64x4(), b.as_i64x4(), - 
_mm256_set1_epi64x(imm8 as i64).as_i64x4(), + _mm256_set1_epi64x(imm8).as_i64x4(), )) } @@ -993,20 +1000,20 @@ pub unsafe fn _mm256_shldi_epi64(a: __m256i, b: __m256i, imm8: i32) -> __m256i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shldi_epi64&expand=5055) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm256_mask_shldi_epi64( +#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm256_mask_shldi_epi64( src: __m256i, k: __mmask8, a: __m256i, b: __m256i, - imm8: i32, ) -> __m256i { - assert!(imm8 >= 0 && imm8 <= 255); + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i64; let shf: i64x4 = vpshldvq256( a.as_i64x4(), b.as_i64x4(), - _mm256_set1_epi64x(imm8 as i64).as_i64x4(), + _mm256_set1_epi64x(imm8).as_i64x4(), ); transmute(simd_select_bitmask(k, shf, src.as_i64x4())) } @@ -1016,14 +1023,19 @@ pub unsafe fn _mm256_mask_shldi_epi64( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shldi_epi64&expand=5056) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm256_maskz_shldi_epi64(k: __mmask8, a: __m256i, b: __m256i, imm8: i32) -> __m256i { - assert!(imm8 >= 0 && imm8 <= 255); +#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm256_maskz_shldi_epi64( + k: __mmask8, + a: __m256i, + b: __m256i, +) -> __m256i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i64; let shf: i64x4 = vpshldvq256( a.as_i64x4(), b.as_i64x4(), - _mm256_set1_epi64x(imm8 as i64).as_i64x4(), + _mm256_set1_epi64x(imm8).as_i64x4(), ); let zero = _mm256_setzero_si256().as_i64x4(); transmute(simd_select_bitmask(k, shf, zero)) @@ -1034,14 +1046,15 @@ pub 
unsafe fn _mm256_maskz_shldi_epi64(k: __mmask8, a: __m256i, b: __m256i, imm8 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shldi_epi64&expand=5054) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_shldi_epi64(a: __m128i, b: __m128i, imm8: i32) -> __m128i { - assert!(imm8 >= 0 && imm8 <= 255); +#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_shldi_epi64(a: __m128i, b: __m128i) -> __m128i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i64; transmute(vpshldvq128( a.as_i64x2(), b.as_i64x2(), - _mm_set1_epi64x(imm8 as i64).as_i64x2(), + _mm_set1_epi64x(imm8).as_i64x2(), )) } @@ -1050,21 +1063,17 @@ pub unsafe fn _mm_shldi_epi64(a: __m128i, b: __m128i, imm8: i32) -> __m128i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shldi_epi64&expand=5052) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_shldi_epi64( +#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_shldi_epi64( src: __m128i, k: __mmask8, a: __m128i, b: __m128i, - imm8: i32, ) -> __m128i { - assert!(imm8 >= 0 && imm8 <= 255); - let shf: i64x2 = vpshldvq128( - a.as_i64x2(), - b.as_i64x2(), - _mm_set1_epi64x(imm8 as i64).as_i64x2(), - ); + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i64; + let shf: i64x2 = vpshldvq128(a.as_i64x2(), b.as_i64x2(), _mm_set1_epi64x(imm8).as_i64x2()); transmute(simd_select_bitmask(k, shf, src.as_i64x2())) } @@ -1073,15 +1082,16 @@ pub unsafe fn _mm_mask_shldi_epi64( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shldi_epi64&expand=5053) #[inline] 
#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_shldi_epi64(k: __mmask8, a: __m128i, b: __m128i, imm8: i32) -> __m128i { - assert!(imm8 >= 0 && imm8 <= 255); - let shf: i64x2 = vpshldvq128( - a.as_i64x2(), - b.as_i64x2(), - _mm_set1_epi64x(imm8 as i64).as_i64x2(), - ); +#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_shldi_epi64( + k: __mmask8, + a: __m128i, + b: __m128i, +) -> __m128i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i64; + let shf: i64x2 = vpshldvq128(a.as_i64x2(), b.as_i64x2(), _mm_set1_epi64x(imm8).as_i64x2()); let zero = _mm_setzero_si128().as_i64x2(); transmute(simd_select_bitmask(k, shf, zero)) } @@ -1091,14 +1101,14 @@ pub unsafe fn _mm_maskz_shldi_epi64(k: __mmask8, a: __m128i, b: __m128i, imm8: i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shldi_epi32&expand=5051) #[inline] #[target_feature(enable = "avx512vbmi2")] -#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_shldi_epi32(a: __m512i, b: __m512i, imm8: i32) -> __m512i { - assert!(imm8 >= 0 && imm8 <= 255); +#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_shldi_epi32(a: __m512i, b: __m512i) -> __m512i { + static_assert_imm8!(IMM8); transmute(vpshldvd( a.as_i32x16(), b.as_i32x16(), - _mm512_set1_epi32(imm8).as_i32x16(), + _mm512_set1_epi32(IMM8).as_i32x16(), )) } @@ -1107,20 +1117,19 @@ pub unsafe fn _mm512_shldi_epi32(a: __m512i, b: __m512i, imm8: i32) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shldi_epi32&expand=5049) #[inline] #[target_feature(enable = "avx512vbmi2")] -#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] -#[rustc_args_required_const(4)] -pub unsafe 
fn _mm512_mask_shldi_epi32( +#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_shldi_epi32( src: __m512i, k: __mmask16, a: __m512i, b: __m512i, - imm8: i32, ) -> __m512i { - assert!(imm8 >= 0 && imm8 <= 255); + static_assert_imm8!(IMM8); let shf: i32x16 = vpshldvd( a.as_i32x16(), b.as_i32x16(), - _mm512_set1_epi32(imm8).as_i32x16(), + _mm512_set1_epi32(IMM8).as_i32x16(), ); transmute(simd_select_bitmask(k, shf, src.as_i32x16())) } @@ -1130,14 +1139,18 @@ pub unsafe fn _mm512_mask_shldi_epi32( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shldi_epi32&expand=5050) #[inline] #[target_feature(enable = "avx512vbmi2")] -#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_maskz_shldi_epi32(k: __mmask16, a: __m512i, b: __m512i, imm8: i32) -> __m512i { - assert!(imm8 >= 0 && imm8 <= 255); +#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_maskz_shldi_epi32( + k: __mmask16, + a: __m512i, + b: __m512i, +) -> __m512i { + static_assert_imm8!(IMM8); let shf: i32x16 = vpshldvd( a.as_i32x16(), b.as_i32x16(), - _mm512_set1_epi32(imm8).as_i32x16(), + _mm512_set1_epi32(IMM8).as_i32x16(), ); let zero = _mm512_setzero_si512().as_i32x16(); transmute(simd_select_bitmask(k, shf, zero)) @@ -1148,14 +1161,14 @@ pub unsafe fn _mm512_maskz_shldi_epi32(k: __mmask16, a: __m512i, b: __m512i, imm /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shldi_epi32&expand=5048) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm256_shldi_epi32(a: __m256i, b: __m256i, imm8: i32) -> __m256i { - assert!(imm8 >= 0 && imm8 <= 255); +#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] 
+#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm256_shldi_epi32(a: __m256i, b: __m256i) -> __m256i { + static_assert_imm8!(IMM8); transmute(vpshldvd256( a.as_i32x8(), b.as_i32x8(), - _mm256_set1_epi32(imm8).as_i32x8(), + _mm256_set1_epi32(IMM8).as_i32x8(), )) } @@ -1164,20 +1177,19 @@ pub unsafe fn _mm256_shldi_epi32(a: __m256i, b: __m256i, imm8: i32) -> __m256i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shldi_epi32&expand=5046) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm256_mask_shldi_epi32( +#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm256_mask_shldi_epi32( src: __m256i, k: __mmask8, a: __m256i, b: __m256i, - imm8: i32, ) -> __m256i { - assert!(imm8 >= 0 && imm8 <= 255); + static_assert_imm8!(IMM8); let shf: i32x8 = vpshldvd256( a.as_i32x8(), b.as_i32x8(), - _mm256_set1_epi32(imm8).as_i32x8(), + _mm256_set1_epi32(IMM8).as_i32x8(), ); transmute(simd_select_bitmask(k, shf, src.as_i32x8())) } @@ -1187,14 +1199,18 @@ pub unsafe fn _mm256_mask_shldi_epi32( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shldi_epi32&expand=5047) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm256_maskz_shldi_epi32(k: __mmask8, a: __m256i, b: __m256i, imm8: i32) -> __m256i { - assert!(imm8 >= 0 && imm8 <= 255); +#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm256_maskz_shldi_epi32( + k: __mmask8, + a: __m256i, + b: __m256i, +) -> __m256i { + static_assert_imm8!(IMM8); let shf: i32x8 = vpshldvd256( a.as_i32x8(), b.as_i32x8(), - _mm256_set1_epi32(imm8).as_i32x8(), + _mm256_set1_epi32(IMM8).as_i32x8(), ); let zero = 
_mm256_setzero_si256().as_i32x8(); transmute(simd_select_bitmask(k, shf, zero)) @@ -1205,14 +1221,14 @@ pub unsafe fn _mm256_maskz_shldi_epi32(k: __mmask8, a: __m256i, b: __m256i, imm8 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shldi_epi32&expand=5045) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_shldi_epi32(a: __m128i, b: __m128i, imm8: i32) -> __m128i { - assert!(imm8 >= 0 && imm8 <= 255); +#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_shldi_epi32(a: __m128i, b: __m128i) -> __m128i { + static_assert_imm8!(IMM8); transmute(vpshldvd128( a.as_i32x4(), b.as_i32x4(), - _mm_set1_epi32(imm8).as_i32x4(), + _mm_set1_epi32(IMM8).as_i32x4(), )) } @@ -1221,17 +1237,16 @@ pub unsafe fn _mm_shldi_epi32(a: __m128i, b: __m128i, imm8: i32) -> __m128i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shldi_epi32&expand=5043) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_shldi_epi32( +#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_shldi_epi32( src: __m128i, k: __mmask8, a: __m128i, b: __m128i, - imm8: i32, ) -> __m128i { - assert!(imm8 >= 0 && imm8 <= 255); - let shf: i32x4 = vpshldvd128(a.as_i32x4(), b.as_i32x4(), _mm_set1_epi32(imm8).as_i32x4()); + static_assert_imm8!(IMM8); + let shf: i32x4 = vpshldvd128(a.as_i32x4(), b.as_i32x4(), _mm_set1_epi32(IMM8).as_i32x4()); transmute(simd_select_bitmask(k, shf, src.as_i32x4())) } @@ -1240,11 +1255,15 @@ pub unsafe fn _mm_mask_shldi_epi32( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shldi_epi32&expand=5044) 
#[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_shldi_epi32(k: __mmask8, a: __m128i, b: __m128i, imm8: i32) -> __m128i { - assert!(imm8 >= 0 && imm8 <= 255); - let shf: i32x4 = vpshldvd128(a.as_i32x4(), b.as_i32x4(), _mm_set1_epi32(imm8).as_i32x4()); +#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_shldi_epi32( + k: __mmask8, + a: __m128i, + b: __m128i, +) -> __m128i { + static_assert_imm8!(IMM8); + let shf: i32x4 = vpshldvd128(a.as_i32x4(), b.as_i32x4(), _mm_set1_epi32(IMM8).as_i32x4()); let zero = _mm_setzero_si128().as_i32x4(); transmute(simd_select_bitmask(k, shf, zero)) } @@ -1254,14 +1273,15 @@ pub unsafe fn _mm_maskz_shldi_epi32(k: __mmask8, a: __m128i, b: __m128i, imm8: i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shldi_epi16&expand=5042) #[inline] #[target_feature(enable = "avx512vbmi2")] -#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_shldi_epi16(a: __m512i, b: __m512i, imm8: i32) -> __m512i { - assert!(imm8 >= 0 && imm8 <= 255); +#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_shldi_epi16(a: __m512i, b: __m512i) -> __m512i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i16; transmute(vpshldvw( a.as_i16x32(), b.as_i16x32(), - _mm512_set1_epi16(imm8 as i16).as_i16x32(), + _mm512_set1_epi16(imm8).as_i16x32(), )) } @@ -1270,20 +1290,20 @@ pub unsafe fn _mm512_shldi_epi16(a: __m512i, b: __m512i, imm8: i32) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shldi_epi16&expand=5040) #[inline] #[target_feature(enable = "avx512vbmi2")] -#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] -#[rustc_args_required_const(4)] -pub unsafe fn 
_mm512_mask_shldi_epi16( +#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_shldi_epi16( src: __m512i, k: __mmask32, a: __m512i, b: __m512i, - imm8: i32, ) -> __m512i { - assert!(imm8 >= 0 && imm8 <= 255); + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i16; let shf: i16x32 = vpshldvw( a.as_i16x32(), b.as_i16x32(), - _mm512_set1_epi16(imm8 as i16).as_i16x32(), + _mm512_set1_epi16(imm8).as_i16x32(), ); transmute(simd_select_bitmask(k, shf, src.as_i16x32())) } @@ -1293,14 +1313,19 @@ pub unsafe fn _mm512_mask_shldi_epi16( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shldi_epi16&expand=5041) #[inline] #[target_feature(enable = "avx512vbmi2")] -#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_maskz_shldi_epi16(k: __mmask32, a: __m512i, b: __m512i, imm8: i32) -> __m512i { - assert!(imm8 >= 0 && imm8 <= 255); +#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_maskz_shldi_epi16( + k: __mmask32, + a: __m512i, + b: __m512i, +) -> __m512i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i16; let shf: i16x32 = vpshldvw( a.as_i16x32(), b.as_i16x32(), - _mm512_set1_epi16(imm8 as i16).as_i16x32(), + _mm512_set1_epi16(imm8).as_i16x32(), ); let zero = _mm512_setzero_si512().as_i16x32(); transmute(simd_select_bitmask(k, shf, zero)) @@ -1311,14 +1336,15 @@ pub unsafe fn _mm512_maskz_shldi_epi16(k: __mmask32, a: __m512i, b: __m512i, imm /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shldi_epi16&expand=5039) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm256_shldi_epi16(a: __m256i, b: __m256i, imm8: i32) -> __m256i { - assert!(imm8 >= 0 && imm8 <= 255); +#[cfg_attr(test, 
assert_instr(vpshldw, IMM8 = 5))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm256_shldi_epi16(a: __m256i, b: __m256i) -> __m256i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i16; transmute(vpshldvw256( a.as_i16x16(), b.as_i16x16(), - _mm256_set1_epi16(imm8 as i16).as_i16x16(), + _mm256_set1_epi16(imm8).as_i16x16(), )) } @@ -1327,20 +1353,20 @@ pub unsafe fn _mm256_shldi_epi16(a: __m256i, b: __m256i, imm8: i32) -> __m256i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shldi_epi16&expand=5037) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm256_mask_shldi_epi16( +#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm256_mask_shldi_epi16( src: __m256i, k: __mmask16, a: __m256i, b: __m256i, - imm8: i32, ) -> __m256i { - assert!(imm8 >= 0 && imm8 <= 255); + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i16; let shf: i16x16 = vpshldvw256( a.as_i16x16(), b.as_i16x16(), - _mm256_set1_epi16(imm8 as i16).as_i16x16(), + _mm256_set1_epi16(imm8).as_i16x16(), ); transmute(simd_select_bitmask(k, shf, src.as_i16x16())) } @@ -1350,13 +1376,19 @@ pub unsafe fn _mm256_mask_shldi_epi16( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shldi_epi16&expand=5038) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm256_maskz_shldi_epi16(k: __mmask16, a: __m256i, b: __m256i, imm8: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm256_maskz_shldi_epi16( + k: __mmask16, + a: __m256i, + b: __m256i, +) -> __m256i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i16; let shf: i16x16 = vpshldvw256( a.as_i16x16(), 
b.as_i16x16(), - _mm256_set1_epi16(imm8 as i16).as_i16x16(), + _mm256_set1_epi16(imm8).as_i16x16(), ); let zero = _mm256_setzero_si256().as_i16x16(); transmute(simd_select_bitmask(k, shf, zero)) @@ -1367,13 +1399,15 @@ pub unsafe fn _mm256_maskz_shldi_epi16(k: __mmask16, a: __m256i, b: __m256i, imm /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shldi_epi16&expand=5036) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_shldi_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i { +#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_shldi_epi16(a: __m128i, b: __m128i) -> __m128i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i16; transmute(vpshldvw128( a.as_i16x8(), b.as_i16x8(), - _mm_set1_epi16(imm8 as i16).as_i16x8(), + _mm_set1_epi16(imm8).as_i16x8(), )) } @@ -1382,20 +1416,17 @@ pub unsafe fn _mm_shldi_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shldi_epi16&expand=5034) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_shldi_epi16( +#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_shldi_epi16( src: __m128i, k: __mmask8, a: __m128i, b: __m128i, - imm8: i32, ) -> __m128i { - let shf: i16x8 = vpshldvw128( - a.as_i16x8(), - b.as_i16x8(), - _mm_set1_epi16(imm8 as i16).as_i16x8(), - ); + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i16; + let shf: i16x8 = vpshldvw128(a.as_i16x8(), b.as_i16x8(), _mm_set1_epi16(imm8).as_i16x8()); transmute(simd_select_bitmask(k, shf, src.as_i16x8())) } @@ -1404,14 +1435,16 @@ pub unsafe fn _mm_mask_shldi_epi16( /// [Intel's 
documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shldi_epi16&expand=5035) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_shldi_epi16(k: __mmask8, a: __m128i, b: __m128i, imm8: i32) -> __m128i { - let shf: i16x8 = vpshldvw128( - a.as_i16x8(), - b.as_i16x8(), - _mm_set1_epi16(imm8 as i16).as_i16x8(), - ); +#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_shldi_epi16( + k: __mmask8, + a: __m128i, + b: __m128i, +) -> __m128i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i16; + let shf: i16x8 = vpshldvw128(a.as_i16x8(), b.as_i16x8(), _mm_set1_epi16(imm8).as_i16x8()); let zero = _mm_setzero_si128().as_i16x8(); transmute(simd_select_bitmask(k, shf, zero)) } @@ -1421,14 +1454,15 @@ pub unsafe fn _mm_maskz_shldi_epi16(k: __mmask8, a: __m128i, b: __m128i, imm8: i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shrdi_epi64&expand=5114) #[inline] #[target_feature(enable = "avx512vbmi2")] -#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] //should be vpshrdq -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_shrdi_epi64(a: __m512i, b: __m512i, imm8: i32) -> __m512i { - assert!(imm8 >= 0 && imm8 <= 255); +#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_shrdi_epi64(a: __m512i, b: __m512i) -> __m512i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i64; transmute(vpshrdvq( a.as_i64x8(), b.as_i64x8(), - _mm512_set1_epi64(imm8 as i64).as_i64x8(), + _mm512_set1_epi64(imm8).as_i64x8(), )) } @@ -1437,20 +1471,20 @@ pub unsafe fn _mm512_shrdi_epi64(a: __m512i, b: __m512i, imm8: i32) -> __m512i { /// [Intel's 
documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shrdi_epi64&expand=5112) #[inline] #[target_feature(enable = "avx512vbmi2")] -#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] //should be vpshrdq -#[rustc_args_required_const(4)] -pub unsafe fn _mm512_mask_shrdi_epi64( +#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_shrdi_epi64( src: __m512i, k: __mmask8, a: __m512i, b: __m512i, - imm8: i32, ) -> __m512i { - assert!(imm8 >= 0 && imm8 <= 255); + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i64; let shf: i64x8 = vpshrdvq( a.as_i64x8(), b.as_i64x8(), - _mm512_set1_epi64(imm8 as i64).as_i64x8(), + _mm512_set1_epi64(imm8).as_i64x8(), ); transmute(simd_select_bitmask(k, shf, src.as_i64x8())) } @@ -1460,14 +1494,19 @@ pub unsafe fn _mm512_mask_shrdi_epi64( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shrdi_epi64&expand=5113) #[inline] #[target_feature(enable = "avx512vbmi2")] -#[cfg_attr(test, assert_instr(vpshldq, imm8 = 255))] //should be vpshrdq -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_maskz_shrdi_epi64(k: __mmask8, a: __m512i, b: __m512i, imm8: i32) -> __m512i { - assert!(imm8 >= 0 && imm8 <= 255); +#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 255))] //should be vpshrdq +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_maskz_shrdi_epi64( + k: __mmask8, + a: __m512i, + b: __m512i, +) -> __m512i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i64; let shf: i64x8 = vpshrdvq( a.as_i64x8(), b.as_i64x8(), - _mm512_set1_epi64(imm8 as i64).as_i64x8(), + _mm512_set1_epi64(imm8).as_i64x8(), ); let zero = _mm512_setzero_si512().as_i64x8(); transmute(simd_select_bitmask(k, shf, zero)) @@ -1478,14 +1517,15 @@ pub unsafe fn _mm512_maskz_shrdi_epi64(k: __mmask8, a: __m512i, b: __m512i, imm8 /// [Intel's 
documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shrdi_epi64&expand=5111) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] //should be vpshrdq -#[rustc_args_required_const(2)] -pub unsafe fn _mm256_shrdi_epi64(a: __m256i, b: __m256i, imm8: i32) -> __m256i { - assert!(imm8 >= 0 && imm8 <= 255); +#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm256_shrdi_epi64(a: __m256i, b: __m256i) -> __m256i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i64; transmute(vpshrdvq256( a.as_i64x4(), b.as_i64x4(), - _mm256_set1_epi64x(imm8 as i64).as_i64x4(), + _mm256_set1_epi64x(imm8).as_i64x4(), )) } @@ -1494,20 +1534,20 @@ pub unsafe fn _mm256_shrdi_epi64(a: __m256i, b: __m256i, imm8: i32) -> __m256i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shrdi_epi64&expand=5109) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] //should be vpshrdq -#[rustc_args_required_const(4)] -pub unsafe fn _mm256_mask_shrdi_epi64( +#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm256_mask_shrdi_epi64( src: __m256i, k: __mmask8, a: __m256i, b: __m256i, - imm8: i32, ) -> __m256i { - assert!(imm8 >= 0 && imm8 <= 255); + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i64; let shf: i64x4 = vpshrdvq256( a.as_i64x4(), b.as_i64x4(), - _mm256_set1_epi64x(imm8 as i64).as_i64x4(), + _mm256_set1_epi64x(imm8).as_i64x4(), ); transmute(simd_select_bitmask(k, shf, src.as_i64x4())) } @@ -1517,14 +1557,19 @@ pub unsafe fn _mm256_mask_shrdi_epi64( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shrdi_epi64&expand=5110) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] 
-#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] //should be vpshrdq -#[rustc_args_required_const(3)] -pub unsafe fn _mm256_maskz_shrdi_epi64(k: __mmask8, a: __m256i, b: __m256i, imm8: i32) -> __m256i { - assert!(imm8 >= 0 && imm8 <= 255); +#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm256_maskz_shrdi_epi64( + k: __mmask8, + a: __m256i, + b: __m256i, +) -> __m256i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i64; let shf: i64x4 = vpshrdvq256( a.as_i64x4(), b.as_i64x4(), - _mm256_set1_epi64x(imm8 as i64).as_i64x4(), + _mm256_set1_epi64x(imm8).as_i64x4(), ); let zero = _mm256_setzero_si256().as_i64x4(); transmute(simd_select_bitmask(k, shf, zero)) @@ -1535,14 +1580,15 @@ pub unsafe fn _mm256_maskz_shrdi_epi64(k: __mmask8, a: __m256i, b: __m256i, imm8 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shrdi_epi64&expand=5108) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] //should be vpshrdq -#[rustc_args_required_const(2)] -pub unsafe fn _mm_shrdi_epi64(a: __m128i, b: __m128i, imm8: i32) -> __m128i { - assert!(imm8 >= 0 && imm8 <= 255); +#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_shrdi_epi64(a: __m128i, b: __m128i) -> __m128i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i64; transmute(vpshrdvq128( a.as_i64x2(), b.as_i64x2(), - _mm_set1_epi64x(imm8 as i64).as_i64x2(), + _mm_set1_epi64x(imm8).as_i64x2(), )) } @@ -1551,21 +1597,17 @@ pub unsafe fn _mm_shrdi_epi64(a: __m128i, b: __m128i, imm8: i32) -> __m128i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shrdi_epi64&expand=5106) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] //should be vpshrdq 
-#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_shrdi_epi64( +#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_shrdi_epi64( src: __m128i, k: __mmask8, a: __m128i, b: __m128i, - imm8: i32, ) -> __m128i { - assert!(imm8 >= 0 && imm8 <= 255); - let shf: i64x2 = vpshrdvq128( - a.as_i64x2(), - b.as_i64x2(), - _mm_set1_epi64x(imm8 as i64).as_i64x2(), - ); + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i64; + let shf: i64x2 = vpshrdvq128(a.as_i64x2(), b.as_i64x2(), _mm_set1_epi64x(imm8).as_i64x2()); transmute(simd_select_bitmask(k, shf, src.as_i64x2())) } @@ -1574,15 +1616,16 @@ pub unsafe fn _mm_mask_shrdi_epi64( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shrdi_epi64&expand=5107) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] //should be vpshrdq -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_shrdi_epi64(k: __mmask8, a: __m128i, b: __m128i, imm8: i32) -> __m128i { - assert!(imm8 >= 0 && imm8 <= 255); - let shf: i64x2 = vpshrdvq128( - a.as_i64x2(), - b.as_i64x2(), - _mm_set1_epi64x(imm8 as i64).as_i64x2(), - ); +#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_shrdi_epi64( + k: __mmask8, + a: __m128i, + b: __m128i, +) -> __m128i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i64; + let shf: i64x2 = vpshrdvq128(a.as_i64x2(), b.as_i64x2(), _mm_set1_epi64x(imm8).as_i64x2()); let zero = _mm_setzero_si128().as_i64x2(); transmute(simd_select_bitmask(k, shf, zero)) } @@ -1592,14 +1635,14 @@ pub unsafe fn _mm_maskz_shrdi_epi64(k: __mmask8, a: __m128i, b: __m128i, imm8: i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shrdi_epi32&expand=5105) #[inline] #[target_feature(enable = "avx512vbmi2")] -#[cfg_attr(test, 
assert_instr(vpshldd, imm8 = 5))] //should be vpshldd -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_shrdi_epi32(a: __m512i, b: __m512i, imm8: i32) -> __m512i { - assert!(imm8 >= 0 && imm8 <= 255); +#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshldd +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_shrdi_epi32(a: __m512i, b: __m512i) -> __m512i { + static_assert_imm8!(IMM8); transmute(vpshrdvd( a.as_i32x16(), b.as_i32x16(), - _mm512_set1_epi32(imm8).as_i32x16(), + _mm512_set1_epi32(IMM8).as_i32x16(), )) } @@ -1608,20 +1651,19 @@ pub unsafe fn _mm512_shrdi_epi32(a: __m512i, b: __m512i, imm8: i32) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shrdi_epi32&expand=5103) #[inline] #[target_feature(enable = "avx512vbmi2")] -#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] //should be vpshldd -#[rustc_args_required_const(4)] -pub unsafe fn _mm512_mask_shrdi_epi32( +#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshldd +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_shrdi_epi32( src: __m512i, k: __mmask16, a: __m512i, b: __m512i, - imm8: i32, ) -> __m512i { - assert!(imm8 >= 0 && imm8 <= 255); + static_assert_imm8!(IMM8); let shf: i32x16 = vpshrdvd( a.as_i32x16(), b.as_i32x16(), - _mm512_set1_epi32(imm8).as_i32x16(), + _mm512_set1_epi32(IMM8).as_i32x16(), ); transmute(simd_select_bitmask(k, shf, src.as_i32x16())) } @@ -1631,14 +1673,18 @@ pub unsafe fn _mm512_mask_shrdi_epi32( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shrdi_epi32&expand=5104) #[inline] #[target_feature(enable = "avx512vbmi2")] -#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] //should be vpshldd -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_maskz_shrdi_epi32(k: __mmask16, a: __m512i, b: __m512i, imm8: i32) -> __m512i { - assert!(imm8 >= 0 && imm8 <= 255); +#[cfg_attr(test, 
assert_instr(vpshldd, IMM8 = 5))] //should be vpshldd +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_maskz_shrdi_epi32( + k: __mmask16, + a: __m512i, + b: __m512i, +) -> __m512i { + static_assert_imm8!(IMM8); let shf: i32x16 = vpshrdvd( a.as_i32x16(), b.as_i32x16(), - _mm512_set1_epi32(imm8).as_i32x16(), + _mm512_set1_epi32(IMM8).as_i32x16(), ); let zero = _mm512_setzero_si512().as_i32x16(); transmute(simd_select_bitmask(k, shf, zero)) @@ -1649,14 +1695,14 @@ pub unsafe fn _mm512_maskz_shrdi_epi32(k: __mmask16, a: __m512i, b: __m512i, imm /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shrdi_epi32&expand=5102) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] //should be vpshldd -#[rustc_args_required_const(2)] -pub unsafe fn _mm256_shrdi_epi32(a: __m256i, b: __m256i, imm8: i32) -> __m256i { - assert!(imm8 >= 0 && imm8 <= 255); +#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshldd +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm256_shrdi_epi32(a: __m256i, b: __m256i) -> __m256i { + static_assert_imm8!(IMM8); transmute(vpshrdvd256( a.as_i32x8(), b.as_i32x8(), - _mm256_set1_epi32(imm8).as_i32x8(), + _mm256_set1_epi32(IMM8).as_i32x8(), )) } @@ -1665,20 +1711,19 @@ pub unsafe fn _mm256_shrdi_epi32(a: __m256i, b: __m256i, imm8: i32) -> __m256i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shrdi_epi32&expand=5100) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] //should be vpshldd -#[rustc_args_required_const(4)] -pub unsafe fn _mm256_mask_shrdi_epi32( +#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshldd +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm256_mask_shrdi_epi32( src: __m256i, k: __mmask8, a: __m256i, b: __m256i, - imm8: i32, ) -> __m256i { - assert!(imm8 >= 0 && 
imm8 <= 255); + static_assert_imm8!(IMM8); let shf: i32x8 = vpshrdvd256( a.as_i32x8(), b.as_i32x8(), - _mm256_set1_epi32(imm8).as_i32x8(), + _mm256_set1_epi32(IMM8).as_i32x8(), ); transmute(simd_select_bitmask(k, shf, src.as_i32x8())) } @@ -1688,14 +1733,18 @@ pub unsafe fn _mm256_mask_shrdi_epi32( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shrdi_epi32&expand=5101) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] //should be vpshldd -#[rustc_args_required_const(3)] -pub unsafe fn _mm256_maskz_shrdi_epi32(k: __mmask8, a: __m256i, b: __m256i, imm8: i32) -> __m256i { - assert!(imm8 >= 0 && imm8 <= 255); +#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshldd +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm256_maskz_shrdi_epi32( + k: __mmask8, + a: __m256i, + b: __m256i, +) -> __m256i { + static_assert_imm8!(IMM8); let shf: i32x8 = vpshrdvd256( a.as_i32x8(), b.as_i32x8(), - _mm256_set1_epi32(imm8).as_i32x8(), + _mm256_set1_epi32(IMM8).as_i32x8(), ); let zero = _mm256_setzero_si256().as_i32x8(); transmute(simd_select_bitmask(k, shf, zero)) @@ -1706,14 +1755,14 @@ pub unsafe fn _mm256_maskz_shrdi_epi32(k: __mmask8, a: __m256i, b: __m256i, imm8 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shrdi_epi32&expand=5099) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] //should be vpshldd -#[rustc_args_required_const(2)] -pub unsafe fn _mm_shrdi_epi32(a: __m128i, b: __m128i, imm8: i32) -> __m128i { - assert!(imm8 >= 0 && imm8 <= 255); +#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshldd +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_shrdi_epi32(a: __m128i, b: __m128i) -> __m128i { + static_assert_imm8!(IMM8); transmute(vpshrdvd128( a.as_i32x4(), b.as_i32x4(), - 
_mm_set1_epi32(imm8).as_i32x4(), + _mm_set1_epi32(IMM8).as_i32x4(), )) } @@ -1722,17 +1771,16 @@ pub unsafe fn _mm_shrdi_epi32(a: __m128i, b: __m128i, imm8: i32) -> __m128i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shrdi_epi32&expand=5097) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] //should be vpshldd -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_shrdi_epi32( +#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshldd +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_shrdi_epi32( src: __m128i, k: __mmask8, a: __m128i, b: __m128i, - imm8: i32, ) -> __m128i { - assert!(imm8 >= 0 && imm8 <= 255); - let shf: i32x4 = vpshrdvd128(a.as_i32x4(), b.as_i32x4(), _mm_set1_epi32(imm8).as_i32x4()); + static_assert_imm8!(IMM8); + let shf: i32x4 = vpshrdvd128(a.as_i32x4(), b.as_i32x4(), _mm_set1_epi32(IMM8).as_i32x4()); transmute(simd_select_bitmask(k, shf, src.as_i32x4())) } @@ -1741,11 +1789,15 @@ pub unsafe fn _mm_mask_shrdi_epi32( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shrdi_epi32&expand=5098) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] //should be vpshldd -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_shrdi_epi32(k: __mmask8, a: __m128i, b: __m128i, imm8: i32) -> __m128i { - assert!(imm8 >= 0 && imm8 <= 255); - let shf: i32x4 = vpshrdvd128(a.as_i32x4(), b.as_i32x4(), _mm_set1_epi32(imm8).as_i32x4()); +#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshldd +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_shrdi_epi32( + k: __mmask8, + a: __m128i, + b: __m128i, +) -> __m128i { + static_assert_imm8!(IMM8); + let shf: i32x4 = vpshrdvd128(a.as_i32x4(), b.as_i32x4(), _mm_set1_epi32(IMM8).as_i32x4()); let zero = _mm_setzero_si128().as_i32x4(); 
transmute(simd_select_bitmask(k, shf, zero)) } @@ -1755,14 +1807,16 @@ pub unsafe fn _mm_maskz_shrdi_epi32(k: __mmask8, a: __m128i, b: __m128i, imm8: i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shrdi_epi16&expand=5096) #[inline] #[target_feature(enable = "avx512vbmi2")] -#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] //should be vpshrdw -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_shrdi_epi16(a: __m512i, b: __m512i, imm8: i32) -> __m512i { +#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_shrdi_epi16(a: __m512i, b: __m512i) -> __m512i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i16; assert!(imm8 >= 0 && imm8 <= 255); transmute(vpshrdvw( a.as_i16x32(), b.as_i16x32(), - _mm512_set1_epi16(imm8 as i16).as_i16x32(), + _mm512_set1_epi16(imm8).as_i16x32(), )) } @@ -1771,20 +1825,21 @@ pub unsafe fn _mm512_shrdi_epi16(a: __m512i, b: __m512i, imm8: i32) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shrdi_epi16&expand=5094) #[inline] #[target_feature(enable = "avx512vbmi2")] -#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] //should be vpshrdw -#[rustc_args_required_const(4)] -pub unsafe fn _mm512_mask_shrdi_epi16( +#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_shrdi_epi16( src: __m512i, k: __mmask32, a: __m512i, b: __m512i, - imm8: i32, ) -> __m512i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i16; assert!(imm8 >= 0 && imm8 <= 255); let shf: i16x32 = vpshrdvw( a.as_i16x32(), b.as_i16x32(), - _mm512_set1_epi16(imm8 as i16).as_i16x32(), + _mm512_set1_epi16(imm8).as_i16x32(), ); transmute(simd_select_bitmask(k, shf, src.as_i16x32())) } @@ -1794,14 +1849,20 @@ pub unsafe fn _mm512_mask_shrdi_epi16( /// [Intel's 
documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shrdi_epi16&expand=5095) #[inline] #[target_feature(enable = "avx512vbmi2")] -#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] //should be vpshrdw -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_maskz_shrdi_epi16(k: __mmask32, a: __m512i, b: __m512i, imm8: i32) -> __m512i { +#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_maskz_shrdi_epi16( + k: __mmask32, + a: __m512i, + b: __m512i, +) -> __m512i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i16; assert!(imm8 >= 0 && imm8 <= 255); let shf: i16x32 = vpshrdvw( a.as_i16x32(), b.as_i16x32(), - _mm512_set1_epi16(imm8 as i16).as_i16x32(), + _mm512_set1_epi16(imm8).as_i16x32(), ); let zero = _mm512_setzero_si512().as_i16x32(); transmute(simd_select_bitmask(k, shf, zero)) @@ -1812,14 +1873,16 @@ pub unsafe fn _mm512_maskz_shrdi_epi16(k: __mmask32, a: __m512i, b: __m512i, imm /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shrdi_epi16&expand=5093) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] //should be vpshrdw -#[rustc_args_required_const(2)] -pub unsafe fn _mm256_shrdi_epi16(a: __m256i, b: __m256i, imm8: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm256_shrdi_epi16(a: __m256i, b: __m256i) -> __m256i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i16; assert!(imm8 >= 0 && imm8 <= 255); transmute(vpshrdvw256( a.as_i16x16(), b.as_i16x16(), - _mm256_set1_epi16(imm8 as i16).as_i16x16(), + _mm256_set1_epi16(imm8).as_i16x16(), )) } @@ -1828,20 +1891,21 @@ pub unsafe fn _mm256_shrdi_epi16(a: __m256i, b: __m256i, imm8: i32) -> __m256i { /// [Intel's 
documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shrdi_epi16&expand=5091) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] //should be vpshrdw -#[rustc_args_required_const(4)] -pub unsafe fn _mm256_mask_shrdi_epi16( +#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm256_mask_shrdi_epi16( src: __m256i, k: __mmask16, a: __m256i, b: __m256i, - imm8: i32, ) -> __m256i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i16; assert!(imm8 >= 0 && imm8 <= 255); let shf: i16x16 = vpshrdvw256( a.as_i16x16(), b.as_i16x16(), - _mm256_set1_epi16(imm8 as i16).as_i16x16(), + _mm256_set1_epi16(imm8).as_i16x16(), ); transmute(simd_select_bitmask(k, shf, src.as_i16x16())) } @@ -1851,13 +1915,19 @@ pub unsafe fn _mm256_mask_shrdi_epi16( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shrdi_epi16&expand=5092) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] //should be vpshrdw -#[rustc_args_required_const(3)] -pub unsafe fn _mm256_maskz_shrdi_epi16(k: __mmask16, a: __m256i, b: __m256i, imm8: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm256_maskz_shrdi_epi16( + k: __mmask16, + a: __m256i, + b: __m256i, +) -> __m256i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i16; let shf: i16x16 = vpshrdvw256( a.as_i16x16(), b.as_i16x16(), - _mm256_set1_epi16(imm8 as i16).as_i16x16(), + _mm256_set1_epi16(imm8).as_i16x16(), ); let zero = _mm256_setzero_si256().as_i16x16(); transmute(simd_select_bitmask(k, shf, zero)) @@ -1868,13 +1938,15 @@ pub unsafe fn _mm256_maskz_shrdi_epi16(k: __mmask16, a: __m256i, b: __m256i, imm /// [Intel's 
documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shrdi_epi16&expand=5090) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] //should be vpshrdw -#[rustc_args_required_const(2)] -pub unsafe fn _mm_shrdi_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i { +#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_shrdi_epi16(a: __m128i, b: __m128i) -> __m128i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i16; transmute(vpshrdvw128( a.as_i16x8(), b.as_i16x8(), - _mm_set1_epi16(imm8 as i16).as_i16x8(), + _mm_set1_epi16(imm8).as_i16x8(), )) } @@ -1883,20 +1955,17 @@ pub unsafe fn _mm_shrdi_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shrdi_epi16&expand=5088) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] //should be vpshrdw -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_shrdi_epi16( +#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_shrdi_epi16( src: __m128i, k: __mmask8, a: __m128i, b: __m128i, - imm8: i32, ) -> __m128i { - let shf: i16x8 = vpshrdvw128( - a.as_i16x8(), - b.as_i16x8(), - _mm_set1_epi16(imm8 as i16).as_i16x8(), - ); + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i16; + let shf: i16x8 = vpshrdvw128(a.as_i16x8(), b.as_i16x8(), _mm_set1_epi16(imm8).as_i16x8()); transmute(simd_select_bitmask(k, shf, src.as_i16x8())) } @@ -1905,14 +1974,16 @@ pub unsafe fn _mm_mask_shrdi_epi16( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shrdi_epi16&expand=5089) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] 
//should be vpshrdw -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_shrdi_epi16(k: __mmask8, a: __m128i, b: __m128i, imm8: i32) -> __m128i { - let shf: i16x8 = vpshrdvw128( - a.as_i16x8(), - b.as_i16x8(), - _mm_set1_epi16(imm8 as i16).as_i16x8(), - ); +#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_shrdi_epi16( + k: __mmask8, + a: __m128i, + b: __m128i, +) -> __m128i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i16; + let shf: i16x8 = vpshrdvw128(a.as_i16x8(), b.as_i16x8(), _mm_set1_epi16(imm8).as_i16x8()); let zero = _mm_setzero_si128().as_i16x8(); transmute(simd_select_bitmask(k, shf, zero)) } @@ -2921,7 +2992,7 @@ mod tests { unsafe fn test_mm512_shldi_epi64() { let a = _mm512_set1_epi64(1); let b = _mm512_set1_epi64(1 << 63); - let r = _mm512_shldi_epi64(a, b, 2); + let r = _mm512_shldi_epi64::<2>(a, b); let e = _mm512_set1_epi64(6); assert_eq_m512i(r, e); } @@ -2930,9 +3001,9 @@ mod tests { unsafe fn test_mm512_mask_shldi_epi64() { let a = _mm512_set1_epi64(1); let b = _mm512_set1_epi64(1 << 63); - let r = _mm512_mask_shldi_epi64(a, 0, a, b, 2); + let r = _mm512_mask_shldi_epi64::<2>(a, 0, a, b); assert_eq_m512i(r, a); - let r = _mm512_mask_shldi_epi64(a, 0b11111111, a, b, 2); + let r = _mm512_mask_shldi_epi64::<2>(a, 0b11111111, a, b); let e = _mm512_set1_epi64(6); assert_eq_m512i(r, e); } @@ -2941,9 +3012,9 @@ mod tests { unsafe fn test_mm512_maskz_shldi_epi64() { let a = _mm512_set1_epi64(1); let b = _mm512_set1_epi64(1 << 63); - let r = _mm512_maskz_shldi_epi64(0, a, b, 2); + let r = _mm512_maskz_shldi_epi64::<2>(0, a, b); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_shldi_epi64(0b11111111, a, b, 2); + let r = _mm512_maskz_shldi_epi64::<2>(0b11111111, a, b); let e = _mm512_set1_epi64(6); assert_eq_m512i(r, e); } @@ -2952,7 +3023,7 @@ mod tests { unsafe fn test_mm256_shldi_epi64() { let a = _mm256_set1_epi64x(1); let b = 
_mm256_set1_epi64x(1 << 63); - let r = _mm256_shldi_epi64(a, b, 2); + let r = _mm256_shldi_epi64::<2>(a, b); let e = _mm256_set1_epi64x(6); assert_eq_m256i(r, e); } @@ -2961,9 +3032,9 @@ mod tests { unsafe fn test_mm256_mask_shldi_epi64() { let a = _mm256_set1_epi64x(1); let b = _mm256_set1_epi64x(1 << 63); - let r = _mm256_mask_shldi_epi64(a, 0, a, b, 2); + let r = _mm256_mask_shldi_epi64::<2>(a, 0, a, b); assert_eq_m256i(r, a); - let r = _mm256_mask_shldi_epi64(a, 0b00001111, a, b, 2); + let r = _mm256_mask_shldi_epi64::<2>(a, 0b00001111, a, b); let e = _mm256_set1_epi64x(6); assert_eq_m256i(r, e); } @@ -2972,9 +3043,9 @@ mod tests { unsafe fn test_mm256_maskz_shldi_epi64() { let a = _mm256_set1_epi64x(1); let b = _mm256_set1_epi64x(1 << 63); - let r = _mm256_maskz_shldi_epi64(0, a, b, 2); + let r = _mm256_maskz_shldi_epi64::<2>(0, a, b); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_shldi_epi64(0b00001111, a, b, 2); + let r = _mm256_maskz_shldi_epi64::<2>(0b00001111, a, b); let e = _mm256_set1_epi64x(6); assert_eq_m256i(r, e); } @@ -2983,7 +3054,7 @@ mod tests { unsafe fn test_mm_shldi_epi64() { let a = _mm_set1_epi64x(1); let b = _mm_set1_epi64x(1 << 63); - let r = _mm_shldi_epi64(a, b, 2); + let r = _mm_shldi_epi64::<2>(a, b); let e = _mm_set1_epi64x(6); assert_eq_m128i(r, e); } @@ -2992,9 +3063,9 @@ mod tests { unsafe fn test_mm_mask_shldi_epi64() { let a = _mm_set1_epi64x(1); let b = _mm_set1_epi64x(1 << 63); - let r = _mm_mask_shldi_epi64(a, 0, a, b, 2); + let r = _mm_mask_shldi_epi64::<2>(a, 0, a, b); assert_eq_m128i(r, a); - let r = _mm_mask_shldi_epi64(a, 0b00000011, a, b, 2); + let r = _mm_mask_shldi_epi64::<2>(a, 0b00000011, a, b); let e = _mm_set1_epi64x(6); assert_eq_m128i(r, e); } @@ -3003,9 +3074,9 @@ mod tests { unsafe fn test_mm_maskz_shldi_epi64() { let a = _mm_set1_epi64x(1); let b = _mm_set1_epi64x(1 << 63); - let r = _mm_maskz_shldi_epi64(0, a, b, 2); + let r = _mm_maskz_shldi_epi64::<2>(0, a, b); assert_eq_m128i(r, 
_mm_setzero_si128()); - let r = _mm_maskz_shldi_epi64(0b00000011, a, b, 2); + let r = _mm_maskz_shldi_epi64::<2>(0b00000011, a, b); let e = _mm_set1_epi64x(6); assert_eq_m128i(r, e); } @@ -3014,7 +3085,7 @@ mod tests { unsafe fn test_mm512_shldi_epi32() { let a = _mm512_set1_epi32(1); let b = _mm512_set1_epi32(1 << 31); - let r = _mm512_shldi_epi32(a, b, 2); + let r = _mm512_shldi_epi32::<2>(a, b); let e = _mm512_set1_epi32(6); assert_eq_m512i(r, e); } @@ -3023,9 +3094,9 @@ mod tests { unsafe fn test_mm512_mask_shldi_epi32() { let a = _mm512_set1_epi32(1); let b = _mm512_set1_epi32(1 << 31); - let r = _mm512_mask_shldi_epi32(a, 0, a, b, 2); + let r = _mm512_mask_shldi_epi32::<2>(a, 0, a, b); assert_eq_m512i(r, a); - let r = _mm512_mask_shldi_epi32(a, 0b11111111_11111111, a, b, 2); + let r = _mm512_mask_shldi_epi32::<2>(a, 0b11111111_11111111, a, b); let e = _mm512_set1_epi32(6); assert_eq_m512i(r, e); } @@ -3034,9 +3105,9 @@ mod tests { unsafe fn test_mm512_maskz_shldi_epi32() { let a = _mm512_set1_epi32(1); let b = _mm512_set1_epi32(1 << 31); - let r = _mm512_maskz_shldi_epi32(0, a, b, 2); + let r = _mm512_maskz_shldi_epi32::<2>(0, a, b); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_shldi_epi32(0b11111111_11111111, a, b, 2); + let r = _mm512_maskz_shldi_epi32::<2>(0b11111111_11111111, a, b); let e = _mm512_set1_epi32(6); assert_eq_m512i(r, e); } @@ -3045,7 +3116,7 @@ mod tests { unsafe fn test_mm256_shldi_epi32() { let a = _mm256_set1_epi32(1); let b = _mm256_set1_epi32(1 << 31); - let r = _mm256_shldi_epi32(a, b, 2); + let r = _mm256_shldi_epi32::<2>(a, b); let e = _mm256_set1_epi32(6); assert_eq_m256i(r, e); } @@ -3054,9 +3125,9 @@ mod tests { unsafe fn test_mm256_mask_shldi_epi32() { let a = _mm256_set1_epi32(1); let b = _mm256_set1_epi32(1 << 31); - let r = _mm256_mask_shldi_epi32(a, 0, a, b, 2); + let r = _mm256_mask_shldi_epi32::<2>(a, 0, a, b); assert_eq_m256i(r, a); - let r = _mm256_mask_shldi_epi32(a, 0b11111111, a, b, 2); + let r = 
_mm256_mask_shldi_epi32::<2>(a, 0b11111111, a, b); let e = _mm256_set1_epi32(6); assert_eq_m256i(r, e); } @@ -3065,9 +3136,9 @@ mod tests { unsafe fn test_mm256_maskz_shldi_epi32() { let a = _mm256_set1_epi32(1); let b = _mm256_set1_epi32(1 << 31); - let r = _mm256_maskz_shldi_epi32(0, a, b, 2); + let r = _mm256_maskz_shldi_epi32::<2>(0, a, b); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_shldi_epi32(0b11111111, a, b, 2); + let r = _mm256_maskz_shldi_epi32::<2>(0b11111111, a, b); let e = _mm256_set1_epi32(6); assert_eq_m256i(r, e); } @@ -3076,7 +3147,7 @@ mod tests { unsafe fn test_mm_shldi_epi32() { let a = _mm_set1_epi32(1); let b = _mm_set1_epi32(1 << 31); - let r = _mm_shldi_epi32(a, b, 2); + let r = _mm_shldi_epi32::<2>(a, b); let e = _mm_set1_epi32(6); assert_eq_m128i(r, e); } @@ -3085,9 +3156,9 @@ mod tests { unsafe fn test_mm_mask_shldi_epi32() { let a = _mm_set1_epi32(1); let b = _mm_set1_epi32(1 << 31); - let r = _mm_mask_shldi_epi32(a, 0, a, b, 2); + let r = _mm_mask_shldi_epi32::<2>(a, 0, a, b); assert_eq_m128i(r, a); - let r = _mm_mask_shldi_epi32(a, 0b00001111, a, b, 2); + let r = _mm_mask_shldi_epi32::<2>(a, 0b00001111, a, b); let e = _mm_set1_epi32(6); assert_eq_m128i(r, e); } @@ -3096,9 +3167,9 @@ mod tests { unsafe fn test_mm_maskz_shldi_epi32() { let a = _mm_set1_epi32(1); let b = _mm_set1_epi32(1 << 31); - let r = _mm_maskz_shldi_epi32(0, a, b, 2); + let r = _mm_maskz_shldi_epi32::<2>(0, a, b); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_shldi_epi32(0b00001111, a, b, 2); + let r = _mm_maskz_shldi_epi32::<2>(0b00001111, a, b); let e = _mm_set1_epi32(6); assert_eq_m128i(r, e); } @@ -3107,7 +3178,7 @@ mod tests { unsafe fn test_mm512_shldi_epi16() { let a = _mm512_set1_epi16(1); let b = _mm512_set1_epi16(1 << 15); - let r = _mm512_shldi_epi16(a, b, 2); + let r = _mm512_shldi_epi16::<2>(a, b); let e = _mm512_set1_epi16(6); assert_eq_m512i(r, e); } @@ -3116,9 +3187,9 @@ mod tests { unsafe fn 
test_mm512_mask_shldi_epi16() { let a = _mm512_set1_epi16(1); let b = _mm512_set1_epi16(1 << 15); - let r = _mm512_mask_shldi_epi16(a, 0, a, b, 2); + let r = _mm512_mask_shldi_epi16::<2>(a, 0, a, b); assert_eq_m512i(r, a); - let r = _mm512_mask_shldi_epi16(a, 0b11111111_11111111_11111111_11111111, a, b, 2); + let r = _mm512_mask_shldi_epi16::<2>(a, 0b11111111_11111111_11111111_11111111, a, b); let e = _mm512_set1_epi16(6); assert_eq_m512i(r, e); } @@ -3127,9 +3198,9 @@ mod tests { unsafe fn test_mm512_maskz_shldi_epi16() { let a = _mm512_set1_epi16(1); let b = _mm512_set1_epi16(1 << 15); - let r = _mm512_maskz_shldi_epi16(0, a, b, 2); + let r = _mm512_maskz_shldi_epi16::<2>(0, a, b); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_shldi_epi16(0b11111111_11111111_11111111_11111111, a, b, 2); + let r = _mm512_maskz_shldi_epi16::<2>(0b11111111_11111111_11111111_11111111, a, b); let e = _mm512_set1_epi16(6); assert_eq_m512i(r, e); } @@ -3138,7 +3209,7 @@ mod tests { unsafe fn test_mm256_shldi_epi16() { let a = _mm256_set1_epi16(1); let b = _mm256_set1_epi16(1 << 15); - let r = _mm256_shldi_epi16(a, b, 2); + let r = _mm256_shldi_epi16::<2>(a, b); let e = _mm256_set1_epi16(6); assert_eq_m256i(r, e); } @@ -3147,9 +3218,9 @@ mod tests { unsafe fn test_mm256_mask_shldi_epi16() { let a = _mm256_set1_epi16(1); let b = _mm256_set1_epi16(1 << 15); - let r = _mm256_mask_shldi_epi16(a, 0, a, b, 2); + let r = _mm256_mask_shldi_epi16::<2>(a, 0, a, b); assert_eq_m256i(r, a); - let r = _mm256_mask_shldi_epi16(a, 0b11111111_11111111, a, b, 2); + let r = _mm256_mask_shldi_epi16::<2>(a, 0b11111111_11111111, a, b); let e = _mm256_set1_epi16(6); assert_eq_m256i(r, e); } @@ -3158,9 +3229,9 @@ mod tests { unsafe fn test_mm256_maskz_shldi_epi16() { let a = _mm256_set1_epi16(1); let b = _mm256_set1_epi16(1 << 15); - let r = _mm256_maskz_shldi_epi16(0, a, b, 2); + let r = _mm256_maskz_shldi_epi16::<2>(0, a, b); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = 
_mm256_maskz_shldi_epi16(0b11111111_11111111, a, b, 2); + let r = _mm256_maskz_shldi_epi16::<2>(0b11111111_11111111, a, b); let e = _mm256_set1_epi16(6); assert_eq_m256i(r, e); } @@ -3169,7 +3240,7 @@ mod tests { unsafe fn test_mm_shldi_epi16() { let a = _mm_set1_epi16(1); let b = _mm_set1_epi16(1 << 15); - let r = _mm_shldi_epi16(a, b, 2); + let r = _mm_shldi_epi16::<2>(a, b); let e = _mm_set1_epi16(6); assert_eq_m128i(r, e); } @@ -3178,9 +3249,9 @@ mod tests { unsafe fn test_mm_mask_shldi_epi16() { let a = _mm_set1_epi16(1); let b = _mm_set1_epi16(1 << 15); - let r = _mm_mask_shldi_epi16(a, 0, a, b, 2); + let r = _mm_mask_shldi_epi16::<2>(a, 0, a, b); assert_eq_m128i(r, a); - let r = _mm_mask_shldi_epi16(a, 0b11111111, a, b, 2); + let r = _mm_mask_shldi_epi16::<2>(a, 0b11111111, a, b); let e = _mm_set1_epi16(6); assert_eq_m128i(r, e); } @@ -3189,9 +3260,9 @@ mod tests { unsafe fn test_mm_maskz_shldi_epi16() { let a = _mm_set1_epi16(1); let b = _mm_set1_epi16(1 << 15); - let r = _mm_maskz_shldi_epi16(0, a, b, 2); + let r = _mm_maskz_shldi_epi16::<2>(0, a, b); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_shldi_epi16(0b11111111, a, b, 2); + let r = _mm_maskz_shldi_epi16::<2>(0b11111111, a, b); let e = _mm_set1_epi16(6); assert_eq_m128i(r, e); } @@ -3200,7 +3271,7 @@ mod tests { unsafe fn test_mm512_shrdi_epi64() { let a = _mm512_set1_epi64(8); let b = _mm512_set1_epi64(2); - let r = _mm512_shrdi_epi64(a, b, 1); + let r = _mm512_shrdi_epi64::<1>(a, b); let e = _mm512_set1_epi64(1); assert_eq_m512i(r, e); } @@ -3209,9 +3280,9 @@ mod tests { unsafe fn test_mm512_mask_shrdi_epi64() { let a = _mm512_set1_epi64(8); let b = _mm512_set1_epi64(2); - let r = _mm512_mask_shrdi_epi64(a, 0, a, b, 1); + let r = _mm512_mask_shrdi_epi64::<1>(a, 0, a, b); assert_eq_m512i(r, a); - let r = _mm512_mask_shrdi_epi64(a, 0b11111111, a, b, 1); + let r = _mm512_mask_shrdi_epi64::<1>(a, 0b11111111, a, b); let e = _mm512_set1_epi64(1); assert_eq_m512i(r, e); } @@ -3220,9 
+3291,9 @@ mod tests { unsafe fn test_mm512_maskz_shrdi_epi64() { let a = _mm512_set1_epi64(8); let b = _mm512_set1_epi64(2); - let r = _mm512_maskz_shrdi_epi64(0, a, b, 1); + let r = _mm512_maskz_shrdi_epi64::<1>(0, a, b); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_shrdi_epi64(0b11111111, a, b, 1); + let r = _mm512_maskz_shrdi_epi64::<1>(0b11111111, a, b); let e = _mm512_set1_epi64(1); assert_eq_m512i(r, e); } @@ -3231,7 +3302,7 @@ mod tests { unsafe fn test_mm256_shrdi_epi64() { let a = _mm256_set1_epi64x(8); let b = _mm256_set1_epi64x(2); - let r = _mm256_shrdi_epi64(a, b, 1); + let r = _mm256_shrdi_epi64::<1>(a, b); let e = _mm256_set1_epi64x(1); assert_eq_m256i(r, e); } @@ -3240,9 +3311,9 @@ mod tests { unsafe fn test_mm256_mask_shrdi_epi64() { let a = _mm256_set1_epi64x(8); let b = _mm256_set1_epi64x(2); - let r = _mm256_mask_shrdi_epi64(a, 0, a, b, 1); + let r = _mm256_mask_shrdi_epi64::<1>(a, 0, a, b); assert_eq_m256i(r, a); - let r = _mm256_mask_shrdi_epi64(a, 0b00001111, a, b, 1); + let r = _mm256_mask_shrdi_epi64::<1>(a, 0b00001111, a, b); let e = _mm256_set1_epi64x(1); assert_eq_m256i(r, e); } @@ -3251,9 +3322,9 @@ mod tests { unsafe fn test_mm256_maskz_shrdi_epi64() { let a = _mm256_set1_epi64x(8); let b = _mm256_set1_epi64x(2); - let r = _mm256_maskz_shrdi_epi64(0, a, b, 1); + let r = _mm256_maskz_shrdi_epi64::<1>(0, a, b); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_shrdi_epi64(0b00001111, a, b, 1); + let r = _mm256_maskz_shrdi_epi64::<1>(0b00001111, a, b); let e = _mm256_set1_epi64x(1); assert_eq_m256i(r, e); } @@ -3262,7 +3333,7 @@ mod tests { unsafe fn test_mm_shrdi_epi64() { let a = _mm_set1_epi64x(8); let b = _mm_set1_epi64x(2); - let r = _mm_shrdi_epi64(a, b, 1); + let r = _mm_shrdi_epi64::<1>(a, b); let e = _mm_set1_epi64x(1); assert_eq_m128i(r, e); } @@ -3271,9 +3342,9 @@ mod tests { unsafe fn test_mm_mask_shrdi_epi64() { let a = _mm_set1_epi64x(8); let b = _mm_set1_epi64x(2); - let r = 
_mm_mask_shrdi_epi64(a, 0, a, b, 1); + let r = _mm_mask_shrdi_epi64::<1>(a, 0, a, b); assert_eq_m128i(r, a); - let r = _mm_mask_shrdi_epi64(a, 0b00000011, a, b, 1); + let r = _mm_mask_shrdi_epi64::<1>(a, 0b00000011, a, b); let e = _mm_set1_epi64x(1); assert_eq_m128i(r, e); } @@ -3282,9 +3353,9 @@ mod tests { unsafe fn test_mm_maskz_shrdi_epi64() { let a = _mm_set1_epi64x(8); let b = _mm_set1_epi64x(2); - let r = _mm_maskz_shrdi_epi64(0, a, b, 1); + let r = _mm_maskz_shrdi_epi64::<1>(0, a, b); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_shrdi_epi64(0b00000011, a, b, 1); + let r = _mm_maskz_shrdi_epi64::<1>(0b00000011, a, b); let e = _mm_set1_epi64x(1); assert_eq_m128i(r, e); } @@ -3293,7 +3364,7 @@ mod tests { unsafe fn test_mm512_shrdi_epi32() { let a = _mm512_set1_epi32(8); let b = _mm512_set1_epi32(2); - let r = _mm512_shrdi_epi32(a, b, 1); + let r = _mm512_shrdi_epi32::<1>(a, b); let e = _mm512_set1_epi32(1); assert_eq_m512i(r, e); } @@ -3302,9 +3373,9 @@ mod tests { unsafe fn test_mm512_mask_shrdi_epi32() { let a = _mm512_set1_epi32(8); let b = _mm512_set1_epi32(2); - let r = _mm512_mask_shrdi_epi32(a, 0, a, b, 1); + let r = _mm512_mask_shrdi_epi32::<1>(a, 0, a, b); assert_eq_m512i(r, a); - let r = _mm512_mask_shrdi_epi32(a, 0b11111111_11111111, a, b, 1); + let r = _mm512_mask_shrdi_epi32::<1>(a, 0b11111111_11111111, a, b); let e = _mm512_set1_epi32(1); assert_eq_m512i(r, e); } @@ -3313,9 +3384,9 @@ mod tests { unsafe fn test_mm512_maskz_shrdi_epi32() { let a = _mm512_set1_epi32(8); let b = _mm512_set1_epi32(2); - let r = _mm512_maskz_shrdi_epi32(0, a, b, 1); + let r = _mm512_maskz_shrdi_epi32::<1>(0, a, b); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_shrdi_epi32(0b11111111_11111111, a, b, 1); + let r = _mm512_maskz_shrdi_epi32::<1>(0b11111111_11111111, a, b); let e = _mm512_set1_epi32(1); assert_eq_m512i(r, e); } @@ -3324,7 +3395,7 @@ mod tests { unsafe fn test_mm256_shrdi_epi32() { let a = _mm256_set1_epi32(8); let b = 
_mm256_set1_epi32(2); - let r = _mm256_shrdi_epi32(a, b, 1); + let r = _mm256_shrdi_epi32::<1>(a, b); let e = _mm256_set1_epi32(1); assert_eq_m256i(r, e); } @@ -3333,9 +3404,9 @@ mod tests { unsafe fn test_mm256_mask_shrdi_epi32() { let a = _mm256_set1_epi32(8); let b = _mm256_set1_epi32(2); - let r = _mm256_mask_shrdi_epi32(a, 0, a, b, 1); + let r = _mm256_mask_shrdi_epi32::<1>(a, 0, a, b); assert_eq_m256i(r, a); - let r = _mm256_mask_shrdi_epi32(a, 0b11111111, a, b, 1); + let r = _mm256_mask_shrdi_epi32::<1>(a, 0b11111111, a, b); let e = _mm256_set1_epi32(1); assert_eq_m256i(r, e); } @@ -3344,9 +3415,9 @@ mod tests { unsafe fn test_mm256_maskz_shrdi_epi32() { let a = _mm256_set1_epi32(8); let b = _mm256_set1_epi32(2); - let r = _mm256_maskz_shrdi_epi32(0, a, b, 1); + let r = _mm256_maskz_shrdi_epi32::<1>(0, a, b); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_shrdi_epi32(0b11111111, a, b, 1); + let r = _mm256_maskz_shrdi_epi32::<1>(0b11111111, a, b); let e = _mm256_set1_epi32(1); assert_eq_m256i(r, e); } @@ -3355,7 +3426,7 @@ mod tests { unsafe fn test_mm_shrdi_epi32() { let a = _mm_set1_epi32(8); let b = _mm_set1_epi32(2); - let r = _mm_shrdi_epi32(a, b, 1); + let r = _mm_shrdi_epi32::<1>(a, b); let e = _mm_set1_epi32(1); assert_eq_m128i(r, e); } @@ -3364,9 +3435,9 @@ mod tests { unsafe fn test_mm_mask_shrdi_epi32() { let a = _mm_set1_epi32(8); let b = _mm_set1_epi32(2); - let r = _mm_mask_shrdi_epi32(a, 0, a, b, 1); + let r = _mm_mask_shrdi_epi32::<1>(a, 0, a, b); assert_eq_m128i(r, a); - let r = _mm_mask_shrdi_epi32(a, 0b00001111, a, b, 1); + let r = _mm_mask_shrdi_epi32::<1>(a, 0b00001111, a, b); let e = _mm_set1_epi32(1); assert_eq_m128i(r, e); } @@ -3375,9 +3446,9 @@ mod tests { unsafe fn test_mm_maskz_shrdi_epi32() { let a = _mm_set1_epi32(8); let b = _mm_set1_epi32(2); - let r = _mm_maskz_shrdi_epi32(0, a, b, 1); + let r = _mm_maskz_shrdi_epi32::<1>(0, a, b); assert_eq_m128i(r, _mm_setzero_si128()); - let r = 
_mm_maskz_shrdi_epi32(0b00001111, a, b, 1); + let r = _mm_maskz_shrdi_epi32::<1>(0b00001111, a, b); let e = _mm_set1_epi32(1); assert_eq_m128i(r, e); } @@ -3386,7 +3457,7 @@ mod tests { unsafe fn test_mm512_shrdi_epi16() { let a = _mm512_set1_epi16(8); let b = _mm512_set1_epi16(2); - let r = _mm512_shrdi_epi16(a, b, 1); + let r = _mm512_shrdi_epi16::<1>(a, b); let e = _mm512_set1_epi16(1); assert_eq_m512i(r, e); } @@ -3395,9 +3466,9 @@ mod tests { unsafe fn test_mm512_mask_shrdi_epi16() { let a = _mm512_set1_epi16(8); let b = _mm512_set1_epi16(2); - let r = _mm512_mask_shrdi_epi16(a, 0, a, b, 1); + let r = _mm512_mask_shrdi_epi16::<1>(a, 0, a, b); assert_eq_m512i(r, a); - let r = _mm512_mask_shrdi_epi16(a, 0b11111111_11111111_11111111_11111111, a, b, 1); + let r = _mm512_mask_shrdi_epi16::<1>(a, 0b11111111_11111111_11111111_11111111, a, b); let e = _mm512_set1_epi16(1); assert_eq_m512i(r, e); } @@ -3406,9 +3477,9 @@ mod tests { unsafe fn test_mm512_maskz_shrdi_epi16() { let a = _mm512_set1_epi16(8); let b = _mm512_set1_epi16(2); - let r = _mm512_maskz_shrdi_epi16(0, a, b, 1); + let r = _mm512_maskz_shrdi_epi16::<1>(0, a, b); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_shrdi_epi16(0b11111111_11111111_11111111_11111111, a, b, 1); + let r = _mm512_maskz_shrdi_epi16::<1>(0b11111111_11111111_11111111_11111111, a, b); let e = _mm512_set1_epi16(1); assert_eq_m512i(r, e); } @@ -3417,7 +3488,7 @@ mod tests { unsafe fn test_mm256_shrdi_epi16() { let a = _mm256_set1_epi16(8); let b = _mm256_set1_epi16(2); - let r = _mm256_shrdi_epi16(a, b, 1); + let r = _mm256_shrdi_epi16::<1>(a, b); let e = _mm256_set1_epi16(1); assert_eq_m256i(r, e); } @@ -3426,9 +3497,9 @@ mod tests { unsafe fn test_mm256_mask_shrdi_epi16() { let a = _mm256_set1_epi16(8); let b = _mm256_set1_epi16(2); - let r = _mm256_mask_shrdi_epi16(a, 0, a, b, 1); + let r = _mm256_mask_shrdi_epi16::<1>(a, 0, a, b); assert_eq_m256i(r, a); - let r = _mm256_mask_shrdi_epi16(a, 0b11111111_11111111, a, 
b, 1); + let r = _mm256_mask_shrdi_epi16::<1>(a, 0b11111111_11111111, a, b); let e = _mm256_set1_epi16(1); assert_eq_m256i(r, e); } @@ -3437,9 +3508,9 @@ mod tests { unsafe fn test_mm256_maskz_shrdi_epi16() { let a = _mm256_set1_epi16(8); let b = _mm256_set1_epi16(2); - let r = _mm256_maskz_shrdi_epi16(0, a, b, 1); + let r = _mm256_maskz_shrdi_epi16::<1>(0, a, b); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_shrdi_epi16(0b11111111_11111111, a, b, 1); + let r = _mm256_maskz_shrdi_epi16::<1>(0b11111111_11111111, a, b); let e = _mm256_set1_epi16(1); assert_eq_m256i(r, e); } @@ -3448,7 +3519,7 @@ mod tests { unsafe fn test_mm_shrdi_epi16() { let a = _mm_set1_epi16(8); let b = _mm_set1_epi16(2); - let r = _mm_shrdi_epi16(a, b, 1); + let r = _mm_shrdi_epi16::<1>(a, b); let e = _mm_set1_epi16(1); assert_eq_m128i(r, e); } @@ -3457,9 +3528,9 @@ mod tests { unsafe fn test_mm_mask_shrdi_epi16() { let a = _mm_set1_epi16(8); let b = _mm_set1_epi16(2); - let r = _mm_mask_shrdi_epi16(a, 0, a, b, 1); + let r = _mm_mask_shrdi_epi16::<1>(a, 0, a, b); assert_eq_m128i(r, a); - let r = _mm_mask_shrdi_epi16(a, 0b11111111, a, b, 1); + let r = _mm_mask_shrdi_epi16::<1>(a, 0b11111111, a, b); let e = _mm_set1_epi16(1); assert_eq_m128i(r, e); } @@ -3468,9 +3539,9 @@ mod tests { unsafe fn test_mm_maskz_shrdi_epi16() { let a = _mm_set1_epi16(8); let b = _mm_set1_epi16(2); - let r = _mm_maskz_shrdi_epi16(0, a, b, 1); + let r = _mm_maskz_shrdi_epi16::<1>(0, a, b); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_shrdi_epi16(0b11111111, a, b, 1); + let r = _mm_maskz_shrdi_epi16::<1>(0b11111111, a, b); let e = _mm_set1_epi16(1); assert_eq_m128i(r, e); } From a2b7f9d4b159e05b8fa305d9222d370523d89cd5 Mon Sep 17 00:00:00 2001 From: jironglin Date: Fri, 5 Mar 2021 00:33:09 +0000 Subject: [PATCH 017/123] ror_epi32,epi64, rol_epi32_epi64, srai_epi32 --- crates/core_arch/src/x86/avx512f.rs | 760 +++++++++++-------------- crates/core_arch/src/x86_64/avx512f.rs | 60 +- 2 
files changed, 360 insertions(+), 460 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 5abe23e093..5e5104b618 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -16624,16 +16624,12 @@ pub unsafe fn _mm_maskz_expand_pd(k: __mmask8, a: __m128d) -> __m128d { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rol_epi32&expand=4685) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vprold, imm8 = 1))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm512_rol_epi32(a: __m512i, imm8: i32) -> __m512i { +#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_rol_epi32(a: __m512i) -> __m512i { + static_assert_imm8!(IMM8); let a = a.as_i32x16(); - macro_rules! call { - ($imm8:expr) => { - vprold(a, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = vprold(a, IMM8); transmute(r) } @@ -16642,17 +16638,17 @@ pub unsafe fn _mm512_rol_epi32(a: __m512i, imm8: i32) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rol_epi32&expand=4683) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vprold, imm8 = 1))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_rol_epi32(src: __m512i, k: __mmask16, a: __m512i, imm8: i32) -> __m512i { +#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_rol_epi32( + src: __m512i, + k: __mmask16, + a: __m512i, +) -> __m512i { + static_assert_imm8!(IMM8); let a = a.as_i32x16(); - macro_rules! 
call { - ($imm8:expr) => { - vprold(a, $imm8) - }; - } - let rol = constify_imm8_sae!(imm8, call); - transmute(simd_select_bitmask(k, rol, src.as_i32x16())) + let r = vprold(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i32x16())) } /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -16660,18 +16656,14 @@ pub unsafe fn _mm512_mask_rol_epi32(src: __m512i, k: __mmask16, a: __m512i, imm8 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rol_epi32&expand=4684) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vprold, imm8 = 1))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_rol_epi32(k: __mmask16, a: __m512i, imm8: i32) -> __m512i { +#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_rol_epi32(k: __mmask16, a: __m512i) -> __m512i { + static_assert_imm8!(IMM8); let a = a.as_i32x16(); - macro_rules! call { - ($imm8:expr) => { - vprold(a, $imm8) - }; - } - let rol = constify_imm8_sae!(imm8, call); + let r = vprold(a, IMM8); let zero = _mm512_setzero_si512().as_i32x16(); - transmute(simd_select_bitmask(k, rol, zero)) + transmute(simd_select_bitmask(k, r, zero)) } /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst. 
@@ -16679,16 +16671,12 @@ pub unsafe fn _mm512_maskz_rol_epi32(k: __mmask16, a: __m512i, imm8: i32) -> __m /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rol_epi32&expand=4682) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprold, imm8 = 1))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm256_rol_epi32(a: __m256i, imm8: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm256_rol_epi32(a: __m256i) -> __m256i { + static_assert_imm8!(IMM8); let a = a.as_i32x8(); - macro_rules! call { - ($imm8:expr) => { - vprold256(a, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = vprold256(a, IMM8); transmute(r) } @@ -16697,17 +16685,17 @@ pub unsafe fn _mm256_rol_epi32(a: __m256i, imm8: i32) -> __m256i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rol_epi32&expand=4680) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprold, imm8 = 1))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm256_mask_rol_epi32(src: __m256i, k: __mmask8, a: __m256i, imm8: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm256_mask_rol_epi32( + src: __m256i, + k: __mmask8, + a: __m256i, +) -> __m256i { + static_assert_imm8!(IMM8); let a = a.as_i32x8(); - macro_rules! call { - ($imm8:expr) => { - vprold256(a, $imm8) - }; - } - let rol = constify_imm8_sae!(imm8, call); - transmute(simd_select_bitmask(k, rol, src.as_i32x8())) + let r = vprold256(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i32x8())) } /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -16715,18 +16703,14 @@ pub unsafe fn _mm256_mask_rol_epi32(src: __m256i, k: __mmask8, a: __m256i, imm8: /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rol_epi32&expand=4681) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprold, imm8 = 1))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm256_maskz_rol_epi32(k: __mmask8, a: __m256i, imm8: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm256_maskz_rol_epi32(k: __mmask8, a: __m256i) -> __m256i { + static_assert_imm8!(IMM8); let a = a.as_i32x8(); - macro_rules! call { - ($imm8:expr) => { - vprold256(a, $imm8) - }; - } - let rol = constify_imm8_sae!(imm8, call); + let r = vprold256(a, IMM8); let zero = _mm256_setzero_si256().as_i32x8(); - transmute(simd_select_bitmask(k, rol, zero)) + transmute(simd_select_bitmask(k, r, zero)) } /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst. @@ -16734,16 +16718,12 @@ pub unsafe fn _mm256_maskz_rol_epi32(k: __mmask8, a: __m256i, imm8: i32) -> __m2 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rol_epi32&expand=4679) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprold, imm8 = 1))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_rol_epi32(a: __m128i, imm8: i32) -> __m128i { +#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_rol_epi32(a: __m128i) -> __m128i { + static_assert_imm8!(IMM8); let a = a.as_i32x4(); - macro_rules! 
call { - ($imm8:expr) => { - vprold128(a, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = vprold128(a, IMM8); transmute(r) } @@ -16752,17 +16732,17 @@ pub unsafe fn _mm_rol_epi32(a: __m128i, imm8: i32) -> __m128i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rol_epi32&expand=4677) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprold, imm8 = 1))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_mask_rol_epi32(src: __m128i, k: __mmask8, a: __m128i, imm8: i32) -> __m128i { +#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_mask_rol_epi32( + src: __m128i, + k: __mmask8, + a: __m128i, +) -> __m128i { + static_assert_imm8!(IMM8); let a = a.as_i32x4(); - macro_rules! call { - ($imm8:expr) => { - vprold128(a, $imm8) - }; - } - let rol = constify_imm8_sae!(imm8, call); - transmute(simd_select_bitmask(k, rol, src.as_i32x4())) + let r = vprold128(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i32x4())) } /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -16770,18 +16750,14 @@ pub unsafe fn _mm_mask_rol_epi32(src: __m128i, k: __mmask8, a: __m128i, imm8: i3 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rol_epi32&expand=4678) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprold, imm8 = 1))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_maskz_rol_epi32(k: __mmask8, a: __m128i, imm8: i32) -> __m128i { +#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_maskz_rol_epi32(k: __mmask8, a: __m128i) -> __m128i { + static_assert_imm8!(IMM8); let a = a.as_i32x4(); - macro_rules! call { - ($imm8:expr) => { - vprold128(a, $imm8) - }; - } - let rol = constify_imm8_sae!(imm8, call); + let r = vprold128(a, IMM8); let zero = _mm_setzero_si128().as_i32x4(); - transmute(simd_select_bitmask(k, rol, zero)) + transmute(simd_select_bitmask(k, r, zero)) } /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst. @@ -16789,16 +16765,12 @@ pub unsafe fn _mm_maskz_rol_epi32(k: __mmask8, a: __m128i, imm8: i32) -> __m128i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_ror_epi32&expand=4721) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vprold, imm8 = 1))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm512_ror_epi32(a: __m512i, imm8: i32) -> __m512i { +#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_ror_epi32(a: __m512i) -> __m512i { + static_assert_imm8!(IMM8); let a = a.as_i32x16(); - macro_rules! 
call { - ($imm8:expr) => { - vprord(a, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = vprord(a, IMM8); transmute(r) } @@ -16807,17 +16779,17 @@ pub unsafe fn _mm512_ror_epi32(a: __m512i, imm8: i32) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_ror_epi32&expand=4719) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vprold, imm8 = 123))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_ror_epi32(src: __m512i, k: __mmask16, a: __m512i, imm8: i32) -> __m512i { +#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_ror_epi32( + src: __m512i, + k: __mmask16, + a: __m512i, +) -> __m512i { + static_assert_imm8!(IMM8); let a = a.as_i32x16(); - macro_rules! call { - ($imm8:expr) => { - vprord(a, $imm8) - }; - } - let ror = constify_imm8_sae!(imm8, call); - transmute(simd_select_bitmask(k, ror, src.as_i32x16())) + let r = vprord(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i32x16())) } /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -16825,18 +16797,14 @@ pub unsafe fn _mm512_mask_ror_epi32(src: __m512i, k: __mmask16, a: __m512i, imm8 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_ror_epi32&expand=4720) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vprold, imm8 = 123))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_ror_epi32(k: __mmask16, a: __m512i, imm8: i32) -> __m512i { +#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_ror_epi32(k: __mmask16, a: __m512i) -> __m512i { + static_assert_imm8!(IMM8); let a = a.as_i32x16(); - macro_rules! call { - ($imm8:expr) => { - vprord(a, $imm8) - }; - } - let ror = constify_imm8_sae!(imm8, call); + let r = vprord(a, IMM8); let zero = _mm512_setzero_si512().as_i32x16(); - transmute(simd_select_bitmask(k, ror, zero)) + transmute(simd_select_bitmask(k, r, zero)) } /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst. @@ -16844,16 +16812,12 @@ pub unsafe fn _mm512_maskz_ror_epi32(k: __mmask16, a: __m512i, imm8: i32) -> __m /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_ror_epi32&expand=4718) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprold, imm8 = 1))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm256_ror_epi32(a: __m256i, imm8: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm256_ror_epi32(a: __m256i) -> __m256i { + static_assert_imm8!(IMM8); let a = a.as_i32x8(); - macro_rules! 
call { - ($imm8:expr) => { - vprord256(a, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = vprord256(a, IMM8); transmute(r) } @@ -16862,17 +16826,17 @@ pub unsafe fn _mm256_ror_epi32(a: __m256i, imm8: i32) -> __m256i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_ror_epi32&expand=4716) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprold, imm8 = 123))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm256_mask_ror_epi32(src: __m256i, k: __mmask8, a: __m256i, imm8: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm256_mask_ror_epi32( + src: __m256i, + k: __mmask8, + a: __m256i, +) -> __m256i { + static_assert_imm8!(IMM8); let a = a.as_i32x8(); - macro_rules! call { - ($imm8:expr) => { - vprord256(a, $imm8) - }; - } - let ror = constify_imm8_sae!(imm8, call); - transmute(simd_select_bitmask(k, ror, src.as_i32x8())) + let r = vprord256(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i32x8())) } /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -16880,18 +16844,14 @@ pub unsafe fn _mm256_mask_ror_epi32(src: __m256i, k: __mmask8, a: __m256i, imm8: /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_ror_epi32&expand=4717) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprold, imm8 = 123))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm256_maskz_ror_epi32(k: __mmask8, a: __m256i, imm8: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm256_maskz_ror_epi32(k: __mmask8, a: __m256i) -> __m256i { + static_assert_imm8!(IMM8); let a = a.as_i32x8(); - macro_rules! call { - ($imm8:expr) => { - vprord256(a, $imm8) - }; - } - let ror = constify_imm8_sae!(imm8, call); + let r = vprord256(a, IMM8); let zero = _mm256_setzero_si256().as_i32x8(); - transmute(simd_select_bitmask(k, ror, zero)) + transmute(simd_select_bitmask(k, r, zero)) } /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst. @@ -16899,16 +16859,12 @@ pub unsafe fn _mm256_maskz_ror_epi32(k: __mmask8, a: __m256i, imm8: i32) -> __m2 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ror_epi32&expand=4715) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprold, imm8 = 1))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_ror_epi32(a: __m128i, imm8: i32) -> __m128i { +#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_ror_epi32(a: __m128i) -> __m128i { + static_assert_imm8!(IMM8); let a = a.as_i32x4(); - macro_rules! 
call { - ($imm8:expr) => { - vprord128(a, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = vprord128(a, IMM8); transmute(r) } @@ -16917,17 +16873,17 @@ pub unsafe fn _mm_ror_epi32(a: __m128i, imm8: i32) -> __m128i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_ror_epi32&expand=4713) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprold, imm8 = 123))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_mask_ror_epi32(src: __m128i, k: __mmask8, a: __m128i, imm8: i32) -> __m128i { +#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_mask_ror_epi32( + src: __m128i, + k: __mmask8, + a: __m128i, +) -> __m128i { + static_assert_imm8!(IMM8); let a = a.as_i32x4(); - macro_rules! call { - ($imm8:expr) => { - vprord128(a, $imm8) - }; - } - let ror = constify_imm8_sae!(imm8, call); - transmute(simd_select_bitmask(k, ror, src.as_i32x4())) + let r = vprord128(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i32x4())) } /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -16935,18 +16891,14 @@ pub unsafe fn _mm_mask_ror_epi32(src: __m128i, k: __mmask8, a: __m128i, imm8: i3 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_ror_epi32&expand=4714) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprold, imm8 = 123))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_maskz_ror_epi32(k: __mmask8, a: __m128i, imm8: i32) -> __m128i { +#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_maskz_ror_epi32(k: __mmask8, a: __m128i) -> __m128i { + static_assert_imm8!(IMM8); let a = a.as_i32x4(); - macro_rules! call { - ($imm8:expr) => { - vprord128(a, $imm8) - }; - } - let ror = constify_imm8_sae!(imm8, call); + let r = vprord128(a, IMM8); let zero = _mm_setzero_si128().as_i32x4(); - transmute(simd_select_bitmask(k, ror, zero)) + transmute(simd_select_bitmask(k, r, zero)) } /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst. @@ -16954,16 +16906,12 @@ pub unsafe fn _mm_maskz_ror_epi32(k: __mmask8, a: __m128i, imm8: i32) -> __m128i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rol_epi64&expand=4694) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vprolq, imm8 = 1))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm512_rol_epi64(a: __m512i, imm8: i32) -> __m512i { +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_rol_epi64(a: __m512i) -> __m512i { + static_assert_imm8!(IMM8); let a = a.as_i64x8(); - macro_rules! 
call { - ($imm8:expr) => { - vprolq(a, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = vprolq(a, IMM8); transmute(r) } @@ -16972,17 +16920,17 @@ pub unsafe fn _mm512_rol_epi64(a: __m512i, imm8: i32) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rol_epi64&expand=4692) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vprolq, imm8 = 1))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_rol_epi64(src: __m512i, k: __mmask8, a: __m512i, imm8: i32) -> __m512i { +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_rol_epi64( + src: __m512i, + k: __mmask8, + a: __m512i, +) -> __m512i { + static_assert_imm8!(IMM8); let a = a.as_i64x8(); - macro_rules! call { - ($imm8:expr) => { - vprolq(a, $imm8) - }; - } - let rol = constify_imm8_sae!(imm8, call); - transmute(simd_select_bitmask(k, rol, src.as_i64x8())) + let r = vprolq(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i64x8())) } /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -16990,18 +16938,14 @@ pub unsafe fn _mm512_mask_rol_epi64(src: __m512i, k: __mmask8, a: __m512i, imm8: /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rol_epi64&expand=4693) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vprolq, imm8 = 1))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_rol_epi64(k: __mmask8, a: __m512i, imm8: i32) -> __m512i { +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_rol_epi64(k: __mmask8, a: __m512i) -> __m512i { + static_assert_imm8!(IMM8); let a = a.as_i64x8(); - macro_rules! 
call { - ($imm8:expr) => { - vprolq(a, $imm8) - }; - } - let rol = constify_imm8_sae!(imm8, call); + let r = vprolq(a, IMM8); let zero = _mm512_setzero_si512().as_i64x8(); - transmute(simd_select_bitmask(k, rol, zero)) + transmute(simd_select_bitmask(k, r, zero)) } /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst. @@ -17009,16 +16953,12 @@ pub unsafe fn _mm512_maskz_rol_epi64(k: __mmask8, a: __m512i, imm8: i32) -> __m5 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rol_epi64&expand=4691) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprolq, imm8 = 1))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm256_rol_epi64(a: __m256i, imm8: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm256_rol_epi64(a: __m256i) -> __m256i { + static_assert_imm8!(IMM8); let a = a.as_i64x4(); - macro_rules! call { - ($imm8:expr) => { - vprolq256(a, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = vprolq256(a, IMM8); transmute(r) } @@ -17027,17 +16967,17 @@ pub unsafe fn _mm256_rol_epi64(a: __m256i, imm8: i32) -> __m256i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rol_epi64&expand=4689) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprolq, imm8 = 1))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm256_mask_rol_epi64(src: __m256i, k: __mmask8, a: __m256i, imm8: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm256_mask_rol_epi64( + src: __m256i, + k: __mmask8, + a: __m256i, +) -> __m256i { + static_assert_imm8!(IMM8); let a = a.as_i64x4(); - macro_rules! 
call { - ($imm8:expr) => { - vprolq256(a, $imm8) - }; - } - let rol = constify_imm8_sae!(imm8, call); - transmute(simd_select_bitmask(k, rol, src.as_i64x4())) + let r = vprolq256(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i64x4())) } /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -17045,18 +16985,14 @@ pub unsafe fn _mm256_mask_rol_epi64(src: __m256i, k: __mmask8, a: __m256i, imm8: /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rol_epi64&expand=4690) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprolq, imm8 = 1))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm256_maskz_rol_epi64(k: __mmask8, a: __m256i, imm8: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm256_maskz_rol_epi64(k: __mmask8, a: __m256i) -> __m256i { + static_assert_imm8!(IMM8); let a = a.as_i64x4(); - macro_rules! call { - ($imm8:expr) => { - vprolq256(a, $imm8) - }; - } - let rol = constify_imm8_sae!(imm8, call); + let r = vprolq256(a, IMM8); let zero = _mm256_setzero_si256().as_i64x4(); - transmute(simd_select_bitmask(k, rol, zero)) + transmute(simd_select_bitmask(k, r, zero)) } /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst. 
@@ -17064,16 +17000,12 @@ pub unsafe fn _mm256_maskz_rol_epi64(k: __mmask8, a: __m256i, imm8: i32) -> __m2 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rol_epi64&expand=4688) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprolq, imm8 = 1))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_rol_epi64(a: __m128i, imm8: i32) -> __m128i { +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_rol_epi64(a: __m128i) -> __m128i { + static_assert_imm8!(IMM8); let a = a.as_i64x2(); - macro_rules! call { - ($imm8:expr) => { - vprolq128(a, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = vprolq128(a, IMM8); transmute(r) } @@ -17082,17 +17014,17 @@ pub unsafe fn _mm_rol_epi64(a: __m128i, imm8: i32) -> __m128i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rol_epi64&expand=4686) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprolq, imm8 = 1))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_mask_rol_epi64(src: __m128i, k: __mmask8, a: __m128i, imm8: i32) -> __m128i { +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_mask_rol_epi64( + src: __m128i, + k: __mmask8, + a: __m128i, +) -> __m128i { + static_assert_imm8!(IMM8); let a = a.as_i64x2(); - macro_rules! call { - ($imm8:expr) => { - vprolq128(a, $imm8) - }; - } - let rol = constify_imm8_sae!(imm8, call); - transmute(simd_select_bitmask(k, rol, src.as_i64x2())) + let r = vprolq128(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i64x2())) } /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -17100,18 +17032,14 @@ pub unsafe fn _mm_mask_rol_epi64(src: __m128i, k: __mmask8, a: __m128i, imm8: i3 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rol_epi64&expand=4687) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprolq, imm8 = 1))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_maskz_rol_epi64(k: __mmask8, a: __m128i, imm8: i32) -> __m128i { +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_maskz_rol_epi64(k: __mmask8, a: __m128i) -> __m128i { + static_assert_imm8!(IMM8); let a = a.as_i64x2(); - macro_rules! call { - ($imm8:expr) => { - vprolq128(a, $imm8) - }; - } - let rol = constify_imm8_sae!(imm8, call); + let r = vprolq128(a, IMM8); let zero = _mm_setzero_si128().as_i64x2(); - transmute(simd_select_bitmask(k, rol, zero)) + transmute(simd_select_bitmask(k, r, zero)) } /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst. @@ -17119,16 +17047,12 @@ pub unsafe fn _mm_maskz_rol_epi64(k: __mmask8, a: __m128i, imm8: i32) -> __m128i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_ror_epi64&expand=4730) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vprolq, imm8 = 15))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm512_ror_epi64(a: __m512i, imm8: i32) -> __m512i { +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_ror_epi64(a: __m512i) -> __m512i { + static_assert_imm8!(IMM8); let a = a.as_i64x8(); - macro_rules! 
call { - ($imm8:expr) => { - vprorq(a, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = vprorq(a, IMM8); transmute(r) } @@ -17137,17 +17061,17 @@ pub unsafe fn _mm512_ror_epi64(a: __m512i, imm8: i32) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_ror_epi64&expand=4728) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vprolq, imm8 = 15))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_ror_epi64(src: __m512i, k: __mmask8, a: __m512i, imm8: i32) -> __m512i { +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_ror_epi64( + src: __m512i, + k: __mmask8, + a: __m512i, +) -> __m512i { + static_assert_imm8!(IMM8); let a = a.as_i64x8(); - macro_rules! call { - ($imm8:expr) => { - vprorq(a, $imm8) - }; - } - let ror = constify_imm8_sae!(imm8, call); - transmute(simd_select_bitmask(k, ror, src.as_i64x8())) + let r = vprorq(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i64x8())) } /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -17155,18 +17079,14 @@ pub unsafe fn _mm512_mask_ror_epi64(src: __m512i, k: __mmask8, a: __m512i, imm8: /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_ror_epi64&expand=4729) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vprolq, imm8 = 15))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_ror_epi64(k: __mmask8, a: __m512i, imm8: i32) -> __m512i { +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_ror_epi64(k: __mmask8, a: __m512i) -> __m512i { + static_assert_imm8!(IMM8); let a = a.as_i64x8(); - macro_rules! 
call { - ($imm8:expr) => { - vprorq(a, $imm8) - }; - } - let ror = constify_imm8_sae!(imm8, call); + let r = vprorq(a, IMM8); let zero = _mm512_setzero_si512().as_i64x8(); - transmute(simd_select_bitmask(k, ror, zero)) + transmute(simd_select_bitmask(k, r, zero)) } /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst. @@ -17174,16 +17094,12 @@ pub unsafe fn _mm512_maskz_ror_epi64(k: __mmask8, a: __m512i, imm8: i32) -> __m5 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_ror_epi64&expand=4727) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprolq, imm8 = 15))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm256_ror_epi64(a: __m256i, imm8: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm256_ror_epi64(a: __m256i) -> __m256i { + static_assert_imm8!(IMM8); let a = a.as_i64x4(); - macro_rules! call { - ($imm8:expr) => { - vprorq256(a, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = vprorq256(a, IMM8); transmute(r) } @@ -17192,17 +17108,17 @@ pub unsafe fn _mm256_ror_epi64(a: __m256i, imm8: i32) -> __m256i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_ror_epi64&expand=4725) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprolq, imm8 = 15))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm256_mask_ror_epi64(src: __m256i, k: __mmask8, a: __m256i, imm8: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm256_mask_ror_epi64( + src: __m256i, + k: __mmask8, + a: __m256i, +) -> __m256i { + static_assert_imm8!(IMM8); let a = a.as_i64x4(); - macro_rules! 
call { - ($imm8:expr) => { - vprorq256(a, $imm8) - }; - } - let ror = constify_imm8_sae!(imm8, call); - transmute(simd_select_bitmask(k, ror, src.as_i64x4())) + let r = vprorq256(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i64x4())) } /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -17210,18 +17126,14 @@ pub unsafe fn _mm256_mask_ror_epi64(src: __m256i, k: __mmask8, a: __m256i, imm8: /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_ror_epi64&expand=4726) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprolq, imm8 = 15))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm256_maskz_ror_epi64(k: __mmask8, a: __m256i, imm8: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm256_maskz_ror_epi64(k: __mmask8, a: __m256i) -> __m256i { + static_assert_imm8!(IMM8); let a = a.as_i64x4(); - macro_rules! call { - ($imm8:expr) => { - vprorq256(a, $imm8) - }; - } - let ror = constify_imm8_sae!(imm8, call); + let r = vprorq256(a, IMM8); let zero = _mm256_setzero_si256().as_i64x4(); - transmute(simd_select_bitmask(k, ror, zero)) + transmute(simd_select_bitmask(k, r, zero)) } /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst. 
@@ -17229,16 +17141,12 @@ pub unsafe fn _mm256_maskz_ror_epi64(k: __mmask8, a: __m256i, imm8: i32) -> __m2 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ror_epi64&expand=4724) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprolq, imm8 = 15))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_ror_epi64(a: __m128i, imm8: i32) -> __m128i { +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_ror_epi64(a: __m128i) -> __m128i { + static_assert_imm8!(IMM8); let a = a.as_i64x2(); - macro_rules! call { - ($imm8:expr) => { - vprorq128(a, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = vprorq128(a, IMM8); transmute(r) } @@ -17247,17 +17155,17 @@ pub unsafe fn _mm_ror_epi64(a: __m128i, imm8: i32) -> __m128i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_ror_epi64&expand=4722) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprolq, imm8 = 15))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_mask_ror_epi64(src: __m128i, k: __mmask8, a: __m128i, imm8: i32) -> __m128i { +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_mask_ror_epi64( + src: __m128i, + k: __mmask8, + a: __m128i, +) -> __m128i { + static_assert_imm8!(IMM8); let a = a.as_i64x2(); - macro_rules! call { - ($imm8:expr) => { - vprorq128(a, $imm8) - }; - } - let ror = constify_imm8_sae!(imm8, call); - transmute(simd_select_bitmask(k, ror, src.as_i64x2())) + let r = vprorq128(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i64x2())) } /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -17265,18 +17173,14 @@ pub unsafe fn _mm_mask_ror_epi64(src: __m128i, k: __mmask8, a: __m128i, imm8: i3 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_ror_epi64&expand=4723) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprolq, imm8 = 15))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_maskz_ror_epi64(k: __mmask8, a: __m128i, imm8: i32) -> __m128i { +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_maskz_ror_epi64(k: __mmask8, a: __m128i) -> __m128i { + static_assert_imm8!(IMM8); let a = a.as_i64x2(); - macro_rules! call { - ($imm8:expr) => { - vprorq128(a, $imm8) - }; - } - let ror = constify_imm8_sae!(imm8, call); + let r = vprorq128(a, IMM8); let zero = _mm_setzero_si128().as_i64x2(); - transmute(simd_select_bitmask(k, ror, zero)) + transmute(simd_select_bitmask(k, r, zero)) } /// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst. 
@@ -18343,7 +18247,7 @@ pub unsafe fn _mm_maskz_sra_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __ pub unsafe fn _mm512_srai_epi32(a: __m512i) -> __m512i { static_assert_imm8u!(IMM8); let a = a.as_i32x16(); - let r = vpsraid(a, IMM8); + let r = vpsraid512(a, IMM8); transmute(r) } @@ -18361,7 +18265,7 @@ pub unsafe fn _mm512_mask_srai_epi32( ) -> __m512i { static_assert_imm8u!(IMM8); let a = a.as_i32x16(); - let r = vpsraid(a, IMM8); + let r = vpsraid512(a, IMM8); transmute(simd_select_bitmask(k, r, src.as_i32x16())) } @@ -18375,7 +18279,7 @@ pub unsafe fn _mm512_mask_srai_epi32( pub unsafe fn _mm512_maskz_srai_epi32(k: __mmask16, a: __m512i) -> __m512i { static_assert_imm8u!(IMM8); let a = a.as_i32x16(); - let r = vpsraid(a, IMM8); + let r = vpsraid512(a, IMM8); let zero = _mm512_setzero_si512().as_i32x16(); transmute(simd_select_bitmask(k, r, zero)) } @@ -18385,16 +18289,16 @@ pub unsafe fn _mm512_maskz_srai_epi32(k: __mmask16, a: __m512i) /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_srai_epi32&expand=5431) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpsrad, imm8 = 1))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm256_mask_srai_epi32(src: __m256i, k: __mmask8, a: __m256i, imm8: u32) -> __m256i { - macro_rules! 
call { - ($imm8:expr) => { - _mm256_srai_epi32::<$imm8>(a) - }; - } - let shf = constify_imm8_sae!(imm8, call); - transmute(simd_select_bitmask(k, shf.as_i32x8(), src.as_i32x8())) +#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm256_mask_srai_epi32( + src: __m256i, + k: __mmask8, + a: __m256i, +) -> __m256i { + let imm8 = IMM8 as i32; + let r = psraid256(a.as_i32x8(), imm8); + transmute(simd_select_bitmask(k, r, src.as_i32x8())) } /// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -18402,17 +18306,13 @@ pub unsafe fn _mm256_mask_srai_epi32(src: __m256i, k: __mmask8, a: __m256i, imm8 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_srai_epi32&expand=5432) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpsrad, imm8 = 1))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm256_maskz_srai_epi32(k: __mmask8, a: __m256i, imm8: u32) -> __m256i { - macro_rules! call { - ($imm8:expr) => { - _mm256_srai_epi32::<$imm8>(a) - }; - } - let shf = constify_imm8_sae!(imm8, call); +#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm256_maskz_srai_epi32(k: __mmask8, a: __m256i) -> __m256i { + let imm8 = IMM8 as i32; + let r = psraid256(a.as_i32x8(), imm8); let zero = _mm256_setzero_si256().as_i32x8(); - transmute(simd_select_bitmask(k, shf.as_i32x8(), zero)) + transmute(simd_select_bitmask(k, r, zero)) } /// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -18420,16 +18320,16 @@ pub unsafe fn _mm256_maskz_srai_epi32(k: __mmask8, a: __m256i, imm8: u32) -> __m /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_srai_epi32&expand=5428) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpsrad, imm8 = 1))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_mask_srai_epi32(src: __m128i, k: __mmask8, a: __m128i, imm8: u32) -> __m128i { - macro_rules! call { - ($imm8:expr) => { - _mm_srai_epi32::<$imm8>(a) - }; - } - let shf = constify_imm8_sae!(imm8, call); - transmute(simd_select_bitmask(k, shf.as_i32x4(), src.as_i32x4())) +#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_mask_srai_epi32( + src: __m128i, + k: __mmask8, + a: __m128i, +) -> __m128i { + let imm8 = IMM8 as i32; + let r = psraid128(a.as_i32x4(), imm8); + transmute(simd_select_bitmask(k, r, src.as_i32x4())) } /// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -18437,17 +18337,13 @@ pub unsafe fn _mm_mask_srai_epi32(src: __m128i, k: __mmask8, a: __m128i, imm8: u /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_srai_epi32&expand=5429) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpsrad, imm8 = 1))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_maskz_srai_epi32(k: __mmask8, a: __m128i, imm8: u32) -> __m128i { - macro_rules! 
call { - ($imm8:expr) => { - _mm_srai_epi32::<$imm8>(a) - }; - } - let shf = constify_imm8_sae!(imm8, call); +#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_maskz_srai_epi32(k: __mmask8, a: __m128i) -> __m128i { + let imm8 = IMM8 as i32; + let r = psraid128(a.as_i32x4(), imm8); let zero = _mm_setzero_si128().as_i32x4(); - transmute(simd_select_bitmask(k, shf.as_i32x4(), zero)) + transmute(simd_select_bitmask(k, r, zero)) } /// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst. @@ -37615,7 +37511,11 @@ extern "C" { fn vpsraq128(a: i64x2, count: i64x2) -> i64x2; #[link_name = "llvm.x86.avx512.psrai.d.512"] - fn vpsraid(a: i32x16, imm8: u32) -> i32x16; + fn vpsraid512(a: i32x16, imm8: u32) -> i32x16; + #[link_name = "llvm.x86.avx2.psrai.d"] + fn psraid256(a: i32x8, imm8: i32) -> i32x8; + #[link_name = "llvm.x86.sse2.psrai.d"] + fn psraid128(a: i32x4, imm8: i32) -> i32x4; #[link_name = "llvm.x86.avx512.psrai.q.512"] fn vpsraiq(a: i64x8, imm8: u32) -> i64x8; @@ -45648,7 +45548,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_rol_epi32() { let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); - let r = _mm512_rol_epi32(a, 1); + let r = _mm512_rol_epi32::<1>(a); let e = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); assert_eq_m512i(r, e); } @@ -45656,9 +45556,9 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_rol_epi32() { let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); - let r = _mm512_mask_rol_epi32(a, 0, a, 1); + let r = _mm512_mask_rol_epi32::<1>(a, 0, a); assert_eq_m512i(r, a); - let r = _mm512_mask_rol_epi32(a, 0b11111111_11111111, a, 1); + let r = _mm512_mask_rol_epi32::<1>(a, 0b11111111_11111111, a); let e = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); assert_eq_m512i(r, e); } @@ -45666,9 +45566,9 @@ 
mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_rol_epi32() { let a = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 << 31); - let r = _mm512_maskz_rol_epi32(0, a, 1); + let r = _mm512_maskz_rol_epi32::<1>(0, a); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_rol_epi32(0b00000000_11111111, a, 1); + let r = _mm512_maskz_rol_epi32::<1>(0b00000000_11111111, a); let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 1 << 0); assert_eq_m512i(r, e); } @@ -45676,7 +45576,7 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_rol_epi32() { let a = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); - let r = _mm256_rol_epi32(a, 1); + let r = _mm256_rol_epi32::<1>(a); let e = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); assert_eq_m256i(r, e); } @@ -45684,9 +45584,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_mask_rol_epi32() { let a = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); - let r = _mm256_mask_rol_epi32(a, 0, a, 1); + let r = _mm256_mask_rol_epi32::<1>(a, 0, a); assert_eq_m256i(r, a); - let r = _mm256_mask_rol_epi32(a, 0b11111111, a, 1); + let r = _mm256_mask_rol_epi32::<1>(a, 0b11111111, a); let e = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); assert_eq_m256i(r, e); } @@ -45694,9 +45594,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_maskz_rol_epi32() { let a = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); - let r = _mm256_maskz_rol_epi32(0, a, 1); + let r = _mm256_maskz_rol_epi32::<1>(0, a); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_rol_epi32(0b11111111, a, 1); + let r = _mm256_maskz_rol_epi32::<1>(0b11111111, a); let e = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); assert_eq_m256i(r, e); } @@ -45704,7 +45604,7 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm_rol_epi32() { let a = _mm_set_epi32(1 << 31, 1, 1, 1); - let r = _mm_rol_epi32(a, 1); 
+ let r = _mm_rol_epi32::<1>(a); let e = _mm_set_epi32(1 << 0, 2, 2, 2); assert_eq_m128i(r, e); } @@ -45712,9 +45612,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm_mask_rol_epi32() { let a = _mm_set_epi32(1 << 31, 1, 1, 1); - let r = _mm_mask_rol_epi32(a, 0, a, 1); + let r = _mm_mask_rol_epi32::<1>(a, 0, a); assert_eq_m128i(r, a); - let r = _mm_mask_rol_epi32(a, 0b00001111, a, 1); + let r = _mm_mask_rol_epi32::<1>(a, 0b00001111, a); let e = _mm_set_epi32(1 << 0, 2, 2, 2); assert_eq_m128i(r, e); } @@ -45722,9 +45622,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm_maskz_rol_epi32() { let a = _mm_set_epi32(1 << 31, 1, 1, 1); - let r = _mm_maskz_rol_epi32(0, a, 1); + let r = _mm_maskz_rol_epi32::<1>(0, a); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_rol_epi32(0b00001111, a, 1); + let r = _mm_maskz_rol_epi32::<1>(0b00001111, a); let e = _mm_set_epi32(1 << 0, 2, 2, 2); assert_eq_m128i(r, e); } @@ -45732,7 +45632,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_ror_epi32() { let a = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); - let r = _mm512_ror_epi32(a, 1); + let r = _mm512_ror_epi32::<1>(a); let e = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); assert_eq_m512i(r, e); } @@ -45740,9 +45640,9 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_ror_epi32() { let a = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); - let r = _mm512_mask_ror_epi32(a, 0, a, 1); + let r = _mm512_mask_ror_epi32::<1>(a, 0, a); assert_eq_m512i(r, a); - let r = _mm512_mask_ror_epi32(a, 0b11111111_11111111, a, 1); + let r = _mm512_mask_ror_epi32::<1>(a, 0b11111111_11111111, a); let e = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); assert_eq_m512i(r, e); } @@ -45750,9 +45650,9 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_ror_epi32() { let a = 
_mm512_set_epi32(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1 << 0); - let r = _mm512_maskz_ror_epi32(0, a, 1); + let r = _mm512_maskz_ror_epi32::<1>(0, a); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_ror_epi32(0b00000000_11111111, a, 1); + let r = _mm512_maskz_ror_epi32::<1>(0b00000000_11111111, a); let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 << 31); assert_eq_m512i(r, e); } @@ -45760,7 +45660,7 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_ror_epi32() { let a = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); - let r = _mm256_ror_epi32(a, 1); + let r = _mm256_ror_epi32::<1>(a); let e = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); assert_eq_m256i(r, e); } @@ -45768,9 +45668,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_mask_ror_epi32() { let a = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); - let r = _mm256_mask_ror_epi32(a, 0, a, 1); + let r = _mm256_mask_ror_epi32::<1>(a, 0, a); assert_eq_m256i(r, a); - let r = _mm256_mask_ror_epi32(a, 0b11111111, a, 1); + let r = _mm256_mask_ror_epi32::<1>(a, 0b11111111, a); let e = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); assert_eq_m256i(r, e); } @@ -45778,9 +45678,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_maskz_ror_epi32() { let a = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); - let r = _mm256_maskz_ror_epi32(0, a, 1); + let r = _mm256_maskz_ror_epi32::<1>(0, a); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_ror_epi32(0b11111111, a, 1); + let r = _mm256_maskz_ror_epi32::<1>(0b11111111, a); let e = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); assert_eq_m256i(r, e); } @@ -45788,7 +45688,7 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm_ror_epi32() { let a = _mm_set_epi32(1 << 0, 2, 2, 2); - let r = _mm_ror_epi32(a, 1); + let r = _mm_ror_epi32::<1>(a); let e = _mm_set_epi32(1 << 31, 1, 1, 1); assert_eq_m128i(r, 
e); } @@ -45796,9 +45696,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm_mask_ror_epi32() { let a = _mm_set_epi32(1 << 0, 2, 2, 2); - let r = _mm_mask_ror_epi32(a, 0, a, 1); + let r = _mm_mask_ror_epi32::<1>(a, 0, a); assert_eq_m128i(r, a); - let r = _mm_mask_ror_epi32(a, 0b00001111, a, 1); + let r = _mm_mask_ror_epi32::<1>(a, 0b00001111, a); let e = _mm_set_epi32(1 << 31, 1, 1, 1); assert_eq_m128i(r, e); } @@ -45806,9 +45706,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm_maskz_ror_epi32() { let a = _mm_set_epi32(1 << 0, 2, 2, 2); - let r = _mm_maskz_ror_epi32(0, a, 1); + let r = _mm_maskz_ror_epi32::<1>(0, a); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_ror_epi32(0b00001111, a, 1); + let r = _mm_maskz_ror_epi32::<1>(0b00001111, a); let e = _mm_set_epi32(1 << 31, 1, 1, 1); assert_eq_m128i(r, e); } @@ -46664,9 +46564,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_mask_srai_epi32() { let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0); - let r = _mm256_mask_srai_epi32(a, 0, a, 1); + let r = _mm256_mask_srai_epi32::<1>(a, 0, a); assert_eq_m256i(r, a); - let r = _mm256_mask_srai_epi32(a, 0b11111111, a, 1); + let r = _mm256_mask_srai_epi32::<1>(a, 0b11111111, a); let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0); assert_eq_m256i(r, e); } @@ -46674,9 +46574,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_maskz_srai_epi32() { let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0); - let r = _mm256_maskz_srai_epi32(0, a, 1); + let r = _mm256_maskz_srai_epi32::<1>(0, a); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_srai_epi32(0b11111111, a, 1); + let r = _mm256_maskz_srai_epi32::<1>(0b11111111, a); let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0); assert_eq_m256i(r, e); } @@ -46684,9 +46584,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm_mask_srai_epi32() { let a = 
_mm_set_epi32(1 << 5, 0, 0, 0); - let r = _mm_mask_srai_epi32(a, 0, a, 1); + let r = _mm_mask_srai_epi32::<1>(a, 0, a); assert_eq_m128i(r, a); - let r = _mm_mask_srai_epi32(a, 0b00001111, a, 1); + let r = _mm_mask_srai_epi32::<1>(a, 0b00001111, a); let e = _mm_set_epi32(1 << 4, 0, 0, 0); assert_eq_m128i(r, e); } @@ -46694,9 +46594,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm_maskz_srai_epi32() { let a = _mm_set_epi32(1 << 5, 0, 0, 0); - let r = _mm_maskz_srai_epi32(0, a, 1); + let r = _mm_maskz_srai_epi32::<1>(0, a); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_srai_epi32(0b00001111, a, 1); + let r = _mm_maskz_srai_epi32::<1>(0b00001111, a); let e = _mm_set_epi32(1 << 4, 0, 0, 0); assert_eq_m128i(r, e); } diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index af62b2112c..84eab28e34 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -7627,7 +7627,7 @@ mod tests { 1 << 63, 1 << 32, 1 << 32, 1 << 32, 1 << 32, 1 << 32, 1 << 32, 1 << 32, ); - let r = _mm512_rol_epi64(a, 1); + let r = _mm512_rol_epi64::<1>(a); #[rustfmt::skip] let e = _mm512_set_epi64( 1 << 0, 1 << 33, 1 << 33, 1 << 33, @@ -7643,9 +7643,9 @@ mod tests { 1 << 63, 1 << 32, 1 << 32, 1 << 32, 1 << 32, 1 << 32, 1 << 32, 1 << 32, ); - let r = _mm512_mask_rol_epi64(a, 0, a, 1); + let r = _mm512_mask_rol_epi64::<1>(a, 0, a); assert_eq_m512i(r, a); - let r = _mm512_mask_rol_epi64(a, 0b11111111, a, 1); + let r = _mm512_mask_rol_epi64::<1>(a, 0b11111111, a); #[rustfmt::skip] let e = _mm512_set_epi64( 1 << 0, 1 << 33, 1 << 33, 1 << 33, @@ -7661,9 +7661,9 @@ mod tests { 1 << 32, 1 << 32, 1 << 32, 1 << 32, 1 << 32, 1 << 32, 1 << 32, 1 << 63, ); - let r = _mm512_maskz_rol_epi64(0, a, 1); + let r = _mm512_maskz_rol_epi64::<1>(0, a); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_rol_epi64(0b00001111, a, 1); + let r = _mm512_maskz_rol_epi64::<1>(0b00001111, a); 
let e = _mm512_set_epi64(0, 0, 0, 0, 1 << 33, 1 << 33, 1 << 33, 1 << 0); assert_eq_m512i(r, e); } @@ -7671,7 +7671,7 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_rol_epi64() { let a = _mm256_set_epi64x(1 << 63, 1 << 32, 1 << 32, 1 << 32); - let r = _mm256_rol_epi64(a, 1); + let r = _mm256_rol_epi64::<1>(a); let e = _mm256_set_epi64x(1 << 0, 1 << 33, 1 << 33, 1 << 33); assert_eq_m256i(r, e); } @@ -7679,9 +7679,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_mask_rol_epi64() { let a = _mm256_set_epi64x(1 << 63, 1 << 32, 1 << 32, 1 << 32); - let r = _mm256_mask_rol_epi64(a, 0, a, 1); + let r = _mm256_mask_rol_epi64::<1>(a, 0, a); assert_eq_m256i(r, a); - let r = _mm256_mask_rol_epi64(a, 0b00001111, a, 1); + let r = _mm256_mask_rol_epi64::<1>(a, 0b00001111, a); let e = _mm256_set_epi64x(1 << 0, 1 << 33, 1 << 33, 1 << 33); assert_eq_m256i(r, e); } @@ -7689,9 +7689,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_maskz_rol_epi64() { let a = _mm256_set_epi64x(1 << 63, 1 << 32, 1 << 32, 1 << 32); - let r = _mm256_maskz_rol_epi64(0, a, 1); + let r = _mm256_maskz_rol_epi64::<1>(0, a); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_rol_epi64(0b00001111, a, 1); + let r = _mm256_maskz_rol_epi64::<1>(0b00001111, a); let e = _mm256_set_epi64x(1 << 0, 1 << 33, 1 << 33, 1 << 33); assert_eq_m256i(r, e); } @@ -7699,7 +7699,7 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm_rol_epi64() { let a = _mm_set_epi64x(1 << 63, 1 << 32); - let r = _mm_rol_epi64(a, 1); + let r = _mm_rol_epi64::<1>(a); let e = _mm_set_epi64x(1 << 0, 1 << 33); assert_eq_m128i(r, e); } @@ -7707,9 +7707,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm_mask_rol_epi64() { let a = _mm_set_epi64x(1 << 63, 1 << 32); - let r = _mm_mask_rol_epi64(a, 0, a, 1); + let r = _mm_mask_rol_epi64::<1>(a, 0, a); assert_eq_m128i(r, a); - let r = 
_mm_mask_rol_epi64(a, 0b00000011, a, 1); + let r = _mm_mask_rol_epi64::<1>(a, 0b00000011, a); let e = _mm_set_epi64x(1 << 0, 1 << 33); assert_eq_m128i(r, e); } @@ -7717,9 +7717,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm_maskz_rol_epi64() { let a = _mm_set_epi64x(1 << 63, 1 << 32); - let r = _mm_maskz_rol_epi64(0, a, 1); + let r = _mm_maskz_rol_epi64::<1>(0, a); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_rol_epi64(0b00000011, a, 1); + let r = _mm_maskz_rol_epi64::<1>(0b00000011, a); let e = _mm_set_epi64x(1 << 0, 1 << 33); assert_eq_m128i(r, e); } @@ -7731,7 +7731,7 @@ mod tests { 1 << 0, 1 << 32, 1 << 32, 1 << 32, 1 << 32, 1 << 32, 1 << 32, 1 << 32, ); - let r = _mm512_ror_epi64(a, 1); + let r = _mm512_ror_epi64::<1>(a); #[rustfmt::skip] let e = _mm512_set_epi64( 1 << 63, 1 << 31, 1 << 31, 1 << 31, @@ -7747,9 +7747,9 @@ mod tests { 1 << 0, 1 << 32, 1 << 32, 1 << 32, 1 << 32, 1 << 32, 1 << 32, 1 << 32, ); - let r = _mm512_mask_ror_epi64(a, 0, a, 1); + let r = _mm512_mask_ror_epi64::<1>(a, 0, a); assert_eq_m512i(r, a); - let r = _mm512_mask_ror_epi64(a, 0b11111111, a, 1); + let r = _mm512_mask_ror_epi64::<1>(a, 0b11111111, a); #[rustfmt::skip] let e = _mm512_set_epi64( 1 << 63, 1 << 31, 1 << 31, 1 << 31, @@ -7765,9 +7765,9 @@ mod tests { 1 << 32, 1 << 32, 1 << 32, 1 << 32, 1 << 32, 1 << 32, 1 << 32, 1 << 0, ); - let r = _mm512_maskz_ror_epi64(0, a, 1); + let r = _mm512_maskz_ror_epi64::<1>(0, a); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_ror_epi64(0b00001111, a, 1); + let r = _mm512_maskz_ror_epi64::<1>(0b00001111, a); let e = _mm512_set_epi64(0, 0, 0, 0, 1 << 31, 1 << 31, 1 << 31, 1 << 63); assert_eq_m512i(r, e); } @@ -7775,7 +7775,7 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_ror_epi64() { let a = _mm256_set_epi64x(1 << 0, 1 << 32, 1 << 32, 1 << 32); - let r = _mm256_ror_epi64(a, 1); + let r = _mm256_ror_epi64::<1>(a); let e = _mm256_set_epi64x(1 << 63, 1 
<< 31, 1 << 31, 1 << 31); assert_eq_m256i(r, e); } @@ -7783,9 +7783,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_mask_ror_epi64() { let a = _mm256_set_epi64x(1 << 0, 1 << 32, 1 << 32, 1 << 32); - let r = _mm256_mask_ror_epi64(a, 0, a, 1); + let r = _mm256_mask_ror_epi64::<1>(a, 0, a); assert_eq_m256i(r, a); - let r = _mm256_mask_ror_epi64(a, 0b00001111, a, 1); + let r = _mm256_mask_ror_epi64::<1>(a, 0b00001111, a); let e = _mm256_set_epi64x(1 << 63, 1 << 31, 1 << 31, 1 << 31); assert_eq_m256i(r, e); } @@ -7793,9 +7793,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_maskz_ror_epi64() { let a = _mm256_set_epi64x(1 << 0, 1 << 32, 1 << 32, 1 << 32); - let r = _mm256_maskz_ror_epi64(0, a, 1); + let r = _mm256_maskz_ror_epi64::<1>(0, a); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_ror_epi64(0b00001111, a, 1); + let r = _mm256_maskz_ror_epi64::<1>(0b00001111, a); let e = _mm256_set_epi64x(1 << 63, 1 << 31, 1 << 31, 1 << 31); assert_eq_m256i(r, e); } @@ -7803,7 +7803,7 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm_ror_epi64() { let a = _mm_set_epi64x(1 << 0, 1 << 32); - let r = _mm_ror_epi64(a, 1); + let r = _mm_ror_epi64::<1>(a); let e = _mm_set_epi64x(1 << 63, 1 << 31); assert_eq_m128i(r, e); } @@ -7811,9 +7811,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm_mask_ror_epi64() { let a = _mm_set_epi64x(1 << 0, 1 << 32); - let r = _mm_mask_ror_epi64(a, 0, a, 1); + let r = _mm_mask_ror_epi64::<1>(a, 0, a); assert_eq_m128i(r, a); - let r = _mm_mask_ror_epi64(a, 0b00000011, a, 1); + let r = _mm_mask_ror_epi64::<1>(a, 0b00000011, a); let e = _mm_set_epi64x(1 << 63, 1 << 31); assert_eq_m128i(r, e); } @@ -7821,9 +7821,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm_maskz_ror_epi64() { let a = _mm_set_epi64x(1 << 0, 1 << 32); - let r = _mm_maskz_ror_epi64(0, a, 1); + let r = _mm_maskz_ror_epi64::<1>(0, a); 
assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_ror_epi64(0b00000011, a, 1); + let r = _mm_maskz_ror_epi64::<1>(0b00000011, a); let e = _mm_set_epi64x(1 << 63, 1 << 31); assert_eq_m128i(r, e); } From f4b85d7bd66f90a596539502814bd72c54479ff0 Mon Sep 17 00:00:00 2001 From: jironglin Date: Wed, 3 Mar 2021 01:28:36 +0000 Subject: [PATCH 018/123] cvt_roundps,pd_epi32,epu32; cvt_roundepi32,epu32_ps; cvt_roundpd_ps; mm_add,sub,mul,div_round_ss,sd; mm_sqrt_round_ss,sd; mm_scalf_round_ss,sd; mm_fmadd,fmsub,fnmadd,fnmsub_round_ss,sd; mm_cvt_roundss_i32,u32; mm_cvt_roundsd_i32,u32; mm_cvt_roundi32,u32_ss; mm_cvt_roundsd_ss --- crates/core_arch/src/x86/avx512f.rs | 2186 ++++++++++-------------- crates/core_arch/src/x86_64/avx512f.rs | 30 +- 2 files changed, 878 insertions(+), 1338 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 7bf8bdeae9..7911157eb2 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -13393,17 +13393,13 @@ pub unsafe fn _mm_maskz_cvtusepi64_epi8(k: __mmask8, a: __m128i) -> __m128i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundps_epi32&expand=1335) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtps2dq, rounding = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm512_cvt_roundps_epi32(a: __m512, rounding: i32) -> __m512i { +#[cfg_attr(test, assert_instr(vcvtps2dq, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_cvt_roundps_epi32(a: __m512) -> __m512i { + static_assert_rounding!(ROUNDING); let a = a.as_f32x16(); let zero = _mm512_setzero_si512().as_i32x16(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtps2dq(a, zero, 0b11111111_11111111, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtps2dq(a, zero, 0b11111111_11111111, ROUNDING); transmute(r) } @@ -13419,22 +13415,17 @@ pub unsafe fn _mm512_cvt_roundps_epi32(a: __m512, rounding: i32) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundps_epi32&expand=1336) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtps2dq, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_cvt_roundps_epi32( +#[cfg_attr(test, assert_instr(vcvtps2dq, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_cvt_roundps_epi32( src: __m512i, k: __mmask16, a: __m512, - rounding: i32, ) -> __m512i { + static_assert_rounding!(ROUNDING); let a = a.as_f32x16(); let src = src.as_i32x16(); - macro_rules! call { - ($imm4:expr) => { - vcvtps2dq(a, src, k, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtps2dq(a, src, k, ROUNDING); transmute(r) } @@ -13450,17 +13441,16 @@ pub unsafe fn _mm512_mask_cvt_roundps_epi32( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundps_epi32&expand=1337) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtps2dq, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_cvt_roundps_epi32(k: __mmask16, a: __m512, rounding: i32) -> __m512i { +#[cfg_attr(test, assert_instr(vcvtps2dq, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_cvt_roundps_epi32( + k: __mmask16, + a: __m512, +) -> __m512i { + static_assert_rounding!(ROUNDING); let a = a.as_f32x16(); let zero = _mm512_setzero_si512().as_i32x16(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtps2dq(a, zero, k, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtps2dq(a, zero, k, ROUNDING); transmute(r) } @@ -13476,17 +13466,13 @@ pub unsafe fn _mm512_maskz_cvt_roundps_epi32(k: __mmask16, a: __m512, rounding: /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundps_epu32&expand=1341) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtps2udq, rounding = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm512_cvt_roundps_epu32(a: __m512, rounding: i32) -> __m512i { +#[cfg_attr(test, assert_instr(vcvtps2udq, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_cvt_roundps_epu32(a: __m512) -> __m512i { + static_assert_rounding!(ROUNDING); let a = a.as_f32x16(); let zero = _mm512_setzero_si512().as_u32x16(); - macro_rules! call { - ($imm4:expr) => { - vcvtps2udq(a, zero, 0b11111111_11111111, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtps2udq(a, zero, 0b11111111_11111111, ROUNDING); transmute(r) } @@ -13502,22 +13488,17 @@ pub unsafe fn _mm512_cvt_roundps_epu32(a: __m512, rounding: i32) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundps_epu32&expand=1342) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtps2udq, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_cvt_roundps_epu32( +#[cfg_attr(test, assert_instr(vcvtps2udq, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_cvt_roundps_epu32( src: __m512i, k: __mmask16, a: __m512, - rounding: i32, ) -> __m512i { + static_assert_rounding!(ROUNDING); let a = a.as_f32x16(); let src = src.as_u32x16(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtps2udq(a, src, k, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtps2udq(a, src, k, ROUNDING); transmute(r) } @@ -13533,17 +13514,16 @@ pub unsafe fn _mm512_mask_cvt_roundps_epu32( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundps_epu32&expand=1343) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtps2udq, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_cvt_roundps_epu32(k: __mmask16, a: __m512, rounding: i32) -> __m512i { +#[cfg_attr(test, assert_instr(vcvtps2udq, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_cvt_roundps_epu32( + k: __mmask16, + a: __m512, +) -> __m512i { + static_assert_rounding!(ROUNDING); let a = a.as_f32x16(); let zero = _mm512_setzero_si512().as_u32x16(); - macro_rules! call { - ($imm4:expr) => { - vcvtps2udq(a, zero, k, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtps2udq(a, zero, k, ROUNDING); transmute(r) } @@ -13624,17 +13604,13 @@ pub unsafe fn _mm512_maskz_cvt_roundps_pd(k: __mmask8, a: __m256, sae: i32) -> _ /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundpd_epi32&expand=1315) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtpd2dq, rounding = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm512_cvt_roundpd_epi32(a: __m512d, rounding: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vcvtpd2dq, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_cvt_roundpd_epi32(a: __m512d) -> __m256i { + static_assert_rounding!(ROUNDING); let a = a.as_f64x8(); let zero = _mm256_setzero_si256().as_i32x8(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtpd2dq(a, zero, 0b11111111, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtpd2dq(a, zero, 0b11111111, ROUNDING); transmute(r) } @@ -13650,22 +13626,17 @@ pub unsafe fn _mm512_cvt_roundpd_epi32(a: __m512d, rounding: i32) -> __m256i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundpd_epi32&expand=1316) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtpd2dq, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_cvt_roundpd_epi32( +#[cfg_attr(test, assert_instr(vcvtpd2dq, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_cvt_roundpd_epi32( src: __m256i, k: __mmask8, a: __m512d, - rounding: i32, ) -> __m256i { + static_assert_rounding!(ROUNDING); let a = a.as_f64x8(); let src = src.as_i32x8(); - macro_rules! call { - ($imm4:expr) => { - vcvtpd2dq(a, src, k, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtpd2dq(a, src, k, ROUNDING); transmute(r) } @@ -13681,17 +13652,16 @@ pub unsafe fn _mm512_mask_cvt_roundpd_epi32( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundpd_epi32&expand=1317) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtpd2dq, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_cvt_roundpd_epi32(k: __mmask8, a: __m512d, rounding: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vcvtpd2dq, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_cvt_roundpd_epi32( + k: __mmask8, + a: __m512d, +) -> __m256i { + static_assert_rounding!(ROUNDING); let a = a.as_f64x8(); let zero = _mm256_setzero_si256().as_i32x8(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtpd2dq(a, zero, k, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtpd2dq(a, zero, k, ROUNDING); transmute(r) } @@ -13707,17 +13677,13 @@ pub unsafe fn _mm512_maskz_cvt_roundpd_epi32(k: __mmask8, a: __m512d, rounding: /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundpd_epu32&expand=1321) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtpd2udq, rounding = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm512_cvt_roundpd_epu32(a: __m512d, rounding: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vcvtpd2udq, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_cvt_roundpd_epu32(a: __m512d) -> __m256i { + static_assert_rounding!(ROUNDING); let a = a.as_f64x8(); let zero = _mm256_setzero_si256().as_u32x8(); - macro_rules! call { - ($imm4:expr) => { - vcvtpd2udq(a, zero, 0b11111111, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtpd2udq(a, zero, 0b11111111, ROUNDING); transmute(r) } @@ -13733,22 +13699,17 @@ pub unsafe fn _mm512_cvt_roundpd_epu32(a: __m512d, rounding: i32) -> __m256i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundpd_epu32&expand=1322) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtpd2udq, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_cvt_roundpd_epu32( +#[cfg_attr(test, assert_instr(vcvtpd2udq, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_cvt_roundpd_epu32( src: __m256i, k: __mmask8, a: __m512d, - rounding: i32, ) -> __m256i { + static_assert_rounding!(ROUNDING); let a = a.as_f64x8(); let src = src.as_u32x8(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtpd2udq(a, src, k, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtpd2udq(a, src, k, ROUNDING); transmute(r) } @@ -13764,17 +13725,16 @@ pub unsafe fn _mm512_mask_cvt_roundpd_epu32( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundpd_epu32&expand=1323) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtpd2udq, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_cvt_roundpd_epu32(k: __mmask8, a: __m512d, rounding: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vcvtpd2udq, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_cvt_roundpd_epu32( + k: __mmask8, + a: __m512d, +) -> __m256i { + static_assert_rounding!(ROUNDING); let a = a.as_f64x8(); let zero = _mm256_setzero_si256().as_u32x8(); - macro_rules! call { - ($imm4:expr) => { - vcvtpd2udq(a, zero, k, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtpd2udq(a, zero, k, ROUNDING); transmute(r) } @@ -13790,17 +13750,13 @@ pub unsafe fn _mm512_maskz_cvt_roundpd_epu32(k: __mmask8, a: __m512d, rounding: /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundpd_ps&expand=1327) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtpd2ps, rounding = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm512_cvt_roundpd_ps(a: __m512d, rounding: i32) -> __m256 { +#[cfg_attr(test, assert_instr(vcvtpd2ps, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_cvt_roundpd_ps(a: __m512d) -> __m256 { + static_assert_rounding!(ROUNDING); let a = a.as_f64x8(); let zero = _mm256_setzero_ps().as_f32x8(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtpd2ps(a, zero, 0b11111111, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtpd2ps(a, zero, 0b11111111, ROUNDING); transmute(r) } @@ -13816,22 +13772,17 @@ pub unsafe fn _mm512_cvt_roundpd_ps(a: __m512d, rounding: i32) -> __m256 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundpd_ps&expand=1328) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtpd2ps, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_cvt_roundpd_ps( +#[cfg_attr(test, assert_instr(vcvtpd2ps, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_cvt_roundpd_ps( src: __m256, k: __mmask8, a: __m512d, - rounding: i32, ) -> __m256 { + static_assert_rounding!(ROUNDING); let a = a.as_f64x8(); let src = src.as_f32x8(); - macro_rules! call { - ($imm4:expr) => { - vcvtpd2ps(a, src, k, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtpd2ps(a, src, k, ROUNDING); transmute(r) } @@ -13847,17 +13798,13 @@ pub unsafe fn _mm512_mask_cvt_roundpd_ps( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundpd_ps&expand=1329) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtpd2ps, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_cvt_roundpd_ps(k: __mmask8, a: __m512d, rounding: i32) -> __m256 { +#[cfg_attr(test, assert_instr(vcvtpd2ps, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_cvt_roundpd_ps(k: __mmask8, a: __m512d) -> __m256 { + static_assert_rounding!(ROUNDING); let a = a.as_f64x8(); let zero = _mm256_setzero_ps().as_f32x8(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtpd2ps(a, zero, k, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtpd2ps(a, zero, k, ROUNDING); transmute(r) } @@ -13873,16 +13820,12 @@ pub unsafe fn _mm512_maskz_cvt_roundpd_ps(k: __mmask8, a: __m512d, rounding: i32 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepi32_ps&expand=1294) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtdq2ps, rounding = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm512_cvt_roundepi32_ps(a: __m512i, rounding: i32) -> __m512 { +#[cfg_attr(test, assert_instr(vcvtdq2ps, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_cvt_roundepi32_ps(a: __m512i) -> __m512 { + static_assert_rounding!(ROUNDING); let a = a.as_i32x16(); - macro_rules! call { - ($imm4:expr) => { - vcvtdq2ps(a, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtdq2ps(a, ROUNDING); transmute(r) } @@ -13898,21 +13841,16 @@ pub unsafe fn _mm512_cvt_roundepi32_ps(a: __m512i, rounding: i32) -> __m512 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepi32_ps&expand=1295) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtdq2ps, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_cvt_roundepi32_ps( +#[cfg_attr(test, assert_instr(vcvtdq2ps, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_cvt_roundepi32_ps( src: __m512, k: __mmask16, a: __m512i, - rounding: i32, ) -> __m512 { + static_assert_rounding!(ROUNDING); let a = a.as_i32x16(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtdq2ps(a, $imm4) - }; - } - let r: f32x16 = constify_imm4_round!(rounding, call); + let r = vcvtdq2ps(a, ROUNDING); transmute(simd_select_bitmask(k, r, src.as_f32x16())) } @@ -13928,16 +13866,15 @@ pub unsafe fn _mm512_mask_cvt_roundepi32_ps( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepi32_ps&expand=1296) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtdq2ps, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_cvt_roundepi32_ps(k: __mmask16, a: __m512i, rounding: i32) -> __m512 { +#[cfg_attr(test, assert_instr(vcvtdq2ps, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_cvt_roundepi32_ps( + k: __mmask16, + a: __m512i, +) -> __m512 { + static_assert_rounding!(ROUNDING); let a = a.as_i32x16(); - macro_rules! call { - ($imm4:expr) => { - vcvtdq2ps(a, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtdq2ps(a, ROUNDING); let zero = _mm512_setzero_ps().as_f32x16(); transmute(simd_select_bitmask(k, r, zero)) } @@ -13954,16 +13891,12 @@ pub unsafe fn _mm512_maskz_cvt_roundepi32_ps(k: __mmask16, a: __m512i, rounding: /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepu32_ps&expand=1303) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtudq2ps, rounding = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm512_cvt_roundepu32_ps(a: __m512i, rounding: i32) -> __m512 { +#[cfg_attr(test, assert_instr(vcvtudq2ps, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_cvt_roundepu32_ps(a: __m512i) -> __m512 { + static_assert_rounding!(ROUNDING); let a = a.as_u32x16(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtudq2ps(a, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtudq2ps(a, ROUNDING); transmute(r) } @@ -13979,21 +13912,16 @@ pub unsafe fn _mm512_cvt_roundepu32_ps(a: __m512i, rounding: i32) -> __m512 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepu32_ps&expand=1304) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtudq2ps, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_cvt_roundepu32_ps( +#[cfg_attr(test, assert_instr(vcvtudq2ps, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_cvt_roundepu32_ps( src: __m512, k: __mmask16, a: __m512i, - rounding: i32, ) -> __m512 { + static_assert_rounding!(ROUNDING); let a = a.as_u32x16(); - macro_rules! call { - ($imm4:expr) => { - vcvtudq2ps(a, $imm4) - }; - } - let r: f32x16 = constify_imm4_round!(rounding, call); + let r = vcvtudq2ps(a, ROUNDING); transmute(simd_select_bitmask(k, r, src.as_f32x16())) } @@ -14009,16 +13937,15 @@ pub unsafe fn _mm512_mask_cvt_roundepu32_ps( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepu32_ps&expand=1305) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtudq2ps, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_cvt_roundepu32_ps(k: __mmask16, a: __m512i, rounding: i32) -> __m512 { +#[cfg_attr(test, assert_instr(vcvtudq2ps, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_cvt_roundepu32_ps( + k: __mmask16, + a: __m512i, +) -> __m512 { + static_assert_rounding!(ROUNDING); let a = a.as_u32x16(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtudq2ps(a, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtudq2ps(a, ROUNDING); let zero = _mm512_setzero_ps().as_f32x16(); transmute(simd_select_bitmask(k, r, zero)) } @@ -33519,18 +33446,15 @@ pub unsafe fn _mm_mask3_fnmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_ss&expand=151) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vaddss, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_add_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vaddss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_add_round_ss(a: __m128, b: __m128) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let b = b.as_f32x4(); let zero = _mm_setzero_ps().as_f32x4(); - macro_rules! call { - ($imm4:expr) => { - vaddss(a, b, zero, 0b1, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vaddss(a, b, zero, 0b1, ROUNDING); + transmute(r) } /// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -33545,24 +33469,20 @@ pub unsafe fn _mm_add_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_round_ss&expand=152) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vaddss, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_add_round_ss( +#[cfg_attr(test, assert_instr(vaddss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_add_round_ss( src: __m128, k: __mmask8, a: 
__m128, b: __m128, - rounding: i32, ) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let b = b.as_f32x4(); let src = src.as_f32x4(); - macro_rules! call { - ($imm4:expr) => { - vaddss(a, b, src, k, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vaddss(a, b, src, k, ROUNDING); + transmute(r) } /// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -33577,18 +33497,19 @@ pub unsafe fn _mm_mask_add_round_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_ss&expand=153) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vaddss, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_add_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vaddss, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_add_round_ss( + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let b = b.as_f32x4(); let zero = _mm_setzero_ps().as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vaddss(a, b, zero, k, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vaddss(a, b, zero, k, ROUNDING); + transmute(r) } /// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ @@ -33603,18 +33524,15 @@ pub unsafe fn _mm_maskz_add_round_ss(k: __mmask8, a: __m128, b: __m128, rounding /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_sd&expand=148) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vaddsd, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_add_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { +#[cfg_attr(test, assert_instr(vaddsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_add_round_sd(a: __m128d, b: __m128d) -> __m128d { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let b = b.as_f64x2(); let zero = _mm_setzero_pd().as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vaddsd(a, b, zero, 0b1, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vaddsd(a, b, zero, 0b1, ROUNDING); + transmute(r) } /// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -33629,24 +33547,20 @@ pub unsafe fn _mm_add_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_round_Sd&expand=149) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vaddsd, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_add_round_sd( +#[cfg_attr(test, assert_instr(vaddsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_add_round_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let b = b.as_f64x2(); let src = src.as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vaddsd(a, b, src, k, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vaddsd(a, b, src, k, ROUNDING); + transmute(r) } /// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -33661,23 +33575,19 @@ pub unsafe fn _mm_mask_add_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_sd&expand=150) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vaddsd, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_add_round_sd( +#[cfg_attr(test, assert_instr(vaddsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_add_round_sd( k: __mmask8, a: __m128d, b: __m128d, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let b = b.as_f64x2(); let zero = _mm_setzero_pd().as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vaddsd(a, b, zero, k, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vaddsd(a, b, zero, k, ROUNDING); + transmute(r) } /// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -33692,18 +33602,15 @@ pub unsafe fn _mm_maskz_add_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_round_ss&expand=5745) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsubss, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_sub_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vsubss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_sub_round_ss(a: __m128, b: __m128) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let b = b.as_f32x4(); let zero = _mm_setzero_ps().as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vsubss(a, b, zero, 0b1, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vsubss(a, b, zero, 0b1, ROUNDING); + transmute(r) } /// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -33718,24 +33625,20 @@ pub unsafe fn _mm_sub_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_round_ss&expand=5743) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsubss, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_sub_round_ss( +#[cfg_attr(test, assert_instr(vsubss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_sub_round_ss( src: __m128, k: __mmask8, a: __m128, b: __m128, - rounding: i32, ) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let b = b.as_f32x4(); let src = src.as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vsubss(a, b, src, k, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vsubss(a, b, src, k, ROUNDING); + transmute(r) } /// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -33750,18 +33653,19 @@ pub unsafe fn _mm_mask_sub_round_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_round_ss&expand=5744) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsubss, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_sub_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vsubss, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_sub_round_ss( + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let b = b.as_f32x4(); let zero = _mm_setzero_ps().as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vsubss(a, b, zero, k, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vsubss(a, b, zero, k, ROUNDING); + transmute(r) } /// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ @@ -33776,18 +33680,15 @@ pub unsafe fn _mm_maskz_sub_round_ss(k: __mmask8, a: __m128, b: __m128, rounding /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_round_sd&expand=5742) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsubsd, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_sub_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { +#[cfg_attr(test, assert_instr(vsubsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_sub_round_sd(a: __m128d, b: __m128d) -> __m128d { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let b = b.as_f64x2(); let zero = _mm_setzero_pd().as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vsubsd(a, b, zero, 0b1, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vsubsd(a, b, zero, 0b1, ROUNDING); + transmute(r) } /// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -33802,24 +33703,20 @@ pub unsafe fn _mm_sub_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_round_sd&expand=5740) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsubsd, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_sub_round_sd( +#[cfg_attr(test, assert_instr(vsubsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_sub_round_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let b = b.as_f64x2(); let src = src.as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vsubsd(a, b, src, k, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vsubsd(a, b, src, k, ROUNDING); + transmute(r) } /// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -33834,23 +33731,19 @@ pub unsafe fn _mm_mask_sub_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_round_sd&expand=5741) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsubsd, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_sub_round_sd( +#[cfg_attr(test, assert_instr(vsubsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_sub_round_sd( k: __mmask8, a: __m128d, b: __m128d, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let b = b.as_f64x2(); let zero = _mm_setzero_pd().as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vsubsd(a, b, zero, k, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vsubsd(a, b, zero, k, ROUNDING); + transmute(r) } /// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -33865,18 +33758,15 @@ pub unsafe fn _mm_maskz_sub_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_round_ss&expand=3946) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmulss, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_mul_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vmulss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_mul_round_ss(a: __m128, b: __m128) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let b = b.as_f32x4(); let zero = _mm_setzero_ps().as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vmulss(a, b, zero, 0b1, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vmulss(a, b, zero, 0b1, ROUNDING); + transmute(r) } /// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -33891,24 +33781,20 @@ pub unsafe fn _mm_mul_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_round_ss&expand=3944) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmulss, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_mul_round_ss( +#[cfg_attr(test, assert_instr(vmulss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_mul_round_ss( src: __m128, k: __mmask8, a: __m128, b: __m128, - rounding: i32, ) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let b = b.as_f32x4(); let src = src.as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vmulss(a, b, src, k, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vmulss(a, b, src, k, ROUNDING); + transmute(r) } /// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -33923,18 +33809,19 @@ pub unsafe fn _mm_mask_mul_round_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_round_ss&expand=3945) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmulss, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_mul_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vmulss, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_mul_round_ss( + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let b = b.as_f32x4(); let zero = _mm_setzero_ps().as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vmulss(a, b, zero, k, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vmulss(a, b, zero, k, ROUNDING); + transmute(r) } /// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ @@ -33949,18 +33836,15 @@ pub unsafe fn _mm_maskz_mul_round_ss(k: __mmask8, a: __m128, b: __m128, rounding /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_round_sd&expand=3943) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmulsd, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_mul_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { +#[cfg_attr(test, assert_instr(vmulsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_mul_round_sd(a: __m128d, b: __m128d) -> __m128d { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let b = b.as_f64x2(); let zero = _mm_setzero_pd().as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vmulsd(a, b, zero, 0b1, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vmulsd(a, b, zero, 0b1, ROUNDING); + transmute(r) } /// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -33975,24 +33859,20 @@ pub unsafe fn _mm_mul_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_round_sd&expand=3941) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmulsd, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_mul_round_sd( +#[cfg_attr(test, assert_instr(vmulsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_mul_round_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let b = b.as_f64x2(); let src = src.as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vmulsd(a, b, src, k, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vmulsd(a, b, src, k, ROUNDING); + transmute(r) } /// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -34007,23 +33887,19 @@ pub unsafe fn _mm_mask_mul_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_round_sd&expand=3942) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmulsd, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_mul_round_sd( +#[cfg_attr(test, assert_instr(vmulsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_mul_round_sd( k: __mmask8, a: __m128d, b: __m128d, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let b = b.as_f64x2(); let zero = _mm_setzero_pd().as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vmulsd(a, b, zero, k, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vmulsd(a, b, zero, k, ROUNDING); + transmute(r) } /// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -34038,18 +33914,15 @@ pub unsafe fn _mm_maskz_mul_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_div_round_ss&expand=2174) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vdivss, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_div_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vdivss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_div_round_ss(a: __m128, b: __m128) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let b = b.as_f32x4(); let zero = _mm_setzero_ps().as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vdivss(a, b, zero, 0b1, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vdivss(a, b, zero, 0b1, ROUNDING); + transmute(r) } /// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -34064,24 +33937,20 @@ pub unsafe fn _mm_div_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_round_ss&expand=2175) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vdivss, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_div_round_ss( +#[cfg_attr(test, assert_instr(vdivss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_div_round_ss( src: __m128, k: __mmask8, a: __m128, b: __m128, - rounding: i32, ) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let b = b.as_f32x4(); let src = src.as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vdivss(a, b, src, k, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vdivss(a, b, src, k, ROUNDING); + transmute(r) } /// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -34096,18 +33965,19 @@ pub unsafe fn _mm_mask_div_round_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_round_ss&expand=2176) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vdivss, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_div_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vdivss, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_div_round_ss( + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let b = b.as_f32x4(); let zero = _mm_setzero_ps().as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vdivss(a, b, zero, k, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vdivss(a, b, zero, k, ROUNDING); + transmute(r) } /// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ @@ -34122,18 +33992,15 @@ pub unsafe fn _mm_maskz_div_round_ss(k: __mmask8, a: __m128, b: __m128, rounding /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_div_round_sd&expand=2171) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vdivsd, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_div_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { +#[cfg_attr(test, assert_instr(vdivsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_div_round_sd(a: __m128d, b: __m128d) -> __m128d { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let b = b.as_f64x2(); let zero = _mm_setzero_pd().as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vdivsd(a, b, zero, 0b1, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vdivsd(a, b, zero, 0b1, ROUNDING); + transmute(r) } /// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -34148,24 +34015,20 @@ pub unsafe fn _mm_div_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_round_sd&expand=2172) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vdivsd, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_div_round_sd( +#[cfg_attr(test, assert_instr(vdivsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_div_round_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let b = b.as_f64x2(); let src = src.as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vdivsd(a, b, src, k, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vdivsd(a, b, src, k, ROUNDING); + transmute(r) } /// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -34180,23 +34043,19 @@ pub unsafe fn _mm_mask_div_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_round_sd&expand=2173) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vdivsd, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_div_round_sd( +#[cfg_attr(test, assert_instr(vdivsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_div_round_sd( k: __mmask8, a: __m128d, b: __m128d, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let b = b.as_f64x2(); let zero = _mm_setzero_pd().as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vdivsd(a, b, zero, k, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vdivsd(a, b, zero, k, ROUNDING); + transmute(r) } /// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -34475,18 +34334,15 @@ pub unsafe fn _mm_maskz_min_round_sd(k: __mmask8, a: __m128d, b: __m128d, sae: i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sqrt_round_ss&expand=5383) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsqrtss, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_sqrt_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vsqrtss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_sqrt_round_ss(a: __m128, b: __m128) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let b = b.as_f32x4(); let zero = _mm_setzero_ps().as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vsqrtss(a, b, zero, 0b1, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vsqrtss(a, b, zero, 0b1, ROUNDING); + transmute(r) } /// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -34501,24 +34357,20 @@ pub unsafe fn _mm_sqrt_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_round_ss&expand=5381) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsqrtss, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_sqrt_round_ss( +#[cfg_attr(test, assert_instr(vsqrtss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_sqrt_round_ss( src: __m128, k: __mmask8, a: __m128, b: __m128, - rounding: i32, ) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let b = b.as_f32x4(); let src = src.as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vsqrtss(a, b, src, k, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vsqrtss(a, b, src, k, ROUNDING); + transmute(r) } /// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -34533,18 +34385,19 @@ pub unsafe fn _mm_mask_sqrt_round_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_round_ss&expand=5382) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsqrtss, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_sqrt_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vsqrtss, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_sqrt_round_ss( + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let b = b.as_f32x4(); let zero = _mm_setzero_ps().as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vsqrtss(a, b, zero, k, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vsqrtss(a, b, zero, k, ROUNDING); + transmute(r) } /// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ @@ -34559,18 +34412,15 @@ pub unsafe fn _mm_maskz_sqrt_round_ss(k: __mmask8, a: __m128, b: __m128, roundin /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sqrt_round_sd&expand=5380) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsqrtsd, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_sqrt_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { +#[cfg_attr(test, assert_instr(vsqrtsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_sqrt_round_sd(a: __m128d, b: __m128d) -> __m128d { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let b = b.as_f64x2(); let zero = _mm_setzero_pd().as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vsqrtsd(a, b, zero, 0b1, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vsqrtsd(a, b, zero, 0b1, ROUNDING); + transmute(r) } /// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -34585,24 +34435,20 @@ pub unsafe fn _mm_sqrt_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_round_sd&expand=5378) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsqrtsd, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_sqrt_round_sd( +#[cfg_attr(test, assert_instr(vsqrtsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_sqrt_round_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let b = b.as_f64x2(); let src = src.as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vsqrtsd(a, b, src, k, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vsqrtsd(a, b, src, k, ROUNDING); + transmute(r) } /// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -34617,23 +34463,19 @@ pub unsafe fn _mm_mask_sqrt_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_round_sd&expand=5379) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsqrtsd, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_sqrt_round_sd( +#[cfg_attr(test, assert_instr(vsqrtsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_sqrt_round_sd( k: __mmask8, a: __m128d, b: __m128d, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let b = b.as_f64x2(); let zero = _mm_setzero_pd().as_f64x2(); - macro_rules! call { - ($imm4:expr) => { - vsqrtsd(a, b, zero, k, $imm4) - }; - } - transmute(constify_imm4_round!(rounding, call)) + let r = vsqrtsd(a, b, zero, k, ROUNDING); + transmute(r) } /// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 
This intrinsic essentially calculates floor(log2(x)) for the lower element.\ @@ -35196,18 +35038,14 @@ pub unsafe fn _mm_maskz_roundscale_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_scalef_round_ss&expand=4895) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vscalefss, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_scalef_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vscalefss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_scalef_round_ss(a: __m128, b: __m128) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let b = b.as_f32x4(); let zero = _mm_setzero_ps().as_f32x4(); - macro_rules! call { - ($imm4:expr) => { - vscalefss(a, b, zero, 0b11111111, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vscalefss(a, b, zero, 0b11111111, ROUNDING); transmute(r) } @@ -35223,24 +35061,19 @@ pub unsafe fn _mm_scalef_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_scalef_round_ss&expand=4893) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vscalefss, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_scalef_round_ss( +#[cfg_attr(test, assert_instr(vscalefss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_scalef_round_ss( src: __m128, k: __mmask8, a: __m128, b: __m128, - rounding: i32, ) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let b = b.as_f32x4(); let src = src.as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vscalefss(a, b, src, k, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vscalefss(a, b, src, k, ROUNDING); transmute(r) } @@ -35256,23 +35089,18 @@ pub unsafe fn _mm_mask_scalef_round_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_scalef_round_ss&expand=4894) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vscalefss, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_scalef_round_ss( +#[cfg_attr(test, assert_instr(vscalefss, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_scalef_round_ss( k: __mmask8, a: __m128, b: __m128, - rounding: i32, ) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let b = b.as_f32x4(); let zero = _mm_setzero_ps().as_f32x4(); - macro_rules! call { - ($imm4:expr) => { - vscalefss(a, b, zero, k, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vscalefss(a, b, zero, k, ROUNDING); transmute(r) } @@ -35288,18 +35116,14 @@ pub unsafe fn _mm_maskz_scalef_round_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_scalef_round_sd&expand=4892) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vscalefsd, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_scalef_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { +#[cfg_attr(test, assert_instr(vscalefsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_scalef_round_sd(a: __m128d, b: __m128d) -> __m128d { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let b = b.as_f64x2(); let zero = _mm_setzero_pd().as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vscalefsd(a, b, zero, 0b11111111, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vscalefsd(a, b, zero, 0b11111111, ROUNDING); transmute(r) } @@ -35315,24 +35139,19 @@ pub unsafe fn _mm_scalef_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m1 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_scalef_round_sd&expand=4890) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vscalefsd, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_scalef_round_sd( +#[cfg_attr(test, assert_instr(vscalefsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_scalef_round_sd<const ROUNDING: i32>( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let b = b.as_f64x2(); let src = src.as_f64x2(); - macro_rules! call { - ($imm4:expr) => { - vscalefsd(a, b, src, k, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vscalefsd(a, b, src, k, ROUNDING); transmute(r) } @@ -35348,23 +35166,18 @@ pub unsafe fn _mm_mask_scalef_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_scalef_round_sd&expand=4891) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vscalefsd, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_scalef_round_sd( +#[cfg_attr(test, assert_instr(vscalefsd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_scalef_round_sd<const ROUNDING: i32>( k: __mmask8, a: __m128d, b: __m128d, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let b = b.as_f64x2(); let zero = _mm_setzero_pd().as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vscalefsd(a, b, zero, k, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vscalefsd(a, b, zero, k, ROUNDING); transmute(r) } @@ -35380,19 +35193,15 @@ pub unsafe fn _mm_maskz_scalef_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fmadd_round_ss&expand=2573) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmadd213ss, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_fmadd_round_ss(a: __m128, b: __m128, c: __m128, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vfmadd213ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_fmadd_round_ss(a: __m128, b: __m128, c: __m128) -> __m128 { + static_assert_rounding!(ROUNDING); let extracta: f32 = simd_extract(a, 0); let extractb: f32 = simd_extract(b, 0); let extractc: f32 = simd_extract(c, 0); - macro_rules! call { - ($imm4:expr) => { - vfmadd132ss(extracta, extractb, extractc, $imm4) - }; - } - let fmadd = constify_imm4_round!(rounding, call); - let r = simd_insert(a, 0, fmadd); + let r = vfmadd132ss(extracta, extractb, extractc, ROUNDING); + let r = simd_insert(a, 0, r); transmute(r) } @@ -35408,25 +35217,20 @@ pub unsafe fn _mm_fmadd_round_ss(a: __m128, b: __m128, c: __m128, rounding: i32) /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmadd_round_ss&expand=2574) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmadd213ss, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_fmadd_round_ss( +#[cfg_attr(test, assert_instr(vfmadd213ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_fmadd_round_ss( a: __m128, k: __mmask8, b: __m128, c: __m128, - rounding: i32, ) -> __m128 { + static_assert_rounding!(ROUNDING); let mut fmadd: f32 = simd_extract(a, 0); if (k & 0b00000001) != 0 { let extractb: f32 = 
simd_extract(b, 0); let extractc: f32 = simd_extract(c, 0); - macro_rules! call { - ($imm4:expr) => { - vfmadd132ss(fmadd, extractb, extractc, $imm4) - }; - } - fmadd = constify_imm4_round!(rounding, call); + fmadd = vfmadd132ss(fmadd, extractb, extractc, ROUNDING); } let r = simd_insert(a, 0, fmadd); transmute(r) @@ -35444,26 +35248,21 @@ pub unsafe fn _mm_mask_fmadd_round_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmadd_round_ss&expand=2576) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmadd213ss, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_maskz_fmadd_round_ss( +#[cfg_attr(test, assert_instr(vfmadd213ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_maskz_fmadd_round_ss( k: __mmask8, a: __m128, b: __m128, c: __m128, - rounding: i32, ) -> __m128 { + static_assert_rounding!(ROUNDING); let mut fmadd: f32 = 0.; if (k & 0b00000001) != 0 { let extracta: f32 = simd_extract(a, 0); let extractb: f32 = simd_extract(b, 0); let extractc: f32 = simd_extract(c, 0); - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132ss(extracta, extractb, extractc, $imm4) - }; - } - fmadd = constify_imm4_round!(rounding, call); + fmadd = vfmadd132ss(extracta, extractb, extractc, ROUNDING); } let r = simd_insert(a, 0, fmadd); transmute(r) @@ -35481,25 +35280,20 @@ pub unsafe fn _mm_maskz_fmadd_round_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmadd_round_ss&expand=2575) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmadd213ss, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask3_fmadd_round_ss( +#[cfg_attr(test, assert_instr(vfmadd213ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask3_fmadd_round_ss( a: __m128, b: __m128, c: __m128, k: __mmask8, - rounding: i32, ) -> __m128 { + static_assert_rounding!(ROUNDING); let mut fmadd: f32 = simd_extract(c, 0); if (k & 0b00000001) != 0 { let extracta: f32 = simd_extract(a, 0); let extractb: f32 = simd_extract(b, 0); - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132ss(extracta, extractb, fmadd, $imm4) - }; - } - fmadd = constify_imm4_round!(rounding, call); + fmadd = vfmadd132ss(extracta, extractb, fmadd, ROUNDING); } let r = simd_insert(c, 0, fmadd); transmute(r) @@ -35517,18 +35311,18 @@ pub unsafe fn _mm_mask3_fmadd_round_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fmadd_round_sd&expand=2569) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmadd213sd, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_fmadd_round_sd(a: __m128d, b: __m128d, c: __m128d, rounding: i32) -> __m128d { +#[cfg_attr(test, assert_instr(vfmadd213sd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_fmadd_round_sd( + a: __m128d, + b: __m128d, + c: __m128d, +) -> __m128d { + static_assert_rounding!(ROUNDING); let extracta: f64 = simd_extract(a, 0); let extractb: f64 = simd_extract(b, 0); let extractc: f64 = simd_extract(c, 0); - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132sd(extracta, extractb, extractc, $imm4) - }; - } - let fmadd = constify_imm4_round!(rounding, call); + let fmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING); let r = simd_insert(a, 0, fmadd); transmute(r) } @@ -35545,25 +35339,20 @@ pub unsafe fn _mm_fmadd_round_sd(a: __m128d, b: __m128d, c: __m128d, rounding: i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmadd_round_sd&expand=2570) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmadd213sd, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_fmadd_round_sd( +#[cfg_attr(test, assert_instr(vfmadd213sd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_fmadd_round_sd( a: __m128d, k: __mmask8, b: __m128d, c: __m128d, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let mut fmadd: f64 = simd_extract(a, 0); if (k & 0b00000001) != 0 { let extractb: f64 = simd_extract(b, 0); let extractc: f64 = simd_extract(c, 0); - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132sd(fmadd, extractb, extractc, $imm4) - }; - } - fmadd = constify_imm4_round!(rounding, call); + fmadd = vfmadd132sd(fmadd, extractb, extractc, ROUNDING); } let r = simd_insert(a, 0, fmadd); transmute(r) @@ -35581,26 +35370,21 @@ pub unsafe fn _mm_mask_fmadd_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmadd_round_sd&expand=2572) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmadd213sd, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_maskz_fmadd_round_sd( +#[cfg_attr(test, assert_instr(vfmadd213sd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_maskz_fmadd_round_sd( k: __mmask8, a: __m128d, b: __m128d, c: __m128d, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let mut fmadd: f64 = 0.; if (k & 0b00000001) != 0 { let extracta: f64 = simd_extract(a, 0); let extractb: f64 = simd_extract(b, 0); let extractc: f64 = simd_extract(c, 0); - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132sd(extracta, extractb, extractc, $imm4) - }; - } - fmadd = constify_imm4_round!(rounding, call); + fmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING); } let r = simd_insert(a, 0, fmadd); transmute(r) @@ -35618,25 +35402,20 @@ pub unsafe fn _mm_maskz_fmadd_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmadd_round_Sd&expand=2571) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmadd213sd, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask3_fmadd_round_sd( +#[cfg_attr(test, assert_instr(vfmadd213sd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask3_fmadd_round_sd( a: __m128d, b: __m128d, c: __m128d, k: __mmask8, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let mut fmadd: f64 = simd_extract(c, 0); if (k & 0b00000001) != 0 { let extracta: f64 = simd_extract(a, 0); let extractb: f64 = simd_extract(b, 0); - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132sd(extracta, extractb, fmadd, $imm4) - }; - } - fmadd = constify_imm4_round!(rounding, call); + fmadd = vfmadd132sd(extracta, extractb, fmadd, ROUNDING); } let r = simd_insert(c, 0, fmadd); transmute(r) @@ -35654,19 +35433,15 @@ pub unsafe fn _mm_mask3_fmadd_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fmsub_round_ss&expand=2659) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmsub213ss, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_fmsub_round_ss(a: __m128, b: __m128, c: __m128, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vfmsub213ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_fmsub_round_ss(a: __m128, b: __m128, c: __m128) -> __m128 { + static_assert_rounding!(ROUNDING); let extracta: f32 = simd_extract(a, 0); let extractb: f32 = simd_extract(b, 0); let extractc: f32 = simd_extract(c, 0); let extractc = -extractc; - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132ss(extracta, extractb, extractc, $imm4) - }; - } - let fmsub = constify_imm4_round!(rounding, call); + let fmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); let r = simd_insert(a, 0, fmsub); transmute(r) } @@ -35683,26 +35458,21 @@ pub unsafe fn _mm_fmsub_round_ss(a: __m128, b: __m128, c: __m128, rounding: i32) /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmsub_round_ss&expand=2660) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmsub213ss, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_fmsub_round_ss( +#[cfg_attr(test, assert_instr(vfmsub213ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_fmsub_round_ss( a: __m128, k: __mmask8, b: __m128, c: __m128, - rounding: i32, ) -> __m128 { + static_assert_rounding!(ROUNDING); let mut fmsub: f32 = simd_extract(a, 0); if (k & 0b00000001) != 0 { let extractb: f32 = simd_extract(b, 0); let extractc: f32 = simd_extract(c, 0); let extractc = -extractc; - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132ss(fmsub, extractb, extractc, $imm4) - }; - } - fmsub = constify_imm4_round!(rounding, call); + fmsub = vfmadd132ss(fmsub, extractb, extractc, ROUNDING); } let r = simd_insert(a, 0, fmsub); transmute(r) @@ -35720,27 +35490,22 @@ pub unsafe fn _mm_mask_fmsub_round_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmsub_round_ss&expand=2662) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmsub213ss, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_maskz_fmsub_round_ss( +#[cfg_attr(test, assert_instr(vfmsub213ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_maskz_fmsub_round_ss( k: __mmask8, a: __m128, b: __m128, c: __m128, - rounding: i32, ) -> __m128 { + static_assert_rounding!(ROUNDING); let mut fmsub: f32 = 0.; if (k & 0b00000001) != 0 { let extracta: f32 = simd_extract(a, 0); let extractb: f32 = simd_extract(b, 0); let extractc: f32 = simd_extract(c, 0); let extractc = -extractc; - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132ss(extracta, extractb, extractc, $imm4) - }; - } - fmsub = constify_imm4_round!(rounding, call); + fmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); } let r = simd_insert(a, 0, fmsub); transmute(r) @@ -35758,26 +35523,21 @@ pub unsafe fn _mm_maskz_fmsub_round_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmsub_round_ss&expand=2661) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmsub213ss, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask3_fmsub_round_ss( +#[cfg_attr(test, assert_instr(vfmsub213ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask3_fmsub_round_ss( a: __m128, b: __m128, c: __m128, k: __mmask8, - rounding: i32, ) -> __m128 { + static_assert_rounding!(ROUNDING); let mut fmsub: f32 = simd_extract(c, 0); if (k & 0b00000001) != 0 { let extracta: f32 = simd_extract(a, 0); let extractb: f32 = simd_extract(b, 0); let extractc = -fmsub; - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132ss(extracta, extractb, extractc, $imm4) - }; - } - fmsub = constify_imm4_round!(rounding, call); + fmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); } let r = simd_insert(c, 0, fmsub); transmute(r) @@ -35795,19 +35555,19 @@ pub unsafe fn _mm_mask3_fmsub_round_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fmsub_round_sd&expand=2655) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmsub213sd, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_fmsub_round_sd(a: __m128d, b: __m128d, c: __m128d, rounding: i32) -> __m128d { +#[cfg_attr(test, assert_instr(vfmsub213sd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_fmsub_round_sd( + a: __m128d, + b: __m128d, + c: __m128d, +) -> __m128d { + static_assert_rounding!(ROUNDING); let extracta: f64 = simd_extract(a, 0); let extractb: f64 = simd_extract(b, 0); let extractc: f64 = simd_extract(c, 0); let extractc = -extractc; - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132sd(extracta, extractb, extractc, $imm4) - }; - } - let fmsub = constify_imm4_round!(rounding, call); + let fmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); let r = simd_insert(a, 0, fmsub); transmute(r) } @@ -35824,26 +35584,21 @@ pub unsafe fn _mm_fmsub_round_sd(a: __m128d, b: __m128d, c: __m128d, rounding: i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmsub_round_sd&expand=2656) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmsub213sd, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_fmsub_round_sd( +#[cfg_attr(test, assert_instr(vfmsub213sd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_fmsub_round_sd( a: __m128d, k: __mmask8, b: __m128d, c: __m128d, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let mut fmsub: f64 = simd_extract(a, 0); if (k & 0b00000001) != 0 { let extractb: f64 = simd_extract(b, 0); let extractc: f64 = simd_extract(c, 0); let extractc = -extractc; - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132sd(fmsub, extractb, extractc, $imm4) - }; - } - fmsub = constify_imm4_round!(rounding, call); + fmsub = vfmadd132sd(fmsub, extractb, extractc, ROUNDING); } let r = simd_insert(a, 0, fmsub); transmute(r) @@ -35861,27 +35616,22 @@ pub unsafe fn _mm_mask_fmsub_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmsub_round_sd&expand=2658) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmsub213sd, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_maskz_fmsub_round_sd( +#[cfg_attr(test, assert_instr(vfmsub213sd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_maskz_fmsub_round_sd( k: __mmask8, a: __m128d, b: __m128d, c: __m128d, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let mut fmsub: f64 = 0.; if (k & 0b00000001) != 0 { let extracta: f64 = simd_extract(a, 0); let extractb: f64 = simd_extract(b, 0); let extractc: f64 = simd_extract(c, 0); let extractc = -extractc; - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132sd(extracta, extractb, extractc, $imm4) - }; - } - fmsub = constify_imm4_round!(rounding, call); + fmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); } let r = simd_insert(a, 0, fmsub); transmute(r) @@ -35899,26 +35649,21 @@ pub unsafe fn _mm_maskz_fmsub_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmsub_round_sd&expand=2657) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfmsub213sd, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask3_fmsub_round_sd( +#[cfg_attr(test, assert_instr(vfmsub213sd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask3_fmsub_round_sd( a: __m128d, b: __m128d, c: __m128d, k: __mmask8, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let mut fmsub: f64 = simd_extract(c, 0); if (k & 0b00000001) != 0 { let extracta: f64 = simd_extract(a, 0); let extractb: f64 = simd_extract(b, 0); let extractc = -fmsub; - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132sd(extracta, extractb, extractc, $imm4) - }; - } - fmsub = constify_imm4_round!(rounding, call); + fmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); } let r = simd_insert(c, 0, fmsub); transmute(r) @@ -35936,19 +35681,15 @@ pub unsafe fn _mm_mask3_fmsub_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fnmadd_round_ss&expand=2739) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmadd213ss, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_fnmadd_round_ss(a: __m128, b: __m128, c: __m128, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vfnmadd213ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_fnmadd_round_ss(a: __m128, b: __m128, c: __m128) -> __m128 { + static_assert_rounding!(ROUNDING); let extracta: f32 = simd_extract(a, 0); let extracta = -extracta; let extractb: f32 = simd_extract(b, 0); let extractc: f32 = simd_extract(c, 0); - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132ss(extracta, extractb, extractc, $imm4) - }; - } - let fnmadd = constify_imm4_round!(rounding, call); + let fnmadd = vfmadd132ss(extracta, extractb, extractc, ROUNDING); let r = simd_insert(a, 0, fnmadd); transmute(r) } @@ -35965,26 +35706,21 @@ pub unsafe fn _mm_fnmadd_round_ss(a: __m128, b: __m128, c: __m128, rounding: i32 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmadd_round_ss&expand=2740) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmadd213ss, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_fnmadd_round_ss( +#[cfg_attr(test, assert_instr(vfnmadd213ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_fnmadd_round_ss( a: __m128, k: __mmask8, b: __m128, c: __m128, - rounding: i32, ) -> __m128 { + static_assert_rounding!(ROUNDING); let mut fnmadd: f32 = simd_extract(a, 0); if (k & 0b00000001) != 0 { let extracta = -fnmadd; let extractb: f32 = simd_extract(b, 0); let extractc: f32 = simd_extract(c, 0); - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132ss(extracta, extractb, extractc, $imm4) - }; - } - fnmadd = constify_imm4_round!(rounding, call); + fnmadd = vfmadd132ss(extracta, extractb, extractc, ROUNDING); } let r = simd_insert(a, 0, fnmadd); transmute(r) @@ -36002,27 +35738,22 @@ pub unsafe fn _mm_mask_fnmadd_round_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmadd_round_ss&expand=2742) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmadd213ss, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_maskz_fnmadd_round_ss( +#[cfg_attr(test, assert_instr(vfnmadd213ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_maskz_fnmadd_round_ss( k: __mmask8, a: __m128, b: __m128, c: __m128, - rounding: i32, ) -> __m128 { + static_assert_rounding!(ROUNDING); let mut fnmadd: f32 = 0.; if (k & 0b00000001) != 0 { let extracta: f32 = simd_extract(a, 0); let extracta = -extracta; let extractb: f32 = simd_extract(b, 0); let extractc: f32 = simd_extract(c, 0); - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132ss(extracta, extractb, extractc, $imm4) - }; - } - fnmadd = constify_imm4_round!(rounding, call); + fnmadd = vfmadd132ss(extracta, extractb, extractc, ROUNDING); } let r = simd_insert(a, 0, fnmadd); transmute(r) @@ -36040,26 +35771,21 @@ pub unsafe fn _mm_maskz_fnmadd_round_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmadd_round_ss&expand=2741) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmadd213ss, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask3_fnmadd_round_ss( +#[cfg_attr(test, assert_instr(vfnmadd213ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask3_fnmadd_round_ss( a: __m128, b: __m128, c: __m128, k: __mmask8, - rounding: i32, ) -> __m128 { + static_assert_rounding!(ROUNDING); let mut fnmadd: f32 = simd_extract(c, 0); if (k & 0b00000001) != 0 { let extracta: f32 = simd_extract(a, 0); let extracta = -extracta; let extractb: f32 = simd_extract(b, 0); - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132ss(extracta, extractb, fnmadd, $imm4) - }; - } - fnmadd = constify_imm4_round!(rounding, call); + fnmadd = vfmadd132ss(extracta, extractb, fnmadd, ROUNDING); } let r = simd_insert(c, 0, fnmadd); transmute(r) @@ -36077,19 +35803,19 @@ pub unsafe fn _mm_mask3_fnmadd_round_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fnmadd_round_sd&expand=2735) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmadd213sd, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_fnmadd_round_sd(a: __m128d, b: __m128d, c: __m128d, rounding: i32) -> __m128d { +#[cfg_attr(test, assert_instr(vfnmadd213sd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_fnmadd_round_sd( + a: __m128d, + b: __m128d, + c: __m128d, +) -> __m128d { + static_assert_rounding!(ROUNDING); let extracta: f64 = simd_extract(a, 0); let extracta = -extracta; let extractb: f64 = simd_extract(b, 0); let extractc: f64 = simd_extract(c, 0); - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132sd(extracta, extractb, extractc, $imm4) - }; - } - let fnmadd = constify_imm4_round!(rounding, call); + let fnmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING); let r = simd_insert(a, 0, fnmadd); transmute(r) } @@ -36106,26 +35832,21 @@ pub unsafe fn _mm_fnmadd_round_sd(a: __m128d, b: __m128d, c: __m128d, rounding: /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmadd_round_sd&expand=2736) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmadd213sd, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_fnmadd_round_sd( +#[cfg_attr(test, assert_instr(vfnmadd213sd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_fnmadd_round_sd( a: __m128d, k: __mmask8, b: __m128d, c: __m128d, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let mut fnmadd: f64 = simd_extract(a, 0); if (k & 0b00000001) != 0 { let extracta = -fnmadd; let extractb: f64 = simd_extract(b, 0); let extractc: f64 = simd_extract(c, 0); - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132sd(extracta, extractb, extractc, $imm4) - }; - } - fnmadd = constify_imm4_round!(rounding, call); + fnmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING); } let r = simd_insert(a, 0, fnmadd); transmute(r) @@ -36143,27 +35864,22 @@ pub unsafe fn _mm_mask_fnmadd_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmadd_round_sd&expand=2738) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmadd213sd, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_maskz_fnmadd_round_sd( +#[cfg_attr(test, assert_instr(vfnmadd213sd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_maskz_fnmadd_round_sd( k: __mmask8, a: __m128d, b: __m128d, c: __m128d, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let mut fnmadd: f64 = 0.; if (k & 0b00000001) != 0 { let extracta: f64 = simd_extract(a, 0); let extracta = -extracta; let extractb: f64 = simd_extract(b, 0); let extractc: f64 = simd_extract(c, 0); - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132sd(extracta, extractb, extractc, $imm4) - }; - } - fnmadd = constify_imm4_round!(rounding, call); + fnmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING); } let r = simd_insert(a, 0, fnmadd); transmute(r) @@ -36181,26 +35897,21 @@ pub unsafe fn _mm_maskz_fnmadd_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmadd_round_Sd&expand=2737) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmadd213sd, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask3_fnmadd_round_sd( +#[cfg_attr(test, assert_instr(vfnmadd213sd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask3_fnmadd_round_sd( a: __m128d, b: __m128d, c: __m128d, k: __mmask8, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let mut fnmadd: f64 = simd_extract(c, 0); if (k & 0b00000001) != 0 { let extracta: f64 = simd_extract(a, 0); let extracta = -extracta; let extractb: f64 = simd_extract(b, 0); - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132sd(extracta, extractb, fnmadd, $imm4) - }; - } - fnmadd = constify_imm4_round!(rounding, call); + fnmadd = vfmadd132sd(extracta, extractb, fnmadd, ROUNDING); } let r = simd_insert(c, 0, fnmadd); transmute(r) @@ -36218,20 +35929,16 @@ pub unsafe fn _mm_mask3_fnmadd_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fnmsub_round_ss&expand=2787) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmsub213ss, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_fnmsub_round_ss(a: __m128, b: __m128, c: __m128, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vfnmsub213ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_fnmsub_round_ss(a: __m128, b: __m128, c: __m128) -> __m128 { + static_assert_rounding!(ROUNDING); let extracta: f32 = simd_extract(a, 0); let extracta = -extracta; let extractb: f32 = simd_extract(b, 0); let extractc: f32 = simd_extract(c, 0); let extractc = -extractc; - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132ss(extracta, extractb, extractc, $imm4) - }; - } - let fnmsub = constify_imm4_round!(rounding, call); + let fnmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); let r = simd_insert(a, 0, fnmsub); transmute(r) } @@ -36248,27 +35955,22 @@ pub unsafe fn _mm_fnmsub_round_ss(a: __m128, b: __m128, c: __m128, rounding: i32 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmsub_round_ss&expand=2788) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmsub213ss, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_fnmsub_round_ss( +#[cfg_attr(test, assert_instr(vfnmsub213ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_fnmsub_round_ss( a: __m128, k: __mmask8, b: __m128, c: __m128, - rounding: i32, ) -> __m128 { + static_assert_rounding!(ROUNDING); let mut fnmsub: f32 = simd_extract(a, 0); if (k & 0b00000001) != 0 { let extracta = -fnmsub; let extractb: f32 = simd_extract(b, 0); let extractc: f32 = simd_extract(c, 0); let extractc = -extractc; - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132ss(extracta, extractb, extractc, $imm4) - }; - } - fnmsub = constify_imm4_round!(rounding, call); + fnmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); } let r = simd_insert(a, 0, fnmsub); transmute(r) @@ -36286,15 +35988,15 @@ pub unsafe fn _mm_mask_fnmsub_round_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmsub_round_ss&expand=2790) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmsub213ss, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_maskz_fnmsub_round_ss( +#[cfg_attr(test, assert_instr(vfnmsub213ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_maskz_fnmsub_round_ss( k: __mmask8, a: __m128, b: __m128, c: __m128, - rounding: i32, ) -> __m128 { + static_assert_rounding!(ROUNDING); let mut fnmsub: f32 = 0.; if (k & 0b00000001) != 0 { let extracta: f32 = simd_extract(a, 0); @@ -36302,12 +36004,7 @@ pub unsafe fn _mm_maskz_fnmsub_round_ss( let extractb: f32 = simd_extract(b, 0); let extractc: f32 = simd_extract(c, 0); let extractc = -extractc; - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132ss(extracta, extractb, extractc, $imm4) - }; - } - fnmsub = constify_imm4_round!(rounding, call); + fnmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); } let r = simd_insert(a, 0, fnmsub); transmute(r) @@ -36325,27 +36022,22 @@ pub unsafe fn _mm_maskz_fnmsub_round_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmsub_round_ss&expand=2789) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmsub213ss, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask3_fnmsub_round_ss( +#[cfg_attr(test, assert_instr(vfnmsub213ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask3_fnmsub_round_ss( a: __m128, b: __m128, c: __m128, k: __mmask8, - rounding: i32, ) -> __m128 { + static_assert_rounding!(ROUNDING); let mut fnmsub: f32 = simd_extract(c, 0); if (k & 0b00000001) != 0 { let extracta: f32 = simd_extract(a, 0); let extracta = -extracta; let extractb: f32 = simd_extract(b, 0); let extractc = -fnmsub; - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132ss(extracta, extractb, extractc, $imm4) - }; - } - fnmsub = constify_imm4_round!(rounding, call); + fnmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); } let r = simd_insert(c, 0, fnmsub); transmute(r) @@ -36363,20 +36055,20 @@ pub unsafe fn _mm_mask3_fnmsub_round_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fnmsub_round_sd&expand=2783) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmsub213sd, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_fnmsub_round_sd(a: __m128d, b: __m128d, c: __m128d, rounding: i32) -> __m128d { +#[cfg_attr(test, assert_instr(vfnmsub213sd, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_fnmsub_round_sd( + a: __m128d, + b: __m128d, + c: __m128d, +) -> __m128d { + static_assert_rounding!(ROUNDING); let extracta: f64 = simd_extract(a, 0); let extracta = -extracta; let extractb: f64 = simd_extract(b, 0); let extractc: f64 = simd_extract(c, 0); let extractc = -extractc; - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132sd(extracta, extractb, extractc, $imm4) - }; - } - let fnmsub = constify_imm4_round!(rounding, call); + let fnmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); let r = simd_insert(a, 0, fnmsub); transmute(r) } @@ -36393,27 +36085,22 @@ pub unsafe fn _mm_fnmsub_round_sd(a: __m128d, b: __m128d, c: __m128d, rounding: /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmsub_round_sd&expand=2784) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmsub213sd, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_fnmsub_round_sd( +#[cfg_attr(test, assert_instr(vfnmsub213sd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_fnmsub_round_sd( a: __m128d, k: __mmask8, b: __m128d, c: __m128d, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let mut fnmsub: f64 = simd_extract(a, 0); if (k & 0b00000001) != 0 { let extracta = -fnmsub; let extractb: f64 = simd_extract(b, 0); let extractc: f64 = simd_extract(c, 0); let extractc = -extractc; - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132sd(extracta, extractb, extractc, $imm4) - }; - } - fnmsub = constify_imm4_round!(rounding, call); + fnmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); } let r = simd_insert(a, 0, fnmsub); transmute(r) @@ -36431,15 +36118,15 @@ pub unsafe fn _mm_mask_fnmsub_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmsub_round_sd&expand=2786) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmsub213sd, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_maskz_fnmsub_round_sd( +#[cfg_attr(test, assert_instr(vfnmsub213sd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_maskz_fnmsub_round_sd( k: __mmask8, a: __m128d, b: __m128d, c: __m128d, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let mut fnmsub: f64 = 0.; if (k & 0b00000001) != 0 { let extracta: f64 = simd_extract(a, 0); @@ -36447,12 +36134,7 @@ pub unsafe fn _mm_maskz_fnmsub_round_sd( let extractb: f64 = simd_extract(b, 0); let extractc: f64 = simd_extract(c, 0); let extractc = -extractc; - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132sd(extracta, extractb, extractc, $imm4) - }; - } - fnmsub = constify_imm4_round!(rounding, call); + fnmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); } let r = simd_insert(a, 0, fnmsub); transmute(r) @@ -36470,27 +36152,22 @@ pub unsafe fn _mm_maskz_fnmsub_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmsub_round_sd&expand=2785) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vfnmsub213sd, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask3_fnmsub_round_sd( +#[cfg_attr(test, assert_instr(vfnmsub213sd, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask3_fnmsub_round_sd( a: __m128d, b: __m128d, c: __m128d, k: __mmask8, - rounding: i32, ) -> __m128d { + static_assert_rounding!(ROUNDING); let mut fnmsub: f64 = simd_extract(c, 0); if (k & 0b00000001) != 0 { let extracta: f64 = simd_extract(a, 0); let extracta = -extracta; let extractb: f64 = simd_extract(b, 0); let extractc = -fnmsub; - macro_rules! 
call { - ($imm4:expr) => { - vfmadd132sd(extracta, extractb, extractc, $imm4) - }; - } - fnmsub = constify_imm4_round!(rounding, call); + fnmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); } let r = simd_insert(c, 0, fnmsub); transmute(r) @@ -36977,18 +36654,14 @@ pub unsafe fn _mm_maskz_cvt_roundss_sd(k: __mmask8, a: __m128d, b: __m128, sae: /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundsd_ss&expand=1361) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsd2ss, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_cvt_roundsd_ss(a: __m128, b: __m128d, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vcvtsd2ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_cvt_roundsd_ss(a: __m128, b: __m128d) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let b = b.as_f64x2(); let zero = _mm_setzero_ps().as_f32x4(); - macro_rules! call { - ($imm4:expr) => { - vcvtsd2ss(a, b, zero, 0b11111111, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtsd2ss(a, b, zero, 0b11111111, ROUNDING); transmute(r) } @@ -37003,24 +36676,19 @@ pub unsafe fn _mm_cvt_roundsd_ss(a: __m128, b: __m128d, rounding: i32) -> __m128 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_cvt_roundsd_ss&expand=1362) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsd2ss, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_cvt_roundsd_ss( +#[cfg_attr(test, assert_instr(vcvtsd2ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_cvt_roundsd_ss( src: __m128, k: __mmask8, a: __m128, b: __m128d, - rounding: i32, ) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let b = b.as_f64x2(); let src = src.as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtsd2ss(a, b, src, k, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtsd2ss(a, b, src, k, ROUNDING); transmute(r) } @@ -37035,23 +36703,18 @@ pub unsafe fn _mm_mask_cvt_roundsd_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_cvt_roundsd_ss&expand=1363) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsd2ss, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_cvt_roundsd_ss( +#[cfg_attr(test, assert_instr(vcvtsd2ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_cvt_roundsd_ss( k: __mmask8, a: __m128, b: __m128d, - rounding: i32, ) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let b = b.as_f64x2(); let zero = _mm_setzero_ps().as_f32x4(); - macro_rules! call { - ($imm4:expr) => { - vcvtsd2ss(a, b, zero, k, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtsd2ss(a, b, zero, k, ROUNDING); transmute(r) } @@ -37066,16 +36729,12 @@ pub unsafe fn _mm_maskz_cvt_roundsd_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundss_si32&expand=1374) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtss2si, rounding = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvt_roundss_si32(a: __m128, rounding: i32) -> i32 { +#[cfg_attr(test, assert_instr(vcvtss2si, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvt_roundss_si32(a: __m128) -> i32 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtss2si(a, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtss2si(a, ROUNDING); transmute(r) } @@ -37090,16 +36749,12 @@ pub unsafe fn _mm_cvt_roundss_si32(a: __m128, rounding: i32) -> i32 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundss_i32&expand=1369) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtss2si, rounding = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvt_roundss_i32(a: __m128, rounding: i32) -> i32 { +#[cfg_attr(test, assert_instr(vcvtss2si, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvt_roundss_i32(a: __m128) -> i32 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); - macro_rules! call { - ($imm4:expr) => { - vcvtss2si(a, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtss2si(a, ROUNDING); transmute(r) } @@ -37114,16 +36769,12 @@ pub unsafe fn _mm_cvt_roundss_i32(a: __m128, rounding: i32) -> i32 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundss_u32&expand=1376) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtss2usi, rounding = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvt_roundss_u32(a: __m128, rounding: i32) -> u32 { +#[cfg_attr(test, assert_instr(vcvtss2usi, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvt_roundss_u32(a: __m128) -> u32 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtss2usi(a, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtss2usi(a, ROUNDING); transmute(r) } @@ -37158,16 +36809,12 @@ pub unsafe fn _mm_cvtss_u32(a: __m128) -> u32 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundsd_si32&expand=1359) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsd2si, rounding = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvt_roundsd_si32(a: __m128d, rounding: i32) -> i32 { +#[cfg_attr(test, assert_instr(vcvtsd2si, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvt_roundsd_si32(a: __m128d) -> i32 { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); - macro_rules! call { - ($imm4:expr) => { - vcvtsd2si(a, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtsd2si(a, ROUNDING); transmute(r) } @@ -37182,16 +36829,12 @@ pub unsafe fn _mm_cvt_roundsd_si32(a: __m128d, rounding: i32) -> i32 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundsd_i32&expand=1357) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsd2si, rounding = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvt_roundsd_i32(a: __m128d, rounding: i32) -> i32 { +#[cfg_attr(test, assert_instr(vcvtsd2si, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvt_roundsd_i32(a: __m128d) -> i32 { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtsd2si(a, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtsd2si(a, ROUNDING); transmute(r) } @@ -37206,16 +36849,12 @@ pub unsafe fn _mm_cvt_roundsd_i32(a: __m128d, rounding: i32) -> i32 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=cvt_roundsd_u32&expand=1364) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsd2usi, rounding = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvt_roundsd_u32(a: __m128d, rounding: i32) -> u32 { +#[cfg_attr(test, assert_instr(vcvtsd2usi, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvt_roundsd_u32(a: __m128d) -> u32 { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); - macro_rules! call { - ($imm4:expr) => { - vcvtsd2usi(a, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtsd2usi(a, ROUNDING); transmute(r) } @@ -37251,16 +36890,12 @@ pub unsafe fn _mm_cvtsd_u32(a: __m128d) -> u32 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundi32_ss&expand=1312) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsi2ss, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_cvt_roundi32_ss(a: __m128, b: i32, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vcvtsi2ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_cvt_roundi32_ss(a: __m128, b: i32) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtsi2ss(a, b, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtsi2ss(a, b, ROUNDING); transmute(r) } @@ -37276,16 +36911,12 @@ pub unsafe fn _mm_cvt_roundi32_ss(a: __m128, b: i32, rounding: i32) -> __m128 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundsi32_ss&expand=1366) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsi2ss, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_cvt_roundsi32_ss(a: __m128, b: i32, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vcvtsi2ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_cvt_roundsi32_ss(a: __m128, b: i32) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); - macro_rules! call { - ($imm4:expr) => { - vcvtsi2ss(a, b, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtsi2ss(a, b, ROUNDING); transmute(r) } @@ -37300,16 +36931,12 @@ pub unsafe fn _mm_cvt_roundsi32_ss(a: __m128, b: i32, rounding: i32) -> __m128 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundu32_ss&expand=1378) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtusi2ss, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_cvt_roundu32_ss(a: __m128, b: u32, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vcvtusi2ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_cvt_roundu32_ss(a: __m128, b: u32) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtusi2ss(a, b, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtusi2ss(a, b, ROUNDING); transmute(r) } @@ -44343,10 +43970,10 @@ mod tests { let a = _mm512_setr_ps( 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, ); - let r = _mm512_cvt_roundps_epi32(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm512_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); assert_eq_m512i(r, e); - let r = _mm512_cvt_roundps_epi32(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); + let r = _mm512_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a); let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 9, 10, 11, 12, 13, 14, 15); assert_eq_m512i(r, e); } @@ -44357,14 +43984,14 @@ mod tests { 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, ); let src = _mm512_set1_epi32(0); - let r = - _mm512_mask_cvt_roundps_epi32(src, 0, a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm512_mask_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, + ); assert_eq_m512i(r, src); - let r = _mm512_mask_cvt_roundps_epi32( + let r = _mm512_mask_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( src, 0b00000000_11111111, a, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, ); let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 0, 0, 0, 0, 0, 0, 0, 0); assert_eq_m512i(r, e); @@ -44375,12 +44002,13 @@ mod tests { let a = _mm512_setr_ps( 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, ); - let r = _mm512_maskz_cvt_roundps_epi32(0, a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm512_maskz_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, + ); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = 
_mm512_maskz_cvt_roundps_epi32( + let r = _mm512_maskz_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( 0b00000000_11111111, a, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, ); let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 0, 0, 0, 0, 0, 0, 0, 0); assert_eq_m512i(r, e); @@ -44391,10 +44019,10 @@ mod tests { let a = _mm512_setr_ps( 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, ); - let r = _mm512_cvt_roundps_epu32(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm512_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 10, 10, 12, 12, 14, 14, 16); assert_eq_m512i(r, e); - let r = _mm512_cvt_roundps_epu32(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); + let r = _mm512_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a); let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 9, 10, 11, 12, 13, 14, 15); assert_eq_m512i(r, e); } @@ -44405,14 +44033,14 @@ mod tests { 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, ); let src = _mm512_set1_epi32(0); - let r = - _mm512_mask_cvt_roundps_epu32(src, 0, a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm512_mask_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, + ); assert_eq_m512i(r, src); - let r = _mm512_mask_cvt_roundps_epu32( + let r = _mm512_mask_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( src, 0b00000000_11111111, a, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, ); let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); assert_eq_m512i(r, e); @@ -44423,12 +44051,13 @@ mod tests { let a = _mm512_setr_ps( 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, ); - let r = _mm512_maskz_cvt_roundps_epu32(0, a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = 
_mm512_maskz_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, + ); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_cvt_roundps_epu32( + let r = _mm512_maskz_cvt_roundps_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( 0b00000000_11111111, a, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, ); let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); assert_eq_m512i(r, e); @@ -44437,7 +44066,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvt_roundepi32_ps() { let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); - let r = _mm512_cvt_roundepi32_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm512_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); let e = _mm512_setr_ps( 0., -2., 2., -4., 4., -6., 6., -8., 8., 10., 10., 12., 12., 14., 14., 16., ); @@ -44448,14 +44077,14 @@ mod tests { unsafe fn test_mm512_mask_cvt_roundepi32_ps() { let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); let src = _mm512_set1_ps(0.); - let r = - _mm512_mask_cvt_roundepi32_ps(src, 0, a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm512_mask_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, + ); assert_eq_m512(r, src); - let r = _mm512_mask_cvt_roundepi32_ps( + let r = _mm512_mask_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( src, 0b00000000_11111111, a, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, ); let e = _mm512_setr_ps( 0., -2., 2., -4., 4., -6., 6., -8., 0., 0., 0., 0., 0., 0., 0., 0., @@ -44466,12 +44095,13 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_cvt_roundepi32_ps() { let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); - let r = _mm512_maskz_cvt_roundepi32_ps(0, a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = 
_mm512_maskz_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, + ); assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_cvt_roundepi32_ps( + let r = _mm512_maskz_cvt_roundepi32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( 0b00000000_11111111, a, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, ); let e = _mm512_setr_ps( 0., -2., 2., -4., 4., -6., 6., -8., 0., 0., 0., 0., 0., 0., 0., 0., @@ -44482,7 +44112,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvt_roundepu32_ps() { let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); - let r = _mm512_cvt_roundepu32_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm512_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a); #[rustfmt::skip] let e = _mm512_setr_ps( 0., 4294967300., 2., 4294967300., @@ -44497,14 +44127,14 @@ mod tests { unsafe fn test_mm512_mask_cvt_roundepu32_ps() { let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); let src = _mm512_set1_ps(0.); - let r = - _mm512_mask_cvt_roundepu32_ps(src, 0, a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm512_mask_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + src, 0, a, + ); assert_eq_m512(r, src); - let r = _mm512_mask_cvt_roundepu32_ps( + let r = _mm512_mask_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( src, 0b00000000_11111111, a, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, ); #[rustfmt::skip] let e = _mm512_setr_ps( @@ -44519,12 +44149,13 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_cvt_roundepu32_ps() { let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); - let r = _mm512_maskz_cvt_roundepu32_ps(0, a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm512_maskz_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, + ); 
assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_cvt_roundepu32_ps( + let r = _mm512_maskz_cvt_roundepu32_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( 0b00000000_11111111, a, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, ); #[rustfmt::skip] let e = _mm512_setr_ps( @@ -52671,7 +52302,7 @@ mod tests { unsafe fn test_mm_add_round_ss() { let a = _mm_set_ps(1., 2., 10., 20.); let b = _mm_set_ps(3., 4., 30., 40.); - let r = _mm_add_round_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_add_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); let e = _mm_set_ps(1., 2., 10., 60.); assert_eq_m128(r, e); } @@ -52681,15 +52312,11 @@ mod tests { let src = _mm_set_ps(10., 11., 100., 110.); let a = _mm_set_ps(1., 2., 10., 20.); let b = _mm_set_ps(3., 4., 30., 40.); - let r = _mm_mask_add_round_ss(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_mask_add_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); let e = _mm_set_ps(1., 2., 10., 110.); assert_eq_m128(r, e); - let r = _mm_mask_add_round_ss( - src, - 0b11111111, - a, - b, - _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + let r = _mm_mask_add_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + src, 0b11111111, a, b, ); let e = _mm_set_ps(1., 2., 10., 60.); assert_eq_m128(r, e); @@ -52699,10 +52326,11 @@ mod tests { unsafe fn test_mm_maskz_add_round_ss() { let a = _mm_set_ps(1., 2., 10., 20.); let b = _mm_set_ps(3., 4., 30., 40.); - let r = _mm_maskz_add_round_ss(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_maskz_add_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); let e = _mm_set_ps(1., 2., 10., 0.); assert_eq_m128(r, e); - let r = _mm_maskz_add_round_ss(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = + _mm_maskz_add_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); let e = _mm_set_ps(1., 2., 10., 60.); assert_eq_m128(r, e); } @@ -52711,7 +52339,7 @@ mod 
tests { unsafe fn test_mm_add_round_sd() { let a = _mm_set_pd(1., 2.); let b = _mm_set_pd(3., 4.); - let r = _mm_add_round_sd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_add_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); let e = _mm_set_pd(1., 6.); assert_eq_m128d(r, e); } @@ -52721,15 +52349,11 @@ mod tests { let src = _mm_set_pd(10., 11.); let a = _mm_set_pd(1., 2.); let b = _mm_set_pd(3., 4.); - let r = _mm_mask_add_round_sd(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_mask_add_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); let e = _mm_set_pd(1., 11.); assert_eq_m128d(r, e); - let r = _mm_mask_add_round_sd( - src, - 0b11111111, - a, - b, - _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + let r = _mm_mask_add_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + src, 0b11111111, a, b, ); let e = _mm_set_pd(1., 6.); assert_eq_m128d(r, e); @@ -52739,10 +52363,11 @@ mod tests { unsafe fn test_mm_maskz_add_round_sd() { let a = _mm_set_pd(1., 2.); let b = _mm_set_pd(3., 4.); - let r = _mm_maskz_add_round_sd(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_maskz_add_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); let e = _mm_set_pd(1., 0.); assert_eq_m128d(r, e); - let r = _mm_maskz_add_round_sd(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = + _mm_maskz_add_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); let e = _mm_set_pd(1., 6.); assert_eq_m128d(r, e); } @@ -52751,7 +52376,7 @@ mod tests { unsafe fn test_mm_sub_round_ss() { let a = _mm_set_ps(1., 2., 10., 20.); let b = _mm_set_ps(3., 4., 30., 40.); - let r = _mm_sub_round_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_sub_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); let e = _mm_set_ps(1., 2., 10., -20.); assert_eq_m128(r, e); } @@ -52761,15 +52386,11 @@ mod tests { let src = _mm_set_ps(10., 11., 100., 110.); let a = _mm_set_ps(1., 
2., 10., 20.); let b = _mm_set_ps(3., 4., 30., 40.); - let r = _mm_mask_sub_round_ss(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_mask_sub_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); let e = _mm_set_ps(1., 2., 10., 110.); assert_eq_m128(r, e); - let r = _mm_mask_sub_round_ss( - src, - 0b11111111, - a, - b, - _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + let r = _mm_mask_sub_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + src, 0b11111111, a, b, ); let e = _mm_set_ps(1., 2., 10., -20.); assert_eq_m128(r, e); @@ -52779,10 +52400,11 @@ mod tests { unsafe fn test_mm_maskz_sub_round_ss() { let a = _mm_set_ps(1., 2., 10., 20.); let b = _mm_set_ps(3., 4., 30., 40.); - let r = _mm_maskz_sub_round_ss(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_maskz_sub_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); let e = _mm_set_ps(1., 2., 10., 0.); assert_eq_m128(r, e); - let r = _mm_maskz_sub_round_ss(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = + _mm_maskz_sub_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); let e = _mm_set_ps(1., 2., 10., -20.); assert_eq_m128(r, e); } @@ -52791,7 +52413,7 @@ mod tests { unsafe fn test_mm_sub_round_sd() { let a = _mm_set_pd(1., 2.); let b = _mm_set_pd(3., 4.); - let r = _mm_sub_round_sd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_sub_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); let e = _mm_set_pd(1., -2.); assert_eq_m128d(r, e); } @@ -52801,15 +52423,11 @@ mod tests { let src = _mm_set_pd(10., 11.); let a = _mm_set_pd(1., 2.); let b = _mm_set_pd(3., 4.); - let r = _mm_mask_sub_round_sd(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_mask_sub_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); let e = _mm_set_pd(1., 11.); assert_eq_m128d(r, e); - let r = _mm_mask_sub_round_sd( - src, - 0b11111111, - a, - b, - _MM_FROUND_TO_ZERO | 
_MM_FROUND_NO_EXC, + let r = _mm_mask_sub_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + src, 0b11111111, a, b, ); let e = _mm_set_pd(1., -2.); assert_eq_m128d(r, e); @@ -52819,10 +52437,11 @@ mod tests { unsafe fn test_mm_maskz_sub_round_sd() { let a = _mm_set_pd(1., 2.); let b = _mm_set_pd(3., 4.); - let r = _mm_maskz_sub_round_sd(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_maskz_sub_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); let e = _mm_set_pd(1., 0.); assert_eq_m128d(r, e); - let r = _mm_maskz_sub_round_sd(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = + _mm_maskz_sub_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); let e = _mm_set_pd(1., -2.); assert_eq_m128d(r, e); } @@ -52831,7 +52450,7 @@ mod tests { unsafe fn test_mm_mul_round_ss() { let a = _mm_set_ps(1., 2., 10., 20.); let b = _mm_set_ps(3., 4., 30., 40.); - let r = _mm_mul_round_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_mul_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); let e = _mm_set_ps(1., 2., 10., 800.); assert_eq_m128(r, e); } @@ -52841,15 +52460,11 @@ mod tests { let src = _mm_set_ps(10., 11., 100., 110.); let a = _mm_set_ps(1., 2., 10., 20.); let b = _mm_set_ps(3., 4., 30., 40.); - let r = _mm_mask_mul_round_ss(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_mask_mul_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); let e = _mm_set_ps(1., 2., 10., 110.); assert_eq_m128(r, e); - let r = _mm_mask_mul_round_ss( - src, - 0b11111111, - a, - b, - _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + let r = _mm_mask_mul_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + src, 0b11111111, a, b, ); let e = _mm_set_ps(1., 2., 10., 800.); assert_eq_m128(r, e); @@ -52859,10 +52474,11 @@ mod tests { unsafe fn test_mm_maskz_mul_round_ss() { let a = _mm_set_ps(1., 2., 10., 20.); let b = _mm_set_ps(3., 4., 30., 40.); - let r = 
_mm_maskz_mul_round_ss(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_maskz_mul_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); let e = _mm_set_ps(1., 2., 10., 0.); assert_eq_m128(r, e); - let r = _mm_maskz_mul_round_ss(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = + _mm_maskz_mul_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); let e = _mm_set_ps(1., 2., 10., 800.); assert_eq_m128(r, e); } @@ -52871,7 +52487,7 @@ mod tests { unsafe fn test_mm_mul_round_sd() { let a = _mm_set_pd(1., 2.); let b = _mm_set_pd(3., 4.); - let r = _mm_mul_round_sd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_mul_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); let e = _mm_set_pd(1., 8.); assert_eq_m128d(r, e); } @@ -52881,15 +52497,11 @@ mod tests { let src = _mm_set_pd(10., 11.); let a = _mm_set_pd(1., 2.); let b = _mm_set_pd(3., 4.); - let r = _mm_mask_mul_round_sd(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_mask_mul_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); let e = _mm_set_pd(1., 11.); assert_eq_m128d(r, e); - let r = _mm_mask_mul_round_sd( - src, - 0b11111111, - a, - b, - _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + let r = _mm_mask_mul_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + src, 0b11111111, a, b, ); let e = _mm_set_pd(1., 8.); assert_eq_m128d(r, e); @@ -52899,10 +52511,11 @@ mod tests { unsafe fn test_mm_maskz_mul_round_sd() { let a = _mm_set_pd(1., 2.); let b = _mm_set_pd(3., 4.); - let r = _mm_maskz_mul_round_sd(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_maskz_mul_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); let e = _mm_set_pd(1., 0.); assert_eq_m128d(r, e); - let r = _mm_maskz_mul_round_sd(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = + _mm_maskz_mul_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); let e = 
_mm_set_pd(1., 8.); assert_eq_m128d(r, e); } @@ -52911,7 +52524,7 @@ mod tests { unsafe fn test_mm_div_round_ss() { let a = _mm_set_ps(1., 2., 10., 20.); let b = _mm_set_ps(3., 4., 30., 40.); - let r = _mm_div_round_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_div_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); let e = _mm_set_ps(1., 2., 10., 0.5); assert_eq_m128(r, e); } @@ -52921,15 +52534,11 @@ mod tests { let src = _mm_set_ps(10., 11., 100., 110.); let a = _mm_set_ps(1., 2., 10., 20.); let b = _mm_set_ps(3., 4., 30., 40.); - let r = _mm_mask_div_round_ss(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_mask_div_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); let e = _mm_set_ps(1., 2., 10., 110.); assert_eq_m128(r, e); - let r = _mm_mask_div_round_ss( - src, - 0b11111111, - a, - b, - _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + let r = _mm_mask_div_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + src, 0b11111111, a, b, ); let e = _mm_set_ps(1., 2., 10., 0.5); assert_eq_m128(r, e); @@ -52939,10 +52548,11 @@ mod tests { unsafe fn test_mm_maskz_div_round_ss() { let a = _mm_set_ps(1., 2., 10., 20.); let b = _mm_set_ps(3., 4., 30., 40.); - let r = _mm_maskz_div_round_ss(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_maskz_div_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); let e = _mm_set_ps(1., 2., 10., 0.); assert_eq_m128(r, e); - let r = _mm_maskz_div_round_ss(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = + _mm_maskz_div_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); let e = _mm_set_ps(1., 2., 10., 0.5); assert_eq_m128(r, e); } @@ -52951,7 +52561,7 @@ mod tests { unsafe fn test_mm_div_round_sd() { let a = _mm_set_pd(1., 2.); let b = _mm_set_pd(3., 4.); - let r = _mm_div_round_sd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_div_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, 
b); let e = _mm_set_pd(1., 0.5); assert_eq_m128d(r, e); } @@ -52961,15 +52571,11 @@ mod tests { let src = _mm_set_pd(10., 11.); let a = _mm_set_pd(1., 2.); let b = _mm_set_pd(3., 4.); - let r = _mm_mask_div_round_sd(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_mask_div_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); let e = _mm_set_pd(1., 11.); assert_eq_m128d(r, e); - let r = _mm_mask_div_round_sd( - src, - 0b11111111, - a, - b, - _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + let r = _mm_mask_div_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + src, 0b11111111, a, b, ); let e = _mm_set_pd(1., 0.5); assert_eq_m128d(r, e); @@ -52979,10 +52585,11 @@ mod tests { unsafe fn test_mm_maskz_div_round_sd() { let a = _mm_set_pd(1., 2.); let b = _mm_set_pd(3., 4.); - let r = _mm_maskz_div_round_sd(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_maskz_div_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); let e = _mm_set_pd(1., 0.); assert_eq_m128d(r, e); - let r = _mm_maskz_div_round_sd(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = + _mm_maskz_div_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); let e = _mm_set_pd(1., 0.5); assert_eq_m128d(r, e); } @@ -53123,7 +52730,7 @@ mod tests { unsafe fn test_mm_sqrt_round_ss() { let a = _mm_set_ps(1., 2., 10., 20.); let b = _mm_set_ps(3., 4., 30., 4.); - let r = _mm_sqrt_round_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_sqrt_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); let e = _mm_set_ps(1., 2., 10., 2.); assert_eq_m128(r, e); } @@ -53133,15 +52740,11 @@ mod tests { let src = _mm_set_ps(10., 11., 100., 110.); let a = _mm_set_ps(1., 2., 10., 20.); let b = _mm_set_ps(3., 4., 30., 4.); - let r = _mm_mask_sqrt_round_ss(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_mask_sqrt_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); let e = 
_mm_set_ps(1., 2., 10., 110.); assert_eq_m128(r, e); - let r = _mm_mask_sqrt_round_ss( - src, - 0b11111111, - a, - b, - _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + let r = _mm_mask_sqrt_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + src, 0b11111111, a, b, ); let e = _mm_set_ps(1., 2., 10., 2.); assert_eq_m128(r, e); @@ -53151,10 +52754,11 @@ mod tests { unsafe fn test_mm_maskz_sqrt_round_ss() { let a = _mm_set_ps(1., 2., 10., 20.); let b = _mm_set_ps(3., 4., 30., 4.); - let r = _mm_maskz_sqrt_round_ss(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_maskz_sqrt_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); let e = _mm_set_ps(1., 2., 10., 0.); assert_eq_m128(r, e); - let r = _mm_maskz_sqrt_round_ss(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = + _mm_maskz_sqrt_round_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); let e = _mm_set_ps(1., 2., 10., 2.); assert_eq_m128(r, e); } @@ -53163,7 +52767,7 @@ mod tests { unsafe fn test_mm_sqrt_round_sd() { let a = _mm_set_pd(1., 2.); let b = _mm_set_pd(3., 4.); - let r = _mm_sqrt_round_sd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_sqrt_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); let e = _mm_set_pd(1., 2.); assert_eq_m128d(r, e); } @@ -53173,15 +52777,11 @@ mod tests { let src = _mm_set_pd(10., 11.); let a = _mm_set_pd(1., 2.); let b = _mm_set_pd(3., 4.); - let r = _mm_mask_sqrt_round_sd(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_mask_sqrt_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, 0, a, b); let e = _mm_set_pd(1., 11.); assert_eq_m128d(r, e); - let r = _mm_mask_sqrt_round_sd( - src, - 0b11111111, - a, - b, - _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + let r = _mm_mask_sqrt_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + src, 0b11111111, a, b, ); let e = _mm_set_pd(1., 2.); assert_eq_m128d(r, e); @@ -53191,10 +52791,11 @@ mod tests { unsafe fn 
test_mm_maskz_sqrt_round_sd() { let a = _mm_set_pd(1., 2.); let b = _mm_set_pd(3., 4.); - let r = _mm_maskz_sqrt_round_sd(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_maskz_sqrt_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); let e = _mm_set_pd(1., 0.); assert_eq_m128d(r, e); - let r = _mm_maskz_sqrt_round_sd(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = + _mm_maskz_sqrt_round_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0b11111111, a, b); let e = _mm_set_pd(1., 2.); assert_eq_m128d(r, e); } @@ -53473,7 +53074,7 @@ mod tests { unsafe fn test_mm_scalef_round_ss() { let a = _mm_set1_ps(1.); let b = _mm_set1_ps(3.); - let r = _mm_scalef_round_ss(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_scalef_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); let e = _mm_set_ps(1., 1., 1., 8.); assert_eq_m128(r, e); } @@ -53482,15 +53083,13 @@ mod tests { unsafe fn test_mm_mask_scalef_round_ss() { let a = _mm_set1_ps(1.); let b = _mm_set1_ps(3.); - let r = _mm_mask_scalef_round_ss(a, 0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_mask_scalef_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, a, b, + ); let e = _mm_set_ps(1., 1., 1., 1.); assert_eq_m128(r, e); - let r = _mm_mask_scalef_round_ss( - a, - 0b11111111, - a, - b, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_mask_scalef_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b11111111, a, b, ); let e = _mm_set_ps(1., 1., 1., 8.); assert_eq_m128(r, e); @@ -53500,14 +53099,12 @@ mod tests { unsafe fn test_mm_maskz_scalef_round_ss() { let a = _mm_set1_ps(1.); let b = _mm_set1_ps(3.); - let r = _mm_maskz_scalef_round_ss(0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = + _mm_maskz_scalef_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); let e = _mm_set_ps(1., 1., 1., 0.); assert_eq_m128(r, e); - let r = 
_mm_maskz_scalef_round_ss( - 0b11111111, - a, - b, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_maskz_scalef_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111, a, b, ); let e = _mm_set_ps(1., 1., 1., 8.); assert_eq_m128(r, e); @@ -53517,7 +53114,7 @@ mod tests { unsafe fn test_mm_scalef_round_sd() { let a = _mm_set1_pd(1.); let b = _mm_set1_pd(3.); - let r = _mm_scalef_round_sd(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_scalef_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b); let e = _mm_set_pd(1., 8.); assert_eq_m128d(r, e); } @@ -53526,15 +53123,13 @@ mod tests { unsafe fn test_mm_mask_scalef_round_sd() { let a = _mm_set1_pd(1.); let b = _mm_set1_pd(3.); - let r = _mm_mask_scalef_round_sd(a, 0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_mask_scalef_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, a, b, + ); let e = _mm_set_pd(1., 1.); assert_eq_m128d(r, e); - let r = _mm_mask_scalef_round_sd( - a, - 0b11111111, - a, - b, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_mask_scalef_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b11111111, a, b, ); let e = _mm_set_pd(1., 8.); assert_eq_m128d(r, e); @@ -53544,14 +53139,12 @@ mod tests { unsafe fn test_mm_maskz_scalef_round_sd() { let a = _mm_set1_pd(1.); let b = _mm_set1_pd(3.); - let r = _mm_maskz_scalef_round_sd(0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = + _mm_maskz_scalef_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b); let e = _mm_set_pd(1., 0.); assert_eq_m128d(r, e); - let r = _mm_maskz_scalef_round_sd( - 0b11111111, - a, - b, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_maskz_scalef_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111, a, b, ); let e = _mm_set_pd(1., 8.); assert_eq_m128d(r, e); @@ -53562,7 +53155,7 @@ mod tests { let a = _mm_set1_ps(1.); 
let b = _mm_set1_ps(2.); let c = _mm_set1_ps(3.); - let r = _mm_fmadd_round_ss(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_fmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); let e = _mm_set_ps(1., 1., 1., 5.); assert_eq_m128(r, e); } @@ -53572,14 +53165,12 @@ mod tests { let a = _mm_set1_ps(1.); let b = _mm_set1_ps(2.); let c = _mm_set1_ps(3.); - let r = _mm_mask_fmadd_round_ss(a, 0, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_mask_fmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); assert_eq_m128(r, a); - let r = _mm_mask_fmadd_round_ss( - a, - 0b11111111, - b, - c, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_mask_fmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b11111111, b, c, ); let e = _mm_set_ps(1., 1., 1., 5.); assert_eq_m128(r, e); @@ -53590,15 +53181,13 @@ mod tests { let a = _mm_set1_ps(1.); let b = _mm_set1_ps(2.); let c = _mm_set1_ps(3.); - let r = _mm_maskz_fmadd_round_ss(0, a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_maskz_fmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); let e = _mm_set_ps(1., 1., 1., 0.); assert_eq_m128(r, e); - let r = _mm_maskz_fmadd_round_ss( - 0b11111111, - a, - b, - c, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_maskz_fmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111, a, b, c, ); let e = _mm_set_ps(1., 1., 1., 5.); assert_eq_m128(r, e); @@ -53609,14 +53198,12 @@ mod tests { let a = _mm_set1_ps(1.); let b = _mm_set1_ps(2.); let c = _mm_set1_ps(3.); - let r = _mm_mask3_fmadd_round_ss(a, b, c, 0, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_mask3_fmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); assert_eq_m128(r, c); - let r = _mm_mask3_fmadd_round_ss( - a, - b, - c, - 0b11111111, - 
_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_mask3_fmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0b11111111, ); let e = _mm_set_ps(3., 3., 3., 5.); assert_eq_m128(r, e); @@ -53627,7 +53214,7 @@ mod tests { let a = _mm_set1_pd(1.); let b = _mm_set1_pd(2.); let c = _mm_set1_pd(3.); - let r = _mm_fmadd_round_sd(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_fmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); let e = _mm_set_pd(1., 5.); assert_eq_m128d(r, e); } @@ -53637,14 +53224,12 @@ mod tests { let a = _mm_set1_pd(1.); let b = _mm_set1_pd(2.); let c = _mm_set1_pd(3.); - let r = _mm_mask_fmadd_round_sd(a, 0, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_mask_fmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); assert_eq_m128d(r, a); - let r = _mm_mask_fmadd_round_sd( - a, - 0b11111111, - b, - c, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_mask_fmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b11111111, b, c, ); let e = _mm_set_pd(1., 5.); assert_eq_m128d(r, e); @@ -53655,15 +53240,13 @@ mod tests { let a = _mm_set1_pd(1.); let b = _mm_set1_pd(2.); let c = _mm_set1_pd(3.); - let r = _mm_maskz_fmadd_round_sd(0, a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_maskz_fmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); let e = _mm_set_pd(1., 0.); assert_eq_m128d(r, e); - let r = _mm_maskz_fmadd_round_sd( - 0b11111111, - a, - b, - c, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_maskz_fmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111, a, b, c, ); let e = _mm_set_pd(1., 5.); assert_eq_m128d(r, e); @@ -53674,14 +53257,12 @@ mod tests { let a = _mm_set1_pd(1.); let b = _mm_set1_pd(2.); let c = _mm_set1_pd(3.); - let r = _mm_mask3_fmadd_round_sd(a, b, c, 0, 
_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_mask3_fmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); assert_eq_m128d(r, c); - let r = _mm_mask3_fmadd_round_sd( - a, - b, - c, - 0b11111111, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_mask3_fmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0b11111111, ); let e = _mm_set_pd(3., 5.); assert_eq_m128d(r, e); @@ -53692,7 +53273,7 @@ mod tests { let a = _mm_set1_ps(1.); let b = _mm_set1_ps(2.); let c = _mm_set1_ps(3.); - let r = _mm_fmsub_round_ss(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_fmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); let e = _mm_set_ps(1., 1., 1., -1.); assert_eq_m128(r, e); } @@ -53702,14 +53283,12 @@ mod tests { let a = _mm_set1_ps(1.); let b = _mm_set1_ps(2.); let c = _mm_set1_ps(3.); - let r = _mm_mask_fmsub_round_ss(a, 0, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_mask_fmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); assert_eq_m128(r, a); - let r = _mm_mask_fmsub_round_ss( - a, - 0b11111111, - b, - c, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_mask_fmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b11111111, b, c, ); let e = _mm_set_ps(1., 1., 1., -1.); assert_eq_m128(r, e); @@ -53720,15 +53299,13 @@ mod tests { let a = _mm_set1_ps(1.); let b = _mm_set1_ps(2.); let c = _mm_set1_ps(3.); - let r = _mm_maskz_fmsub_round_ss(0, a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_maskz_fmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); let e = _mm_set_ps(1., 1., 1., 0.); assert_eq_m128(r, e); - let r = _mm_maskz_fmsub_round_ss( - 0b11111111, - a, - b, - c, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_maskz_fmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | 
_MM_FROUND_NO_EXC }>( + 0b11111111, a, b, c, ); let e = _mm_set_ps(1., 1., 1., -1.); assert_eq_m128(r, e); @@ -53739,14 +53316,12 @@ mod tests { let a = _mm_set1_ps(1.); let b = _mm_set1_ps(2.); let c = _mm_set1_ps(3.); - let r = _mm_mask3_fmsub_round_ss(a, b, c, 0, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_mask3_fmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); assert_eq_m128(r, c); - let r = _mm_mask3_fmsub_round_ss( - a, - b, - c, - 0b11111111, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_mask3_fmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0b11111111, ); let e = _mm_set_ps(3., 3., 3., -1.); assert_eq_m128(r, e); @@ -53757,7 +53332,7 @@ mod tests { let a = _mm_set1_pd(1.); let b = _mm_set1_pd(2.); let c = _mm_set1_pd(3.); - let r = _mm_fmsub_round_sd(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_fmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); let e = _mm_set_pd(1., -1.); assert_eq_m128d(r, e); } @@ -53767,14 +53342,12 @@ mod tests { let a = _mm_set1_pd(1.); let b = _mm_set1_pd(2.); let c = _mm_set1_pd(3.); - let r = _mm_mask_fmsub_round_sd(a, 0, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_mask_fmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); assert_eq_m128d(r, a); - let r = _mm_mask_fmsub_round_sd( - a, - 0b11111111, - b, - c, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_mask_fmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b11111111, b, c, ); let e = _mm_set_pd(1., -1.); assert_eq_m128d(r, e); @@ -53785,15 +53358,13 @@ mod tests { let a = _mm_set1_pd(1.); let b = _mm_set1_pd(2.); let c = _mm_set1_pd(3.); - let r = _mm_maskz_fmsub_round_sd(0, a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_maskz_fmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, 
c, + ); let e = _mm_set_pd(1., 0.); assert_eq_m128d(r, e); - let r = _mm_maskz_fmsub_round_sd( - 0b11111111, - a, - b, - c, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_maskz_fmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111, a, b, c, ); let e = _mm_set_pd(1., -1.); assert_eq_m128d(r, e); @@ -53804,14 +53375,12 @@ mod tests { let a = _mm_set1_pd(1.); let b = _mm_set1_pd(2.); let c = _mm_set1_pd(3.); - let r = _mm_mask3_fmsub_round_sd(a, b, c, 0, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_mask3_fmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); assert_eq_m128d(r, c); - let r = _mm_mask3_fmsub_round_sd( - a, - b, - c, - 0b11111111, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_mask3_fmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0b11111111, ); let e = _mm_set_pd(3., -1.); assert_eq_m128d(r, e); @@ -53822,7 +53391,7 @@ mod tests { let a = _mm_set1_ps(1.); let b = _mm_set1_ps(2.); let c = _mm_set1_ps(3.); - let r = _mm_fnmadd_round_ss(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_fnmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); let e = _mm_set_ps(1., 1., 1., 1.); assert_eq_m128(r, e); } @@ -53832,14 +53401,12 @@ mod tests { let a = _mm_set1_ps(1.); let b = _mm_set1_ps(2.); let c = _mm_set1_ps(3.); - let r = _mm_mask_fnmadd_round_ss(a, 0, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_mask_fnmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); assert_eq_m128(r, a); - let r = _mm_mask_fnmadd_round_ss( - a, - 0b11111111, - b, - c, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_mask_fnmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b11111111, b, c, ); let e = _mm_set_ps(1., 1., 1., 1.); assert_eq_m128(r, e); @@ -53850,16 +53417,13 @@ mod tests { let a = _mm_set1_ps(1.); 
let b = _mm_set1_ps(2.); let c = _mm_set1_ps(3.); - let r = - _mm_maskz_fnmadd_round_ss(0, a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_maskz_fnmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); let e = _mm_set_ps(1., 1., 1., 0.); assert_eq_m128(r, e); - let r = _mm_maskz_fnmadd_round_ss( - 0b11111111, - a, - b, - c, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_maskz_fnmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111, a, b, c, ); let e = _mm_set_ps(1., 1., 1., 1.); assert_eq_m128(r, e); @@ -53870,15 +53434,12 @@ mod tests { let a = _mm_set1_ps(1.); let b = _mm_set1_ps(2.); let c = _mm_set1_ps(3.); - let r = - _mm_mask3_fnmadd_round_ss(a, b, c, 0, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_mask3_fnmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); assert_eq_m128(r, c); - let r = _mm_mask3_fnmadd_round_ss( - a, - b, - c, - 0b11111111, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_mask3_fnmadd_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0b11111111, ); let e = _mm_set_ps(3., 3., 3., 1.); assert_eq_m128(r, e); @@ -53889,7 +53450,7 @@ mod tests { let a = _mm_set1_pd(1.); let b = _mm_set1_pd(2.); let c = _mm_set1_pd(3.); - let r = _mm_fnmadd_round_sd(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_fnmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); let e = _mm_set_pd(1., 1.); assert_eq_m128d(r, e); } @@ -53899,14 +53460,12 @@ mod tests { let a = _mm_set1_pd(1.); let b = _mm_set1_pd(2.); let c = _mm_set1_pd(3.); - let r = _mm_mask_fnmadd_round_sd(a, 0, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_mask_fnmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); assert_eq_m128d(r, a); - let r = _mm_mask_fnmadd_round_sd( - a, - 0b11111111, - b, - c, - 
_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_mask_fnmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b11111111, b, c, ); let e = _mm_set_pd(1., 1.); assert_eq_m128d(r, e); @@ -53917,16 +53476,13 @@ mod tests { let a = _mm_set1_pd(1.); let b = _mm_set1_pd(2.); let c = _mm_set1_pd(3.); - let r = - _mm_maskz_fnmadd_round_sd(0, a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_maskz_fnmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); let e = _mm_set_pd(1., 0.); assert_eq_m128d(r, e); - let r = _mm_maskz_fnmadd_round_sd( - 0b11111111, - a, - b, - c, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_maskz_fnmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111, a, b, c, ); let e = _mm_set_pd(1., 1.); assert_eq_m128d(r, e); @@ -53937,15 +53493,12 @@ mod tests { let a = _mm_set1_pd(1.); let b = _mm_set1_pd(2.); let c = _mm_set1_pd(3.); - let r = - _mm_mask3_fnmadd_round_sd(a, b, c, 0, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_mask3_fnmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); assert_eq_m128d(r, c); - let r = _mm_mask3_fnmadd_round_sd( - a, - b, - c, - 0b11111111, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_mask3_fnmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0b11111111, ); let e = _mm_set_pd(3., 1.); assert_eq_m128d(r, e); @@ -53956,7 +53509,7 @@ mod tests { let a = _mm_set1_ps(1.); let b = _mm_set1_ps(2.); let c = _mm_set1_ps(3.); - let r = _mm_fnmsub_round_ss(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_fnmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); let e = _mm_set_ps(1., 1., 1., -5.); assert_eq_m128(r, e); } @@ -53966,14 +53519,12 @@ mod tests { let a = _mm_set1_ps(1.); let b = _mm_set1_ps(2.); let c = _mm_set1_ps(3.); - let r = _mm_mask_fnmsub_round_ss(a, 0, b, 
c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_mask_fnmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); assert_eq_m128(r, a); - let r = _mm_mask_fnmsub_round_ss( - a, - 0b11111111, - b, - c, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_mask_fnmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b11111111, b, c, ); let e = _mm_set_ps(1., 1., 1., -5.); assert_eq_m128(r, e); @@ -53984,16 +53535,13 @@ mod tests { let a = _mm_set1_ps(1.); let b = _mm_set1_ps(2.); let c = _mm_set1_ps(3.); - let r = - _mm_maskz_fnmsub_round_ss(0, a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_maskz_fnmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); let e = _mm_set_ps(1., 1., 1., 0.); assert_eq_m128(r, e); - let r = _mm_maskz_fnmsub_round_ss( - 0b11111111, - a, - b, - c, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_maskz_fnmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111, a, b, c, ); let e = _mm_set_ps(1., 1., 1., -5.); assert_eq_m128(r, e); @@ -54004,15 +53552,12 @@ mod tests { let a = _mm_set1_ps(1.); let b = _mm_set1_ps(2.); let c = _mm_set1_ps(3.); - let r = - _mm_mask3_fnmsub_round_ss(a, b, c, 0, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_mask3_fnmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); assert_eq_m128(r, c); - let r = _mm_mask3_fnmsub_round_ss( - a, - b, - c, - 0b11111111, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_mask3_fnmsub_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0b11111111, ); let e = _mm_set_ps(3., 3., 3., -5.); assert_eq_m128(r, e); @@ -54023,7 +53568,7 @@ mod tests { let a = _mm_set1_pd(1.); let b = _mm_set1_pd(2.); let c = _mm_set1_pd(3.); - let r = _mm_fnmsub_round_sd(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = 
_mm_fnmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c); let e = _mm_set_pd(1., -5.); assert_eq_m128d(r, e); } @@ -54033,14 +53578,12 @@ mod tests { let a = _mm_set1_pd(1.); let b = _mm_set1_pd(2.); let c = _mm_set1_pd(3.); - let r = _mm_mask_fnmsub_round_sd(a, 0, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_mask_fnmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0, b, c, + ); assert_eq_m128d(r, a); - let r = _mm_mask_fnmsub_round_sd( - a, - 0b11111111, - b, - c, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_mask_fnmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, 0b11111111, b, c, ); let e = _mm_set_pd(1., -5.); assert_eq_m128d(r, e); @@ -54051,16 +53594,13 @@ mod tests { let a = _mm_set1_pd(1.); let b = _mm_set1_pd(2.); let c = _mm_set1_pd(3.); - let r = - _mm_maskz_fnmsub_round_sd(0, a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_maskz_fnmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0, a, b, c, + ); let e = _mm_set_pd(1., 0.); assert_eq_m128d(r, e); - let r = _mm_maskz_fnmsub_round_sd( - 0b11111111, - a, - b, - c, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_maskz_fnmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + 0b11111111, a, b, c, ); let e = _mm_set_pd(1., -5.); assert_eq_m128d(r, e); @@ -54071,15 +53611,12 @@ mod tests { let a = _mm_set1_pd(1.); let b = _mm_set1_pd(2.); let c = _mm_set1_pd(3.); - let r = - _mm_mask3_fnmsub_round_sd(a, b, c, 0, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm_mask3_fnmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0, + ); assert_eq_m128d(r, c); - let r = _mm_mask3_fnmsub_round_sd( - a, - b, - c, - 0b11111111, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + let r = _mm_mask3_fnmsub_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>( + a, b, c, 0b11111111, ); let e = 
_mm_set_pd(3., -5.); assert_eq_m128d(r, e); @@ -54299,7 +53836,7 @@ mod tests { unsafe fn test_mm_cvt_roundsd_ss() { let a = _mm_set_ps(0., -0.5, 1., -1.5); let b = _mm_set_pd(6., -7.5); - let r = _mm_cvt_roundsd_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_cvt_roundsd_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); let e = _mm_set_ps(0., -0.5, 1., -7.5); assert_eq_m128(r, e); } @@ -54308,10 +53845,11 @@ mod tests { unsafe fn test_mm_mask_cvt_roundsd_ss() { let a = _mm_set_ps(0., -0.5, 1., -1.5); let b = _mm_set_pd(6., -7.5); - let r = _mm_mask_cvt_roundsd_ss(a, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_mask_cvt_roundsd_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, 0, a, b); assert_eq_m128(r, a); - let r = - _mm_mask_cvt_roundsd_ss(a, 0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_mask_cvt_roundsd_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + a, 0b11111111, a, b, + ); let e = _mm_set_ps(0., -0.5, 1., -7.5); assert_eq_m128(r, e); } @@ -54320,10 +53858,12 @@ mod tests { unsafe fn test_mm_maskz_cvt_roundsd_ss() { let a = _mm_set_ps(0., -0.5, 1., -1.5); let b = _mm_set_pd(6., -7.5); - let r = _mm_maskz_cvt_roundsd_ss(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_maskz_cvt_roundsd_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(0, a, b); let e = _mm_set_ps(0., -0.5, 1., 0.); assert_eq_m128(r, e); - let r = _mm_maskz_cvt_roundsd_ss(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_maskz_cvt_roundsd_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>( + 0b11111111, a, b, + ); let e = _mm_set_ps(0., -0.5, 1., -7.5); assert_eq_m128(r, e); } @@ -54331,7 +53871,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvt_roundss_si32() { let a = _mm_set_ps(0., -0.5, 1., -1.5); - let r = _mm_cvt_roundss_si32(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_cvt_roundss_si32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); let e: i32 
= -1; assert_eq!(r, e); } @@ -54339,7 +53879,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvt_roundss_i32() { let a = _mm_set_ps(0., -0.5, 1., -1.5); - let r = _mm_cvt_roundss_i32(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_cvt_roundss_i32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); let e: i32 = -1; assert_eq!(r, e); } @@ -54347,7 +53887,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvt_roundss_u32() { let a = _mm_set_ps(0., -0.5, 1., -1.5); - let r = _mm_cvt_roundss_u32(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_cvt_roundss_u32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); let e: u32 = u32::MAX; assert_eq!(r, e); } @@ -54371,7 +53911,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvt_roundsd_si32() { let a = _mm_set_pd(1., -1.5); - let r = _mm_cvt_roundsd_si32(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_cvt_roundsd_si32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); let e: i32 = -1; assert_eq!(r, e); } @@ -54379,7 +53919,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvt_roundsd_i32() { let a = _mm_set_pd(1., -1.5); - let r = _mm_cvt_roundsd_i32(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_cvt_roundsd_i32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); let e: i32 = -1; assert_eq!(r, e); } @@ -54387,7 +53927,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvt_roundsd_u32() { let a = _mm_set_pd(1., -1.5); - let r = _mm_cvt_roundsd_u32(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_cvt_roundsd_u32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); let e: u32 = u32::MAX; assert_eq!(r, e); } @@ -54412,7 +53952,7 @@ mod tests { unsafe fn test_mm_cvt_roundi32_ss() { let a = _mm_set_ps(0., -0.5, 1., -1.5); let b: i32 = 9; - let r = _mm_cvt_roundi32_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_cvt_roundi32_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); let e = 
_mm_set_ps(0., -0.5, 1., 9.); assert_eq_m128(r, e); } @@ -54421,7 +53961,7 @@ mod tests { unsafe fn test_mm_cvt_roundsi32_ss() { let a = _mm_set_ps(0., -0.5, 1., -1.5); let b: i32 = 9; - let r = _mm_cvt_roundsi32_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_cvt_roundsi32_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); let e = _mm_set_ps(0., -0.5, 1., 9.); assert_eq_m128(r, e); } @@ -54430,7 +53970,7 @@ mod tests { unsafe fn test_mm_cvt_roundu32_ss() { let a = _mm_set_ps(0., -0.5, 1., -1.5); let b: u32 = 9; - let r = _mm_cvt_roundu32_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_cvt_roundu32_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); let e = _mm_set_ps(0., -0.5, 1., 9.); assert_eq_m128(r, e); } diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index caaf3e6d73..ae6202bc73 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -6288,7 +6288,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvt_roundpd_ps() { let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); - let r = _mm512_cvt_roundpd_ps(a, _MM_FROUND_CUR_DIRECTION); + let r = _mm512_cvt_roundpd_ps::<_MM_FROUND_CUR_DIRECTION>(a); let e = _mm256_setr_ps(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); assert_eq_m256(r, e); } @@ -6297,9 +6297,9 @@ mod tests { unsafe fn test_mm512_mask_cvt_roundpd_ps() { let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); let src = _mm256_set1_ps(0.); - let r = _mm512_mask_cvt_roundpd_ps(src, 0, a, _MM_FROUND_CUR_DIRECTION); + let r = _mm512_mask_cvt_roundpd_ps::<_MM_FROUND_CUR_DIRECTION>(src, 0, a); assert_eq_m256(r, src); - let r = _mm512_mask_cvt_roundpd_ps(src, 0b00001111, a, _MM_FROUND_CUR_DIRECTION); + let r = _mm512_mask_cvt_roundpd_ps::<_MM_FROUND_CUR_DIRECTION>(src, 0b00001111, a); let e = _mm256_setr_ps(0., -1.5, 2., -3.5, 0., 0., 0., 0.); assert_eq_m256(r, e); } @@ -6307,9 +6307,9 @@ mod tests 
{ #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_cvt_roundpd_ps() { let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); - let r = _mm512_maskz_cvt_roundpd_ps(0, a, _MM_FROUND_CUR_DIRECTION); + let r = _mm512_maskz_cvt_roundpd_ps::<_MM_FROUND_CUR_DIRECTION>(0, a); assert_eq_m256(r, _mm256_setzero_ps()); - let r = _mm512_maskz_cvt_roundpd_ps(0b00001111, a, _MM_FROUND_CUR_DIRECTION); + let r = _mm512_maskz_cvt_roundpd_ps::<_MM_FROUND_CUR_DIRECTION>(0b00001111, a); let e = _mm256_setr_ps(0., -1.5, 2., -3.5, 0., 0., 0., 0.); assert_eq_m256(r, e); } @@ -6317,7 +6317,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvt_roundpd_epi32() { let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); - let r = _mm512_cvt_roundpd_epi32(a, _MM_FROUND_CUR_DIRECTION); + let r = _mm512_cvt_roundpd_epi32::<_MM_FROUND_CUR_DIRECTION>(a); let e = _mm256_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8); assert_eq_m256i(r, e); } @@ -6326,9 +6326,9 @@ mod tests { unsafe fn test_mm512_mask_cvt_roundpd_epi32() { let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); let src = _mm256_set1_epi32(0); - let r = _mm512_mask_cvt_roundpd_epi32(src, 0, a, _MM_FROUND_CUR_DIRECTION); + let r = _mm512_mask_cvt_roundpd_epi32::<_MM_FROUND_CUR_DIRECTION>(src, 0, a); assert_eq_m256i(r, src); - let r = _mm512_mask_cvt_roundpd_epi32(src, 0b00001111, a, _MM_FROUND_CUR_DIRECTION); + let r = _mm512_mask_cvt_roundpd_epi32::<_MM_FROUND_CUR_DIRECTION>(src, 0b00001111, a); let e = _mm256_setr_epi32(0, -2, 2, -4, 0, 0, 0, 0); assert_eq_m256i(r, e); } @@ -6336,9 +6336,9 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_cvt_roundpd_epi32() { let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); - let r = _mm512_maskz_cvt_roundpd_epi32(0, a, _MM_FROUND_CUR_DIRECTION); + let r = _mm512_maskz_cvt_roundpd_epi32::<_MM_FROUND_CUR_DIRECTION>(0, a); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = 
_mm512_maskz_cvt_roundpd_epi32(0b00001111, a, _MM_FROUND_CUR_DIRECTION); + let r = _mm512_maskz_cvt_roundpd_epi32::<_MM_FROUND_CUR_DIRECTION>(0b00001111, a); let e = _mm256_setr_epi32(0, -2, 2, -4, 0, 0, 0, 0); assert_eq_m256i(r, e); } @@ -6346,7 +6346,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvt_roundpd_epu32() { let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); - let r = _mm512_cvt_roundpd_epu32(a, _MM_FROUND_CUR_DIRECTION); + let r = _mm512_cvt_roundpd_epu32::<_MM_FROUND_CUR_DIRECTION>(a); let e = _mm256_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1); assert_eq_m256i(r, e); } @@ -6355,9 +6355,9 @@ mod tests { unsafe fn test_mm512_mask_cvt_roundpd_epu32() { let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); let src = _mm256_set1_epi32(0); - let r = _mm512_mask_cvt_roundpd_epu32(src, 0, a, _MM_FROUND_CUR_DIRECTION); + let r = _mm512_mask_cvt_roundpd_epu32::<_MM_FROUND_CUR_DIRECTION>(src, 0, a); assert_eq_m256i(r, src); - let r = _mm512_mask_cvt_roundpd_epu32(src, 0b00001111, a, _MM_FROUND_CUR_DIRECTION); + let r = _mm512_mask_cvt_roundpd_epu32::<_MM_FROUND_CUR_DIRECTION>(src, 0b00001111, a); let e = _mm256_setr_epi32(0, -1, 2, -1, 0, 0, 0, 0); assert_eq_m256i(r, e); } @@ -6365,9 +6365,9 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_cvt_roundpd_epu32() { let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); - let r = _mm512_maskz_cvt_roundpd_epu32(0, a, _MM_FROUND_CUR_DIRECTION); + let r = _mm512_maskz_cvt_roundpd_epu32::<_MM_FROUND_CUR_DIRECTION>(0, a); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm512_maskz_cvt_roundpd_epu32(0b00001111, a, _MM_FROUND_CUR_DIRECTION); + let r = _mm512_maskz_cvt_roundpd_epu32::<_MM_FROUND_CUR_DIRECTION>(0b00001111, a); let e = _mm256_setr_epi32(0, -1, 2, -1, 0, 0, 0, 0); assert_eq_m256i(r, e); } From 76acb956f5677458533530ae9db84dacda26bf8a Mon Sep 17 00:00:00 2001 From: jironglin Date: Wed, 3 Mar 2021 15:46:38 +0000 Subject: [PATCH 
019/123] cvt_roundps_pd; cvt_roundps_ph; cvt_roundph_ps; cvtps_ph; cvtt_roundps,pd_epi32,epu32; mm_max,min_round_ss,sd; mm_getexp_ss,sd; mm_cvt_roundss_sd; cvt_roundss_si32,i32,u32; mm_cvtt_roundsd_si32,i32,u32 --- crates/core_arch/src/x86/avx512f.rs | 905 ++++++++++--------------- crates/core_arch/src/x86_64/avx512f.rs | 30 +- 2 files changed, 375 insertions(+), 560 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 7911157eb2..bcd826d700 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -13533,17 +13533,13 @@ pub unsafe fn _mm512_maskz_cvt_roundps_epu32( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundps_pd&expand=1347) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtps2pd, sae = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm512_cvt_roundps_pd(a: __m256, sae: i32) -> __m512d { +#[cfg_attr(test, assert_instr(vcvtps2pd, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_cvt_roundps_pd(a: __m256) -> __m512d { + static_assert_sae!(SAE); let a = a.as_f32x8(); let zero = _mm512_setzero_pd().as_f64x8(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtps2pd(a, zero, 0b11111111, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtps2pd(a, zero, 0b11111111, SAE); transmute(r) } @@ -13553,22 +13549,17 @@ pub unsafe fn _mm512_cvt_roundps_pd(a: __m256, sae: i32) -> __m512d { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundps_epi32&expand=1336) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtps2pd, sae = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_cvt_roundps_pd( +#[cfg_attr(test, assert_instr(vcvtps2pd, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_cvt_roundps_pd( src: __m512d, k: __mmask8, a: __m256, - sae: i32, ) -> __m512d { + static_assert_sae!(SAE); let a = a.as_f32x8(); let src = src.as_f64x8(); - macro_rules! call { - ($imm4:expr) => { - vcvtps2pd(a, src, k, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtps2pd(a, src, k, SAE); transmute(r) } @@ -13578,17 +13569,13 @@ pub unsafe fn _mm512_mask_cvt_roundps_pd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundps_epi32&expand=1337) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtps2pd, sae = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_cvt_roundps_pd(k: __mmask8, a: __m256, sae: i32) -> __m512d { +#[cfg_attr(test, assert_instr(vcvtps2pd, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_cvt_roundps_pd(k: __mmask8, a: __m256) -> __m512d { + static_assert_sae!(SAE); let a = a.as_f32x8(); let zero = _mm512_setzero_pd().as_f64x8(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtps2pd(a, zero, k, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtps2pd(a, zero, k, SAE); transmute(r) } @@ -13956,17 +13943,13 @@ pub unsafe fn _mm512_maskz_cvt_roundepu32_ps( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundps_ph&expand=1354) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtps2ph, sae = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm512_cvt_roundps_ph(a: __m512, sae: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vcvtps2ph, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_cvt_roundps_ph(a: __m512) -> __m256i { + static_assert_sae!(SAE); let a = a.as_f32x16(); let zero = _mm256_setzero_si256().as_i16x16(); - macro_rules! call { - ($imm4:expr) => { - vcvtps2ph(a, $imm4, zero, 0b11111111_11111111) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtps2ph(a, SAE, zero, 0b11111111_11111111); transmute(r) } @@ -13976,22 +13959,17 @@ pub unsafe fn _mm512_cvt_roundps_ph(a: __m512, sae: i32) -> __m256i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundps_ph&expand=1355) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtps2ph, sae = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_cvt_roundps_ph( +#[cfg_attr(test, assert_instr(vcvtps2ph, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_cvt_roundps_ph( src: __m256i, k: __mmask16, a: __m512, - sae: i32, ) -> __m256i { + static_assert_sae!(SAE); let a = a.as_f32x16(); let src = src.as_i16x16(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtps2ph(a, $imm4, src, k) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtps2ph(a, SAE, src, k); transmute(r) } @@ -14001,17 +13979,13 @@ pub unsafe fn _mm512_mask_cvt_roundps_ph( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundps_ph&expand=1356) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtps2ph, sae = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_cvt_roundps_ph(k: __mmask16, a: __m512, sae: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vcvtps2ph, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_cvt_roundps_ph(k: __mmask16, a: __m512) -> __m256i { + static_assert_sae!(SAE); let a = a.as_f32x16(); let zero = _mm256_setzero_si256().as_i16x16(); - macro_rules! call { - ($imm4:expr) => { - vcvtps2ph(a, $imm4, zero, k) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtps2ph(a, SAE, zero, k); transmute(r) } @@ -14126,17 +14100,13 @@ pub unsafe fn _mm_maskz_cvt_roundps_ph(k: __mmask8, a: __m128, imm8: i32) -> __m /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtps_ph&expand=1778) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtps2ph, sae = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm512_cvtps_ph(a: __m512, sae: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vcvtps2ph, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_cvtps_ph(a: __m512) -> __m256i { + static_assert_sae!(SAE); let a = a.as_f32x16(); let zero = _mm256_setzero_si256().as_i16x16(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtps2ph(a, $imm4, zero, 0b11111111_11111111) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtps2ph(a, SAE, zero, 0b11111111_11111111); transmute(r) } @@ -14146,17 +14116,17 @@ pub unsafe fn _mm512_cvtps_ph(a: __m512, sae: i32) -> __m256i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtps_ph&expand=1779) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtps2ph, sae = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_cvtps_ph(src: __m256i, k: __mmask16, a: __m512, sae: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vcvtps2ph, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_cvtps_ph( + src: __m256i, + k: __mmask16, + a: __m512, +) -> __m256i { + static_assert_sae!(SAE); let a = a.as_f32x16(); let src = src.as_i16x16(); - macro_rules! call { - ($imm4:expr) => { - vcvtps2ph(a, $imm4, src, k) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtps2ph(a, SAE, src, k); transmute(r) } @@ -14166,17 +14136,13 @@ pub unsafe fn _mm512_mask_cvtps_ph(src: __m256i, k: __mmask16, a: __m512, sae: i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtps_ph&expand=1780) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtps2ph, sae = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_cvtps_ph(k: __mmask16, a: __m512, sae: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vcvtps2ph, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_cvtps_ph(k: __mmask16, a: __m512) -> __m256i { + static_assert_sae!(SAE); let a = a.as_f32x16(); let zero = _mm256_setzero_si256().as_i16x16(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtps2ph(a, $imm4, zero, k) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtps2ph(a, SAE, zero, k); transmute(r) } @@ -14286,17 +14252,13 @@ pub unsafe fn _mm_maskz_cvtps_ph(k: __mmask8, a: __m128, imm8: i32) -> __m128i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_ps&expand=1332) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtph2ps, sae = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm512_cvt_roundph_ps(a: __m256i, sae: i32) -> __m512 { +#[cfg_attr(test, assert_instr(vcvtph2ps, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_cvt_roundph_ps(a: __m256i) -> __m512 { + static_assert_sae!(SAE); let a = a.as_i16x16(); let zero = _mm512_setzero_ps().as_f32x16(); - macro_rules! call { - ($imm4:expr) => { - vcvtph2ps(a, zero, 0b11111111_11111111, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtph2ps(a, zero, 0b11111111_11111111, SAE); transmute(r) } @@ -14306,22 +14268,17 @@ pub unsafe fn _mm512_cvt_roundph_ps(a: __m256i, sae: i32) -> __m512 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_ps&expand=1333) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtph2ps, sae = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_cvt_roundph_ps( +#[cfg_attr(test, assert_instr(vcvtph2ps, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_cvt_roundph_ps( src: __m512, k: __mmask16, a: __m256i, - sae: i32, ) -> __m512 { + static_assert_sae!(SAE); let a = a.as_i16x16(); let src = src.as_f32x16(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtph2ps(a, src, k, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtph2ps(a, src, k, SAE); transmute(r) } @@ -14331,17 +14288,13 @@ pub unsafe fn _mm512_mask_cvt_roundph_ps( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_ps&expand=1334) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtph2ps, sae = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_cvt_roundph_ps(k: __mmask16, a: __m256i, sae: i32) -> __m512 { +#[cfg_attr(test, assert_instr(vcvtph2ps, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_cvt_roundph_ps(k: __mmask16, a: __m256i) -> __m512 { + static_assert_sae!(SAE); let a = a.as_i16x16(); let zero = _mm512_setzero_ps().as_f32x16(); - macro_rules! call { - ($imm4:expr) => { - vcvtph2ps(a, zero, k, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtph2ps(a, zero, k, SAE); transmute(r) } @@ -14442,17 +14395,13 @@ pub unsafe fn _mm_maskz_cvtph_ps(k: __mmask8, a: __m128i) -> __m128 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundps_epi32&expand=1916) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvttps2dq, sae = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm512_cvtt_roundps_epi32(a: __m512, sae: i32) -> __m512i { +#[cfg_attr(test, assert_instr(vcvttps2dq, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_cvtt_roundps_epi32(a: __m512) -> __m512i { + static_assert_sae!(SAE); let a = a.as_f32x16(); let zero = _mm512_setzero_si512().as_i32x16(); - macro_rules! 
call { - ($imm4:expr) => { - vcvttps2dq(a, zero, 0b11111111_11111111, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvttps2dq(a, zero, 0b11111111_11111111, SAE); transmute(r) } @@ -14462,22 +14411,17 @@ pub unsafe fn _mm512_cvtt_roundps_epi32(a: __m512, sae: i32) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundps_epi32&expand=1917) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvttps2dq, sae = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_cvtt_roundps_epi32( +#[cfg_attr(test, assert_instr(vcvttps2dq, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_cvtt_roundps_epi32( src: __m512i, k: __mmask16, a: __m512, - sae: i32, ) -> __m512i { + static_assert_sae!(SAE); let a = a.as_f32x16(); let src = src.as_i32x16(); - macro_rules! call { - ($imm4:expr) => { - vcvttps2dq(a, src, k, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvttps2dq(a, src, k, SAE); transmute(r) } @@ -14487,17 +14431,13 @@ pub unsafe fn _mm512_mask_cvtt_roundps_epi32( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundps_epi32&expand=1918) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvttps2dq, sae = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_cvtt_roundps_epi32(k: __mmask16, a: __m512, sae: i32) -> __m512i { +#[cfg_attr(test, assert_instr(vcvttps2dq, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_cvtt_roundps_epi32(k: __mmask16, a: __m512) -> __m512i { + static_assert_sae!(SAE); let a = a.as_f32x16(); let zero = _mm512_setzero_si512().as_i32x16(); - macro_rules! 
call { - ($imm4:expr) => { - vcvttps2dq(a, zero, k, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvttps2dq(a, zero, k, SAE); transmute(r) } @@ -14507,17 +14447,13 @@ pub unsafe fn _mm512_maskz_cvtt_roundps_epi32(k: __mmask16, a: __m512, sae: i32) /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundps_epu32&expand=1922) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvttps2udq, sae = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm512_cvtt_roundps_epu32(a: __m512, sae: i32) -> __m512i { +#[cfg_attr(test, assert_instr(vcvttps2udq, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_cvtt_roundps_epu32(a: __m512) -> __m512i { + static_assert_sae!(SAE); let a = a.as_f32x16(); let zero = _mm512_setzero_si512().as_u32x16(); - macro_rules! call { - ($imm4:expr) => { - vcvttps2udq(a, zero, 0b11111111_11111111, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvttps2udq(a, zero, 0b11111111_11111111, SAE); transmute(r) } @@ -14527,22 +14463,17 @@ pub unsafe fn _mm512_cvtt_roundps_epu32(a: __m512, sae: i32) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundps_epu32&expand=1923) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvttps2udq, sae = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_cvtt_roundps_epu32( +#[cfg_attr(test, assert_instr(vcvttps2udq, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_cvtt_roundps_epu32( src: __m512i, k: __mmask16, a: __m512, - sae: i32, ) -> __m512i { + static_assert_sae!(SAE); let a = a.as_f32x16(); let src = src.as_u32x16(); - macro_rules! 
call { - ($imm4:expr) => { - vcvttps2udq(a, src, k, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvttps2udq(a, src, k, SAE); transmute(r) } @@ -14552,17 +14483,13 @@ pub unsafe fn _mm512_mask_cvtt_roundps_epu32( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundps_epu32&expand=1924) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvttps2udq, sae = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_cvtt_roundps_epu32(k: __mmask16, a: __m512, sae: i32) -> __m512i { +#[cfg_attr(test, assert_instr(vcvttps2udq, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_cvtt_roundps_epu32(k: __mmask16, a: __m512) -> __m512i { + static_assert_sae!(SAE); let a = a.as_f32x16(); let zero = _mm512_setzero_si512().as_u32x16(); - macro_rules! call { - ($imm4:expr) => { - vcvttps2udq(a, zero, k, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvttps2udq(a, zero, k, SAE); transmute(r) } @@ -14572,17 +14499,13 @@ pub unsafe fn _mm512_maskz_cvtt_roundps_epu32(k: __mmask16, a: __m512, sae: i32) /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundpd_epi32&expand=1904) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvttpd2dq, sae = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm512_cvtt_roundpd_epi32(a: __m512d, sae: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vcvttpd2dq, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_cvtt_roundpd_epi32(a: __m512d) -> __m256i { + static_assert_sae!(SAE); let a = a.as_f64x8(); let zero = _mm256_setzero_si256().as_i32x8(); - macro_rules! 
call { - ($imm4:expr) => { - vcvttpd2dq(a, zero, 0b11111111, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvttpd2dq(a, zero, 0b11111111, SAE); transmute(r) } @@ -14592,22 +14515,17 @@ pub unsafe fn _mm512_cvtt_roundpd_epi32(a: __m512d, sae: i32) -> __m256i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundpd_epi32&expand=1905) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvttpd2dq, sae = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_cvtt_roundpd_epi32( +#[cfg_attr(test, assert_instr(vcvttpd2dq, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_cvtt_roundpd_epi32( src: __m256i, k: __mmask8, a: __m512d, - sae: i32, ) -> __m256i { + static_assert_sae!(SAE); let a = a.as_f64x8(); let src = src.as_i32x8(); - macro_rules! call { - ($imm4:expr) => { - vcvttpd2dq(a, src, k, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvttpd2dq(a, src, k, SAE); transmute(r) } @@ -14617,17 +14535,13 @@ pub unsafe fn _mm512_mask_cvtt_roundpd_epi32( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundps_epi32&expand=1918) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvttpd2dq, sae = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_cvtt_roundpd_epi32(k: __mmask8, a: __m512d, sae: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vcvttpd2dq, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_cvtt_roundpd_epi32(k: __mmask8, a: __m512d) -> __m256i { + static_assert_sae!(SAE); let a = a.as_f64x8(); let zero = _mm256_setzero_si256().as_i32x8(); - macro_rules! 
call { - ($imm4:expr) => { - vcvttpd2dq(a, zero, k, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvttpd2dq(a, zero, k, SAE); transmute(r) } @@ -14637,17 +14551,13 @@ pub unsafe fn _mm512_maskz_cvtt_roundpd_epi32(k: __mmask8, a: __m512d, sae: i32) /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundpd_epu32&expand=1910) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvttpd2udq, sae = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm512_cvtt_roundpd_epu32(a: __m512d, sae: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vcvttpd2udq, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_cvtt_roundpd_epu32(a: __m512d) -> __m256i { + static_assert_sae!(SAE); let a = a.as_f64x8(); let zero = _mm256_setzero_si256().as_i32x8(); - macro_rules! call { - ($imm4:expr) => { - vcvttpd2udq(a, zero, 0b11111111, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvttpd2udq(a, zero, 0b11111111, SAE); transmute(r) } @@ -14657,22 +14567,17 @@ pub unsafe fn _mm512_cvtt_roundpd_epu32(a: __m512d, sae: i32) -> __m256i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundpd_epu32&expand=1911) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvttpd2udq, sae = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_cvtt_roundpd_epu32( +#[cfg_attr(test, assert_instr(vcvttpd2udq, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_cvtt_roundpd_epu32( src: __m256i, k: __mmask8, a: __m512d, - sae: i32, ) -> __m256i { + static_assert_sae!(SAE); let a = a.as_f64x8(); let src = src.as_i32x8(); - macro_rules! 
call { - ($imm4:expr) => { - vcvttpd2udq(a, src, k, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvttpd2udq(a, src, k, SAE); transmute(r) } @@ -14896,17 +14801,13 @@ pub unsafe fn _mm_maskz_cvttps_epu32(k: __mmask8, a: __m128) -> __m128i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundpd_epu32&expand=1912) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvttpd2udq, sae = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_cvtt_roundpd_epu32(k: __mmask8, a: __m512d, sae: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vcvttpd2udq, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_cvtt_roundpd_epu32(k: __mmask8, a: __m512d) -> __m256i { + static_assert_sae!(SAE); let a = a.as_f64x8(); let zero = _mm256_setzero_si256().as_i32x8(); - macro_rules! call { - ($imm4:expr) => { - vcvttpd2udq(a, zero, k, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvttpd2udq(a, zero, k, SAE); transmute(r) } @@ -34064,18 +33965,15 @@ pub unsafe fn _mm_maskz_div_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_max_round_ss&expand=3668) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmaxss, sae = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_max_round_ss(a: __m128, b: __m128, sae: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vmaxss, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_max_round_ss(a: __m128, b: __m128) -> __m128 { + static_assert_sae!(SAE); let a = a.as_f32x4(); let b = b.as_f32x4(); let zero = _mm_setzero_ps().as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vmaxss(a, b, zero, 0b1, $imm4) - }; - } - transmute(constify_imm4_sae!(sae, call)) + let r = vmaxss(a, b, zero, 0b1, SAE); + transmute(r) } /// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -34084,24 +33982,20 @@ pub unsafe fn _mm_max_round_ss(a: __m128, b: __m128, sae: i32) -> __m128 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_ss&expand=3672) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmaxss, sae = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_max_round_ss( +#[cfg_attr(test, assert_instr(vmaxss, SAE = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_max_round_ss( src: __m128, k: __mmask8, a: __m128, b: __m128, - sae: i32, ) -> __m128 { + static_assert_sae!(SAE); let a = a.as_f32x4(); let b = b.as_f32x4(); let src = src.as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vmaxss(a, b, src, k, $imm4) - }; - } - transmute(constify_imm4_sae!(sae, call)) + let r = vmaxss(a, b, src, k, SAE); + transmute(r) } /// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -34110,18 +34004,15 @@ pub unsafe fn _mm_mask_max_round_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_round_ss&expand=3667) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmaxss, sae = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_max_round_ss(k: __mmask8, a: __m128, b: __m128, sae: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vmaxss, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_max_round_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + static_assert_sae!(SAE); let a = a.as_f32x4(); let b = b.as_f32x4(); let zero = _mm_setzero_ps().as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vmaxss(a, b, zero, k, $imm4) - }; - } - transmute(constify_imm4_sae!(sae, call)) + let r = vmaxss(a, b, zero, k, SAE); + transmute(r) } /// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper element from a to the upper element of dst.\ @@ -34130,18 +34021,15 @@ pub unsafe fn _mm_maskz_max_round_ss(k: __mmask8, a: __m128, b: __m128, sae: i32 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_max_round_sd&expand=3665) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmaxsd, sae = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_max_round_sd(a: __m128d, b: __m128d, sae: i32) -> __m128d { +#[cfg_attr(test, assert_instr(vmaxsd, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_max_round_sd(a: __m128d, b: __m128d) -> __m128d { + static_assert_sae!(SAE); let a = a.as_f64x2(); let b = b.as_f64x2(); let zero = _mm_setzero_pd().as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vmaxsd(a, b, zero, 0b1, $imm4) - }; - } - transmute(constify_imm4_sae!(sae, call)) + let r = vmaxsd(a, b, zero, 0b1, SAE); + transmute(r) } /// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -34150,24 +34038,20 @@ pub unsafe fn _mm_max_round_sd(a: __m128d, b: __m128d, sae: i32) -> __m128d { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_round_sd&expand=3663) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmaxsd, sae = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_max_round_sd( +#[cfg_attr(test, assert_instr(vmaxsd, SAE = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_max_round_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, - sae: i32, ) -> __m128d { + static_assert_sae!(SAE); let a = a.as_f64x2(); let b = b.as_f64x2(); let src = src.as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vmaxsd(a, b, src, k, $imm4) - }; - } - transmute(constify_imm4_sae!(sae, call)) + let r = vmaxsd(a, b, src, k, SAE); + transmute(r) } /// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -34176,18 +34060,19 @@ pub unsafe fn _mm_mask_max_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_sd&expand=3670) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmaxsd, sae = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_max_round_sd(k: __mmask8, a: __m128d, b: __m128d, sae: i32) -> __m128d { +#[cfg_attr(test, assert_instr(vmaxsd, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_max_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + static_assert_sae!(SAE); let a = a.as_f64x2(); let b = b.as_f64x2(); let zero = _mm_setzero_pd().as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vmaxsd(a, b, zero, k, $imm4) - }; - } - transmute(constify_imm4_sae!(sae, call)) + let r = vmaxsd(a, b, zero, k, SAE); + transmute(r) } /// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -34196,18 +34081,15 @@ pub unsafe fn _mm_maskz_max_round_sd(k: __mmask8, a: __m128d, b: __m128d, sae: i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_min_round_ss&expand=3782) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vminss, sae = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_min_round_ss(a: __m128, b: __m128, sae: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vminss, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_min_round_ss(a: __m128, b: __m128) -> __m128 { + static_assert_sae!(SAE); let a = a.as_f32x4(); let b = b.as_f32x4(); let zero = _mm_setzero_ps().as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vminss(a, b, zero, 0b1, $imm4) - }; - } - transmute(constify_imm4_sae!(sae, call)) + let r = vminss(a, b, zero, 0b1, SAE); + transmute(r) } /// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -34216,24 +34098,20 @@ pub unsafe fn _mm_min_round_ss(a: __m128, b: __m128, sae: i32) -> __m128 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_round_Ss&expand=3780) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vminss, sae = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_min_round_ss( +#[cfg_attr(test, assert_instr(vminss, SAE = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_min_round_ss( src: __m128, k: __mmask8, a: __m128, b: __m128, - sae: i32, ) -> __m128 { + static_assert_sae!(SAE); let a = a.as_f32x4(); let b = b.as_f32x4(); let src = src.as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vminss(a, b, src, k, $imm4) - }; - } - transmute(constify_imm4_sae!(sae, call)) + let r = vminss(a, b, src, k, SAE); + transmute(r) } /// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -34242,18 +34120,15 @@ pub unsafe fn _mm_mask_min_round_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_round_ss&expand=3781) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vminss, sae = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_min_round_ss(k: __mmask8, a: __m128, b: __m128, sae: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vminss, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_min_round_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + static_assert_sae!(SAE); let a = a.as_f32x4(); let b = b.as_f32x4(); let zero = _mm_setzero_ps().as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vminss(a, b, zero, k, $imm4) - }; - } - transmute(constify_imm4_sae!(sae, call)) + let r = vminss(a, b, zero, k, SAE); + transmute(r) } /// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst , and copy the upper element from a to the upper element of dst.\ @@ -34262,18 +34137,15 @@ pub unsafe fn _mm_maskz_min_round_ss(k: __mmask8, a: __m128, b: __m128, sae: i32 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_min_round_sd&expand=3779) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vminsd, sae = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_min_round_sd(a: __m128d, b: __m128d, sae: i32) -> __m128d { +#[cfg_attr(test, assert_instr(vminsd, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_min_round_sd(a: __m128d, b: __m128d) -> __m128d { + static_assert_sae!(SAE); let a = a.as_f64x2(); let b = b.as_f64x2(); let zero = _mm_setzero_pd().as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vminsd(a, b, zero, 0b1, $imm4) - }; - } - transmute(constify_imm4_sae!(sae, call)) + let r = vminsd(a, b, zero, 0b1, SAE); + transmute(r) } /// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -34282,24 +34154,20 @@ pub unsafe fn _mm_min_round_sd(a: __m128d, b: __m128d, sae: i32) -> __m128d { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_round_sd&expand=3777) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vminsd, sae = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_min_round_sd( +#[cfg_attr(test, assert_instr(vminsd, SAE = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_min_round_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, - sae: i32, ) -> __m128d { + static_assert_sae!(SAE); let a = a.as_f64x2(); let b = b.as_f64x2(); let src = src.as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vminsd(a, b, src, k, $imm4) - }; - } - transmute(constify_imm4_sae!(sae, call)) + let r = vminsd(a, b, src, k, SAE); + transmute(r) } /// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -34308,18 +34176,19 @@ pub unsafe fn _mm_mask_min_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_round_Sd&expand=3778) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vminsd, sae = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_min_round_sd(k: __mmask8, a: __m128d, b: __m128d, sae: i32) -> __m128d { +#[cfg_attr(test, assert_instr(vminsd, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_min_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + static_assert_sae!(SAE); let a = a.as_f64x2(); let b = b.as_f64x2(); let zero = _mm_setzero_pd().as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vminsd(a, b, zero, k, $imm4) - }; - } - transmute(constify_imm4_sae!(sae, call)) + let r = vminsd(a, b, zero, k, SAE); + transmute(r) } /// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -34484,18 +34353,14 @@ pub unsafe fn _mm_maskz_sqrt_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getexp_round_ss&expand=2856) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetexpss, sae = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_getexp_round_ss(a: __m128, b: __m128, sae: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vgetexpss, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_getexp_round_ss(a: __m128, b: __m128) -> __m128 { + static_assert_sae!(SAE); let a = a.as_f32x4(); let b = b.as_f32x4(); let zero = _mm_setzero_ps().as_f32x4(); - macro_rules! call { - ($imm4:expr) => { - vgetexpss(a, b, zero, 0b1, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vgetexpss(a, b, zero, 0b1, SAE); transmute(r) } @@ -34505,24 +34370,19 @@ pub unsafe fn _mm_getexp_round_ss(a: __m128, b: __m128, sae: i32) -> __m128 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getexp_round_ss&expand=2857) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetexpss, sae = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_getexp_round_ss( +#[cfg_attr(test, assert_instr(vgetexpss, SAE = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_getexp_round_ss( src: __m128, k: __mmask8, a: __m128, b: __m128, - sae: i32, ) -> __m128 { + static_assert_sae!(SAE); let a = a.as_f32x4(); let b = b.as_f32x4(); let src = src.as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vgetexpss(a, b, src, k, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vgetexpss(a, b, src, k, SAE); transmute(r) } @@ -34532,18 +34392,18 @@ pub unsafe fn _mm_mask_getexp_round_ss( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getexp_round_ss&expand=2858) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetexpss, sae = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_getexp_round_ss(k: __mmask8, a: __m128, b: __m128, sae: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vgetexpss, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_getexp_round_ss( + k: __mmask8, + a: __m128, + b: __m128, +) -> __m128 { + static_assert_sae!(SAE); let a = a.as_f32x4(); let b = b.as_f32x4(); let zero = _mm_setzero_ps().as_f32x4(); - macro_rules! call { - ($imm4:expr) => { - vgetexpss(a, b, zero, k, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vgetexpss(a, b, zero, k, SAE); transmute(r) } @@ -34553,18 +34413,14 @@ pub unsafe fn _mm_maskz_getexp_round_ss(k: __mmask8, a: __m128, b: __m128, sae: /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getexp_round_sd&expand=2853) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetexpsd, sae = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_getexp_round_sd(a: __m128d, b: __m128d, sae: i32) -> __m128d { +#[cfg_attr(test, assert_instr(vgetexpsd, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_getexp_round_sd(a: __m128d, b: __m128d) -> __m128d { + static_assert_sae!(SAE); let a = a.as_f64x2(); let b = b.as_f64x2(); let zero = _mm_setzero_pd().as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vgetexpsd(a, b, zero, 0b1, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vgetexpsd(a, b, zero, 0b1, SAE); transmute(r) } @@ -34574,24 +34430,19 @@ pub unsafe fn _mm_getexp_round_sd(a: __m128d, b: __m128d, sae: i32) -> __m128d { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getexp_round_sd&expand=2854) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetexpsd, sae = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_getexp_round_sd( +#[cfg_attr(test, assert_instr(vgetexpsd, SAE = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_getexp_round_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, - sae: i32, ) -> __m128d { + static_assert_sae!(SAE); let a = a.as_f64x2(); let b = b.as_f64x2(); let src = src.as_f64x2(); - macro_rules! call { - ($imm4:expr) => { - vgetexpsd(a, b, src, k, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vgetexpsd(a, b, src, k, SAE); transmute(r) } @@ -34601,18 +34452,18 @@ pub unsafe fn _mm_mask_getexp_round_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getexp_round_sd&expand=2855) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetexpsd, sae = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_getexp_round_sd(k: __mmask8, a: __m128d, b: __m128d, sae: i32) -> __m128d { +#[cfg_attr(test, assert_instr(vgetexpsd, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_getexp_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, +) -> __m128d { + static_assert_sae!(SAE); let a = a.as_f64x2(); let b = b.as_f64x2(); let zero = _mm_setzero_pd().as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vgetexpsd(a, b, zero, k, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vgetexpsd(a, b, zero, k, SAE); transmute(r) } @@ -36577,21 +36428,14 @@ pub unsafe fn _mm_maskz_cvtsd_ss(k: __mmask8, a: __m128, b: __m128d) -> __m128 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundss_sd&expand=1371) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtss2sd, sae = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_cvt_roundss_sd(a: __m128d, b: __m128, sae: i32) -> __m128d { - macro_rules! call { - ($imm4:expr) => { - vcvtss2sd( - a.as_f64x2(), - b.as_f32x4(), - _mm_setzero_pd().as_f64x2(), - 0b11111111, - $imm4, - ) - }; - } - let r = constify_imm4_sae!(sae, call); +#[cfg_attr(test, assert_instr(vcvtss2sd, SAE = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_cvt_roundss_sd(a: __m128d, b: __m128) -> __m128d { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f32x4(); + let zero = _mm_setzero_pd().as_f64x2(); + let r = vcvtss2sd(a, b, zero, 0b11111111, SAE); transmute(r) } @@ -36601,24 +36445,19 @@ pub unsafe fn _mm_cvt_roundss_sd(a: __m128d, b: __m128, sae: i32) -> __m128d { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_cvt_roundss_sd&expand=1372) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtss2sd, sae = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_cvt_roundss_sd( +#[cfg_attr(test, assert_instr(vcvtss2sd, SAE = 8))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_cvt_roundss_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128, - sae: i32, ) -> __m128d { + static_assert_sae!(SAE); let a = a.as_f64x2(); let b = b.as_f32x4(); let src = src.as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtss2sd(a, b, src, k, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtss2sd(a, b, src, k, SAE); transmute(r) } @@ -36628,18 +36467,18 @@ pub unsafe fn _mm_mask_cvt_roundss_sd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_cvt_roundss_sd&expand=1373) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtss2sd, sae = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_cvt_roundss_sd(k: __mmask8, a: __m128d, b: __m128, sae: i32) -> __m128d { +#[cfg_attr(test, assert_instr(vcvtss2sd, SAE = 8))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_cvt_roundss_sd( + k: __mmask8, + a: __m128d, + b: __m128, +) -> __m128d { + static_assert_sae!(SAE); let a = a.as_f64x2(); let b = b.as_f32x4(); let zero = _mm_setzero_pd().as_f64x2(); - macro_rules! call { - ($imm4:expr) => { - vcvtss2sd(a, b, zero, k, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtss2sd(a, b, zero, k, SAE); transmute(r) } @@ -36970,16 +36809,12 @@ pub unsafe fn _mm_cvti32_sd(a: __m128d, b: i32) -> __m128d { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtt_roundss_Si32&expand=1936) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtss2si, sae = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvtt_roundss_si32(a: __m128, sae: i32) -> i32 { +#[cfg_attr(test, assert_instr(vcvtss2si, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvtt_roundss_si32(a: __m128) -> i32 { + static_assert_sae!(SAE); let a = a.as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtss2si(a, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtss2si(a, SAE); transmute(r) } @@ -36989,16 +36824,12 @@ pub unsafe fn _mm_cvtt_roundss_si32(a: __m128, sae: i32) -> i32 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtt_roundss_i32&expand=1934) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtss2si, sae = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvtt_roundss_i32(a: __m128, sae: i32) -> i32 { +#[cfg_attr(test, assert_instr(vcvtss2si, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvtt_roundss_i32(a: __m128) -> i32 { + static_assert_sae!(SAE); let a = a.as_f32x4(); - macro_rules! call { - ($imm4:expr) => { - vcvtss2si(a, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtss2si(a, SAE); transmute(r) } @@ -37008,16 +36839,12 @@ pub unsafe fn _mm_cvtt_roundss_i32(a: __m128, sae: i32) -> i32 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtt_roundss_u32&expand=1938) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtss2usi, sae = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvtt_roundss_u32(a: __m128, sae: i32) -> u32 { +#[cfg_attr(test, assert_instr(vcvtss2usi, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvtt_roundss_u32(a: __m128) -> u32 { + static_assert_sae!(SAE); let a = a.as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtss2usi(a, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtss2usi(a, SAE); transmute(r) } @@ -37047,16 +36874,12 @@ pub unsafe fn _mm_cvttss_u32(a: __m128) -> u32 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsd_si32&expand=1930) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsd2si, sae = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvtt_roundsd_si32(a: __m128d, sae: i32) -> i32 { +#[cfg_attr(test, assert_instr(vcvtsd2si, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvtt_roundsd_si32(a: __m128d) -> i32 { + static_assert_sae!(SAE); let a = a.as_f64x2(); - macro_rules! call { - ($imm4:expr) => { - vcvtsd2si(a, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtsd2si(a, SAE); transmute(r) } @@ -37066,16 +36889,12 @@ pub unsafe fn _mm_cvtt_roundsd_si32(a: __m128d, sae: i32) -> i32 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsd_i32&expand=1928) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsd2si, sae = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvtt_roundsd_i32(a: __m128d, sae: i32) -> i32 { +#[cfg_attr(test, assert_instr(vcvtsd2si, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvtt_roundsd_i32(a: __m128d) -> i32 { + static_assert_sae!(SAE); let a = a.as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtsd2si(a, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtsd2si(a, SAE); transmute(r) } @@ -37085,16 +36904,12 @@ pub unsafe fn _mm_cvtt_roundsd_i32(a: __m128d, sae: i32) -> i32 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtt_roundsd_u32&expand=1932) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsd2usi, sae = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvtt_roundsd_u32(a: __m128d, sae: i32) -> u32 { +#[cfg_attr(test, assert_instr(vcvtsd2usi, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvtt_roundsd_u32(a: __m128d) -> u32 { + static_assert_sae!(SAE); let a = a.as_f64x2(); - macro_rules! call { - ($imm4:expr) => { - vcvtsd2usi(a, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtsd2usi(a, SAE); transmute(r) } @@ -44170,7 +43985,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvt_roundps_ph() { let a = _mm512_set1_ps(1.); - let r = _mm512_cvt_roundps_ph(a, _MM_FROUND_NO_EXC); + let r = _mm512_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(a); let e = _mm256_setr_epi64x( 4323521613979991040, 4323521613979991040, @@ -44184,9 +43999,9 @@ mod tests { unsafe fn test_mm512_mask_cvt_roundps_ph() { let a = _mm512_set1_ps(1.); let src = _mm256_set1_epi16(0); - let r = _mm512_mask_cvt_roundps_ph(src, 0, a, _MM_FROUND_NO_EXC); + let r = _mm512_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0, a); assert_eq_m256i(r, src); - let r = _mm512_mask_cvt_roundps_ph(src, 0b00000000_11111111, a, _MM_FROUND_NO_EXC); + let r = _mm512_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a); let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0); assert_eq_m256i(r, e); } @@ -44194,9 +44009,9 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_cvt_roundps_ph() { let a = _mm512_set1_ps(1.); - let r = 
_mm512_maskz_cvt_roundps_ph(0, a, _MM_FROUND_NO_EXC); + let r = _mm512_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0, a); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm512_maskz_cvt_roundps_ph(0b00000000_11111111, a, _MM_FROUND_NO_EXC); + let r = _mm512_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a); let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0); assert_eq_m256i(r, e); } @@ -44246,7 +44061,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvtps_ph() { let a = _mm512_set1_ps(1.); - let r = _mm512_cvtps_ph(a, _MM_FROUND_NO_EXC); + let r = _mm512_cvtps_ph::<_MM_FROUND_NO_EXC>(a); let e = _mm256_setr_epi64x( 4323521613979991040, 4323521613979991040, @@ -44260,9 +44075,9 @@ mod tests { unsafe fn test_mm512_mask_cvtps_ph() { let a = _mm512_set1_ps(1.); let src = _mm256_set1_epi16(0); - let r = _mm512_mask_cvtps_ph(src, 0, a, _MM_FROUND_NO_EXC); + let r = _mm512_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0, a); assert_eq_m256i(r, src); - let r = _mm512_mask_cvtps_ph(src, 0b00000000_11111111, a, _MM_FROUND_NO_EXC); + let r = _mm512_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a); let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0); assert_eq_m256i(r, e); } @@ -44270,9 +44085,9 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_cvtps_ph() { let a = _mm512_set1_ps(1.); - let r = _mm512_maskz_cvtps_ph(0, a, _MM_FROUND_NO_EXC); + let r = _mm512_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0, a); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm512_maskz_cvtps_ph(0b00000000_11111111, a, _MM_FROUND_NO_EXC); + let r = _mm512_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a); let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0); assert_eq_m256i(r, e); } @@ -44327,7 +44142,7 @@ mod tests { 4323521613979991040, 4323521613979991040, ); - let r = _mm512_cvt_roundph_ps(a, _MM_FROUND_NO_EXC); + let r = 
_mm512_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(a); let e = _mm512_set1_ps(1.); assert_eq_m512(r, e); } @@ -44341,9 +44156,9 @@ mod tests { 4323521613979991040, ); let src = _mm512_set1_ps(0.); - let r = _mm512_mask_cvt_roundph_ps(src, 0, a, _MM_FROUND_NO_EXC); + let r = _mm512_mask_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(src, 0, a); assert_eq_m512(r, src); - let r = _mm512_mask_cvt_roundph_ps(src, 0b00000000_11111111, a, _MM_FROUND_NO_EXC); + let r = _mm512_mask_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a); let e = _mm512_setr_ps( 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., ); @@ -44358,9 +44173,9 @@ mod tests { 4323521613979991040, 4323521613979991040, ); - let r = _mm512_maskz_cvt_roundph_ps(0, a, _MM_FROUND_NO_EXC); + let r = _mm512_maskz_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(0, a); assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_cvt_roundph_ps(0b00000000_11111111, a, _MM_FROUND_NO_EXC); + let r = _mm512_maskz_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a); let e = _mm512_setr_ps( 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., ); @@ -44462,7 +44277,7 @@ mod tests { let a = _mm512_setr_ps( 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, ); - let r = _mm512_cvtt_roundps_epi32(a, _MM_FROUND_NO_EXC); + let r = _mm512_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(a); let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 8, 9, 10, 11, 12, 13, 14, 15); assert_eq_m512i(r, e); } @@ -44473,9 +44288,9 @@ mod tests { 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, ); let src = _mm512_set1_epi32(0); - let r = _mm512_mask_cvtt_roundps_epi32(src, 0, a, _MM_FROUND_NO_EXC); + let r = _mm512_mask_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(src, 0, a); assert_eq_m512i(r, src); - let r = _mm512_mask_cvtt_roundps_epi32(src, 0b00000000_11111111, a, _MM_FROUND_NO_EXC); + let r = _mm512_mask_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(src, 
0b00000000_11111111, a); let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 0, 0, 0, 0); assert_eq_m512i(r, e); } @@ -44485,9 +44300,9 @@ mod tests { let a = _mm512_setr_ps( 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, ); - let r = _mm512_maskz_cvtt_roundps_epi32(0, a, _MM_FROUND_NO_EXC); + let r = _mm512_maskz_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(0, a); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_cvtt_roundps_epi32(0b00000000_11111111, a, _MM_FROUND_NO_EXC); + let r = _mm512_maskz_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a); let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 0, 0, 0, 0); assert_eq_m512i(r, e); } @@ -44497,7 +44312,7 @@ mod tests { let a = _mm512_setr_ps( 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, ); - let r = _mm512_cvtt_roundps_epu32(a, _MM_FROUND_NO_EXC); + let r = _mm512_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(a); let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 9, 10, 11, 12, 13, 14, 15); assert_eq_m512i(r, e); } @@ -44508,9 +44323,9 @@ mod tests { 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, ); let src = _mm512_set1_epi32(0); - let r = _mm512_mask_cvtt_roundps_epu32(src, 0, a, _MM_FROUND_NO_EXC); + let r = _mm512_mask_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(src, 0, a); assert_eq_m512i(r, src); - let r = _mm512_mask_cvtt_roundps_epu32(src, 0b00000000_11111111, a, _MM_FROUND_NO_EXC); + let r = _mm512_mask_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a); let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); assert_eq_m512i(r, e); } @@ -44520,9 +44335,9 @@ mod tests { let a = _mm512_setr_ps( 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, ); - let r = _mm512_maskz_cvtt_roundps_epu32(0, a, _MM_FROUND_NO_EXC); + let r = _mm512_maskz_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(0, 
a); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_cvtt_roundps_epu32(0b00000000_11111111, a, _MM_FROUND_NO_EXC); + let r = _mm512_maskz_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a); let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); assert_eq_m512i(r, e); } @@ -52598,7 +52413,7 @@ mod tests { unsafe fn test_mm_max_round_ss() { let a = _mm_set_ps(0., 1., 2., 3.); let b = _mm_set_ps(4., 5., 6., 7.); - let r = _mm_max_round_ss(a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_max_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, b); let e = _mm_set_ps(0., 1., 2., 7.); assert_eq_m128(r, e); } @@ -52607,10 +52422,10 @@ mod tests { unsafe fn test_mm_mask_max_round_ss() { let a = _mm_set_ps(0., 1., 2., 3.); let b = _mm_set_ps(4., 5., 6., 7.); - let r = _mm_mask_max_round_ss(a, 0, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_mask_max_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b); let e = _mm_set_ps(0., 1., 2., 3.); assert_eq_m128(r, e); - let r = _mm_mask_max_round_ss(a, 0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_mask_max_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b); let e = _mm_set_ps(0., 1., 2., 7.); assert_eq_m128(r, e); } @@ -52619,10 +52434,10 @@ mod tests { unsafe fn test_mm_maskz_max_round_ss() { let a = _mm_set_ps(0., 1., 2., 3.); let b = _mm_set_ps(4., 5., 6., 7.); - let r = _mm_maskz_max_round_ss(0, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_maskz_max_round_ss::<_MM_FROUND_CUR_DIRECTION>(0, a, b); let e = _mm_set_ps(0., 1., 2., 0.); assert_eq_m128(r, e); - let r = _mm_maskz_max_round_ss(0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_maskz_max_round_ss::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b); let e = _mm_set_ps(0., 1., 2., 7.); assert_eq_m128(r, e); } @@ -52631,7 +52446,7 @@ mod tests { unsafe fn test_mm_max_round_sd() { let a = _mm_set_pd(0., 1.); let b = _mm_set_pd(2., 3.); - let r = _mm_max_round_sd(a, b, _MM_FROUND_CUR_DIRECTION); + let r = 
_mm_max_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, b); let e = _mm_set_pd(0., 3.); assert_eq_m128d(r, e); } @@ -52640,10 +52455,10 @@ mod tests { unsafe fn test_mm_mask_max_round_sd() { let a = _mm_set_pd(0., 1.); let b = _mm_set_pd(2., 3.); - let r = _mm_mask_max_round_sd(a, 0, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_mask_max_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b); let e = _mm_set_pd(0., 1.); assert_eq_m128d(r, e); - let r = _mm_mask_max_round_sd(a, 0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_mask_max_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b); let e = _mm_set_pd(0., 3.); assert_eq_m128d(r, e); } @@ -52652,10 +52467,10 @@ mod tests { unsafe fn test_mm_maskz_max_round_sd() { let a = _mm_set_pd(0., 1.); let b = _mm_set_pd(2., 3.); - let r = _mm_maskz_max_round_sd(0, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_maskz_max_round_sd::<_MM_FROUND_CUR_DIRECTION>(0, a, b); let e = _mm_set_pd(0., 0.); assert_eq_m128d(r, e); - let r = _mm_maskz_max_round_sd(0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_maskz_max_round_sd::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b); let e = _mm_set_pd(0., 3.); assert_eq_m128d(r, e); } @@ -52664,7 +52479,7 @@ mod tests { unsafe fn test_mm_min_round_ss() { let a = _mm_set_ps(0., 1., 2., 3.); let b = _mm_set_ps(4., 5., 6., 7.); - let r = _mm_min_round_ss(a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_min_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, b); let e = _mm_set_ps(0., 1., 2., 3.); assert_eq_m128(r, e); } @@ -52673,10 +52488,10 @@ mod tests { unsafe fn test_mm_mask_min_round_ss() { let a = _mm_set_ps(0., 1., 2., 3.); let b = _mm_set_ps(4., 5., 6., 7.); - let r = _mm_mask_min_round_ss(a, 0, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_mask_min_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b); let e = _mm_set_ps(0., 1., 2., 3.); assert_eq_m128(r, e); - let r = _mm_mask_min_round_ss(a, 0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let r = 
_mm_mask_min_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b); let e = _mm_set_ps(0., 1., 2., 3.); assert_eq_m128(r, e); } @@ -52685,10 +52500,10 @@ mod tests { unsafe fn test_mm_maskz_min_round_ss() { let a = _mm_set_ps(0., 1., 2., 3.); let b = _mm_set_ps(4., 5., 6., 7.); - let r = _mm_maskz_min_round_ss(0, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_maskz_min_round_ss::<_MM_FROUND_CUR_DIRECTION>(0, a, b); let e = _mm_set_ps(0., 1., 2., 0.); assert_eq_m128(r, e); - let r = _mm_maskz_min_round_ss(0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_maskz_min_round_ss::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b); let e = _mm_set_ps(0., 1., 2., 3.); assert_eq_m128(r, e); } @@ -52697,7 +52512,7 @@ mod tests { unsafe fn test_mm_min_round_sd() { let a = _mm_set_pd(0., 1.); let b = _mm_set_pd(2., 3.); - let r = _mm_min_round_sd(a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_min_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, b); let e = _mm_set_pd(0., 1.); assert_eq_m128d(r, e); } @@ -52706,10 +52521,10 @@ mod tests { unsafe fn test_mm_mask_min_round_sd() { let a = _mm_set_pd(0., 1.); let b = _mm_set_pd(2., 3.); - let r = _mm_mask_min_round_sd(a, 0, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_mask_min_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b); let e = _mm_set_pd(0., 1.); assert_eq_m128d(r, e); - let r = _mm_mask_min_round_sd(a, 0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_mask_min_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b); let e = _mm_set_pd(0., 1.); assert_eq_m128d(r, e); } @@ -52718,10 +52533,10 @@ mod tests { unsafe fn test_mm_maskz_min_round_sd() { let a = _mm_set_pd(0., 1.); let b = _mm_set_pd(2., 3.); - let r = _mm_maskz_min_round_sd(0, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_maskz_min_round_sd::<_MM_FROUND_CUR_DIRECTION>(0, a, b); let e = _mm_set_pd(0., 0.); assert_eq_m128d(r, e); - let r = _mm_maskz_min_round_sd(0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let r = 
_mm_maskz_min_round_sd::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b); let e = _mm_set_pd(0., 1.); assert_eq_m128d(r, e); } @@ -52804,7 +52619,7 @@ mod tests { unsafe fn test_mm_getexp_round_ss() { let a = _mm_set1_ps(2.); let b = _mm_set1_ps(3.); - let r = _mm_getexp_round_ss(a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_getexp_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, b); let e = _mm_set_ps(2., 2., 2., 1.); assert_eq_m128(r, e); } @@ -52813,10 +52628,10 @@ mod tests { unsafe fn test_mm_mask_getexp_round_ss() { let a = _mm_set1_ps(2.); let b = _mm_set1_ps(3.); - let r = _mm_mask_getexp_round_ss(a, 0, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_mask_getexp_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b); let e = _mm_set_ps(2., 2., 2., 2.); assert_eq_m128(r, e); - let r = _mm_mask_getexp_round_ss(a, 0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_mask_getexp_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b); let e = _mm_set_ps(2., 2., 2., 1.); assert_eq_m128(r, e); } @@ -52825,10 +52640,10 @@ mod tests { unsafe fn test_mm_maskz_getexp_round_ss() { let a = _mm_set1_ps(2.); let b = _mm_set1_ps(3.); - let r = _mm_maskz_getexp_round_ss(0, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_maskz_getexp_round_ss::<_MM_FROUND_CUR_DIRECTION>(0, a, b); let e = _mm_set_ps(2., 2., 2., 0.); assert_eq_m128(r, e); - let r = _mm_maskz_getexp_round_ss(0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_maskz_getexp_round_ss::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b); let e = _mm_set_ps(2., 2., 2., 1.); assert_eq_m128(r, e); } @@ -52837,7 +52652,7 @@ mod tests { unsafe fn test_mm_getexp_round_sd() { let a = _mm_set1_pd(2.); let b = _mm_set1_pd(3.); - let r = _mm_getexp_round_sd(a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_getexp_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, b); let e = _mm_set_pd(2., 1.); assert_eq_m128d(r, e); } @@ -52846,10 +52661,10 @@ mod tests { unsafe fn test_mm_mask_getexp_round_sd() { let a = _mm_set1_pd(2.); let b = _mm_set1_pd(3.); - 
let r = _mm_mask_getexp_round_sd(a, 0, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_mask_getexp_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b); let e = _mm_set_pd(2., 2.); assert_eq_m128d(r, e); - let r = _mm_mask_getexp_round_sd(a, 0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_mask_getexp_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b); let e = _mm_set_pd(2., 1.); assert_eq_m128d(r, e); } @@ -52858,10 +52673,10 @@ mod tests { unsafe fn test_mm_maskz_getexp_round_sd() { let a = _mm_set1_pd(2.); let b = _mm_set1_pd(3.); - let r = _mm_maskz_getexp_round_sd(0, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_maskz_getexp_round_sd::<_MM_FROUND_CUR_DIRECTION>(0, a, b); let e = _mm_set_pd(2., 0.); assert_eq_m128d(r, e); - let r = _mm_maskz_getexp_round_sd(0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_maskz_getexp_round_sd::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b); let e = _mm_set_pd(2., 1.); assert_eq_m128d(r, e); } @@ -53804,7 +53619,7 @@ mod tests { unsafe fn test_mm_cvt_roundss_sd() { let a = _mm_set_pd(6., -7.5); let b = _mm_set_ps(0., -0.5, 1., -1.5); - let r = _mm_cvt_roundss_sd(a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_cvt_roundss_sd::<_MM_FROUND_CUR_DIRECTION>(a, b); let e = _mm_set_pd(6., -1.5); assert_eq_m128d(r, e); } @@ -53813,9 +53628,9 @@ mod tests { unsafe fn test_mm_mask_cvt_roundss_sd() { let a = _mm_set_pd(6., -7.5); let b = _mm_set_ps(0., -0.5, 1., -1.5); - let r = _mm_mask_cvt_roundss_sd(a, 0, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_mask_cvt_roundss_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b); assert_eq_m128d(r, a); - let r = _mm_mask_cvt_roundss_sd(a, 0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_mask_cvt_roundss_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b); let e = _mm_set_pd(6., -1.5); assert_eq_m128d(r, e); } @@ -53824,10 +53639,10 @@ mod tests { unsafe fn test_mm_maskz_cvt_roundss_sd() { let a = _mm_set_pd(6., -7.5); let b = _mm_set_ps(0., -0.5, 1., -1.5); - let r = 
_mm_maskz_cvt_roundss_sd(0, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_maskz_cvt_roundss_sd::<_MM_FROUND_CUR_DIRECTION>(0, a, b); let e = _mm_set_pd(6., 0.); assert_eq_m128d(r, e); - let r = _mm_maskz_cvt_roundss_sd(0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let r = _mm_maskz_cvt_roundss_sd::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b); let e = _mm_set_pd(6., -1.5); assert_eq_m128d(r, e); } @@ -53996,7 +53811,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvtt_roundss_si32() { let a = _mm_set_ps(0., -0.5, 1., -1.5); - let r = _mm_cvtt_roundss_si32(a, _MM_FROUND_CUR_DIRECTION); + let r = _mm_cvtt_roundss_si32::<_MM_FROUND_CUR_DIRECTION>(a); let e: i32 = -2; assert_eq!(r, e); } @@ -54004,7 +53819,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvtt_roundss_i32() { let a = _mm_set_ps(0., -0.5, 1., -1.5); - let r = _mm_cvtt_roundss_i32(a, _MM_FROUND_CUR_DIRECTION); + let r = _mm_cvtt_roundss_i32::<_MM_FROUND_CUR_DIRECTION>(a); let e: i32 = -2; assert_eq!(r, e); } @@ -54012,7 +53827,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvtt_roundss_u32() { let a = _mm_set_ps(0., -0.5, 1., -1.5); - let r = _mm_cvtt_roundss_u32(a, _MM_FROUND_CUR_DIRECTION); + let r = _mm_cvtt_roundss_u32::<_MM_FROUND_CUR_DIRECTION>(a); let e: u32 = u32::MAX; assert_eq!(r, e); } @@ -54036,7 +53851,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvtt_roundsd_si32() { let a = _mm_set_pd(1., -1.5); - let r = _mm_cvtt_roundsd_si32(a, _MM_FROUND_CUR_DIRECTION); + let r = _mm_cvtt_roundsd_si32::<_MM_FROUND_CUR_DIRECTION>(a); let e: i32 = -2; assert_eq!(r, e); } @@ -54044,7 +53859,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvtt_roundsd_i32() { let a = _mm_set_pd(1., -1.5); - let r = _mm_cvtt_roundsd_i32(a, _MM_FROUND_CUR_DIRECTION); + let r = _mm_cvtt_roundsd_i32::<_MM_FROUND_CUR_DIRECTION>(a); let e: i32 = -2; assert_eq!(r, e); } @@ -54052,7 +53867,7 @@ mod tests { #[simd_test(enable = 
"avx512f")] unsafe fn test_mm_cvtt_roundsd_u32() { let a = _mm_set_pd(1., -1.5); - let r = _mm_cvtt_roundsd_u32(a, _MM_FROUND_CUR_DIRECTION); + let r = _mm_cvtt_roundsd_u32::<_MM_FROUND_CUR_DIRECTION>(a); let e: u32 = u32::MAX; assert_eq!(r, e); } diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index ae6202bc73..2db8a430d4 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -5090,7 +5090,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvtt_roundpd_epi32() { let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); - let r = _mm512_cvtt_roundpd_epi32(a, _MM_FROUND_NO_EXC); + let r = _mm512_cvtt_roundpd_epi32::<_MM_FROUND_NO_EXC>(a); let e = _mm256_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7); assert_eq_m256i(r, e); } @@ -5099,9 +5099,9 @@ mod tests { unsafe fn test_mm512_mask_cvtt_roundpd_epi32() { let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); let src = _mm256_set1_epi32(0); - let r = _mm512_mask_cvtt_roundpd_epi32(src, 0, a, _MM_FROUND_NO_EXC); + let r = _mm512_mask_cvtt_roundpd_epi32::<_MM_FROUND_NO_EXC>(src, 0, a); assert_eq_m256i(r, src); - let r = _mm512_mask_cvtt_roundpd_epi32(src, 0b00001111, a, _MM_FROUND_NO_EXC); + let r = _mm512_mask_cvtt_roundpd_epi32::<_MM_FROUND_NO_EXC>(src, 0b00001111, a); let e = _mm256_setr_epi32(0, -1, 2, -3, 0, 0, 0, 0); assert_eq_m256i(r, e); } @@ -5109,9 +5109,9 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_cvtt_roundpd_epi32() { let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); - let r = _mm512_maskz_cvtt_roundpd_epi32(0, a, _MM_FROUND_NO_EXC); + let r = _mm512_maskz_cvtt_roundpd_epi32::<_MM_FROUND_NO_EXC>(0, a); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm512_maskz_cvtt_roundpd_epi32(0b00001111, a, _MM_FROUND_NO_EXC); + let r = _mm512_maskz_cvtt_roundpd_epi32::<_MM_FROUND_NO_EXC>(0b00001111, a); let e = _mm256_setr_epi32(0, -1, 2, -3, 0, 0, 
0, 0); assert_eq_m256i(r, e); } @@ -5119,7 +5119,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvtt_roundpd_epu32() { let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); - let r = _mm512_cvtt_roundpd_epu32(a, _MM_FROUND_NO_EXC); + let r = _mm512_cvtt_roundpd_epu32::<_MM_FROUND_NO_EXC>(a); let e = _mm256_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1); assert_eq_m256i(r, e); } @@ -5128,9 +5128,9 @@ mod tests { unsafe fn test_mm512_mask_cvtt_roundpd_epu32() { let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); let src = _mm256_set1_epi32(0); - let r = _mm512_mask_cvtt_roundpd_epu32(src, 0, a, _MM_FROUND_NO_EXC); + let r = _mm512_mask_cvtt_roundpd_epu32::<_MM_FROUND_NO_EXC>(src, 0, a); assert_eq_m256i(r, src); - let r = _mm512_mask_cvtt_roundpd_epu32(src, 0b00001111, a, _MM_FROUND_NO_EXC); + let r = _mm512_mask_cvtt_roundpd_epu32::<_MM_FROUND_NO_EXC>(src, 0b00001111, a); let e = _mm256_setr_epi32(0, -1, 2, -1, 0, 0, 0, 0); assert_eq_m256i(r, e); } @@ -5138,9 +5138,9 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_cvtt_roundpd_epu32() { let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); - let r = _mm512_maskz_cvtt_roundpd_epu32(0, a, _MM_FROUND_NO_EXC); + let r = _mm512_maskz_cvtt_roundpd_epu32::<_MM_FROUND_NO_EXC>(0, a); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm512_maskz_cvtt_roundpd_epu32(0b00001111, a, _MM_FROUND_NO_EXC); + let r = _mm512_maskz_cvtt_roundpd_epu32::<_MM_FROUND_NO_EXC>(0b00001111, a); let e = _mm256_setr_epi32(0, -1, 2, -1, 0, 0, 0, 0); assert_eq_m256i(r, e); } @@ -6259,7 +6259,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cvt_roundps_pd() { let a = _mm256_setr_ps(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); - let r = _mm512_cvt_roundps_pd(a, _MM_FROUND_CUR_DIRECTION); + let r = _mm512_cvt_roundps_pd::<_MM_FROUND_CUR_DIRECTION>(a); let e = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); assert_eq_m512d(r, e); } @@ -6268,9 
+6268,9 @@ mod tests { unsafe fn test_mm512_mask_cvt_roundps_pd() { let a = _mm256_setr_ps(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); let src = _mm512_set1_pd(0.); - let r = _mm512_mask_cvt_roundps_pd(src, 0, a, _MM_FROUND_CUR_DIRECTION); + let r = _mm512_mask_cvt_roundps_pd::<_MM_FROUND_CUR_DIRECTION>(src, 0, a); assert_eq_m512d(r, src); - let r = _mm512_mask_cvt_roundps_pd(src, 0b00001111, a, _MM_FROUND_CUR_DIRECTION); + let r = _mm512_mask_cvt_roundps_pd::<_MM_FROUND_CUR_DIRECTION>(src, 0b00001111, a); let e = _mm512_setr_pd(0., -1.5, 2., -3.5, 0., 0., 0., 0.); assert_eq_m512d(r, e); } @@ -6278,9 +6278,9 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_cvt_roundps_pd() { let a = _mm256_setr_ps(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); - let r = _mm512_maskz_cvt_roundps_pd(0, a, _MM_FROUND_CUR_DIRECTION); + let r = _mm512_maskz_cvt_roundps_pd::<_MM_FROUND_CUR_DIRECTION>(0, a); assert_eq_m512d(r, _mm512_setzero_pd()); - let r = _mm512_maskz_cvt_roundps_pd(0b00001111, a, _MM_FROUND_CUR_DIRECTION); + let r = _mm512_maskz_cvt_roundps_pd::<_MM_FROUND_CUR_DIRECTION>(0b00001111, a); let e = _mm512_setr_pd(0., -1.5, 2., -3.5, 0., 0., 0., 0.); assert_eq_m512d(r, e); } From 3bb7aeff4271aefc8fd48f006cb5696a225498d9 Mon Sep 17 00:00:00 2001 From: jironglin Date: Wed, 3 Mar 2021 17:02:33 +0000 Subject: [PATCH 020/123] shuffle_epi32 --- crates/core_arch/src/x86/avx512f.rs | 137 +++++++++------------------- 1 file changed, 44 insertions(+), 93 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index bcd826d700..c50bd73360 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -21523,75 +21523,32 @@ pub unsafe fn _mm_mask2_permutex2var_pd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_shuffle_epi32&expand=5150) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpermilps, imm8 = 9))] 
//should be vpshufd, but generate vpermilps -#[rustc_args_required_const(1)] -pub unsafe fn _mm512_shuffle_epi32(a: __m512i, imm8: _MM_PERM_ENUM) -> __m512i { - let imm8 = (imm8 & 0xFF) as u8; - let a = a.as_i32x16(); - macro_rules! shuffle4 { - ( - $a:expr, - $b:expr, - $c:expr, - $d:expr, - $e:expr, - $f:expr, - $g:expr, - $h:expr, - $i:expr, - $j:expr, - $k:expr, - $l:expr, - $m:expr, - $n:expr, - $o:expr, - $p:expr - ) => { - simd_shuffle16( - a, - a, - [ - $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p, - ], - ) - }; - } - macro_rules! shuffle3 { - ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr, $i:expr, $j:expr, $k:expr, $m:expr, $n:expr, $o:expr) => { - match (imm8 >> 6) & 0x3 { - 0 => shuffle4!($a, $b, $c, 16, $e, $f, $g, 20, $i, $j, $k, 24, $m, $n, $o, 28), - 1 => shuffle4!($a, $b, $c, 17, $e, $f, $g, 21, $i, $j, $k, 25, $m, $n, $o, 29), - 2 => shuffle4!($a, $b, $c, 18, $e, $f, $g, 22, $i, $j, $k, 26, $m, $n, $o, 30), - _ => shuffle4!($a, $b, $c, 19, $e, $f, $g, 23, $i, $j, $k, 27, $m, $n, $o, 31), - } - }; - } - macro_rules! shuffle2 { - ($a:expr, $b:expr, $e:expr, $f:expr, $i:expr, $j:expr, $m:expr, $n:expr) => { - match (imm8 >> 4) & 0x3 { - 0 => shuffle3!($a, $b, 16, $e, $f, 20, $i, $j, 24, $m, $n, 28), - 1 => shuffle3!($a, $b, 17, $e, $f, 21, $i, $j, 25, $m, $n, 29), - 2 => shuffle3!($a, $b, 18, $e, $f, 22, $i, $j, 26, $m, $n, 30), - _ => shuffle3!($a, $b, 19, $e, $f, 23, $i, $j, 27, $m, $n, 31), - } - }; - } - macro_rules! 
shuffle1 { - ($a:expr, $e:expr, $i: expr, $m: expr) => { - match (imm8 >> 2) & 0x3 { - 0 => shuffle2!($a, 0, $e, 4, $i, 8, $m, 12), - 1 => shuffle2!($a, 1, $e, 5, $i, 9, $m, 13), - 2 => shuffle2!($a, 2, $e, 6, $i, 10, $m, 14), - _ => shuffle2!($a, 3, $e, 7, $i, 11, $m, 15), - } - }; - } - let r: i32x16 = match imm8 & 0x3 { - 0 => shuffle1!(0, 4, 8, 12), - 1 => shuffle1!(1, 5, 9, 13), - 2 => shuffle1!(2, 6, 10, 14), - _ => shuffle1!(3, 7, 11, 15), - }; +#[cfg_attr(test, assert_instr(vpermilps, MASK = 9))] //should be vpshufd +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_shuffle_epi32(a: __m512i) -> __m512i { + static_assert_imm8!(MASK); + let r: i32x16 = simd_shuffle16( + a.as_i32x16(), + a.as_i32x16(), + [ + MASK as u32 & 0b11, + (MASK as u32 >> 2) & 0b11, + (MASK as u32 >> 4) & 0b11, + (MASK as u32 >> 6) & 0b11, + (MASK as u32 & 0b11) + 4, + ((MASK as u32 >> 2) & 0b11) + 4, + ((MASK as u32 >> 4) & 0b11) + 4, + ((MASK as u32 >> 6) & 0b11) + 4, + (MASK as u32 & 0b11) + 8, + ((MASK as u32 >> 2) & 0b11) + 8, + ((MASK as u32 >> 4) & 0b11) + 8, + ((MASK as u32 >> 6) & 0b11) + 8, + (MASK as u32 & 0b11) + 12, + ((MASK as u32 >> 2) & 0b11) + 12, + ((MASK as u32 >> 4) & 0b11) + 12, + ((MASK as u32 >> 6) & 0b11) + 12, + ], + ); transmute(r) } @@ -21600,20 +21557,15 @@ pub unsafe fn _mm512_shuffle_epi32(a: __m512i, imm8: _MM_PERM_ENUM) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_epi32&expand=5148) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpshufd, imm8 = 9))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_shuffle_epi32( +#[cfg_attr(test, assert_instr(vpshufd, MASK = 9))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_shuffle_epi32( src: __m512i, k: __mmask16, a: __m512i, - imm8: _MM_PERM_ENUM, ) -> __m512i { - macro_rules! 
call { - ($imm8:expr) => { - _mm512_shuffle_epi32(a, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + static_assert_imm8!(MASK); + let r = _mm512_shuffle_epi32::(a); transmute(simd_select_bitmask(k, r.as_i32x16(), src.as_i32x16())) } @@ -21622,15 +21574,14 @@ pub unsafe fn _mm512_mask_shuffle_epi32( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_epi32&expand=5149) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpshufd, imm8 = 9))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_shuffle_epi32(k: __mmask16, a: __m512i, imm8: _MM_PERM_ENUM) -> __m512i { - macro_rules! call { - ($imm8:expr) => { - _mm512_shuffle_epi32(a, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); +#[cfg_attr(test, assert_instr(vpshufd, MASK = 9))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_shuffle_epi32( + k: __mmask16, + a: __m512i, +) -> __m512i { + static_assert_imm8!(MASK); + let r = _mm512_shuffle_epi32::(a); let zero = _mm512_setzero_si512().as_i32x16(); transmute(simd_select_bitmask(k, r.as_i32x16(), zero)) } @@ -47705,7 +47656,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_shuffle_epi32() { let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16); - let r = _mm512_shuffle_epi32(a, _MM_PERM_AADD); + let r = _mm512_shuffle_epi32::<_MM_PERM_AADD>(a); let e = _mm512_setr_epi32(8, 8, 1, 1, 16, 16, 9, 9, 8, 8, 1, 1, 16, 16, 9, 9); assert_eq_m512i(r, e); } @@ -47713,9 +47664,9 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_shuffle_epi32() { let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16); - let r = _mm512_mask_shuffle_epi32(a, 0, a, _MM_PERM_AADD); + let r = _mm512_mask_shuffle_epi32::<_MM_PERM_AADD>(a, 0, a); assert_eq_m512i(r, a); - let r = _mm512_mask_shuffle_epi32(a, 0b11111111_11111111, a, _MM_PERM_AADD); + let r = 
_mm512_mask_shuffle_epi32::<_MM_PERM_AADD>(a, 0b11111111_11111111, a); let e = _mm512_setr_epi32(8, 8, 1, 1, 16, 16, 9, 9, 8, 8, 1, 1, 16, 16, 9, 9); assert_eq_m512i(r, e); } @@ -47723,9 +47674,9 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_shuffle_epi32() { let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16); - let r = _mm512_maskz_shuffle_epi32(0, a, _MM_PERM_AADD); + let r = _mm512_maskz_shuffle_epi32::<_MM_PERM_AADD>(0, a); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_shuffle_epi32(0b00000000_11111111, a, _MM_PERM_AADD); + let r = _mm512_maskz_shuffle_epi32::<_MM_PERM_AADD>(0b00000000_11111111, a); let e = _mm512_setr_epi32(8, 8, 1, 1, 16, 16, 9, 9, 0, 0, 0, 0, 0, 0, 0, 0); assert_eq_m512i(r, e); } From e4a915d4c5e07683e968d716c34274e45d06761a Mon Sep 17 00:00:00 2001 From: jironglin Date: Wed, 3 Mar 2021 18:07:40 +0000 Subject: [PATCH 021/123] mm256_shuffle_epi32 --- crates/core_arch/src/x86/avx2.rs | 85 ++++++----------------------- crates/core_arch/src/x86/avx512f.rs | 40 ++++++-------- 2 files changed, 35 insertions(+), 90 deletions(-) diff --git a/crates/core_arch/src/x86/avx2.rs b/crates/core_arch/src/x86/avx2.rs index e1fa8bc9b9..ae15fc6db6 100644 --- a/crates/core_arch/src/x86/avx2.rs +++ b/crates/core_arch/src/x86/avx2.rs @@ -2642,74 +2642,25 @@ pub unsafe fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_epi32) #[inline] #[target_feature(enable = "avx2")] -#[cfg_attr(test, assert_instr(vpermilps, imm8 = 9))] -#[rustc_args_required_const(1)] +#[cfg_attr(test, assert_instr(vpermilps, MASK = 9))] +#[rustc_legacy_const_generics(1)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_shuffle_epi32(a: __m256i, imm8: i32) -> __m256i { - // simd_shuffleX requires that its selector parameter be made up of - // constant values, but we can't 
enforce that here. In spirit, we need - // to write a `match` on all possible values of a byte, and for each value, - // hard-code the correct `simd_shuffleX` call using only constants. We - // then hope for LLVM to do the rest. - // - // Of course, that's... awful. So we try to use macros to do it for us. - let imm8 = (imm8 & 0xFF) as u8; - - let a = a.as_i32x8(); - macro_rules! shuffle_done { - ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => { - simd_shuffle8( - a, - a, - [ - $x01, - $x23, - $x45, - $x67, - 4 + $x01, - 4 + $x23, - 4 + $x45, - 4 + $x67, - ], - ) - }; - } - macro_rules! shuffle_x67 { - ($x01:expr, $x23:expr, $x45:expr) => { - match (imm8 >> 6) & 0b11 { - 0b00 => shuffle_done!($x01, $x23, $x45, 0), - 0b01 => shuffle_done!($x01, $x23, $x45, 1), - 0b10 => shuffle_done!($x01, $x23, $x45, 2), - _ => shuffle_done!($x01, $x23, $x45, 3), - } - }; - } - macro_rules! shuffle_x45 { - ($x01:expr, $x23:expr) => { - match (imm8 >> 4) & 0b11 { - 0b00 => shuffle_x67!($x01, $x23, 0), - 0b01 => shuffle_x67!($x01, $x23, 1), - 0b10 => shuffle_x67!($x01, $x23, 2), - _ => shuffle_x67!($x01, $x23, 3), - } - }; - } - macro_rules! 
shuffle_x23 { - ($x01:expr) => { - match (imm8 >> 2) & 0b11 { - 0b00 => shuffle_x45!($x01, 0), - 0b01 => shuffle_x45!($x01, 1), - 0b10 => shuffle_x45!($x01, 2), - _ => shuffle_x45!($x01, 3), - } - }; - } - let r: i32x8 = match imm8 & 0b11 { - 0b00 => shuffle_x23!(0), - 0b01 => shuffle_x23!(1), - 0b10 => shuffle_x23!(2), - _ => shuffle_x23!(3), - }; +pub unsafe fn _mm256_shuffle_epi32(a: __m256i) -> __m256i { + static_assert_imm8!(MASK); + let r: i32x8 = simd_shuffle8( + a.as_i32x8(), + a.as_i32x8(), + [ + MASK as u32 & 0b11, + (MASK as u32 >> 2) & 0b11, + (MASK as u32 >> 4) & 0b11, + (MASK as u32 >> 6) & 0b11, + (MASK as u32 & 0b11) + 4, + ((MASK as u32 >> 2) & 0b11) + 4, + ((MASK as u32 >> 4) & 0b11) + 4, + ((MASK as u32 >> 6) & 0b11) + 4, + ], + ); transmute(r) } diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index c50bd73360..befa3047c6 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -21591,20 +21591,15 @@ pub unsafe fn _mm512_maskz_shuffle_epi32( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shuffle_epi32&expand=5145) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpshufd, imm8 = 9))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm256_mask_shuffle_epi32( +#[cfg_attr(test, assert_instr(vpshufd, MASK = 9))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm256_mask_shuffle_epi32( src: __m256i, k: __mmask8, a: __m256i, - imm8: _MM_PERM_ENUM, ) -> __m256i { - macro_rules! 
call { - ($imm8:expr) => { - _mm256_shuffle_epi32(a, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + static_assert_imm8!(MASK); + let r = _mm256_shuffle_epi32::(a); transmute(simd_select_bitmask(k, r.as_i32x8(), src.as_i32x8())) } @@ -21613,15 +21608,14 @@ pub unsafe fn _mm256_mask_shuffle_epi32( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shuffle_epi32&expand=5146) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpshufd, imm8 = 9))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm256_maskz_shuffle_epi32(k: __mmask8, a: __m256i, imm8: _MM_PERM_ENUM) -> __m256i { - macro_rules! call { - ($imm8:expr) => { - _mm256_shuffle_epi32(a, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); +#[cfg_attr(test, assert_instr(vpshufd, MASK = 9))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm256_maskz_shuffle_epi32( + k: __mmask8, + a: __m256i, +) -> __m256i { + static_assert_imm8!(MASK); + let r = _mm256_shuffle_epi32::(a); let zero = _mm256_setzero_si256().as_i32x8(); transmute(simd_select_bitmask(k, r.as_i32x8(), zero)) } @@ -47684,9 +47678,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_mask_shuffle_epi32() { let a = _mm256_set_epi32(1, 4, 5, 8, 9, 12, 13, 16); - let r = _mm256_mask_shuffle_epi32(a, 0, a, _MM_PERM_AADD); + let r = _mm256_mask_shuffle_epi32::<_MM_PERM_AADD>(a, 0, a); assert_eq_m256i(r, a); - let r = _mm256_mask_shuffle_epi32(a, 0b11111111, a, _MM_PERM_AADD); + let r = _mm256_mask_shuffle_epi32::<_MM_PERM_AADD>(a, 0b11111111, a); let e = _mm256_set_epi32(8, 8, 1, 1, 16, 16, 9, 9); assert_eq_m256i(r, e); } @@ -47694,9 +47688,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_maskz_shuffle_epi32() { let a = _mm256_set_epi32(1, 4, 5, 8, 9, 12, 13, 16); - let r = _mm256_maskz_shuffle_epi32(0, a, _MM_PERM_AADD); + let r = _mm256_maskz_shuffle_epi32::<_MM_PERM_AADD>(0, a); 
assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_shuffle_epi32(0b11111111, a, _MM_PERM_AADD); + let r = _mm256_maskz_shuffle_epi32::<_MM_PERM_AADD>(0b11111111, a); let e = _mm256_set_epi32(8, 8, 1, 1, 16, 16, 9, 9); assert_eq_m256i(r, e); } From 743bd9d67494fb027faccd6cc2e654f90cb37fc0 Mon Sep 17 00:00:00 2001 From: jironglin Date: Wed, 3 Mar 2021 20:48:07 +0000 Subject: [PATCH 022/123] mm512_srai_epi32 --- crates/core_arch/src/x86/avx512f.rs | 100 ++++++++++++---------------- crates/core_arch/src/x86/macros.rs | 16 +++++ 2 files changed, 59 insertions(+), 57 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index befa3047c6..8f3c80e113 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -18338,16 +18338,12 @@ pub unsafe fn _mm_maskz_sra_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __ /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_srai_epi32&expand=5436) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpsrad, imm8 = 1))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm512_srai_epi32(a: __m512i, imm8: u32) -> __m512i { +#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_srai_epi32(a: __m512i) -> __m512i { + static_assert_imm8u!(IMM8); let a = a.as_i32x16(); - macro_rules! 
call { - ($imm8:expr) => { - vpsraid(a, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = vpsraid(a, IMM8); transmute(r) } @@ -18356,17 +18352,17 @@ pub unsafe fn _mm512_srai_epi32(a: __m512i, imm8: u32) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_srai_epi32&expand=5434) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpsrad, imm8 = 1))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_srai_epi32(src: __m512i, k: __mmask16, a: __m512i, imm8: u32) -> __m512i { +#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_srai_epi32( + src: __m512i, + k: __mmask16, + a: __m512i, +) -> __m512i { + static_assert_imm8u!(IMM8); let a = a.as_i32x16(); - macro_rules! call { - ($imm8:expr) => { - vpsraid(a, $imm8) - }; - } - let shf = constify_imm8_sae!(imm8, call); - transmute(simd_select_bitmask(k, shf, src.as_i32x16())) + let r = vpsraid(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i32x16())) } /// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -18374,18 +18370,14 @@ pub unsafe fn _mm512_mask_srai_epi32(src: __m512i, k: __mmask16, a: __m512i, imm /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_srai_epi32&expand=5435) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpsrad, imm8 = 1))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_srai_epi32(k: __mmask16, a: __m512i, imm8: u32) -> __m512i { +#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_srai_epi32(k: __mmask16, a: __m512i) -> __m512i { + static_assert_imm8u!(IMM8); let a = a.as_i32x16(); - macro_rules! 
call { - ($imm8:expr) => { - vpsraid(a, $imm8) - }; - } - let shf = constify_imm8_sae!(imm8, call); + let r = vpsraid(a, IMM8); let zero = _mm512_setzero_si512().as_i32x16(); - transmute(simd_select_bitmask(k, shf, zero)) + transmute(simd_select_bitmask(k, r, zero)) } /// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -21625,20 +21617,15 @@ pub unsafe fn _mm256_maskz_shuffle_epi32( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shuffle_epi32&expand=5142) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpshufd, imm8 = 9))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_mask_shuffle_epi32( +#[cfg_attr(test, assert_instr(vpshufd, MASK = 9))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_mask_shuffle_epi32( src: __m128i, k: __mmask8, a: __m128i, - imm8: _MM_PERM_ENUM, ) -> __m128i { - macro_rules! call { - ($imm8:expr) => { - _mm_shuffle_epi32::<$imm8>(a) - }; - } - let r = constify_imm8_sae!(imm8, call); + static_assert_imm8!(MASK); + let r = _mm_shuffle_epi32::(a); transmute(simd_select_bitmask(k, r.as_i32x4(), src.as_i32x4())) } @@ -21647,15 +21634,14 @@ pub unsafe fn _mm_mask_shuffle_epi32( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shuffle_epi32&expand=5143) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpshufd, imm8 = 9))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_maskz_shuffle_epi32(k: __mmask8, a: __m128i, imm8: _MM_PERM_ENUM) -> __m128i { - macro_rules! 
call { - ($imm8:expr) => { - _mm_shuffle_epi32::<$imm8>(a) - }; - } - let r = constify_imm8_sae!(imm8, call); +#[cfg_attr(test, assert_instr(vpshufd, MASK = 9))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_maskz_shuffle_epi32( + k: __mmask8, + a: __m128i, +) -> __m128i { + static_assert_imm8!(MASK); + let r = _mm_shuffle_epi32::(a); let zero = _mm_setzero_si128().as_i32x4(); transmute(simd_select_bitmask(k, r.as_i32x4(), zero)) } @@ -46913,7 +46899,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_srai_epi32() { let a = _mm512_set_epi32(8, -8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, -15); - let r = _mm512_srai_epi32(a, 2); + let r = _mm512_srai_epi32::<2>(a); let e = _mm512_set_epi32(2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, -4); assert_eq_m512i(r, e); } @@ -46921,9 +46907,9 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_srai_epi32() { let a = _mm512_set_epi32(8, -8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, -15); - let r = _mm512_mask_srai_epi32(a, 0, a, 2); + let r = _mm512_mask_srai_epi32::<2>(a, 0, a); assert_eq_m512i(r, a); - let r = _mm512_mask_srai_epi32(a, 0b11111111_11111111, a, 2); + let r = _mm512_mask_srai_epi32::<2>(a, 0b11111111_11111111, a); let e = _mm512_set_epi32(2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, -4); assert_eq_m512i(r, e); } @@ -46931,9 +46917,9 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_srai_epi32() { let a = _mm512_set_epi32(8, -8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, -15); - let r = _mm512_maskz_srai_epi32(0, a, 2); + let r = _mm512_maskz_srai_epi32::<2>(0, a); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_srai_epi32(0b00000000_11111111, a, 2); + let r = _mm512_maskz_srai_epi32::<2>(0b00000000_11111111, a); let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, -4); assert_eq_m512i(r, e); } @@ -47698,9 +47684,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn 
test_mm_mask_shuffle_epi32() { let a = _mm_set_epi32(1, 4, 5, 8); - let r = _mm_mask_shuffle_epi32(a, 0, a, _MM_PERM_AADD); + let r = _mm_mask_shuffle_epi32::<_MM_PERM_AADD>(a, 0, a); assert_eq_m128i(r, a); - let r = _mm_mask_shuffle_epi32(a, 0b00001111, a, _MM_PERM_AADD); + let r = _mm_mask_shuffle_epi32::<_MM_PERM_AADD>(a, 0b00001111, a); let e = _mm_set_epi32(8, 8, 1, 1); assert_eq_m128i(r, e); } @@ -47708,9 +47694,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm_maskz_shuffle_epi32() { let a = _mm_set_epi32(1, 4, 5, 8); - let r = _mm_maskz_shuffle_epi32(0, a, _MM_PERM_AADD); + let r = _mm_maskz_shuffle_epi32::<_MM_PERM_AADD>(0, a); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_shuffle_epi32(0b00001111, a, _MM_PERM_AADD); + let r = _mm_maskz_shuffle_epi32::<_MM_PERM_AADD>(0b00001111, a); let e = _mm_set_epi32(8, 8, 1, 1); assert_eq_m128i(r, e); } diff --git a/crates/core_arch/src/x86/macros.rs b/crates/core_arch/src/x86/macros.rs index e659ac3da8..ecb7085d18 100644 --- a/crates/core_arch/src/x86/macros.rs +++ b/crates/core_arch/src/x86/macros.rs @@ -32,6 +32,22 @@ macro_rules! static_assert_sae { }; } +// Helper struct used to trigger const eval errors when the const generic immediate value `imm` is +// out of `bits`-bit range. +pub(crate) struct ValidateConstImmU; +impl ValidateConstImmU { + pub(crate) const VALID: () = { + let _ = 1 / ((IMM < (1 << BITS)) as usize); + }; +} + +#[allow(unused)] +macro_rules! static_assert_imm8u { + ($imm:ident) => { + let _ = $crate::core_arch::x86::macros::ValidateConstImmU::<$imm, 8>::VALID; + }; +} + macro_rules! 
constify_imm6 { ($imm8:expr, $expand:ident) => { #[allow(overflowing_literals)] From ff63fc18756fa4de519ebeacf7f1ea78a3fcb934 Mon Sep 17 00:00:00 2001 From: Lzu Tao Date: Sun, 28 Feb 2021 12:11:20 +0700 Subject: [PATCH 023/123] Add `static_assert_imm{1,5,6}` macros --- crates/core_arch/src/macros.rs | 44 ++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/crates/core_arch/src/macros.rs b/crates/core_arch/src/macros.rs index bc43f039b7..87e49fba4b 100644 --- a/crates/core_arch/src/macros.rs +++ b/crates/core_arch/src/macros.rs @@ -1,46 +1,60 @@ //! Utility macros. -// Helper struct used to trigger const eval errors when the const generic immediate value `imm` is -// out of `bits`-bit range. -pub(crate) struct ValidateConstImm; -impl ValidateConstImm { +// Helper struct used to trigger const eval errors when the const generic immediate value `IMM` is +// out of `[MIN-MAX]` range. +pub(crate) struct ValidateConstImm; +impl ValidateConstImm { pub(crate) const VALID: () = { - let _ = 1 / ((IMM >= 0 && IMM < (1 << BITS)) as usize); + let _ = 1 / ((IMM >= MIN && IMM <= MAX) as usize); }; } -#[allow(unused)] +#[allow(unused_macros)] macro_rules! static_assert_imm1 { ($imm:ident) => { - let _ = $crate::core_arch::macros::ValidateConstImm::<$imm, 1>::VALID; + let _ = $crate::core_arch::macros::ValidateConstImm::<$imm, 0, { (1 << 1) - 1 }>::VALID; }; } -#[allow(unused)] +#[allow(unused_macros)] macro_rules! static_assert_imm2 { ($imm:ident) => { - let _ = $crate::core_arch::macros::ValidateConstImm::<$imm, 2>::VALID; + let _ = $crate::core_arch::macros::ValidateConstImm::<$imm, 0, { (1 << 2) - 1 }>::VALID; }; } -#[allow(unused)] +#[allow(unused_macros)] macro_rules! static_assert_imm3 { ($imm:ident) => { - let _ = $crate::core_arch::macros::ValidateConstImm::<$imm, 3>::VALID; + let _ = $crate::core_arch::macros::ValidateConstImm::<$imm, 0, { (1 << 3) - 1 }>::VALID; }; } -#[allow(unused)] +#[allow(unused_macros)] macro_rules! 
static_assert_imm4 { ($imm:ident) => { - let _ = $crate::core_arch::macros::ValidateConstImm::<$imm, 4>::VALID; + let _ = $crate::core_arch::macros::ValidateConstImm::<$imm, 0, { (1 << 4) - 1 }>::VALID; }; } -#[allow(unused)] +#[allow(unused_macros)] +macro_rules! static_assert_imm5 { + ($imm:ident) => { + let _ = $crate::core_arch::macros::ValidateConstImm::<$imm, 0, { (1 << 5) - 1 }>::VALID; + }; +} + +#[allow(unused_macros)] +macro_rules! static_assert_imm6 { + ($imm:ident) => { + let _ = $crate::core_arch::macros::ValidateConstImm::<$imm, 0, { (1 << 6) - 1 }>::VALID; + }; +} + +#[allow(unused_macros)] macro_rules! static_assert_imm8 { ($imm:ident) => { - let _ = $crate::core_arch::macros::ValidateConstImm::<$imm, 8>::VALID; + let _ = $crate::core_arch::macros::ValidateConstImm::<$imm, 0, { (1 << 8) - 1 }>::VALID; }; } From 374060ebcd03186f673c013f7ce2f03f89e8e1c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Mon, 1 Mar 2021 20:55:40 +0100 Subject: [PATCH 024/123] Convert `_mm_cmp_pd` to const generics and fix imm width --- crates/core_arch/src/x86/avx.rs | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/crates/core_arch/src/x86/avx.rs b/crates/core_arch/src/x86/avx.rs index 559d6279b2..315cd18efd 100644 --- a/crates/core_arch/src/x86/avx.rs +++ b/crates/core_arch/src/x86/avx.rs @@ -792,16 +792,12 @@ pub const _CMP_TRUE_US: i32 = 0x1f; /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_pd) #[inline] #[target_feature(enable = "avx,sse2")] -#[cfg_attr(test, assert_instr(vcmpeqpd, imm8 = 0))] // TODO Validate vcmppd -#[rustc_args_required_const(2)] +#[cfg_attr(test, assert_instr(vcmpeqpd, IMM8 = 0))] // TODO Validate vcmppd +#[rustc_legacy_const_generics(2)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm_cmp_pd(a: __m128d, b: __m128d, imm8: i32) -> __m128d { - macro_rules! 
call { - ($imm8:expr) => { - vcmppd(a, b, $imm8) - }; - } - constify_imm6!(imm8, call) +pub unsafe fn _mm_cmp_pd(a: __m128d, b: __m128d) -> __m128d { + static_assert_imm5!(IMM8); + vcmppd(a, b, IMM8 as i8) } /// Compares packed double-precision (64-bit) floating-point @@ -3635,7 +3631,7 @@ mod tests { unsafe fn test_mm_cmp_pd() { let a = _mm_setr_pd(4., 9.); let b = _mm_setr_pd(4., 3.); - let r = _mm_cmp_pd(a, b, _CMP_GE_OS); + let r = _mm_cmp_pd::<_CMP_GE_OS>(a, b); assert!(get_m128d(r, 0).is_nan()); assert!(get_m128d(r, 1).is_nan()); } From 105064303230d3ea627164d9975bf4fd5a9c20f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Mon, 1 Mar 2021 20:59:48 +0100 Subject: [PATCH 025/123] Convert `_mm256_cmp_pd` to const generics and fix imm width --- crates/core_arch/src/x86/avx.rs | 16 ++++++---------- crates/core_arch/src/x86/test.rs | 2 +- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/crates/core_arch/src/x86/avx.rs b/crates/core_arch/src/x86/avx.rs index 315cd18efd..0b520fc6f0 100644 --- a/crates/core_arch/src/x86/avx.rs +++ b/crates/core_arch/src/x86/avx.rs @@ -807,16 +807,12 @@ pub unsafe fn _mm_cmp_pd(a: __m128d, b: __m128d) -> __m128d { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmp_pd) #[inline] #[target_feature(enable = "avx")] -#[cfg_attr(test, assert_instr(vcmpeqpd, imm8 = 0))] // TODO Validate vcmppd -#[rustc_args_required_const(2)] +#[cfg_attr(test, assert_instr(vcmpeqpd, IMM8 = 0))] // TODO Validate vcmppd +#[rustc_legacy_const_generics(2)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_cmp_pd(a: __m256d, b: __m256d, imm8: i32) -> __m256d { - macro_rules! 
call { - ($imm8:expr) => { - vcmppd256(a, b, $imm8) - }; - } - constify_imm6!(imm8, call) +pub unsafe fn _mm256_cmp_pd(a: __m256d, b: __m256d) -> __m256d { + static_assert_imm5!(IMM8); + vcmppd256(a, b, IMM8 as u8) } /// Compares packed single-precision (32-bit) floating-point @@ -3640,7 +3636,7 @@ mod tests { unsafe fn test_mm256_cmp_pd() { let a = _mm256_setr_pd(1., 2., 3., 4.); let b = _mm256_setr_pd(5., 6., 7., 8.); - let r = _mm256_cmp_pd(a, b, _CMP_GE_OS); + let r = _mm256_cmp_pd::<_CMP_GE_OS>(a, b); let e = _mm256_set1_pd(0.); assert_eq_m256d(r, e); } diff --git a/crates/core_arch/src/x86/test.rs b/crates/core_arch/src/x86/test.rs index c1f974133c..d08052df32 100644 --- a/crates/core_arch/src/x86/test.rs +++ b/crates/core_arch/src/x86/test.rs @@ -47,7 +47,7 @@ pub unsafe fn assert_eq_m256i(a: __m256i, b: __m256i) { #[target_feature(enable = "avx")] pub unsafe fn assert_eq_m256d(a: __m256d, b: __m256d) { - let cmp = _mm256_cmp_pd(a, b, _CMP_EQ_OQ); + let cmp = _mm256_cmp_pd::<_CMP_EQ_OQ>(a, b); if _mm256_movemask_pd(cmp) != 0b1111 { panic!("{:?} != {:?}", a, b); } From 5cb5d50889cac474bf50b8627cdea1d2ed660eb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Mon, 1 Mar 2021 21:03:55 +0100 Subject: [PATCH 026/123] Convert `_mm_cmp_ps` to const generics and fix imm width --- crates/core_arch/src/x86/avx.rs | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/crates/core_arch/src/x86/avx.rs b/crates/core_arch/src/x86/avx.rs index 0b520fc6f0..5fc097169b 100644 --- a/crates/core_arch/src/x86/avx.rs +++ b/crates/core_arch/src/x86/avx.rs @@ -822,16 +822,12 @@ pub unsafe fn _mm256_cmp_pd(a: __m256d, b: __m256d) -> __m256d /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_ps) #[inline] #[target_feature(enable = "avx,sse")] -#[cfg_attr(test, assert_instr(vcmpeqps, imm8 = 0))] // TODO Validate vcmpps -#[rustc_args_required_const(2)] +#[cfg_attr(test, 
assert_instr(vcmpeqps, IMM8 = 0))] // TODO Validate vcmpps +#[rustc_legacy_const_generics(2)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm_cmp_ps(a: __m128, b: __m128, imm8: i32) -> __m128 { - macro_rules! call { - ($imm8:expr) => { - vcmpps(a, b, $imm8) - }; - } - constify_imm6!(imm8, call) +pub unsafe fn _mm_cmp_ps(a: __m128, b: __m128) -> __m128 { + static_assert_imm5!(IMM8); + vcmpps(a, b, IMM8 as i8) } /// Compares packed single-precision (32-bit) floating-point @@ -3645,7 +3641,7 @@ mod tests { unsafe fn test_mm_cmp_ps() { let a = _mm_setr_ps(4., 3., 2., 5.); let b = _mm_setr_ps(4., 9., 16., 25.); - let r = _mm_cmp_ps(a, b, _CMP_GE_OS); + let r = _mm_cmp_ps::<_CMP_GE_OS>(a, b); assert!(get_m128(r, 0).is_nan()); assert_eq!(get_m128(r, 1), 0.); assert_eq!(get_m128(r, 2), 0.); From 784e22e6649677274a4d4873f4453c6e8b2d53a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Mon, 1 Mar 2021 21:06:57 +0100 Subject: [PATCH 027/123] Convert `_mm256_cmp_ps` to const generics and fix imm width --- crates/core_arch/src/x86/avx.rs | 16 ++++++---------- crates/core_arch/src/x86/test.rs | 2 +- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/crates/core_arch/src/x86/avx.rs b/crates/core_arch/src/x86/avx.rs index 5fc097169b..a690bc5ad6 100644 --- a/crates/core_arch/src/x86/avx.rs +++ b/crates/core_arch/src/x86/avx.rs @@ -837,16 +837,12 @@ pub unsafe fn _mm_cmp_ps(a: __m128, b: __m128) -> __m128 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmp_ps) #[inline] #[target_feature(enable = "avx")] -#[cfg_attr(test, assert_instr(vcmpeqps, imm8 = 0))] // TODO Validate vcmpps -#[rustc_args_required_const(2)] +#[cfg_attr(test, assert_instr(vcmpeqps, IMM8 = 0))] // TODO Validate vcmpps +#[rustc_legacy_const_generics(2)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_cmp_ps(a: __m256, b: __m256, imm8: i32) -> __m256 { - macro_rules! 
call { - ($imm8:expr) => { - vcmpps256(a, b, $imm8) - }; - } - constify_imm6!(imm8, call) +pub unsafe fn _mm256_cmp_ps(a: __m256, b: __m256) -> __m256 { + static_assert_imm5!(IMM8); + vcmpps256(a, b, IMM8 as u8) } /// Compares the lower double-precision (64-bit) floating-point element in @@ -3652,7 +3648,7 @@ mod tests { unsafe fn test_mm256_cmp_ps() { let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.); let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.); - let r = _mm256_cmp_ps(a, b, _CMP_GE_OS); + let r = _mm256_cmp_ps::<_CMP_GE_OS>(a, b); let e = _mm256_set1_ps(0.); assert_eq_m256(r, e); } diff --git a/crates/core_arch/src/x86/test.rs b/crates/core_arch/src/x86/test.rs index d08052df32..0784e37524 100644 --- a/crates/core_arch/src/x86/test.rs +++ b/crates/core_arch/src/x86/test.rs @@ -60,7 +60,7 @@ pub unsafe fn get_m256d(a: __m256d, idx: usize) -> f64 { #[target_feature(enable = "avx")] pub unsafe fn assert_eq_m256(a: __m256, b: __m256) { - let cmp = _mm256_cmp_ps(a, b, _CMP_EQ_OQ); + let cmp = _mm256_cmp_ps::<_CMP_EQ_OQ>(a, b); if _mm256_movemask_ps(cmp) != 0b11111111 { panic!("{:?} != {:?}", a, b); } From 06c707ff448a2612aad3823b8ee1b0b5c6815db3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Mon, 1 Mar 2021 21:09:16 +0100 Subject: [PATCH 028/123] Convert `_mm_cmp_sd` to const generics and fix imm width --- crates/core_arch/src/x86/avx.rs | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/crates/core_arch/src/x86/avx.rs b/crates/core_arch/src/x86/avx.rs index a690bc5ad6..8bfd907857 100644 --- a/crates/core_arch/src/x86/avx.rs +++ b/crates/core_arch/src/x86/avx.rs @@ -854,16 +854,12 @@ pub unsafe fn _mm256_cmp_ps(a: __m256, b: __m256) -> __m256 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_sd) #[inline] #[target_feature(enable = "avx,sse2")] -#[cfg_attr(test, assert_instr(vcmpeqsd, imm8 = 0))] // TODO Validate vcmpsd 
-#[rustc_args_required_const(2)] +#[cfg_attr(test, assert_instr(vcmpeqsd, IMM8 = 0))] // TODO Validate vcmpsd +#[rustc_legacy_const_generics(2)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm_cmp_sd(a: __m128d, b: __m128d, imm8: i32) -> __m128d { - macro_rules! call { - ($imm8:expr) => { - vcmpsd(a, b, $imm8) - }; - } - constify_imm6!(imm8, call) +pub unsafe fn _mm_cmp_sd(a: __m128d, b: __m128d) -> __m128d { + static_assert_imm5!(IMM8); + vcmpsd(a, b, IMM8 as i8) } /// Compares the lower single-precision (32-bit) floating-point element in @@ -3657,7 +3653,7 @@ mod tests { unsafe fn test_mm_cmp_sd() { let a = _mm_setr_pd(4., 9.); let b = _mm_setr_pd(4., 3.); - let r = _mm_cmp_sd(a, b, _CMP_GE_OS); + let r = _mm_cmp_sd::<_CMP_GE_OS>(a, b); assert!(get_m128d(r, 0).is_nan()); assert_eq!(get_m128d(r, 1), 9.); } From 42643cb5f54cbf2f5d8713a3c1f1dd24dbbac919 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Mon, 1 Mar 2021 21:13:40 +0100 Subject: [PATCH 029/123] Convert `_mm_cmp_ss` to const generics and fix imm width --- crates/core_arch/src/x86/avx.rs | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/crates/core_arch/src/x86/avx.rs b/crates/core_arch/src/x86/avx.rs index 8bfd907857..98ba11fefa 100644 --- a/crates/core_arch/src/x86/avx.rs +++ b/crates/core_arch/src/x86/avx.rs @@ -871,16 +871,12 @@ pub unsafe fn _mm_cmp_sd(a: __m128d, b: __m128d) -> __m128d { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_ss) #[inline] #[target_feature(enable = "avx,sse")] -#[cfg_attr(test, assert_instr(vcmpeqss, imm8 = 0))] // TODO Validate vcmpss -#[rustc_args_required_const(2)] +#[cfg_attr(test, assert_instr(vcmpeqss, IMM8 = 0))] // TODO Validate vcmpss +#[rustc_legacy_const_generics(2)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm_cmp_ss(a: __m128, b: __m128, imm8: i32) -> __m128 { - macro_rules! 
call { - ($imm8:expr) => { - vcmpss(a, b, $imm8) - }; - } - constify_imm6!(imm8, call) +pub unsafe fn _mm_cmp_ss(a: __m128, b: __m128) -> __m128 { + static_assert_imm5!(IMM8); + vcmpss(a, b, IMM8 as i8) } /// Converts packed 32-bit integers in `a` to packed double-precision (64-bit) @@ -3662,7 +3658,7 @@ mod tests { unsafe fn test_mm_cmp_ss() { let a = _mm_setr_ps(4., 3., 2., 5.); let b = _mm_setr_ps(4., 9., 16., 25.); - let r = _mm_cmp_ss(a, b, _CMP_GE_OS); + let r = _mm_cmp_ss::<_CMP_GE_OS>(a, b); assert!(get_m128(r, 0).is_nan()); assert_eq!(get_m128(r, 1), 3.); assert_eq!(get_m128(r, 2), 2.); From 90fa961616a889ca704da2635fa244da4d2ab052 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Mon, 1 Mar 2021 21:23:57 +0100 Subject: [PATCH 030/123] Convert `_mm256_insert_epi8` to const generics --- crates/core_arch/src/x86/avx.rs | 15 +++++---------- crates/core_arch/src/x86/avx2.rs | 10 +++++----- 2 files changed, 10 insertions(+), 15 deletions(-) diff --git a/crates/core_arch/src/x86/avx.rs b/crates/core_arch/src/x86/avx.rs index 98ba11fefa..28e14fb758 100644 --- a/crates/core_arch/src/x86/avx.rs +++ b/crates/core_arch/src/x86/avx.rs @@ -1470,16 +1470,11 @@ pub unsafe fn _mm256_insertf128_si256(a: __m256i, b: __m128i, imm8: i32) -> __m2 #[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. -#[rustc_args_required_const(2)] +#[rustc_legacy_const_generics(2)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_insert_epi8(a: __m256i, i: i8, index: i32) -> __m256i { - let a = a.as_i8x32(); - macro_rules! 
call { - ($index:expr) => { - simd_insert(a, $index, i) - }; - } - transmute(constify_imm5!(index, call)) +pub unsafe fn _mm256_insert_epi8(a: __m256i, i: i8) -> __m256i { + static_assert_imm5!(INDEX); + transmute(simd_insert(a.as_i8x32(), INDEX as u32, i)) } /// Copies `a` to result, and inserts the 16-bit integer `i` into result @@ -3931,7 +3926,7 @@ mod tests { 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, ); - let r = _mm256_insert_epi8(a, 0, 31); + let r = _mm256_insert_epi8::<31>(a, 0); #[rustfmt::skip] let e = _mm256_setr_epi8( 1, 2, 3, 4, 5, 6, 7, 8, diff --git a/crates/core_arch/src/x86/avx2.rs b/crates/core_arch/src/x86/avx2.rs index ae15fc6db6..7fa1f1625e 100644 --- a/crates/core_arch/src/x86/avx2.rs +++ b/crates/core_arch/src/x86/avx2.rs @@ -4335,8 +4335,8 @@ mod tests { #[simd_test(enable = "avx2")] unsafe fn test_mm256_blendv_epi8() { let (a, b) = (_mm256_set1_epi8(4), _mm256_set1_epi8(2)); - let mask = _mm256_insert_epi8(_mm256_set1_epi8(0), -1, 2); - let e = _mm256_insert_epi8(_mm256_set1_epi8(4), 2, 2); + let mask = _mm256_insert_epi8::<2>(_mm256_set1_epi8(0), -1); + let e = _mm256_insert_epi8::<2>(_mm256_set1_epi8(4), 2); let r = _mm256_blendv_epi8(a, b, mask); assert_eq_m256i(r, e); } @@ -4455,7 +4455,7 @@ mod tests { 7, 6, 5, 4, 3, 2, 1, 0, ); let r = _mm256_cmpeq_epi8(a, b); - assert_eq_m256i(r, _mm256_insert_epi8(_mm256_set1_epi8(0), !0, 2)); + assert_eq_m256i(r, _mm256_insert_epi8::<2>(_mm256_set1_epi8(0), !0)); } #[simd_test(enable = "avx2")] @@ -4494,10 +4494,10 @@ mod tests { #[simd_test(enable = "avx2")] unsafe fn test_mm256_cmpgt_epi8() { - let a = _mm256_insert_epi8(_mm256_set1_epi8(0), 5, 0); + let a = _mm256_insert_epi8::<0>(_mm256_set1_epi8(0), 5); let b = _mm256_set1_epi8(0); let r = _mm256_cmpgt_epi8(a, b); - assert_eq_m256i(r, _mm256_insert_epi8(_mm256_set1_epi8(0), !0, 0)); + assert_eq_m256i(r, _mm256_insert_epi8::<0>(_mm256_set1_epi8(0), !0)); } #[simd_test(enable = "avx2")] From 
965341701f8b8976f977b1d82cbdc76972704b3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Mon, 1 Mar 2021 22:13:18 +0100 Subject: [PATCH 031/123] Convert `_mm256_extract_epi8` to const generics --- crates/core_arch/src/x86/avx2.rs | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/crates/core_arch/src/x86/avx2.rs b/crates/core_arch/src/x86/avx2.rs index 7fa1f1625e..62c678bffa 100644 --- a/crates/core_arch/src/x86/avx2.rs +++ b/crates/core_arch/src/x86/avx2.rs @@ -3694,16 +3694,11 @@ pub unsafe fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i { #[inline] #[target_feature(enable = "avx2")] // This intrinsic has no corresponding instruction. -#[rustc_args_required_const(1)] +#[rustc_legacy_const_generics(1)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_extract_epi8(a: __m256i, imm8: i32) -> i32 { - let a = a.as_u8x32(); - macro_rules! call { - ($imm5:expr) => { - simd_extract::<_, u8>(a, $imm5) as i32 - }; - } - constify_imm5!(imm8, call) +pub unsafe fn _mm256_extract_epi8(a: __m256i) -> i32 { + static_assert_imm5!(IMM8); + simd_extract::<_, u8>(a.as_u8x32(), IMM8 as u32) as i32 } /// Extracts a 16-bit integer from `a`, selected with `imm8`. 
Returns a 32-bit @@ -6071,8 +6066,8 @@ mod tests { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 ); - let r1 = _mm256_extract_epi8(a, 0); - let r2 = _mm256_extract_epi8(a, 35); + let r1 = _mm256_extract_epi8::<0>(a); + let r2 = _mm256_extract_epi8::<3>(a); assert_eq!(r1, 0xFF); assert_eq!(r2, 3); } From 7f47cf864fe645da26ffe7cb25950d6d55f2e3e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Tue, 2 Mar 2021 19:39:06 +0100 Subject: [PATCH 032/123] remove unused x86 macros --- crates/core_arch/src/x86/macros.rs | 65 ------------------------------ 1 file changed, 65 deletions(-) diff --git a/crates/core_arch/src/x86/macros.rs b/crates/core_arch/src/x86/macros.rs index ecb7085d18..bf734974af 100644 --- a/crates/core_arch/src/x86/macros.rs +++ b/crates/core_arch/src/x86/macros.rs @@ -48,71 +48,6 @@ macro_rules! static_assert_imm8u { }; } -macro_rules! constify_imm6 { - ($imm8:expr, $expand:ident) => { - #[allow(overflowing_literals)] - match ($imm8) & 0b1_1111 { - 0 => $expand!(0), - 1 => $expand!(1), - 2 => $expand!(2), - 3 => $expand!(3), - 4 => $expand!(4), - 5 => $expand!(5), - 6 => $expand!(6), - 7 => $expand!(7), - 8 => $expand!(8), - 9 => $expand!(9), - 10 => $expand!(10), - 11 => $expand!(11), - 12 => $expand!(12), - 13 => $expand!(13), - 14 => $expand!(14), - 15 => $expand!(15), - 16 => $expand!(16), - 17 => $expand!(17), - 18 => $expand!(18), - 19 => $expand!(19), - 20 => $expand!(20), - 21 => $expand!(21), - 22 => $expand!(22), - 23 => $expand!(23), - 24 => $expand!(24), - 25 => $expand!(25), - 26 => $expand!(26), - 27 => $expand!(27), - 28 => $expand!(28), - 29 => $expand!(29), - 30 => $expand!(30), - _ => $expand!(31), - } - }; -} - -#[allow(unused_macros)] -macro_rules! 
constify_imm4 { - ($imm8:expr, $expand:ident) => { - #[allow(overflowing_literals)] - match ($imm8) & 0b1111 { - 0 => $expand!(0), - 1 => $expand!(1), - 2 => $expand!(2), - 3 => $expand!(3), - 4 => $expand!(4), - 5 => $expand!(5), - 6 => $expand!(6), - 7 => $expand!(7), - 8 => $expand!(8), - 9 => $expand!(9), - 10 => $expand!(10), - 11 => $expand!(11), - 12 => $expand!(12), - 13 => $expand!(13), - 14 => $expand!(14), - _ => $expand!(15), - } - }; -} - macro_rules! constify_imm3 { ($imm8:expr, $expand:ident) => { #[allow(overflowing_literals)] From 5653eed489138795a424d0028fa7720960b5bb8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Wed, 3 Mar 2021 02:38:26 +0100 Subject: [PATCH 033/123] make some const generic immediates better match their width or the intel intrinsics guide --- crates/core_arch/src/x86/avx.rs | 60 ++++++++++++++++---------------- crates/core_arch/src/x86/avx2.rs | 24 ++++++------- 2 files changed, 42 insertions(+), 42 deletions(-) diff --git a/crates/core_arch/src/x86/avx.rs b/crates/core_arch/src/x86/avx.rs index 28e14fb758..53c4a00f42 100644 --- a/crates/core_arch/src/x86/avx.rs +++ b/crates/core_arch/src/x86/avx.rs @@ -787,66 +787,66 @@ pub const _CMP_TRUE_US: i32 = 0x1f; /// Compares packed double-precision (64-bit) floating-point /// elements in `a` and `b` based on the comparison operand -/// specified by `imm8`. +/// specified by `IMM5`. 
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_pd) #[inline] #[target_feature(enable = "avx,sse2")] -#[cfg_attr(test, assert_instr(vcmpeqpd, IMM8 = 0))] // TODO Validate vcmppd +#[cfg_attr(test, assert_instr(vcmpeqpd, IMM5 = 0))] // TODO Validate vcmppd #[rustc_legacy_const_generics(2)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm_cmp_pd(a: __m128d, b: __m128d) -> __m128d { - static_assert_imm5!(IMM8); - vcmppd(a, b, IMM8 as i8) +pub unsafe fn _mm_cmp_pd(a: __m128d, b: __m128d) -> __m128d { + static_assert_imm5!(IMM5); + vcmppd(a, b, IMM5 as i8) } /// Compares packed double-precision (64-bit) floating-point /// elements in `a` and `b` based on the comparison operand -/// specified by `imm8`. +/// specified by `IMM5`. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmp_pd) #[inline] #[target_feature(enable = "avx")] -#[cfg_attr(test, assert_instr(vcmpeqpd, IMM8 = 0))] // TODO Validate vcmppd +#[cfg_attr(test, assert_instr(vcmpeqpd, IMM5 = 0))] // TODO Validate vcmppd #[rustc_legacy_const_generics(2)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_cmp_pd(a: __m256d, b: __m256d) -> __m256d { - static_assert_imm5!(IMM8); - vcmppd256(a, b, IMM8 as u8) +pub unsafe fn _mm256_cmp_pd(a: __m256d, b: __m256d) -> __m256d { + static_assert_imm5!(IMM5); + vcmppd256(a, b, IMM5 as u8) } /// Compares packed single-precision (32-bit) floating-point /// elements in `a` and `b` based on the comparison operand -/// specified by `imm8`. +/// specified by `IMM5`. 
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_ps) #[inline] #[target_feature(enable = "avx,sse")] -#[cfg_attr(test, assert_instr(vcmpeqps, IMM8 = 0))] // TODO Validate vcmpps +#[cfg_attr(test, assert_instr(vcmpeqps, IMM5 = 0))] // TODO Validate vcmpps #[rustc_legacy_const_generics(2)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm_cmp_ps(a: __m128, b: __m128) -> __m128 { - static_assert_imm5!(IMM8); - vcmpps(a, b, IMM8 as i8) +pub unsafe fn _mm_cmp_ps(a: __m128, b: __m128) -> __m128 { + static_assert_imm5!(IMM5); + vcmpps(a, b, IMM5 as i8) } /// Compares packed single-precision (32-bit) floating-point /// elements in `a` and `b` based on the comparison operand -/// specified by `imm8`. +/// specified by `IMM5`. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmp_ps) #[inline] #[target_feature(enable = "avx")] -#[cfg_attr(test, assert_instr(vcmpeqps, IMM8 = 0))] // TODO Validate vcmpps +#[cfg_attr(test, assert_instr(vcmpeqps, IMM5 = 0))] // TODO Validate vcmpps #[rustc_legacy_const_generics(2)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_cmp_ps(a: __m256, b: __m256) -> __m256 { - static_assert_imm5!(IMM8); - vcmpps256(a, b, IMM8 as u8) +pub unsafe fn _mm256_cmp_ps(a: __m256, b: __m256) -> __m256 { + static_assert_imm5!(IMM5); + vcmpps256(a, b, IMM5 as u8) } /// Compares the lower double-precision (64-bit) floating-point element in -/// `a` and `b` based on the comparison operand specified by `imm8`, +/// `a` and `b` based on the comparison operand specified by `IMM5`, /// store the result in the lower element of returned vector, /// and copies the upper element from `a` to the upper element of returned /// vector. 
@@ -854,16 +854,16 @@ pub unsafe fn _mm256_cmp_ps(a: __m256, b: __m256) -> __m256 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_sd) #[inline] #[target_feature(enable = "avx,sse2")] -#[cfg_attr(test, assert_instr(vcmpeqsd, IMM8 = 0))] // TODO Validate vcmpsd +#[cfg_attr(test, assert_instr(vcmpeqsd, IMM5 = 0))] // TODO Validate vcmpsd #[rustc_legacy_const_generics(2)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm_cmp_sd(a: __m128d, b: __m128d) -> __m128d { - static_assert_imm5!(IMM8); - vcmpsd(a, b, IMM8 as i8) +pub unsafe fn _mm_cmp_sd(a: __m128d, b: __m128d) -> __m128d { + static_assert_imm5!(IMM5); + vcmpsd(a, b, IMM5 as i8) } /// Compares the lower single-precision (32-bit) floating-point element in -/// `a` and `b` based on the comparison operand specified by `imm8`, +/// `a` and `b` based on the comparison operand specified by `IMM5`, /// store the result in the lower element of returned vector, /// and copies the upper 3 packed elements from `a` to the upper elements of /// returned vector. 
@@ -871,12 +871,12 @@ pub unsafe fn _mm_cmp_sd(a: __m128d, b: __m128d) -> __m128d { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_ss) #[inline] #[target_feature(enable = "avx,sse")] -#[cfg_attr(test, assert_instr(vcmpeqss, IMM8 = 0))] // TODO Validate vcmpss +#[cfg_attr(test, assert_instr(vcmpeqss, IMM5 = 0))] // TODO Validate vcmpss #[rustc_legacy_const_generics(2)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm_cmp_ss(a: __m128, b: __m128) -> __m128 { - static_assert_imm5!(IMM8); - vcmpss(a, b, IMM8 as i8) +pub unsafe fn _mm_cmp_ss(a: __m128, b: __m128) -> __m128 { + static_assert_imm5!(IMM5); + vcmpss(a, b, IMM5 as i8) } /// Converts packed 32-bit integers in `a` to packed double-precision (64-bit) diff --git a/crates/core_arch/src/x86/avx2.rs b/crates/core_arch/src/x86/avx2.rs index 62c678bffa..c98c1d8005 100644 --- a/crates/core_arch/src/x86/avx2.rs +++ b/crates/core_arch/src/x86/avx2.rs @@ -3685,7 +3685,7 @@ pub unsafe fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i { transmute(simd_xor(a.as_i64x4(), b.as_i64x4())) } -/// Extracts an 8-bit integer from `a`, selected with `imm8`. Returns a 32-bit +/// Extracts an 8-bit integer from `a`, selected with `INDEX`. Returns a 32-bit /// integer containing the zero-extended integer data. /// /// See [LLVM commit D20468](https://reviews.llvm.org/D20468). @@ -3696,12 +3696,12 @@ pub unsafe fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i { // This intrinsic has no corresponding instruction. #[rustc_legacy_const_generics(1)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_extract_epi8(a: __m256i) -> i32 { - static_assert_imm5!(IMM8); - simd_extract::<_, u8>(a.as_u8x32(), IMM8 as u32) as i32 +pub unsafe fn _mm256_extract_epi8(a: __m256i) -> i32 { + static_assert_imm5!(INDEX); + simd_extract::<_, u8>(a.as_u8x32(), INDEX as u32) as i32 } -/// Extracts a 16-bit integer from `a`, selected with `imm8`. 
Returns a 32-bit +/// Extracts a 16-bit integer from `a`, selected with `INDEX`. Returns a 32-bit /// integer containing the zero-extended integer data. /// /// See [LLVM commit D20468](https://reviews.llvm.org/D20468). @@ -3712,12 +3712,12 @@ pub unsafe fn _mm256_extract_epi8(a: __m256i) -> i32 { // This intrinsic has no corresponding instruction. #[rustc_legacy_const_generics(1)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_extract_epi16(a: __m256i) -> i32 { - static_assert_imm4!(IMM8); - simd_extract::<_, u16>(a.as_u16x16(), IMM8 as u32) as i32 +pub unsafe fn _mm256_extract_epi16(a: __m256i) -> i32 { + static_assert_imm4!(INDEX); + simd_extract::<_, u16>(a.as_u16x16(), INDEX as u32) as i32 } -/// Extracts a 32-bit integer from `a`, selected with `imm8`. +/// Extracts a 32-bit integer from `a`, selected with `INDEX`. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extract_epi32) #[inline] @@ -3725,9 +3725,9 @@ pub unsafe fn _mm256_extract_epi16(a: __m256i) -> i32 { // This intrinsic has no corresponding instruction. #[rustc_legacy_const_generics(1)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_extract_epi32(a: __m256i) -> i32 { - static_assert_imm3!(IMM8); - simd_extract(a.as_i32x8(), IMM8 as u32) +pub unsafe fn _mm256_extract_epi32(a: __m256i) -> i32 { + static_assert_imm3!(INDEX); + simd_extract(a.as_i32x8(), INDEX as u32) } /// Returns the first element of the input vector of `[4 x double]`. 
From 4ec00defa601d8dac651a8b13b93da8013c76405 Mon Sep 17 00:00:00 2001 From: jironglin Date: Wed, 3 Mar 2021 21:01:18 +0000 Subject: [PATCH 034/123] fix macro --- crates/core_arch/src/x86/macros.rs | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/crates/core_arch/src/x86/macros.rs b/crates/core_arch/src/x86/macros.rs index bf734974af..4f160221d6 100644 --- a/crates/core_arch/src/x86/macros.rs +++ b/crates/core_arch/src/x86/macros.rs @@ -32,22 +32,6 @@ macro_rules! static_assert_sae { }; } -// Helper struct used to trigger const eval errors when the const generic immediate value `imm` is -// out of `bits`-bit range. -pub(crate) struct ValidateConstImmU; -impl ValidateConstImmU { - pub(crate) const VALID: () = { - let _ = 1 / ((IMM < (1 << BITS)) as usize); - }; -} - -#[allow(unused)] -macro_rules! static_assert_imm8u { - ($imm:ident) => { - let _ = $crate::core_arch::x86::macros::ValidateConstImmU::<$imm, 8>::VALID; - }; -} - macro_rules! constify_imm3 { ($imm8:expr, $expand:ident) => { #[allow(overflowing_literals)] From 01fb6247a16b92c445710730ffc38bbe5df05e8a Mon Sep 17 00:00:00 2001 From: jironglin Date: Wed, 3 Mar 2021 23:09:01 +0000 Subject: [PATCH 035/123] shuffle_i32x4 --- crates/core_arch/src/x86/avx512f.rs | 223 +++++++++------------------- 1 file changed, 72 insertions(+), 151 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 8f3c80e113..cdc3f2d003 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -21909,78 +21909,34 @@ pub unsafe fn _mm_maskz_shuffle_pd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_i32&expand=5177) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vshufi64x2, imm8 = 0b10010101))] //should be vshufi32x4 -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_shuffle_i32x4(a: __m512i, b: __m512i, imm8: i32) -> __m512i { - 
assert!(imm8 >= 0 && imm8 <= 255); - let imm8 = (imm8 & 0xFF) as u8; +#[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b10010101))] //should be vshufi32x4 +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_shuffle_i32x4(a: __m512i, b: __m512i) -> __m512i { + static_assert_imm8!(MASK); let a = a.as_i32x16(); let b = b.as_i32x16(); - macro_rules! shuffle4 { - ( - $a:expr, - $b:expr, - $c:expr, - $d:expr, - $e:expr, - $f:expr, - $g:expr, - $h:expr, - $i:expr, - $j:expr, - $k:expr, - $l:expr, - $m:expr, - $n:expr, - $o:expr, - $p:expr - ) => { - simd_shuffle16( - a, - b, - [ - $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p, - ], - ) - }; - } - macro_rules! shuffle3 { - ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr, $i:expr, $j:expr, $k:expr, $m:expr, $n:expr, $o:expr) => { - match (imm8 >> 6) & 0x3 { - 0 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 16, 17, 18, 19), - 1 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 20, 21, 22, 23), - 2 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 24, 25, 26, 27), - _ => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 28, 29, 30, 31), - } - }; - } - macro_rules! shuffle2 { - ($a:expr, $b:expr, $e:expr, $f:expr, $i:expr, $j:expr, $m:expr, $n:expr) => { - match (imm8 >> 4) & 0x3 { - 0 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 16, 17, 18, 19), - 1 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 20, 21, 22, 23), - 2 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 24, 25, 26, 27), - _ => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 28, 29, 30, 31), - } - }; - } - macro_rules! 
shuffle1 { - ($a:expr, $e:expr, $i: expr, $m: expr) => { - match (imm8 >> 2) & 0x3 { - 0 => shuffle2!($a, $e, $i, $m, 0, 1, 2, 3), - 1 => shuffle2!($a, $e, $i, $m, 4, 5, 6, 7), - 2 => shuffle2!($a, $e, $i, $m, 8, 9, 10, 11), - _ => shuffle2!($a, $e, $i, $m, 12, 13, 14, 15), - } - }; - } - let r: i32x16 = match imm8 & 0x3 { - 0 => shuffle1!(0, 1, 2, 3), - 1 => shuffle1!(4, 5, 6, 7), - 2 => shuffle1!(8, 9, 10, 11), - _ => shuffle1!(12, 13, 14, 15), - }; - + let r: i32x16 = simd_shuffle16( + a, + b, + [ + (MASK as u32 & 0b11) * 4 + 0, + (MASK as u32 & 0b11) * 4 + 1, + (MASK as u32 & 0b11) * 4 + 2, + (MASK as u32 & 0b11) * 4 + 3, + ((MASK as u32 >> 2) & 0b11) * 4 + 0, + ((MASK as u32 >> 2) & 0b11) * 4 + 1, + ((MASK as u32 >> 2) & 0b11) * 4 + 2, + ((MASK as u32 >> 2) & 0b11) * 4 + 3, + ((MASK as u32 >> 4) & 0b11) * 4 + 0 + 16, + ((MASK as u32 >> 4) & 0b11) * 4 + 1 + 16, + ((MASK as u32 >> 4) & 0b11) * 4 + 2 + 16, + ((MASK as u32 >> 4) & 0b11) * 4 + 3 + 16, + ((MASK as u32 >> 6) & 0b11) * 4 + 0 + 16, + ((MASK as u32 >> 6) & 0b11) * 4 + 1 + 16, + ((MASK as u32 >> 6) & 0b11) * 4 + 2 + 16, + ((MASK as u32 >> 6) & 0b11) * 4 + 3 + 16, + ], + ); transmute(r) } @@ -21989,21 +21945,15 @@ pub unsafe fn _mm512_shuffle_i32x4(a: __m512i, b: __m512i, imm8: i32) -> __m512i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_i32x&expand=5175) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vshufi32x4, imm8 = 0b10110101))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm512_mask_shuffle_i32x4( +#[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b10110101))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_shuffle_i32x4( src: __m512i, k: __mmask16, a: __m512i, b: __m512i, - imm8: i32, ) -> __m512i { - macro_rules! 
call { - ($imm8:expr) => { - _mm512_shuffle_i32x4(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = _mm512_shuffle_i32x4::(a, b); transmute(simd_select_bitmask(k, r.as_i32x16(), src.as_i32x16())) } @@ -22012,20 +21962,14 @@ pub unsafe fn _mm512_mask_shuffle_i32x4( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_i32&expand=5176) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vshufi32x4, imm8 = 0b10110101))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_maskz_shuffle_i32x4( +#[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b10110101))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_maskz_shuffle_i32x4( k: __mmask16, a: __m512i, b: __m512i, - imm8: i32, ) -> __m512i { - macro_rules! call { - ($imm8:expr) => { - _mm512_shuffle_i32x4(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = _mm512_shuffle_i32x4::(a, b); let zero = _mm512_setzero_si512().as_i32x16(); transmute(simd_select_bitmask(k, r.as_i32x16(), zero)) } @@ -22035,39 +21979,26 @@ pub unsafe fn _mm512_maskz_shuffle_i32x4( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_i32x4&expand=5174) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vperm, imm8 = 0b01))] //should be vshufi32x4 -#[rustc_args_required_const(2)] -pub unsafe fn _mm256_shuffle_i32x4(a: __m256i, b: __m256i, imm8: i32) -> __m256i { - assert!(imm8 >= 0 && imm8 <= 255); - let imm8 = (imm8 & 0xFF) as u8; +#[cfg_attr(test, assert_instr(vperm, MASK = 0b1001))] //should be vshufi32x4 +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm256_shuffle_i32x4(a: __m256i, b: __m256i) -> __m256i { + static_assert_imm8!(MASK); let a = a.as_i32x8(); let b = b.as_i32x8(); - macro_rules! 
shuffle2 { - ( - $a:expr, - $b:expr, - $c:expr, - $d:expr, - $e:expr, - $f:expr, - $g:expr, - $h:expr - ) => { - simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h]) - }; - } - macro_rules! shuffle1 { - ($a:expr, $b:expr, $c: expr, $d: expr) => { - match (imm8 >> 1) & 0x1 { - 0 => shuffle2!($a, $b, $c, $d, 8, 9, 10, 11), - _ => shuffle2!($a, $b, $c, $d, 12, 13, 14, 15), - } - }; - } - let r: i32x8 = match imm8 & 0x1 { - 0 => shuffle1!(0, 1, 2, 3), - _ => shuffle1!(4, 5, 6, 7), - }; + let r: i32x8 = simd_shuffle8( + a, + b, + [ + (MASK as u32 & 0b1) * 4 + 0, + (MASK as u32 & 0b1) * 4 + 1, + (MASK as u32 & 0b1) * 4 + 2, + (MASK as u32 & 0b1) * 4 + 3, + ((MASK as u32 >> 1) & 0b1) * 4 + 0 + 8, + ((MASK as u32 >> 1) & 0b1) * 4 + 1 + 8, + ((MASK as u32 >> 1) & 0b1) * 4 + 2 + 8, + ((MASK as u32 >> 1) & 0b1) * 4 + 3 + 8, + ], + ); transmute(r) } @@ -22076,21 +22007,16 @@ pub unsafe fn _mm256_shuffle_i32x4(a: __m256i, b: __m256i, imm8: i32) -> __m256i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shuffle_i32x4&expand=5172) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vshufi32x4, imm8 = 0b11))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm256_mask_shuffle_i32x4( +#[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b1101))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm256_mask_shuffle_i32x4( src: __m256i, k: __mmask8, a: __m256i, b: __m256i, - imm8: i32, ) -> __m256i { - macro_rules! 
call { - ($imm8:expr) => { - _mm256_shuffle_i32x4(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + static_assert_imm8!(MASK); + let r = _mm256_shuffle_i32x4::(a, b); transmute(simd_select_bitmask(k, r.as_i32x8(), src.as_i32x8())) } @@ -22099,20 +22025,15 @@ pub unsafe fn _mm256_mask_shuffle_i32x4( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shuffle_i32x4&expand=5173) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vshufi32x4, imm8 = 0b11))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm256_maskz_shuffle_i32x4( +#[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b1101))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm256_maskz_shuffle_i32x4( k: __mmask8, a: __m256i, b: __m256i, - imm8: i32, ) -> __m256i { - macro_rules! call { - ($imm8:expr) => { - _mm256_shuffle_i32x4(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + static_assert_imm8!(MASK); + let r = _mm256_shuffle_i32x4::(a, b); let zero = _mm256_setzero_si256().as_i32x8(); transmute(simd_select_bitmask(k, r.as_i32x8(), zero)) } @@ -47798,7 +47719,7 @@ mod tests { unsafe fn test_mm512_shuffle_i32x4() { let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16); let b = _mm512_setr_epi32(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15); - let r = _mm512_shuffle_i32x4(a, b, 0b0000); + let r = _mm512_shuffle_i32x4::<0b0000>(a, b); let e = _mm512_setr_epi32(1, 4, 5, 8, 1, 4, 5, 8, 2, 3, 6, 7, 2, 3, 6, 7); assert_eq_m512i(r, e); } @@ -47807,9 +47728,9 @@ mod tests { unsafe fn test_mm512_mask_shuffle_i32x4() { let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16); let b = _mm512_setr_epi32(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15); - let r = _mm512_mask_shuffle_i32x4(a, 0, a, b, 0b0000); + let r = _mm512_mask_shuffle_i32x4::<0b0000>(a, 0, a, b); assert_eq_m512i(r, a); - let r = _mm512_mask_shuffle_i32x4(a, 
0b11111111_11111111, a, b, 0b0000); + let r = _mm512_mask_shuffle_i32x4::<0b0000>(a, 0b11111111_11111111, a, b); let e = _mm512_setr_epi32(1, 4, 5, 8, 1, 4, 5, 8, 2, 3, 6, 7, 2, 3, 6, 7); assert_eq_m512i(r, e); } @@ -47818,9 +47739,9 @@ mod tests { unsafe fn test_mm512_maskz_shuffle_i32x4() { let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16); let b = _mm512_setr_epi32(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15); - let r = _mm512_maskz_shuffle_i32x4(0, a, b, 0b0000); + let r = _mm512_maskz_shuffle_i32x4::<0b0000>(0, a, b); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_shuffle_i32x4(0b00000000_11111111, a, b, 0b0000); + let r = _mm512_maskz_shuffle_i32x4::<0b0000>(0b00000000_11111111, a, b); let e = _mm512_setr_epi32(1, 4, 5, 8, 1, 4, 5, 8, 0, 0, 0, 0, 0, 0, 0, 0); assert_eq_m512i(r, e); } @@ -47829,7 +47750,7 @@ mod tests { unsafe fn test_mm256_shuffle_i32x4() { let a = _mm256_set_epi32(1, 4, 5, 8, 9, 12, 13, 16); let b = _mm256_set_epi32(2, 3, 6, 7, 10, 11, 14, 15); - let r = _mm256_shuffle_i32x4(a, b, 0b00); + let r = _mm256_shuffle_i32x4::<0b00>(a, b); let e = _mm256_set_epi32(10, 11, 14, 15, 9, 12, 13, 16); assert_eq_m256i(r, e); } @@ -47838,9 +47759,9 @@ mod tests { unsafe fn test_mm256_mask_shuffle_i32x4() { let a = _mm256_set_epi32(1, 4, 5, 8, 9, 12, 13, 16); let b = _mm256_set_epi32(2, 3, 6, 7, 10, 11, 14, 15); - let r = _mm256_mask_shuffle_i32x4(a, 0, a, b, 0b00); + let r = _mm256_mask_shuffle_i32x4::<0b00>(a, 0, a, b); assert_eq_m256i(r, a); - let r = _mm256_mask_shuffle_i32x4(a, 0b11111111, a, b, 0b00); + let r = _mm256_mask_shuffle_i32x4::<0b00>(a, 0b11111111, a, b); let e = _mm256_set_epi32(10, 11, 14, 15, 9, 12, 13, 16); assert_eq_m256i(r, e); } @@ -47849,9 +47770,9 @@ mod tests { unsafe fn test_mm256_maskz_shuffle_i32x4() { let a = _mm256_set_epi32(1, 4, 5, 8, 9, 12, 13, 16); let b = _mm256_set_epi32(2, 3, 6, 7, 10, 11, 14, 15); - let r = _mm256_maskz_shuffle_i32x4(0, a, b, 0b00); + let r 
= _mm256_maskz_shuffle_i32x4::<0b00>(0, a, b); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_shuffle_i32x4(0b11111111, a, b, 0b00); + let r = _mm256_maskz_shuffle_i32x4::<0b00>(0b11111111, a, b); let e = _mm256_set_epi32(10, 11, 14, 15, 9, 12, 13, 16); assert_eq_m256i(r, e); } From c762fec5f01d09803c0d927856f5e3a1fb2c6133 Mon Sep 17 00:00:00 2001 From: jironglin Date: Wed, 3 Mar 2021 23:44:49 +0000 Subject: [PATCH 036/123] shuffle_f32x4 --- crates/core_arch/src/x86/avx512f.rs | 254 ++++++++++------------------ 1 file changed, 94 insertions(+), 160 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index cdc3f2d003..b5d49b8677 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -21909,7 +21909,7 @@ pub unsafe fn _mm_maskz_shuffle_pd( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_i32&expand=5177) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b10010101))] //should be vshufi32x4 +#[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b10_01_01_01))] //should be vshufi32x4 #[rustc_legacy_const_generics(2)] pub unsafe fn _mm512_shuffle_i32x4(a: __m512i, b: __m512i) -> __m512i { static_assert_imm8!(MASK); @@ -21945,7 +21945,7 @@ pub unsafe fn _mm512_shuffle_i32x4(a: __m512i, b: __m512i) -> _ /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_i32x&expand=5175) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b10110101))] +#[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b10_11_01_01))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm512_mask_shuffle_i32x4( src: __m512i, @@ -21962,7 +21962,7 @@ pub unsafe fn _mm512_mask_shuffle_i32x4( /// [Intel's 
documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_i32&expand=5176) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b10110101))] +#[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b10_11_01_01))] #[rustc_legacy_const_generics(3)] pub unsafe fn _mm512_maskz_shuffle_i32x4( k: __mmask16, @@ -21979,7 +21979,7 @@ pub unsafe fn _mm512_maskz_shuffle_i32x4( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_i32x4&expand=5174) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vperm, MASK = 0b1001))] //should be vshufi32x4 +#[cfg_attr(test, assert_instr(vperm, MASK = 0b11))] //should be vshufi32x4 #[rustc_legacy_const_generics(2)] pub unsafe fn _mm256_shuffle_i32x4(a: __m256i, b: __m256i) -> __m256i { static_assert_imm8!(MASK); @@ -22007,7 +22007,7 @@ pub unsafe fn _mm256_shuffle_i32x4(a: __m256i, b: __m256i) -> _ /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shuffle_i32x4&expand=5172) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b1101))] +#[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b11))] #[rustc_legacy_const_generics(4)] pub unsafe fn _mm256_mask_shuffle_i32x4( src: __m256i, @@ -22025,7 +22025,7 @@ pub unsafe fn _mm256_mask_shuffle_i32x4( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shuffle_i32x4&expand=5173) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b1101))] +#[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b11))] #[rustc_legacy_const_generics(3)] pub unsafe fn _mm256_maskz_shuffle_i32x4( k: __mmask8, @@ -22234,75 +22234,35 @@ pub unsafe fn _mm256_maskz_shuffle_i64x2( /// [Intel's 
documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_f32x4&expand=5165) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vshuff64x2, imm8 = 0b10111111))] //should be vshuff32x4, but generate vshuff64x2 -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_shuffle_f32x4(a: __m512, b: __m512, imm8: i32) -> __m512 { - assert!(imm8 >= 0 && imm8 <= 255); - let imm8 = (imm8 & 0xFF) as u8; - macro_rules! shuffle4 { - ( - $a:expr, - $b:expr, - $c:expr, - $d:expr, - $e:expr, - $f:expr, - $g:expr, - $h:expr, - $i:expr, - $j:expr, - $k:expr, - $l:expr, - $m:expr, - $n:expr, - $o:expr, - $p:expr - ) => { - simd_shuffle16( - a, - b, - [ - $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p, - ], - ) - }; - } - macro_rules! shuffle3 { - ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr, $i:expr, $j:expr, $k:expr, $m:expr, $n:expr, $o:expr) => { - match (imm8 >> 6) & 0x3 { - 0 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 16, 17, 18, 19), - 1 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 20, 21, 22, 23), - 2 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 24, 25, 26, 27), - _ => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 28, 29, 30, 31), - } - }; - } - macro_rules! shuffle2 { - ($a:expr, $b:expr, $e:expr, $f:expr, $i:expr, $j:expr, $m:expr, $n:expr) => { - match (imm8 >> 4) & 0x3 { - 0 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 16, 17, 18, 19), - 1 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 20, 21, 22, 23), - 2 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 24, 25, 26, 27), - _ => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 28, 29, 30, 31), - } - }; - } - macro_rules! 
shuffle1 { - ($a:expr, $e:expr, $i: expr, $m: expr) => { - match (imm8 >> 2) & 0x3 { - 0 => shuffle2!($a, $e, $i, $m, 0, 1, 2, 3), - 1 => shuffle2!($a, $e, $i, $m, 4, 5, 6, 7), - 2 => shuffle2!($a, $e, $i, $m, 8, 9, 10, 11), - _ => shuffle2!($a, $e, $i, $m, 12, 13, 14, 15), - } - }; - } - match imm8 & 0x3 { - 0 => shuffle1!(0, 1, 2, 3), - 1 => shuffle1!(4, 5, 6, 7), - 2 => shuffle1!(8, 9, 10, 11), - _ => shuffle1!(12, 13, 14, 15), - } +#[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b1011))] //should be vshuff32x4, but generate vshuff64x2 +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_shuffle_f32x4(a: __m512, b: __m512) -> __m512 { + static_assert_imm8!(MASK); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r: f32x16 = simd_shuffle16( + a, + b, + [ + (MASK as u32 & 0b11) * 4 + 0, + (MASK as u32 & 0b11) * 4 + 1, + (MASK as u32 & 0b11) * 4 + 2, + (MASK as u32 & 0b11) * 4 + 3, + ((MASK as u32 >> 2) & 0b11) * 4 + 0, + ((MASK as u32 >> 2) & 0b11) * 4 + 1, + ((MASK as u32 >> 2) & 0b11) * 4 + 2, + ((MASK as u32 >> 2) & 0b11) * 4 + 3, + ((MASK as u32 >> 4) & 0b11) * 4 + 0 + 16, + ((MASK as u32 >> 4) & 0b11) * 4 + 1 + 16, + ((MASK as u32 >> 4) & 0b11) * 4 + 2 + 16, + ((MASK as u32 >> 4) & 0b11) * 4 + 3 + 16, + ((MASK as u32 >> 6) & 0b11) * 4 + 0 + 16, + ((MASK as u32 >> 6) & 0b11) * 4 + 1 + 16, + ((MASK as u32 >> 6) & 0b11) * 4 + 2 + 16, + ((MASK as u32 >> 6) & 0b11) * 4 + 3 + 16, + ], + ); + transmute(r) } /// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -22310,21 +22270,16 @@ pub unsafe fn _mm512_shuffle_f32x4(a: __m512, b: __m512, imm8: i32) -> __m512 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_f32&expand=5163) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vshuff32x4, imm8 = 0b10111111))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm512_mask_shuffle_f32x4( +#[cfg_attr(test, assert_instr(vshuff32x4, MASK = 0b1011))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_shuffle_f32x4( src: __m512, k: __mmask16, a: __m512, b: __m512, - imm8: i32, ) -> __m512 { - macro_rules! call { - ($imm8:expr) => { - _mm512_shuffle_f32x4(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + static_assert_imm8!(MASK); + let r = _mm512_shuffle_f32x4::(a, b); transmute(simd_select_bitmask(k, r.as_f32x16(), src.as_f32x16())) } @@ -22333,15 +22288,15 @@ pub unsafe fn _mm512_mask_shuffle_f32x4( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_f32&expand=5164) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vshuff32x4, imm8 = 0b10111111))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_maskz_shuffle_f32x4(k: __mmask16, a: __m512, b: __m512, imm8: i32) -> __m512 { - macro_rules! 
call { - ($imm8:expr) => { - _mm512_shuffle_f32x4(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); +#[cfg_attr(test, assert_instr(vshuff32x4, MASK = 0b1011))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_maskz_shuffle_f32x4( + k: __mmask16, + a: __m512, + b: __m512, +) -> __m512 { + static_assert_imm8!(MASK); + let r = _mm512_shuffle_f32x4::(a, b); let zero = _mm512_setzero_ps().as_f32x16(); transmute(simd_select_bitmask(k, r.as_f32x16(), zero)) } @@ -22351,40 +22306,26 @@ pub unsafe fn _mm512_maskz_shuffle_f32x4(k: __mmask16, a: __m512, b: __m512, imm /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_f32x4&expand=5162) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vperm, imm8 = 0b01))] //should be vshuff32x4 -#[rustc_args_required_const(2)] -pub unsafe fn _mm256_shuffle_f32x4(a: __m256, b: __m256, imm8: i32) -> __m256 { - assert!(imm8 >= 0 && imm8 <= 255); - let imm8 = (imm8 & 0xFF) as u8; +#[cfg_attr(test, assert_instr(vperm, MASK = 0b01))] //should be vshuff32x4 +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm256_shuffle_f32x4(a: __m256, b: __m256) -> __m256 { + static_assert_imm8!(MASK); let a = a.as_f32x8(); let b = b.as_f32x8(); - macro_rules! shuffle2 { - ( - $a:expr, - $b:expr, - $c:expr, - $d:expr, - $e:expr, - $f:expr, - $g:expr, - $h:expr - ) => { - simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h]) - }; - } - macro_rules! 
shuffle1 { - ($a:expr, $b:expr, $c: expr, $d: expr) => { - match (imm8 >> 1) & 0x1 { - 0 => shuffle2!($a, $b, $c, $d, 8, 9, 10, 11), - _ => shuffle2!($a, $b, $c, $d, 12, 13, 14, 15), - } - }; - } - let r: f32x8 = match imm8 & 0x1 { - 0 => shuffle1!(0, 1, 2, 3), - _ => shuffle1!(4, 5, 6, 7), - }; - + let r: f32x8 = simd_shuffle8( + a, + b, + [ + (MASK as u32 & 0b1) * 4 + 0, + (MASK as u32 & 0b1) * 4 + 1, + (MASK as u32 & 0b1) * 4 + 2, + (MASK as u32 & 0b1) * 4 + 3, + ((MASK as u32 >> 1) & 0b1) * 4 + 0 + 8, + ((MASK as u32 >> 1) & 0b1) * 4 + 1 + 8, + ((MASK as u32 >> 1) & 0b1) * 4 + 2 + 8, + ((MASK as u32 >> 1) & 0b1) * 4 + 3 + 8, + ], + ); transmute(r) } @@ -22393,21 +22334,15 @@ pub unsafe fn _mm256_shuffle_f32x4(a: __m256, b: __m256, imm8: i32) -> __m256 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shuffle_f32x4&expand=5160) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vshuff32x4, imm8 = 0b11))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm256_mask_shuffle_f32x4( +#[cfg_attr(test, assert_instr(vshuff32x4, MASK = 0b11))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm256_mask_shuffle_f32x4( src: __m256, k: __mmask8, a: __m256, b: __m256, - imm8: i32, ) -> __m256 { - macro_rules! 
call { - ($imm8:expr) => { - _mm256_shuffle_f32x4(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = _mm256_shuffle_f32x4::(a, b); transmute(simd_select_bitmask(k, r.as_f32x8(), src.as_f32x8())) } @@ -22416,15 +22351,14 @@ pub unsafe fn _mm256_mask_shuffle_f32x4( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shuffle_f32x4&expand=5161) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vshuff32x4, imm8 = 0b11))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm256_maskz_shuffle_f32x4(k: __mmask8, a: __m256, b: __m256, imm8: i32) -> __m256 { - macro_rules! call { - ($imm8:expr) => { - _mm256_shuffle_f32x4(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); +#[cfg_attr(test, assert_instr(vshuff32x4, MASK = 0b11))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm256_maskz_shuffle_f32x4( + k: __mmask8, + a: __m256, + b: __m256, +) -> __m256 { + let r = _mm256_shuffle_f32x4::(a, b); let zero = _mm256_setzero_ps().as_f32x8(); transmute(simd_select_bitmask(k, r.as_f32x8(), zero)) } @@ -47719,7 +47653,7 @@ mod tests { unsafe fn test_mm512_shuffle_i32x4() { let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16); let b = _mm512_setr_epi32(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15); - let r = _mm512_shuffle_i32x4::<0b0000>(a, b); + let r = _mm512_shuffle_i32x4::<0b00_00_00_00>(a, b); let e = _mm512_setr_epi32(1, 4, 5, 8, 1, 4, 5, 8, 2, 3, 6, 7, 2, 3, 6, 7); assert_eq_m512i(r, e); } @@ -47728,9 +47662,9 @@ mod tests { unsafe fn test_mm512_mask_shuffle_i32x4() { let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16); let b = _mm512_setr_epi32(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15); - let r = _mm512_mask_shuffle_i32x4::<0b0000>(a, 0, a, b); + let r = _mm512_mask_shuffle_i32x4::<0b00_00_00_00>(a, 0, a, b); assert_eq_m512i(r, a); - let r = 
_mm512_mask_shuffle_i32x4::<0b0000>(a, 0b11111111_11111111, a, b); + let r = _mm512_mask_shuffle_i32x4::<0b00_00_00_00>(a, 0b11111111_11111111, a, b); let e = _mm512_setr_epi32(1, 4, 5, 8, 1, 4, 5, 8, 2, 3, 6, 7, 2, 3, 6, 7); assert_eq_m512i(r, e); } @@ -47739,9 +47673,9 @@ mod tests { unsafe fn test_mm512_maskz_shuffle_i32x4() { let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16); let b = _mm512_setr_epi32(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15); - let r = _mm512_maskz_shuffle_i32x4::<0b0000>(0, a, b); + let r = _mm512_maskz_shuffle_i32x4::<0b00_00_00_00>(0, a, b); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_shuffle_i32x4::<0b0000>(0b00000000_11111111, a, b); + let r = _mm512_maskz_shuffle_i32x4::<0b00_00_00_00>(0b00000000_11111111, a, b); let e = _mm512_setr_epi32(1, 4, 5, 8, 1, 4, 5, 8, 0, 0, 0, 0, 0, 0, 0, 0); assert_eq_m512i(r, e); } @@ -47785,7 +47719,7 @@ mod tests { let b = _mm512_setr_ps( 2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15., ); - let r = _mm512_shuffle_f32x4(a, b, 0b00000000); + let r = _mm512_shuffle_f32x4::<0b00_00_00_00>(a, b); let e = _mm512_setr_ps( 1., 4., 5., 8., 1., 4., 5., 8., 2., 3., 6., 7., 2., 3., 6., 7., ); @@ -47800,9 +47734,9 @@ mod tests { let b = _mm512_setr_ps( 2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15., ); - let r = _mm512_mask_shuffle_f32x4(a, 0, a, b, 0b00000000); + let r = _mm512_mask_shuffle_f32x4::<0b00_00_00_00>(a, 0, a, b); assert_eq_m512(r, a); - let r = _mm512_mask_shuffle_f32x4(a, 0b11111111_11111111, a, b, 0b00000000); + let r = _mm512_mask_shuffle_f32x4::<0b00_00_00_00>(a, 0b11111111_11111111, a, b); let e = _mm512_setr_ps( 1., 4., 5., 8., 1., 4., 5., 8., 2., 3., 6., 7., 2., 3., 6., 7., ); @@ -47817,9 +47751,9 @@ mod tests { let b = _mm512_setr_ps( 2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15., ); - let r = _mm512_maskz_shuffle_f32x4(0, a, b, 0b00000000); + let r = 
_mm512_maskz_shuffle_f32x4::<0b00_00_00_00>(0, a, b); assert_eq_m512(r, _mm512_setzero_ps()); - let r = _mm512_maskz_shuffle_f32x4(0b00000000_11111111, a, b, 0b00000000); + let r = _mm512_maskz_shuffle_f32x4::<0b00_00_00_00>(0b00000000_11111111, a, b); let e = _mm512_setr_ps( 1., 4., 5., 8., 1., 4., 5., 8., 0., 0., 0., 0., 0., 0., 0., 0., ); @@ -47830,7 +47764,7 @@ mod tests { unsafe fn test_mm256_shuffle_f32x4() { let a = _mm256_set_ps(1., 4., 5., 8., 9., 12., 13., 16.); let b = _mm256_set_ps(2., 3., 6., 7., 10., 11., 14., 15.); - let r = _mm256_shuffle_f32x4(a, b, 0b00); + let r = _mm256_shuffle_f32x4::<0b00>(a, b); let e = _mm256_set_ps(10., 11., 14., 15., 9., 12., 13., 16.); assert_eq_m256(r, e); } @@ -47839,9 +47773,9 @@ mod tests { unsafe fn test_mm256_mask_shuffle_f32x4() { let a = _mm256_set_ps(1., 4., 5., 8., 9., 12., 13., 16.); let b = _mm256_set_ps(2., 3., 6., 7., 10., 11., 14., 15.); - let r = _mm256_mask_shuffle_f32x4(a, 0, a, b, 0b00); + let r = _mm256_mask_shuffle_f32x4::<0b00>(a, 0, a, b); assert_eq_m256(r, a); - let r = _mm256_mask_shuffle_f32x4(a, 0b11111111, a, b, 0b00); + let r = _mm256_mask_shuffle_f32x4::<0b00>(a, 0b11111111, a, b); let e = _mm256_set_ps(10., 11., 14., 15., 9., 12., 13., 16.); assert_eq_m256(r, e); } @@ -47850,9 +47784,9 @@ mod tests { unsafe fn test_mm256_maskz_shuffle_f32x4() { let a = _mm256_set_ps(1., 4., 5., 8., 9., 12., 13., 16.); let b = _mm256_set_ps(2., 3., 6., 7., 10., 11., 14., 15.); - let r = _mm256_maskz_shuffle_f32x4(0, a, b, 0b00); + let r = _mm256_maskz_shuffle_f32x4::<0b00>(0, a, b); assert_eq_m256(r, _mm256_setzero_ps()); - let r = _mm256_maskz_shuffle_f32x4(0b11111111, a, b, 0b00); + let r = _mm256_maskz_shuffle_f32x4::<0b00>(0b11111111, a, b); let e = _mm256_set_ps(10., 11., 14., 15., 9., 12., 13., 16.); assert_eq_m256(r, e); } From e4715647647f1c02ecf77557e62fb6195bd2df7c Mon Sep 17 00:00:00 2001 From: jironglin Date: Thu, 4 Mar 2021 00:18:12 +0000 Subject: [PATCH 037/123] shuffle_i64x2 --- 
crates/core_arch/src/x86/avx512f.rs | 173 ++++++++----------------- crates/core_arch/src/x86_64/avx512f.rs | 20 +-- 2 files changed, 61 insertions(+), 132 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index b5d49b8677..c00dbaea21 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -22043,61 +22043,27 @@ pub unsafe fn _mm256_maskz_shuffle_i32x4( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_i64x2&expand=5183) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vshufi64x2, imm8 = 0b10111111))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_shuffle_i64x2(a: __m512i, b: __m512i, imm8: i32) -> __m512i { - assert!(imm8 >= 0 && imm8 <= 255); - let imm8 = (imm8 & 0xFF) as u8; - macro_rules! shuffle4 { - ( - $a:expr, - $b:expr, - $c:expr, - $d:expr, - $e:expr, - $f:expr, - $g:expr, - $h:expr - ) => { - simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h]) - }; - } - macro_rules! shuffle3 { - ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr) => { - match (imm8 >> 6) & 0x3 { - 0 => shuffle4!($a, $b, $c, $e, $f, $g, 8, 9), - 1 => shuffle4!($a, $b, $c, $e, $f, $g, 10, 11), - 2 => shuffle4!($a, $b, $c, $e, $f, $g, 12, 13), - _ => shuffle4!($a, $b, $c, $e, $f, $g, 14, 15), - } - }; - } - macro_rules! shuffle2 { - ($a:expr, $b:expr, $e:expr, $f:expr) => { - match (imm8 >> 4) & 0x3 { - 0 => shuffle3!($a, $b, $e, $f, 8, 9), - 1 => shuffle3!($a, $b, $e, $f, 10, 11), - 2 => shuffle3!($a, $b, $e, $f, 12, 13), - _ => shuffle3!($a, $b, $e, $f, 14, 15), - } - }; - } - macro_rules! 
shuffle1 { - ($a:expr, $e:expr) => { - match (imm8 >> 2) & 0x3 { - 0 => shuffle2!($a, $e, 0, 1), - 1 => shuffle2!($a, $e, 2, 3), - 2 => shuffle2!($a, $e, 4, 5), - _ => shuffle2!($a, $e, 6, 7), - } - }; - } - match imm8 & 0x3 { - 0 => shuffle1!(0, 1), - 1 => shuffle1!(2, 3), - 2 => shuffle1!(4, 5), - _ => shuffle1!(6, 7), - } +#[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b10_11_11_11))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_shuffle_i64x2(a: __m512i, b: __m512i) -> __m512i { + static_assert_imm8!(MASK); + let a = a.as_i64x8(); + let b = b.as_i64x8(); + let r: i64x8 = simd_shuffle8( + a, + b, + [ + (MASK as u32 & 0b11) * 2 + 0, + (MASK as u32 & 0b11) * 2 + 1, + ((MASK as u32 >> 2) & 0b11) * 2 + 0, + ((MASK as u32 >> 2) & 0b11) * 2 + 1, + ((MASK as u32 >> 4) & 0b11) * 2 + 0 + 8, + ((MASK as u32 >> 4) & 0b11) * 2 + 1 + 8, + ((MASK as u32 >> 6) & 0b11) * 2 + 0 + 8, + ((MASK as u32 >> 6) & 0b11) * 2 + 1 + 8, + ], + ); + transmute(r) } /// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -22105,21 +22071,15 @@ pub unsafe fn _mm512_shuffle_i64x2(a: __m512i, b: __m512i, imm8: i32) -> __m512i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_i64x&expand=5181) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vshufi64x2, imm8 = 0b10111111))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm512_mask_shuffle_i64x2( +#[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b10_11_11_11))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_shuffle_i64x2( src: __m512i, k: __mmask8, a: __m512i, b: __m512i, - imm8: i32, ) -> __m512i { - macro_rules! 
call { - ($imm8:expr) => { - _mm512_shuffle_i64x2(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = _mm512_shuffle_i64x2::(a, b); transmute(simd_select_bitmask(k, r.as_i64x8(), src.as_i64x8())) } @@ -22128,20 +22088,14 @@ pub unsafe fn _mm512_mask_shuffle_i64x2( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_i64&expand=5182) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vshufi64x2, imm8 = 0b10111111))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_maskz_shuffle_i64x2( +#[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b10_11_11_11))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_maskz_shuffle_i64x2( k: __mmask8, a: __m512i, b: __m512i, - imm8: i32, ) -> __m512i { - macro_rules! call { - ($imm8:expr) => { - _mm512_shuffle_i64x2(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = _mm512_shuffle_i64x2::(a, b); let zero = _mm512_setzero_si512().as_i64x8(); transmute(simd_select_bitmask(k, r.as_i64x8(), zero)) } @@ -22151,35 +22105,22 @@ pub unsafe fn _mm512_maskz_shuffle_i64x2( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_i64x2&expand=5180) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vperm, imm8 = 0b01))] //should be vshufi64x2 -#[rustc_args_required_const(2)] -pub unsafe fn _mm256_shuffle_i64x2(a: __m256i, b: __m256i, imm8: i32) -> __m256i { - assert!(imm8 >= 0 && imm8 <= 255); - let imm8 = (imm8 & 0xFF) as u8; +#[cfg_attr(test, assert_instr(vperm, MASK = 0b01))] //should be vshufi64x2 +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm256_shuffle_i64x2(a: __m256i, b: __m256i) -> __m256i { + static_assert_imm8!(MASK); let a = a.as_i64x4(); let b = b.as_i64x4(); - macro_rules! 
shuffle2 { - ( - $a:expr, - $b:expr, - $c:expr, - $d:expr - ) => { - simd_shuffle4(a, b, [$a, $b, $c, $d]) - }; - } - macro_rules! shuffle1 { - ($a:expr, $b:expr) => { - match (imm8 >> 1) & 0x1 { - 0 => shuffle2!($a, $b, 4, 5), - _ => shuffle2!($a, $b, 6, 7), - } - }; - } - let r: i64x4 = match imm8 & 0x1 { - 0 => shuffle1!(0, 1), - _ => shuffle1!(2, 3), - }; + let r: i64x4 = simd_shuffle4( + a, + b, + [ + (MASK as u32 & 0b1) * 2 + 0, + (MASK as u32 & 0b1) * 2 + 1, + ((MASK as u32 >> 1) & 0b1) * 2 + 0 + 4, + ((MASK as u32 >> 1) & 0b1) * 2 + 1 + 4, + ], + ); transmute(r) } @@ -22188,21 +22129,15 @@ pub unsafe fn _mm256_shuffle_i64x2(a: __m256i, b: __m256i, imm8: i32) -> __m256i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shuffle_i64x2&expand=5178) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vshufi64x2, imm8 = 0b11))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm256_mask_shuffle_i64x2( +#[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b11))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm256_mask_shuffle_i64x2( src: __m256i, k: __mmask8, a: __m256i, b: __m256i, - imm8: i32, ) -> __m256i { - macro_rules! 
call { - ($imm8:expr) => { - _mm256_shuffle_i64x2(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = _mm256_shuffle_i64x2::(a, b); transmute(simd_select_bitmask(k, r.as_i64x4(), src.as_i64x4())) } @@ -22211,20 +22146,14 @@ pub unsafe fn _mm256_mask_shuffle_i64x2( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shuffle_i64x2&expand=5179) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vshufi64x2, imm8 = 0b11))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm256_maskz_shuffle_i64x2( +#[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b11))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm256_maskz_shuffle_i64x2( k: __mmask8, a: __m256i, b: __m256i, - imm8: i32, ) -> __m256i { - macro_rules! call { - ($imm8:expr) => { - _mm256_shuffle_i64x2(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = _mm256_shuffle_i64x2::(a, b); let zero = _mm256_setzero_si256().as_i64x4(); transmute(simd_select_bitmask(k, r.as_i64x4(), zero)) } diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 2db8a430d4..6d816b86c3 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -9658,7 +9658,7 @@ mod tests { unsafe fn test_mm512_shuffle_i64x2() { let a = _mm512_setr_epi64(1, 4, 5, 8, 9, 12, 13, 16); let b = _mm512_setr_epi64(2, 3, 6, 7, 10, 11, 14, 15); - let r = _mm512_shuffle_i64x2(a, b, 0b00000000); + let r = _mm512_shuffle_i64x2::<0b00_00_00_00>(a, b); let e = _mm512_setr_epi64(1, 4, 1, 4, 2, 3, 2, 3); assert_eq_m512i(r, e); } @@ -9667,9 +9667,9 @@ mod tests { unsafe fn test_mm512_mask_shuffle_i64x2() { let a = _mm512_setr_epi64(1, 4, 5, 8, 9, 12, 13, 16); let b = _mm512_setr_epi64(2, 3, 6, 7, 10, 11, 14, 15); - let r = _mm512_mask_shuffle_i64x2(a, 0, a, b, 0b00000000); + let r = _mm512_mask_shuffle_i64x2::<0b00_00_00_00>(a, 0, a, b); assert_eq_m512i(r, 
a); - let r = _mm512_mask_shuffle_i64x2(a, 0b11111111, a, b, 0b00000000); + let r = _mm512_mask_shuffle_i64x2::<0b00_00_00_00>(a, 0b11111111, a, b); let e = _mm512_setr_epi64(1, 4, 1, 4, 2, 3, 2, 3); assert_eq_m512i(r, e); } @@ -9678,9 +9678,9 @@ mod tests { unsafe fn test_mm512_maskz_shuffle_i64x2() { let a = _mm512_setr_epi64(1, 4, 5, 8, 9, 12, 13, 16); let b = _mm512_setr_epi64(2, 3, 6, 7, 10, 11, 14, 15); - let r = _mm512_maskz_shuffle_i64x2(0, a, b, 0b00000000); + let r = _mm512_maskz_shuffle_i64x2::<0b00_00_00_00>(0, a, b); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_shuffle_i64x2(0b00001111, a, b, 0b00000000); + let r = _mm512_maskz_shuffle_i64x2::<0b00_00_00_00>(0b00001111, a, b); let e = _mm512_setr_epi64(1, 4, 1, 4, 0, 0, 0, 0); assert_eq_m512i(r, e); } @@ -9689,7 +9689,7 @@ mod tests { unsafe fn test_mm256_shuffle_i64x2() { let a = _mm256_set_epi64x(1, 4, 5, 8); let b = _mm256_set_epi64x(2, 3, 6, 7); - let r = _mm256_shuffle_i64x2(a, b, 0b00); + let r = _mm256_shuffle_i64x2::<0b00>(a, b); let e = _mm256_set_epi64x(6, 7, 5, 8); assert_eq_m256i(r, e); } @@ -9698,9 +9698,9 @@ mod tests { unsafe fn test_mm256_mask_shuffle_i64x2() { let a = _mm256_set_epi64x(1, 4, 5, 8); let b = _mm256_set_epi64x(2, 3, 6, 7); - let r = _mm256_mask_shuffle_i64x2(a, 0, a, b, 0b00); + let r = _mm256_mask_shuffle_i64x2::<0b00>(a, 0, a, b); assert_eq_m256i(r, a); - let r = _mm256_mask_shuffle_i64x2(a, 0b00001111, a, b, 0b00); + let r = _mm256_mask_shuffle_i64x2::<0b00>(a, 0b00001111, a, b); let e = _mm256_set_epi64x(6, 7, 5, 8); assert_eq_m256i(r, e); } @@ -9709,9 +9709,9 @@ mod tests { unsafe fn test_mm256_maskz_shuffle_i64x2() { let a = _mm256_set_epi64x(1, 4, 5, 8); let b = _mm256_set_epi64x(2, 3, 6, 7); - let r = _mm256_maskz_shuffle_i64x2(0, a, b, 0b00); + let r = _mm256_maskz_shuffle_i64x2::<0b00>(0, a, b); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_shuffle_i64x2(0b00001111, a, b, 0b00); + let r = 
_mm256_maskz_shuffle_i64x2::<0b00>(0b00001111, a, b); let e = _mm256_set_epi64x(6, 7, 5, 8); assert_eq_m256i(r, e); } From b93d2252a3d3106980105aa5d30615a754804d6f Mon Sep 17 00:00:00 2001 From: jironglin Date: Thu, 4 Mar 2021 00:41:22 +0000 Subject: [PATCH 038/123] shuffle_f64x2 --- crates/core_arch/src/x86/avx512f.rs | 197 ++++++++++--------------- crates/core_arch/src/x86_64/avx512f.rs | 20 +-- 2 files changed, 85 insertions(+), 132 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index c00dbaea21..5abe23e093 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -21692,6 +21692,7 @@ pub unsafe fn _mm512_mask_shuffle_ps( a: __m512, b: __m512, ) -> __m512 { + static_assert_imm8!(MASK); let r = _mm512_shuffle_ps::(a, b); transmute(simd_select_bitmask(k, r.as_f32x16(), src.as_f32x16())) } @@ -21708,6 +21709,7 @@ pub unsafe fn _mm512_maskz_shuffle_ps( a: __m512, b: __m512, ) -> __m512 { + static_assert_imm8!(MASK); let r = _mm512_shuffle_ps::(a, b); let zero = _mm512_setzero_ps().as_f32x16(); transmute(simd_select_bitmask(k, r.as_f32x16(), zero)) @@ -21726,6 +21728,7 @@ pub unsafe fn _mm256_mask_shuffle_ps( a: __m256, b: __m256, ) -> __m256 { + static_assert_imm8!(MASK); let r = _mm256_shuffle_ps::(a, b); transmute(simd_select_bitmask(k, r.as_f32x8(), src.as_f32x8())) } @@ -21742,6 +21745,7 @@ pub unsafe fn _mm256_maskz_shuffle_ps( a: __m256, b: __m256, ) -> __m256 { + static_assert_imm8!(MASK); let r = _mm256_shuffle_ps::(a, b); let zero = _mm256_setzero_ps().as_f32x8(); transmute(simd_select_bitmask(k, r.as_f32x8(), zero)) @@ -21760,6 +21764,7 @@ pub unsafe fn _mm_mask_shuffle_ps( a: __m128, b: __m128, ) -> __m128 { + static_assert_imm8!(MASK); let r = _mm_shuffle_ps::(a, b); transmute(simd_select_bitmask(k, r.as_f32x4(), src.as_f32x4())) } @@ -21772,6 +21777,7 @@ pub unsafe fn _mm_mask_shuffle_ps( #[cfg_attr(test, assert_instr(vshufps, MASK = 3))] 
#[rustc_legacy_const_generics(3)] pub unsafe fn _mm_maskz_shuffle_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + static_assert_imm8!(MASK); let r = _mm_shuffle_ps::(a, b); let zero = _mm_setzero_ps().as_f32x4(); transmute(simd_select_bitmask(k, r.as_f32x4(), zero)) @@ -21815,6 +21821,7 @@ pub unsafe fn _mm512_mask_shuffle_pd( a: __m512d, b: __m512d, ) -> __m512d { + static_assert_imm8!(MASK); let r = _mm512_shuffle_pd::(a, b); transmute(simd_select_bitmask(k, r.as_f64x8(), src.as_f64x8())) } @@ -21831,6 +21838,7 @@ pub unsafe fn _mm512_maskz_shuffle_pd( a: __m512d, b: __m512d, ) -> __m512d { + static_assert_imm8!(MASK); let r = _mm512_shuffle_pd::(a, b); let zero = _mm512_setzero_pd().as_f64x8(); transmute(simd_select_bitmask(k, r.as_f64x8(), zero)) @@ -21849,6 +21857,7 @@ pub unsafe fn _mm256_mask_shuffle_pd( a: __m256d, b: __m256d, ) -> __m256d { + static_assert_imm8!(MASK); let r = _mm256_shuffle_pd::(a, b); transmute(simd_select_bitmask(k, r.as_f64x4(), src.as_f64x4())) } @@ -21865,6 +21874,7 @@ pub unsafe fn _mm256_maskz_shuffle_pd( a: __m256d, b: __m256d, ) -> __m256d { + static_assert_imm8!(MASK); let r = _mm256_shuffle_pd::(a, b); let zero = _mm256_setzero_pd().as_f64x4(); transmute(simd_select_bitmask(k, r.as_f64x4(), zero)) @@ -21883,6 +21893,7 @@ pub unsafe fn _mm_mask_shuffle_pd( a: __m128d, b: __m128d, ) -> __m128d { + static_assert_imm8!(MASK); let r = _mm_shuffle_pd::(a, b); transmute(simd_select_bitmask(k, r.as_f64x2(), src.as_f64x2())) } @@ -21899,6 +21910,7 @@ pub unsafe fn _mm_maskz_shuffle_pd( a: __m128d, b: __m128d, ) -> __m128d { + static_assert_imm8!(MASK); let r = _mm_shuffle_pd::(a, b); let zero = _mm_setzero_pd().as_f64x2(); transmute(simd_select_bitmask(k, r.as_f64x2(), zero)) @@ -21953,6 +21965,7 @@ pub unsafe fn _mm512_mask_shuffle_i32x4( a: __m512i, b: __m512i, ) -> __m512i { + static_assert_imm8!(MASK); let r = _mm512_shuffle_i32x4::(a, b); transmute(simd_select_bitmask(k, r.as_i32x16(), src.as_i32x16())) } @@ -21969,6 +21982,7 
@@ pub unsafe fn _mm512_maskz_shuffle_i32x4( a: __m512i, b: __m512i, ) -> __m512i { + static_assert_imm8!(MASK); let r = _mm512_shuffle_i32x4::(a, b); let zero = _mm512_setzero_si512().as_i32x16(); transmute(simd_select_bitmask(k, r.as_i32x16(), zero)) @@ -22079,6 +22093,7 @@ pub unsafe fn _mm512_mask_shuffle_i64x2( a: __m512i, b: __m512i, ) -> __m512i { + static_assert_imm8!(MASK); let r = _mm512_shuffle_i64x2::(a, b); transmute(simd_select_bitmask(k, r.as_i64x8(), src.as_i64x8())) } @@ -22095,6 +22110,7 @@ pub unsafe fn _mm512_maskz_shuffle_i64x2( a: __m512i, b: __m512i, ) -> __m512i { + static_assert_imm8!(MASK); let r = _mm512_shuffle_i64x2::(a, b); let zero = _mm512_setzero_si512().as_i64x8(); transmute(simd_select_bitmask(k, r.as_i64x8(), zero)) @@ -22137,6 +22153,7 @@ pub unsafe fn _mm256_mask_shuffle_i64x2( a: __m256i, b: __m256i, ) -> __m256i { + static_assert_imm8!(MASK); let r = _mm256_shuffle_i64x2::(a, b); transmute(simd_select_bitmask(k, r.as_i64x4(), src.as_i64x4())) } @@ -22153,6 +22170,7 @@ pub unsafe fn _mm256_maskz_shuffle_i64x2( a: __m256i, b: __m256i, ) -> __m256i { + static_assert_imm8!(MASK); let r = _mm256_shuffle_i64x2::(a, b); let zero = _mm256_setzero_si256().as_i64x4(); transmute(simd_select_bitmask(k, r.as_i64x4(), zero)) @@ -22271,6 +22289,7 @@ pub unsafe fn _mm256_mask_shuffle_f32x4( a: __m256, b: __m256, ) -> __m256 { + static_assert_imm8!(MASK); let r = _mm256_shuffle_f32x4::(a, b); transmute(simd_select_bitmask(k, r.as_f32x8(), src.as_f32x8())) } @@ -22287,6 +22306,7 @@ pub unsafe fn _mm256_maskz_shuffle_f32x4( a: __m256, b: __m256, ) -> __m256 { + static_assert_imm8!(MASK); let r = _mm256_shuffle_f32x4::(a, b); let zero = _mm256_setzero_ps().as_f32x8(); transmute(simd_select_bitmask(k, r.as_f32x8(), zero)) @@ -22297,61 +22317,27 @@ pub unsafe fn _mm256_maskz_shuffle_f32x4( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_f64x2&expand=5171) #[inline] 
#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vshuff64x2, imm8 = 0b10111111))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_shuffle_f64x2(a: __m512d, b: __m512d, imm8: i32) -> __m512d { - assert!(imm8 >= 0 && imm8 <= 255); - let imm8 = (imm8 & 0xFF) as u8; - macro_rules! shuffle4 { - ( - $a:expr, - $b:expr, - $c:expr, - $d:expr, - $e:expr, - $f:expr, - $g:expr, - $h:expr - ) => { - simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h]) - }; - } - macro_rules! shuffle3 { - ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr) => { - match (imm8 >> 6) & 0x3 { - 0 => shuffle4!($a, $b, $c, $e, $f, $g, 8, 9), - 1 => shuffle4!($a, $b, $c, $e, $f, $g, 10, 11), - 2 => shuffle4!($a, $b, $c, $e, $f, $g, 12, 13), - _ => shuffle4!($a, $b, $c, $e, $f, $g, 14, 15), - } - }; - } - macro_rules! shuffle2 { - ($a:expr, $b:expr, $e:expr, $f:expr) => { - match (imm8 >> 4) & 0x3 { - 0 => shuffle3!($a, $b, $e, $f, 8, 9), - 1 => shuffle3!($a, $b, $e, $f, 10, 11), - 2 => shuffle3!($a, $b, $e, $f, 12, 13), - _ => shuffle3!($a, $b, $e, $f, 14, 15), - } - }; - } - macro_rules! 
shuffle1 { - ($a:expr, $e:expr) => { - match (imm8 >> 2) & 0x3 { - 0 => shuffle2!($a, $e, 0, 1), - 1 => shuffle2!($a, $e, 2, 3), - 2 => shuffle2!($a, $e, 4, 5), - _ => shuffle2!($a, $e, 6, 7), - } - }; - } - match imm8 & 0x3 { - 0 => shuffle1!(0, 1), - 1 => shuffle1!(2, 3), - 2 => shuffle1!(4, 5), - _ => shuffle1!(6, 7), - } +#[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b10_11_11_11))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_shuffle_f64x2(a: __m512d, b: __m512d) -> __m512d { + static_assert_imm8!(MASK); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r: f64x8 = simd_shuffle8( + a, + b, + [ + (MASK as u32 & 0b11) * 2 + 0, + (MASK as u32 & 0b11) * 2 + 1, + ((MASK as u32 >> 2) & 0b11) * 2 + 0, + ((MASK as u32 >> 2) & 0b11) * 2 + 1, + ((MASK as u32 >> 4) & 0b11) * 2 + 0 + 8, + ((MASK as u32 >> 4) & 0b11) * 2 + 1 + 8, + ((MASK as u32 >> 6) & 0b11) * 2 + 0 + 8, + ((MASK as u32 >> 6) & 0b11) * 2 + 1 + 8, + ], + ); + transmute(r) } /// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -22359,21 +22345,16 @@ pub unsafe fn _mm512_shuffle_f64x2(a: __m512d, b: __m512d, imm8: i32) -> __m512d /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_f64x2&expand=5169) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vshuff64x2, imm8 = 0b10111111))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm512_mask_shuffle_f64x2( +#[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b10_11_11_11))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_shuffle_f64x2( src: __m512d, k: __mmask8, a: __m512d, b: __m512d, - imm8: i32, ) -> __m512d { - macro_rules! 
call { - ($imm8:expr) => { - _mm512_shuffle_f64x2(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + static_assert_imm8!(MASK); + let r = _mm512_shuffle_f64x2::(a, b); transmute(simd_select_bitmask(k, r.as_f64x8(), src.as_f64x8())) } @@ -22382,20 +22363,15 @@ pub unsafe fn _mm512_mask_shuffle_f64x2( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_f64x2&expand=5170) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vshuff64x2, imm8 = 0b10111111))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_maskz_shuffle_f64x2( +#[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b10_11_11_11))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_maskz_shuffle_f64x2( k: __mmask8, a: __m512d, b: __m512d, - imm8: i32, ) -> __m512d { - macro_rules! call { - ($imm8:expr) => { - _mm512_shuffle_f64x2(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + static_assert_imm8!(MASK); + let r = _mm512_shuffle_f64x2::(a, b); let zero = _mm512_setzero_pd().as_f64x8(); transmute(simd_select_bitmask(k, r.as_f64x8(), zero)) } @@ -22405,35 +22381,22 @@ pub unsafe fn _mm512_maskz_shuffle_f64x2( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_f64x2&expand=5168) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vperm, imm8 = 0b01))] //should be vshuff64x2 -#[rustc_args_required_const(2)] -pub unsafe fn _mm256_shuffle_f64x2(a: __m256d, b: __m256d, imm8: i32) -> __m256d { - assert!(imm8 >= 0 && imm8 <= 255); - let imm8 = (imm8 & 0xFF) as u8; +#[cfg_attr(test, assert_instr(vperm, MASK = 0b01))] //should be vshuff64x2 +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm256_shuffle_f64x2(a: __m256d, b: __m256d) -> __m256d { + static_assert_imm8!(MASK); let a = a.as_f64x4(); let b = b.as_f64x4(); - macro_rules! 
shuffle2 { - ( - $a:expr, - $b:expr, - $c:expr, - $d:expr - ) => { - simd_shuffle4(a, b, [$a, $b, $c, $d]) - }; - } - macro_rules! shuffle1 { - ($a:expr, $b:expr) => { - match (imm8 >> 1) & 0x1 { - 0 => shuffle2!($a, $b, 4, 5), - _ => shuffle2!($a, $b, 6, 7), - } - }; - } - let r: f64x4 = match imm8 & 0x1 { - 0 => shuffle1!(0, 1), - _ => shuffle1!(2, 3), - }; + let r: f64x4 = simd_shuffle4( + a, + b, + [ + (MASK as u32 & 0b1) * 2 + 0, + (MASK as u32 & 0b1) * 2 + 1, + ((MASK as u32 >> 1) & 0b1) * 2 + 0 + 4, + ((MASK as u32 >> 1) & 0b1) * 2 + 1 + 4, + ], + ); transmute(r) } @@ -22442,21 +22405,16 @@ pub unsafe fn _mm256_shuffle_f64x2(a: __m256d, b: __m256d, imm8: i32) -> __m256d /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shuffle_f64x2&expand=5166) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vshuff64x2, imm8 = 0b11))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm256_mask_shuffle_f64x2( +#[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b11))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm256_mask_shuffle_f64x2( src: __m256d, k: __mmask8, a: __m256d, b: __m256d, - imm8: i32, ) -> __m256d { - macro_rules! 
call { - ($imm8:expr) => { - _mm256_shuffle_f64x2(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + static_assert_imm8!(MASK); + let r = _mm256_shuffle_f64x2::(a, b); transmute(simd_select_bitmask(k, r.as_f64x4(), src.as_f64x4())) } @@ -22465,20 +22423,15 @@ pub unsafe fn _mm256_mask_shuffle_f64x2( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shuffle_f64x2&expand=5167) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vshuff64x2, imm8 = 0b11))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm256_maskz_shuffle_f64x2( +#[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b11))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm256_maskz_shuffle_f64x2( k: __mmask8, a: __m256d, b: __m256d, - imm8: i32, ) -> __m256d { - macro_rules! call { - ($imm8:expr) => { - _mm256_shuffle_f64x2(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + static_assert_imm8!(MASK); + let r = _mm256_shuffle_f64x2::(a, b); let zero = _mm256_setzero_pd().as_f64x4(); transmute(simd_select_bitmask(k, r.as_f64x4(), zero)) } diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 6d816b86c3..9ad35f7166 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -9720,7 +9720,7 @@ mod tests { unsafe fn test_mm512_shuffle_f64x2() { let a = _mm512_setr_pd(1., 4., 5., 8., 9., 12., 13., 16.); let b = _mm512_setr_pd(2., 3., 6., 7., 10., 11., 14., 15.); - let r = _mm512_shuffle_f64x2(a, b, 0b00000000); + let r = _mm512_shuffle_f64x2::<0b00_00_00_00>(a, b); let e = _mm512_setr_pd(1., 4., 1., 4., 2., 3., 2., 3.); assert_eq_m512d(r, e); } @@ -9729,9 +9729,9 @@ mod tests { unsafe fn test_mm512_mask_shuffle_f64x2() { let a = _mm512_setr_pd(1., 4., 5., 8., 9., 12., 13., 16.); let b = _mm512_setr_pd(2., 3., 6., 7., 10., 11., 14., 15.); - let r = _mm512_mask_shuffle_f64x2(a, 0, a, b, 0b00000000); + 
let r = _mm512_mask_shuffle_f64x2::<0b00_00_00_00>(a, 0, a, b); assert_eq_m512d(r, a); - let r = _mm512_mask_shuffle_f64x2(a, 0b11111111, a, b, 0b00000000); + let r = _mm512_mask_shuffle_f64x2::<0b00_00_00_00>(a, 0b11111111, a, b); let e = _mm512_setr_pd(1., 4., 1., 4., 2., 3., 2., 3.); assert_eq_m512d(r, e); } @@ -9740,9 +9740,9 @@ mod tests { unsafe fn test_mm512_maskz_shuffle_f64x2() { let a = _mm512_setr_pd(1., 4., 5., 8., 9., 12., 13., 16.); let b = _mm512_setr_pd(2., 3., 6., 7., 10., 11., 14., 15.); - let r = _mm512_maskz_shuffle_f64x2(0, a, b, 0b00000000); + let r = _mm512_maskz_shuffle_f64x2::<0b00_00_00_00>(0, a, b); assert_eq_m512d(r, _mm512_setzero_pd()); - let r = _mm512_maskz_shuffle_f64x2(0b00001111, a, b, 0b00000000); + let r = _mm512_maskz_shuffle_f64x2::<0b00_00_00_00>(0b00001111, a, b); let e = _mm512_setr_pd(1., 4., 1., 4., 0., 0., 0., 0.); assert_eq_m512d(r, e); } @@ -9751,7 +9751,7 @@ mod tests { unsafe fn test_mm256_shuffle_f64x2() { let a = _mm256_set_pd(1., 4., 5., 8.); let b = _mm256_set_pd(2., 3., 6., 7.); - let r = _mm256_shuffle_f64x2(a, b, 0b00); + let r = _mm256_shuffle_f64x2::<0b00>(a, b); let e = _mm256_set_pd(6., 7., 5., 8.); assert_eq_m256d(r, e); } @@ -9760,9 +9760,9 @@ mod tests { unsafe fn test_mm256_mask_shuffle_f64x2() { let a = _mm256_set_pd(1., 4., 5., 8.); let b = _mm256_set_pd(2., 3., 6., 7.); - let r = _mm256_mask_shuffle_f64x2(a, 0, a, b, 0b00); + let r = _mm256_mask_shuffle_f64x2::<0b00>(a, 0, a, b); assert_eq_m256d(r, a); - let r = _mm256_mask_shuffle_f64x2(a, 0b00001111, a, b, 0b00); + let r = _mm256_mask_shuffle_f64x2::<0b00>(a, 0b00001111, a, b); let e = _mm256_set_pd(6., 7., 5., 8.); assert_eq_m256d(r, e); } @@ -9771,9 +9771,9 @@ mod tests { unsafe fn test_mm256_maskz_shuffle_f64x2() { let a = _mm256_set_pd(1., 4., 5., 8.); let b = _mm256_set_pd(2., 3., 6., 7.); - let r = _mm256_maskz_shuffle_f64x2(0, a, b, 0b00); + let r = _mm256_maskz_shuffle_f64x2::<0b00>(0, a, b); assert_eq_m256d(r, _mm256_setzero_pd()); - let 
r = _mm256_maskz_shuffle_f64x2(0b00001111, a, b, 0b00); + let r = _mm256_maskz_shuffle_f64x2::<0b00>(0b00001111, a, b); let e = _mm256_set_pd(6., 7., 5., 8.); assert_eq_m256d(r, e); } From cc48c224c14895451a4af14bc028d212a45f3cdb Mon Sep 17 00:00:00 2001 From: jironglin Date: Thu, 4 Mar 2021 14:21:46 +0000 Subject: [PATCH 039/123] mm_cvtt_roundss,sd_u64,i64,si64; mm_cvt_roundss,sd_u64,i64,si64; mm_cvt_roundu64,i64,si64_ss,sd --- crates/core_arch/src/x86_64/avx512f.rs | 270 +++++++++---------------- 1 file changed, 90 insertions(+), 180 deletions(-) diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 9ad35f7166..43906f7714 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -145,16 +145,11 @@ pub unsafe fn _mm_cvttss_u64(a: __m128) -> u64 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundi64_sd&expand=1313) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsi2sd, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_cvt_roundi64_sd(a: __m128d, b: i64, rounding: i32) -> __m128d { +#[cfg_attr(test, assert_instr(vcvtsi2sd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_cvt_roundi64_sd(a: __m128d, b: i64) -> __m128d { let a = a.as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtsi2sd64(a, b, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtsi2sd64(a, b, ROUNDING); transmute(r) } @@ -169,16 +164,11 @@ pub unsafe fn _mm_cvt_roundi64_sd(a: __m128d, b: i64, rounding: i32) -> __m128d /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsi64_sd&expand=1367) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsi2sd, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_cvt_roundsi64_sd(a: __m128d, b: i64, rounding: i32) -> __m128d { +#[cfg_attr(test, assert_instr(vcvtsi2sd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_cvt_roundsi64_sd(a: __m128d, b: i64) -> __m128d { let a = a.as_f64x2(); - macro_rules! call { - ($imm4:expr) => { - vcvtsi2sd64(a, b, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtsi2sd64(a, b, ROUNDING); transmute(r) } @@ -193,16 +183,11 @@ pub unsafe fn _mm_cvt_roundsi64_sd(a: __m128d, b: i64, rounding: i32) -> __m128d /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundi64_ss&expand=1314) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsi2ss, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_cvt_roundi64_ss(a: __m128, b: i64, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vcvtsi2ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_cvt_roundi64_ss(a: __m128, b: i64) -> __m128 { let a = a.as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtsi2ss64(a, b, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtsi2ss64(a, b, ROUNDING); transmute(r) } @@ -217,16 +202,11 @@ pub unsafe fn _mm_cvt_roundi64_ss(a: __m128, b: i64, rounding: i32) -> __m128 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundu64_sd&expand=1379) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtusi2sd, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_cvt_roundu64_sd(a: __m128d, b: u64, rounding: i32) -> __m128d { +#[cfg_attr(test, assert_instr(vcvtusi2sd, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_cvt_roundu64_sd(a: __m128d, b: u64) -> __m128d { let a = a.as_f64x2(); - macro_rules! call { - ($imm4:expr) => { - vcvtusi2sd64(a, b, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtusi2sd64(a, b, ROUNDING); transmute(r) } @@ -241,16 +221,11 @@ pub unsafe fn _mm_cvt_roundu64_sd(a: __m128d, b: u64, rounding: i32) -> __m128d /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsi64_ss&expand=1368) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsi2ss, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_cvt_roundsi64_ss(a: __m128, b: i64, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vcvtsi2ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_cvt_roundsi64_ss(a: __m128, b: i64) -> __m128 { let a = a.as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtsi2ss64(a, b, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtsi2ss64(a, b, ROUNDING); transmute(r) } @@ -265,16 +240,11 @@ pub unsafe fn _mm_cvt_roundsi64_ss(a: __m128, b: i64, rounding: i32) -> __m128 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundu64_ss&expand=1380) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtusi2ss, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_cvt_roundu64_ss(a: __m128, b: u64, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vcvtusi2ss, ROUNDING = 8))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_cvt_roundu64_ss(a: __m128, b: u64) -> __m128 { let a = a.as_f32x4(); - macro_rules! call { - ($imm4:expr) => { - vcvtusi2ss64(a, b, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtusi2ss64(a, b, ROUNDING); transmute(r) } @@ -289,16 +259,11 @@ pub unsafe fn _mm_cvt_roundu64_ss(a: __m128, b: u64, rounding: i32) -> __m128 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsd_si64&expand=1360) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsd2si, rounding = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvt_roundsd_si64(a: __m128d, rounding: i32) -> i64 { +#[cfg_attr(test, assert_instr(vcvtsd2si, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvt_roundsd_si64(a: __m128d) -> i64 { let a = a.as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtsd2si64(a, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtsd2si64(a, ROUNDING); transmute(r) } @@ -313,16 +278,11 @@ pub unsafe fn _mm_cvt_roundsd_si64(a: __m128d, rounding: i32) -> i64 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsd_i64&expand=1358) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsd2si, rounding = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvt_roundsd_i64(a: __m128d, rounding: i32) -> i64 { +#[cfg_attr(test, assert_instr(vcvtsd2si, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvt_roundsd_i64(a: __m128d) -> i64 { let a = a.as_f64x2(); - macro_rules! call { - ($imm4:expr) => { - vcvtsd2si64(a, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtsd2si64(a, ROUNDING); transmute(r) } @@ -337,16 +297,11 @@ pub unsafe fn _mm_cvt_roundsd_i64(a: __m128d, rounding: i32) -> i64 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsd_u64&expand=1365) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsd2usi, rounding = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvt_roundsd_u64(a: __m128d, rounding: i32) -> u64 { +#[cfg_attr(test, assert_instr(vcvtsd2usi, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvt_roundsd_u64(a: __m128d) -> u64 { let a = a.as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtsd2usi64(a, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtsd2usi64(a, ROUNDING); transmute(r) } @@ -361,16 +316,11 @@ pub unsafe fn _mm_cvt_roundsd_u64(a: __m128d, rounding: i32) -> u64 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundss_si64&expand=1375) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtss2si, rounding = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvt_roundss_si64(a: __m128, rounding: i32) -> i64 { +#[cfg_attr(test, assert_instr(vcvtss2si, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvt_roundss_si64(a: __m128) -> i64 { let a = a.as_f32x4(); - macro_rules! call { - ($imm4:expr) => { - vcvtss2si64(a, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtss2si64(a, ROUNDING); transmute(r) } @@ -385,16 +335,11 @@ pub unsafe fn _mm_cvt_roundss_si64(a: __m128, rounding: i32) -> i64 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundss_i64&expand=1370) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtss2si, rounding = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvt_roundss_i64(a: __m128, rounding: i32) -> i64 { +#[cfg_attr(test, assert_instr(vcvtss2si, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvt_roundss_i64(a: __m128) -> i64 { let a = a.as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtss2si64(a, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtss2si64(a, ROUNDING); transmute(r) } @@ -409,16 +354,11 @@ pub unsafe fn _mm_cvt_roundss_i64(a: __m128, rounding: i32) -> i64 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundss_u64&expand=1377) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtss2usi, rounding = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvt_roundss_u64(a: __m128, rounding: i32) -> u64 { +#[cfg_attr(test, assert_instr(vcvtss2usi, ROUNDING = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvt_roundss_u64(a: __m128) -> u64 { let a = a.as_f32x4(); - macro_rules! call { - ($imm4:expr) => { - vcvtss2usi64(a, $imm4) - }; - } - let r = constify_imm4_round!(rounding, call); + let r = vcvtss2usi64(a, ROUNDING); transmute(r) } @@ -428,16 +368,11 @@ pub unsafe fn _mm_cvt_roundss_u64(a: __m128, rounding: i32) -> u64 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsd_si64&expand=1931) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsd2si, sae = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvtt_roundsd_si64(a: __m128d, sae: i32) -> i64 { +#[cfg_attr(test, assert_instr(vcvtsd2si, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvtt_roundsd_si64(a: __m128d) -> i64 { let a = a.as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtsd2si64(a, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtsd2si64(a, SAE); transmute(r) } @@ -447,16 +382,11 @@ pub unsafe fn _mm_cvtt_roundsd_si64(a: __m128d, sae: i32) -> i64 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsd_i64&expand=1929) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsd2si, sae = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvtt_roundsd_i64(a: __m128d, sae: i32) -> i64 { +#[cfg_attr(test, assert_instr(vcvtsd2si, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvtt_roundsd_i64(a: __m128d) -> i64 { let a = a.as_f64x2(); - macro_rules! call { - ($imm4:expr) => { - vcvtsd2si64(a, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtsd2si64(a, SAE); transmute(r) } @@ -466,16 +396,11 @@ pub unsafe fn _mm_cvtt_roundsd_i64(a: __m128d, sae: i32) -> i64 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsd_u64&expand=1933) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtsd2usi, sae = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvtt_roundsd_u64(a: __m128d, sae: i32) -> u64 { +#[cfg_attr(test, assert_instr(vcvtsd2usi, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvtt_roundsd_u64(a: __m128d) -> u64 { let a = a.as_f64x2(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtsd2usi64(a, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtsd2usi64(a, SAE); transmute(r) } @@ -485,16 +410,11 @@ pub unsafe fn _mm_cvtt_roundsd_u64(a: __m128d, sae: i32) -> u64 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundss_i64&expand=1935) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtss2si, sae = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvtt_roundss_i64(a: __m128, sae: i32) -> i64 { +#[cfg_attr(test, assert_instr(vcvtss2si, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvtt_roundss_i64(a: __m128) -> i64 { let a = a.as_f32x4(); - macro_rules! call { - ($imm4:expr) => { - vcvtss2si64(a, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtss2si64(a, SAE); transmute(r) } @@ -504,16 +424,11 @@ pub unsafe fn _mm_cvtt_roundss_i64(a: __m128, sae: i32) -> i64 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundss_si64&expand=1937) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtss2si, sae = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvtt_roundss_si64(a: __m128, sae: i32) -> i64 { +#[cfg_attr(test, assert_instr(vcvtss2si, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvtt_roundss_si64(a: __m128) -> i64 { let a = a.as_f32x4(); - macro_rules! 
call { - ($imm4:expr) => { - vcvtss2si64(a, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtss2si64(a, SAE); transmute(r) } @@ -523,16 +438,11 @@ pub unsafe fn _mm_cvtt_roundss_si64(a: __m128, sae: i32) -> i64 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundss_u64&expand=1939) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcvtss2usi, sae = 8))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_cvtt_roundss_u64(a: __m128, sae: i32) -> u64 { +#[cfg_attr(test, assert_instr(vcvtss2usi, SAE = 8))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_cvtt_roundss_u64(a: __m128) -> u64 { let a = a.as_f32x4(); - macro_rules! call { - ($imm4:expr) => { - vcvtss2usi64(a, $imm4) - }; - } - let r = constify_imm4_sae!(sae, call); + let r = vcvtss2usi64(a, SAE); transmute(r) } @@ -12197,7 +12107,7 @@ mod tests { unsafe fn test_mm_cvt_roundi64_ss() { let a = _mm_set_ps(0., -0.5, 1., -1.5); let b: i64 = 9; - let r = _mm_cvt_roundi64_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_cvt_roundi64_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); let e = _mm_set_ps(0., -0.5, 1., 9.); assert_eq_m128(r, e); } @@ -12206,7 +12116,7 @@ mod tests { unsafe fn test_mm_cvt_roundsi64_ss() { let a = _mm_set_ps(0., -0.5, 1., -1.5); let b: i64 = 9; - let r = _mm_cvt_roundsi64_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_cvt_roundsi64_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); let e = _mm_set_ps(0., -0.5, 1., 9.); assert_eq_m128(r, e); } @@ -12232,7 +12142,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvt_roundsd_si64() { let a = _mm_set_pd(1., -1.5); - let r = _mm_cvt_roundsd_si64(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_cvt_roundsd_si64::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); let e: i64 = -1; assert_eq!(r, e); } @@ -12240,7 +12150,7 @@ mod tests { #[simd_test(enable = 
"avx512f")] unsafe fn test_mm_cvt_roundsd_i64() { let a = _mm_set_pd(1., -1.5); - let r = _mm_cvt_roundsd_i64(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_cvt_roundsd_i64::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); let e: i64 = -1; assert_eq!(r, e); } @@ -12248,7 +12158,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvt_roundsd_u64() { let a = _mm_set_pd(1., f64::MAX); - let r = _mm_cvt_roundsd_u64(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_cvt_roundsd_u64::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); let e: u64 = u64::MAX; assert_eq!(r, e); } @@ -12264,7 +12174,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvt_roundss_i64() { let a = _mm_set_ps(0., -0.5, 1., -1.5); - let r = _mm_cvt_roundss_i64(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_cvt_roundss_i64::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); let e: i64 = -1; assert_eq!(r, e); } @@ -12272,7 +12182,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvt_roundss_si64() { let a = _mm_set_ps(0., -0.5, 1., -1.5); - let r = _mm_cvt_roundss_si64(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_cvt_roundss_si64::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); let e: i64 = -1; assert_eq!(r, e); } @@ -12280,7 +12190,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvt_roundss_u64() { let a = _mm_set_ps(0., -0.5, 1., -1.5); - let r = _mm_cvt_roundss_u64(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_cvt_roundss_u64::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); let e: u64 = u64::MAX; assert_eq!(r, e); } @@ -12304,7 +12214,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvtt_roundsd_i64() { let a = _mm_set_pd(1., -1.5); - let r = _mm_cvtt_roundsd_i64(a, _MM_FROUND_CUR_DIRECTION); + let r = _mm_cvtt_roundsd_i64::<_MM_FROUND_CUR_DIRECTION>(a); let e: i64 = -2; assert_eq!(r, e); } @@ -12312,7 +12222,7 @@ mod tests { #[simd_test(enable = "avx512f")] 
unsafe fn test_mm_cvtt_roundsd_si64() { let a = _mm_set_pd(1., -1.5); - let r = _mm_cvtt_roundsd_si64(a, _MM_FROUND_CUR_DIRECTION); + let r = _mm_cvtt_roundsd_si64::<_MM_FROUND_CUR_DIRECTION>(a); let e: i64 = -2; assert_eq!(r, e); } @@ -12320,7 +12230,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvtt_roundsd_u64() { let a = _mm_set_pd(1., -1.5); - let r = _mm_cvtt_roundsd_u64(a, _MM_FROUND_CUR_DIRECTION); + let r = _mm_cvtt_roundsd_u64::<_MM_FROUND_CUR_DIRECTION>(a); let e: u64 = u64::MAX; assert_eq!(r, e); } @@ -12344,7 +12254,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvtt_roundss_i64() { let a = _mm_set_ps(0., -0.5, 1., -1.5); - let r = _mm_cvtt_roundss_i64(a, _MM_FROUND_CUR_DIRECTION); + let r = _mm_cvtt_roundss_i64::<_MM_FROUND_CUR_DIRECTION>(a); let e: i64 = -2; assert_eq!(r, e); } @@ -12352,7 +12262,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvtt_roundss_si64() { let a = _mm_set_ps(0., -0.5, 1., -1.5); - let r = _mm_cvtt_roundss_si64(a, _MM_FROUND_CUR_DIRECTION); + let r = _mm_cvtt_roundss_si64::<_MM_FROUND_CUR_DIRECTION>(a); let e: i64 = -2; assert_eq!(r, e); } @@ -12360,7 +12270,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm_cvtt_roundss_u64() { let a = _mm_set_ps(0., -0.5, 1., -1.5); - let r = _mm_cvtt_roundss_u64(a, _MM_FROUND_CUR_DIRECTION); + let r = _mm_cvtt_roundss_u64::<_MM_FROUND_CUR_DIRECTION>(a); let e: u64 = u64::MAX; assert_eq!(r, e); } @@ -12395,7 +12305,7 @@ mod tests { unsafe fn test_mm_cvt_roundu64_ss() { let a = _mm_set_ps(0., -0.5, 1., -1.5); let b: u64 = 9; - let r = _mm_cvt_roundu64_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_cvt_roundu64_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); let e = _mm_set_ps(0., -0.5, 1., 9.); assert_eq_m128(r, e); } @@ -12404,7 +12314,7 @@ mod tests { unsafe fn test_mm_cvt_roundu64_sd() { let a = _mm_set_pd(1., -1.5); let b: u64 = 9; - let r = _mm_cvt_roundu64_sd(a, b, 
_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_cvt_roundu64_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); let e = _mm_set_pd(1., 9.); assert_eq_m128d(r, e); } @@ -12413,7 +12323,7 @@ mod tests { unsafe fn test_mm_cvt_roundi64_sd() { let a = _mm_set_pd(1., -1.5); let b: i64 = 9; - let r = _mm_cvt_roundi64_sd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_cvt_roundi64_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); let e = _mm_set_pd(1., 9.); assert_eq_m128d(r, e); } @@ -12422,7 +12332,7 @@ mod tests { unsafe fn test_mm_cvt_roundsi64_sd() { let a = _mm_set_pd(1., -1.5); let b: i64 = 9; - let r = _mm_cvt_roundsi64_sd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let r = _mm_cvt_roundsi64_sd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b); let e = _mm_set_pd(1., 9.); assert_eq_m128d(r, e); } From dc2774d7899b1d9720bd098bf082775ccfa24a87 Mon Sep 17 00:00:00 2001 From: jironglin Date: Thu, 4 Mar 2021 14:41:58 +0000 Subject: [PATCH 040/123] add static_assert --- crates/core_arch/src/x86_64/avx512f.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 43906f7714..af62b2112c 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -148,6 +148,7 @@ pub unsafe fn _mm_cvttss_u64(a: __m128) -> u64 { #[cfg_attr(test, assert_instr(vcvtsi2sd, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] pub unsafe fn _mm_cvt_roundi64_sd(a: __m128d, b: i64) -> __m128d { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let r = vcvtsi2sd64(a, b, ROUNDING); transmute(r) @@ -167,6 +168,7 @@ pub unsafe fn _mm_cvt_roundi64_sd(a: __m128d, b: i64) -> __ #[cfg_attr(test, assert_instr(vcvtsi2sd, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] pub unsafe fn _mm_cvt_roundsi64_sd(a: __m128d, b: i64) -> __m128d { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let r = vcvtsi2sd64(a, b, 
ROUNDING); transmute(r) @@ -186,6 +188,7 @@ pub unsafe fn _mm_cvt_roundsi64_sd(a: __m128d, b: i64) -> _ #[cfg_attr(test, assert_instr(vcvtsi2ss, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] pub unsafe fn _mm_cvt_roundi64_ss(a: __m128, b: i64) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let r = vcvtsi2ss64(a, b, ROUNDING); transmute(r) @@ -205,6 +208,7 @@ pub unsafe fn _mm_cvt_roundi64_ss(a: __m128, b: i64) -> __m #[cfg_attr(test, assert_instr(vcvtusi2sd, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] pub unsafe fn _mm_cvt_roundu64_sd(a: __m128d, b: u64) -> __m128d { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let r = vcvtusi2sd64(a, b, ROUNDING); transmute(r) @@ -224,6 +228,7 @@ pub unsafe fn _mm_cvt_roundu64_sd(a: __m128d, b: u64) -> __ #[cfg_attr(test, assert_instr(vcvtsi2ss, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] pub unsafe fn _mm_cvt_roundsi64_ss(a: __m128, b: i64) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let r = vcvtsi2ss64(a, b, ROUNDING); transmute(r) @@ -243,6 +248,7 @@ pub unsafe fn _mm_cvt_roundsi64_ss(a: __m128, b: i64) -> __ #[cfg_attr(test, assert_instr(vcvtusi2ss, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] pub unsafe fn _mm_cvt_roundu64_ss(a: __m128, b: u64) -> __m128 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let r = vcvtusi2ss64(a, b, ROUNDING); transmute(r) @@ -262,6 +268,7 @@ pub unsafe fn _mm_cvt_roundu64_ss(a: __m128, b: u64) -> __m #[cfg_attr(test, assert_instr(vcvtsd2si, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] pub unsafe fn _mm_cvt_roundsd_si64(a: __m128d) -> i64 { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let r = vcvtsd2si64(a, ROUNDING); transmute(r) @@ -281,6 +288,7 @@ pub unsafe fn _mm_cvt_roundsd_si64(a: __m128d) -> i64 { #[cfg_attr(test, assert_instr(vcvtsd2si, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] pub unsafe fn _mm_cvt_roundsd_i64(a: __m128d) -> i64 { + 
static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let r = vcvtsd2si64(a, ROUNDING); transmute(r) @@ -300,6 +308,7 @@ pub unsafe fn _mm_cvt_roundsd_i64(a: __m128d) -> i64 { #[cfg_attr(test, assert_instr(vcvtsd2usi, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] pub unsafe fn _mm_cvt_roundsd_u64(a: __m128d) -> u64 { + static_assert_rounding!(ROUNDING); let a = a.as_f64x2(); let r = vcvtsd2usi64(a, ROUNDING); transmute(r) @@ -319,6 +328,7 @@ pub unsafe fn _mm_cvt_roundsd_u64(a: __m128d) -> u64 { #[cfg_attr(test, assert_instr(vcvtss2si, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] pub unsafe fn _mm_cvt_roundss_si64(a: __m128) -> i64 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let r = vcvtss2si64(a, ROUNDING); transmute(r) @@ -338,6 +348,7 @@ pub unsafe fn _mm_cvt_roundss_si64(a: __m128) -> i64 { #[cfg_attr(test, assert_instr(vcvtss2si, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] pub unsafe fn _mm_cvt_roundss_i64(a: __m128) -> i64 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let r = vcvtss2si64(a, ROUNDING); transmute(r) @@ -357,6 +368,7 @@ pub unsafe fn _mm_cvt_roundss_i64(a: __m128) -> i64 { #[cfg_attr(test, assert_instr(vcvtss2usi, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] pub unsafe fn _mm_cvt_roundss_u64(a: __m128) -> u64 { + static_assert_rounding!(ROUNDING); let a = a.as_f32x4(); let r = vcvtss2usi64(a, ROUNDING); transmute(r) @@ -371,6 +383,7 @@ pub unsafe fn _mm_cvt_roundss_u64(a: __m128) -> u64 { #[cfg_attr(test, assert_instr(vcvtsd2si, SAE = 8))] #[rustc_legacy_const_generics(1)] pub unsafe fn _mm_cvtt_roundsd_si64(a: __m128d) -> i64 { + static_assert_sae!(SAE); let a = a.as_f64x2(); let r = vcvtsd2si64(a, SAE); transmute(r) @@ -385,6 +398,7 @@ pub unsafe fn _mm_cvtt_roundsd_si64(a: __m128d) -> i64 { #[cfg_attr(test, assert_instr(vcvtsd2si, SAE = 8))] #[rustc_legacy_const_generics(1)] pub unsafe fn _mm_cvtt_roundsd_i64(a: __m128d) -> i64 { + static_assert_sae!(SAE); let a = a.as_f64x2(); let r = 
vcvtsd2si64(a, SAE); transmute(r) @@ -399,6 +413,7 @@ pub unsafe fn _mm_cvtt_roundsd_i64(a: __m128d) -> i64 { #[cfg_attr(test, assert_instr(vcvtsd2usi, SAE = 8))] #[rustc_legacy_const_generics(1)] pub unsafe fn _mm_cvtt_roundsd_u64(a: __m128d) -> u64 { + static_assert_sae!(SAE); let a = a.as_f64x2(); let r = vcvtsd2usi64(a, SAE); transmute(r) @@ -413,6 +428,7 @@ pub unsafe fn _mm_cvtt_roundsd_u64(a: __m128d) -> u64 { #[cfg_attr(test, assert_instr(vcvtss2si, SAE = 8))] #[rustc_legacy_const_generics(1)] pub unsafe fn _mm_cvtt_roundss_i64(a: __m128) -> i64 { + static_assert_sae!(SAE); let a = a.as_f32x4(); let r = vcvtss2si64(a, SAE); transmute(r) @@ -427,6 +443,7 @@ pub unsafe fn _mm_cvtt_roundss_i64(a: __m128) -> i64 { #[cfg_attr(test, assert_instr(vcvtss2si, SAE = 8))] #[rustc_legacy_const_generics(1)] pub unsafe fn _mm_cvtt_roundss_si64(a: __m128) -> i64 { + static_assert_sae!(SAE); let a = a.as_f32x4(); let r = vcvtss2si64(a, SAE); transmute(r) @@ -441,6 +458,7 @@ pub unsafe fn _mm_cvtt_roundss_si64(a: __m128) -> i64 { #[cfg_attr(test, assert_instr(vcvtss2usi, SAE = 8))] #[rustc_legacy_const_generics(1)] pub unsafe fn _mm_cvtt_roundss_u64(a: __m128) -> u64 { + static_assert_sae!(SAE); let a = a.as_f32x4(); let r = vcvtss2usi64(a, SAE); transmute(r) From 06eb05c10c2ad77fb617f0b907124de75dd3540f Mon Sep 17 00:00:00 2001 From: jironglin Date: Thu, 4 Mar 2021 15:00:58 +0000 Subject: [PATCH 041/123] fix x86_64/macro --- crates/core_arch/src/x86_64/macros.rs | 47 ++++++++++++++------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/crates/core_arch/src/x86_64/macros.rs b/crates/core_arch/src/x86_64/macros.rs index e3682d40fe..cafa37dd6f 100644 --- a/crates/core_arch/src/x86_64/macros.rs +++ b/crates/core_arch/src/x86_64/macros.rs @@ -1,32 +1,33 @@ //! Utility macros. -// For round instructions, the only valid values for rounding are 4, 8, 9, 10 and 11. -// This macro enforces that. 
+// Helper struct used to trigger const eval errors when the const generic immediate value `imm` is +// not a round number. +pub(crate) struct ValidateConstRound; +impl ValidateConstRound { + pub(crate) const VALID: () = { + let _ = 1 / ((IMM == 4 || IMM == 8 || IMM == 9 || IMM == 10 || IMM == 11) as usize); + }; +} + #[allow(unused)] -macro_rules! constify_imm4_round { - ($imm8:expr, $expand:ident) => { - #[allow(overflowing_literals)] - match ($imm8) & 0b1111 { - 4 => $expand!(4), - 8 => $expand!(8), - 9 => $expand!(9), - 10 => $expand!(10), - 11 => $expand!(11), - _ => panic!("Invalid round value"), - } +macro_rules! static_assert_rounding { + ($imm:ident) => { + let _ = $crate::core_arch::x86_64::macros::ValidateConstRound::<$imm>::VALID; + }; +} + +// Helper struct used to trigger const eval errors when the const generic immediate value `imm` is +// not a sae number. +pub(crate) struct ValidateConstSae; +impl ValidateConstSae { + pub(crate) const VALID: () = { + let _ = 1 / ((IMM == 4 || IMM == 8) as usize); }; } -// For sae instructions, the only valid values for sae are 4 and 8. -// This macro enforces that. #[allow(unused)] -macro_rules! constify_imm4_sae { - ($imm8:expr, $expand:ident) => { - #[allow(overflowing_literals)] - match ($imm8) & 0b1111 { - 4 => $expand!(4), - 8 => $expand!(8), - _ => panic!("Invalid sae value"), - } +macro_rules! 
static_assert_sae { + ($imm:ident) => { + let _ = $crate::core_arch::x86_64::macros::ValidateConstSae::<$imm>::VALID; }; } From eacd3929832f0d9fa38c37ad1eaa0332faadc792 Mon Sep 17 00:00:00 2001 From: jironglin Date: Thu, 4 Mar 2021 15:31:00 +0000 Subject: [PATCH 042/123] remove x86/macro imm4_sae,imm4_rounding --- crates/core_arch/src/x86/macros.rs | 31 ------------------------------ 1 file changed, 31 deletions(-) diff --git a/crates/core_arch/src/x86/macros.rs b/crates/core_arch/src/x86/macros.rs index 4f160221d6..47ceaeb20a 100644 --- a/crates/core_arch/src/x86/macros.rs +++ b/crates/core_arch/src/x86/macros.rs @@ -184,37 +184,6 @@ macro_rules! constify_imm8_gather { }; } -// For round instructions, the only valid values for rounding are 4, 8, 9, 10 and 11. -// This macro enforces that. -#[allow(unused)] -macro_rules! constify_imm4_round { - ($imm8:expr, $expand:ident) => { - #[allow(overflowing_literals)] - match ($imm8) & 0b1111 { - 4 => $expand!(4), - 8 => $expand!(8), - 9 => $expand!(9), - 10 => $expand!(10), - 11 => $expand!(11), - _ => panic!("Invalid round value"), - } - }; -} - -// For sae instructions, the only valid values for sae are 4 and 8. -// This macro enforces that. -#[allow(unused)] -macro_rules! constify_imm4_sae { - ($imm8:expr, $expand:ident) => { - #[allow(overflowing_literals)] - match ($imm8) & 0b1111 { - 4 => $expand!(4), - 8 => $expand!(8), - _ => panic!("Invalid sae value"), - } - }; -} - // Two mantissas parameters. // This macro enforces that. 
#[allow(unused)] From 77a89e0088eb31361717d9e160f753b0dd64e8fd Mon Sep 17 00:00:00 2001 From: jironglin Date: Thu, 4 Mar 2021 16:56:42 +0000 Subject: [PATCH 043/123] shldi,shrdi_epi64,epi32,epi16 --- crates/core_arch/src/x86/avx512vbmi2.rs | 865 +++++++++++++----------- 1 file changed, 468 insertions(+), 397 deletions(-) diff --git a/crates/core_arch/src/x86/avx512vbmi2.rs b/crates/core_arch/src/x86/avx512vbmi2.rs index 032bce9176..b7a385dd97 100644 --- a/crates/core_arch/src/x86/avx512vbmi2.rs +++ b/crates/core_arch/src/x86/avx512vbmi2.rs @@ -920,14 +920,15 @@ pub unsafe fn _mm_maskz_shrdv_epi16(k: __mmask8, a: __m128i, b: __m128i, c: __m1 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shldi_epi64&expand=5060) #[inline] #[target_feature(enable = "avx512vbmi2")] -#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_shldi_epi64(a: __m512i, b: __m512i, imm8: i32) -> __m512i { - assert!(imm8 >= 0 && imm8 <= 255); +#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_shldi_epi64(a: __m512i, b: __m512i) -> __m512i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i64; transmute(vpshldvq( a.as_i64x8(), b.as_i64x8(), - _mm512_set1_epi64(imm8 as i64).as_i64x8(), + _mm512_set1_epi64(imm8).as_i64x8(), )) } @@ -936,20 +937,20 @@ pub unsafe fn _mm512_shldi_epi64(a: __m512i, b: __m512i, imm8: i32) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shldi_epi64&expand=5058) #[inline] #[target_feature(enable = "avx512vbmi2")] -#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm512_mask_shldi_epi64( +#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_shldi_epi64( src: __m512i, k: __mmask8, a: __m512i, b: __m512i, - imm8: i32, ) -> __m512i { 
- assert!(imm8 >= 0 && imm8 <= 255); + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i64; let shf: i64x8 = vpshldvq( a.as_i64x8(), b.as_i64x8(), - _mm512_set1_epi64(imm8 as i64).as_i64x8(), + _mm512_set1_epi64(imm8).as_i64x8(), ); transmute(simd_select_bitmask(k, shf, src.as_i64x8())) } @@ -959,14 +960,19 @@ pub unsafe fn _mm512_mask_shldi_epi64( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shldi_epi64&expand=5059) #[inline] #[target_feature(enable = "avx512vbmi2")] -#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_maskz_shldi_epi64(k: __mmask8, a: __m512i, b: __m512i, imm8: i32) -> __m512i { - assert!(imm8 >= 0 && imm8 <= 255); +#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_maskz_shldi_epi64( + k: __mmask8, + a: __m512i, + b: __m512i, +) -> __m512i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i64; let shf: i64x8 = vpshldvq( a.as_i64x8(), b.as_i64x8(), - _mm512_set1_epi64(imm8 as i64).as_i64x8(), + _mm512_set1_epi64(imm8).as_i64x8(), ); let zero = _mm512_setzero_si512().as_i64x8(); transmute(simd_select_bitmask(k, shf, zero)) @@ -977,14 +983,15 @@ pub unsafe fn _mm512_maskz_shldi_epi64(k: __mmask8, a: __m512i, b: __m512i, imm8 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shldi_epi64&expand=5057) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm256_shldi_epi64(a: __m256i, b: __m256i, imm8: i32) -> __m256i { - assert!(imm8 >= 0 && imm8 <= 255); +#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm256_shldi_epi64(a: __m256i, b: __m256i) -> __m256i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i64; transmute(vpshldvq256( a.as_i64x4(), b.as_i64x4(), - 
_mm256_set1_epi64x(imm8 as i64).as_i64x4(), + _mm256_set1_epi64x(imm8).as_i64x4(), )) } @@ -993,20 +1000,20 @@ pub unsafe fn _mm256_shldi_epi64(a: __m256i, b: __m256i, imm8: i32) -> __m256i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shldi_epi64&expand=5055) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm256_mask_shldi_epi64( +#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm256_mask_shldi_epi64( src: __m256i, k: __mmask8, a: __m256i, b: __m256i, - imm8: i32, ) -> __m256i { - assert!(imm8 >= 0 && imm8 <= 255); + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i64; let shf: i64x4 = vpshldvq256( a.as_i64x4(), b.as_i64x4(), - _mm256_set1_epi64x(imm8 as i64).as_i64x4(), + _mm256_set1_epi64x(imm8).as_i64x4(), ); transmute(simd_select_bitmask(k, shf, src.as_i64x4())) } @@ -1016,14 +1023,19 @@ pub unsafe fn _mm256_mask_shldi_epi64( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shldi_epi64&expand=5056) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm256_maskz_shldi_epi64(k: __mmask8, a: __m256i, b: __m256i, imm8: i32) -> __m256i { - assert!(imm8 >= 0 && imm8 <= 255); +#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm256_maskz_shldi_epi64( + k: __mmask8, + a: __m256i, + b: __m256i, +) -> __m256i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i64; let shf: i64x4 = vpshldvq256( a.as_i64x4(), b.as_i64x4(), - _mm256_set1_epi64x(imm8 as i64).as_i64x4(), + _mm256_set1_epi64x(imm8).as_i64x4(), ); let zero = _mm256_setzero_si256().as_i64x4(); transmute(simd_select_bitmask(k, shf, zero)) @@ -1034,14 +1046,15 @@ pub 
unsafe fn _mm256_maskz_shldi_epi64(k: __mmask8, a: __m256i, b: __m256i, imm8 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shldi_epi64&expand=5054) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_shldi_epi64(a: __m128i, b: __m128i, imm8: i32) -> __m128i { - assert!(imm8 >= 0 && imm8 <= 255); +#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_shldi_epi64(a: __m128i, b: __m128i) -> __m128i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i64; transmute(vpshldvq128( a.as_i64x2(), b.as_i64x2(), - _mm_set1_epi64x(imm8 as i64).as_i64x2(), + _mm_set1_epi64x(imm8).as_i64x2(), )) } @@ -1050,21 +1063,17 @@ pub unsafe fn _mm_shldi_epi64(a: __m128i, b: __m128i, imm8: i32) -> __m128i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shldi_epi64&expand=5052) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_shldi_epi64( +#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_shldi_epi64( src: __m128i, k: __mmask8, a: __m128i, b: __m128i, - imm8: i32, ) -> __m128i { - assert!(imm8 >= 0 && imm8 <= 255); - let shf: i64x2 = vpshldvq128( - a.as_i64x2(), - b.as_i64x2(), - _mm_set1_epi64x(imm8 as i64).as_i64x2(), - ); + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i64; + let shf: i64x2 = vpshldvq128(a.as_i64x2(), b.as_i64x2(), _mm_set1_epi64x(imm8).as_i64x2()); transmute(simd_select_bitmask(k, shf, src.as_i64x2())) } @@ -1073,15 +1082,16 @@ pub unsafe fn _mm_mask_shldi_epi64( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shldi_epi64&expand=5053) #[inline] 
#[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_shldi_epi64(k: __mmask8, a: __m128i, b: __m128i, imm8: i32) -> __m128i { - assert!(imm8 >= 0 && imm8 <= 255); - let shf: i64x2 = vpshldvq128( - a.as_i64x2(), - b.as_i64x2(), - _mm_set1_epi64x(imm8 as i64).as_i64x2(), - ); +#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_shldi_epi64( + k: __mmask8, + a: __m128i, + b: __m128i, +) -> __m128i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i64; + let shf: i64x2 = vpshldvq128(a.as_i64x2(), b.as_i64x2(), _mm_set1_epi64x(imm8).as_i64x2()); let zero = _mm_setzero_si128().as_i64x2(); transmute(simd_select_bitmask(k, shf, zero)) } @@ -1091,14 +1101,14 @@ pub unsafe fn _mm_maskz_shldi_epi64(k: __mmask8, a: __m128i, b: __m128i, imm8: i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shldi_epi32&expand=5051) #[inline] #[target_feature(enable = "avx512vbmi2")] -#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_shldi_epi32(a: __m512i, b: __m512i, imm8: i32) -> __m512i { - assert!(imm8 >= 0 && imm8 <= 255); +#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_shldi_epi32(a: __m512i, b: __m512i) -> __m512i { + static_assert_imm8!(IMM8); transmute(vpshldvd( a.as_i32x16(), b.as_i32x16(), - _mm512_set1_epi32(imm8).as_i32x16(), + _mm512_set1_epi32(IMM8).as_i32x16(), )) } @@ -1107,20 +1117,19 @@ pub unsafe fn _mm512_shldi_epi32(a: __m512i, b: __m512i, imm8: i32) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shldi_epi32&expand=5049) #[inline] #[target_feature(enable = "avx512vbmi2")] -#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] -#[rustc_args_required_const(4)] -pub unsafe 
fn _mm512_mask_shldi_epi32( +#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_shldi_epi32( src: __m512i, k: __mmask16, a: __m512i, b: __m512i, - imm8: i32, ) -> __m512i { - assert!(imm8 >= 0 && imm8 <= 255); + static_assert_imm8!(IMM8); let shf: i32x16 = vpshldvd( a.as_i32x16(), b.as_i32x16(), - _mm512_set1_epi32(imm8).as_i32x16(), + _mm512_set1_epi32(IMM8).as_i32x16(), ); transmute(simd_select_bitmask(k, shf, src.as_i32x16())) } @@ -1130,14 +1139,18 @@ pub unsafe fn _mm512_mask_shldi_epi32( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shldi_epi32&expand=5050) #[inline] #[target_feature(enable = "avx512vbmi2")] -#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_maskz_shldi_epi32(k: __mmask16, a: __m512i, b: __m512i, imm8: i32) -> __m512i { - assert!(imm8 >= 0 && imm8 <= 255); +#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_maskz_shldi_epi32( + k: __mmask16, + a: __m512i, + b: __m512i, +) -> __m512i { + static_assert_imm8!(IMM8); let shf: i32x16 = vpshldvd( a.as_i32x16(), b.as_i32x16(), - _mm512_set1_epi32(imm8).as_i32x16(), + _mm512_set1_epi32(IMM8).as_i32x16(), ); let zero = _mm512_setzero_si512().as_i32x16(); transmute(simd_select_bitmask(k, shf, zero)) @@ -1148,14 +1161,14 @@ pub unsafe fn _mm512_maskz_shldi_epi32(k: __mmask16, a: __m512i, b: __m512i, imm /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shldi_epi32&expand=5048) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm256_shldi_epi32(a: __m256i, b: __m256i, imm8: i32) -> __m256i { - assert!(imm8 >= 0 && imm8 <= 255); +#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] 
+#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm256_shldi_epi32(a: __m256i, b: __m256i) -> __m256i { + static_assert_imm8!(IMM8); transmute(vpshldvd256( a.as_i32x8(), b.as_i32x8(), - _mm256_set1_epi32(imm8).as_i32x8(), + _mm256_set1_epi32(IMM8).as_i32x8(), )) } @@ -1164,20 +1177,19 @@ pub unsafe fn _mm256_shldi_epi32(a: __m256i, b: __m256i, imm8: i32) -> __m256i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shldi_epi32&expand=5046) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm256_mask_shldi_epi32( +#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm256_mask_shldi_epi32( src: __m256i, k: __mmask8, a: __m256i, b: __m256i, - imm8: i32, ) -> __m256i { - assert!(imm8 >= 0 && imm8 <= 255); + static_assert_imm8!(IMM8); let shf: i32x8 = vpshldvd256( a.as_i32x8(), b.as_i32x8(), - _mm256_set1_epi32(imm8).as_i32x8(), + _mm256_set1_epi32(IMM8).as_i32x8(), ); transmute(simd_select_bitmask(k, shf, src.as_i32x8())) } @@ -1187,14 +1199,18 @@ pub unsafe fn _mm256_mask_shldi_epi32( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shldi_epi32&expand=5047) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm256_maskz_shldi_epi32(k: __mmask8, a: __m256i, b: __m256i, imm8: i32) -> __m256i { - assert!(imm8 >= 0 && imm8 <= 255); +#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm256_maskz_shldi_epi32( + k: __mmask8, + a: __m256i, + b: __m256i, +) -> __m256i { + static_assert_imm8!(IMM8); let shf: i32x8 = vpshldvd256( a.as_i32x8(), b.as_i32x8(), - _mm256_set1_epi32(imm8).as_i32x8(), + _mm256_set1_epi32(IMM8).as_i32x8(), ); let zero = 
_mm256_setzero_si256().as_i32x8(); transmute(simd_select_bitmask(k, shf, zero)) @@ -1205,14 +1221,14 @@ pub unsafe fn _mm256_maskz_shldi_epi32(k: __mmask8, a: __m256i, b: __m256i, imm8 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shldi_epi32&expand=5045) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_shldi_epi32(a: __m128i, b: __m128i, imm8: i32) -> __m128i { - assert!(imm8 >= 0 && imm8 <= 255); +#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_shldi_epi32(a: __m128i, b: __m128i) -> __m128i { + static_assert_imm8!(IMM8); transmute(vpshldvd128( a.as_i32x4(), b.as_i32x4(), - _mm_set1_epi32(imm8).as_i32x4(), + _mm_set1_epi32(IMM8).as_i32x4(), )) } @@ -1221,17 +1237,16 @@ pub unsafe fn _mm_shldi_epi32(a: __m128i, b: __m128i, imm8: i32) -> __m128i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shldi_epi32&expand=5043) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_shldi_epi32( +#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_shldi_epi32( src: __m128i, k: __mmask8, a: __m128i, b: __m128i, - imm8: i32, ) -> __m128i { - assert!(imm8 >= 0 && imm8 <= 255); - let shf: i32x4 = vpshldvd128(a.as_i32x4(), b.as_i32x4(), _mm_set1_epi32(imm8).as_i32x4()); + static_assert_imm8!(IMM8); + let shf: i32x4 = vpshldvd128(a.as_i32x4(), b.as_i32x4(), _mm_set1_epi32(IMM8).as_i32x4()); transmute(simd_select_bitmask(k, shf, src.as_i32x4())) } @@ -1240,11 +1255,15 @@ pub unsafe fn _mm_mask_shldi_epi32( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shldi_epi32&expand=5044) 
#[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_shldi_epi32(k: __mmask8, a: __m128i, b: __m128i, imm8: i32) -> __m128i { - assert!(imm8 >= 0 && imm8 <= 255); - let shf: i32x4 = vpshldvd128(a.as_i32x4(), b.as_i32x4(), _mm_set1_epi32(imm8).as_i32x4()); +#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_shldi_epi32( + k: __mmask8, + a: __m128i, + b: __m128i, +) -> __m128i { + static_assert_imm8!(IMM8); + let shf: i32x4 = vpshldvd128(a.as_i32x4(), b.as_i32x4(), _mm_set1_epi32(IMM8).as_i32x4()); let zero = _mm_setzero_si128().as_i32x4(); transmute(simd_select_bitmask(k, shf, zero)) } @@ -1254,14 +1273,15 @@ pub unsafe fn _mm_maskz_shldi_epi32(k: __mmask8, a: __m128i, b: __m128i, imm8: i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shldi_epi16&expand=5042) #[inline] #[target_feature(enable = "avx512vbmi2")] -#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_shldi_epi16(a: __m512i, b: __m512i, imm8: i32) -> __m512i { - assert!(imm8 >= 0 && imm8 <= 255); +#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_shldi_epi16(a: __m512i, b: __m512i) -> __m512i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i16; transmute(vpshldvw( a.as_i16x32(), b.as_i16x32(), - _mm512_set1_epi16(imm8 as i16).as_i16x32(), + _mm512_set1_epi16(imm8).as_i16x32(), )) } @@ -1270,20 +1290,20 @@ pub unsafe fn _mm512_shldi_epi16(a: __m512i, b: __m512i, imm8: i32) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shldi_epi16&expand=5040) #[inline] #[target_feature(enable = "avx512vbmi2")] -#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] -#[rustc_args_required_const(4)] -pub unsafe fn 
_mm512_mask_shldi_epi16( +#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_shldi_epi16( src: __m512i, k: __mmask32, a: __m512i, b: __m512i, - imm8: i32, ) -> __m512i { - assert!(imm8 >= 0 && imm8 <= 255); + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i16; let shf: i16x32 = vpshldvw( a.as_i16x32(), b.as_i16x32(), - _mm512_set1_epi16(imm8 as i16).as_i16x32(), + _mm512_set1_epi16(imm8).as_i16x32(), ); transmute(simd_select_bitmask(k, shf, src.as_i16x32())) } @@ -1293,14 +1313,19 @@ pub unsafe fn _mm512_mask_shldi_epi16( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shldi_epi16&expand=5041) #[inline] #[target_feature(enable = "avx512vbmi2")] -#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_maskz_shldi_epi16(k: __mmask32, a: __m512i, b: __m512i, imm8: i32) -> __m512i { - assert!(imm8 >= 0 && imm8 <= 255); +#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_maskz_shldi_epi16( + k: __mmask32, + a: __m512i, + b: __m512i, +) -> __m512i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i16; let shf: i16x32 = vpshldvw( a.as_i16x32(), b.as_i16x32(), - _mm512_set1_epi16(imm8 as i16).as_i16x32(), + _mm512_set1_epi16(imm8).as_i16x32(), ); let zero = _mm512_setzero_si512().as_i16x32(); transmute(simd_select_bitmask(k, shf, zero)) @@ -1311,14 +1336,15 @@ pub unsafe fn _mm512_maskz_shldi_epi16(k: __mmask32, a: __m512i, b: __m512i, imm /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shldi_epi16&expand=5039) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm256_shldi_epi16(a: __m256i, b: __m256i, imm8: i32) -> __m256i { - assert!(imm8 >= 0 && imm8 <= 255); +#[cfg_attr(test, 
assert_instr(vpshldw, IMM8 = 5))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm256_shldi_epi16(a: __m256i, b: __m256i) -> __m256i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i16; transmute(vpshldvw256( a.as_i16x16(), b.as_i16x16(), - _mm256_set1_epi16(imm8 as i16).as_i16x16(), + _mm256_set1_epi16(imm8).as_i16x16(), )) } @@ -1327,20 +1353,20 @@ pub unsafe fn _mm256_shldi_epi16(a: __m256i, b: __m256i, imm8: i32) -> __m256i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shldi_epi16&expand=5037) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm256_mask_shldi_epi16( +#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm256_mask_shldi_epi16( src: __m256i, k: __mmask16, a: __m256i, b: __m256i, - imm8: i32, ) -> __m256i { - assert!(imm8 >= 0 && imm8 <= 255); + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i16; let shf: i16x16 = vpshldvw256( a.as_i16x16(), b.as_i16x16(), - _mm256_set1_epi16(imm8 as i16).as_i16x16(), + _mm256_set1_epi16(imm8).as_i16x16(), ); transmute(simd_select_bitmask(k, shf, src.as_i16x16())) } @@ -1350,13 +1376,19 @@ pub unsafe fn _mm256_mask_shldi_epi16( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shldi_epi16&expand=5038) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm256_maskz_shldi_epi16(k: __mmask16, a: __m256i, b: __m256i, imm8: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm256_maskz_shldi_epi16( + k: __mmask16, + a: __m256i, + b: __m256i, +) -> __m256i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i16; let shf: i16x16 = vpshldvw256( a.as_i16x16(), 
b.as_i16x16(), - _mm256_set1_epi16(imm8 as i16).as_i16x16(), + _mm256_set1_epi16(imm8).as_i16x16(), ); let zero = _mm256_setzero_si256().as_i16x16(); transmute(simd_select_bitmask(k, shf, zero)) @@ -1367,13 +1399,15 @@ pub unsafe fn _mm256_maskz_shldi_epi16(k: __mmask16, a: __m256i, b: __m256i, imm /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shldi_epi16&expand=5036) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_shldi_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i { +#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_shldi_epi16(a: __m128i, b: __m128i) -> __m128i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i16; transmute(vpshldvw128( a.as_i16x8(), b.as_i16x8(), - _mm_set1_epi16(imm8 as i16).as_i16x8(), + _mm_set1_epi16(imm8).as_i16x8(), )) } @@ -1382,20 +1416,17 @@ pub unsafe fn _mm_shldi_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shldi_epi16&expand=5034) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_shldi_epi16( +#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_shldi_epi16( src: __m128i, k: __mmask8, a: __m128i, b: __m128i, - imm8: i32, ) -> __m128i { - let shf: i16x8 = vpshldvw128( - a.as_i16x8(), - b.as_i16x8(), - _mm_set1_epi16(imm8 as i16).as_i16x8(), - ); + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i16; + let shf: i16x8 = vpshldvw128(a.as_i16x8(), b.as_i16x8(), _mm_set1_epi16(imm8).as_i16x8()); transmute(simd_select_bitmask(k, shf, src.as_i16x8())) } @@ -1404,14 +1435,16 @@ pub unsafe fn _mm_mask_shldi_epi16( /// [Intel's 
documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shldi_epi16&expand=5035) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_shldi_epi16(k: __mmask8, a: __m128i, b: __m128i, imm8: i32) -> __m128i { - let shf: i16x8 = vpshldvw128( - a.as_i16x8(), - b.as_i16x8(), - _mm_set1_epi16(imm8 as i16).as_i16x8(), - ); +#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_shldi_epi16( + k: __mmask8, + a: __m128i, + b: __m128i, +) -> __m128i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i16; + let shf: i16x8 = vpshldvw128(a.as_i16x8(), b.as_i16x8(), _mm_set1_epi16(imm8).as_i16x8()); let zero = _mm_setzero_si128().as_i16x8(); transmute(simd_select_bitmask(k, shf, zero)) } @@ -1421,14 +1454,15 @@ pub unsafe fn _mm_maskz_shldi_epi16(k: __mmask8, a: __m128i, b: __m128i, imm8: i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shrdi_epi64&expand=5114) #[inline] #[target_feature(enable = "avx512vbmi2")] -#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] //should be vpshrdq -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_shrdi_epi64(a: __m512i, b: __m512i, imm8: i32) -> __m512i { - assert!(imm8 >= 0 && imm8 <= 255); +#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_shrdi_epi64(a: __m512i, b: __m512i) -> __m512i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i64; transmute(vpshrdvq( a.as_i64x8(), b.as_i64x8(), - _mm512_set1_epi64(imm8 as i64).as_i64x8(), + _mm512_set1_epi64(imm8).as_i64x8(), )) } @@ -1437,20 +1471,20 @@ pub unsafe fn _mm512_shrdi_epi64(a: __m512i, b: __m512i, imm8: i32) -> __m512i { /// [Intel's 
documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shrdi_epi64&expand=5112) #[inline] #[target_feature(enable = "avx512vbmi2")] -#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] //should be vpshrdq -#[rustc_args_required_const(4)] -pub unsafe fn _mm512_mask_shrdi_epi64( +#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_shrdi_epi64( src: __m512i, k: __mmask8, a: __m512i, b: __m512i, - imm8: i32, ) -> __m512i { - assert!(imm8 >= 0 && imm8 <= 255); + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i64; let shf: i64x8 = vpshrdvq( a.as_i64x8(), b.as_i64x8(), - _mm512_set1_epi64(imm8 as i64).as_i64x8(), + _mm512_set1_epi64(imm8).as_i64x8(), ); transmute(simd_select_bitmask(k, shf, src.as_i64x8())) } @@ -1460,14 +1494,19 @@ pub unsafe fn _mm512_mask_shrdi_epi64( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shrdi_epi64&expand=5113) #[inline] #[target_feature(enable = "avx512vbmi2")] -#[cfg_attr(test, assert_instr(vpshldq, imm8 = 255))] //should be vpshrdq -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_maskz_shrdi_epi64(k: __mmask8, a: __m512i, b: __m512i, imm8: i32) -> __m512i { - assert!(imm8 >= 0 && imm8 <= 255); +#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 255))] //should be vpshrdq +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_maskz_shrdi_epi64( + k: __mmask8, + a: __m512i, + b: __m512i, +) -> __m512i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i64; let shf: i64x8 = vpshrdvq( a.as_i64x8(), b.as_i64x8(), - _mm512_set1_epi64(imm8 as i64).as_i64x8(), + _mm512_set1_epi64(imm8).as_i64x8(), ); let zero = _mm512_setzero_si512().as_i64x8(); transmute(simd_select_bitmask(k, shf, zero)) @@ -1478,14 +1517,15 @@ pub unsafe fn _mm512_maskz_shrdi_epi64(k: __mmask8, a: __m512i, b: __m512i, imm8 /// [Intel's 
documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shrdi_epi64&expand=5111) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] //should be vpshrdq -#[rustc_args_required_const(2)] -pub unsafe fn _mm256_shrdi_epi64(a: __m256i, b: __m256i, imm8: i32) -> __m256i { - assert!(imm8 >= 0 && imm8 <= 255); +#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm256_shrdi_epi64(a: __m256i, b: __m256i) -> __m256i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i64; transmute(vpshrdvq256( a.as_i64x4(), b.as_i64x4(), - _mm256_set1_epi64x(imm8 as i64).as_i64x4(), + _mm256_set1_epi64x(imm8).as_i64x4(), )) } @@ -1494,20 +1534,20 @@ pub unsafe fn _mm256_shrdi_epi64(a: __m256i, b: __m256i, imm8: i32) -> __m256i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shrdi_epi64&expand=5109) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] //should be vpshrdq -#[rustc_args_required_const(4)] -pub unsafe fn _mm256_mask_shrdi_epi64( +#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm256_mask_shrdi_epi64( src: __m256i, k: __mmask8, a: __m256i, b: __m256i, - imm8: i32, ) -> __m256i { - assert!(imm8 >= 0 && imm8 <= 255); + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i64; let shf: i64x4 = vpshrdvq256( a.as_i64x4(), b.as_i64x4(), - _mm256_set1_epi64x(imm8 as i64).as_i64x4(), + _mm256_set1_epi64x(imm8).as_i64x4(), ); transmute(simd_select_bitmask(k, shf, src.as_i64x4())) } @@ -1517,14 +1557,19 @@ pub unsafe fn _mm256_mask_shrdi_epi64( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shrdi_epi64&expand=5110) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] 
-#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] //should be vpshrdq -#[rustc_args_required_const(3)] -pub unsafe fn _mm256_maskz_shrdi_epi64(k: __mmask8, a: __m256i, b: __m256i, imm8: i32) -> __m256i { - assert!(imm8 >= 0 && imm8 <= 255); +#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm256_maskz_shrdi_epi64( + k: __mmask8, + a: __m256i, + b: __m256i, +) -> __m256i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i64; let shf: i64x4 = vpshrdvq256( a.as_i64x4(), b.as_i64x4(), - _mm256_set1_epi64x(imm8 as i64).as_i64x4(), + _mm256_set1_epi64x(imm8).as_i64x4(), ); let zero = _mm256_setzero_si256().as_i64x4(); transmute(simd_select_bitmask(k, shf, zero)) @@ -1535,14 +1580,15 @@ pub unsafe fn _mm256_maskz_shrdi_epi64(k: __mmask8, a: __m256i, b: __m256i, imm8 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shrdi_epi64&expand=5108) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] //should be vpshrdq -#[rustc_args_required_const(2)] -pub unsafe fn _mm_shrdi_epi64(a: __m128i, b: __m128i, imm8: i32) -> __m128i { - assert!(imm8 >= 0 && imm8 <= 255); +#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_shrdi_epi64(a: __m128i, b: __m128i) -> __m128i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i64; transmute(vpshrdvq128( a.as_i64x2(), b.as_i64x2(), - _mm_set1_epi64x(imm8 as i64).as_i64x2(), + _mm_set1_epi64x(imm8).as_i64x2(), )) } @@ -1551,21 +1597,17 @@ pub unsafe fn _mm_shrdi_epi64(a: __m128i, b: __m128i, imm8: i32) -> __m128i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shrdi_epi64&expand=5106) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] //should be vpshrdq 
-#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_shrdi_epi64( +#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_shrdi_epi64( src: __m128i, k: __mmask8, a: __m128i, b: __m128i, - imm8: i32, ) -> __m128i { - assert!(imm8 >= 0 && imm8 <= 255); - let shf: i64x2 = vpshrdvq128( - a.as_i64x2(), - b.as_i64x2(), - _mm_set1_epi64x(imm8 as i64).as_i64x2(), - ); + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i64; + let shf: i64x2 = vpshrdvq128(a.as_i64x2(), b.as_i64x2(), _mm_set1_epi64x(imm8).as_i64x2()); transmute(simd_select_bitmask(k, shf, src.as_i64x2())) } @@ -1574,15 +1616,16 @@ pub unsafe fn _mm_mask_shrdi_epi64( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shrdi_epi64&expand=5107) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] //should be vpshrdq -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_shrdi_epi64(k: __mmask8, a: __m128i, b: __m128i, imm8: i32) -> __m128i { - assert!(imm8 >= 0 && imm8 <= 255); - let shf: i64x2 = vpshrdvq128( - a.as_i64x2(), - b.as_i64x2(), - _mm_set1_epi64x(imm8 as i64).as_i64x2(), - ); +#[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_shrdi_epi64( + k: __mmask8, + a: __m128i, + b: __m128i, +) -> __m128i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i64; + let shf: i64x2 = vpshrdvq128(a.as_i64x2(), b.as_i64x2(), _mm_set1_epi64x(imm8).as_i64x2()); let zero = _mm_setzero_si128().as_i64x2(); transmute(simd_select_bitmask(k, shf, zero)) } @@ -1592,14 +1635,14 @@ pub unsafe fn _mm_maskz_shrdi_epi64(k: __mmask8, a: __m128i, b: __m128i, imm8: i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shrdi_epi32&expand=5105) #[inline] #[target_feature(enable = "avx512vbmi2")] -#[cfg_attr(test, 
assert_instr(vpshldd, imm8 = 5))] //should be vpshldd -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_shrdi_epi32(a: __m512i, b: __m512i, imm8: i32) -> __m512i { - assert!(imm8 >= 0 && imm8 <= 255); +#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshldd +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_shrdi_epi32(a: __m512i, b: __m512i) -> __m512i { + static_assert_imm8!(IMM8); transmute(vpshrdvd( a.as_i32x16(), b.as_i32x16(), - _mm512_set1_epi32(imm8).as_i32x16(), + _mm512_set1_epi32(IMM8).as_i32x16(), )) } @@ -1608,20 +1651,19 @@ pub unsafe fn _mm512_shrdi_epi32(a: __m512i, b: __m512i, imm8: i32) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shrdi_epi32&expand=5103) #[inline] #[target_feature(enable = "avx512vbmi2")] -#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] //should be vpshldd -#[rustc_args_required_const(4)] -pub unsafe fn _mm512_mask_shrdi_epi32( +#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshldd +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_shrdi_epi32( src: __m512i, k: __mmask16, a: __m512i, b: __m512i, - imm8: i32, ) -> __m512i { - assert!(imm8 >= 0 && imm8 <= 255); + static_assert_imm8!(IMM8); let shf: i32x16 = vpshrdvd( a.as_i32x16(), b.as_i32x16(), - _mm512_set1_epi32(imm8).as_i32x16(), + _mm512_set1_epi32(IMM8).as_i32x16(), ); transmute(simd_select_bitmask(k, shf, src.as_i32x16())) } @@ -1631,14 +1673,18 @@ pub unsafe fn _mm512_mask_shrdi_epi32( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shrdi_epi32&expand=5104) #[inline] #[target_feature(enable = "avx512vbmi2")] -#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] //should be vpshldd -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_maskz_shrdi_epi32(k: __mmask16, a: __m512i, b: __m512i, imm8: i32) -> __m512i { - assert!(imm8 >= 0 && imm8 <= 255); +#[cfg_attr(test, 
assert_instr(vpshldd, IMM8 = 5))] //should be vpshldd +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_maskz_shrdi_epi32( + k: __mmask16, + a: __m512i, + b: __m512i, +) -> __m512i { + static_assert_imm8!(IMM8); let shf: i32x16 = vpshrdvd( a.as_i32x16(), b.as_i32x16(), - _mm512_set1_epi32(imm8).as_i32x16(), + _mm512_set1_epi32(IMM8).as_i32x16(), ); let zero = _mm512_setzero_si512().as_i32x16(); transmute(simd_select_bitmask(k, shf, zero)) @@ -1649,14 +1695,14 @@ pub unsafe fn _mm512_maskz_shrdi_epi32(k: __mmask16, a: __m512i, b: __m512i, imm /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shrdi_epi32&expand=5102) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] //should be vpshldd -#[rustc_args_required_const(2)] -pub unsafe fn _mm256_shrdi_epi32(a: __m256i, b: __m256i, imm8: i32) -> __m256i { - assert!(imm8 >= 0 && imm8 <= 255); +#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshldd +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm256_shrdi_epi32(a: __m256i, b: __m256i) -> __m256i { + static_assert_imm8!(IMM8); transmute(vpshrdvd256( a.as_i32x8(), b.as_i32x8(), - _mm256_set1_epi32(imm8).as_i32x8(), + _mm256_set1_epi32(IMM8).as_i32x8(), )) } @@ -1665,20 +1711,19 @@ pub unsafe fn _mm256_shrdi_epi32(a: __m256i, b: __m256i, imm8: i32) -> __m256i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shrdi_epi32&expand=5100) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] //should be vpshldd -#[rustc_args_required_const(4)] -pub unsafe fn _mm256_mask_shrdi_epi32( +#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshldd +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm256_mask_shrdi_epi32( src: __m256i, k: __mmask8, a: __m256i, b: __m256i, - imm8: i32, ) -> __m256i { - assert!(imm8 >= 0 && 
imm8 <= 255); + static_assert_imm8!(IMM8); let shf: i32x8 = vpshrdvd256( a.as_i32x8(), b.as_i32x8(), - _mm256_set1_epi32(imm8).as_i32x8(), + _mm256_set1_epi32(IMM8).as_i32x8(), ); transmute(simd_select_bitmask(k, shf, src.as_i32x8())) } @@ -1688,14 +1733,18 @@ pub unsafe fn _mm256_mask_shrdi_epi32( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shrdi_epi32&expand=5101) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] //should be vpshldd -#[rustc_args_required_const(3)] -pub unsafe fn _mm256_maskz_shrdi_epi32(k: __mmask8, a: __m256i, b: __m256i, imm8: i32) -> __m256i { - assert!(imm8 >= 0 && imm8 <= 255); +#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshldd +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm256_maskz_shrdi_epi32( + k: __mmask8, + a: __m256i, + b: __m256i, +) -> __m256i { + static_assert_imm8!(IMM8); let shf: i32x8 = vpshrdvd256( a.as_i32x8(), b.as_i32x8(), - _mm256_set1_epi32(imm8).as_i32x8(), + _mm256_set1_epi32(IMM8).as_i32x8(), ); let zero = _mm256_setzero_si256().as_i32x8(); transmute(simd_select_bitmask(k, shf, zero)) @@ -1706,14 +1755,14 @@ pub unsafe fn _mm256_maskz_shrdi_epi32(k: __mmask8, a: __m256i, b: __m256i, imm8 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shrdi_epi32&expand=5099) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] //should be vpshldd -#[rustc_args_required_const(2)] -pub unsafe fn _mm_shrdi_epi32(a: __m128i, b: __m128i, imm8: i32) -> __m128i { - assert!(imm8 >= 0 && imm8 <= 255); +#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshldd +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_shrdi_epi32(a: __m128i, b: __m128i) -> __m128i { + static_assert_imm8!(IMM8); transmute(vpshrdvd128( a.as_i32x4(), b.as_i32x4(), - 
_mm_set1_epi32(imm8).as_i32x4(), + _mm_set1_epi32(IMM8).as_i32x4(), )) } @@ -1722,17 +1771,16 @@ pub unsafe fn _mm_shrdi_epi32(a: __m128i, b: __m128i, imm8: i32) -> __m128i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shrdi_epi32&expand=5097) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] //should be vpshldd -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_shrdi_epi32( +#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshldd +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_shrdi_epi32( src: __m128i, k: __mmask8, a: __m128i, b: __m128i, - imm8: i32, ) -> __m128i { - assert!(imm8 >= 0 && imm8 <= 255); - let shf: i32x4 = vpshrdvd128(a.as_i32x4(), b.as_i32x4(), _mm_set1_epi32(imm8).as_i32x4()); + static_assert_imm8!(IMM8); + let shf: i32x4 = vpshrdvd128(a.as_i32x4(), b.as_i32x4(), _mm_set1_epi32(IMM8).as_i32x4()); transmute(simd_select_bitmask(k, shf, src.as_i32x4())) } @@ -1741,11 +1789,15 @@ pub unsafe fn _mm_mask_shrdi_epi32( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shrdi_epi32&expand=5098) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] //should be vpshldd -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_shrdi_epi32(k: __mmask8, a: __m128i, b: __m128i, imm8: i32) -> __m128i { - assert!(imm8 >= 0 && imm8 <= 255); - let shf: i32x4 = vpshrdvd128(a.as_i32x4(), b.as_i32x4(), _mm_set1_epi32(imm8).as_i32x4()); +#[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshldd +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_shrdi_epi32( + k: __mmask8, + a: __m128i, + b: __m128i, +) -> __m128i { + static_assert_imm8!(IMM8); + let shf: i32x4 = vpshrdvd128(a.as_i32x4(), b.as_i32x4(), _mm_set1_epi32(IMM8).as_i32x4()); let zero = _mm_setzero_si128().as_i32x4(); 
transmute(simd_select_bitmask(k, shf, zero)) } @@ -1755,14 +1807,16 @@ pub unsafe fn _mm_maskz_shrdi_epi32(k: __mmask8, a: __m128i, b: __m128i, imm8: i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shrdi_epi16&expand=5096) #[inline] #[target_feature(enable = "avx512vbmi2")] -#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] //should be vpshrdw -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_shrdi_epi16(a: __m512i, b: __m512i, imm8: i32) -> __m512i { +#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_shrdi_epi16(a: __m512i, b: __m512i) -> __m512i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i16; assert!(imm8 >= 0 && imm8 <= 255); transmute(vpshrdvw( a.as_i16x32(), b.as_i16x32(), - _mm512_set1_epi16(imm8 as i16).as_i16x32(), + _mm512_set1_epi16(imm8).as_i16x32(), )) } @@ -1771,20 +1825,21 @@ pub unsafe fn _mm512_shrdi_epi16(a: __m512i, b: __m512i, imm8: i32) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shrdi_epi16&expand=5094) #[inline] #[target_feature(enable = "avx512vbmi2")] -#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] //should be vpshrdw -#[rustc_args_required_const(4)] -pub unsafe fn _mm512_mask_shrdi_epi16( +#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_shrdi_epi16( src: __m512i, k: __mmask32, a: __m512i, b: __m512i, - imm8: i32, ) -> __m512i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i16; assert!(imm8 >= 0 && imm8 <= 255); let shf: i16x32 = vpshrdvw( a.as_i16x32(), b.as_i16x32(), - _mm512_set1_epi16(imm8 as i16).as_i16x32(), + _mm512_set1_epi16(imm8).as_i16x32(), ); transmute(simd_select_bitmask(k, shf, src.as_i16x32())) } @@ -1794,14 +1849,20 @@ pub unsafe fn _mm512_mask_shrdi_epi16( /// [Intel's 
documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shrdi_epi16&expand=5095) #[inline] #[target_feature(enable = "avx512vbmi2")] -#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] //should be vpshrdw -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_maskz_shrdi_epi16(k: __mmask32, a: __m512i, b: __m512i, imm8: i32) -> __m512i { +#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_maskz_shrdi_epi16( + k: __mmask32, + a: __m512i, + b: __m512i, +) -> __m512i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i16; assert!(imm8 >= 0 && imm8 <= 255); let shf: i16x32 = vpshrdvw( a.as_i16x32(), b.as_i16x32(), - _mm512_set1_epi16(imm8 as i16).as_i16x32(), + _mm512_set1_epi16(imm8).as_i16x32(), ); let zero = _mm512_setzero_si512().as_i16x32(); transmute(simd_select_bitmask(k, shf, zero)) @@ -1812,14 +1873,16 @@ pub unsafe fn _mm512_maskz_shrdi_epi16(k: __mmask32, a: __m512i, b: __m512i, imm /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shrdi_epi16&expand=5093) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] //should be vpshrdw -#[rustc_args_required_const(2)] -pub unsafe fn _mm256_shrdi_epi16(a: __m256i, b: __m256i, imm8: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm256_shrdi_epi16(a: __m256i, b: __m256i) -> __m256i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i16; assert!(imm8 >= 0 && imm8 <= 255); transmute(vpshrdvw256( a.as_i16x16(), b.as_i16x16(), - _mm256_set1_epi16(imm8 as i16).as_i16x16(), + _mm256_set1_epi16(imm8).as_i16x16(), )) } @@ -1828,20 +1891,21 @@ pub unsafe fn _mm256_shrdi_epi16(a: __m256i, b: __m256i, imm8: i32) -> __m256i { /// [Intel's 
documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shrdi_epi16&expand=5091) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] //should be vpshrdw -#[rustc_args_required_const(4)] -pub unsafe fn _mm256_mask_shrdi_epi16( +#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm256_mask_shrdi_epi16( src: __m256i, k: __mmask16, a: __m256i, b: __m256i, - imm8: i32, ) -> __m256i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i16; assert!(imm8 >= 0 && imm8 <= 255); let shf: i16x16 = vpshrdvw256( a.as_i16x16(), b.as_i16x16(), - _mm256_set1_epi16(imm8 as i16).as_i16x16(), + _mm256_set1_epi16(imm8).as_i16x16(), ); transmute(simd_select_bitmask(k, shf, src.as_i16x16())) } @@ -1851,13 +1915,19 @@ pub unsafe fn _mm256_mask_shrdi_epi16( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shrdi_epi16&expand=5092) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] //should be vpshrdw -#[rustc_args_required_const(3)] -pub unsafe fn _mm256_maskz_shrdi_epi16(k: __mmask16, a: __m256i, b: __m256i, imm8: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm256_maskz_shrdi_epi16( + k: __mmask16, + a: __m256i, + b: __m256i, +) -> __m256i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i16; let shf: i16x16 = vpshrdvw256( a.as_i16x16(), b.as_i16x16(), - _mm256_set1_epi16(imm8 as i16).as_i16x16(), + _mm256_set1_epi16(imm8).as_i16x16(), ); let zero = _mm256_setzero_si256().as_i16x16(); transmute(simd_select_bitmask(k, shf, zero)) @@ -1868,13 +1938,15 @@ pub unsafe fn _mm256_maskz_shrdi_epi16(k: __mmask16, a: __m256i, b: __m256i, imm /// [Intel's 
documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shrdi_epi16&expand=5090) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] //should be vpshrdw -#[rustc_args_required_const(2)] -pub unsafe fn _mm_shrdi_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i { +#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_shrdi_epi16(a: __m128i, b: __m128i) -> __m128i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i16; transmute(vpshrdvw128( a.as_i16x8(), b.as_i16x8(), - _mm_set1_epi16(imm8 as i16).as_i16x8(), + _mm_set1_epi16(imm8).as_i16x8(), )) } @@ -1883,20 +1955,17 @@ pub unsafe fn _mm_shrdi_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shrdi_epi16&expand=5088) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] //should be vpshrdw -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_shrdi_epi16( +#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm_mask_shrdi_epi16( src: __m128i, k: __mmask8, a: __m128i, b: __m128i, - imm8: i32, ) -> __m128i { - let shf: i16x8 = vpshrdvw128( - a.as_i16x8(), - b.as_i16x8(), - _mm_set1_epi16(imm8 as i16).as_i16x8(), - ); + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i16; + let shf: i16x8 = vpshrdvw128(a.as_i16x8(), b.as_i16x8(), _mm_set1_epi16(imm8).as_i16x8()); transmute(simd_select_bitmask(k, shf, src.as_i16x8())) } @@ -1905,14 +1974,16 @@ pub unsafe fn _mm_mask_shrdi_epi16( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shrdi_epi16&expand=5089) #[inline] #[target_feature(enable = "avx512vbmi2,avx512vl")] -#[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] 
//should be vpshrdw -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_shrdi_epi16(k: __mmask8, a: __m128i, b: __m128i, imm8: i32) -> __m128i { - let shf: i16x8 = vpshrdvw128( - a.as_i16x8(), - b.as_i16x8(), - _mm_set1_epi16(imm8 as i16).as_i16x8(), - ); +#[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_maskz_shrdi_epi16( + k: __mmask8, + a: __m128i, + b: __m128i, +) -> __m128i { + static_assert_imm8!(IMM8); + let imm8 = IMM8 as i16; + let shf: i16x8 = vpshrdvw128(a.as_i16x8(), b.as_i16x8(), _mm_set1_epi16(imm8).as_i16x8()); let zero = _mm_setzero_si128().as_i16x8(); transmute(simd_select_bitmask(k, shf, zero)) } @@ -2921,7 +2992,7 @@ mod tests { unsafe fn test_mm512_shldi_epi64() { let a = _mm512_set1_epi64(1); let b = _mm512_set1_epi64(1 << 63); - let r = _mm512_shldi_epi64(a, b, 2); + let r = _mm512_shldi_epi64::<2>(a, b); let e = _mm512_set1_epi64(6); assert_eq_m512i(r, e); } @@ -2930,9 +3001,9 @@ mod tests { unsafe fn test_mm512_mask_shldi_epi64() { let a = _mm512_set1_epi64(1); let b = _mm512_set1_epi64(1 << 63); - let r = _mm512_mask_shldi_epi64(a, 0, a, b, 2); + let r = _mm512_mask_shldi_epi64::<2>(a, 0, a, b); assert_eq_m512i(r, a); - let r = _mm512_mask_shldi_epi64(a, 0b11111111, a, b, 2); + let r = _mm512_mask_shldi_epi64::<2>(a, 0b11111111, a, b); let e = _mm512_set1_epi64(6); assert_eq_m512i(r, e); } @@ -2941,9 +3012,9 @@ mod tests { unsafe fn test_mm512_maskz_shldi_epi64() { let a = _mm512_set1_epi64(1); let b = _mm512_set1_epi64(1 << 63); - let r = _mm512_maskz_shldi_epi64(0, a, b, 2); + let r = _mm512_maskz_shldi_epi64::<2>(0, a, b); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_shldi_epi64(0b11111111, a, b, 2); + let r = _mm512_maskz_shldi_epi64::<2>(0b11111111, a, b); let e = _mm512_set1_epi64(6); assert_eq_m512i(r, e); } @@ -2952,7 +3023,7 @@ mod tests { unsafe fn test_mm256_shldi_epi64() { let a = _mm256_set1_epi64x(1); let b = 
_mm256_set1_epi64x(1 << 63); - let r = _mm256_shldi_epi64(a, b, 2); + let r = _mm256_shldi_epi64::<2>(a, b); let e = _mm256_set1_epi64x(6); assert_eq_m256i(r, e); } @@ -2961,9 +3032,9 @@ mod tests { unsafe fn test_mm256_mask_shldi_epi64() { let a = _mm256_set1_epi64x(1); let b = _mm256_set1_epi64x(1 << 63); - let r = _mm256_mask_shldi_epi64(a, 0, a, b, 2); + let r = _mm256_mask_shldi_epi64::<2>(a, 0, a, b); assert_eq_m256i(r, a); - let r = _mm256_mask_shldi_epi64(a, 0b00001111, a, b, 2); + let r = _mm256_mask_shldi_epi64::<2>(a, 0b00001111, a, b); let e = _mm256_set1_epi64x(6); assert_eq_m256i(r, e); } @@ -2972,9 +3043,9 @@ mod tests { unsafe fn test_mm256_maskz_shldi_epi64() { let a = _mm256_set1_epi64x(1); let b = _mm256_set1_epi64x(1 << 63); - let r = _mm256_maskz_shldi_epi64(0, a, b, 2); + let r = _mm256_maskz_shldi_epi64::<2>(0, a, b); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_shldi_epi64(0b00001111, a, b, 2); + let r = _mm256_maskz_shldi_epi64::<2>(0b00001111, a, b); let e = _mm256_set1_epi64x(6); assert_eq_m256i(r, e); } @@ -2983,7 +3054,7 @@ mod tests { unsafe fn test_mm_shldi_epi64() { let a = _mm_set1_epi64x(1); let b = _mm_set1_epi64x(1 << 63); - let r = _mm_shldi_epi64(a, b, 2); + let r = _mm_shldi_epi64::<2>(a, b); let e = _mm_set1_epi64x(6); assert_eq_m128i(r, e); } @@ -2992,9 +3063,9 @@ mod tests { unsafe fn test_mm_mask_shldi_epi64() { let a = _mm_set1_epi64x(1); let b = _mm_set1_epi64x(1 << 63); - let r = _mm_mask_shldi_epi64(a, 0, a, b, 2); + let r = _mm_mask_shldi_epi64::<2>(a, 0, a, b); assert_eq_m128i(r, a); - let r = _mm_mask_shldi_epi64(a, 0b00000011, a, b, 2); + let r = _mm_mask_shldi_epi64::<2>(a, 0b00000011, a, b); let e = _mm_set1_epi64x(6); assert_eq_m128i(r, e); } @@ -3003,9 +3074,9 @@ mod tests { unsafe fn test_mm_maskz_shldi_epi64() { let a = _mm_set1_epi64x(1); let b = _mm_set1_epi64x(1 << 63); - let r = _mm_maskz_shldi_epi64(0, a, b, 2); + let r = _mm_maskz_shldi_epi64::<2>(0, a, b); assert_eq_m128i(r, 
_mm_setzero_si128()); - let r = _mm_maskz_shldi_epi64(0b00000011, a, b, 2); + let r = _mm_maskz_shldi_epi64::<2>(0b00000011, a, b); let e = _mm_set1_epi64x(6); assert_eq_m128i(r, e); } @@ -3014,7 +3085,7 @@ mod tests { unsafe fn test_mm512_shldi_epi32() { let a = _mm512_set1_epi32(1); let b = _mm512_set1_epi32(1 << 31); - let r = _mm512_shldi_epi32(a, b, 2); + let r = _mm512_shldi_epi32::<2>(a, b); let e = _mm512_set1_epi32(6); assert_eq_m512i(r, e); } @@ -3023,9 +3094,9 @@ mod tests { unsafe fn test_mm512_mask_shldi_epi32() { let a = _mm512_set1_epi32(1); let b = _mm512_set1_epi32(1 << 31); - let r = _mm512_mask_shldi_epi32(a, 0, a, b, 2); + let r = _mm512_mask_shldi_epi32::<2>(a, 0, a, b); assert_eq_m512i(r, a); - let r = _mm512_mask_shldi_epi32(a, 0b11111111_11111111, a, b, 2); + let r = _mm512_mask_shldi_epi32::<2>(a, 0b11111111_11111111, a, b); let e = _mm512_set1_epi32(6); assert_eq_m512i(r, e); } @@ -3034,9 +3105,9 @@ mod tests { unsafe fn test_mm512_maskz_shldi_epi32() { let a = _mm512_set1_epi32(1); let b = _mm512_set1_epi32(1 << 31); - let r = _mm512_maskz_shldi_epi32(0, a, b, 2); + let r = _mm512_maskz_shldi_epi32::<2>(0, a, b); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_shldi_epi32(0b11111111_11111111, a, b, 2); + let r = _mm512_maskz_shldi_epi32::<2>(0b11111111_11111111, a, b); let e = _mm512_set1_epi32(6); assert_eq_m512i(r, e); } @@ -3045,7 +3116,7 @@ mod tests { unsafe fn test_mm256_shldi_epi32() { let a = _mm256_set1_epi32(1); let b = _mm256_set1_epi32(1 << 31); - let r = _mm256_shldi_epi32(a, b, 2); + let r = _mm256_shldi_epi32::<2>(a, b); let e = _mm256_set1_epi32(6); assert_eq_m256i(r, e); } @@ -3054,9 +3125,9 @@ mod tests { unsafe fn test_mm256_mask_shldi_epi32() { let a = _mm256_set1_epi32(1); let b = _mm256_set1_epi32(1 << 31); - let r = _mm256_mask_shldi_epi32(a, 0, a, b, 2); + let r = _mm256_mask_shldi_epi32::<2>(a, 0, a, b); assert_eq_m256i(r, a); - let r = _mm256_mask_shldi_epi32(a, 0b11111111, a, b, 2); + let r = 
_mm256_mask_shldi_epi32::<2>(a, 0b11111111, a, b); let e = _mm256_set1_epi32(6); assert_eq_m256i(r, e); } @@ -3065,9 +3136,9 @@ mod tests { unsafe fn test_mm256_maskz_shldi_epi32() { let a = _mm256_set1_epi32(1); let b = _mm256_set1_epi32(1 << 31); - let r = _mm256_maskz_shldi_epi32(0, a, b, 2); + let r = _mm256_maskz_shldi_epi32::<2>(0, a, b); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_shldi_epi32(0b11111111, a, b, 2); + let r = _mm256_maskz_shldi_epi32::<2>(0b11111111, a, b); let e = _mm256_set1_epi32(6); assert_eq_m256i(r, e); } @@ -3076,7 +3147,7 @@ mod tests { unsafe fn test_mm_shldi_epi32() { let a = _mm_set1_epi32(1); let b = _mm_set1_epi32(1 << 31); - let r = _mm_shldi_epi32(a, b, 2); + let r = _mm_shldi_epi32::<2>(a, b); let e = _mm_set1_epi32(6); assert_eq_m128i(r, e); } @@ -3085,9 +3156,9 @@ mod tests { unsafe fn test_mm_mask_shldi_epi32() { let a = _mm_set1_epi32(1); let b = _mm_set1_epi32(1 << 31); - let r = _mm_mask_shldi_epi32(a, 0, a, b, 2); + let r = _mm_mask_shldi_epi32::<2>(a, 0, a, b); assert_eq_m128i(r, a); - let r = _mm_mask_shldi_epi32(a, 0b00001111, a, b, 2); + let r = _mm_mask_shldi_epi32::<2>(a, 0b00001111, a, b); let e = _mm_set1_epi32(6); assert_eq_m128i(r, e); } @@ -3096,9 +3167,9 @@ mod tests { unsafe fn test_mm_maskz_shldi_epi32() { let a = _mm_set1_epi32(1); let b = _mm_set1_epi32(1 << 31); - let r = _mm_maskz_shldi_epi32(0, a, b, 2); + let r = _mm_maskz_shldi_epi32::<2>(0, a, b); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_shldi_epi32(0b00001111, a, b, 2); + let r = _mm_maskz_shldi_epi32::<2>(0b00001111, a, b); let e = _mm_set1_epi32(6); assert_eq_m128i(r, e); } @@ -3107,7 +3178,7 @@ mod tests { unsafe fn test_mm512_shldi_epi16() { let a = _mm512_set1_epi16(1); let b = _mm512_set1_epi16(1 << 15); - let r = _mm512_shldi_epi16(a, b, 2); + let r = _mm512_shldi_epi16::<2>(a, b); let e = _mm512_set1_epi16(6); assert_eq_m512i(r, e); } @@ -3116,9 +3187,9 @@ mod tests { unsafe fn 
test_mm512_mask_shldi_epi16() { let a = _mm512_set1_epi16(1); let b = _mm512_set1_epi16(1 << 15); - let r = _mm512_mask_shldi_epi16(a, 0, a, b, 2); + let r = _mm512_mask_shldi_epi16::<2>(a, 0, a, b); assert_eq_m512i(r, a); - let r = _mm512_mask_shldi_epi16(a, 0b11111111_11111111_11111111_11111111, a, b, 2); + let r = _mm512_mask_shldi_epi16::<2>(a, 0b11111111_11111111_11111111_11111111, a, b); let e = _mm512_set1_epi16(6); assert_eq_m512i(r, e); } @@ -3127,9 +3198,9 @@ mod tests { unsafe fn test_mm512_maskz_shldi_epi16() { let a = _mm512_set1_epi16(1); let b = _mm512_set1_epi16(1 << 15); - let r = _mm512_maskz_shldi_epi16(0, a, b, 2); + let r = _mm512_maskz_shldi_epi16::<2>(0, a, b); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_shldi_epi16(0b11111111_11111111_11111111_11111111, a, b, 2); + let r = _mm512_maskz_shldi_epi16::<2>(0b11111111_11111111_11111111_11111111, a, b); let e = _mm512_set1_epi16(6); assert_eq_m512i(r, e); } @@ -3138,7 +3209,7 @@ mod tests { unsafe fn test_mm256_shldi_epi16() { let a = _mm256_set1_epi16(1); let b = _mm256_set1_epi16(1 << 15); - let r = _mm256_shldi_epi16(a, b, 2); + let r = _mm256_shldi_epi16::<2>(a, b); let e = _mm256_set1_epi16(6); assert_eq_m256i(r, e); } @@ -3147,9 +3218,9 @@ mod tests { unsafe fn test_mm256_mask_shldi_epi16() { let a = _mm256_set1_epi16(1); let b = _mm256_set1_epi16(1 << 15); - let r = _mm256_mask_shldi_epi16(a, 0, a, b, 2); + let r = _mm256_mask_shldi_epi16::<2>(a, 0, a, b); assert_eq_m256i(r, a); - let r = _mm256_mask_shldi_epi16(a, 0b11111111_11111111, a, b, 2); + let r = _mm256_mask_shldi_epi16::<2>(a, 0b11111111_11111111, a, b); let e = _mm256_set1_epi16(6); assert_eq_m256i(r, e); } @@ -3158,9 +3229,9 @@ mod tests { unsafe fn test_mm256_maskz_shldi_epi16() { let a = _mm256_set1_epi16(1); let b = _mm256_set1_epi16(1 << 15); - let r = _mm256_maskz_shldi_epi16(0, a, b, 2); + let r = _mm256_maskz_shldi_epi16::<2>(0, a, b); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = 
_mm256_maskz_shldi_epi16(0b11111111_11111111, a, b, 2); + let r = _mm256_maskz_shldi_epi16::<2>(0b11111111_11111111, a, b); let e = _mm256_set1_epi16(6); assert_eq_m256i(r, e); } @@ -3169,7 +3240,7 @@ mod tests { unsafe fn test_mm_shldi_epi16() { let a = _mm_set1_epi16(1); let b = _mm_set1_epi16(1 << 15); - let r = _mm_shldi_epi16(a, b, 2); + let r = _mm_shldi_epi16::<2>(a, b); let e = _mm_set1_epi16(6); assert_eq_m128i(r, e); } @@ -3178,9 +3249,9 @@ mod tests { unsafe fn test_mm_mask_shldi_epi16() { let a = _mm_set1_epi16(1); let b = _mm_set1_epi16(1 << 15); - let r = _mm_mask_shldi_epi16(a, 0, a, b, 2); + let r = _mm_mask_shldi_epi16::<2>(a, 0, a, b); assert_eq_m128i(r, a); - let r = _mm_mask_shldi_epi16(a, 0b11111111, a, b, 2); + let r = _mm_mask_shldi_epi16::<2>(a, 0b11111111, a, b); let e = _mm_set1_epi16(6); assert_eq_m128i(r, e); } @@ -3189,9 +3260,9 @@ mod tests { unsafe fn test_mm_maskz_shldi_epi16() { let a = _mm_set1_epi16(1); let b = _mm_set1_epi16(1 << 15); - let r = _mm_maskz_shldi_epi16(0, a, b, 2); + let r = _mm_maskz_shldi_epi16::<2>(0, a, b); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_shldi_epi16(0b11111111, a, b, 2); + let r = _mm_maskz_shldi_epi16::<2>(0b11111111, a, b); let e = _mm_set1_epi16(6); assert_eq_m128i(r, e); } @@ -3200,7 +3271,7 @@ mod tests { unsafe fn test_mm512_shrdi_epi64() { let a = _mm512_set1_epi64(8); let b = _mm512_set1_epi64(2); - let r = _mm512_shrdi_epi64(a, b, 1); + let r = _mm512_shrdi_epi64::<1>(a, b); let e = _mm512_set1_epi64(1); assert_eq_m512i(r, e); } @@ -3209,9 +3280,9 @@ mod tests { unsafe fn test_mm512_mask_shrdi_epi64() { let a = _mm512_set1_epi64(8); let b = _mm512_set1_epi64(2); - let r = _mm512_mask_shrdi_epi64(a, 0, a, b, 1); + let r = _mm512_mask_shrdi_epi64::<1>(a, 0, a, b); assert_eq_m512i(r, a); - let r = _mm512_mask_shrdi_epi64(a, 0b11111111, a, b, 1); + let r = _mm512_mask_shrdi_epi64::<1>(a, 0b11111111, a, b); let e = _mm512_set1_epi64(1); assert_eq_m512i(r, e); } @@ -3220,9 
+3291,9 @@ mod tests { unsafe fn test_mm512_maskz_shrdi_epi64() { let a = _mm512_set1_epi64(8); let b = _mm512_set1_epi64(2); - let r = _mm512_maskz_shrdi_epi64(0, a, b, 1); + let r = _mm512_maskz_shrdi_epi64::<1>(0, a, b); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_shrdi_epi64(0b11111111, a, b, 1); + let r = _mm512_maskz_shrdi_epi64::<1>(0b11111111, a, b); let e = _mm512_set1_epi64(1); assert_eq_m512i(r, e); } @@ -3231,7 +3302,7 @@ mod tests { unsafe fn test_mm256_shrdi_epi64() { let a = _mm256_set1_epi64x(8); let b = _mm256_set1_epi64x(2); - let r = _mm256_shrdi_epi64(a, b, 1); + let r = _mm256_shrdi_epi64::<1>(a, b); let e = _mm256_set1_epi64x(1); assert_eq_m256i(r, e); } @@ -3240,9 +3311,9 @@ mod tests { unsafe fn test_mm256_mask_shrdi_epi64() { let a = _mm256_set1_epi64x(8); let b = _mm256_set1_epi64x(2); - let r = _mm256_mask_shrdi_epi64(a, 0, a, b, 1); + let r = _mm256_mask_shrdi_epi64::<1>(a, 0, a, b); assert_eq_m256i(r, a); - let r = _mm256_mask_shrdi_epi64(a, 0b00001111, a, b, 1); + let r = _mm256_mask_shrdi_epi64::<1>(a, 0b00001111, a, b); let e = _mm256_set1_epi64x(1); assert_eq_m256i(r, e); } @@ -3251,9 +3322,9 @@ mod tests { unsafe fn test_mm256_maskz_shrdi_epi64() { let a = _mm256_set1_epi64x(8); let b = _mm256_set1_epi64x(2); - let r = _mm256_maskz_shrdi_epi64(0, a, b, 1); + let r = _mm256_maskz_shrdi_epi64::<1>(0, a, b); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_shrdi_epi64(0b00001111, a, b, 1); + let r = _mm256_maskz_shrdi_epi64::<1>(0b00001111, a, b); let e = _mm256_set1_epi64x(1); assert_eq_m256i(r, e); } @@ -3262,7 +3333,7 @@ mod tests { unsafe fn test_mm_shrdi_epi64() { let a = _mm_set1_epi64x(8); let b = _mm_set1_epi64x(2); - let r = _mm_shrdi_epi64(a, b, 1); + let r = _mm_shrdi_epi64::<1>(a, b); let e = _mm_set1_epi64x(1); assert_eq_m128i(r, e); } @@ -3271,9 +3342,9 @@ mod tests { unsafe fn test_mm_mask_shrdi_epi64() { let a = _mm_set1_epi64x(8); let b = _mm_set1_epi64x(2); - let r = 
_mm_mask_shrdi_epi64(a, 0, a, b, 1); + let r = _mm_mask_shrdi_epi64::<1>(a, 0, a, b); assert_eq_m128i(r, a); - let r = _mm_mask_shrdi_epi64(a, 0b00000011, a, b, 1); + let r = _mm_mask_shrdi_epi64::<1>(a, 0b00000011, a, b); let e = _mm_set1_epi64x(1); assert_eq_m128i(r, e); } @@ -3282,9 +3353,9 @@ mod tests { unsafe fn test_mm_maskz_shrdi_epi64() { let a = _mm_set1_epi64x(8); let b = _mm_set1_epi64x(2); - let r = _mm_maskz_shrdi_epi64(0, a, b, 1); + let r = _mm_maskz_shrdi_epi64::<1>(0, a, b); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_shrdi_epi64(0b00000011, a, b, 1); + let r = _mm_maskz_shrdi_epi64::<1>(0b00000011, a, b); let e = _mm_set1_epi64x(1); assert_eq_m128i(r, e); } @@ -3293,7 +3364,7 @@ mod tests { unsafe fn test_mm512_shrdi_epi32() { let a = _mm512_set1_epi32(8); let b = _mm512_set1_epi32(2); - let r = _mm512_shrdi_epi32(a, b, 1); + let r = _mm512_shrdi_epi32::<1>(a, b); let e = _mm512_set1_epi32(1); assert_eq_m512i(r, e); } @@ -3302,9 +3373,9 @@ mod tests { unsafe fn test_mm512_mask_shrdi_epi32() { let a = _mm512_set1_epi32(8); let b = _mm512_set1_epi32(2); - let r = _mm512_mask_shrdi_epi32(a, 0, a, b, 1); + let r = _mm512_mask_shrdi_epi32::<1>(a, 0, a, b); assert_eq_m512i(r, a); - let r = _mm512_mask_shrdi_epi32(a, 0b11111111_11111111, a, b, 1); + let r = _mm512_mask_shrdi_epi32::<1>(a, 0b11111111_11111111, a, b); let e = _mm512_set1_epi32(1); assert_eq_m512i(r, e); } @@ -3313,9 +3384,9 @@ mod tests { unsafe fn test_mm512_maskz_shrdi_epi32() { let a = _mm512_set1_epi32(8); let b = _mm512_set1_epi32(2); - let r = _mm512_maskz_shrdi_epi32(0, a, b, 1); + let r = _mm512_maskz_shrdi_epi32::<1>(0, a, b); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_shrdi_epi32(0b11111111_11111111, a, b, 1); + let r = _mm512_maskz_shrdi_epi32::<1>(0b11111111_11111111, a, b); let e = _mm512_set1_epi32(1); assert_eq_m512i(r, e); } @@ -3324,7 +3395,7 @@ mod tests { unsafe fn test_mm256_shrdi_epi32() { let a = _mm256_set1_epi32(8); let b = 
_mm256_set1_epi32(2); - let r = _mm256_shrdi_epi32(a, b, 1); + let r = _mm256_shrdi_epi32::<1>(a, b); let e = _mm256_set1_epi32(1); assert_eq_m256i(r, e); } @@ -3333,9 +3404,9 @@ mod tests { unsafe fn test_mm256_mask_shrdi_epi32() { let a = _mm256_set1_epi32(8); let b = _mm256_set1_epi32(2); - let r = _mm256_mask_shrdi_epi32(a, 0, a, b, 1); + let r = _mm256_mask_shrdi_epi32::<1>(a, 0, a, b); assert_eq_m256i(r, a); - let r = _mm256_mask_shrdi_epi32(a, 0b11111111, a, b, 1); + let r = _mm256_mask_shrdi_epi32::<1>(a, 0b11111111, a, b); let e = _mm256_set1_epi32(1); assert_eq_m256i(r, e); } @@ -3344,9 +3415,9 @@ mod tests { unsafe fn test_mm256_maskz_shrdi_epi32() { let a = _mm256_set1_epi32(8); let b = _mm256_set1_epi32(2); - let r = _mm256_maskz_shrdi_epi32(0, a, b, 1); + let r = _mm256_maskz_shrdi_epi32::<1>(0, a, b); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_shrdi_epi32(0b11111111, a, b, 1); + let r = _mm256_maskz_shrdi_epi32::<1>(0b11111111, a, b); let e = _mm256_set1_epi32(1); assert_eq_m256i(r, e); } @@ -3355,7 +3426,7 @@ mod tests { unsafe fn test_mm_shrdi_epi32() { let a = _mm_set1_epi32(8); let b = _mm_set1_epi32(2); - let r = _mm_shrdi_epi32(a, b, 1); + let r = _mm_shrdi_epi32::<1>(a, b); let e = _mm_set1_epi32(1); assert_eq_m128i(r, e); } @@ -3364,9 +3435,9 @@ mod tests { unsafe fn test_mm_mask_shrdi_epi32() { let a = _mm_set1_epi32(8); let b = _mm_set1_epi32(2); - let r = _mm_mask_shrdi_epi32(a, 0, a, b, 1); + let r = _mm_mask_shrdi_epi32::<1>(a, 0, a, b); assert_eq_m128i(r, a); - let r = _mm_mask_shrdi_epi32(a, 0b00001111, a, b, 1); + let r = _mm_mask_shrdi_epi32::<1>(a, 0b00001111, a, b); let e = _mm_set1_epi32(1); assert_eq_m128i(r, e); } @@ -3375,9 +3446,9 @@ mod tests { unsafe fn test_mm_maskz_shrdi_epi32() { let a = _mm_set1_epi32(8); let b = _mm_set1_epi32(2); - let r = _mm_maskz_shrdi_epi32(0, a, b, 1); + let r = _mm_maskz_shrdi_epi32::<1>(0, a, b); assert_eq_m128i(r, _mm_setzero_si128()); - let r = 
_mm_maskz_shrdi_epi32(0b00001111, a, b, 1); + let r = _mm_maskz_shrdi_epi32::<1>(0b00001111, a, b); let e = _mm_set1_epi32(1); assert_eq_m128i(r, e); } @@ -3386,7 +3457,7 @@ mod tests { unsafe fn test_mm512_shrdi_epi16() { let a = _mm512_set1_epi16(8); let b = _mm512_set1_epi16(2); - let r = _mm512_shrdi_epi16(a, b, 1); + let r = _mm512_shrdi_epi16::<1>(a, b); let e = _mm512_set1_epi16(1); assert_eq_m512i(r, e); } @@ -3395,9 +3466,9 @@ mod tests { unsafe fn test_mm512_mask_shrdi_epi16() { let a = _mm512_set1_epi16(8); let b = _mm512_set1_epi16(2); - let r = _mm512_mask_shrdi_epi16(a, 0, a, b, 1); + let r = _mm512_mask_shrdi_epi16::<1>(a, 0, a, b); assert_eq_m512i(r, a); - let r = _mm512_mask_shrdi_epi16(a, 0b11111111_11111111_11111111_11111111, a, b, 1); + let r = _mm512_mask_shrdi_epi16::<1>(a, 0b11111111_11111111_11111111_11111111, a, b); let e = _mm512_set1_epi16(1); assert_eq_m512i(r, e); } @@ -3406,9 +3477,9 @@ mod tests { unsafe fn test_mm512_maskz_shrdi_epi16() { let a = _mm512_set1_epi16(8); let b = _mm512_set1_epi16(2); - let r = _mm512_maskz_shrdi_epi16(0, a, b, 1); + let r = _mm512_maskz_shrdi_epi16::<1>(0, a, b); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_shrdi_epi16(0b11111111_11111111_11111111_11111111, a, b, 1); + let r = _mm512_maskz_shrdi_epi16::<1>(0b11111111_11111111_11111111_11111111, a, b); let e = _mm512_set1_epi16(1); assert_eq_m512i(r, e); } @@ -3417,7 +3488,7 @@ mod tests { unsafe fn test_mm256_shrdi_epi16() { let a = _mm256_set1_epi16(8); let b = _mm256_set1_epi16(2); - let r = _mm256_shrdi_epi16(a, b, 1); + let r = _mm256_shrdi_epi16::<1>(a, b); let e = _mm256_set1_epi16(1); assert_eq_m256i(r, e); } @@ -3426,9 +3497,9 @@ mod tests { unsafe fn test_mm256_mask_shrdi_epi16() { let a = _mm256_set1_epi16(8); let b = _mm256_set1_epi16(2); - let r = _mm256_mask_shrdi_epi16(a, 0, a, b, 1); + let r = _mm256_mask_shrdi_epi16::<1>(a, 0, a, b); assert_eq_m256i(r, a); - let r = _mm256_mask_shrdi_epi16(a, 0b11111111_11111111, a, 
b, 1); + let r = _mm256_mask_shrdi_epi16::<1>(a, 0b11111111_11111111, a, b); let e = _mm256_set1_epi16(1); assert_eq_m256i(r, e); } @@ -3437,9 +3508,9 @@ mod tests { unsafe fn test_mm256_maskz_shrdi_epi16() { let a = _mm256_set1_epi16(8); let b = _mm256_set1_epi16(2); - let r = _mm256_maskz_shrdi_epi16(0, a, b, 1); + let r = _mm256_maskz_shrdi_epi16::<1>(0, a, b); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_shrdi_epi16(0b11111111_11111111, a, b, 1); + let r = _mm256_maskz_shrdi_epi16::<1>(0b11111111_11111111, a, b); let e = _mm256_set1_epi16(1); assert_eq_m256i(r, e); } @@ -3448,7 +3519,7 @@ mod tests { unsafe fn test_mm_shrdi_epi16() { let a = _mm_set1_epi16(8); let b = _mm_set1_epi16(2); - let r = _mm_shrdi_epi16(a, b, 1); + let r = _mm_shrdi_epi16::<1>(a, b); let e = _mm_set1_epi16(1); assert_eq_m128i(r, e); } @@ -3457,9 +3528,9 @@ mod tests { unsafe fn test_mm_mask_shrdi_epi16() { let a = _mm_set1_epi16(8); let b = _mm_set1_epi16(2); - let r = _mm_mask_shrdi_epi16(a, 0, a, b, 1); + let r = _mm_mask_shrdi_epi16::<1>(a, 0, a, b); assert_eq_m128i(r, a); - let r = _mm_mask_shrdi_epi16(a, 0b11111111, a, b, 1); + let r = _mm_mask_shrdi_epi16::<1>(a, 0b11111111, a, b); let e = _mm_set1_epi16(1); assert_eq_m128i(r, e); } @@ -3468,9 +3539,9 @@ mod tests { unsafe fn test_mm_maskz_shrdi_epi16() { let a = _mm_set1_epi16(8); let b = _mm_set1_epi16(2); - let r = _mm_maskz_shrdi_epi16(0, a, b, 1); + let r = _mm_maskz_shrdi_epi16::<1>(0, a, b); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_shrdi_epi16(0b11111111, a, b, 1); + let r = _mm_maskz_shrdi_epi16::<1>(0b11111111, a, b); let e = _mm_set1_epi16(1); assert_eq_m128i(r, e); } From 59a5d0ea07302c468ef319b5253ea6f4be68a209 Mon Sep 17 00:00:00 2001 From: jironglin Date: Fri, 5 Mar 2021 00:33:09 +0000 Subject: [PATCH 044/123] ror_epi32,epi64, rol_epi32_epi64, srai_epi32 --- crates/core_arch/src/x86/avx512f.rs | 760 +++++++++++-------------- crates/core_arch/src/x86_64/avx512f.rs | 60 +- 2 
files changed, 360 insertions(+), 460 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 5abe23e093..5e5104b618 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -16624,16 +16624,12 @@ pub unsafe fn _mm_maskz_expand_pd(k: __mmask8, a: __m128d) -> __m128d { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rol_epi32&expand=4685) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vprold, imm8 = 1))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm512_rol_epi32(a: __m512i, imm8: i32) -> __m512i { +#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_rol_epi32(a: __m512i) -> __m512i { + static_assert_imm8!(IMM8); let a = a.as_i32x16(); - macro_rules! call { - ($imm8:expr) => { - vprold(a, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = vprold(a, IMM8); transmute(r) } @@ -16642,17 +16638,17 @@ pub unsafe fn _mm512_rol_epi32(a: __m512i, imm8: i32) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rol_epi32&expand=4683) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vprold, imm8 = 1))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_rol_epi32(src: __m512i, k: __mmask16, a: __m512i, imm8: i32) -> __m512i { +#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_rol_epi32( + src: __m512i, + k: __mmask16, + a: __m512i, +) -> __m512i { + static_assert_imm8!(IMM8); let a = a.as_i32x16(); - macro_rules! 
call { - ($imm8:expr) => { - vprold(a, $imm8) - }; - } - let rol = constify_imm8_sae!(imm8, call); - transmute(simd_select_bitmask(k, rol, src.as_i32x16())) + let r = vprold(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i32x16())) } /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -16660,18 +16656,14 @@ pub unsafe fn _mm512_mask_rol_epi32(src: __m512i, k: __mmask16, a: __m512i, imm8 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rol_epi32&expand=4684) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vprold, imm8 = 1))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_rol_epi32(k: __mmask16, a: __m512i, imm8: i32) -> __m512i { +#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_rol_epi32(k: __mmask16, a: __m512i) -> __m512i { + static_assert_imm8!(IMM8); let a = a.as_i32x16(); - macro_rules! call { - ($imm8:expr) => { - vprold(a, $imm8) - }; - } - let rol = constify_imm8_sae!(imm8, call); + let r = vprold(a, IMM8); let zero = _mm512_setzero_si512().as_i32x16(); - transmute(simd_select_bitmask(k, rol, zero)) + transmute(simd_select_bitmask(k, r, zero)) } /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst. 
@@ -16679,16 +16671,12 @@ pub unsafe fn _mm512_maskz_rol_epi32(k: __mmask16, a: __m512i, imm8: i32) -> __m /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rol_epi32&expand=4682) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprold, imm8 = 1))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm256_rol_epi32(a: __m256i, imm8: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm256_rol_epi32(a: __m256i) -> __m256i { + static_assert_imm8!(IMM8); let a = a.as_i32x8(); - macro_rules! call { - ($imm8:expr) => { - vprold256(a, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = vprold256(a, IMM8); transmute(r) } @@ -16697,17 +16685,17 @@ pub unsafe fn _mm256_rol_epi32(a: __m256i, imm8: i32) -> __m256i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rol_epi32&expand=4680) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprold, imm8 = 1))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm256_mask_rol_epi32(src: __m256i, k: __mmask8, a: __m256i, imm8: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm256_mask_rol_epi32( + src: __m256i, + k: __mmask8, + a: __m256i, +) -> __m256i { + static_assert_imm8!(IMM8); let a = a.as_i32x8(); - macro_rules! call { - ($imm8:expr) => { - vprold256(a, $imm8) - }; - } - let rol = constify_imm8_sae!(imm8, call); - transmute(simd_select_bitmask(k, rol, src.as_i32x8())) + let r = vprold256(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i32x8())) } /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -16715,18 +16703,14 @@ pub unsafe fn _mm256_mask_rol_epi32(src: __m256i, k: __mmask8, a: __m256i, imm8: /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rol_epi32&expand=4681) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprold, imm8 = 1))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm256_maskz_rol_epi32(k: __mmask8, a: __m256i, imm8: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm256_maskz_rol_epi32(k: __mmask8, a: __m256i) -> __m256i { + static_assert_imm8!(IMM8); let a = a.as_i32x8(); - macro_rules! call { - ($imm8:expr) => { - vprold256(a, $imm8) - }; - } - let rol = constify_imm8_sae!(imm8, call); + let r = vprold256(a, IMM8); let zero = _mm256_setzero_si256().as_i32x8(); - transmute(simd_select_bitmask(k, rol, zero)) + transmute(simd_select_bitmask(k, r, zero)) } /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst. @@ -16734,16 +16718,12 @@ pub unsafe fn _mm256_maskz_rol_epi32(k: __mmask8, a: __m256i, imm8: i32) -> __m2 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rol_epi32&expand=4679) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprold, imm8 = 1))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_rol_epi32(a: __m128i, imm8: i32) -> __m128i { +#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_rol_epi32(a: __m128i) -> __m128i { + static_assert_imm8!(IMM8); let a = a.as_i32x4(); - macro_rules! 
call { - ($imm8:expr) => { - vprold128(a, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = vprold128(a, IMM8); transmute(r) } @@ -16752,17 +16732,17 @@ pub unsafe fn _mm_rol_epi32(a: __m128i, imm8: i32) -> __m128i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rol_epi32&expand=4677) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprold, imm8 = 1))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_mask_rol_epi32(src: __m128i, k: __mmask8, a: __m128i, imm8: i32) -> __m128i { +#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_mask_rol_epi32( + src: __m128i, + k: __mmask8, + a: __m128i, +) -> __m128i { + static_assert_imm8!(IMM8); let a = a.as_i32x4(); - macro_rules! call { - ($imm8:expr) => { - vprold128(a, $imm8) - }; - } - let rol = constify_imm8_sae!(imm8, call); - transmute(simd_select_bitmask(k, rol, src.as_i32x4())) + let r = vprold128(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i32x4())) } /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -16770,18 +16750,14 @@ pub unsafe fn _mm_mask_rol_epi32(src: __m128i, k: __mmask8, a: __m128i, imm8: i3 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rol_epi32&expand=4678) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprold, imm8 = 1))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_maskz_rol_epi32(k: __mmask8, a: __m128i, imm8: i32) -> __m128i { +#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_maskz_rol_epi32(k: __mmask8, a: __m128i) -> __m128i { + static_assert_imm8!(IMM8); let a = a.as_i32x4(); - macro_rules! call { - ($imm8:expr) => { - vprold128(a, $imm8) - }; - } - let rol = constify_imm8_sae!(imm8, call); + let r = vprold128(a, IMM8); let zero = _mm_setzero_si128().as_i32x4(); - transmute(simd_select_bitmask(k, rol, zero)) + transmute(simd_select_bitmask(k, r, zero)) } /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst. @@ -16789,16 +16765,12 @@ pub unsafe fn _mm_maskz_rol_epi32(k: __mmask8, a: __m128i, imm8: i32) -> __m128i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_ror_epi32&expand=4721) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vprold, imm8 = 1))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm512_ror_epi32(a: __m512i, imm8: i32) -> __m512i { +#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_ror_epi32(a: __m512i) -> __m512i { + static_assert_imm8!(IMM8); let a = a.as_i32x16(); - macro_rules! 
call { - ($imm8:expr) => { - vprord(a, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = vprord(a, IMM8); transmute(r) } @@ -16807,17 +16779,17 @@ pub unsafe fn _mm512_ror_epi32(a: __m512i, imm8: i32) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_ror_epi32&expand=4719) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vprold, imm8 = 123))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_ror_epi32(src: __m512i, k: __mmask16, a: __m512i, imm8: i32) -> __m512i { +#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_ror_epi32( + src: __m512i, + k: __mmask16, + a: __m512i, +) -> __m512i { + static_assert_imm8!(IMM8); let a = a.as_i32x16(); - macro_rules! call { - ($imm8:expr) => { - vprord(a, $imm8) - }; - } - let ror = constify_imm8_sae!(imm8, call); - transmute(simd_select_bitmask(k, ror, src.as_i32x16())) + let r = vprord(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i32x16())) } /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -16825,18 +16797,14 @@ pub unsafe fn _mm512_mask_ror_epi32(src: __m512i, k: __mmask16, a: __m512i, imm8 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_ror_epi32&expand=4720) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vprold, imm8 = 123))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_ror_epi32(k: __mmask16, a: __m512i, imm8: i32) -> __m512i { +#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_ror_epi32(k: __mmask16, a: __m512i) -> __m512i { + static_assert_imm8!(IMM8); let a = a.as_i32x16(); - macro_rules! call { - ($imm8:expr) => { - vprord(a, $imm8) - }; - } - let ror = constify_imm8_sae!(imm8, call); + let r = vprord(a, IMM8); let zero = _mm512_setzero_si512().as_i32x16(); - transmute(simd_select_bitmask(k, ror, zero)) + transmute(simd_select_bitmask(k, r, zero)) } /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst. @@ -16844,16 +16812,12 @@ pub unsafe fn _mm512_maskz_ror_epi32(k: __mmask16, a: __m512i, imm8: i32) -> __m /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_ror_epi32&expand=4718) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprold, imm8 = 1))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm256_ror_epi32(a: __m256i, imm8: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm256_ror_epi32(a: __m256i) -> __m256i { + static_assert_imm8!(IMM8); let a = a.as_i32x8(); - macro_rules! 
call { - ($imm8:expr) => { - vprord256(a, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = vprord256(a, IMM8); transmute(r) } @@ -16862,17 +16826,17 @@ pub unsafe fn _mm256_ror_epi32(a: __m256i, imm8: i32) -> __m256i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_ror_epi32&expand=4716) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprold, imm8 = 123))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm256_mask_ror_epi32(src: __m256i, k: __mmask8, a: __m256i, imm8: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm256_mask_ror_epi32( + src: __m256i, + k: __mmask8, + a: __m256i, +) -> __m256i { + static_assert_imm8!(IMM8); let a = a.as_i32x8(); - macro_rules! call { - ($imm8:expr) => { - vprord256(a, $imm8) - }; - } - let ror = constify_imm8_sae!(imm8, call); - transmute(simd_select_bitmask(k, ror, src.as_i32x8())) + let r = vprord256(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i32x8())) } /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -16880,18 +16844,14 @@ pub unsafe fn _mm256_mask_ror_epi32(src: __m256i, k: __mmask8, a: __m256i, imm8: /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_ror_epi32&expand=4717) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprold, imm8 = 123))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm256_maskz_ror_epi32(k: __mmask8, a: __m256i, imm8: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm256_maskz_ror_epi32(k: __mmask8, a: __m256i) -> __m256i { + static_assert_imm8!(IMM8); let a = a.as_i32x8(); - macro_rules! call { - ($imm8:expr) => { - vprord256(a, $imm8) - }; - } - let ror = constify_imm8_sae!(imm8, call); + let r = vprord256(a, IMM8); let zero = _mm256_setzero_si256().as_i32x8(); - transmute(simd_select_bitmask(k, ror, zero)) + transmute(simd_select_bitmask(k, r, zero)) } /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst. @@ -16899,16 +16859,12 @@ pub unsafe fn _mm256_maskz_ror_epi32(k: __mmask8, a: __m256i, imm8: i32) -> __m2 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ror_epi32&expand=4715) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprold, imm8 = 1))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_ror_epi32(a: __m128i, imm8: i32) -> __m128i { +#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_ror_epi32(a: __m128i) -> __m128i { + static_assert_imm8!(IMM8); let a = a.as_i32x4(); - macro_rules! 
call { - ($imm8:expr) => { - vprord128(a, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = vprord128(a, IMM8); transmute(r) } @@ -16917,17 +16873,17 @@ pub unsafe fn _mm_ror_epi32(a: __m128i, imm8: i32) -> __m128i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_ror_epi32&expand=4713) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprold, imm8 = 123))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_mask_ror_epi32(src: __m128i, k: __mmask8, a: __m128i, imm8: i32) -> __m128i { +#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_mask_ror_epi32( + src: __m128i, + k: __mmask8, + a: __m128i, +) -> __m128i { + static_assert_imm8!(IMM8); let a = a.as_i32x4(); - macro_rules! call { - ($imm8:expr) => { - vprord128(a, $imm8) - }; - } - let ror = constify_imm8_sae!(imm8, call); - transmute(simd_select_bitmask(k, ror, src.as_i32x4())) + let r = vprord128(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i32x4())) } /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -16935,18 +16891,14 @@ pub unsafe fn _mm_mask_ror_epi32(src: __m128i, k: __mmask8, a: __m128i, imm8: i3 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_ror_epi32&expand=4714) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprold, imm8 = 123))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_maskz_ror_epi32(k: __mmask8, a: __m128i, imm8: i32) -> __m128i { +#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_maskz_ror_epi32(k: __mmask8, a: __m128i) -> __m128i { + static_assert_imm8!(IMM8); let a = a.as_i32x4(); - macro_rules! call { - ($imm8:expr) => { - vprord128(a, $imm8) - }; - } - let ror = constify_imm8_sae!(imm8, call); + let r = vprord128(a, IMM8); let zero = _mm_setzero_si128().as_i32x4(); - transmute(simd_select_bitmask(k, ror, zero)) + transmute(simd_select_bitmask(k, r, zero)) } /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst. @@ -16954,16 +16906,12 @@ pub unsafe fn _mm_maskz_ror_epi32(k: __mmask8, a: __m128i, imm8: i32) -> __m128i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rol_epi64&expand=4694) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vprolq, imm8 = 1))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm512_rol_epi64(a: __m512i, imm8: i32) -> __m512i { +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_rol_epi64(a: __m512i) -> __m512i { + static_assert_imm8!(IMM8); let a = a.as_i64x8(); - macro_rules! 
call { - ($imm8:expr) => { - vprolq(a, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = vprolq(a, IMM8); transmute(r) } @@ -16972,17 +16920,17 @@ pub unsafe fn _mm512_rol_epi64(a: __m512i, imm8: i32) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rol_epi64&expand=4692) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vprolq, imm8 = 1))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_rol_epi64(src: __m512i, k: __mmask8, a: __m512i, imm8: i32) -> __m512i { +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_rol_epi64( + src: __m512i, + k: __mmask8, + a: __m512i, +) -> __m512i { + static_assert_imm8!(IMM8); let a = a.as_i64x8(); - macro_rules! call { - ($imm8:expr) => { - vprolq(a, $imm8) - }; - } - let rol = constify_imm8_sae!(imm8, call); - transmute(simd_select_bitmask(k, rol, src.as_i64x8())) + let r = vprolq(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i64x8())) } /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -16990,18 +16938,14 @@ pub unsafe fn _mm512_mask_rol_epi64(src: __m512i, k: __mmask8, a: __m512i, imm8: /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rol_epi64&expand=4693) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vprolq, imm8 = 1))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_rol_epi64(k: __mmask8, a: __m512i, imm8: i32) -> __m512i { +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_rol_epi64(k: __mmask8, a: __m512i) -> __m512i { + static_assert_imm8!(IMM8); let a = a.as_i64x8(); - macro_rules! 
call { - ($imm8:expr) => { - vprolq(a, $imm8) - }; - } - let rol = constify_imm8_sae!(imm8, call); + let r = vprolq(a, IMM8); let zero = _mm512_setzero_si512().as_i64x8(); - transmute(simd_select_bitmask(k, rol, zero)) + transmute(simd_select_bitmask(k, r, zero)) } /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst. @@ -17009,16 +16953,12 @@ pub unsafe fn _mm512_maskz_rol_epi64(k: __mmask8, a: __m512i, imm8: i32) -> __m5 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rol_epi64&expand=4691) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprolq, imm8 = 1))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm256_rol_epi64(a: __m256i, imm8: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm256_rol_epi64(a: __m256i) -> __m256i { + static_assert_imm8!(IMM8); let a = a.as_i64x4(); - macro_rules! call { - ($imm8:expr) => { - vprolq256(a, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = vprolq256(a, IMM8); transmute(r) } @@ -17027,17 +16967,17 @@ pub unsafe fn _mm256_rol_epi64(a: __m256i, imm8: i32) -> __m256i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rol_epi64&expand=4689) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprolq, imm8 = 1))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm256_mask_rol_epi64(src: __m256i, k: __mmask8, a: __m256i, imm8: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm256_mask_rol_epi64( + src: __m256i, + k: __mmask8, + a: __m256i, +) -> __m256i { + static_assert_imm8!(IMM8); let a = a.as_i64x4(); - macro_rules! 
call { - ($imm8:expr) => { - vprolq256(a, $imm8) - }; - } - let rol = constify_imm8_sae!(imm8, call); - transmute(simd_select_bitmask(k, rol, src.as_i64x4())) + let r = vprolq256(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i64x4())) } /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -17045,18 +16985,14 @@ pub unsafe fn _mm256_mask_rol_epi64(src: __m256i, k: __mmask8, a: __m256i, imm8: /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rol_epi64&expand=4690) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprolq, imm8 = 1))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm256_maskz_rol_epi64(k: __mmask8, a: __m256i, imm8: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm256_maskz_rol_epi64(k: __mmask8, a: __m256i) -> __m256i { + static_assert_imm8!(IMM8); let a = a.as_i64x4(); - macro_rules! call { - ($imm8:expr) => { - vprolq256(a, $imm8) - }; - } - let rol = constify_imm8_sae!(imm8, call); + let r = vprolq256(a, IMM8); let zero = _mm256_setzero_si256().as_i64x4(); - transmute(simd_select_bitmask(k, rol, zero)) + transmute(simd_select_bitmask(k, r, zero)) } /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst. 
@@ -17064,16 +17000,12 @@ pub unsafe fn _mm256_maskz_rol_epi64(k: __mmask8, a: __m256i, imm8: i32) -> __m2 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rol_epi64&expand=4688) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprolq, imm8 = 1))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_rol_epi64(a: __m128i, imm8: i32) -> __m128i { +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_rol_epi64(a: __m128i) -> __m128i { + static_assert_imm8!(IMM8); let a = a.as_i64x2(); - macro_rules! call { - ($imm8:expr) => { - vprolq128(a, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = vprolq128(a, IMM8); transmute(r) } @@ -17082,17 +17014,17 @@ pub unsafe fn _mm_rol_epi64(a: __m128i, imm8: i32) -> __m128i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rol_epi64&expand=4686) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprolq, imm8 = 1))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_mask_rol_epi64(src: __m128i, k: __mmask8, a: __m128i, imm8: i32) -> __m128i { +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_mask_rol_epi64( + src: __m128i, + k: __mmask8, + a: __m128i, +) -> __m128i { + static_assert_imm8!(IMM8); let a = a.as_i64x2(); - macro_rules! call { - ($imm8:expr) => { - vprolq128(a, $imm8) - }; - } - let rol = constify_imm8_sae!(imm8, call); - transmute(simd_select_bitmask(k, rol, src.as_i64x2())) + let r = vprolq128(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i64x2())) } /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -17100,18 +17032,14 @@ pub unsafe fn _mm_mask_rol_epi64(src: __m128i, k: __mmask8, a: __m128i, imm8: i3 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rol_epi64&expand=4687) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprolq, imm8 = 1))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_maskz_rol_epi64(k: __mmask8, a: __m128i, imm8: i32) -> __m128i { +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_maskz_rol_epi64(k: __mmask8, a: __m128i) -> __m128i { + static_assert_imm8!(IMM8); let a = a.as_i64x2(); - macro_rules! call { - ($imm8:expr) => { - vprolq128(a, $imm8) - }; - } - let rol = constify_imm8_sae!(imm8, call); + let r = vprolq128(a, IMM8); let zero = _mm_setzero_si128().as_i64x2(); - transmute(simd_select_bitmask(k, rol, zero)) + transmute(simd_select_bitmask(k, r, zero)) } /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst. @@ -17119,16 +17047,12 @@ pub unsafe fn _mm_maskz_rol_epi64(k: __mmask8, a: __m128i, imm8: i32) -> __m128i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_ror_epi64&expand=4730) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vprolq, imm8 = 15))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm512_ror_epi64(a: __m512i, imm8: i32) -> __m512i { +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_ror_epi64(a: __m512i) -> __m512i { + static_assert_imm8!(IMM8); let a = a.as_i64x8(); - macro_rules! 
call { - ($imm8:expr) => { - vprorq(a, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = vprorq(a, IMM8); transmute(r) } @@ -17137,17 +17061,17 @@ pub unsafe fn _mm512_ror_epi64(a: __m512i, imm8: i32) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_ror_epi64&expand=4728) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vprolq, imm8 = 15))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_ror_epi64(src: __m512i, k: __mmask8, a: __m512i, imm8: i32) -> __m512i { +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_ror_epi64( + src: __m512i, + k: __mmask8, + a: __m512i, +) -> __m512i { + static_assert_imm8!(IMM8); let a = a.as_i64x8(); - macro_rules! call { - ($imm8:expr) => { - vprorq(a, $imm8) - }; - } - let ror = constify_imm8_sae!(imm8, call); - transmute(simd_select_bitmask(k, ror, src.as_i64x8())) + let r = vprorq(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i64x8())) } /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -17155,18 +17079,14 @@ pub unsafe fn _mm512_mask_ror_epi64(src: __m512i, k: __mmask8, a: __m512i, imm8: /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_ror_epi64&expand=4729) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vprolq, imm8 = 15))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_ror_epi64(k: __mmask8, a: __m512i, imm8: i32) -> __m512i { +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_ror_epi64(k: __mmask8, a: __m512i) -> __m512i { + static_assert_imm8!(IMM8); let a = a.as_i64x8(); - macro_rules! 
call { - ($imm8:expr) => { - vprorq(a, $imm8) - }; - } - let ror = constify_imm8_sae!(imm8, call); + let r = vprorq(a, IMM8); let zero = _mm512_setzero_si512().as_i64x8(); - transmute(simd_select_bitmask(k, ror, zero)) + transmute(simd_select_bitmask(k, r, zero)) } /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst. @@ -17174,16 +17094,12 @@ pub unsafe fn _mm512_maskz_ror_epi64(k: __mmask8, a: __m512i, imm8: i32) -> __m5 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_ror_epi64&expand=4727) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprolq, imm8 = 15))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm256_ror_epi64(a: __m256i, imm8: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm256_ror_epi64(a: __m256i) -> __m256i { + static_assert_imm8!(IMM8); let a = a.as_i64x4(); - macro_rules! call { - ($imm8:expr) => { - vprorq256(a, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = vprorq256(a, IMM8); transmute(r) } @@ -17192,17 +17108,17 @@ pub unsafe fn _mm256_ror_epi64(a: __m256i, imm8: i32) -> __m256i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_ror_epi64&expand=4725) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprolq, imm8 = 15))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm256_mask_ror_epi64(src: __m256i, k: __mmask8, a: __m256i, imm8: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm256_mask_ror_epi64( + src: __m256i, + k: __mmask8, + a: __m256i, +) -> __m256i { + static_assert_imm8!(IMM8); let a = a.as_i64x4(); - macro_rules! 
call { - ($imm8:expr) => { - vprorq256(a, $imm8) - }; - } - let ror = constify_imm8_sae!(imm8, call); - transmute(simd_select_bitmask(k, ror, src.as_i64x4())) + let r = vprorq256(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i64x4())) } /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -17210,18 +17126,14 @@ pub unsafe fn _mm256_mask_ror_epi64(src: __m256i, k: __mmask8, a: __m256i, imm8: /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_ror_epi64&expand=4726) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprolq, imm8 = 15))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm256_maskz_ror_epi64(k: __mmask8, a: __m256i, imm8: i32) -> __m256i { +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm256_maskz_ror_epi64(k: __mmask8, a: __m256i) -> __m256i { + static_assert_imm8!(IMM8); let a = a.as_i64x4(); - macro_rules! call { - ($imm8:expr) => { - vprorq256(a, $imm8) - }; - } - let ror = constify_imm8_sae!(imm8, call); + let r = vprorq256(a, IMM8); let zero = _mm256_setzero_si256().as_i64x4(); - transmute(simd_select_bitmask(k, ror, zero)) + transmute(simd_select_bitmask(k, r, zero)) } /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst. 
@@ -17229,16 +17141,12 @@ pub unsafe fn _mm256_maskz_ror_epi64(k: __mmask8, a: __m256i, imm8: i32) -> __m2 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ror_epi64&expand=4724) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprolq, imm8 = 15))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm_ror_epi64(a: __m128i, imm8: i32) -> __m128i { +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm_ror_epi64(a: __m128i) -> __m128i { + static_assert_imm8!(IMM8); let a = a.as_i64x2(); - macro_rules! call { - ($imm8:expr) => { - vprorq128(a, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = vprorq128(a, IMM8); transmute(r) } @@ -17247,17 +17155,17 @@ pub unsafe fn _mm_ror_epi64(a: __m128i, imm8: i32) -> __m128i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_ror_epi64&expand=4722) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprolq, imm8 = 15))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_mask_ror_epi64(src: __m128i, k: __mmask8, a: __m128i, imm8: i32) -> __m128i { +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_mask_ror_epi64( + src: __m128i, + k: __mmask8, + a: __m128i, +) -> __m128i { + static_assert_imm8!(IMM8); let a = a.as_i64x2(); - macro_rules! call { - ($imm8:expr) => { - vprorq128(a, $imm8) - }; - } - let ror = constify_imm8_sae!(imm8, call); - transmute(simd_select_bitmask(k, ror, src.as_i64x2())) + let r = vprorq128(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i64x2())) } /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -17265,18 +17173,14 @@ pub unsafe fn _mm_mask_ror_epi64(src: __m128i, k: __mmask8, a: __m128i, imm8: i3 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_ror_epi64&expand=4723) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vprolq, imm8 = 15))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_maskz_ror_epi64(k: __mmask8, a: __m128i, imm8: i32) -> __m128i { +#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_maskz_ror_epi64(k: __mmask8, a: __m128i) -> __m128i { + static_assert_imm8!(IMM8); let a = a.as_i64x2(); - macro_rules! call { - ($imm8:expr) => { - vprorq128(a, $imm8) - }; - } - let ror = constify_imm8_sae!(imm8, call); + let r = vprorq128(a, IMM8); let zero = _mm_setzero_si128().as_i64x2(); - transmute(simd_select_bitmask(k, ror, zero)) + transmute(simd_select_bitmask(k, r, zero)) } /// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst. 
@@ -18343,7 +18247,7 @@ pub unsafe fn _mm_maskz_sra_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __ pub unsafe fn _mm512_srai_epi32(a: __m512i) -> __m512i { static_assert_imm8u!(IMM8); let a = a.as_i32x16(); - let r = vpsraid(a, IMM8); + let r = vpsraid512(a, IMM8); transmute(r) } @@ -18361,7 +18265,7 @@ pub unsafe fn _mm512_mask_srai_epi32( ) -> __m512i { static_assert_imm8u!(IMM8); let a = a.as_i32x16(); - let r = vpsraid(a, IMM8); + let r = vpsraid512(a, IMM8); transmute(simd_select_bitmask(k, r, src.as_i32x16())) } @@ -18375,7 +18279,7 @@ pub unsafe fn _mm512_mask_srai_epi32( pub unsafe fn _mm512_maskz_srai_epi32(k: __mmask16, a: __m512i) -> __m512i { static_assert_imm8u!(IMM8); let a = a.as_i32x16(); - let r = vpsraid(a, IMM8); + let r = vpsraid512(a, IMM8); let zero = _mm512_setzero_si512().as_i32x16(); transmute(simd_select_bitmask(k, r, zero)) } @@ -18385,16 +18289,16 @@ pub unsafe fn _mm512_maskz_srai_epi32(k: __mmask16, a: __m512i) /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_srai_epi32&expand=5431) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpsrad, imm8 = 1))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm256_mask_srai_epi32(src: __m256i, k: __mmask8, a: __m256i, imm8: u32) -> __m256i { - macro_rules! 
call { - ($imm8:expr) => { - _mm256_srai_epi32::<$imm8>(a) - }; - } - let shf = constify_imm8_sae!(imm8, call); - transmute(simd_select_bitmask(k, shf.as_i32x8(), src.as_i32x8())) +#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm256_mask_srai_epi32( + src: __m256i, + k: __mmask8, + a: __m256i, +) -> __m256i { + let imm8 = IMM8 as i32; + let r = psraid256(a.as_i32x8(), imm8); + transmute(simd_select_bitmask(k, r, src.as_i32x8())) } /// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -18402,17 +18306,13 @@ pub unsafe fn _mm256_mask_srai_epi32(src: __m256i, k: __mmask8, a: __m256i, imm8 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_srai_epi32&expand=5432) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpsrad, imm8 = 1))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm256_maskz_srai_epi32(k: __mmask8, a: __m256i, imm8: u32) -> __m256i { - macro_rules! call { - ($imm8:expr) => { - _mm256_srai_epi32::<$imm8>(a) - }; - } - let shf = constify_imm8_sae!(imm8, call); +#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm256_maskz_srai_epi32(k: __mmask8, a: __m256i) -> __m256i { + let imm8 = IMM8 as i32; + let r = psraid256(a.as_i32x8(), imm8); let zero = _mm256_setzero_si256().as_i32x8(); - transmute(simd_select_bitmask(k, shf.as_i32x8(), zero)) + transmute(simd_select_bitmask(k, r, zero)) } /// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -18420,16 +18320,16 @@ pub unsafe fn _mm256_maskz_srai_epi32(k: __mmask8, a: __m256i, imm8: u32) -> __m /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_srai_epi32&expand=5428) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpsrad, imm8 = 1))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_mask_srai_epi32(src: __m128i, k: __mmask8, a: __m128i, imm8: u32) -> __m128i { - macro_rules! call { - ($imm8:expr) => { - _mm_srai_epi32::<$imm8>(a) - }; - } - let shf = constify_imm8_sae!(imm8, call); - transmute(simd_select_bitmask(k, shf.as_i32x4(), src.as_i32x4())) +#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_mask_srai_epi32( + src: __m128i, + k: __mmask8, + a: __m128i, +) -> __m128i { + let imm8 = IMM8 as i32; + let r = psraid128(a.as_i32x4(), imm8); + transmute(simd_select_bitmask(k, r, src.as_i32x4())) } /// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -18437,17 +18337,13 @@ pub unsafe fn _mm_mask_srai_epi32(src: __m128i, k: __mmask8, a: __m128i, imm8: u /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_srai_epi32&expand=5429) #[inline] #[target_feature(enable = "avx512f,avx512vl")] -#[cfg_attr(test, assert_instr(vpsrad, imm8 = 1))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_maskz_srai_epi32(k: __mmask8, a: __m128i, imm8: u32) -> __m128i { - macro_rules! 
call { - ($imm8:expr) => { - _mm_srai_epi32::<$imm8>(a) - }; - } - let shf = constify_imm8_sae!(imm8, call); +#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_maskz_srai_epi32(k: __mmask8, a: __m128i) -> __m128i { + let imm8 = IMM8 as i32; + let r = psraid128(a.as_i32x4(), imm8); let zero = _mm_setzero_si128().as_i32x4(); - transmute(simd_select_bitmask(k, shf.as_i32x4(), zero)) + transmute(simd_select_bitmask(k, r, zero)) } /// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst. @@ -37615,7 +37511,11 @@ extern "C" { fn vpsraq128(a: i64x2, count: i64x2) -> i64x2; #[link_name = "llvm.x86.avx512.psrai.d.512"] - fn vpsraid(a: i32x16, imm8: u32) -> i32x16; + fn vpsraid512(a: i32x16, imm8: u32) -> i32x16; + #[link_name = "llvm.x86.avx2.psrai.d"] + fn psraid256(a: i32x8, imm8: i32) -> i32x8; + #[link_name = "llvm.x86.sse2.psrai.d"] + fn psraid128(a: i32x4, imm8: i32) -> i32x4; #[link_name = "llvm.x86.avx512.psrai.q.512"] fn vpsraiq(a: i64x8, imm8: u32) -> i64x8; @@ -45648,7 +45548,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_rol_epi32() { let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); - let r = _mm512_rol_epi32(a, 1); + let r = _mm512_rol_epi32::<1>(a); let e = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); assert_eq_m512i(r, e); } @@ -45656,9 +45556,9 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_rol_epi32() { let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); - let r = _mm512_mask_rol_epi32(a, 0, a, 1); + let r = _mm512_mask_rol_epi32::<1>(a, 0, a); assert_eq_m512i(r, a); - let r = _mm512_mask_rol_epi32(a, 0b11111111_11111111, a, 1); + let r = _mm512_mask_rol_epi32::<1>(a, 0b11111111_11111111, a); let e = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); assert_eq_m512i(r, e); } @@ -45666,9 +45566,9 @@ 
mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_rol_epi32() { let a = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 << 31); - let r = _mm512_maskz_rol_epi32(0, a, 1); + let r = _mm512_maskz_rol_epi32::<1>(0, a); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_rol_epi32(0b00000000_11111111, a, 1); + let r = _mm512_maskz_rol_epi32::<1>(0b00000000_11111111, a); let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 1 << 0); assert_eq_m512i(r, e); } @@ -45676,7 +45576,7 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_rol_epi32() { let a = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); - let r = _mm256_rol_epi32(a, 1); + let r = _mm256_rol_epi32::<1>(a); let e = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); assert_eq_m256i(r, e); } @@ -45684,9 +45584,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_mask_rol_epi32() { let a = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); - let r = _mm256_mask_rol_epi32(a, 0, a, 1); + let r = _mm256_mask_rol_epi32::<1>(a, 0, a); assert_eq_m256i(r, a); - let r = _mm256_mask_rol_epi32(a, 0b11111111, a, 1); + let r = _mm256_mask_rol_epi32::<1>(a, 0b11111111, a); let e = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); assert_eq_m256i(r, e); } @@ -45694,9 +45594,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_maskz_rol_epi32() { let a = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); - let r = _mm256_maskz_rol_epi32(0, a, 1); + let r = _mm256_maskz_rol_epi32::<1>(0, a); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_rol_epi32(0b11111111, a, 1); + let r = _mm256_maskz_rol_epi32::<1>(0b11111111, a); let e = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); assert_eq_m256i(r, e); } @@ -45704,7 +45604,7 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm_rol_epi32() { let a = _mm_set_epi32(1 << 31, 1, 1, 1); - let r = _mm_rol_epi32(a, 1); 
+ let r = _mm_rol_epi32::<1>(a); let e = _mm_set_epi32(1 << 0, 2, 2, 2); assert_eq_m128i(r, e); } @@ -45712,9 +45612,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm_mask_rol_epi32() { let a = _mm_set_epi32(1 << 31, 1, 1, 1); - let r = _mm_mask_rol_epi32(a, 0, a, 1); + let r = _mm_mask_rol_epi32::<1>(a, 0, a); assert_eq_m128i(r, a); - let r = _mm_mask_rol_epi32(a, 0b00001111, a, 1); + let r = _mm_mask_rol_epi32::<1>(a, 0b00001111, a); let e = _mm_set_epi32(1 << 0, 2, 2, 2); assert_eq_m128i(r, e); } @@ -45722,9 +45622,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm_maskz_rol_epi32() { let a = _mm_set_epi32(1 << 31, 1, 1, 1); - let r = _mm_maskz_rol_epi32(0, a, 1); + let r = _mm_maskz_rol_epi32::<1>(0, a); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_rol_epi32(0b00001111, a, 1); + let r = _mm_maskz_rol_epi32::<1>(0b00001111, a); let e = _mm_set_epi32(1 << 0, 2, 2, 2); assert_eq_m128i(r, e); } @@ -45732,7 +45632,7 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_ror_epi32() { let a = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); - let r = _mm512_ror_epi32(a, 1); + let r = _mm512_ror_epi32::<1>(a); let e = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); assert_eq_m512i(r, e); } @@ -45740,9 +45640,9 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_ror_epi32() { let a = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); - let r = _mm512_mask_ror_epi32(a, 0, a, 1); + let r = _mm512_mask_ror_epi32::<1>(a, 0, a); assert_eq_m512i(r, a); - let r = _mm512_mask_ror_epi32(a, 0b11111111_11111111, a, 1); + let r = _mm512_mask_ror_epi32::<1>(a, 0b11111111_11111111, a); let e = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); assert_eq_m512i(r, e); } @@ -45750,9 +45650,9 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_maskz_ror_epi32() { let a = 
_mm512_set_epi32(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1 << 0); - let r = _mm512_maskz_ror_epi32(0, a, 1); + let r = _mm512_maskz_ror_epi32::<1>(0, a); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_ror_epi32(0b00000000_11111111, a, 1); + let r = _mm512_maskz_ror_epi32::<1>(0b00000000_11111111, a); let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 << 31); assert_eq_m512i(r, e); } @@ -45760,7 +45660,7 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_ror_epi32() { let a = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); - let r = _mm256_ror_epi32(a, 1); + let r = _mm256_ror_epi32::<1>(a); let e = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); assert_eq_m256i(r, e); } @@ -45768,9 +45668,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_mask_ror_epi32() { let a = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); - let r = _mm256_mask_ror_epi32(a, 0, a, 1); + let r = _mm256_mask_ror_epi32::<1>(a, 0, a); assert_eq_m256i(r, a); - let r = _mm256_mask_ror_epi32(a, 0b11111111, a, 1); + let r = _mm256_mask_ror_epi32::<1>(a, 0b11111111, a); let e = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); assert_eq_m256i(r, e); } @@ -45778,9 +45678,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_maskz_ror_epi32() { let a = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2); - let r = _mm256_maskz_ror_epi32(0, a, 1); + let r = _mm256_maskz_ror_epi32::<1>(0, a); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_ror_epi32(0b11111111, a, 1); + let r = _mm256_maskz_ror_epi32::<1>(0b11111111, a); let e = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1); assert_eq_m256i(r, e); } @@ -45788,7 +45688,7 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm_ror_epi32() { let a = _mm_set_epi32(1 << 0, 2, 2, 2); - let r = _mm_ror_epi32(a, 1); + let r = _mm_ror_epi32::<1>(a); let e = _mm_set_epi32(1 << 31, 1, 1, 1); assert_eq_m128i(r, 
e); } @@ -45796,9 +45696,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm_mask_ror_epi32() { let a = _mm_set_epi32(1 << 0, 2, 2, 2); - let r = _mm_mask_ror_epi32(a, 0, a, 1); + let r = _mm_mask_ror_epi32::<1>(a, 0, a); assert_eq_m128i(r, a); - let r = _mm_mask_ror_epi32(a, 0b00001111, a, 1); + let r = _mm_mask_ror_epi32::<1>(a, 0b00001111, a); let e = _mm_set_epi32(1 << 31, 1, 1, 1); assert_eq_m128i(r, e); } @@ -45806,9 +45706,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm_maskz_ror_epi32() { let a = _mm_set_epi32(1 << 0, 2, 2, 2); - let r = _mm_maskz_ror_epi32(0, a, 1); + let r = _mm_maskz_ror_epi32::<1>(0, a); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_ror_epi32(0b00001111, a, 1); + let r = _mm_maskz_ror_epi32::<1>(0b00001111, a); let e = _mm_set_epi32(1 << 31, 1, 1, 1); assert_eq_m128i(r, e); } @@ -46664,9 +46564,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_mask_srai_epi32() { let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0); - let r = _mm256_mask_srai_epi32(a, 0, a, 1); + let r = _mm256_mask_srai_epi32::<1>(a, 0, a); assert_eq_m256i(r, a); - let r = _mm256_mask_srai_epi32(a, 0b11111111, a, 1); + let r = _mm256_mask_srai_epi32::<1>(a, 0b11111111, a); let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0); assert_eq_m256i(r, e); } @@ -46674,9 +46574,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_maskz_srai_epi32() { let a = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0); - let r = _mm256_maskz_srai_epi32(0, a, 1); + let r = _mm256_maskz_srai_epi32::<1>(0, a); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_srai_epi32(0b11111111, a, 1); + let r = _mm256_maskz_srai_epi32::<1>(0b11111111, a); let e = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0); assert_eq_m256i(r, e); } @@ -46684,9 +46584,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm_mask_srai_epi32() { let a = 
_mm_set_epi32(1 << 5, 0, 0, 0); - let r = _mm_mask_srai_epi32(a, 0, a, 1); + let r = _mm_mask_srai_epi32::<1>(a, 0, a); assert_eq_m128i(r, a); - let r = _mm_mask_srai_epi32(a, 0b00001111, a, 1); + let r = _mm_mask_srai_epi32::<1>(a, 0b00001111, a); let e = _mm_set_epi32(1 << 4, 0, 0, 0); assert_eq_m128i(r, e); } @@ -46694,9 +46594,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm_maskz_srai_epi32() { let a = _mm_set_epi32(1 << 5, 0, 0, 0); - let r = _mm_maskz_srai_epi32(0, a, 1); + let r = _mm_maskz_srai_epi32::<1>(0, a); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_srai_epi32(0b00001111, a, 1); + let r = _mm_maskz_srai_epi32::<1>(0b00001111, a); let e = _mm_set_epi32(1 << 4, 0, 0, 0); assert_eq_m128i(r, e); } diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index af62b2112c..84eab28e34 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -7627,7 +7627,7 @@ mod tests { 1 << 63, 1 << 32, 1 << 32, 1 << 32, 1 << 32, 1 << 32, 1 << 32, 1 << 32, ); - let r = _mm512_rol_epi64(a, 1); + let r = _mm512_rol_epi64::<1>(a); #[rustfmt::skip] let e = _mm512_set_epi64( 1 << 0, 1 << 33, 1 << 33, 1 << 33, @@ -7643,9 +7643,9 @@ mod tests { 1 << 63, 1 << 32, 1 << 32, 1 << 32, 1 << 32, 1 << 32, 1 << 32, 1 << 32, ); - let r = _mm512_mask_rol_epi64(a, 0, a, 1); + let r = _mm512_mask_rol_epi64::<1>(a, 0, a); assert_eq_m512i(r, a); - let r = _mm512_mask_rol_epi64(a, 0b11111111, a, 1); + let r = _mm512_mask_rol_epi64::<1>(a, 0b11111111, a); #[rustfmt::skip] let e = _mm512_set_epi64( 1 << 0, 1 << 33, 1 << 33, 1 << 33, @@ -7661,9 +7661,9 @@ mod tests { 1 << 32, 1 << 32, 1 << 32, 1 << 32, 1 << 32, 1 << 32, 1 << 32, 1 << 63, ); - let r = _mm512_maskz_rol_epi64(0, a, 1); + let r = _mm512_maskz_rol_epi64::<1>(0, a); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_rol_epi64(0b00001111, a, 1); + let r = _mm512_maskz_rol_epi64::<1>(0b00001111, a); 
let e = _mm512_set_epi64(0, 0, 0, 0, 1 << 33, 1 << 33, 1 << 33, 1 << 0); assert_eq_m512i(r, e); } @@ -7671,7 +7671,7 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_rol_epi64() { let a = _mm256_set_epi64x(1 << 63, 1 << 32, 1 << 32, 1 << 32); - let r = _mm256_rol_epi64(a, 1); + let r = _mm256_rol_epi64::<1>(a); let e = _mm256_set_epi64x(1 << 0, 1 << 33, 1 << 33, 1 << 33); assert_eq_m256i(r, e); } @@ -7679,9 +7679,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_mask_rol_epi64() { let a = _mm256_set_epi64x(1 << 63, 1 << 32, 1 << 32, 1 << 32); - let r = _mm256_mask_rol_epi64(a, 0, a, 1); + let r = _mm256_mask_rol_epi64::<1>(a, 0, a); assert_eq_m256i(r, a); - let r = _mm256_mask_rol_epi64(a, 0b00001111, a, 1); + let r = _mm256_mask_rol_epi64::<1>(a, 0b00001111, a); let e = _mm256_set_epi64x(1 << 0, 1 << 33, 1 << 33, 1 << 33); assert_eq_m256i(r, e); } @@ -7689,9 +7689,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_maskz_rol_epi64() { let a = _mm256_set_epi64x(1 << 63, 1 << 32, 1 << 32, 1 << 32); - let r = _mm256_maskz_rol_epi64(0, a, 1); + let r = _mm256_maskz_rol_epi64::<1>(0, a); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_rol_epi64(0b00001111, a, 1); + let r = _mm256_maskz_rol_epi64::<1>(0b00001111, a); let e = _mm256_set_epi64x(1 << 0, 1 << 33, 1 << 33, 1 << 33); assert_eq_m256i(r, e); } @@ -7699,7 +7699,7 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm_rol_epi64() { let a = _mm_set_epi64x(1 << 63, 1 << 32); - let r = _mm_rol_epi64(a, 1); + let r = _mm_rol_epi64::<1>(a); let e = _mm_set_epi64x(1 << 0, 1 << 33); assert_eq_m128i(r, e); } @@ -7707,9 +7707,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm_mask_rol_epi64() { let a = _mm_set_epi64x(1 << 63, 1 << 32); - let r = _mm_mask_rol_epi64(a, 0, a, 1); + let r = _mm_mask_rol_epi64::<1>(a, 0, a); assert_eq_m128i(r, a); - let r = 
_mm_mask_rol_epi64(a, 0b00000011, a, 1); + let r = _mm_mask_rol_epi64::<1>(a, 0b00000011, a); let e = _mm_set_epi64x(1 << 0, 1 << 33); assert_eq_m128i(r, e); } @@ -7717,9 +7717,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm_maskz_rol_epi64() { let a = _mm_set_epi64x(1 << 63, 1 << 32); - let r = _mm_maskz_rol_epi64(0, a, 1); + let r = _mm_maskz_rol_epi64::<1>(0, a); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_rol_epi64(0b00000011, a, 1); + let r = _mm_maskz_rol_epi64::<1>(0b00000011, a); let e = _mm_set_epi64x(1 << 0, 1 << 33); assert_eq_m128i(r, e); } @@ -7731,7 +7731,7 @@ mod tests { 1 << 0, 1 << 32, 1 << 32, 1 << 32, 1 << 32, 1 << 32, 1 << 32, 1 << 32, ); - let r = _mm512_ror_epi64(a, 1); + let r = _mm512_ror_epi64::<1>(a); #[rustfmt::skip] let e = _mm512_set_epi64( 1 << 63, 1 << 31, 1 << 31, 1 << 31, @@ -7747,9 +7747,9 @@ mod tests { 1 << 0, 1 << 32, 1 << 32, 1 << 32, 1 << 32, 1 << 32, 1 << 32, 1 << 32, ); - let r = _mm512_mask_ror_epi64(a, 0, a, 1); + let r = _mm512_mask_ror_epi64::<1>(a, 0, a); assert_eq_m512i(r, a); - let r = _mm512_mask_ror_epi64(a, 0b11111111, a, 1); + let r = _mm512_mask_ror_epi64::<1>(a, 0b11111111, a); #[rustfmt::skip] let e = _mm512_set_epi64( 1 << 63, 1 << 31, 1 << 31, 1 << 31, @@ -7765,9 +7765,9 @@ mod tests { 1 << 32, 1 << 32, 1 << 32, 1 << 32, 1 << 32, 1 << 32, 1 << 32, 1 << 0, ); - let r = _mm512_maskz_ror_epi64(0, a, 1); + let r = _mm512_maskz_ror_epi64::<1>(0, a); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_ror_epi64(0b00001111, a, 1); + let r = _mm512_maskz_ror_epi64::<1>(0b00001111, a); let e = _mm512_set_epi64(0, 0, 0, 0, 1 << 31, 1 << 31, 1 << 31, 1 << 63); assert_eq_m512i(r, e); } @@ -7775,7 +7775,7 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_ror_epi64() { let a = _mm256_set_epi64x(1 << 0, 1 << 32, 1 << 32, 1 << 32); - let r = _mm256_ror_epi64(a, 1); + let r = _mm256_ror_epi64::<1>(a); let e = _mm256_set_epi64x(1 << 63, 1 
<< 31, 1 << 31, 1 << 31); assert_eq_m256i(r, e); } @@ -7783,9 +7783,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_mask_ror_epi64() { let a = _mm256_set_epi64x(1 << 0, 1 << 32, 1 << 32, 1 << 32); - let r = _mm256_mask_ror_epi64(a, 0, a, 1); + let r = _mm256_mask_ror_epi64::<1>(a, 0, a); assert_eq_m256i(r, a); - let r = _mm256_mask_ror_epi64(a, 0b00001111, a, 1); + let r = _mm256_mask_ror_epi64::<1>(a, 0b00001111, a); let e = _mm256_set_epi64x(1 << 63, 1 << 31, 1 << 31, 1 << 31); assert_eq_m256i(r, e); } @@ -7793,9 +7793,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm256_maskz_ror_epi64() { let a = _mm256_set_epi64x(1 << 0, 1 << 32, 1 << 32, 1 << 32); - let r = _mm256_maskz_ror_epi64(0, a, 1); + let r = _mm256_maskz_ror_epi64::<1>(0, a); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_ror_epi64(0b00001111, a, 1); + let r = _mm256_maskz_ror_epi64::<1>(0b00001111, a); let e = _mm256_set_epi64x(1 << 63, 1 << 31, 1 << 31, 1 << 31); assert_eq_m256i(r, e); } @@ -7803,7 +7803,7 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm_ror_epi64() { let a = _mm_set_epi64x(1 << 0, 1 << 32); - let r = _mm_ror_epi64(a, 1); + let r = _mm_ror_epi64::<1>(a); let e = _mm_set_epi64x(1 << 63, 1 << 31); assert_eq_m128i(r, e); } @@ -7811,9 +7811,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm_mask_ror_epi64() { let a = _mm_set_epi64x(1 << 0, 1 << 32); - let r = _mm_mask_ror_epi64(a, 0, a, 1); + let r = _mm_mask_ror_epi64::<1>(a, 0, a); assert_eq_m128i(r, a); - let r = _mm_mask_ror_epi64(a, 0b00000011, a, 1); + let r = _mm_mask_ror_epi64::<1>(a, 0b00000011, a); let e = _mm_set_epi64x(1 << 63, 1 << 31); assert_eq_m128i(r, e); } @@ -7821,9 +7821,9 @@ mod tests { #[simd_test(enable = "avx512f,avx512vl")] unsafe fn test_mm_maskz_ror_epi64() { let a = _mm_set_epi64x(1 << 0, 1 << 32); - let r = _mm_maskz_ror_epi64(0, a, 1); + let r = _mm_maskz_ror_epi64::<1>(0, a); 
assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_ror_epi64(0b00000011, a, 1); + let r = _mm_maskz_ror_epi64::<1>(0b00000011, a); let e = _mm_set_epi64x(1 << 63, 1 << 31); assert_eq_m128i(r, e); } From 2cbe7d9f3ef9815c885674f5434a3c2b12fc3ccd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20Mi=C4=85sko?= Date: Wed, 3 Mar 2021 00:00:00 +0000 Subject: [PATCH 045/123] Convert _mm256_insert_epi64 to const generics --- crates/core_arch/src/x86/avx2.rs | 6 +++--- crates/core_arch/src/x86/test.rs | 6 ++++-- crates/core_arch/src/x86_64/avx.rs | 15 +++++---------- 3 files changed, 12 insertions(+), 15 deletions(-) diff --git a/crates/core_arch/src/x86/avx2.rs b/crates/core_arch/src/x86/avx2.rs index c98c1d8005..785b0fe9bb 100644 --- a/crates/core_arch/src/x86/avx2.rs +++ b/crates/core_arch/src/x86/avx2.rs @@ -4484,7 +4484,7 @@ mod tests { let a = _mm256_setr_epi64x(0, 1, 2, 3); let b = _mm256_setr_epi64x(3, 2, 2, 0); let r = _mm256_cmpeq_epi64(a, b); - assert_eq_m256i(r, _mm256_insert_epi64(_mm256_set1_epi64x(0), !0, 2)); + assert_eq_m256i(r, _mm256_insert_epi64::<2>(_mm256_set1_epi64x(0), !0)); } #[simd_test(enable = "avx2")] @@ -4513,10 +4513,10 @@ mod tests { #[simd_test(enable = "avx2")] unsafe fn test_mm256_cmpgt_epi64() { - let a = _mm256_insert_epi64(_mm256_set1_epi64x(0), 5, 0); + let a = _mm256_insert_epi64::<0>(_mm256_set1_epi64x(0), 5); let b = _mm256_set1_epi64x(0); let r = _mm256_cmpgt_epi64(a, b); - assert_eq_m256i(r, _mm256_insert_epi64(_mm256_set1_epi64x(0), !0, 0)); + assert_eq_m256i(r, _mm256_insert_epi64::<0>(_mm256_set1_epi64x(0), !0)); } #[simd_test(enable = "avx2")] diff --git a/crates/core_arch/src/x86/test.rs b/crates/core_arch/src/x86/test.rs index 0784e37524..9f577972fa 100644 --- a/crates/core_arch/src/x86/test.rs +++ b/crates/core_arch/src/x86/test.rs @@ -104,14 +104,16 @@ mod x86_polyfill { } #[target_feature(enable = "avx2")] - pub unsafe fn _mm256_insert_epi64(a: __m256i, val: i64, idx: i32) -> __m256i { + 
#[rustc_legacy_const_generics(2)] + pub unsafe fn _mm256_insert_epi64(a: __m256i, val: i64) -> __m256i { + static_assert_imm2!(INDEX); #[repr(C)] union A { a: __m256i, b: [i64; 4], } let mut a = A { a }; - a.b[idx as usize] = val; + a.b[INDEX as usize] = val; a.a } } diff --git a/crates/core_arch/src/x86_64/avx.rs b/crates/core_arch/src/x86_64/avx.rs index fd82367714..7ba26371c6 100644 --- a/crates/core_arch/src/x86_64/avx.rs +++ b/crates/core_arch/src/x86_64/avx.rs @@ -23,18 +23,13 @@ use crate::{ /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_insert_epi64) #[inline] -#[rustc_args_required_const(2)] +#[rustc_legacy_const_generics(2)] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_insert_epi64(a: __m256i, i: i64, index: i32) -> __m256i { - let a = a.as_i64x4(); - match index & 3 { - 0 => transmute(simd_insert(a, 0, i)), - 1 => transmute(simd_insert(a, 1, i)), - 2 => transmute(simd_insert(a, 2, i)), - _ => transmute(simd_insert(a, 3, i)), - } +pub unsafe fn _mm256_insert_epi64(a: __m256i, i: i64) -> __m256i { + static_assert_imm2!(INDEX); + transmute(simd_insert(a.as_i64x4(), INDEX as u32, i)) } #[cfg(test)] @@ -46,7 +41,7 @@ mod tests { #[simd_test(enable = "avx")] unsafe fn test_mm256_insert_epi64() { let a = _mm256_setr_epi64x(1, 2, 3, 4); - let r = _mm256_insert_epi64(a, 0, 3); + let r = _mm256_insert_epi64::<3>(a, 0); let e = _mm256_setr_epi64x(1, 2, 3, 0); assert_eq_m256i(r, e); } From ca5fed4042c9958180b7e4d609697e827e3f54b3 Mon Sep 17 00:00:00 2001 From: SparrowLii Date: Thu, 4 Mar 2021 21:21:46 +0800 Subject: [PATCH 046/123] Modify stdarch-gen to generate instructions with a single parameter and add vceqz instructions --- .../core_arch/src/aarch64/neon/generated.rs | 336 ++++++++++++++++-- crates/core_arch/src/arm/neon/generated.rs | 96 ++--- crates/core_arch/src/macros.rs | 4 +- 
crates/stdarch-gen/neon.spec | 20 ++ crates/stdarch-gen/src/main.rs | 116 ++++-- 5 files changed, 472 insertions(+), 100 deletions(-) diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs index db76c8721d..a74aa578d8 100644 --- a/crates/core_arch/src/aarch64/neon/generated.rs +++ b/crates/core_arch/src/aarch64/neon/generated.rs @@ -73,6 +73,150 @@ pub unsafe fn vceqq_f64(a: float64x2_t, b: float64x2_t) -> uint64x2_t { simd_eq(a, b) } +/// Signed Compare bitwise equal to zero +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(vceqz))] +pub unsafe fn vceqz_s8(a: int8x8_t) -> uint8x8_t { + simd_eq(a, int8x8_t(0, 0, 0, 0, 0, 0, 0, 0)) +} + +/// Signed Compare bitwise equal to zero +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(vceqz))] +pub unsafe fn vceqzq_s8(a: int8x16_t) -> uint8x16_t { + simd_eq(a, int8x16_t(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)) +} + +/// Signed Compare bitwise equal to zero +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(vceqz))] +pub unsafe fn vceqz_s16(a: int16x4_t) -> uint16x4_t { + simd_eq(a, int16x4_t(0, 0, 0, 0)) +} + +/// Signed Compare bitwise equal to zero +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(vceqz))] +pub unsafe fn vceqzq_s16(a: int16x8_t) -> uint16x8_t { + simd_eq(a, int16x8_t(0, 0, 0, 0, 0, 0, 0, 0)) +} + +/// Signed Compare bitwise equal to zero +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(vceqz))] +pub unsafe fn vceqz_s32(a: int32x2_t) -> uint32x2_t { + simd_eq(a, int32x2_t(0, 0)) +} + +/// Signed Compare bitwise equal to zero +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(vceqz))] +pub unsafe fn vceqzq_s32(a: int32x4_t) -> uint32x4_t { + simd_eq(a, int32x4_t(0, 0, 0, 0)) +} + +/// Signed Compare bitwise equal to zero +#[inline] +#[target_feature(enable = "neon")] 
+#[cfg_attr(test, assert_instr(vceqz))] +pub unsafe fn vceqz_s64(a: int64x1_t) -> uint64x1_t { + simd_eq(a, int64x1_t(0)) +} + +/// Signed Compare bitwise equal to zero +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(vceqz))] +pub unsafe fn vceqzq_s64(a: int64x2_t) -> uint64x2_t { + simd_eq(a, int64x2_t(0, 0)) +} + +/// Compare bitwise equal to zero +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(vceqz))] +pub unsafe fn vceqz_p64(a: poly64x1_t) -> uint64x1_t { + simd_eq(a, poly64x1_t(0)) +} + +/// Compare bitwise equal to zero +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(vceqz))] +pub unsafe fn vceqzq_p64(a: poly64x2_t) -> uint64x2_t { + simd_eq(a, poly64x2_t(0, 0)) +} + +/// Unsigned Compare bitwise equal to zero +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(vceqz))] +pub unsafe fn vceqz_u8(a: uint8x8_t) -> uint8x8_t { + simd_eq(a, uint8x8_t(0, 0, 0, 0, 0, 0, 0, 0)) +} + +/// Unsigned Compare bitwise equal to zero +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(vceqz))] +pub unsafe fn vceqzq_u8(a: uint8x16_t) -> uint8x16_t { + simd_eq(a, uint8x16_t(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)) +} + +/// Unsigned Compare bitwise equal to zero +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(vceqz))] +pub unsafe fn vceqz_u16(a: uint16x4_t) -> uint16x4_t { + simd_eq(a, uint16x4_t(0, 0, 0, 0)) +} + +/// Unsigned Compare bitwise equal to zero +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(vceqz))] +pub unsafe fn vceqzq_u16(a: uint16x8_t) -> uint16x8_t { + simd_eq(a, uint16x8_t(0, 0, 0, 0, 0, 0, 0, 0)) +} + +/// Unsigned Compare bitwise equal to zero +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(vceqz))] +pub unsafe fn vceqz_u32(a: uint32x2_t) -> uint32x2_t { + simd_eq(a, uint32x2_t(0, 0)) +} + +/// Unsigned 
Compare bitwise equal to zero +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(vceqz))] +pub unsafe fn vceqzq_u32(a: uint32x4_t) -> uint32x4_t { + simd_eq(a, uint32x4_t(0, 0, 0, 0)) +} + +/// Unsigned Compare bitwise equal to zero +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(vceqz))] +pub unsafe fn vceqz_u64(a: uint64x1_t) -> uint64x1_t { + simd_eq(a, uint64x1_t(0)) +} + +/// Unsigned Compare bitwise equal to zero +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(vceqz))] +pub unsafe fn vceqzq_u64(a: uint64x2_t) -> uint64x2_t { + simd_eq(a, uint64x2_t(0, 0)) +} + /// Compare signed greater than #[inline] #[target_feature(enable = "neon")] @@ -358,14 +502,14 @@ mod test { #[simd_test(enable = "neon")] unsafe fn test_vceq_u64() { - let a: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF); - let b: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF); + let a: u64x1 = u64x1::new(0); + let b: u64x1 = u64x1::new(0); let e: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF); let r: u64x1 = transmute(vceq_u64(transmute(a), transmute(b))); assert_eq!(r, e); - let a: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF); - let b: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF); + let a: u64x1 = u64x1::new(0); + let b: u64x1 = u64x1::new(0); let e: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF); let r: u64x1 = transmute(vceq_u64(transmute(a), transmute(b))); assert_eq!(r, e); @@ -373,14 +517,14 @@ mod test { #[simd_test(enable = "neon")] unsafe fn test_vceqq_u64() { - let a: u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 0x01); - let b: u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 0x01); + let a: u64x2 = u64x2::new(0, 0x01); + let b: u64x2 = u64x2::new(0, 0x01); let e: u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 0xFF_FF_FF_FF_FF_FF_FF_FF); let r: u64x2 = transmute(vceqq_u64(transmute(a), transmute(b))); assert_eq!(r, e); - let a: u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 0xFF_FF_FF_FF_FF_FF_FF_FF); - let b: 
u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 0); + let a: u64x2 = u64x2::new(0, 0); + let b: u64x2 = u64x2::new(0, 0xFF_FF_FF_FF_FF_FF_FF_FF); let e: u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 0); let r: u64x2 = transmute(vceqq_u64(transmute(a), transmute(b))); assert_eq!(r, e); @@ -388,14 +532,14 @@ mod test { #[simd_test(enable = "neon")] unsafe fn test_vceq_s64() { - let a: i64x1 = i64x1::new(0x7F_FF_FF_FF_FF_FF_FF_FF); - let b: i64x1 = i64x1::new(0x7F_FF_FF_FF_FF_FF_FF_FF); + let a: i64x1 = i64x1::new(-9223372036854775808); + let b: i64x1 = i64x1::new(-9223372036854775808); let e: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF); let r: u64x1 = transmute(vceq_s64(transmute(a), transmute(b))); assert_eq!(r, e); - let a: i64x1 = i64x1::new(0x7F_FF_FF_FF_FF_FF_FF_FF); - let b: i64x1 = i64x1::new(0x7F_FF_FF_FF_FF_FF_FF_FF); + let a: i64x1 = i64x1::new(-9223372036854775808); + let b: i64x1 = i64x1::new(-9223372036854775808); let e: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF); let r: u64x1 = transmute(vceq_s64(transmute(a), transmute(b))); assert_eq!(r, e); @@ -403,14 +547,14 @@ mod test { #[simd_test(enable = "neon")] unsafe fn test_vceqq_s64() { - let a: i64x2 = i64x2::new(0x7F_FF_FF_FF_FF_FF_FF_FF, 0x01); - let b: i64x2 = i64x2::new(0x7F_FF_FF_FF_FF_FF_FF_FF, 0x01); + let a: i64x2 = i64x2::new(-9223372036854775808, 0x01); + let b: i64x2 = i64x2::new(-9223372036854775808, 0x01); let e: u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 0xFF_FF_FF_FF_FF_FF_FF_FF); let r: u64x2 = transmute(vceqq_s64(transmute(a), transmute(b))); assert_eq!(r, e); - let a: i64x2 = i64x2::new(0x7F_FF_FF_FF_FF_FF_FF_FF, 0x7F_FF_FF_FF_FF_FF_FF_FF); - let b: i64x2 = i64x2::new(0x7F_FF_FF_FF_FF_FF_FF_FF, -9223372036854775808); + let a: i64x2 = i64x2::new(-9223372036854775808, -9223372036854775808); + let b: i64x2 = i64x2::new(-9223372036854775808, 0x7F_FF_FF_FF_FF_FF_FF_FF); let e: u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 0); let r: u64x2 = transmute(vceqq_s64(transmute(a), 
transmute(b))); assert_eq!(r, e); @@ -418,14 +562,14 @@ mod test { #[simd_test(enable = "neon")] unsafe fn test_vceq_p64() { - let a: i64x1 = i64x1::new(0x7F_FF_FF_FF_FF_FF_FF_FF); - let b: i64x1 = i64x1::new(0x7F_FF_FF_FF_FF_FF_FF_FF); + let a: i64x1 = i64x1::new(-9223372036854775808); + let b: i64x1 = i64x1::new(-9223372036854775808); let e: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF); let r: u64x1 = transmute(vceq_p64(transmute(a), transmute(b))); assert_eq!(r, e); - let a: i64x1 = i64x1::new(0x7F_FF_FF_FF_FF_FF_FF_FF); - let b: i64x1 = i64x1::new(0x7F_FF_FF_FF_FF_FF_FF_FF); + let a: i64x1 = i64x1::new(-9223372036854775808); + let b: i64x1 = i64x1::new(-9223372036854775808); let e: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF); let r: u64x1 = transmute(vceq_p64(transmute(a), transmute(b))); assert_eq!(r, e); @@ -433,14 +577,14 @@ mod test { #[simd_test(enable = "neon")] unsafe fn test_vceqq_p64() { - let a: i64x2 = i64x2::new(0x7F_FF_FF_FF_FF_FF_FF_FF, 0x01); - let b: i64x2 = i64x2::new(0x7F_FF_FF_FF_FF_FF_FF_FF, 0x01); + let a: i64x2 = i64x2::new(-9223372036854775808, 0x01); + let b: i64x2 = i64x2::new(-9223372036854775808, 0x01); let e: u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 0xFF_FF_FF_FF_FF_FF_FF_FF); let r: u64x2 = transmute(vceqq_p64(transmute(a), transmute(b))); assert_eq!(r, e); - let a: i64x2 = i64x2::new(0x7F_FF_FF_FF_FF_FF_FF_FF, 0x7F_FF_FF_FF_FF_FF_FF_FF); - let b: i64x2 = i64x2::new(0x7F_FF_FF_FF_FF_FF_FF_FF, -9223372036854775808); + let a: i64x2 = i64x2::new(-9223372036854775808, -9223372036854775808); + let b: i64x2 = i64x2::new(-9223372036854775808, 0x7F_FF_FF_FF_FF_FF_FF_FF); let e: u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 0); let r: u64x2 = transmute(vceqq_p64(transmute(a), transmute(b))); assert_eq!(r, e); @@ -464,6 +608,150 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vceqz_s8() { + let a: i8x8 = i8x8::new(-128, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06); + let e: u8x8 = u8x8::new(0, 0xFF, 0, 
0, 0, 0, 0, 0); + let r: u8x8 = transmute(vceqz_s8(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vceqzq_s8() { + let a: i8x16 = i8x16::new(-128, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x7F); + let e: u8x16 = u8x16::new(0, 0xFF, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + let r: u8x16 = transmute(vceqzq_s8(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vceqz_s16() { + let a: i16x4 = i16x4::new(-32768, 0x00, 0x01, 0x02); + let e: u16x4 = u16x4::new(0, 0xFF_FF, 0, 0); + let r: u16x4 = transmute(vceqz_s16(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vceqzq_s16() { + let a: i16x8 = i16x8::new(-32768, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06); + let e: u16x8 = u16x8::new(0, 0xFF_FF, 0, 0, 0, 0, 0, 0); + let r: u16x8 = transmute(vceqzq_s16(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vceqz_s32() { + let a: i32x2 = i32x2::new(-2147483648, 0x00); + let e: u32x2 = u32x2::new(0, 0xFF_FF_FF_FF); + let r: u32x2 = transmute(vceqz_s32(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vceqzq_s32() { + let a: i32x4 = i32x4::new(-2147483648, 0x00, 0x01, 0x02); + let e: u32x4 = u32x4::new(0, 0xFF_FF_FF_FF, 0, 0); + let r: u32x4 = transmute(vceqzq_s32(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vceqz_s64() { + let a: i64x1 = i64x1::new(-9223372036854775808); + let e: u64x1 = u64x1::new(0); + let r: u64x1 = transmute(vceqz_s64(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vceqzq_s64() { + let a: i64x2 = i64x2::new(-9223372036854775808, 0x00); + let e: u64x2 = u64x2::new(0, 0xFF_FF_FF_FF_FF_FF_FF_FF); + let r: u64x2 = transmute(vceqzq_s64(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn 
test_vceqz_p64() { + let a: i64x1 = i64x1::new(-9223372036854775808); + let e: u64x1 = u64x1::new(0); + let r: u64x1 = transmute(vceqz_p64(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vceqzq_p64() { + let a: i64x2 = i64x2::new(-9223372036854775808, 0x00); + let e: u64x2 = u64x2::new(0, 0xFF_FF_FF_FF_FF_FF_FF_FF); + let r: u64x2 = transmute(vceqzq_p64(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vceqz_u8() { + let a: u8x8 = u8x8::new(0, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06); + let e: u8x8 = u8x8::new(0xFF, 0xFF, 0, 0, 0, 0, 0, 0); + let r: u8x8 = transmute(vceqz_u8(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vceqzq_u8() { + let a: u8x16 = u8x16::new(0, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0xFF); + let e: u8x16 = u8x16::new(0xFF, 0xFF, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + let r: u8x16 = transmute(vceqzq_u8(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vceqz_u16() { + let a: u16x4 = u16x4::new(0, 0x00, 0x01, 0x02); + let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0, 0); + let r: u16x4 = transmute(vceqz_u16(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vceqzq_u16() { + let a: u16x8 = u16x8::new(0, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06); + let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0, 0, 0, 0, 0, 0); + let r: u16x8 = transmute(vceqzq_u16(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vceqz_u32() { + let a: u32x2 = u32x2::new(0, 0x00); + let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF); + let r: u32x2 = transmute(vceqz_u32(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vceqzq_u32() { + let a: u32x4 = u32x4::new(0, 0x00, 0x01, 0x02); + let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0, 
0); + let r: u32x4 = transmute(vceqzq_u32(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vceqz_u64() { + let a: u64x1 = u64x1::new(0); + let e: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF); + let r: u64x1 = transmute(vceqz_u64(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vceqzq_u64() { + let a: u64x2 = u64x2::new(0, 0x00); + let e: u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 0xFF_FF_FF_FF_FF_FF_FF_FF); + let r: u64x2 = transmute(vceqzq_u64(transmute(a))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vcgt_s64() { let a: i64x1 = i64x1::new(1); diff --git a/crates/core_arch/src/arm/neon/generated.rs b/crates/core_arch/src/arm/neon/generated.rs index c60ad9cc50..da8fcefcf6 100644 --- a/crates/core_arch/src/arm/neon/generated.rs +++ b/crates/core_arch/src/arm/neon/generated.rs @@ -3455,14 +3455,14 @@ mod test { #[simd_test(enable = "neon")] unsafe fn test_vceq_u8() { - let a: u8x8 = u8x8::new(0xFF, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let b: u8x8 = u8x8::new(0xFF, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let a: u8x8 = u8x8::new(0, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let b: u8x8 = u8x8::new(0, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); let r: u8x8 = transmute(vceq_u8(transmute(a), transmute(b))); assert_eq!(r, e); - let a: u8x8 = u8x8::new(0xFF, 0xFF, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let b: u8x8 = u8x8::new(0xFF, 0, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08); + let a: u8x8 = u8x8::new(0, 0, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let b: u8x8 = u8x8::new(0, 0xFF, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08); let e: u8x8 = u8x8::new(0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0); let r: u8x8 = transmute(vceq_u8(transmute(a), transmute(b))); assert_eq!(r, e); @@ -3470,14 +3470,14 @@ mod test { #[simd_test(enable = "neon")] unsafe fn test_vceqq_u8() { - let a: u8x16 = u8x16::new(0xFF, 
0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0); - let b: u8x16 = u8x16::new(0xFF, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0); + let a: u8x16 = u8x16::new(0, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0xFF); + let b: u8x16 = u8x16::new(0, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0xFF); let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); let r: u8x16 = transmute(vceqq_u8(transmute(a), transmute(b))); assert_eq!(r, e); - let a: u8x16 = u8x16::new(0xFF, 0xFF, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xCC, 0x0D, 0xEE, 0); - let b: u8x16 = u8x16::new(0xFF, 0, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08, 0x08, 0x00, 0x0A, 0x0A, 0xCC, 0xD0, 0xEE, 0xFF); + let a: u8x16 = u8x16::new(0, 0, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xCC, 0x0D, 0xEE, 0xFF); + let b: u8x16 = u8x16::new(0, 0xFF, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08, 0x08, 0x00, 0x0A, 0x0A, 0xCC, 0xD0, 0xEE, 0); let e: u8x16 = u8x16::new(0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0); let r: u8x16 = transmute(vceqq_u8(transmute(a), transmute(b))); assert_eq!(r, e); @@ -3485,14 +3485,14 @@ mod test { #[simd_test(enable = "neon")] unsafe fn test_vceq_u16() { - let a: u16x4 = u16x4::new(0xFF_FF, 0x01, 0x02, 0x03); - let b: u16x4 = u16x4::new(0xFF_FF, 0x01, 0x02, 0x03); + let a: u16x4 = u16x4::new(0, 0x01, 0x02, 0x03); + let b: u16x4 = u16x4::new(0, 0x01, 0x02, 0x03); let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF); let r: u16x4 = transmute(vceq_u16(transmute(a), transmute(b))); assert_eq!(r, e); - let a: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0x02, 0x03); - let b: u16x4 = u16x4::new(0xFF_FF, 0, 0x02, 0x04); + let a: u16x4 = u16x4::new(0, 0, 0x02, 0x03); + let b: u16x4 = u16x4::new(0, 0xFF_FF, 0x02, 0x04); let e: u16x4 = 
u16x4::new(0xFF_FF, 0, 0xFF_FF, 0); let r: u16x4 = transmute(vceq_u16(transmute(a), transmute(b))); assert_eq!(r, e); @@ -3500,14 +3500,14 @@ mod test { #[simd_test(enable = "neon")] unsafe fn test_vceqq_u16() { - let a: u16x8 = u16x8::new(0xFF_FF, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let b: u16x8 = u16x8::new(0xFF_FF, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let a: u16x8 = u16x8::new(0, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let b: u16x8 = u16x8::new(0, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF); let r: u16x8 = transmute(vceqq_u16(transmute(a), transmute(b))); assert_eq!(r, e); - let a: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let b: u16x8 = u16x8::new(0xFF_FF, 0, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08); + let a: u16x8 = u16x8::new(0, 0, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let b: u16x8 = u16x8::new(0, 0xFF_FF, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08); let e: u16x8 = u16x8::new(0xFF_FF, 0, 0xFF_FF, 0, 0xFF_FF, 0, 0xFF_FF, 0); let r: u16x8 = transmute(vceqq_u16(transmute(a), transmute(b))); assert_eq!(r, e); @@ -3515,14 +3515,14 @@ mod test { #[simd_test(enable = "neon")] unsafe fn test_vceq_u32() { - let a: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0x01); - let b: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0x01); + let a: u32x2 = u32x2::new(0, 0x01); + let b: u32x2 = u32x2::new(0, 0x01); let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF); let r: u32x2 = transmute(vceq_u32(transmute(a), transmute(b))); assert_eq!(r, e); - let a: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF); - let b: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0); + let a: u32x2 = u32x2::new(0, 0); + let b: u32x2 = u32x2::new(0, 0xFF_FF_FF_FF); let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0); let r: u32x2 = transmute(vceq_u32(transmute(a), transmute(b))); assert_eq!(r, e); @@ -3530,14 +3530,14 @@ mod test { #[simd_test(enable = "neon")] unsafe fn test_vceqq_u32() { - let a: u32x4 
= u32x4::new(0xFF_FF_FF_FF, 0x01, 0x02, 0x03); - let b: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0x01, 0x02, 0x03); + let a: u32x4 = u32x4::new(0, 0x01, 0x02, 0x03); + let b: u32x4 = u32x4::new(0, 0x01, 0x02, 0x03); let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF); let r: u32x4 = transmute(vceqq_u32(transmute(a), transmute(b))); assert_eq!(r, e); - let a: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0x02, 0x03); - let b: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0, 0x02, 0x04); + let a: u32x4 = u32x4::new(0, 0, 0x02, 0x03); + let b: u32x4 = u32x4::new(0, 0xFF_FF_FF_FF, 0x02, 0x04); let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0, 0xFF_FF_FF_FF, 0); let r: u32x4 = transmute(vceqq_u32(transmute(a), transmute(b))); assert_eq!(r, e); @@ -3545,14 +3545,14 @@ mod test { #[simd_test(enable = "neon")] unsafe fn test_vceq_s8() { - let a: i8x8 = i8x8::new(0x7F, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let b: i8x8 = i8x8::new(0x7F, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let a: i8x8 = i8x8::new(-128, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let b: i8x8 = i8x8::new(-128, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); let r: u8x8 = transmute(vceq_s8(transmute(a), transmute(b))); assert_eq!(r, e); - let a: i8x8 = i8x8::new(0x7F, 0x7F, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let b: i8x8 = i8x8::new(0x7F, -128, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08); + let a: i8x8 = i8x8::new(-128, -128, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let b: i8x8 = i8x8::new(-128, 0x7F, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08); let e: u8x8 = u8x8::new(0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0); let r: u8x8 = transmute(vceq_s8(transmute(a), transmute(b))); assert_eq!(r, e); @@ -3560,14 +3560,14 @@ mod test { #[simd_test(enable = "neon")] unsafe fn test_vceqq_s8() { - let a: i8x16 = i8x16::new(0x7F, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, -128); - let b: i8x16 = i8x16::new(0x7F, 
0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, -128); + let a: i8x16 = i8x16::new(-128, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x7F); + let b: i8x16 = i8x16::new(-128, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x7F); let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); let r: u8x16 = transmute(vceqq_s8(transmute(a), transmute(b))); assert_eq!(r, e); - let a: i8x16 = i8x16::new(0x7F, 0x7F, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xCC, 0x0D, 0xEE, -128); - let b: i8x16 = i8x16::new(0x7F, -128, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08, 0x08, 0x00, 0x0A, 0x0A, 0xCC, 0xD0, 0xEE, 0x7F); + let a: i8x16 = i8x16::new(-128, -128, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xCC, 0x0D, 0xEE, 0x7F); + let b: i8x16 = i8x16::new(-128, 0x7F, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08, 0x08, 0x00, 0x0A, 0x0A, 0xCC, 0xD0, 0xEE, -128); let e: u8x16 = u8x16::new(0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0); let r: u8x16 = transmute(vceqq_s8(transmute(a), transmute(b))); assert_eq!(r, e); @@ -3575,14 +3575,14 @@ mod test { #[simd_test(enable = "neon")] unsafe fn test_vceq_s16() { - let a: i16x4 = i16x4::new(0x7F_FF, 0x01, 0x02, 0x03); - let b: i16x4 = i16x4::new(0x7F_FF, 0x01, 0x02, 0x03); + let a: i16x4 = i16x4::new(-32768, 0x01, 0x02, 0x03); + let b: i16x4 = i16x4::new(-32768, 0x01, 0x02, 0x03); let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF); let r: u16x4 = transmute(vceq_s16(transmute(a), transmute(b))); assert_eq!(r, e); - let a: i16x4 = i16x4::new(0x7F_FF, 0x7F_FF, 0x02, 0x03); - let b: i16x4 = i16x4::new(0x7F_FF, -32768, 0x02, 0x04); + let a: i16x4 = i16x4::new(-32768, -32768, 0x02, 0x03); + let b: i16x4 = i16x4::new(-32768, 0x7F_FF, 0x02, 0x04); let e: u16x4 = u16x4::new(0xFF_FF, 0, 0xFF_FF, 0); let r: u16x4 = 
transmute(vceq_s16(transmute(a), transmute(b))); assert_eq!(r, e); @@ -3590,14 +3590,14 @@ mod test { #[simd_test(enable = "neon")] unsafe fn test_vceqq_s16() { - let a: i16x8 = i16x8::new(0x7F_FF, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let b: i16x8 = i16x8::new(0x7F_FF, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let a: i16x8 = i16x8::new(-32768, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let b: i16x8 = i16x8::new(-32768, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF); let r: u16x8 = transmute(vceqq_s16(transmute(a), transmute(b))); assert_eq!(r, e); - let a: i16x8 = i16x8::new(0x7F_FF, 0x7F_FF, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let b: i16x8 = i16x8::new(0x7F_FF, -32768, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08); + let a: i16x8 = i16x8::new(-32768, -32768, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let b: i16x8 = i16x8::new(-32768, 0x7F_FF, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08); let e: u16x8 = u16x8::new(0xFF_FF, 0, 0xFF_FF, 0, 0xFF_FF, 0, 0xFF_FF, 0); let r: u16x8 = transmute(vceqq_s16(transmute(a), transmute(b))); assert_eq!(r, e); @@ -3605,14 +3605,14 @@ mod test { #[simd_test(enable = "neon")] unsafe fn test_vceq_s32() { - let a: i32x2 = i32x2::new(0x7F_FF_FF_FF, 0x01); - let b: i32x2 = i32x2::new(0x7F_FF_FF_FF, 0x01); + let a: i32x2 = i32x2::new(-2147483648, 0x01); + let b: i32x2 = i32x2::new(-2147483648, 0x01); let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF); let r: u32x2 = transmute(vceq_s32(transmute(a), transmute(b))); assert_eq!(r, e); - let a: i32x2 = i32x2::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF); - let b: i32x2 = i32x2::new(0x7F_FF_FF_FF, -2147483648); + let a: i32x2 = i32x2::new(-2147483648, -2147483648); + let b: i32x2 = i32x2::new(-2147483648, 0x7F_FF_FF_FF); let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0); let r: u32x2 = transmute(vceq_s32(transmute(a), transmute(b))); assert_eq!(r, e); @@ -3620,14 +3620,14 @@ mod test { #[simd_test(enable = "neon")] 
unsafe fn test_vceqq_s32() { - let a: i32x4 = i32x4::new(0x7F_FF_FF_FF, 0x01, 0x02, 0x03); - let b: i32x4 = i32x4::new(0x7F_FF_FF_FF, 0x01, 0x02, 0x03); + let a: i32x4 = i32x4::new(-2147483648, 0x01, 0x02, 0x03); + let b: i32x4 = i32x4::new(-2147483648, 0x01, 0x02, 0x03); let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF); let r: u32x4 = transmute(vceqq_s32(transmute(a), transmute(b))); assert_eq!(r, e); - let a: i32x4 = i32x4::new(0x7F_FF_FF_FF, 0x7F_FF_FF_FF, 0x02, 0x03); - let b: i32x4 = i32x4::new(0x7F_FF_FF_FF, -2147483648, 0x02, 0x04); + let a: i32x4 = i32x4::new(-2147483648, -2147483648, 0x02, 0x03); + let b: i32x4 = i32x4::new(-2147483648, 0x7F_FF_FF_FF, 0x02, 0x04); let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0, 0xFF_FF_FF_FF, 0); let r: u32x4 = transmute(vceqq_s32(transmute(a), transmute(b))); assert_eq!(r, e); diff --git a/crates/core_arch/src/macros.rs b/crates/core_arch/src/macros.rs index 87e49fba4b..5d05adfa6a 100644 --- a/crates/core_arch/src/macros.rs +++ b/crates/core_arch/src/macros.rs @@ -426,13 +426,13 @@ macro_rules! constify_imm3 { macro_rules! types { ($( $(#[$doc:meta])* - pub struct $name:ident($($fields:tt)*); + pub struct $name:ident($field:ty$(, $fields:ty)*$(,)?); )*) => ($( $(#[$doc])* #[derive(Copy, Clone, Debug)] #[allow(non_camel_case_types)] #[repr(simd)] #[allow(clippy::missing_inline_in_public_items)] - pub struct $name($($fields)*); + pub struct $name(pub $field$(, pub $fields)*); )*) } diff --git a/crates/stdarch-gen/neon.spec b/crates/stdarch-gen/neon.spec index 5c705c15db..7805df501b 100644 --- a/crates/stdarch-gen/neon.spec +++ b/crates/stdarch-gen/neon.spec @@ -136,6 +136,26 @@ arm = vceq. 
// we are missing float16x4_t:uint16x4_t, float16x8_t:uint16x8_t generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t +/// Signed Compare bitwise equal to zero +name = vceqz +fn = simd_eq +a = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX +fixed = 0 +validate FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE + +aarch64 = vceqz +generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t, poly64x1_t:uint64x1_t, poly64x2_t:uint64x2_t + +/// Unsigned Compare bitwise equal to zero +name = vceqz +fn = simd_eq +a = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX +fixed = 0 +validate TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE + +aarch64 = vceqz +generate uint*_t, uint64x*_t + //////////////////// // greater then //////////////////// diff --git a/crates/stdarch-gen/src/main.rs b/crates/stdarch-gen/src/main.rs index 6e341c00dd..7498ab93ff 100644 --- a/crates/stdarch-gen/src/main.rs +++ b/crates/stdarch-gen/src/main.rs @@ -282,8 +282,8 @@ fn map_val<'v>(t: &str, v: &'v str) -> &'v str { match v { "FALSE" => false_val(t), "TRUE" => true_val(t), - "MAX" => min_val(t), - "MIN" => max_val(t), + "MAX" => max_val(t), + "MIN" => min_val(t), "FF" => ff_val(t), o => o, } @@ -299,6 +299,8 @@ fn gen_aarch64( in_t: &str, out_t: &str, current_tests: &[(Vec, Vec, Vec)], + has_b: bool, + fixed: &Option, ) -> (String, String) { let _global_t = type_to_global_type(in_t); let _global_ret_t = type_to_global_type(out_t); @@ -333,20 +335,40 @@ fn gen_aarch64( } else { String::new() }; + let call = if has_b { + format!( + r#"pub unsafe fn {}(a: {}, b: {}) -> {} {{ + {}{}(a, b) +}}"#, + name, in_t, in_t, out_t, ext_c, current_fn, + ) + } else if let 
Some(fixed_val) = fixed { + let mut fixed_vals = fixed_val.clone(); + for _i in 1..type_len(in_t) { + fixed_vals.push_str(", "); + fixed_vals.push_str(fixed_val); + } + format!( + r#"pub unsafe fn {}(a: {}) -> {} {{ + {}{}(a, {}({})) +}}"#, + name, in_t, out_t, ext_c, current_fn, in_t, fixed_vals, + ) + } else { + String::new() + }; let function = format!( r#" {} #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr({}))] -pub unsafe fn {}(a: {}, b: {}) -> {} {{ - {}{}(a, b) -}} +{} "#, - current_comment, current_aarch64, name, in_t, in_t, out_t, ext_c, current_fn, + current_comment, current_aarch64, call ); - let test = gen_test(name, &in_t, &out_t, current_tests, type_len(in_t)); + let test = gen_test(name, &in_t, &out_t, current_tests, type_len(in_t), has_b); (function, test) } @@ -356,6 +378,7 @@ fn gen_test( out_t: &str, current_tests: &[(Vec, Vec, Vec)], len: usize, + has_b: bool, ) -> String { let mut test = format!( r#" @@ -367,20 +390,35 @@ fn gen_test( let a: Vec = a.iter().take(len).cloned().collect(); let b: Vec = b.iter().take(len).cloned().collect(); let e: Vec = e.iter().take(len).cloned().collect(); - let t = format!( - r#" + let t = if has_b { + format!( + r#" let a{}; let b{}; let e{}; let r: {} = transmute({}(transmute(a), transmute(b))); assert_eq!(r, e); "#, - values(in_t, &a), - values(in_t, &b), - values(out_t, &e), - type_to_global_type(out_t), - name - ); + values(in_t, &a), + values(in_t, &b), + values(out_t, &e), + type_to_global_type(out_t), + name + ) + } else { + format!( + r#" + let a{}; + let e{}; + let r: {} = transmute({}(transmute(a))); + assert_eq!(r, e); +"#, + values(in_t, &a), + values(out_t, &e), + type_to_global_type(out_t), + name + ) + }; test.push_str(&t); } test.push_str(" }\n"); @@ -399,6 +437,8 @@ fn gen_arm( in_t: &str, out_t: &str, current_tests: &[(Vec, Vec, Vec)], + has_b: bool, + fixed: &Option, ) -> (String, String) { let _global_t = type_to_global_type(in_t); let _global_ret_t = 
type_to_global_type(out_t); @@ -446,7 +486,28 @@ fn gen_arm( } else { String::new() }; - + let call = if has_b { + format!( + r#"pub unsafe fn {}(a: {}, b: {}) -> {} {{ + {}{}(a, b) +}}"#, + name, in_t, in_t, out_t, ext_c, current_fn, + ) + } else if let Some(fixed_val) = fixed { + let mut fixed_vals = fixed_val.clone(); + for _i in 1..type_len(in_t) { + fixed_vals.push_str(", "); + fixed_vals.push_str(fixed_val); + } + format!( + r#"pub unsafe fn {}(a: {}) -> {} {{ + {}{}(a, {}({})) +}}"#, + name, in_t, out_t, ext_c, current_fn, in_t, fixed_vals, + ) + } else { + String::new() + }; let function = format!( r#" {} @@ -455,21 +516,14 @@ fn gen_arm( #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr({}))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr({}))] -pub unsafe fn {}(a: {}, b: {}) -> {} {{ - {}{}(a, b) -}} +{} "#, current_comment, expand_intrinsic(¤t_arm, in_t), expand_intrinsic(¤t_aarch64, in_t), - name, - in_t, - in_t, - out_t, - ext_c, - current_fn, + call, ); - let test = gen_test(name, &in_t, &out_t, current_tests, type_len(in_t)); + let test = gen_test(name, &in_t, &out_t, current_tests, type_len(in_t), has_b); (function, test) } @@ -558,6 +612,7 @@ fn main() -> io::Result<()> { let mut link_aarch64: Option = None; let mut a: Vec = Vec::new(); let mut b: Vec = Vec::new(); + let mut fixed: Option = None; let mut current_tests: Vec<(Vec, Vec, Vec)> = Vec::new(); // @@ -628,6 +683,9 @@ mod test { link_aarch64 = None; link_arm = None; current_tests = Vec::new(); + a = Vec::new(); + b = Vec::new(); + fixed = None; } else if line.starts_with("//") { } else if line.starts_with("name = ") { current_name = Some(String::from(&line[7..])); @@ -641,6 +699,8 @@ mod test { a = line[4..].split(',').map(|v| v.trim().to_string()).collect(); } else if line.starts_with("b = ") { b = line[4..].split(',').map(|v| v.trim().to_string()).collect(); + } else if line.starts_with("fixed = ") { + 
fixed = Some(String::from(&line[8..])); } else if line.starts_with("validate ") { let e = line[9..].split(',').map(|v| v.trim().to_string()).collect(); current_tests.push((a.clone(), b.clone(), e)); @@ -692,6 +752,8 @@ mod test { &in_t, &out_t, ¤t_tests, + b.len() > 0, + &fixed, ); out_arm.push_str(&function); tests_arm.push_str(&test); @@ -705,6 +767,8 @@ mod test { &in_t, &out_t, ¤t_tests, + b.len() > 0, + &fixed, ); out_aarch64.push_str(&function); tests_aarch64.push_str(&test); From 0af94738ef6d67ee6a2e2b8abe10d345d906cb0c Mon Sep 17 00:00:00 2001 From: SparrowLii Date: Thu, 4 Mar 2021 21:43:10 +0800 Subject: [PATCH 047/123] add #[rustfmt::skip] in aarch64/neon/mod.rs --- crates/core_arch/src/aarch64/neon/mod.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crates/core_arch/src/aarch64/neon/mod.rs b/crates/core_arch/src/aarch64/neon/mod.rs index 383f8a18a6..94865d5dd1 100644 --- a/crates/core_arch/src/aarch64/neon/mod.rs +++ b/crates/core_arch/src/aarch64/neon/mod.rs @@ -2,7 +2,9 @@ #![allow(non_camel_case_types)] +#[rustfmt::skip] mod generated; +#[rustfmt::skip] pub use self::generated::*; // FIXME: replace neon with asimd From 0f524299d45e177549f4d0ef12ba9b7db11a3fa4 Mon Sep 17 00:00:00 2001 From: SparrowLii Date: Thu, 4 Mar 2021 22:29:04 +0800 Subject: [PATCH 048/123] use transmute in vceqz instructions in aarch64/neon/generated.rs --- .../core_arch/src/aarch64/neon/generated.rs | 36 ++++++------- crates/core_arch/src/macros.rs | 4 +- crates/stdarch-gen/src/main.rs | 54 ++++++++++++++----- 3 files changed, 60 insertions(+), 34 deletions(-) diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs index a74aa578d8..da9bce257b 100644 --- a/crates/core_arch/src/aarch64/neon/generated.rs +++ b/crates/core_arch/src/aarch64/neon/generated.rs @@ -78,7 +78,7 @@ pub unsafe fn vceqq_f64(a: float64x2_t, b: float64x2_t) -> uint64x2_t { #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(vceqz))] pub 
unsafe fn vceqz_s8(a: int8x8_t) -> uint8x8_t { - simd_eq(a, int8x8_t(0, 0, 0, 0, 0, 0, 0, 0)) + simd_eq(a, transmute(i8x8::new(0, 0, 0, 0, 0, 0, 0, 0))) } /// Signed Compare bitwise equal to zero @@ -86,7 +86,7 @@ pub unsafe fn vceqz_s8(a: int8x8_t) -> uint8x8_t { #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(vceqz))] pub unsafe fn vceqzq_s8(a: int8x16_t) -> uint8x16_t { - simd_eq(a, int8x16_t(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)) + simd_eq(a, transmute(i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0))) } /// Signed Compare bitwise equal to zero @@ -94,7 +94,7 @@ pub unsafe fn vceqzq_s8(a: int8x16_t) -> uint8x16_t { #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(vceqz))] pub unsafe fn vceqz_s16(a: int16x4_t) -> uint16x4_t { - simd_eq(a, int16x4_t(0, 0, 0, 0)) + simd_eq(a, transmute(i16x4::new(0, 0, 0, 0))) } /// Signed Compare bitwise equal to zero @@ -102,7 +102,7 @@ pub unsafe fn vceqz_s16(a: int16x4_t) -> uint16x4_t { #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(vceqz))] pub unsafe fn vceqzq_s16(a: int16x8_t) -> uint16x8_t { - simd_eq(a, int16x8_t(0, 0, 0, 0, 0, 0, 0, 0)) + simd_eq(a, transmute(i16x8::new(0, 0, 0, 0, 0, 0, 0, 0))) } /// Signed Compare bitwise equal to zero @@ -110,7 +110,7 @@ pub unsafe fn vceqzq_s16(a: int16x8_t) -> uint16x8_t { #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(vceqz))] pub unsafe fn vceqz_s32(a: int32x2_t) -> uint32x2_t { - simd_eq(a, int32x2_t(0, 0)) + simd_eq(a, transmute(i32x2::new(0, 0))) } /// Signed Compare bitwise equal to zero @@ -118,7 +118,7 @@ pub unsafe fn vceqz_s32(a: int32x2_t) -> uint32x2_t { #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(vceqz))] pub unsafe fn vceqzq_s32(a: int32x4_t) -> uint32x4_t { - simd_eq(a, int32x4_t(0, 0, 0, 0)) + simd_eq(a, transmute(i32x4::new(0, 0, 0, 0))) } /// Signed Compare bitwise equal to zero @@ -126,7 +126,7 @@ pub unsafe fn vceqzq_s32(a: int32x4_t) -> uint32x4_t { 
#[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(vceqz))] pub unsafe fn vceqz_s64(a: int64x1_t) -> uint64x1_t { - simd_eq(a, int64x1_t(0)) + simd_eq(a, transmute(i64x1::new(0))) } /// Signed Compare bitwise equal to zero @@ -134,7 +134,7 @@ pub unsafe fn vceqz_s64(a: int64x1_t) -> uint64x1_t { #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(vceqz))] pub unsafe fn vceqzq_s64(a: int64x2_t) -> uint64x2_t { - simd_eq(a, int64x2_t(0, 0)) + simd_eq(a, transmute(i64x2::new(0, 0))) } /// Signed Compare bitwise equal to zero @@ -142,7 +142,7 @@ pub unsafe fn vceqzq_s64(a: int64x2_t) -> uint64x2_t { #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(vceqz))] pub unsafe fn vceqz_p64(a: poly64x1_t) -> uint64x1_t { - simd_eq(a, poly64x1_t(0)) + simd_eq(a, transmute(i64x1::new(0))) } /// Signed Compare bitwise equal to zero @@ -150,7 +150,7 @@ pub unsafe fn vceqz_p64(a: poly64x1_t) -> uint64x1_t { #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(vceqz))] pub unsafe fn vceqzq_p64(a: poly64x2_t) -> uint64x2_t { - simd_eq(a, poly64x2_t(0, 0)) + simd_eq(a, transmute(i64x2::new(0, 0))) } /// Unsigned Compare bitwise equal to zero @@ -158,7 +158,7 @@ pub unsafe fn vceqzq_p64(a: poly64x2_t) -> uint64x2_t { #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(vceqz))] pub unsafe fn vceqz_u8(a: uint8x8_t) -> uint8x8_t { - simd_eq(a, uint8x8_t(0, 0, 0, 0, 0, 0, 0, 0)) + simd_eq(a, transmute(u8x8::new(0, 0, 0, 0, 0, 0, 0, 0))) } /// Unsigned Compare bitwise equal to zero @@ -166,7 +166,7 @@ pub unsafe fn vceqz_u8(a: uint8x8_t) -> uint8x8_t { #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(vceqz))] pub unsafe fn vceqzq_u8(a: uint8x16_t) -> uint8x16_t { - simd_eq(a, uint8x16_t(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)) + simd_eq(a, transmute(u8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0))) } /// Unsigned Compare bitwise equal to zero @@ -174,7 +174,7 @@ pub unsafe fn vceqzq_u8(a: 
uint8x16_t) -> uint8x16_t { #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(vceqz))] pub unsafe fn vceqz_u16(a: uint16x4_t) -> uint16x4_t { - simd_eq(a, uint16x4_t(0, 0, 0, 0)) + simd_eq(a, transmute(u16x4::new(0, 0, 0, 0))) } /// Unsigned Compare bitwise equal to zero @@ -182,7 +182,7 @@ pub unsafe fn vceqz_u16(a: uint16x4_t) -> uint16x4_t { #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(vceqz))] pub unsafe fn vceqzq_u16(a: uint16x8_t) -> uint16x8_t { - simd_eq(a, uint16x8_t(0, 0, 0, 0, 0, 0, 0, 0)) + simd_eq(a, transmute(u16x8::new(0, 0, 0, 0, 0, 0, 0, 0))) } /// Unsigned Compare bitwise equal to zero @@ -190,7 +190,7 @@ pub unsafe fn vceqzq_u16(a: uint16x8_t) -> uint16x8_t { #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(vceqz))] pub unsafe fn vceqz_u32(a: uint32x2_t) -> uint32x2_t { - simd_eq(a, uint32x2_t(0, 0)) + simd_eq(a, transmute(u32x2::new(0, 0))) } /// Unsigned Compare bitwise equal to zero @@ -198,7 +198,7 @@ pub unsafe fn vceqz_u32(a: uint32x2_t) -> uint32x2_t { #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(vceqz))] pub unsafe fn vceqzq_u32(a: uint32x4_t) -> uint32x4_t { - simd_eq(a, uint32x4_t(0, 0, 0, 0)) + simd_eq(a, transmute(u32x4::new(0, 0, 0, 0))) } /// Unsigned Compare bitwise equal to zero @@ -206,7 +206,7 @@ pub unsafe fn vceqzq_u32(a: uint32x4_t) -> uint32x4_t { #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(vceqz))] pub unsafe fn vceqz_u64(a: uint64x1_t) -> uint64x1_t { - simd_eq(a, uint64x1_t(0)) + simd_eq(a, transmute(u64x1::new(0))) } /// Unsigned Compare bitwise equal to zero @@ -214,7 +214,7 @@ pub unsafe fn vceqz_u64(a: uint64x1_t) -> uint64x1_t { #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(vceqz))] pub unsafe fn vceqzq_u64(a: uint64x2_t) -> uint64x2_t { - simd_eq(a, uint64x2_t(0, 0)) + simd_eq(a, transmute(u64x2::new(0, 0))) } /// Compare signed greater than diff --git a/crates/core_arch/src/macros.rs 
b/crates/core_arch/src/macros.rs index 5d05adfa6a..87e49fba4b 100644 --- a/crates/core_arch/src/macros.rs +++ b/crates/core_arch/src/macros.rs @@ -426,13 +426,13 @@ macro_rules! constify_imm3 { macro_rules! types { ($( $(#[$doc:meta])* - pub struct $name:ident($field:ty$(, $fields:ty)*$(,)?); + pub struct $name:ident($($fields:tt)*); )*) => ($( $(#[$doc])* #[derive(Copy, Clone, Debug)] #[allow(non_camel_case_types)] #[repr(simd)] #[allow(clippy::missing_inline_in_public_items)] - pub struct $name(pub $field$(, pub $fields)*); + pub struct $name($($fields)*); )*) } diff --git a/crates/stdarch-gen/src/main.rs b/crates/stdarch-gen/src/main.rs index 7498ab93ff..39836bd916 100644 --- a/crates/stdarch-gen/src/main.rs +++ b/crates/stdarch-gen/src/main.rs @@ -299,7 +299,7 @@ fn gen_aarch64( in_t: &str, out_t: &str, current_tests: &[(Vec, Vec, Vec)], - has_b: bool, + single_para: bool, fixed: &Option, ) -> (String, String) { let _global_t = type_to_global_type(in_t); @@ -335,7 +335,7 @@ fn gen_aarch64( } else { String::new() }; - let call = if has_b { + let call = if !single_para { format!( r#"pub unsafe fn {}(a: {}, b: {}) -> {} {{ {}{}(a, b) @@ -350,9 +350,15 @@ fn gen_aarch64( } format!( r#"pub unsafe fn {}(a: {}) -> {} {{ - {}{}(a, {}({})) + {}{}(a, transmute({}::new({}))) }}"#, - name, in_t, out_t, ext_c, current_fn, in_t, fixed_vals, + name, + in_t, + out_t, + ext_c, + current_fn, + type_to_global_type(in_t), + fixed_vals, ) } else { String::new() @@ -368,7 +374,14 @@ fn gen_aarch64( current_comment, current_aarch64, call ); - let test = gen_test(name, &in_t, &out_t, current_tests, type_len(in_t), has_b); + let test = gen_test( + name, + &in_t, + &out_t, + current_tests, + type_len(in_t), + single_para, + ); (function, test) } @@ -378,7 +391,7 @@ fn gen_test( out_t: &str, current_tests: &[(Vec, Vec, Vec)], len: usize, - has_b: bool, + single_para: bool, ) -> String { let mut test = format!( r#" @@ -390,7 +403,7 @@ fn gen_test( let a: Vec = 
a.iter().take(len).cloned().collect(); let b: Vec = b.iter().take(len).cloned().collect(); let e: Vec = e.iter().take(len).cloned().collect(); - let t = if has_b { + let t = if !single_para { format!( r#" let a{}; @@ -437,7 +450,7 @@ fn gen_arm( in_t: &str, out_t: &str, current_tests: &[(Vec, Vec, Vec)], - has_b: bool, + single_para: bool, fixed: &Option, ) -> (String, String) { let _global_t = type_to_global_type(in_t); @@ -486,7 +499,7 @@ fn gen_arm( } else { String::new() }; - let call = if has_b { + let call = if !single_para { format!( r#"pub unsafe fn {}(a: {}, b: {}) -> {} {{ {}{}(a, b) @@ -501,9 +514,15 @@ fn gen_arm( } format!( r#"pub unsafe fn {}(a: {}) -> {} {{ - {}{}(a, {}({})) + {}{}(a, transmute({}::new({}))) }}"#, - name, in_t, out_t, ext_c, current_fn, in_t, fixed_vals, + name, + in_t, + out_t, + ext_c, + current_fn, + type_to_global_type(in_t), + fixed_vals, ) } else { String::new() @@ -523,7 +542,14 @@ fn gen_arm( expand_intrinsic(¤t_aarch64, in_t), call, ); - let test = gen_test(name, &in_t, &out_t, current_tests, type_len(in_t), has_b); + let test = gen_test( + name, + &in_t, + &out_t, + current_tests, + type_len(in_t), + single_para, + ); (function, test) } @@ -752,7 +778,7 @@ mod test { &in_t, &out_t, ¤t_tests, - b.len() > 0, + b.len() == 0, &fixed, ); out_arm.push_str(&function); @@ -767,7 +793,7 @@ mod test { &in_t, &out_t, ¤t_tests, - b.len() > 0, + b.len() == 0, &fixed, ); out_aarch64.push_str(&function); From 4b331e8a4b6e99ed2748a6cd38c5981f4d31e059 Mon Sep 17 00:00:00 2001 From: SparrowLii Date: Thu, 4 Mar 2021 23:54:37 +0800 Subject: [PATCH 049/123] Correct the instruction name and add floating point instructions --- .../core_arch/src/aarch64/neon/generated.rs | 194 +++++++++++++----- crates/stdarch-gen/neon.spec | 22 +- crates/stdarch-gen/src/main.rs | 46 ++--- 3 files changed, 175 insertions(+), 87 deletions(-) diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs index 
da9bce257b..f829cc0dcf 100644 --- a/crates/core_arch/src/aarch64/neon/generated.rs +++ b/crates/core_arch/src/aarch64/neon/generated.rs @@ -73,148 +73,202 @@ pub unsafe fn vceqq_f64(a: float64x2_t, b: float64x2_t) -> uint64x2_t { simd_eq(a, b) } -/// Signed Compare bitwise equal to zero +/// Signed compare bitwise equal to zero #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(vceqz))] +#[cfg_attr(test, assert_instr(cmeq))] pub unsafe fn vceqz_s8(a: int8x8_t) -> uint8x8_t { - simd_eq(a, transmute(i8x8::new(0, 0, 0, 0, 0, 0, 0, 0))) + let b: i8x8 = i8x8::new(0, 0, 0, 0, 0, 0, 0, 0); + simd_eq(a, transmute(b)) } -/// Signed Compare bitwise equal to zero +/// Signed compare bitwise equal to zero #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(vceqz))] +#[cfg_attr(test, assert_instr(cmeq))] pub unsafe fn vceqzq_s8(a: int8x16_t) -> uint8x16_t { - simd_eq(a, transmute(i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0))) + let b: i8x16 = i8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + simd_eq(a, transmute(b)) } -/// Signed Compare bitwise equal to zero +/// Signed compare bitwise equal to zero #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(vceqz))] +#[cfg_attr(test, assert_instr(cmeq))] pub unsafe fn vceqz_s16(a: int16x4_t) -> uint16x4_t { - simd_eq(a, transmute(i16x4::new(0, 0, 0, 0))) + let b: i16x4 = i16x4::new(0, 0, 0, 0); + simd_eq(a, transmute(b)) } -/// Signed Compare bitwise equal to zero +/// Signed compare bitwise equal to zero #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(vceqz))] +#[cfg_attr(test, assert_instr(cmeq))] pub unsafe fn vceqzq_s16(a: int16x8_t) -> uint16x8_t { - simd_eq(a, transmute(i16x8::new(0, 0, 0, 0, 0, 0, 0, 0))) + let b: i16x8 = i16x8::new(0, 0, 0, 0, 0, 0, 0, 0); + simd_eq(a, transmute(b)) } -/// Signed Compare bitwise equal to zero +/// Signed compare bitwise equal to zero #[inline] #[target_feature(enable 
= "neon")] -#[cfg_attr(test, assert_instr(vceqz))] +#[cfg_attr(test, assert_instr(cmeq))] pub unsafe fn vceqz_s32(a: int32x2_t) -> uint32x2_t { - simd_eq(a, transmute(i32x2::new(0, 0))) + let b: i32x2 = i32x2::new(0, 0); + simd_eq(a, transmute(b)) } -/// Signed Compare bitwise equal to zero +/// Signed compare bitwise equal to zero #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(vceqz))] +#[cfg_attr(test, assert_instr(cmeq))] pub unsafe fn vceqzq_s32(a: int32x4_t) -> uint32x4_t { - simd_eq(a, transmute(i32x4::new(0, 0, 0, 0))) + let b: i32x4 = i32x4::new(0, 0, 0, 0); + simd_eq(a, transmute(b)) } -/// Signed Compare bitwise equal to zero +/// Signed compare bitwise equal to zero #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(vceqz))] +#[cfg_attr(test, assert_instr(cmeq))] pub unsafe fn vceqz_s64(a: int64x1_t) -> uint64x1_t { - simd_eq(a, transmute(i64x1::new(0))) + let b: i64x1 = i64x1::new(0); + simd_eq(a, transmute(b)) } -/// Signed Compare bitwise equal to zero +/// Signed compare bitwise equal to zero #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(vceqz))] +#[cfg_attr(test, assert_instr(cmeq))] pub unsafe fn vceqzq_s64(a: int64x2_t) -> uint64x2_t { - simd_eq(a, transmute(i64x2::new(0, 0))) + let b: i64x2 = i64x2::new(0, 0); + simd_eq(a, transmute(b)) } -/// Signed Compare bitwise equal to zero +/// Signed compare bitwise equal to zero #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(vceqz))] +#[cfg_attr(test, assert_instr(cmeq))] pub unsafe fn vceqz_p64(a: poly64x1_t) -> uint64x1_t { - simd_eq(a, transmute(i64x1::new(0))) + let b: i64x1 = i64x1::new(0); + simd_eq(a, transmute(b)) } -/// Signed Compare bitwise equal to zero +/// Signed compare bitwise equal to zero #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(vceqz))] +#[cfg_attr(test, assert_instr(cmeq))] pub unsafe fn vceqzq_p64(a: poly64x2_t) -> uint64x2_t { - 
simd_eq(a, transmute(i64x2::new(0, 0))) + let b: i64x2 = i64x2::new(0, 0); + simd_eq(a, transmute(b)) } -/// Unsigned Compare bitwise equal to zero +/// Unsigned compare bitwise equal to zero #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(vceqz))] +#[cfg_attr(test, assert_instr(cmeq))] pub unsafe fn vceqz_u8(a: uint8x8_t) -> uint8x8_t { - simd_eq(a, transmute(u8x8::new(0, 0, 0, 0, 0, 0, 0, 0))) + let b: u8x8 = u8x8::new(0, 0, 0, 0, 0, 0, 0, 0); + simd_eq(a, transmute(b)) } -/// Unsigned Compare bitwise equal to zero +/// Unsigned compare bitwise equal to zero #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(vceqz))] +#[cfg_attr(test, assert_instr(cmeq))] pub unsafe fn vceqzq_u8(a: uint8x16_t) -> uint8x16_t { - simd_eq(a, transmute(u8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0))) + let b: u8x16 = u8x16::new(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + simd_eq(a, transmute(b)) } -/// Unsigned Compare bitwise equal to zero +/// Unsigned compare bitwise equal to zero #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(vceqz))] +#[cfg_attr(test, assert_instr(cmeq))] pub unsafe fn vceqz_u16(a: uint16x4_t) -> uint16x4_t { - simd_eq(a, transmute(u16x4::new(0, 0, 0, 0))) + let b: u16x4 = u16x4::new(0, 0, 0, 0); + simd_eq(a, transmute(b)) } -/// Unsigned Compare bitwise equal to zero +/// Unsigned compare bitwise equal to zero #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(vceqz))] +#[cfg_attr(test, assert_instr(cmeq))] pub unsafe fn vceqzq_u16(a: uint16x8_t) -> uint16x8_t { - simd_eq(a, transmute(u16x8::new(0, 0, 0, 0, 0, 0, 0, 0))) + let b: u16x8 = u16x8::new(0, 0, 0, 0, 0, 0, 0, 0); + simd_eq(a, transmute(b)) } -/// Unsigned Compare bitwise equal to zero +/// Unsigned compare bitwise equal to zero #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(vceqz))] +#[cfg_attr(test, assert_instr(cmeq))] pub unsafe fn vceqz_u32(a: 
uint32x2_t) -> uint32x2_t { - simd_eq(a, transmute(u32x2::new(0, 0))) + let b: u32x2 = u32x2::new(0, 0); + simd_eq(a, transmute(b)) } -/// Unsigned Compare bitwise equal to zero +/// Unsigned compare bitwise equal to zero #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(vceqz))] +#[cfg_attr(test, assert_instr(cmeq))] pub unsafe fn vceqzq_u32(a: uint32x4_t) -> uint32x4_t { - simd_eq(a, transmute(u32x4::new(0, 0, 0, 0))) + let b: u32x4 = u32x4::new(0, 0, 0, 0); + simd_eq(a, transmute(b)) } -/// Unsigned Compare bitwise equal to zero +/// Unsigned compare bitwise equal to zero #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(vceqz))] +#[cfg_attr(test, assert_instr(cmeq))] pub unsafe fn vceqz_u64(a: uint64x1_t) -> uint64x1_t { - simd_eq(a, transmute(u64x1::new(0))) + let b: u64x1 = u64x1::new(0); + simd_eq(a, transmute(b)) } -/// Unsigned Compare bitwise equal to zero +/// Unsigned compare bitwise equal to zero #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(vceqz))] +#[cfg_attr(test, assert_instr(cmeq))] pub unsafe fn vceqzq_u64(a: uint64x2_t) -> uint64x2_t { - simd_eq(a, transmute(u64x2::new(0, 0))) + let b: u64x2 = u64x2::new(0, 0); + simd_eq(a, transmute(b)) +} + +/// Floating-point compare bitwise equal to zero +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmeq))] +pub unsafe fn vceqz_f32(a: float32x2_t) -> uint32x2_t { + let b: f32x2 = f32x2::new(0.0, 0.0); + simd_eq(a, transmute(b)) +} + +/// Floating-point compare bitwise equal to zero +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmeq))] +pub unsafe fn vceqzq_f32(a: float32x4_t) -> uint32x4_t { + let b: f32x4 = f32x4::new(0.0, 0.0, 0.0, 0.0); + simd_eq(a, transmute(b)) +} + +/// Floating-point compare bitwise equal to zero +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmeq))] +pub unsafe fn vceqz_f64(a: float64x1_t) -> uint64x1_t { 
+ let b: f64 = 0.0; + simd_eq(a, transmute(b)) +} + +/// Floating-point compare bitwise equal to zero +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(fcmeq))] +pub unsafe fn vceqzq_f64(a: float64x2_t) -> uint64x2_t { + let b: f64x2 = f64x2::new(0.0, 0.0); + simd_eq(a, transmute(b)) } /// Compare signed greater than @@ -752,6 +806,38 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vceqz_f32() { + let a: f32x2 = f32x2::new(0.0, 1.2); + let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0); + let r: u32x2 = transmute(vceqz_f32(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vceqzq_f32() { + let a: f32x4 = f32x4::new(0.0, 1.2, 3.4, 5.6); + let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0, 0, 0); + let r: u32x4 = transmute(vceqzq_f32(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vceqz_f64() { + let a: f64 = 0.0; + let e: u64x1 = u64x1::new(0xFF_FF_FF_FF_FF_FF_FF_FF); + let r: u64x1 = transmute(vceqz_f64(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vceqzq_f64() { + let a: f64x2 = f64x2::new(0.0, 1.2); + let e: u64x2 = u64x2::new(0xFF_FF_FF_FF_FF_FF_FF_FF, 0); + let r: u64x2 = transmute(vceqzq_f64(transmute(a))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vcgt_s64() { let a: i64x1 = i64x1::new(1); diff --git a/crates/stdarch-gen/neon.spec b/crates/stdarch-gen/neon.spec index 7805df501b..6c2234b127 100644 --- a/crates/stdarch-gen/neon.spec +++ b/crates/stdarch-gen/neon.spec @@ -136,26 +136,36 @@ arm = vceq. 
// we are missing float16x4_t:uint16x4_t, float16x8_t:uint16x8_t generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t -/// Signed Compare bitwise equal to zero +/// Signed compare bitwise equal to zero name = vceqz fn = simd_eq a = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX -fixed = 0 +fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 validate FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE -aarch64 = vceqz +aarch64 = cmeq generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t, poly64x1_t:uint64x1_t, poly64x2_t:uint64x2_t -/// Unsigned Compare bitwise equal to zero +/// Unsigned compare bitwise equal to zero name = vceqz fn = simd_eq a = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX -fixed = 0 +fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 validate TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE -aarch64 = vceqz +aarch64 = cmeq generate uint*_t, uint64x*_t +/// Floating-point compare bitwise equal to zero +name = vceqz +fn = simd_eq +a = 0.0, 1.2, 3.4, 5.6 +fixed = 0.0, 0.0, 0.0, 0.0 +validate TRUE, FALSE, FALSE, FALSE + +aarch64 = fcmeq +generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t + //////////////////// // greater then //////////////////// diff --git a/crates/stdarch-gen/src/main.rs b/crates/stdarch-gen/src/main.rs index 39836bd916..47164a7c8b 100644 --- a/crates/stdarch-gen/src/main.rs +++ b/crates/stdarch-gen/src/main.rs @@ -300,7 +300,7 @@ fn gen_aarch64( out_t: &str, current_tests: &[(Vec, Vec, Vec)], single_para: bool, - fixed: &Option, + fixed: &Vec, ) -> (String, String) { let _global_t = type_to_global_type(in_t); let _global_ret_t = 
type_to_global_type(out_t); @@ -342,23 +342,19 @@ fn gen_aarch64( }}"#, name, in_t, in_t, out_t, ext_c, current_fn, ) - } else if let Some(fixed_val) = fixed { - let mut fixed_vals = fixed_val.clone(); - for _i in 1..type_len(in_t) { - fixed_vals.push_str(", "); - fixed_vals.push_str(fixed_val); - } + } else if fixed.len() != 0 { + let fixed: Vec = fixed.iter().take(type_len(in_t)).cloned().collect(); format!( r#"pub unsafe fn {}(a: {}) -> {} {{ - {}{}(a, transmute({}::new({}))) + let b{}; + {}{}(a, transmute(b)) }}"#, name, in_t, out_t, + values(in_t, &fixed), ext_c, current_fn, - type_to_global_type(in_t), - fixed_vals, ) } else { String::new() @@ -451,7 +447,7 @@ fn gen_arm( out_t: &str, current_tests: &[(Vec, Vec, Vec)], single_para: bool, - fixed: &Option, + fixed: &Vec, ) -> (String, String) { let _global_t = type_to_global_type(in_t); let _global_ret_t = type_to_global_type(out_t); @@ -506,23 +502,19 @@ fn gen_arm( }}"#, name, in_t, in_t, out_t, ext_c, current_fn, ) - } else if let Some(fixed_val) = fixed { - let mut fixed_vals = fixed_val.clone(); - for _i in 1..type_len(in_t) { - fixed_vals.push_str(", "); - fixed_vals.push_str(fixed_val); - } + } else if fixed.len() != 0 { + let fixed: Vec = fixed.iter().take(type_len(in_t)).cloned().collect(); format!( r#"pub unsafe fn {}(a: {}) -> {} {{ - {}{}(a, transmute({}::new({}))) + let b{}; + {}{}(a, transmute(b)) }}"#, name, in_t, out_t, + values(in_t, &fixed), ext_c, current_fn, - type_to_global_type(in_t), - fixed_vals, ) } else { String::new() @@ -638,7 +630,8 @@ fn main() -> io::Result<()> { let mut link_aarch64: Option = None; let mut a: Vec = Vec::new(); let mut b: Vec = Vec::new(); - let mut fixed: Option = None; + let mut fixed: Vec = Vec::new(); + let mut single_para: bool = true; let mut current_tests: Vec<(Vec, Vec, Vec)> = Vec::new(); // @@ -709,9 +702,7 @@ mod test { link_aarch64 = None; link_arm = None; current_tests = Vec::new(); - a = Vec::new(); - b = Vec::new(); - fixed = None; + single_para = 
true; } else if line.starts_with("//") { } else if line.starts_with("name = ") { current_name = Some(String::from(&line[7..])); @@ -725,8 +716,9 @@ mod test { a = line[4..].split(',').map(|v| v.trim().to_string()).collect(); } else if line.starts_with("b = ") { b = line[4..].split(',').map(|v| v.trim().to_string()).collect(); + single_para = false; } else if line.starts_with("fixed = ") { - fixed = Some(String::from(&line[8..])); + fixed = line[8..].split(',').map(|v| v.trim().to_string()).collect(); } else if line.starts_with("validate ") { let e = line[9..].split(',').map(|v| v.trim().to_string()).collect(); current_tests.push((a.clone(), b.clone(), e)); @@ -778,7 +770,7 @@ mod test { &in_t, &out_t, ¤t_tests, - b.len() == 0, + single_para, &fixed, ); out_arm.push_str(&function); @@ -793,7 +785,7 @@ mod test { &in_t, &out_t, ¤t_tests, - b.len() == 0, + single_para, &fixed, ); out_aarch64.push_str(&function); From e34af114372319103a1856a23d47ee00fafb48a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20Mi=C4=85sko?= Date: Thu, 4 Mar 2021 00:00:00 +0000 Subject: [PATCH 050/123] Convert some AVX intrinsics to const generics * _mm256_extractf128_ps * _mm256_extractf128_pd * _mm256_extractf128_si256 * _mm256_insertf128_ps * _mm256_insertf128_pd * _mm256_insertf128_si256 --- crates/core_arch/src/x86/avx.rs | 119 +++++++++++++++++--------------- 1 file changed, 62 insertions(+), 57 deletions(-) diff --git a/crates/core_arch/src/x86/avx.rs b/crates/core_arch/src/x86/avx.rs index 53c4a00f42..8f040fc2f5 100644 --- a/crates/core_arch/src/x86/avx.rs +++ b/crates/core_arch/src/x86/avx.rs @@ -983,15 +983,17 @@ pub unsafe fn _mm256_cvttps_epi32(a: __m256) -> __m256i { #[target_feature(enable = "avx")] #[cfg_attr( all(test, not(target_os = "windows")), - assert_instr(vextractf128, imm8 = 1) + assert_instr(vextractf128, IMM1 = 1) )] -#[rustc_args_required_const(1)] +#[rustc_legacy_const_generics(1)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn 
_mm256_extractf128_ps(a: __m256, imm8: i32) -> __m128 { - match imm8 & 1 { - 0 => simd_shuffle4(a, _mm256_undefined_ps(), [0, 1, 2, 3]), - _ => simd_shuffle4(a, _mm256_undefined_ps(), [4, 5, 6, 7]), - } +pub unsafe fn _mm256_extractf128_ps(a: __m256) -> __m128 { + static_assert_imm1!(IMM1); + simd_shuffle4( + a, + _mm256_undefined_ps(), + [[0, 1, 2, 3], [4, 5, 6, 7]][IMM1 as usize], + ) } /// Extracts 128 bits (composed of 2 packed double-precision (64-bit) @@ -1002,15 +1004,13 @@ pub unsafe fn _mm256_extractf128_ps(a: __m256, imm8: i32) -> __m128 { #[target_feature(enable = "avx")] #[cfg_attr( all(test, not(target_os = "windows")), - assert_instr(vextractf128, imm8 = 1) + assert_instr(vextractf128, IMM1 = 1) )] -#[rustc_args_required_const(1)] +#[rustc_legacy_const_generics(1)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_extractf128_pd(a: __m256d, imm8: i32) -> __m128d { - match imm8 & 1 { - 0 => simd_shuffle2(a, _mm256_undefined_pd(), [0, 1]), - _ => simd_shuffle2(a, _mm256_undefined_pd(), [2, 3]), - } +pub unsafe fn _mm256_extractf128_pd(a: __m256d) -> __m128d { + static_assert_imm1!(IMM1); + simd_shuffle2(a, _mm256_undefined_pd(), [[0, 1], [2, 3]][IMM1 as usize]) } /// Extracts 128 bits (composed of integer data) from `a`, selected with `imm8`. 
@@ -1020,16 +1020,17 @@ pub unsafe fn _mm256_extractf128_pd(a: __m256d, imm8: i32) -> __m128d { #[target_feature(enable = "avx")] #[cfg_attr( all(test, not(target_os = "windows")), - assert_instr(vextractf128, imm8 = 1) + assert_instr(vextractf128, IMM1 = 1) )] -#[rustc_args_required_const(1)] +#[rustc_legacy_const_generics(1)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_extractf128_si256(a: __m256i, imm8: i32) -> __m128i { - let b = _mm256_undefined_si256().as_i64x4(); - let dst: i64x2 = match imm8 & 1 { - 0 => simd_shuffle2(a.as_i64x4(), b, [0, 1]), - _ => simd_shuffle2(a.as_i64x4(), b, [2, 3]), - }; +pub unsafe fn _mm256_extractf128_si256(a: __m256i) -> __m128i { + static_assert_imm1!(IMM1); + let dst: i64x2 = simd_shuffle2( + a.as_i64x4(), + _mm256_undefined_si256().as_i64x4(), + [[0, 1], [2, 3]][IMM1 as usize], + ); transmute(dst) } @@ -1410,16 +1411,17 @@ pub unsafe fn _mm256_broadcast_pd(a: &__m128d) -> __m256d { #[target_feature(enable = "avx")] #[cfg_attr( all(test, not(target_os = "windows")), - assert_instr(vinsertf128, imm8 = 1) + assert_instr(vinsertf128, IMM1 = 1) )] -#[rustc_args_required_const(2)] +#[rustc_legacy_const_generics(2)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_insertf128_ps(a: __m256, b: __m128, imm8: i32) -> __m256 { - let b = _mm256_castps128_ps256(b); - match imm8 & 1 { - 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]), - _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), - } +pub unsafe fn _mm256_insertf128_ps(a: __m256, b: __m128) -> __m256 { + static_assert_imm1!(IMM1); + simd_shuffle8( + a, + _mm256_castps128_ps256(b), + [[8, 9, 10, 11, 4, 5, 6, 7], [0, 1, 2, 3, 8, 9, 10, 11]][IMM1 as usize], + ) } /// Copies `a` to result, then inserts 128 bits (composed of 2 packed @@ -1431,15 +1433,17 @@ pub unsafe fn _mm256_insertf128_ps(a: __m256, b: __m128, imm8: i32) -> __m256 { #[target_feature(enable = "avx")] #[cfg_attr( all(test, not(target_os = "windows")), - 
assert_instr(vinsertf128, imm8 = 1) + assert_instr(vinsertf128, IMM1 = 1) )] -#[rustc_args_required_const(2)] +#[rustc_legacy_const_generics(2)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_insertf128_pd(a: __m256d, b: __m128d, imm8: i32) -> __m256d { - match imm8 & 1 { - 0 => simd_shuffle4(a, _mm256_castpd128_pd256(b), [4, 5, 2, 3]), - _ => simd_shuffle4(a, _mm256_castpd128_pd256(b), [0, 1, 4, 5]), - } +pub unsafe fn _mm256_insertf128_pd(a: __m256d, b: __m128d) -> __m256d { + static_assert_imm1!(IMM1); + simd_shuffle4( + a, + _mm256_castpd128_pd256(b), + [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize], + ) } /// Copies `a` to result, then inserts 128 bits from `b` into result @@ -1450,16 +1454,17 @@ pub unsafe fn _mm256_insertf128_pd(a: __m256d, b: __m128d, imm8: i32) -> __m256d #[target_feature(enable = "avx")] #[cfg_attr( all(test, not(target_os = "windows")), - assert_instr(vinsertf128, imm8 = 1) + assert_instr(vinsertf128, IMM1 = 1) )] -#[rustc_args_required_const(2)] +#[rustc_legacy_const_generics(2)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_insertf128_si256(a: __m256i, b: __m128i, imm8: i32) -> __m256i { - let b = _mm256_castsi128_si256(b).as_i64x4(); - let dst: i64x4 = match imm8 & 1 { - 0 => simd_shuffle4(a.as_i64x4(), b, [4, 5, 2, 3]), - _ => simd_shuffle4(a.as_i64x4(), b, [0, 1, 4, 5]), - }; +pub unsafe fn _mm256_insertf128_si256(a: __m256i, b: __m128i) -> __m256i { + static_assert_imm1!(IMM1); + let dst: i64x4 = simd_shuffle4( + a.as_i64x4(), + _mm256_castsi128_si256(b).as_i64x4(), + [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize], + ); transmute(dst) } @@ -2961,7 +2966,7 @@ pub unsafe fn _mm256_setr_m128i(lo: __m128i, hi: __m128i) -> __m256i { #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_loadu2_m128(hiaddr: *const f32, loaddr: *const f32) -> __m256 { let a = _mm256_castps128_ps256(_mm_loadu_ps(loaddr)); - _mm256_insertf128_ps(a, _mm_loadu_ps(hiaddr), 1) + 
_mm256_insertf128_ps::<1>(a, _mm_loadu_ps(hiaddr)) } /// Loads two 128-bit values (composed of 2 packed double-precision (64-bit) @@ -2976,7 +2981,7 @@ pub unsafe fn _mm256_loadu2_m128(hiaddr: *const f32, loaddr: *const f32) -> __m2 #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_loadu2_m128d(hiaddr: *const f64, loaddr: *const f64) -> __m256d { let a = _mm256_castpd128_pd256(_mm_loadu_pd(loaddr)); - _mm256_insertf128_pd(a, _mm_loadu_pd(hiaddr), 1) + _mm256_insertf128_pd::<1>(a, _mm_loadu_pd(hiaddr)) } /// Loads two 128-bit values (composed of integer data) from memory, and combine @@ -2990,7 +2995,7 @@ pub unsafe fn _mm256_loadu2_m128d(hiaddr: *const f64, loaddr: *const f64) -> __m #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_loadu2_m128i(hiaddr: *const __m128i, loaddr: *const __m128i) -> __m256i { let a = _mm256_castsi128_si256(_mm_loadu_si128(loaddr)); - _mm256_insertf128_si256(a, _mm_loadu_si128(hiaddr), 1) + _mm256_insertf128_si256::<1>(a, _mm_loadu_si128(hiaddr)) } /// Stores the high and low 128-bit halves (each composed of 4 packed @@ -3006,7 +3011,7 @@ pub unsafe fn _mm256_loadu2_m128i(hiaddr: *const __m128i, loaddr: *const __m128i pub unsafe fn _mm256_storeu2_m128(hiaddr: *mut f32, loaddr: *mut f32, a: __m256) { let lo = _mm256_castps256_ps128(a); _mm_storeu_ps(loaddr, lo); - let hi = _mm256_extractf128_ps(a, 1); + let hi = _mm256_extractf128_ps::<1>(a); _mm_storeu_ps(hiaddr, hi); } @@ -3023,7 +3028,7 @@ pub unsafe fn _mm256_storeu2_m128(hiaddr: *mut f32, loaddr: *mut f32, a: __m256) pub unsafe fn _mm256_storeu2_m128d(hiaddr: *mut f64, loaddr: *mut f64, a: __m256d) { let lo = _mm256_castpd256_pd128(a); _mm_storeu_pd(loaddr, lo); - let hi = _mm256_extractf128_pd(a, 1); + let hi = _mm256_extractf128_pd::<1>(a); _mm_storeu_pd(hiaddr, hi); } @@ -3039,7 +3044,7 @@ pub unsafe fn _mm256_storeu2_m128d(hiaddr: *mut f64, loaddr: *mut f64, a: __m256 pub unsafe fn _mm256_storeu2_m128i(hiaddr: *mut __m128i, loaddr: *mut 
__m128i, a: __m256i) { let lo = _mm256_castsi256_si128(a); _mm_storeu_si128(loaddr, lo); - let hi = _mm256_extractf128_si256(a, 1); + let hi = _mm256_extractf128_si256::<1>(a); _mm_storeu_si128(hiaddr, hi); } @@ -3727,7 +3732,7 @@ mod tests { #[simd_test(enable = "avx")] unsafe fn test_mm256_extractf128_ps() { let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.); - let r = _mm256_extractf128_ps(a, 0); + let r = _mm256_extractf128_ps::<0>(a); let e = _mm_setr_ps(4., 3., 2., 5.); assert_eq_m128(r, e); } @@ -3735,7 +3740,7 @@ mod tests { #[simd_test(enable = "avx")] unsafe fn test_mm256_extractf128_pd() { let a = _mm256_setr_pd(4., 3., 2., 5.); - let r = _mm256_extractf128_pd(a, 0); + let r = _mm256_extractf128_pd::<0>(a); let e = _mm_setr_pd(4., 3.); assert_eq_m128d(r, e); } @@ -3743,7 +3748,7 @@ mod tests { #[simd_test(enable = "avx")] unsafe fn test_mm256_extractf128_si256() { let a = _mm256_setr_epi64x(4, 3, 2, 5); - let r = _mm256_extractf128_si256(a, 0); + let r = _mm256_extractf128_si256::<0>(a); let e = _mm_setr_epi64x(4, 3); assert_eq_m128i(r, e); } @@ -3894,7 +3899,7 @@ mod tests { unsafe fn test_mm256_insertf128_ps() { let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.); let b = _mm_setr_ps(4., 9., 16., 25.); - let r = _mm256_insertf128_ps(a, b, 0); + let r = _mm256_insertf128_ps::<0>(a, b); let e = _mm256_setr_ps(4., 9., 16., 25., 8., 9., 64., 50.); assert_eq_m256(r, e); } @@ -3903,7 +3908,7 @@ mod tests { unsafe fn test_mm256_insertf128_pd() { let a = _mm256_setr_pd(1., 2., 3., 4.); let b = _mm_setr_pd(5., 6.); - let r = _mm256_insertf128_pd(a, b, 0); + let r = _mm256_insertf128_pd::<0>(a, b); let e = _mm256_setr_pd(5., 6., 3., 4.); assert_eq_m256d(r, e); } @@ -3912,7 +3917,7 @@ mod tests { unsafe fn test_mm256_insertf128_si256() { let a = _mm256_setr_epi64x(1, 2, 3, 4); let b = _mm_setr_epi64x(5, 6); - let r = _mm256_insertf128_si256(a, b, 0); + let r = _mm256_insertf128_si256::<0>(a, b); let e = _mm256_setr_epi64x(5, 6, 3, 4); 
assert_eq_m256i(r, e); } From 7415f3d6ec267204829732719dbbe9aeb0668d3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Thu, 4 Mar 2021 21:14:27 +0100 Subject: [PATCH 051/123] convert `_mm_aeskeygenassist_si128` to const generics --- crates/core_arch/src/x86/aes.rs | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/crates/core_arch/src/x86/aes.rs b/crates/core_arch/src/x86/aes.rs index 603744aef6..ffded1a0dc 100644 --- a/crates/core_arch/src/x86/aes.rs +++ b/crates/core_arch/src/x86/aes.rs @@ -87,21 +87,17 @@ pub unsafe fn _mm_aesimc_si128(a: __m128i) -> __m128i { /// /// Assist in expanding the AES cipher key by computing steps towards /// generating a round key for encryption cipher using data from `a` and an -/// 8-bit round constant `imm8`. +/// 8-bit round constant `IMM8`. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aeskeygenassist_si128) #[inline] #[target_feature(enable = "aes")] -#[cfg_attr(test, assert_instr(aeskeygenassist, imm8 = 0))] -#[rustc_args_required_const(1)] +#[cfg_attr(test, assert_instr(aeskeygenassist, IMM8 = 0))] +#[rustc_legacy_const_generics(1)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm_aeskeygenassist_si128(a: __m128i, imm8: i32) -> __m128i { - macro_rules! call { - ($imm8:expr) => { - aeskeygenassist(a, $imm8) - }; - } - constify_imm8!(imm8, call) +pub unsafe fn _mm_aeskeygenassist_si128(a: __m128i) -> __m128i { + static_assert_imm8!(IMM8); + aeskeygenassist(a, IMM8 as u8) } #[cfg(test)] @@ -169,7 +165,7 @@ mod tests { // Constants taken from https://msdn.microsoft.com/en-us/library/cc714138.aspx. 
let a = _mm_set_epi64x(0x0123456789abcdef, 0x8899aabbccddeeff); let e = _mm_set_epi64x(0x857c266b7c266e85, 0xeac4eea9c4eeacea); - let r = _mm_aeskeygenassist_si128(a, 5); + let r = _mm_aeskeygenassist_si128::<5>(a); assert_eq_m128i(r, e); } } From d866ab5e1414fc56c6724adda6d2acb6a40ac422 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Thu, 4 Mar 2021 21:15:31 +0100 Subject: [PATCH 052/123] convert `_mm_extract_epi64` to const generics --- crates/core_arch/src/x86_64/sse41.rs | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/crates/core_arch/src/x86_64/sse41.rs b/crates/core_arch/src/x86_64/sse41.rs index 1b37967325..d0fd68d5f5 100644 --- a/crates/core_arch/src/x86_64/sse41.rs +++ b/crates/core_arch/src/x86_64/sse41.rs @@ -8,20 +8,17 @@ use crate::{ #[cfg(test)] use stdarch_test::assert_instr; -/// Extracts an 64-bit integer from `a` selected with `imm8` +/// Extracts an 64-bit integer from `a` selected with `IMM1` /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi64) #[inline] #[target_feature(enable = "sse4.1")] -#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(pextrq, imm8 = 1))] -#[rustc_args_required_const(1)] +#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(pextrq, IMM1 = 1))] +#[rustc_legacy_const_generics(1)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm_extract_epi64(a: __m128i, imm8: i32) -> i64 { - let a = a.as_i64x2(); - match imm8 & 1 { - 0 => simd_extract(a, 0), - _ => simd_extract(a, 1), - } +pub unsafe fn _mm_extract_epi64(a: __m128i) -> i64 { + static_assert_imm1!(IMM1); + simd_extract(a.as_i64x2(), IMM1 as u32) } /// Returns a copy of `a` with the 64-bit integer from `i` inserted at a @@ -49,10 +46,10 @@ mod tests { #[simd_test(enable = "sse4.1")] unsafe fn test_mm_extract_epi64() { let a = _mm_setr_epi64x(0, 1); - let r = _mm_extract_epi64(a, 1); - assert_eq!(r, 1); - let r = 
_mm_extract_epi64(a, 3); + let r = _mm_extract_epi64::<1>(a); assert_eq!(r, 1); + let r = _mm_extract_epi64::<0>(a); + assert_eq!(r, 0); } #[simd_test(enable = "sse4.1")] From 28063ea5fc04c193361c07bc28c8d4a98b6dc965 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Thu, 4 Mar 2021 21:15:59 +0100 Subject: [PATCH 053/123] convert `_mm_insert_epi64` to const generics --- crates/core_arch/src/x86/avx2.rs | 2 +- crates/core_arch/src/x86/test.rs | 6 ++++-- crates/core_arch/src/x86_64/sse41.rs | 20 +++++++++----------- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/crates/core_arch/src/x86/avx2.rs b/crates/core_arch/src/x86/avx2.rs index 785b0fe9bb..81d7adc0bb 100644 --- a/crates/core_arch/src/x86/avx2.rs +++ b/crates/core_arch/src/x86/avx2.rs @@ -5135,7 +5135,7 @@ mod tests { #[simd_test(enable = "avx2")] unsafe fn test_mm256_sll_epi64() { let a = _mm256_set1_epi64x(0xFFFFFFFF); - let b = _mm_insert_epi64(_mm_set1_epi64x(0), 4, 0); + let b = _mm_insert_epi64::<0>(_mm_set1_epi64x(0), 4); let r = _mm256_sll_epi64(a, b); assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF0)); } diff --git a/crates/core_arch/src/x86/test.rs b/crates/core_arch/src/x86/test.rs index 9f577972fa..9f8b969301 100644 --- a/crates/core_arch/src/x86/test.rs +++ b/crates/core_arch/src/x86/test.rs @@ -92,14 +92,16 @@ pub unsafe fn get_m512i(a: __m512i, idx: usize) -> i64 { mod x86_polyfill { use crate::core_arch::x86::*; - pub unsafe fn _mm_insert_epi64(a: __m128i, val: i64, idx: i32) -> __m128i { + #[rustc_legacy_const_generics(2)] + pub unsafe fn _mm_insert_epi64(a: __m128i, val: i64) -> __m128i { + static_assert_imm1!(INDEX); #[repr(C)] union A { a: __m128i, b: [i64; 2], } let mut a = A { a }; - a.b[idx as usize] = val; + a.b[INDEX as usize] = val; a.a } diff --git a/crates/core_arch/src/x86_64/sse41.rs b/crates/core_arch/src/x86_64/sse41.rs index d0fd68d5f5..3d1ea0cf65 100644 --- a/crates/core_arch/src/x86_64/sse41.rs +++ b/crates/core_arch/src/x86_64/sse41.rs @@ 
-22,20 +22,17 @@ pub unsafe fn _mm_extract_epi64(a: __m128i) -> i64 { } /// Returns a copy of `a` with the 64-bit integer from `i` inserted at a -/// location specified by `imm8`. +/// location specified by `IMM1`. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_epi64) #[inline] #[target_feature(enable = "sse4.1")] -#[cfg_attr(test, assert_instr(pinsrq, imm8 = 0))] -#[rustc_args_required_const(2)] +#[cfg_attr(test, assert_instr(pinsrq, IMM1 = 0))] +#[rustc_legacy_const_generics(2)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm_insert_epi64(a: __m128i, i: i64, imm8: i32) -> __m128i { - let a = a.as_i64x2(); - match imm8 & 1 { - 0 => transmute(simd_insert(a, 0, i)), - _ => transmute(simd_insert(a, 1, i)), - } +pub unsafe fn _mm_insert_epi64(a: __m128i, i: i64) -> __m128i { + static_assert_imm1!(IMM1); + transmute(simd_insert(a.as_i64x2(), IMM1 as u32, i)) } #[cfg(test)] @@ -56,9 +53,10 @@ mod tests { unsafe fn test_mm_insert_epi64() { let a = _mm_set1_epi64x(0); let e = _mm_setr_epi64x(0, 32); - let r = _mm_insert_epi64(a, 32, 1); + let r = _mm_insert_epi64::<1>(a, 32); assert_eq_m128i(r, e); - let r = _mm_insert_epi64(a, 32, 3); + let e = _mm_setr_epi64x(32, 0); + let r = _mm_insert_epi64::<0>(a, 32); assert_eq_m128i(r, e); } } From dd7f711a87b9d298777b1c7634dd3dd44ca297a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Thu, 4 Mar 2021 21:25:17 +0100 Subject: [PATCH 054/123] convert `_mm256_extract_epi64` to const generics --- crates/core_arch/src/x86_64/avx2.rs | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/crates/core_arch/src/x86_64/avx2.rs b/crates/core_arch/src/x86_64/avx2.rs index 0f81cd221f..14447a1371 100644 --- a/crates/core_arch/src/x86_64/avx2.rs +++ b/crates/core_arch/src/x86_64/avx2.rs @@ -20,22 +20,17 @@ use crate::core_arch::{simd_llvm::*, x86::*}; -/// Extracts a 64-bit integer from `a`, selected with `imm8`. 
+/// Extracts a 64-bit integer from `a`, selected with `INDEX`. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extract_epi64) #[inline] #[target_feature(enable = "avx2")] -#[rustc_args_required_const(1)] +#[rustc_legacy_const_generics(1)] // This intrinsic has no corresponding instruction. #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm256_extract_epi64(a: __m256i, imm8: i32) -> i64 { - let a = a.as_i64x4(); - match imm8 & 3 { - 0 => simd_extract(a, 0), - 1 => simd_extract(a, 1), - 2 => simd_extract(a, 2), - _ => simd_extract(a, 3), - } +pub unsafe fn _mm256_extract_epi64(a: __m256i) -> i64 { + static_assert_imm2!(INDEX); + simd_extract(a.as_i64x4(), INDEX as u32) } #[cfg(test)] @@ -46,7 +41,7 @@ mod tests { #[simd_test(enable = "avx2")] unsafe fn test_mm256_extract_epi64() { let a = _mm256_setr_epi64x(0, 1, 2, 3); - let r = _mm256_extract_epi64(a, 3); + let r = _mm256_extract_epi64::<3>(a); assert_eq!(r, 3); } } From 0bc1b079539ec37824c77ccb673774c72a49834d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Thu, 4 Mar 2021 21:31:46 +0100 Subject: [PATCH 055/123] convert `_mm_sha1rnds4_epu32` to const generics --- crates/core_arch/src/x86/sha.rs | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/crates/core_arch/src/x86/sha.rs b/crates/core_arch/src/x86/sha.rs index 362a97ccd3..cfb330cfbb 100644 --- a/crates/core_arch/src/x86/sha.rs +++ b/crates/core_arch/src/x86/sha.rs @@ -66,25 +66,18 @@ pub unsafe fn _mm_sha1nexte_epu32(a: __m128i, b: __m128i) -> __m128i { /// Performs four rounds of SHA1 operation using an initial SHA1 state (A,B,C,D) /// from `a` and some pre-computed sum of the next 4 round message values /// (unsigned 32-bit integers), and state variable E from `b`, and return the -/// updated SHA1 state (A,B,C,D). `func` contains the logic functions and round +/// updated SHA1 state (A,B,C,D). 
`FUNC` contains the logic functions and round /// constants. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sha1rnds4_epu32) #[inline] #[target_feature(enable = "sha")] -#[cfg_attr(test, assert_instr(sha1rnds4, func = 0))] -#[rustc_args_required_const(2)] +#[cfg_attr(test, assert_instr(sha1rnds4, FUNC = 0))] +#[rustc_legacy_const_generics(2)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm_sha1rnds4_epu32(a: __m128i, b: __m128i, func: i32) -> __m128i { - let a = a.as_i32x4(); - let b = b.as_i32x4(); - macro_rules! call { - ($imm2:expr) => { - sha1rnds4(a, b, $imm2) - }; - } - let ret = constify_imm2!(func, call); - transmute(ret) +pub unsafe fn _mm_sha1rnds4_epu32(a: __m128i, b: __m128i) -> __m128i { + static_assert_imm2!(FUNC); + transmute(sha1rnds4(a.as_i32x4(), b.as_i32x4(), FUNC as i8)) } /// Performs an intermediate calculation for the next four SHA256 message values @@ -179,19 +172,19 @@ mod tests { let a = _mm_set_epi64x(0xe9b5dba5b5c0fbcf, 0x71374491428a2f98); let b = _mm_set_epi64x(0xab1c5ed5923f82a4, 0x59f111f13956c25b); let expected = _mm_set_epi64x(0x32b13cd8322f5268, 0xc54420862bd9246f); - let r = _mm_sha1rnds4_epu32(a, b, 0); + let r = _mm_sha1rnds4_epu32::<0>(a, b); assert_eq_m128i(r, expected); let expected = _mm_set_epi64x(0x6d4c43e56a3c25d9, 0xa7e00fb775cbd3fe); - let r = _mm_sha1rnds4_epu32(a, b, 1); + let r = _mm_sha1rnds4_epu32::<1>(a, b); assert_eq_m128i(r, expected); let expected = _mm_set_epi64x(0xb304e383c01222f4, 0x66f6b3b1f89d8001); - let r = _mm_sha1rnds4_epu32(a, b, 2); + let r = _mm_sha1rnds4_epu32::<2>(a, b); assert_eq_m128i(r, expected); let expected = _mm_set_epi64x(0x8189b758bfabfa79, 0xdb08f6e78cae098b); - let r = _mm_sha1rnds4_epu32(a, b, 3); + let r = _mm_sha1rnds4_epu32::<3>(a, b); assert_eq_m128i(r, expected); } From 1787257e8db11ab8d3000158c36027e51bce4290 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Thu, 4 Mar 2021 21:32:36 
+0100 Subject: [PATCH 056/123] remove unused constify_imm x86 macro --- crates/core_arch/src/x86/macros.rs | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/crates/core_arch/src/x86/macros.rs b/crates/core_arch/src/x86/macros.rs index 47ceaeb20a..8cacbf44c5 100644 --- a/crates/core_arch/src/x86/macros.rs +++ b/crates/core_arch/src/x86/macros.rs @@ -48,18 +48,6 @@ macro_rules! constify_imm3 { }; } -macro_rules! constify_imm2 { - ($imm8:expr, $expand:ident) => { - #[allow(overflowing_literals)] - match ($imm8) & 0b11 { - 0 => $expand!(0), - 1 => $expand!(1), - 2 => $expand!(2), - _ => $expand!(3), - } - }; -} - // Constifies 5 bits along with an sae option without rounding control. // See: https://github.com/llvm/llvm-project/blob/bd50cf905fa7c0c7caa134301c6ca0658c81eeb1/clang/lib/Sema/SemaChecking.cpp#L3497 #[allow(unused)] From b4a176d1da8d12ce5fa7e86ad4033c92f4a5fdff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Thu, 4 Mar 2021 21:58:13 +0100 Subject: [PATCH 057/123] convert `_mm_clmulepi64_si128` to const generics --- crates/core_arch/src/x86/avx512vpclmulqdq.rs | 16 +++++----- crates/core_arch/src/x86/pclmulqdq.rs | 32 +++++++++----------- 2 files changed, 22 insertions(+), 26 deletions(-) diff --git a/crates/core_arch/src/x86/avx512vpclmulqdq.rs b/crates/core_arch/src/x86/avx512vpclmulqdq.rs index 831ab7f642..b4e472dc3f 100644 --- a/crates/core_arch/src/x86/avx512vpclmulqdq.rs +++ b/crates/core_arch/src/x86/avx512vpclmulqdq.rs @@ -221,19 +221,19 @@ mod tests { ); verify_512_helper( - |a, b| _mm_clmulepi64_si128(a, b, 0x00), + |a, b| _mm_clmulepi64_si128::<0x00>(a, b), |a, b| _mm512_clmulepi64_epi128(a, b, 0x00), ); verify_512_helper( - |a, b| _mm_clmulepi64_si128(a, b, 0x01), + |a, b| _mm_clmulepi64_si128::<0x01>(a, b), |a, b| _mm512_clmulepi64_epi128(a, b, 0x01), ); verify_512_helper( - |a, b| _mm_clmulepi64_si128(a, b, 0x10), + |a, b| _mm_clmulepi64_si128::<0x10>(a, b), |a, b| _mm512_clmulepi64_epi128(a, b, 0x10), ); 
verify_512_helper( - |a, b| _mm_clmulepi64_si128(a, b, 0x11), + |a, b| _mm_clmulepi64_si128::<0x11>(a, b), |a, b| _mm512_clmulepi64_epi128(a, b, 0x11), ); } @@ -247,19 +247,19 @@ mod tests { ); verify_256_helper( - |a, b| _mm_clmulepi64_si128(a, b, 0x00), + |a, b| _mm_clmulepi64_si128::<0x00>(a, b), |a, b| _mm256_clmulepi64_epi128(a, b, 0x00), ); verify_256_helper( - |a, b| _mm_clmulepi64_si128(a, b, 0x01), + |a, b| _mm_clmulepi64_si128::<0x01>(a, b), |a, b| _mm256_clmulepi64_epi128(a, b, 0x01), ); verify_256_helper( - |a, b| _mm_clmulepi64_si128(a, b, 0x10), + |a, b| _mm_clmulepi64_si128::<0x10>(a, b), |a, b| _mm256_clmulepi64_epi128(a, b, 0x10), ); verify_256_helper( - |a, b| _mm_clmulepi64_si128(a, b, 0x11), + |a, b| _mm_clmulepi64_si128::<0x11>(a, b), |a, b| _mm256_clmulepi64_epi128(a, b, 0x11), ); } diff --git a/crates/core_arch/src/x86/pclmulqdq.rs b/crates/core_arch/src/x86/pclmulqdq.rs index 0e1bebae9e..6ccf3a62a6 100644 --- a/crates/core_arch/src/x86/pclmulqdq.rs +++ b/crates/core_arch/src/x86/pclmulqdq.rs @@ -25,20 +25,16 @@ extern "C" { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clmulepi64_si128) #[inline] #[target_feature(enable = "pclmulqdq")] -#[cfg_attr(all(test, not(target_os = "linux")), assert_instr(pclmulqdq, imm8 = 0))] -#[cfg_attr(all(test, target_os = "linux"), assert_instr(pclmullqlqdq, imm8 = 0))] -#[cfg_attr(all(test, target_os = "linux"), assert_instr(pclmulhqlqdq, imm8 = 1))] -#[cfg_attr(all(test, target_os = "linux"), assert_instr(pclmullqhqdq, imm8 = 16))] -#[cfg_attr(all(test, target_os = "linux"), assert_instr(pclmulhqhqdq, imm8 = 17))] -#[rustc_args_required_const(2)] +#[cfg_attr(all(test, not(target_os = "linux")), assert_instr(pclmulqdq, IMM8 = 0))] +#[cfg_attr(all(test, target_os = "linux"), assert_instr(pclmullqlqdq, IMM8 = 0))] +#[cfg_attr(all(test, target_os = "linux"), assert_instr(pclmulhqlqdq, IMM8 = 1))] +#[cfg_attr(all(test, target_os = "linux"), 
assert_instr(pclmullqhqdq, IMM8 = 16))] +#[cfg_attr(all(test, target_os = "linux"), assert_instr(pclmulhqhqdq, IMM8 = 17))] +#[rustc_legacy_const_generics(2)] #[stable(feature = "simd_x86", since = "1.27.0")] -pub unsafe fn _mm_clmulepi64_si128(a: __m128i, b: __m128i, imm8: i32) -> __m128i { - macro_rules! call { - ($imm8:expr) => { - pclmulqdq(a, b, $imm8) - }; - } - constify_imm8!(imm8, call) +pub unsafe fn _mm_clmulepi64_si128(a: __m128i, b: __m128i) -> __m128i { + static_assert_imm8!(IMM8); + pclmulqdq(a, b, IMM8 as u8) } #[cfg(test)] @@ -62,13 +58,13 @@ mod tests { let r10 = _mm_set_epi64x(0x1a2bf6db3a30862f, 0xbabf262df4b7d5c9); let r11 = _mm_set_epi64x(0x1d1e1f2c592e7c45, 0xd66ee03e410fd4ed); - assert_eq_m128i(_mm_clmulepi64_si128(a, b, 0x00), r00); - assert_eq_m128i(_mm_clmulepi64_si128(a, b, 0x10), r01); - assert_eq_m128i(_mm_clmulepi64_si128(a, b, 0x01), r10); - assert_eq_m128i(_mm_clmulepi64_si128(a, b, 0x11), r11); + assert_eq_m128i(_mm_clmulepi64_si128::<0x00>(a, b), r00); + assert_eq_m128i(_mm_clmulepi64_si128::<0x10>(a, b), r01); + assert_eq_m128i(_mm_clmulepi64_si128::<0x01>(a, b), r10); + assert_eq_m128i(_mm_clmulepi64_si128::<0x11>(a, b), r11); let a0 = _mm_set_epi64x(0x0000000000000000, 0x8000000000000000); let r = _mm_set_epi64x(0x4000000000000000, 0x0000000000000000); - assert_eq_m128i(_mm_clmulepi64_si128(a0, a0, 0x00), r); + assert_eq_m128i(_mm_clmulepi64_si128::<0x00>(a0, a0), r); } } From 23ea66383579dc8e5fe7f3307dd16c89c02b66c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Thu, 4 Mar 2021 22:10:08 +0100 Subject: [PATCH 058/123] convert `_mm512_cmp_epu16_mask` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index b6fa9d254a..3a7f397292 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -3689,22 +3689,18 @@ 
pub unsafe fn _mm_mask_cmpneq_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) - _mm_cmpneq_epi8_mask(a, b) & k1 } -/// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. +/// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by `IMM8`, and store the results in mask vector k. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_epu16_mask&expand=715) #[inline] #[target_feature(enable = "avx512bw")] -#[rustc_args_required_const(2)] -#[cfg_attr(test, assert_instr(vpcmp, imm8 = 0))] -pub unsafe fn _mm512_cmp_epu16_mask(a: __m512i, b: __m512i, imm8: i32) -> __mmask32 { +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] +pub unsafe fn _mm512_cmp_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 { + static_assert_imm3!(IMM8); let a = a.as_u16x32(); let b = b.as_u16x32(); - macro_rules! 
call { - ($imm3:expr) => { - vpcmpuw(a, b, $imm3, 0b11111111_11111111_11111111_11111111) - }; - } - let r = constify_imm3!(imm8, call); + let r = vpcmpuw(a, b, IMM8, 0b11111111_11111111_11111111_11111111); transmute(r) } @@ -13456,7 +13452,7 @@ mod tests { unsafe fn test_mm512_cmp_epu16_mask() { let a = _mm512_set1_epi16(0); let b = _mm512_set1_epi16(1); - let m = _mm512_cmp_epu16_mask(a, b, _MM_CMPINT_LT); + let m = _mm512_cmp_epu16_mask::<_MM_CMPINT_LT>(a, b); assert_eq!(m, 0b11111111_11111111_11111111_11111111); } From d56aa643b9d68f8bf4947190604f7384f344616a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Thu, 4 Mar 2021 22:23:55 +0100 Subject: [PATCH 059/123] convert `_mm512_mask_cmp_epu16_mask` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 3a7f397292..1feebf50bb 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -3709,22 +3709,17 @@ pub unsafe fn _mm512_cmp_epu16_mask(a: __m512i, b: __m512i) -> /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_epu16_mask&expand=716) #[inline] #[target_feature(enable = "avx512bw")] -#[rustc_args_required_const(3)] -#[cfg_attr(test, assert_instr(vpcmp, imm8 = 0))] -pub unsafe fn _mm512_mask_cmp_epu16_mask( +#[rustc_legacy_const_generics(3)] +#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] +pub unsafe fn _mm512_mask_cmp_epu16_mask( k1: __mmask32, a: __m512i, b: __m512i, - imm8: i32, ) -> __mmask32 { + static_assert_imm3!(IMM8); let a = a.as_u16x32(); let b = b.as_u16x32(); - macro_rules! 
call { - ($imm3:expr) => { - vpcmpuw(a, b, $imm3, k1) - }; - } - let r = constify_imm3!(imm8, call); + let r = vpcmpuw(a, b, IMM8, k1); transmute(r) } @@ -13461,7 +13456,7 @@ mod tests { let a = _mm512_set1_epi16(0); let b = _mm512_set1_epi16(1); let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmp_epu16_mask(mask, a, b, _MM_CMPINT_LT); + let r = _mm512_mask_cmp_epu16_mask::<_MM_CMPINT_LT>(mask, a, b); assert_eq!(r, 0b01010101_01010101_01010101_01010101); } From 80a9a08a5a34ec11696cf10a6a0368ea43fb4f16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Thu, 4 Mar 2021 22:27:28 +0100 Subject: [PATCH 060/123] convert `_mm256_cmp_epu16_mask` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 1feebf50bb..9bfc4e9d44 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -3728,17 +3728,13 @@ pub unsafe fn _mm512_mask_cmp_epu16_mask( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmp_epu16_mask&expand=713) #[inline] #[target_feature(enable = "avx512bw,avx512vl")] -#[rustc_args_required_const(2)] -#[cfg_attr(test, assert_instr(vpcmp, imm8 = 0))] -pub unsafe fn _mm256_cmp_epu16_mask(a: __m256i, b: __m256i, imm8: i32) -> __mmask16 { +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] +pub unsafe fn _mm256_cmp_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 { + static_assert_imm3!(IMM8); let a = a.as_u16x16(); let b = b.as_u16x16(); - macro_rules! 
call { - ($imm3:expr) => { - vpcmpuw256(a, b, $imm3, 0b11111111_11111111) - }; - } - let r = constify_imm3!(imm8, call); + let r = vpcmpuw256(a, b, IMM8, 0b11111111_11111111); transmute(r) } @@ -13464,7 +13460,7 @@ mod tests { unsafe fn test_mm256_cmp_epu16_mask() { let a = _mm256_set1_epi16(0); let b = _mm256_set1_epi16(1); - let m = _mm256_cmp_epu16_mask(a, b, _MM_CMPINT_LT); + let m = _mm256_cmp_epu16_mask::<_MM_CMPINT_LT>(a, b); assert_eq!(m, 0b11111111_11111111); } From 5fc82f5ead8ee4e1ff5bb2d5c138fddb49f0611f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Thu, 4 Mar 2021 22:29:17 +0100 Subject: [PATCH 061/123] convert `_mm256_mask_cmp_epu16_mask` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 9bfc4e9d44..4df9389029 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -3743,22 +3743,17 @@ pub unsafe fn _mm256_cmp_epu16_mask(a: __m256i, b: __m256i) -> /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmp_epu16_mask&expand=714) #[inline] #[target_feature(enable = "avx512bw,avx512vl")] -#[rustc_args_required_const(3)] -#[cfg_attr(test, assert_instr(vpcmp, imm8 = 0))] -pub unsafe fn _mm256_mask_cmp_epu16_mask( +#[rustc_legacy_const_generics(3)] +#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] +pub unsafe fn _mm256_mask_cmp_epu16_mask( k1: __mmask16, a: __m256i, b: __m256i, - imm8: i32, ) -> __mmask16 { + static_assert_imm3!(IMM8); let a = a.as_u16x16(); let b = b.as_u16x16(); - macro_rules! 
call { - ($imm3:expr) => { - vpcmpuw256(a, b, $imm3, k1) - }; - } - let r = constify_imm3!(imm8, call); + let r = vpcmpuw256(a, b, IMM8, k1); transmute(r) } @@ -13469,7 +13464,7 @@ mod tests { let a = _mm256_set1_epi16(0); let b = _mm256_set1_epi16(1); let mask = 0b01010101_01010101; - let r = _mm256_mask_cmp_epu16_mask(mask, a, b, _MM_CMPINT_LT); + let r = _mm256_mask_cmp_epu16_mask::<_MM_CMPINT_LT>(mask, a, b); assert_eq!(r, 0b01010101_01010101); } From 9bd881c00ba0eb695ddcf7973b04a8e6637333ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Thu, 4 Mar 2021 22:30:13 +0100 Subject: [PATCH 062/123] convert `_mm_cmp_epu16_mask` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 4df9389029..74e0ef6c0d 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -3762,17 +3762,13 @@ pub unsafe fn _mm256_mask_cmp_epu16_mask( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_epu16_mask&expand=711) #[inline] #[target_feature(enable = "avx512bw,avx512vl")] -#[rustc_args_required_const(2)] -#[cfg_attr(test, assert_instr(vpcmp, imm8 = 0))] -pub unsafe fn _mm_cmp_epu16_mask(a: __m128i, b: __m128i, imm8: i32) -> __mmask8 { +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] +pub unsafe fn _mm_cmp_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 { + static_assert_imm3!(IMM8); let a = a.as_u16x8(); let b = b.as_u16x8(); - macro_rules! 
call { - ($imm3:expr) => { - vpcmpuw128(a, b, $imm3, 0b11111111) - }; - } - let r = constify_imm3!(imm8, call); + let r = vpcmpuw128(a, b, IMM8, 0b11111111); transmute(r) } @@ -13472,7 +13468,7 @@ mod tests { unsafe fn test_mm_cmp_epu16_mask() { let a = _mm_set1_epi16(0); let b = _mm_set1_epi16(1); - let m = _mm_cmp_epu16_mask(a, b, _MM_CMPINT_LT); + let m = _mm_cmp_epu16_mask::<_MM_CMPINT_LT>(a, b); assert_eq!(m, 0b11111111); } From 27e36f3e179970ced38743a32e9e9758303dfd28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Thu, 4 Mar 2021 22:32:50 +0100 Subject: [PATCH 063/123] convert `_mm_mask_cmp_epu16_mask` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 74e0ef6c0d..f7b916d7c0 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -3777,17 +3777,17 @@ pub unsafe fn _mm_cmp_epu16_mask(a: __m128i, b: __m128i) -> __m /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_epu16_mask&expand=712) #[inline] #[target_feature(enable = "avx512bw,avx512vl")] -#[rustc_args_required_const(3)] -#[cfg_attr(test, assert_instr(vpcmp, imm8 = 0))] -pub unsafe fn _mm_mask_cmp_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i, imm8: i32) -> __mmask8 { +#[rustc_legacy_const_generics(3)] +#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] +pub unsafe fn _mm_mask_cmp_epu16_mask( + k1: __mmask8, + a: __m128i, + b: __m128i, +) -> __mmask8 { + static_assert_imm3!(IMM8); let a = a.as_u16x8(); let b = b.as_u16x8(); - macro_rules! 
call { - ($imm3:expr) => { - vpcmpuw128(a, b, $imm3, k1) - }; - } - let r = constify_imm3!(imm8, call); + let r = vpcmpuw128(a, b, IMM8, k1); transmute(r) } @@ -13477,7 +13477,7 @@ mod tests { let a = _mm_set1_epi16(0); let b = _mm_set1_epi16(1); let mask = 0b01010101; - let r = _mm_mask_cmp_epu16_mask(mask, a, b, _MM_CMPINT_LT); + let r = _mm_mask_cmp_epu16_mask::<_MM_CMPINT_LT>(mask, a, b); assert_eq!(r, 0b01010101); } From 72f5b431839cde31c11e06a9d59a727717957c51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Thu, 4 Mar 2021 22:33:56 +0100 Subject: [PATCH 064/123] convert `_mm512_cmp_epu8_mask` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index f7b916d7c0..4ad3c0901f 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -3796,22 +3796,18 @@ pub unsafe fn _mm_mask_cmp_epu16_mask( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_epu8_mask&expand=733) #[inline] #[target_feature(enable = "avx512bw")] -#[rustc_args_required_const(2)] -#[cfg_attr(test, assert_instr(vpcmp, imm8 = 0))] -pub unsafe fn _mm512_cmp_epu8_mask(a: __m512i, b: __m512i, imm8: i32) -> __mmask64 { +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] +pub unsafe fn _mm512_cmp_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 { + static_assert_imm3!(IMM8); let a = a.as_u8x64(); let b = b.as_u8x64(); - macro_rules! 
call { - ($imm3:expr) => { - vpcmpub( - a, - b, - $imm3, - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, - ) - }; - } - let r = constify_imm3!(imm8, call); + let r = vpcmpub( + a, + b, + IMM8, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + ); transmute(r) } @@ -13485,7 +13481,7 @@ mod tests { unsafe fn test_mm512_cmp_epu8_mask() { let a = _mm512_set1_epi8(0); let b = _mm512_set1_epi8(1); - let m = _mm512_cmp_epu8_mask(a, b, _MM_CMPINT_LT); + let m = _mm512_cmp_epu8_mask::<_MM_CMPINT_LT>(a, b); assert_eq!( m, 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 From bf834810fed16b6e82a500fb7bf028c1031b9140 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Thu, 4 Mar 2021 22:35:11 +0100 Subject: [PATCH 065/123] convert `_mm512_mask_cmp_epu8_mask` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 4ad3c0901f..51f83f4fa5 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -3816,22 +3816,17 @@ pub unsafe fn _mm512_cmp_epu8_mask(a: __m512i, b: __m512i) -> _ /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_epu8_mask&expand=734) #[inline] #[target_feature(enable = "avx512bw")] -#[rustc_args_required_const(3)] -#[cfg_attr(test, assert_instr(vpcmp, imm8 = 0))] -pub unsafe fn _mm512_mask_cmp_epu8_mask( +#[rustc_legacy_const_generics(3)] +#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] +pub unsafe fn _mm512_mask_cmp_epu8_mask( k1: __mmask64, a: __m512i, b: __m512i, - imm8: i32, ) -> __mmask64 { + static_assert_imm3!(IMM8); let a = a.as_u8x64(); let b = b.as_u8x64(); - macro_rules! 
call { - ($imm3:expr) => { - vpcmpub(a, b, $imm3, k1) - }; - } - let r = constify_imm3!(imm8, call); + let r = vpcmpub(a, b, IMM8, k1); transmute(r) } @@ -13493,7 +13488,7 @@ mod tests { let a = _mm512_set1_epi8(0); let b = _mm512_set1_epi8(1); let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmp_epu8_mask(mask, a, b, _MM_CMPINT_LT); + let r = _mm512_mask_cmp_epu8_mask::<_MM_CMPINT_LT>(mask, a, b); assert_eq!( r, 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 From c5a2a4c4fe5f182c8d8aa032281ab03277b55fb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Thu, 4 Mar 2021 22:35:57 +0100 Subject: [PATCH 066/123] convert `_mm256_cmp_epu8_mask` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 51f83f4fa5..e74b987005 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -3835,17 +3835,13 @@ pub unsafe fn _mm512_mask_cmp_epu8_mask( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmp_epu8_mask&expand=731) #[inline] #[target_feature(enable = "avx512bw,avx512vl")] -#[rustc_args_required_const(2)] -#[cfg_attr(test, assert_instr(vpcmp, imm8 = 0))] -pub unsafe fn _mm256_cmp_epu8_mask(a: __m256i, b: __m256i, imm8: i32) -> __mmask32 { +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] +pub unsafe fn _mm256_cmp_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 { + static_assert_imm3!(IMM8); let a = a.as_u8x32(); let b = b.as_u8x32(); - macro_rules! 
call { - ($imm3:expr) => { - vpcmpub256(a, b, $imm3, 0b11111111_11111111_11111111_11111111) - }; - } - let r = constify_imm3!(imm8, call); + let r = vpcmpub256(a, b, IMM8, 0b11111111_11111111_11111111_11111111); transmute(r) } @@ -13499,7 +13495,7 @@ mod tests { unsafe fn test_mm256_cmp_epu8_mask() { let a = _mm256_set1_epi8(0); let b = _mm256_set1_epi8(1); - let m = _mm256_cmp_epu8_mask(a, b, _MM_CMPINT_LT); + let m = _mm256_cmp_epu8_mask::<_MM_CMPINT_LT>(a, b); assert_eq!(m, 0b11111111_11111111_11111111_11111111); } From ff17cf8debbb47a3a977c6b62e52e140f02f14f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Thu, 4 Mar 2021 22:36:44 +0100 Subject: [PATCH 067/123] convert `_mm256_mask_cmp_epu8_mask` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index e74b987005..3905a35227 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -3850,22 +3850,17 @@ pub unsafe fn _mm256_cmp_epu8_mask(a: __m256i, b: __m256i) -> _ /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmp_epu8_mask&expand=732) #[inline] #[target_feature(enable = "avx512bw,avx512vl")] -#[rustc_args_required_const(3)] -#[cfg_attr(test, assert_instr(vpcmp, imm8 = 0))] -pub unsafe fn _mm256_mask_cmp_epu8_mask( +#[rustc_legacy_const_generics(3)] +#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] +pub unsafe fn _mm256_mask_cmp_epu8_mask( k1: __mmask32, a: __m256i, b: __m256i, - imm8: i32, ) -> __mmask32 { + static_assert_imm3!(IMM8); let a = a.as_u8x32(); let b = b.as_u8x32(); - macro_rules! 
call { - ($imm3:expr) => { - vpcmpub256(a, b, $imm3, k1) - }; - } - let r = constify_imm3!(imm8, call); + let r = vpcmpub256(a, b, IMM8, k1); transmute(r) } @@ -13504,7 +13499,7 @@ mod tests { let a = _mm256_set1_epi8(0); let b = _mm256_set1_epi8(1); let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm256_mask_cmp_epu8_mask(mask, a, b, _MM_CMPINT_LT); + let r = _mm256_mask_cmp_epu8_mask::<_MM_CMPINT_LT>(mask, a, b); assert_eq!(r, 0b01010101_01010101_01010101_01010101); } From 990616d867961634f3e344770a140e4dc6fc8412 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Thu, 4 Mar 2021 22:37:38 +0100 Subject: [PATCH 068/123] convert `_mm_cmp_epu8_mask` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 3905a35227..4f1c271401 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -3869,17 +3869,13 @@ pub unsafe fn _mm256_mask_cmp_epu8_mask( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_epu8_mask&expand=729) #[inline] #[target_feature(enable = "avx512bw,avx512vl")] -#[rustc_args_required_const(2)] -#[cfg_attr(test, assert_instr(vpcmp, imm8 = 0))] -pub unsafe fn _mm_cmp_epu8_mask(a: __m128i, b: __m128i, imm8: i32) -> __mmask16 { +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] +pub unsafe fn _mm_cmp_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 { + static_assert_imm3!(IMM8); let a = a.as_u8x16(); let b = b.as_u8x16(); - macro_rules! 
call { - ($imm3:expr) => { - vpcmpub128(a, b, $imm3, 0b11111111_11111111) - }; - } - let r = constify_imm3!(imm8, call); + let r = vpcmpub128(a, b, IMM8, 0b11111111_11111111); transmute(r) } @@ -13507,7 +13503,7 @@ mod tests { unsafe fn test_mm_cmp_epu8_mask() { let a = _mm_set1_epi8(0); let b = _mm_set1_epi8(1); - let m = _mm_cmp_epu8_mask(a, b, _MM_CMPINT_LT); + let m = _mm_cmp_epu8_mask::<_MM_CMPINT_LT>(a, b); assert_eq!(m, 0b11111111_11111111); } From ede02dd26dbd505a9ee6366b01d680df06688e1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Thu, 4 Mar 2021 22:38:29 +0100 Subject: [PATCH 069/123] convert `_mm_mask_cmp_epu8_mask` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 4f1c271401..e011cb4749 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -3884,22 +3884,17 @@ pub unsafe fn _mm_cmp_epu8_mask(a: __m128i, b: __m128i) -> __mm /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_epu8_mask&expand=730) #[inline] #[target_feature(enable = "avx512bw,avx512vl")] -#[rustc_args_required_const(3)] -#[cfg_attr(test, assert_instr(vpcmp, imm8 = 0))] -pub unsafe fn _mm_mask_cmp_epu8_mask( +#[rustc_legacy_const_generics(3)] +#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] +pub unsafe fn _mm_mask_cmp_epu8_mask( k1: __mmask16, a: __m128i, b: __m128i, - imm8: i32, ) -> __mmask16 { + static_assert_imm3!(IMM8); let a = a.as_u8x16(); let b = b.as_u8x16(); - macro_rules! 
call { - ($imm3:expr) => { - vpcmpub128(a, b, $imm3, k1) - }; - } - let r = constify_imm3!(imm8, call); + let r = vpcmpub128(a, b, IMM8, k1); transmute(r) } @@ -13512,7 +13507,7 @@ mod tests { let a = _mm_set1_epi8(0); let b = _mm_set1_epi8(1); let mask = 0b01010101_01010101; - let r = _mm_mask_cmp_epu8_mask(mask, a, b, _MM_CMPINT_LT); + let r = _mm_mask_cmp_epu8_mask::<_MM_CMPINT_LT>(mask, a, b); assert_eq!(r, 0b01010101_01010101); } From e2b3855675741a2412eb01bf3cd67626e793dce0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Thu, 4 Mar 2021 22:39:20 +0100 Subject: [PATCH 070/123] convert `_mm512_cmp_epi16_mask` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index e011cb4749..37b882353b 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -3903,17 +3903,13 @@ pub unsafe fn _mm_mask_cmp_epu8_mask( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_epi16_mask&expand=691) #[inline] #[target_feature(enable = "avx512bw")] -#[rustc_args_required_const(2)] -#[cfg_attr(test, assert_instr(vpcmp, imm8 = 0))] -pub unsafe fn _mm512_cmp_epi16_mask(a: __m512i, b: __m512i, imm8: i32) -> __mmask32 { +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] +pub unsafe fn _mm512_cmp_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { + static_assert_imm3!(IMM8); let a = a.as_i16x32(); let b = b.as_i16x32(); - macro_rules! 
call { - ($imm3:expr) => { - vpcmpw(a, b, $imm3, 0b11111111_11111111_11111111_11111111) - }; - } - let r = constify_imm3!(imm8, call); + let r = vpcmpw(a, b, IMM8, 0b11111111_11111111_11111111_11111111); transmute(r) } @@ -13515,7 +13511,7 @@ mod tests { unsafe fn test_mm512_cmp_epi16_mask() { let a = _mm512_set1_epi16(0); let b = _mm512_set1_epi16(1); - let m = _mm512_cmp_epi16_mask(a, b, _MM_CMPINT_LT); + let m = _mm512_cmp_epi16_mask::<_MM_CMPINT_LT>(a, b); assert_eq!(m, 0b11111111_11111111_11111111_11111111); } From 558016c38d940d11ea796052333c7a4eecd331d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Thu, 4 Mar 2021 22:40:06 +0100 Subject: [PATCH 071/123] convert `_mm512_mask_cmp_epi16_mask` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 37b882353b..933e829990 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -3918,22 +3918,17 @@ pub unsafe fn _mm512_cmp_epi16_mask(a: __m512i, b: __m512i) -> /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_epi16_mask&expand=692) #[inline] #[target_feature(enable = "avx512bw")] -#[rustc_args_required_const(3)] -#[cfg_attr(test, assert_instr(vpcmp, imm8 = 0))] -pub unsafe fn _mm512_mask_cmp_epi16_mask( +#[rustc_legacy_const_generics(3)] +#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] +pub unsafe fn _mm512_mask_cmp_epi16_mask( k1: __mmask32, a: __m512i, b: __m512i, - imm8: i32, ) -> __mmask32 { + static_assert_imm3!(IMM8); let a = a.as_i16x32(); let b = b.as_i16x32(); - macro_rules! 
call { - ($imm3:expr) => { - vpcmpw(a, b, $imm3, k1) - }; - } - let r = constify_imm3!(imm8, call); + let r = vpcmpw(a, b, IMM8, k1); transmute(r) } @@ -13520,7 +13515,7 @@ mod tests { let a = _mm512_set1_epi16(0); let b = _mm512_set1_epi16(1); let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmp_epi16_mask(mask, a, b, _MM_CMPINT_LT); + let r = _mm512_mask_cmp_epi16_mask::<_MM_CMPINT_LT>(mask, a, b); assert_eq!(r, 0b01010101_01010101_01010101_01010101); } From 5b10e43ca4518e9460ea44fd139da45258b96f8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Thu, 4 Mar 2021 22:41:44 +0100 Subject: [PATCH 072/123] convert `_mm256_cmp_epi16_mask` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 933e829990..a249057a0d 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -3937,17 +3937,13 @@ pub unsafe fn _mm512_mask_cmp_epi16_mask( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmp_epi16_mask&expand=689) #[inline] #[target_feature(enable = "avx512bw,avx512vl")] -#[rustc_args_required_const(2)] -#[cfg_attr(test, assert_instr(vpcmp, imm8 = 0))] -pub unsafe fn _mm256_cmp_epi16_mask(a: __m256i, b: __m256i, imm8: i32) -> __mmask16 { +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] +pub unsafe fn _mm256_cmp_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { + static_assert_imm3!(IMM8); let a = a.as_i16x16(); let b = b.as_i16x16(); - macro_rules! 
call { - ($imm3:expr) => { - vpcmpw256(a, b, $imm3, 0b11111111_11111111) - }; - } - let r = constify_imm3!(imm8, call); + let r = vpcmpw256(a, b, IMM8, 0b11111111_11111111); transmute(r) } @@ -13523,7 +13519,7 @@ mod tests { unsafe fn test_mm256_cmp_epi16_mask() { let a = _mm256_set1_epi16(0); let b = _mm256_set1_epi16(1); - let m = _mm256_cmp_epi16_mask(a, b, _MM_CMPINT_LT); + let m = _mm256_cmp_epi16_mask::<_MM_CMPINT_LT>(a, b); assert_eq!(m, 0b11111111_11111111); } From 2551d9bd2f7bd27e49531ead7eae854e8c80576c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Thu, 4 Mar 2021 22:42:46 +0100 Subject: [PATCH 073/123] convert `_mm256_mask_cmp_epi16_mask` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index a249057a0d..8bf8dfd1da 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -3952,22 +3952,17 @@ pub unsafe fn _mm256_cmp_epi16_mask(a: __m256i, b: __m256i) -> /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmp_epi16_mask&expand=690) #[inline] #[target_feature(enable = "avx512bw,avx512vl")] -#[rustc_args_required_const(3)] -#[cfg_attr(test, assert_instr(vpcmp, imm8 = 0))] -pub unsafe fn _mm256_mask_cmp_epi16_mask( +#[rustc_legacy_const_generics(3)] +#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] +pub unsafe fn _mm256_mask_cmp_epi16_mask( k1: __mmask16, a: __m256i, b: __m256i, - imm8: i32, ) -> __mmask16 { + static_assert_imm3!(IMM8); let a = a.as_i16x16(); let b = b.as_i16x16(); - macro_rules! 
call { - ($imm3:expr) => { - vpcmpw256(a, b, $imm3, k1) - }; - } - let r = constify_imm3!(imm8, call); + let r = vpcmpw256(a, b, IMM8, k1); transmute(r) } @@ -13528,7 +13523,7 @@ mod tests { let a = _mm256_set1_epi16(0); let b = _mm256_set1_epi16(1); let mask = 0b01010101_01010101; - let r = _mm256_mask_cmp_epi16_mask(mask, a, b, _MM_CMPINT_LT); + let r = _mm256_mask_cmp_epi16_mask::<_MM_CMPINT_LT>(mask, a, b); assert_eq!(r, 0b01010101_01010101); } From e9ebb9433edc335d3adf8f32a826ac099fa1601f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Thu, 4 Mar 2021 22:43:33 +0100 Subject: [PATCH 074/123] convert `_mm_cmp_epi16_mask` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 8bf8dfd1da..c905cdf4a6 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -3971,17 +3971,13 @@ pub unsafe fn _mm256_mask_cmp_epi16_mask( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_epi16_mask&expand=687) #[inline] #[target_feature(enable = "avx512bw,avx512vl")] -#[rustc_args_required_const(2)] -#[cfg_attr(test, assert_instr(vpcmp, imm8 = 0))] -pub unsafe fn _mm_cmp_epi16_mask(a: __m128i, b: __m128i, imm8: i32) -> __mmask8 { +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] +pub unsafe fn _mm_cmp_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { + static_assert_imm3!(IMM8); let a = a.as_i16x8(); let b = b.as_i16x8(); - macro_rules! 
call { - ($imm3:expr) => { - vpcmpw128(a, b, $imm3, 0b11111111) - }; - } - let r = constify_imm3!(imm8, call); + let r = vpcmpw128(a, b, IMM8, 0b11111111); transmute(r) } @@ -13531,7 +13527,7 @@ mod tests { unsafe fn test_mm_cmp_epi16_mask() { let a = _mm_set1_epi16(0); let b = _mm_set1_epi16(1); - let m = _mm_cmp_epi16_mask(a, b, _MM_CMPINT_LT); + let m = _mm_cmp_epi16_mask::<_MM_CMPINT_LT>(a, b); assert_eq!(m, 0b11111111); } From e84657a465f25cbfac68ea70036a3cb78cf5a725 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Thu, 4 Mar 2021 22:44:18 +0100 Subject: [PATCH 075/123] convert `_mm_mask_cmp_epi16_mask` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index c905cdf4a6..e2cfe4b0ac 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -3986,17 +3986,17 @@ pub unsafe fn _mm_cmp_epi16_mask(a: __m128i, b: __m128i) -> __m /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_epi16_mask&expand=688) #[inline] #[target_feature(enable = "avx512bw,avx512vl")] -#[rustc_args_required_const(3)] -#[cfg_attr(test, assert_instr(vpcmp, imm8 = 0))] -pub unsafe fn _mm_mask_cmp_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i, imm8: i32) -> __mmask8 { +#[rustc_legacy_const_generics(3)] +#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] +pub unsafe fn _mm_mask_cmp_epi16_mask( + k1: __mmask8, + a: __m128i, + b: __m128i, +) -> __mmask8 { + static_assert_imm3!(IMM8); let a = a.as_i16x8(); let b = b.as_i16x8(); - macro_rules! 
call { - ($imm3:expr) => { - vpcmpw128(a, b, $imm3, k1) - }; - } - let r = constify_imm3!(imm8, call); + let r = vpcmpw128(a, b, IMM8, k1); transmute(r) } @@ -13536,7 +13536,7 @@ mod tests { let a = _mm_set1_epi16(0); let b = _mm_set1_epi16(1); let mask = 0b01010101; - let r = _mm_mask_cmp_epi16_mask(mask, a, b, _MM_CMPINT_LT); + let r = _mm_mask_cmp_epi16_mask::<_MM_CMPINT_LT>(mask, a, b); assert_eq!(r, 0b01010101); } From dbbcb17293df2b624ef92721a382d0b9a8e0f7e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Thu, 4 Mar 2021 22:44:58 +0100 Subject: [PATCH 076/123] convert `_mm512_cmp_epi8_mask` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index e2cfe4b0ac..59e6372711 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -4005,22 +4005,18 @@ pub unsafe fn _mm_mask_cmp_epi16_mask( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_epi8_mask&expand=709) #[inline] #[target_feature(enable = "avx512bw")] -#[rustc_args_required_const(2)] -#[cfg_attr(test, assert_instr(vpcmp, imm8 = 0))] -pub unsafe fn _mm512_cmp_epi8_mask(a: __m512i, b: __m512i, imm8: i32) -> __mmask64 { +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] +pub unsafe fn _mm512_cmp_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { + static_assert_imm3!(IMM8); let a = a.as_i8x64(); let b = b.as_i8x64(); - macro_rules! 
call { - ($imm3:expr) => { - vpcmpb( - a, - b, - $imm3, - 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, - ) - }; - } - let r = constify_imm3!(imm8, call); + let r = vpcmpb( + a, + b, + IMM8, + 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, + ); transmute(r) } @@ -13544,7 +13540,7 @@ mod tests { unsafe fn test_mm512_cmp_epi8_mask() { let a = _mm512_set1_epi8(0); let b = _mm512_set1_epi8(1); - let m = _mm512_cmp_epi8_mask(a, b, _MM_CMPINT_LT); + let m = _mm512_cmp_epi8_mask::<_MM_CMPINT_LT>(a, b); assert_eq!( m, 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111 From 2693a72ff043af351888296f3b9a37d3ee003c92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Thu, 4 Mar 2021 22:46:06 +0100 Subject: [PATCH 077/123] convert `_mm512_mask_cmp_epi8_mask` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 59e6372711..b0da4936d7 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -4025,22 +4025,17 @@ pub unsafe fn _mm512_cmp_epi8_mask(a: __m512i, b: __m512i) -> _ /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_epi8_mask&expand=710) #[inline] #[target_feature(enable = "avx512bw")] -#[rustc_args_required_const(3)] -#[cfg_attr(test, assert_instr(vpcmp, imm8 = 0))] -pub unsafe fn _mm512_mask_cmp_epi8_mask( +#[rustc_legacy_const_generics(3)] +#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] +pub unsafe fn _mm512_mask_cmp_epi8_mask( k1: __mmask64, a: __m512i, b: __m512i, - imm8: i32, ) -> __mmask64 { + static_assert_imm3!(IMM8); let a = a.as_i8x64(); let b = b.as_i8x64(); - macro_rules! 
call { - ($imm3:expr) => { - vpcmpb(a, b, $imm3, k1) - }; - } - let r = constify_imm3!(imm8, call); + let r = vpcmpb(a, b, IMM8, k1); transmute(r) } @@ -13552,7 +13547,7 @@ mod tests { let a = _mm512_set1_epi8(0); let b = _mm512_set1_epi8(1); let mask = 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101; - let r = _mm512_mask_cmp_epi8_mask(mask, a, b, _MM_CMPINT_LT); + let r = _mm512_mask_cmp_epi8_mask::<_MM_CMPINT_LT>(mask, a, b); assert_eq!( r, 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101 From 28bbe5c1ad64402b4939b2da719c7192a4e19db6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Thu, 4 Mar 2021 22:46:52 +0100 Subject: [PATCH 078/123] convert `_mm256_cmp_epi8_mask` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index b0da4936d7..cdf049b1ce 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -4044,17 +4044,13 @@ pub unsafe fn _mm512_mask_cmp_epi8_mask( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmp_epi8_mask&expand=707) #[inline] #[target_feature(enable = "avx512bw,avx512vl")] -#[rustc_args_required_const(2)] -#[cfg_attr(test, assert_instr(vpcmp, imm8 = 0))] -pub unsafe fn _mm256_cmp_epi8_mask(a: __m256i, b: __m256i, imm8: i32) -> __mmask32 { +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] +pub unsafe fn _mm256_cmp_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { + static_assert_imm3!(IMM8); let a = a.as_i8x32(); let b = b.as_i8x32(); - macro_rules! 
call { - ($imm3:expr) => { - vpcmpb256(a, b, $imm3, 0b11111111_11111111_11111111_11111111) - }; - } - let r = constify_imm3!(imm8, call); + let r = vpcmpb256(a, b, IMM8, 0b11111111_11111111_11111111_11111111); transmute(r) } @@ -13558,7 +13554,7 @@ mod tests { unsafe fn test_mm256_cmp_epi8_mask() { let a = _mm256_set1_epi8(0); let b = _mm256_set1_epi8(1); - let m = _mm256_cmp_epi8_mask(a, b, _MM_CMPINT_LT); + let m = _mm256_cmp_epi8_mask::<_MM_CMPINT_LT>(a, b); assert_eq!(m, 0b11111111_11111111_11111111_11111111); } From e6e04fdee5ef99601d207790ab645dd71fedd095 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Thu, 4 Mar 2021 22:47:40 +0100 Subject: [PATCH 079/123] convert `_mm256_mask_cmp_epi8_mask` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index cdf049b1ce..cb1cd29c92 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -4059,22 +4059,17 @@ pub unsafe fn _mm256_cmp_epi8_mask(a: __m256i, b: __m256i) -> _ /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmp_epi8_mask&expand=708) #[inline] #[target_feature(enable = "avx512bw,avx512vl")] -#[rustc_args_required_const(3)] -#[cfg_attr(test, assert_instr(vpcmp, imm8 = 0))] -pub unsafe fn _mm256_mask_cmp_epi8_mask( +#[rustc_legacy_const_generics(3)] +#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] +pub unsafe fn _mm256_mask_cmp_epi8_mask( k1: __mmask32, a: __m256i, b: __m256i, - imm8: i32, ) -> __mmask32 { + static_assert_imm3!(IMM8); let a = a.as_i8x32(); let b = b.as_i8x32(); - macro_rules! 
call { - ($imm3:expr) => { - vpcmpb256(a, b, $imm3, k1) - }; - } - let r = constify_imm3!(imm8, call); + let r = vpcmpb256(a, b, IMM8, k1); transmute(r) } @@ -13563,7 +13558,7 @@ mod tests { let a = _mm256_set1_epi8(0); let b = _mm256_set1_epi8(1); let mask = 0b01010101_01010101_01010101_01010101; - let r = _mm256_mask_cmp_epi8_mask(mask, a, b, _MM_CMPINT_LT); + let r = _mm256_mask_cmp_epi8_mask::<_MM_CMPINT_LT>(mask, a, b); assert_eq!(r, 0b01010101_01010101_01010101_01010101); } From 930d81255cacfb30d278fdaf9b5c4f1a38345a40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Thu, 4 Mar 2021 22:48:46 +0100 Subject: [PATCH 080/123] convert `_mm_cmp_epi8_mask` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index cb1cd29c92..93de2e0514 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -4078,17 +4078,13 @@ pub unsafe fn _mm256_mask_cmp_epi8_mask( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_epi8_mask&expand=705) #[inline] #[target_feature(enable = "avx512bw,avx512vl")] -#[rustc_args_required_const(2)] -#[cfg_attr(test, assert_instr(vpcmp, imm8 = 0))] -pub unsafe fn _mm_cmp_epi8_mask(a: __m128i, b: __m128i, imm8: i32) -> __mmask16 { +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] +pub unsafe fn _mm_cmp_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 { + static_assert_imm3!(IMM8); let a = a.as_i8x16(); let b = b.as_i8x16(); - macro_rules! 
call { - ($imm3:expr) => { - vpcmpb128(a, b, $imm3, 0b11111111_11111111) - }; - } - let r = constify_imm3!(imm8, call); + let r = vpcmpb128(a, b, IMM8, 0b11111111_11111111); transmute(r) } @@ -13566,7 +13562,7 @@ mod tests { unsafe fn test_mm_cmp_epi8_mask() { let a = _mm_set1_epi8(0); let b = _mm_set1_epi8(1); - let m = _mm_cmp_epi8_mask(a, b, _MM_CMPINT_LT); + let m = _mm_cmp_epi8_mask::<_MM_CMPINT_LT>(a, b); assert_eq!(m, 0b11111111_11111111); } From 513b2986b606da90d758a0f021e6d0fcded6afb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Thu, 4 Mar 2021 22:49:27 +0100 Subject: [PATCH 081/123] convert `_mm_mask_cmp_epi8_mask` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 93de2e0514..5f48d76cb7 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -4093,22 +4093,17 @@ pub unsafe fn _mm_cmp_epi8_mask(a: __m128i, b: __m128i) -> __mm /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_epi8_mask&expand=706) #[inline] #[target_feature(enable = "avx512bw,avx512vl")] -#[rustc_args_required_const(3)] -#[cfg_attr(test, assert_instr(vpcmp, imm8 = 0))] -pub unsafe fn _mm_mask_cmp_epi8_mask( +#[rustc_legacy_const_generics(3)] +#[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] +pub unsafe fn _mm_mask_cmp_epi8_mask( k1: __mmask16, a: __m128i, b: __m128i, - imm8: i32, ) -> __mmask16 { + static_assert_imm3!(IMM8); let a = a.as_i8x16(); let b = b.as_i8x16(); - macro_rules! 
call { - ($imm3:expr) => { - vpcmpb128(a, b, $imm3, k1) - }; - } - let r = constify_imm3!(imm8, call); + let r = vpcmpb128(a, b, IMM8, k1); transmute(r) } @@ -13571,7 +13566,7 @@ mod tests { let a = _mm_set1_epi8(0); let b = _mm_set1_epi8(1); let mask = 0b01010101_01010101; - let r = _mm_mask_cmp_epi8_mask(mask, a, b, _MM_CMPINT_LT); + let r = _mm_mask_cmp_epi8_mask::<_MM_CMPINT_LT>(mask, a, b); assert_eq!(r, 0b01010101_01010101); } From cf03e05d894cfe5d5ac2856f2767cd5f767c949f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Thu, 4 Mar 2021 23:23:51 +0100 Subject: [PATCH 082/123] convert `_mm512_slli_epi16` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 16 ++++++---------- crates/core_arch/src/x86/macros.rs | 17 +++++++++++++++++ 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 5f48d76cb7..61614f8ff3 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -5008,16 +5008,12 @@ pub unsafe fn _mm_maskz_sll_epi16(k: __mmask8, a: __m128i, count: __m128i) -> __ /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_slli_epi16&expand=5301) #[inline] #[target_feature(enable = "avx512bw")] -#[cfg_attr(test, assert_instr(vpsllw, imm8 = 5))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm512_slli_epi16(a: __m512i, imm8: u32) -> __m512i { +#[cfg_attr(test, assert_instr(vpsllw, IMM8 = 5))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_slli_epi16(a: __m512i) -> __m512i { + static_assert_imm_u8!(IMM8); let a = a.as_i16x32(); - macro_rules! 
call { - ($imm8:expr) => { - vpslliw(a, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = vpslliw(a, IMM8); transmute(r) } @@ -14476,7 +14472,7 @@ mod tests { #[simd_test(enable = "avx512bw")] unsafe fn test_mm512_slli_epi16() { let a = _mm512_set1_epi16(1 << 15); - let r = _mm512_slli_epi16(a, 1); + let r = _mm512_slli_epi16::<1>(a); let e = _mm512_set1_epi16(0); assert_eq_m512i(r, e); } diff --git a/crates/core_arch/src/x86/macros.rs b/crates/core_arch/src/x86/macros.rs index 8cacbf44c5..8a6f025d53 100644 --- a/crates/core_arch/src/x86/macros.rs +++ b/crates/core_arch/src/x86/macros.rs @@ -32,6 +32,23 @@ macro_rules! static_assert_sae { }; } +// Helper struct used to trigger const eval errors when the unsigned const generic immediate value +// `IMM` is out of `[MIN-MAX]` range. +pub(crate) struct ValidateConstImmU32; +impl ValidateConstImmU32 { + pub(crate) const VALID: () = { + let _ = 1 / ((IMM >= MIN && IMM <= MAX) as usize); + }; +} + +#[allow(unused_macros)] +macro_rules! static_assert_imm_u8 { + ($imm:ident) => { + let _ = + $crate::core_arch::x86::macros::ValidateConstImmU32::<$imm, 0, { (1 << 8) - 1 }>::VALID; + }; +} + macro_rules! 
constify_imm3 { ($imm8:expr, $expand:ident) => { #[allow(overflowing_literals)] From d641e7d2b376017ed77fac67ab72742114def126 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Thu, 4 Mar 2021 23:27:08 +0100 Subject: [PATCH 083/123] convert `_mm512_mask_slli_epi16` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 61614f8ff3..dd0790bf86 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -5022,16 +5022,16 @@ pub unsafe fn _mm512_slli_epi16(a: __m512i) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_slli_epi16&expand=5299) #[inline] #[target_feature(enable = "avx512bw")] -#[cfg_attr(test, assert_instr(vpsllw, imm8 = 5))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_slli_epi16(src: __m512i, k: __mmask32, a: __m512i, imm8: u32) -> __m512i { +#[cfg_attr(test, assert_instr(vpsllw, IMM8 = 5))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_slli_epi16( + src: __m512i, + k: __mmask32, + a: __m512i, +) -> __m512i { + static_assert_imm_u8!(IMM8); let a = a.as_i16x32(); - macro_rules! 
call { - ($imm8:expr) => { - vpslliw(a, $imm8) - }; - } - let shf = constify_imm8_sae!(imm8, call); + let shf = vpslliw(a, IMM8); transmute(simd_select_bitmask(k, shf, src.as_i16x32())) } @@ -14480,9 +14480,9 @@ mod tests { #[simd_test(enable = "avx512bw")] unsafe fn test_mm512_mask_slli_epi16() { let a = _mm512_set1_epi16(1 << 15); - let r = _mm512_mask_slli_epi16(a, 0, a, 1); + let r = _mm512_mask_slli_epi16::<1>(a, 0, a); assert_eq_m512i(r, a); - let r = _mm512_mask_slli_epi16(a, 0b11111111_11111111_11111111_11111111, a, 1); + let r = _mm512_mask_slli_epi16::<1>(a, 0b11111111_11111111_11111111_11111111, a); let e = _mm512_set1_epi16(0); assert_eq_m512i(r, e); } From af219716901fbb1f3ee0ecd9720571665c902fb0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Thu, 4 Mar 2021 23:28:48 +0100 Subject: [PATCH 084/123] convert `_mm512_maskz_slli_epi16` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index dd0790bf86..1589bd4b95 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -5040,16 +5040,12 @@ pub unsafe fn _mm512_mask_slli_epi16( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_slli_epi16&expand=5300) #[inline] #[target_feature(enable = "avx512bw")] -#[cfg_attr(test, assert_instr(vpsllw, imm8 = 5))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_slli_epi16(k: __mmask32, a: __m512i, imm8: u32) -> __m512i { +#[cfg_attr(test, assert_instr(vpsllw, IMM8 = 5))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_slli_epi16(k: __mmask32, a: __m512i) -> __m512i { + static_assert_imm_u8!(IMM8); let a = a.as_i16x32(); - macro_rules! 
call { - ($imm8:expr) => { - vpslliw(a, $imm8) - }; - } - let shf = constify_imm8_sae!(imm8, call); + let shf = vpslliw(a, IMM8); let zero = _mm512_setzero_si512().as_i16x32(); transmute(simd_select_bitmask(k, shf, zero)) } @@ -14490,9 +14486,9 @@ mod tests { #[simd_test(enable = "avx512bw")] unsafe fn test_mm512_maskz_slli_epi16() { let a = _mm512_set1_epi16(1 << 15); - let r = _mm512_maskz_slli_epi16(0, a, 1); + let r = _mm512_maskz_slli_epi16::<1>(0, a); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_slli_epi16(0b11111111_11111111_11111111_11111111, a, 1); + let r = _mm512_maskz_slli_epi16::<1>(0b11111111_11111111_11111111_11111111, a); let e = _mm512_set1_epi16(0); assert_eq_m512i(r, e); } From fa85d17e34ead9efe943f451fdd2d9f6ddccd9b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Fri, 5 Mar 2021 00:08:24 +0100 Subject: [PATCH 085/123] convert `_mm512_srli_epi16` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 1589bd4b95..1c863a6d10 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -5328,16 +5328,12 @@ pub unsafe fn _mm_maskz_srl_epi16(k: __mmask8, a: __m128i, count: __m128i) -> __ /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_srli_epi16&expand=5513) #[inline] #[target_feature(enable = "avx512bw")] -#[cfg_attr(test, assert_instr(vpsrlw, imm8 = 5))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm512_srli_epi16(a: __m512i, imm8: u32) -> __m512i { +#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 5))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_srli_epi16(a: __m512i) -> __m512i { + static_assert_imm_u8!(IMM8); let a = a.as_i16x32(); - macro_rules! 
call { - ($imm8:expr) => { - vpsrliw(a, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = vpsrliw(a, IMM8); transmute(r) } @@ -14704,7 +14700,7 @@ mod tests { #[simd_test(enable = "avx512bw")] unsafe fn test_mm512_srli_epi16() { let a = _mm512_set1_epi16(1 << 1); - let r = _mm512_srli_epi16(a, 2); + let r = _mm512_srli_epi16::<2>(a); let e = _mm512_set1_epi16(0); assert_eq_m512i(r, e); } From e38485e5919b889008b2d5999a24594dd78a55f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Fri, 5 Mar 2021 00:09:50 +0100 Subject: [PATCH 086/123] convert `_mm512_mask_srli_epi16` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 1c863a6d10..ccd5d27a3d 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -5342,16 +5342,16 @@ pub unsafe fn _mm512_srli_epi16(a: __m512i) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_srli_epi16&expand=5511) #[inline] #[target_feature(enable = "avx512bw")] -#[cfg_attr(test, assert_instr(vpsrlw, imm8 = 5))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_srli_epi16(src: __m512i, k: __mmask32, a: __m512i, imm8: u32) -> __m512i { +#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 5))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_srli_epi16( + src: __m512i, + k: __mmask32, + a: __m512i, +) -> __m512i { + static_assert_imm_u8!(IMM8); let a = a.as_i16x32(); - macro_rules! 
call { - ($imm8:expr) => { - vpsrliw(a, $imm8) - }; - } - let shf = constify_imm8_sae!(imm8, call); + let shf = vpsrliw(a, IMM8); transmute(simd_select_bitmask(k, shf, src.as_i16x32())) } @@ -14708,9 +14708,9 @@ mod tests { #[simd_test(enable = "avx512bw")] unsafe fn test_mm512_mask_srli_epi16() { let a = _mm512_set1_epi16(1 << 1); - let r = _mm512_mask_srli_epi16(a, 0, a, 2); + let r = _mm512_mask_srli_epi16::<2>(a, 0, a); assert_eq_m512i(r, a); - let r = _mm512_mask_srli_epi16(a, 0b11111111_11111111_11111111_11111111, a, 2); + let r = _mm512_mask_srli_epi16::<2>(a, 0b11111111_11111111_11111111_11111111, a); let e = _mm512_set1_epi16(0); assert_eq_m512i(r, e); } From e7b263692bbe330f0ef27a8ae6eeec684b861813 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Fri, 5 Mar 2021 00:14:10 +0100 Subject: [PATCH 087/123] convert `_mm512_maskz_srli_epi16` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index ccd5d27a3d..8c5e176f7c 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -5360,17 +5360,13 @@ pub unsafe fn _mm512_mask_srli_epi16( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_srli_epi16&expand=5512) #[inline] #[target_feature(enable = "avx512bw")] -#[cfg_attr(test, assert_instr(vpsrlw, imm8 = 5))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_srli_epi16(k: __mmask32, a: __m512i, imm8: i32) -> __m512i { +#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 5))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_srli_epi16(k: __mmask32, a: __m512i) -> __m512i { + static_assert_imm8!(IMM8); //imm8 should be u32, it seems the document to verify is incorrect let a = a.as_i16x32(); - macro_rules! 
call { - ($imm8:expr) => { - vpsrliw(a, $imm8) - }; - } - let shf = constify_imm8_sae!(imm8, call); + let shf = vpsrliw(a, IMM8 as u32); let zero = _mm512_setzero_si512().as_i16x32(); transmute(simd_select_bitmask(k, shf, zero)) } @@ -14718,9 +14714,9 @@ mod tests { #[simd_test(enable = "avx512bw")] unsafe fn test_mm512_maskz_srli_epi16() { let a = _mm512_set1_epi16(1 << 1); - let r = _mm512_maskz_srli_epi16(0, a, 2); + let r = _mm512_maskz_srli_epi16::<2>(0, a); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_srli_epi16(0b11111111_11111111_11111111_11111111, a, 2); + let r = _mm512_maskz_srli_epi16::<2>(0b11111111_11111111_11111111_11111111, a); let e = _mm512_set1_epi16(0); assert_eq_m512i(r, e); } From af23e95974a157989f8076bf22c73d7b1fb4a891 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Fri, 5 Mar 2021 00:16:50 +0100 Subject: [PATCH 088/123] convert `_mm256_mask_srli_epi16` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 8c5e176f7c..9bb15db5e5 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -5376,15 +5376,15 @@ pub unsafe fn _mm512_maskz_srli_epi16(k: __mmask32, a: __m512i) /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_srli_epi16&expand=5508) #[inline] #[target_feature(enable = "avx512bw,avx512vl")] -#[cfg_attr(test, assert_instr(vpsrlw, imm8 = 5))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm256_mask_srli_epi16(src: __m256i, k: __mmask16, a: __m256i, imm8: i32) -> __m256i { - macro_rules! 
call { - ($imm8:expr) => { - _mm256_srli_epi16::<$imm8>(a) - }; - } - let shf = constify_imm8_sae!(imm8, call); +#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 5))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm256_mask_srli_epi16( + src: __m256i, + k: __mmask16, + a: __m256i, +) -> __m256i { + static_assert_imm8!(IMM8); + let shf = _mm256_srli_epi16::(a); transmute(simd_select_bitmask(k, shf.as_i16x16(), src.as_i16x16())) } @@ -14724,9 +14724,9 @@ mod tests { #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm256_mask_srli_epi16() { let a = _mm256_set1_epi16(1 << 1); - let r = _mm256_mask_srli_epi16(a, 0, a, 2); + let r = _mm256_mask_srli_epi16::<2>(a, 0, a); assert_eq_m256i(r, a); - let r = _mm256_mask_srli_epi16(a, 0b11111111_11111111, a, 2); + let r = _mm256_mask_srli_epi16::<2>(a, 0b11111111_11111111, a); let e = _mm256_set1_epi16(0); assert_eq_m256i(r, e); } From 4150a3947e63cb052e3e5570d6f55acbb7ecc8fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Fri, 5 Mar 2021 00:26:51 +0100 Subject: [PATCH 089/123] convert `_mm256_maskz_srli_epi16` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 9bb15db5e5..6a9a29985d 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -5393,15 +5393,11 @@ pub unsafe fn _mm256_mask_srli_epi16( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_srli_epi16&expand=5509) #[inline] #[target_feature(enable = "avx512bw,avx512vl")] -#[cfg_attr(test, assert_instr(vpsrlw, imm8 = 5))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm256_maskz_srli_epi16(k: __mmask16, a: __m256i, imm8: i32) -> __m256i { - macro_rules! 
call { - ($imm8:expr) => { - _mm256_srli_epi16::<$imm8>(a) - }; - } - let shf = constify_imm8_sae!(imm8, call); +#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 5))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm256_maskz_srli_epi16(k: __mmask16, a: __m256i) -> __m256i { + static_assert_imm8!(IMM8); + let shf = _mm256_srli_epi16::(a); let zero = _mm256_setzero_si256().as_i16x16(); transmute(simd_select_bitmask(k, shf.as_i16x16(), zero)) } @@ -14734,9 +14730,9 @@ mod tests { #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm256_maskz_srli_epi16() { let a = _mm256_set1_epi16(1 << 1); - let r = _mm256_maskz_srli_epi16(0, a, 2); + let r = _mm256_maskz_srli_epi16::<2>(0, a); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_srli_epi16(0b11111111_11111111, a, 2); + let r = _mm256_maskz_srli_epi16::<2>(0b11111111_11111111, a); let e = _mm256_set1_epi16(0); assert_eq_m256i(r, e); } From 73aae2a24ff5129e917f97145861cb6952fa723d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Fri, 5 Mar 2021 00:28:27 +0100 Subject: [PATCH 090/123] convert `_mm_mask_srli_epi16` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 6a9a29985d..ba88db340f 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -5407,15 +5407,15 @@ pub unsafe fn _mm256_maskz_srli_epi16(k: __mmask16, a: __m256i) /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_srli_epi16&expand=5505) #[inline] #[target_feature(enable = "avx512bw,avx512vl")] -#[cfg_attr(test, assert_instr(vpsrlw, imm8 = 5))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_mask_srli_epi16(src: __m128i, k: __mmask8, a: __m128i, imm8: i32) -> __m128i { - macro_rules! 
call { - ($imm8:expr) => { - _mm_srli_epi16::<$imm8>(a) - }; - } - let shf = constify_imm8_sae!(imm8, call); +#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 5))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_mask_srli_epi16( + src: __m128i, + k: __mmask8, + a: __m128i, +) -> __m128i { + static_assert_imm8!(IMM8); + let shf = _mm_srli_epi16::(a); transmute(simd_select_bitmask(k, shf.as_i16x8(), src.as_i16x8())) } @@ -14740,9 +14740,9 @@ mod tests { #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm_mask_srli_epi16() { let a = _mm_set1_epi16(1 << 1); - let r = _mm_mask_srli_epi16(a, 0, a, 2); + let r = _mm_mask_srli_epi16::<2>(a, 0, a); assert_eq_m128i(r, a); - let r = _mm_mask_srli_epi16(a, 0b11111111, a, 2); + let r = _mm_mask_srli_epi16::<2>(a, 0b11111111, a); let e = _mm_set1_epi16(0); assert_eq_m128i(r, e); } From 0efea1cf774b5a0ed762007b500cef1d7562c35b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Fri, 5 Mar 2021 00:30:12 +0100 Subject: [PATCH 091/123] convert `_mm_maskz_srli_epi16` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index ba88db340f..fa954a64d4 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -5424,15 +5424,11 @@ pub unsafe fn _mm_mask_srli_epi16( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_srli_epi16&expand=5506) #[inline] #[target_feature(enable = "avx512bw,avx512vl")] -#[cfg_attr(test, assert_instr(vpsrlw, imm8 = 5))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_maskz_srli_epi16(k: __mmask8, a: __m128i, imm8: i32) -> __m128i { - macro_rules! 
call { - ($imm8:expr) => { - _mm_srli_epi16::<$imm8>(a) - }; - } - let shf = constify_imm8_sae!(imm8, call); +#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 5))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_maskz_srli_epi16(k: __mmask8, a: __m128i) -> __m128i { + static_assert_imm8!(IMM8); + let shf = _mm_srli_epi16::(a); let zero = _mm_setzero_si128().as_i16x8(); transmute(simd_select_bitmask(k, shf.as_i16x8(), zero)) } @@ -14750,9 +14746,9 @@ mod tests { #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm_maskz_srli_epi16() { let a = _mm_set1_epi16(1 << 1); - let r = _mm_maskz_srli_epi16(0, a, 2); + let r = _mm_maskz_srli_epi16::<2>(0, a); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_srli_epi16(0b11111111, a, 2); + let r = _mm_maskz_srli_epi16::<2>(0b11111111, a); let e = _mm_set1_epi16(0); assert_eq_m128i(r, e); } From 89687f2c14b89a0efc8a738687225479721eac2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Fri, 5 Mar 2021 00:32:19 +0100 Subject: [PATCH 092/123] convert `_mm512_srai_epi16` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index fa954a64d4..fb9de3db10 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -5641,16 +5641,12 @@ pub unsafe fn _mm_maskz_sra_epi16(k: __mmask8, a: __m128i, count: __m128i) -> __ /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_srai_epi16&expand=5427) #[inline] #[target_feature(enable = "avx512bw")] -#[cfg_attr(test, assert_instr(vpsraw, imm8 = 1))] -#[rustc_args_required_const(1)] -pub unsafe fn _mm512_srai_epi16(a: __m512i, imm8: u32) -> __m512i { +#[cfg_attr(test, assert_instr(vpsraw, IMM8 = 1))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn _mm512_srai_epi16(a: __m512i) -> __m512i { + 
static_assert_imm_u8!(IMM8); let a = a.as_i16x32(); - macro_rules! call { - ($imm8:expr) => { - vpsraiw(a, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = vpsraiw(a, IMM8); transmute(r) } @@ -14924,7 +14920,7 @@ mod tests { #[simd_test(enable = "avx512bw")] unsafe fn test_mm512_srai_epi16() { let a = _mm512_set1_epi16(8); - let r = _mm512_srai_epi16(a, 2); + let r = _mm512_srai_epi16::<2>(a); let e = _mm512_set1_epi16(2); assert_eq_m512i(r, e); } From 054acf5e984499504e07f31cdd4ff0f5e3833a6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Fri, 5 Mar 2021 00:33:59 +0100 Subject: [PATCH 093/123] convert `_mm512_mask_srai_epi16` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index fb9de3db10..f977225cbb 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -5655,16 +5655,16 @@ pub unsafe fn _mm512_srai_epi16(a: __m512i) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_srai_epi16&expand=5425) #[inline] #[target_feature(enable = "avx512bw")] -#[cfg_attr(test, assert_instr(vpsraw, imm8 = 1))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_srai_epi16(src: __m512i, k: __mmask32, a: __m512i, imm8: u32) -> __m512i { +#[cfg_attr(test, assert_instr(vpsraw, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_srai_epi16( + src: __m512i, + k: __mmask32, + a: __m512i, +) -> __m512i { + static_assert_imm_u8!(IMM8); let a = a.as_i16x32(); - macro_rules! 
call { - ($imm8:expr) => { - vpsraiw(a, $imm8) - }; - } - let shf = constify_imm8_sae!(imm8, call); + let shf = vpsraiw(a, IMM8); transmute(simd_select_bitmask(k, shf, src.as_i16x32())) } @@ -14928,9 +14928,9 @@ mod tests { #[simd_test(enable = "avx512bw")] unsafe fn test_mm512_mask_srai_epi16() { let a = _mm512_set1_epi16(8); - let r = _mm512_mask_srai_epi16(a, 0, a, 2); + let r = _mm512_mask_srai_epi16::<2>(a, 0, a); assert_eq_m512i(r, a); - let r = _mm512_mask_srai_epi16(a, 0b11111111_11111111_11111111_11111111, a, 2); + let r = _mm512_mask_srai_epi16::<2>(a, 0b11111111_11111111_11111111_11111111, a); let e = _mm512_set1_epi16(2); assert_eq_m512i(r, e); } From 80294a89db8ef18b9b88745f5219fdae2c14c959 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Fri, 5 Mar 2021 00:35:09 +0100 Subject: [PATCH 094/123] convert `_mm512_maskz_srai_epi16` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index f977225cbb..960cb98e2e 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -5673,16 +5673,12 @@ pub unsafe fn _mm512_mask_srai_epi16( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_srai_epi16&expand=5426) #[inline] #[target_feature(enable = "avx512bw")] -#[cfg_attr(test, assert_instr(vpsraw, imm8 = 1))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_srai_epi16(k: __mmask32, a: __m512i, imm8: u32) -> __m512i { +#[cfg_attr(test, assert_instr(vpsraw, IMM8 = 1))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_srai_epi16(k: __mmask32, a: __m512i) -> __m512i { + static_assert_imm_u8!(IMM8); let a = a.as_i16x32(); - macro_rules! 
call { - ($imm8:expr) => { - vpsraiw(a, $imm8) - }; - } - let shf = constify_imm8_sae!(imm8, call); + let shf = vpsraiw(a, IMM8); let zero = _mm512_setzero_si512().as_i16x32(); transmute(simd_select_bitmask(k, shf, zero)) } @@ -14938,9 +14934,9 @@ mod tests { #[simd_test(enable = "avx512bw")] unsafe fn test_mm512_maskz_srai_epi16() { let a = _mm512_set1_epi16(8); - let r = _mm512_maskz_srai_epi16(0, a, 2); + let r = _mm512_maskz_srai_epi16::<2>(0, a); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_srai_epi16(0b11111111_11111111_11111111_11111111, a, 2); + let r = _mm512_maskz_srai_epi16::<2>(0b11111111_11111111_11111111_11111111, a); let e = _mm512_set1_epi16(2); assert_eq_m512i(r, e); } From 4c0fec9b0e57b4123ae198b3182fb53dd392d572 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Fri, 5 Mar 2021 00:42:07 +0100 Subject: [PATCH 095/123] convert `_mm512_mask_shufflelo_epi16` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 960cb98e2e..3348dff88a 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -7194,20 +7194,15 @@ pub unsafe fn _mm512_shufflelo_epi16(a: __m512i, imm8: i32) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shufflelo_epi16&expand=5219) #[inline] #[target_feature(enable = "avx512bw")] -#[cfg_attr(test, assert_instr(vpshuflw, imm8 = 0))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_shufflelo_epi16( +#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_shufflelo_epi16( src: __m512i, k: __mmask32, a: __m512i, - imm8: i32, ) -> __m512i { - macro_rules! 
call { - ($imm8:expr) => { - _mm512_shufflelo_epi16(a, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + static_assert_imm8!(IMM8); + let r = _mm512_shufflelo_epi16(a, IMM8); transmute(simd_select_bitmask(k, r.as_i16x32(), src.as_i16x32())) } @@ -16332,10 +16327,13 @@ mod tests { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, ); - let r = _mm512_mask_shufflelo_epi16(a, 0, a, 0b00_01_01_11); + let r = _mm512_mask_shufflelo_epi16::<0b00_01_01_11>(a, 0, a); assert_eq_m512i(r, a); - let r = - _mm512_mask_shufflelo_epi16(a, 0b11111111_11111111_11111111_11111111, a, 0b00_01_01_11); + let r = _mm512_mask_shufflelo_epi16::<0b00_01_01_11>( + a, + 0b11111111_11111111_11111111_11111111, + a, + ); #[rustfmt::skip] let e = _mm512_set_epi16( 0, 1, 2, 3, 7, 6, 6, 4, 8, 9, 10, 11, 15, 14, 14, 12, From 61990f6f7fd4de14b7c36c74512e7db365310213 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Fri, 5 Mar 2021 00:43:26 +0100 Subject: [PATCH 096/123] convert `_mm512_maskz_shufflelo_epi16` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 3348dff88a..7ecbc8a9ef 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -7211,15 +7211,11 @@ pub unsafe fn _mm512_mask_shufflelo_epi16( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shufflelo_epi16&expand=5220) #[inline] #[target_feature(enable = "avx512bw")] -#[cfg_attr(test, assert_instr(vpshuflw, imm8 = 0))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_shufflelo_epi16(k: __mmask32, a: __m512i, imm8: i32) -> __m512i { - macro_rules! 
call { - ($imm8:expr) => { - _mm512_shufflelo_epi16(a, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); +#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_shufflelo_epi16(k: __mmask32, a: __m512i) -> __m512i { + static_assert_imm8!(IMM8); + let r = _mm512_shufflelo_epi16(a, IMM8); let zero = _mm512_setzero_si512().as_i16x32(); transmute(simd_select_bitmask(k, r.as_i16x32(), zero)) } @@ -16349,10 +16345,10 @@ mod tests { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, ); - let r = _mm512_maskz_shufflelo_epi16(0, a, 0b00_01_01_11); + let r = _mm512_maskz_shufflelo_epi16::<0b00_01_01_11>(0, a); assert_eq_m512i(r, _mm512_setzero_si512()); let r = - _mm512_maskz_shufflelo_epi16(0b11111111_11111111_11111111_11111111, a, 0b00_01_01_11); + _mm512_maskz_shufflelo_epi16::<0b00_01_01_11>(0b11111111_11111111_11111111_11111111, a); #[rustfmt::skip] let e = _mm512_set_epi16( 0, 1, 2, 3, 7, 6, 6, 4, 8, 9, 10, 11, 15, 14, 14, 12, From 02901748dbc281eb9e833edbfe8c53177c20006e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Fri, 5 Mar 2021 00:44:34 +0100 Subject: [PATCH 097/123] convert `_mm256_mask_shufflelo_epi16` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 7ecbc8a9ef..80290225bd 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -7225,20 +7225,15 @@ pub unsafe fn _mm512_maskz_shufflelo_epi16(k: __mmask32, a: __m /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shufflelo_epi16&expand=5216) #[inline] #[target_feature(enable = "avx512bw,avx512vl")] -#[cfg_attr(test, assert_instr(vpshuflw, imm8 = 5))] -#[rustc_args_required_const(3)] -pub 
unsafe fn _mm256_mask_shufflelo_epi16( +#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 5))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm256_mask_shufflelo_epi16( src: __m256i, k: __mmask16, a: __m256i, - imm8: i32, ) -> __m256i { - macro_rules! call { - ($imm8:expr) => { - _mm256_shufflelo_epi16(a, $imm8) - }; - } - let shuffle = constify_imm8_sae!(imm8, call); + static_assert_imm8!(IMM8); + let shuffle = _mm256_shufflelo_epi16(a, IMM8); transmute(simd_select_bitmask(k, shuffle.as_i16x16(), src.as_i16x16())) } @@ -16360,9 +16355,9 @@ mod tests { #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm256_mask_shufflelo_epi16() { let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm256_mask_shufflelo_epi16(a, 0, a, 0b00_01_01_11); + let r = _mm256_mask_shufflelo_epi16::<0b00_01_01_11>(a, 0, a); assert_eq_m256i(r, a); - let r = _mm256_mask_shufflelo_epi16(a, 0b11111111_11111111, a, 0b00_01_01_11); + let r = _mm256_mask_shufflelo_epi16::<0b00_01_01_11>(a, 0b11111111_11111111, a); let e = _mm256_set_epi16(0, 1, 2, 3, 7, 6, 6, 4, 8, 9, 10, 11, 15, 14, 14, 12); assert_eq_m256i(r, e); } From b7bc560d7e7653b230180d1cc464054dc857b9a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Fri, 5 Mar 2021 00:45:15 +0100 Subject: [PATCH 098/123] convert `_mm256_maskz_shufflelo_epi16` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 80290225bd..716e14da36 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -7242,15 +7242,11 @@ pub unsafe fn _mm256_mask_shufflelo_epi16( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shufflelo_epi16&expand=5217) #[inline] #[target_feature(enable = "avx512bw,avx512vl")] -#[cfg_attr(test, 
assert_instr(vpshuflw, imm8 = 5))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm256_maskz_shufflelo_epi16(k: __mmask16, a: __m256i, imm8: i32) -> __m256i { - macro_rules! call { - ($imm8:expr) => { - _mm256_shufflelo_epi16(a, $imm8) - }; - } - let shuffle = constify_imm8_sae!(imm8, call); +#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 5))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm256_maskz_shufflelo_epi16(k: __mmask16, a: __m256i) -> __m256i { + static_assert_imm8!(IMM8); + let shuffle = _mm256_shufflelo_epi16(a, IMM8); let zero = _mm256_setzero_si256().as_i16x16(); transmute(simd_select_bitmask(k, shuffle.as_i16x16(), zero)) } @@ -16365,9 +16361,9 @@ mod tests { #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm256_maskz_shufflelo_epi16() { let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm256_maskz_shufflelo_epi16(0, a, 0b00_01_01_11); + let r = _mm256_maskz_shufflelo_epi16::<0b00_01_01_11>(0, a); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_shufflelo_epi16(0b11111111_11111111, a, 0b00_01_01_11); + let r = _mm256_maskz_shufflelo_epi16::<0b00_01_01_11>(0b11111111_11111111, a); let e = _mm256_set_epi16(0, 1, 2, 3, 7, 6, 6, 4, 8, 9, 10, 11, 15, 14, 14, 12); assert_eq_m256i(r, e); } From 121b417e1556ab2c812bac07bf4183d49356da35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Fri, 5 Mar 2021 00:46:48 +0100 Subject: [PATCH 099/123] convert `_mm_mask_shufflelo_epi16` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 716e14da36..48647ec156 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -7256,20 +7256,15 @@ pub unsafe fn _mm256_maskz_shufflelo_epi16(k: __mmask16, a: __m /// [Intel's 
documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shufflelo_epi16&expand=5213) #[inline] #[target_feature(enable = "avx512bw,avx512vl")] -#[cfg_attr(test, assert_instr(vpshuflw, imm8 = 5))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_mask_shufflelo_epi16( +#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 5))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_mask_shufflelo_epi16( src: __m128i, k: __mmask8, a: __m128i, - imm8: i32, ) -> __m128i { - macro_rules! call { - ($imm8:expr) => { - _mm_shufflelo_epi16::<$imm8>(a) - }; - } - let shuffle = constify_imm8_sae!(imm8, call); + static_assert_imm8!(IMM8); + let shuffle = _mm_shufflelo_epi16::(a); transmute(simd_select_bitmask(k, shuffle.as_i16x8(), src.as_i16x8())) } @@ -16371,9 +16366,9 @@ mod tests { #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm_mask_shufflelo_epi16() { let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let r = _mm_mask_shufflelo_epi16(a, 0, a, 0b00_01_01_11); + let r = _mm_mask_shufflelo_epi16::<0b00_01_01_11>(a, 0, a); assert_eq_m128i(r, a); - let r = _mm_mask_shufflelo_epi16(a, 0b11111111, a, 0b00_01_01_11); + let r = _mm_mask_shufflelo_epi16::<0b00_01_01_11>(a, 0b11111111, a); let e = _mm_set_epi16(0, 1, 2, 3, 7, 6, 6, 4); assert_eq_m128i(r, e); } From 0fa05ec8f3538d746a4f40e9d58b09a90e9dd9ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Fri, 5 Mar 2021 00:48:07 +0100 Subject: [PATCH 100/123] convert `_mm_maskz_shufflelo_epi16` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 48647ec156..5ae8f1041b 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -7273,15 +7273,11 @@ pub unsafe fn _mm_mask_shufflelo_epi16( /// [Intel's 
documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shufflelo_epi16&expand=5214) #[inline] #[target_feature(enable = "avx512bw,avx512vl")] -#[cfg_attr(test, assert_instr(vpshuflw, imm8 = 5))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_maskz_shufflelo_epi16(k: __mmask8, a: __m128i, imm8: i32) -> __m128i { - macro_rules! call { - ($imm8:expr) => { - _mm_shufflelo_epi16::<$imm8>(a) - }; - } - let shuffle = constify_imm8_sae!(imm8, call); +#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 5))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_maskz_shufflelo_epi16(k: __mmask8, a: __m128i) -> __m128i { + static_assert_imm8!(IMM8); + let shuffle = _mm_shufflelo_epi16::(a); let zero = _mm_setzero_si128().as_i16x8(); transmute(simd_select_bitmask(k, shuffle.as_i16x8(), zero)) } @@ -16376,9 +16372,9 @@ mod tests { #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm_maskz_shufflelo_epi16() { let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let r = _mm_maskz_shufflelo_epi16(0, a, 0b00_01_01_11); + let r = _mm_maskz_shufflelo_epi16::<0b00_01_01_11>(0, a); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_shufflelo_epi16(0b11111111, a, 0b00_01_01_11); + let r = _mm_maskz_shufflelo_epi16::<0b00_01_01_11>(0b11111111, a); let e = _mm_set_epi16(0, 1, 2, 3, 7, 6, 6, 4); assert_eq_m128i(r, e); } From b893ded37a3d0e3d551cbf3af9b7f41891d923dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Fri, 5 Mar 2021 00:49:19 +0100 Subject: [PATCH 101/123] convert `_mm512_mask_shufflehi_epi16` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 5ae8f1041b..7d70e463da 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -7345,20 +7345,15 @@ pub unsafe fn _mm512_shufflehi_epi16(a: __m512i, 
imm8: i32) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shufflehi_epi16&expand=5210) #[inline] #[target_feature(enable = "avx512bw")] -#[cfg_attr(test, assert_instr(vpshufhw, imm8 = 0))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_mask_shufflehi_epi16( +#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 0))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_mask_shufflehi_epi16( src: __m512i, k: __mmask32, a: __m512i, - imm8: i32, ) -> __m512i { - macro_rules! call { - ($imm8:expr) => { - _mm512_shufflehi_epi16(a, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + static_assert_imm8!(IMM8); + let r = _mm512_shufflehi_epi16(a, IMM8); transmute(simd_select_bitmask(k, r.as_i16x32(), src.as_i16x32())) } @@ -16402,10 +16397,13 @@ mod tests { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, ); - let r = _mm512_mask_shufflehi_epi16(a, 0, a, 0b00_01_01_11); + let r = _mm512_mask_shufflehi_epi16::<0b00_01_01_11>(a, 0, a); assert_eq_m512i(r, a); - let r = - _mm512_mask_shufflehi_epi16(a, 0b11111111_11111111_11111111_11111111, a, 0b00_01_01_11); + let r = _mm512_mask_shufflehi_epi16::<0b00_01_01_11>( + a, + 0b11111111_11111111_11111111_11111111, + a, + ); #[rustfmt::skip] let e = _mm512_set_epi16( 3, 2, 2, 0, 4, 5, 6, 7, 11, 10, 10, 8, 12, 13, 14, 15, From 9ccb5d0a70fa8a95b0c1676f9478ef9f29fd6031 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Fri, 5 Mar 2021 00:50:18 +0100 Subject: [PATCH 102/123] convert `_mm512_maskz_shufflehi_epi16` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 7d70e463da..847c6b2e65 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -7362,15 +7362,11 
@@ pub unsafe fn _mm512_mask_shufflehi_epi16( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shufflehi_epi16&expand=5211) #[inline] #[target_feature(enable = "avx512bw")] -#[cfg_attr(test, assert_instr(vpshufhw, imm8 = 0))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_maskz_shufflehi_epi16(k: __mmask32, a: __m512i, imm8: i32) -> __m512i { - macro_rules! call { - ($imm8:expr) => { - _mm512_shufflehi_epi16(a, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); +#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm512_maskz_shufflehi_epi16(k: __mmask32, a: __m512i) -> __m512i { + static_assert_imm8!(IMM8); + let r = _mm512_shufflehi_epi16(a, IMM8); let zero = _mm512_setzero_si512().as_i16x32(); transmute(simd_select_bitmask(k, r.as_i16x32(), zero)) } @@ -16419,10 +16415,10 @@ mod tests { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, ); - let r = _mm512_maskz_shufflehi_epi16(0, a, 0b00_01_01_11); + let r = _mm512_maskz_shufflehi_epi16::<0b00_01_01_11>(0, a); assert_eq_m512i(r, _mm512_setzero_si512()); let r = - _mm512_maskz_shufflehi_epi16(0b11111111_11111111_11111111_11111111, a, 0b00_01_01_11); + _mm512_maskz_shufflehi_epi16::<0b00_01_01_11>(0b11111111_11111111_11111111_11111111, a); #[rustfmt::skip] let e = _mm512_set_epi16( 3, 2, 2, 0, 4, 5, 6, 7, 11, 10, 10, 8, 12, 13, 14, 15, From 492e85844c38452265b8c9dbfb28022b87064111 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Fri, 5 Mar 2021 00:51:41 +0100 Subject: [PATCH 103/123] convert `_mm256_mask_shufflehi_epi16` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 847c6b2e65..fc957f54a9 100644 --- 
a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -7376,20 +7376,15 @@ pub unsafe fn _mm512_maskz_shufflehi_epi16(k: __mmask32, a: __m /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shufflehi_epi16&expand=5207) #[inline] #[target_feature(enable = "avx512bw,avx512vl")] -#[cfg_attr(test, assert_instr(vpshufhw, imm8 = 5))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm256_mask_shufflehi_epi16( +#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 5))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm256_mask_shufflehi_epi16( src: __m256i, k: __mmask16, a: __m256i, - imm8: i32, ) -> __m256i { - macro_rules! call { - ($imm8:expr) => { - _mm256_shufflehi_epi16(a, $imm8) - }; - } - let shuffle = constify_imm8_sae!(imm8, call); + static_assert_imm8!(IMM8); + let shuffle = _mm256_shufflehi_epi16(a, IMM8); transmute(simd_select_bitmask(k, shuffle.as_i16x16(), src.as_i16x16())) } @@ -16430,9 +16425,9 @@ mod tests { #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm256_mask_shufflehi_epi16() { let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm256_mask_shufflehi_epi16(a, 0, a, 0b00_01_01_11); + let r = _mm256_mask_shufflehi_epi16::<0b00_01_01_11>(a, 0, a); assert_eq_m256i(r, a); - let r = _mm256_mask_shufflehi_epi16(a, 0b11111111_11111111, a, 0b00_01_01_11); + let r = _mm256_mask_shufflehi_epi16::<0b00_01_01_11>(a, 0b11111111_11111111, a); let e = _mm256_set_epi16(3, 2, 2, 0, 4, 5, 6, 7, 11, 10, 10, 8, 12, 13, 14, 15); assert_eq_m256i(r, e); } From 1a8977846a1cc1322d52dd859aecfb929cc7d767 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Fri, 5 Mar 2021 00:52:25 +0100 Subject: [PATCH 104/123] convert `_mm256_maskz_shufflehi_epi16` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs 
b/crates/core_arch/src/x86/avx512bw.rs index fc957f54a9..c40a0db161 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -7393,15 +7393,11 @@ pub unsafe fn _mm256_mask_shufflehi_epi16( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shufflehi_epi16&expand=5208) #[inline] #[target_feature(enable = "avx512bw,avx512vl")] -#[cfg_attr(test, assert_instr(vpshufhw, imm8 = 5))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm256_maskz_shufflehi_epi16(k: __mmask16, a: __m256i, imm8: i32) -> __m256i { - macro_rules! call { - ($imm8:expr) => { - _mm256_shufflehi_epi16(a, $imm8) - }; - } - let shuffle = constify_imm8_sae!(imm8, call); +#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 5))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm256_maskz_shufflehi_epi16(k: __mmask16, a: __m256i) -> __m256i { + static_assert_imm8!(IMM8); + let shuffle = _mm256_shufflehi_epi16(a, IMM8); let zero = _mm256_setzero_si256().as_i16x16(); transmute(simd_select_bitmask(k, shuffle.as_i16x16(), zero)) } @@ -16435,9 +16431,9 @@ mod tests { #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm256_maskz_shufflehi_epi16() { let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r = _mm256_maskz_shufflehi_epi16(0, a, 0b00_01_01_11); + let r = _mm256_maskz_shufflehi_epi16::<0b00_01_01_11>(0, a); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_shufflehi_epi16(0b11111111_11111111, a, 0b00_01_01_11); + let r = _mm256_maskz_shufflehi_epi16::<0b00_01_01_11>(0b11111111_11111111, a); let e = _mm256_set_epi16(3, 2, 2, 0, 4, 5, 6, 7, 11, 10, 10, 8, 12, 13, 14, 15); assert_eq_m256i(r, e); } From a6df7821c7b85a69cba3bd60640cf1ccab65e43c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Fri, 5 Mar 2021 00:53:30 +0100 Subject: [PATCH 105/123] convert `_mm_mask_shufflehi_epi16` to const generics --- crates/core_arch/src/x86/avx512bw.rs 
| 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index c40a0db161..d9d04de63f 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -7407,20 +7407,15 @@ pub unsafe fn _mm256_maskz_shufflehi_epi16(k: __mmask16, a: __m /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shufflehi_epi16&expand=5204) #[inline] #[target_feature(enable = "avx512bw,avx512vl")] -#[cfg_attr(test, assert_instr(vpshufhw, imm8 = 5))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_mask_shufflehi_epi16( +#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 5))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm_mask_shufflehi_epi16( src: __m128i, k: __mmask8, a: __m128i, - imm8: i32, ) -> __m128i { - macro_rules! call { - ($imm8:expr) => { - _mm_shufflehi_epi16::<$imm8>(a) - }; - } - let shuffle = constify_imm8_sae!(imm8, call); + static_assert_imm8!(IMM8); + let shuffle = _mm_shufflehi_epi16::(a); transmute(simd_select_bitmask(k, shuffle.as_i16x8(), src.as_i16x8())) } @@ -16441,9 +16436,9 @@ mod tests { #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm_mask_shufflehi_epi16() { let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let r = _mm_mask_shufflehi_epi16(a, 0, a, 0b00_01_01_11); + let r = _mm_mask_shufflehi_epi16::<0b00_01_01_11>(a, 0, a); assert_eq_m128i(r, a); - let r = _mm_mask_shufflehi_epi16(a, 0b11111111, a, 0b00_01_01_11); + let r = _mm_mask_shufflehi_epi16::<0b00_01_01_11>(a, 0b11111111, a); let e = _mm_set_epi16(3, 2, 2, 0, 4, 5, 6, 7); assert_eq_m128i(r, e); } From 7c6bb9a5d6355517b8c451336be09f063a79e2d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Fri, 5 Mar 2021 00:54:29 +0100 Subject: [PATCH 106/123] convert `_mm_maskz_shufflehi_epi16` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 18 +++++++----------- 1 file changed, 7 
insertions(+), 11 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index d9d04de63f..24a63bbcb2 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -7424,15 +7424,11 @@ pub unsafe fn _mm_mask_shufflehi_epi16( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shufflehi_epi16&expand=5205) #[inline] #[target_feature(enable = "avx512bw,avx512vl")] -#[cfg_attr(test, assert_instr(vpshufhw, imm8 = 5))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_maskz_shufflehi_epi16(k: __mmask8, a: __m128i, imm8: i32) -> __m128i { - macro_rules! call { - ($imm8:expr) => { - _mm_shufflehi_epi16::<$imm8>(a) - }; - } - let shuffle = constify_imm8_sae!(imm8, call); +#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 5))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn _mm_maskz_shufflehi_epi16(k: __mmask8, a: __m128i) -> __m128i { + static_assert_imm8!(IMM8); + let shuffle = _mm_shufflehi_epi16::(a); let zero = _mm_setzero_si128().as_i16x8(); transmute(simd_select_bitmask(k, shuffle.as_i16x8(), zero)) } @@ -16446,9 +16442,9 @@ mod tests { #[simd_test(enable = "avx512bw,avx512vl")] unsafe fn test_mm_maskz_shufflehi_epi16() { let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7); - let r = _mm_maskz_shufflehi_epi16(0, a, 0b00_01_01_11); + let r = _mm_maskz_shufflehi_epi16::<0b00_01_01_11>(0, a); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_shufflehi_epi16(0b11111111, a, 0b00_01_01_11); + let r = _mm_maskz_shufflehi_epi16::<0b00_01_01_11>(0b11111111, a); let e = _mm_set_epi16(3, 2, 2, 0, 4, 5, 6, 7); assert_eq_m128i(r, e); } From bee706eb644eab14db5bf6c1b9daf9a1867964bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Fri, 5 Mar 2021 00:56:11 +0100 Subject: [PATCH 107/123] convert `_mm512_dbsad_epu8` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 16 ++++++---------- 1 file changed, 6 insertions(+), 
10 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 24a63bbcb2..fdbde910c7 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -7865,17 +7865,13 @@ pub unsafe fn _mm512_sad_epu8(a: __m512i, b: __m512i) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_dbsad_epu8&expand=2114) #[inline] #[target_feature(enable = "avx512bw")] -#[rustc_args_required_const(2)] -#[cfg_attr(test, assert_instr(vdbpsadbw, imm8 = 0))] -pub unsafe fn _mm512_dbsad_epu8(a: __m512i, b: __m512i, imm8: i32) -> __m512i { +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))] +pub unsafe fn _mm512_dbsad_epu8(a: __m512i, b: __m512i) -> __m512i { + static_assert_imm8!(IMM8); let a = a.as_u8x64(); let b = b.as_u8x64(); - macro_rules! call { - ($imm8:expr) => { - vdbpsadbw(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = vdbpsadbw(a, b, IMM8); transmute(r) } @@ -16870,7 +16866,7 @@ mod tests { unsafe fn test_mm512_dbsad_epu8() { let a = _mm512_set1_epi8(2); let b = _mm512_set1_epi8(4); - let r = _mm512_dbsad_epu8(a, b, 0); + let r = _mm512_dbsad_epu8::<0>(a, b); let e = _mm512_set1_epi16(8); assert_eq_m512i(r, e); } From 3c2225a7d59c66acb9c79c1e510368a6a5634c40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Fri, 5 Mar 2021 00:57:08 +0100 Subject: [PATCH 108/123] convert `_mm512_mask_dbsad_epu8` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index fdbde910c7..e868e00349 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -7880,23 +7880,18 @@ pub unsafe fn _mm512_dbsad_epu8(a: __m512i, b: __m512i) -> __m5 /// [Intel's 
documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_dbsad_epu8&expand=2115) #[inline] #[target_feature(enable = "avx512bw")] -#[rustc_args_required_const(4)] -#[cfg_attr(test, assert_instr(vdbpsadbw, imm8 = 0))] -pub unsafe fn _mm512_mask_dbsad_epu8( +#[rustc_legacy_const_generics(4)] +#[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))] +pub unsafe fn _mm512_mask_dbsad_epu8( src: __m512i, k: __mmask32, a: __m512i, b: __m512i, - imm8: i32, ) -> __m512i { + static_assert_imm8!(IMM8); let a = a.as_u8x64(); let b = b.as_u8x64(); - macro_rules! call { - ($imm8:expr) => { - vdbpsadbw(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = vdbpsadbw(a, b, IMM8); transmute(simd_select_bitmask(k, r, src.as_u16x32())) } @@ -16876,9 +16871,9 @@ mod tests { let src = _mm512_set1_epi16(1); let a = _mm512_set1_epi8(2); let b = _mm512_set1_epi8(4); - let r = _mm512_mask_dbsad_epu8(src, 0, a, b, 0); + let r = _mm512_mask_dbsad_epu8::<0>(src, 0, a, b); assert_eq_m512i(r, src); - let r = _mm512_mask_dbsad_epu8(src, 0b11111111_11111111_11111111_11111111, a, b, 0); + let r = _mm512_mask_dbsad_epu8::<0>(src, 0b11111111_11111111_11111111_11111111, a, b); let e = _mm512_set1_epi16(8); assert_eq_m512i(r, e); } From b431f969a9816365c93112dd3bd2acc6d0294a82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Fri, 5 Mar 2021 00:58:10 +0100 Subject: [PATCH 109/123] convert `_mm512_maskz_dbsad_epu8` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index e868e00349..4247eeb13e 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -7900,17 +7900,17 @@ pub unsafe fn _mm512_mask_dbsad_epu8( /// [Intel's 
documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_dbsad_epu8&expand=2116) #[inline] #[target_feature(enable = "avx512bw")] -#[rustc_args_required_const(3)] -#[cfg_attr(test, assert_instr(vdbpsadbw, imm8 = 0))] -pub unsafe fn _mm512_maskz_dbsad_epu8(k: __mmask32, a: __m512i, b: __m512i, imm8: i32) -> __m512i { +#[rustc_legacy_const_generics(3)] +#[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))] +pub unsafe fn _mm512_maskz_dbsad_epu8( + k: __mmask32, + a: __m512i, + b: __m512i, +) -> __m512i { + static_assert_imm8!(IMM8); let a = a.as_u8x64(); let b = b.as_u8x64(); - macro_rules! call { - ($imm8:expr) => { - vdbpsadbw(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = vdbpsadbw(a, b, IMM8); transmute(simd_select_bitmask( k, r, @@ -16882,9 +16882,9 @@ mod tests { unsafe fn test_mm512_maskz_dbsad_epu8() { let a = _mm512_set1_epi8(2); let b = _mm512_set1_epi8(4); - let r = _mm512_maskz_dbsad_epu8(0, a, b, 0); + let r = _mm512_maskz_dbsad_epu8::<0>(0, a, b); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_dbsad_epu8(0b11111111_11111111_11111111_11111111, a, b, 0); + let r = _mm512_maskz_dbsad_epu8::<0>(0b11111111_11111111_11111111_11111111, a, b); let e = _mm512_set1_epi16(8); assert_eq_m512i(r, e); } From b9629ebbb956ebbbc9ad786cd2a589e6522fa113 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Fri, 5 Mar 2021 00:58:50 +0100 Subject: [PATCH 110/123] convert `_mm256_dbsad_epu8` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 4247eeb13e..49b9864b67 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -7923,17 +7923,13 @@ pub unsafe fn _mm512_maskz_dbsad_epu8( /// [Intel's 
documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dbsad_epu8&expand=2111) #[inline] #[target_feature(enable = "avx512bw,avx512vl")] -#[rustc_args_required_const(2)] -#[cfg_attr(test, assert_instr(vdbpsadbw, imm8 = 0))] -pub unsafe fn _mm256_dbsad_epu8(a: __m256i, b: __m256i, imm8: i32) -> __m256i { +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))] +pub unsafe fn _mm256_dbsad_epu8(a: __m256i, b: __m256i) -> __m256i { + static_assert_imm8!(IMM8); let a = a.as_u8x32(); let b = b.as_u8x32(); - macro_rules! call { - ($imm8:expr) => { - vdbpsadbw256(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = vdbpsadbw256(a, b, IMM8); transmute(r) } @@ -16893,7 +16889,7 @@ mod tests { unsafe fn test_mm256_dbsad_epu8() { let a = _mm256_set1_epi8(2); let b = _mm256_set1_epi8(4); - let r = _mm256_dbsad_epu8(a, b, 0); + let r = _mm256_dbsad_epu8::<0>(a, b); let e = _mm256_set1_epi16(8); assert_eq_m256i(r, e); } From ad1145d4d86bd35d3d543cf40d4fd7b67f810c0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Fri, 5 Mar 2021 00:59:38 +0100 Subject: [PATCH 111/123] convert `_mm256_mask_dbsad_epu8` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 49b9864b67..efcc6fc186 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -7938,23 +7938,18 @@ pub unsafe fn _mm256_dbsad_epu8(a: __m256i, b: __m256i) -> __m2 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_dbsad_epu8&expand=2112) #[inline] #[target_feature(enable = "avx512bw,avx512vl")] -#[rustc_args_required_const(4)] -#[cfg_attr(test, assert_instr(vdbpsadbw, imm8 = 0))] -pub unsafe fn _mm256_mask_dbsad_epu8( +#[rustc_legacy_const_generics(4)] 
+#[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))] +pub unsafe fn _mm256_mask_dbsad_epu8( src: __m256i, k: __mmask16, a: __m256i, b: __m256i, - imm8: i32, ) -> __m256i { + static_assert_imm8!(IMM8); let a = a.as_u8x32(); let b = b.as_u8x32(); - macro_rules! call { - ($imm8:expr) => { - vdbpsadbw256(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = vdbpsadbw256(a, b, IMM8); transmute(simd_select_bitmask(k, r, src.as_u16x16())) } @@ -16899,9 +16894,9 @@ mod tests { let src = _mm256_set1_epi16(1); let a = _mm256_set1_epi8(2); let b = _mm256_set1_epi8(4); - let r = _mm256_mask_dbsad_epu8(src, 0, a, b, 0); + let r = _mm256_mask_dbsad_epu8::<0>(src, 0, a, b); assert_eq_m256i(r, src); - let r = _mm256_mask_dbsad_epu8(src, 0b11111111_11111111, a, b, 0); + let r = _mm256_mask_dbsad_epu8::<0>(src, 0b11111111_11111111, a, b); let e = _mm256_set1_epi16(8); assert_eq_m256i(r, e); } From 7095318f845394cbf9c0f98fc06508d483d2a55e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Fri, 5 Mar 2021 01:00:10 +0100 Subject: [PATCH 112/123] convert `_mm256_maskz_dbsad_epu8` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index efcc6fc186..a8cf3db3ff 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -7958,17 +7958,13 @@ pub unsafe fn _mm256_mask_dbsad_epu8( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_dbsad_epu8&expand=2113) #[inline] #[target_feature(enable = "avx512bw,avx512vl")] -#[rustc_args_required_const(3)] -#[cfg_attr(test, assert_instr(vdbpsadbw, imm8 = 0))] -pub unsafe fn _mm256_maskz_dbsad_epu8(k: __mmask16, a: __m256i, b: __m256i, imm8: i32) -> __m256i { +#[rustc_legacy_const_generics(3)] +#[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))] +pub unsafe fn 
_mm256_maskz_dbsad_epu8(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + static_assert_imm8!(IMM8); let a = a.as_u8x32(); let b = b.as_u8x32(); - macro_rules! call { - ($imm8:expr) => { - vdbpsadbw256(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = vdbpsadbw256(a, b, IMM8); transmute(simd_select_bitmask( k, r, @@ -16905,9 +16901,9 @@ mod tests { unsafe fn test_mm256_maskz_dbsad_epu8() { let a = _mm256_set1_epi8(2); let b = _mm256_set1_epi8(4); - let r = _mm256_maskz_dbsad_epu8(0, a, b, 0); + let r = _mm256_maskz_dbsad_epu8::<0>(0, a, b); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_dbsad_epu8(0b11111111_11111111, a, b, 0); + let r = _mm256_maskz_dbsad_epu8::<0>(0b11111111_11111111, a, b); let e = _mm256_set1_epi16(8); assert_eq_m256i(r, e); } From 75bc3ceb94b18cfd3b54c3285d63444f1a215df3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Fri, 5 Mar 2021 01:00:51 +0100 Subject: [PATCH 113/123] convert `_mm_dbsad_epu8` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index a8cf3db3ff..b95c73fff7 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -7960,7 +7960,11 @@ pub unsafe fn _mm256_mask_dbsad_epu8( #[target_feature(enable = "avx512bw,avx512vl")] #[rustc_legacy_const_generics(3)] #[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))] -pub unsafe fn _mm256_maskz_dbsad_epu8(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { +pub unsafe fn _mm256_maskz_dbsad_epu8( + k: __mmask16, + a: __m256i, + b: __m256i, +) -> __m256i { static_assert_imm8!(IMM8); let a = a.as_u8x32(); let b = b.as_u8x32(); @@ -7977,17 +7981,13 @@ pub unsafe fn _mm256_maskz_dbsad_epu8(k: __mmask16, a: __m256i, /// [Intel's 
documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dbsad_epu8&expand=2108) #[inline] #[target_feature(enable = "avx512bw,avx512vl")] -#[rustc_args_required_const(2)] -#[cfg_attr(test, assert_instr(vdbpsadbw, imm8 = 0))] -pub unsafe fn _mm_dbsad_epu8(a: __m128i, b: __m128i, imm8: i32) -> __m128i { +#[rustc_legacy_const_generics(2)] +#[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))] +pub unsafe fn _mm_dbsad_epu8(a: __m128i, b: __m128i) -> __m128i { + static_assert_imm8!(IMM8); let a = a.as_u8x16(); let b = b.as_u8x16(); - macro_rules! call { - ($imm8:expr) => { - vdbpsadbw128(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = vdbpsadbw128(a, b, IMM8); transmute(r) } @@ -16912,7 +16912,7 @@ mod tests { unsafe fn test_mm_dbsad_epu8() { let a = _mm_set1_epi8(2); let b = _mm_set1_epi8(4); - let r = _mm_dbsad_epu8(a, b, 0); + let r = _mm_dbsad_epu8::<0>(a, b); let e = _mm_set1_epi16(8); assert_eq_m128i(r, e); } From c9f5a2ed35ff2f84524548d78491100d6e582158 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Fri, 5 Mar 2021 01:01:56 +0100 Subject: [PATCH 114/123] convert `_mm_mask_dbsad_epu8` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index b95c73fff7..034cdb8745 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -7996,23 +7996,18 @@ pub unsafe fn _mm_dbsad_epu8(a: __m128i, b: __m128i) -> __m128i /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_dbsad_epu8&expand=2109) #[inline] #[target_feature(enable = "avx512bw,avx512vl")] -#[rustc_args_required_const(4)] -#[cfg_attr(test, assert_instr(vdbpsadbw, imm8 = 0))] -pub unsafe fn _mm_mask_dbsad_epu8( +#[rustc_legacy_const_generics(4)] +#[cfg_attr(test, assert_instr(vdbpsadbw, 
IMM8 = 0))] +pub unsafe fn _mm_mask_dbsad_epu8( src: __m128i, k: __mmask8, a: __m128i, b: __m128i, - imm8: i32, ) -> __m128i { + static_assert_imm8!(IMM8); let a = a.as_u8x16(); let b = b.as_u8x16(); - macro_rules! call { - ($imm8:expr) => { - vdbpsadbw128(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = vdbpsadbw128(a, b, IMM8); transmute(simd_select_bitmask(k, r, src.as_u16x8())) } @@ -16922,9 +16917,9 @@ mod tests { let src = _mm_set1_epi16(1); let a = _mm_set1_epi8(2); let b = _mm_set1_epi8(4); - let r = _mm_mask_dbsad_epu8(src, 0, a, b, 0); + let r = _mm_mask_dbsad_epu8::<0>(src, 0, a, b); assert_eq_m128i(r, src); - let r = _mm_mask_dbsad_epu8(src, 0b11111111, a, b, 0); + let r = _mm_mask_dbsad_epu8::<0>(src, 0b11111111, a, b); let e = _mm_set1_epi16(8); assert_eq_m128i(r, e); } From 79fd47b740ce4de5240c93b71938c1421f062035 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Fri, 5 Mar 2021 01:02:34 +0100 Subject: [PATCH 115/123] convert `_mm_maskz_dbsad_epu8` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 034cdb8745..d0dee28c95 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -8016,17 +8016,17 @@ pub unsafe fn _mm_mask_dbsad_epu8( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_dbsad_epu8&expand=2110) #[inline] #[target_feature(enable = "avx512bw,avx512vl")] -#[rustc_args_required_const(3)] -#[cfg_attr(test, assert_instr(vdbpsadbw, imm8 = 0))] -pub unsafe fn _mm_maskz_dbsad_epu8(k: __mmask8, a: __m128i, b: __m128i, imm8: i32) -> __m128i { +#[rustc_legacy_const_generics(3)] +#[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))] +pub unsafe fn _mm_maskz_dbsad_epu8( + k: __mmask8, + a: __m128i, + b: __m128i, +) -> __m128i { + 
static_assert_imm8!(IMM8); let a = a.as_u8x16(); let b = b.as_u8x16(); - macro_rules! call { - ($imm8:expr) => { - vdbpsadbw128(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + let r = vdbpsadbw128(a, b, IMM8); transmute(simd_select_bitmask(k, r, _mm_setzero_si128().as_u16x8())) } @@ -16928,9 +16928,9 @@ mod tests { unsafe fn test_mm_maskz_dbsad_epu8() { let a = _mm_set1_epi8(2); let b = _mm_set1_epi8(4); - let r = _mm_maskz_dbsad_epu8(0, a, b, 0); + let r = _mm_maskz_dbsad_epu8::<0>(0, a, b); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_dbsad_epu8(0b11111111, a, b, 0); + let r = _mm_maskz_dbsad_epu8::<0>(0b11111111, a, b); let e = _mm_set1_epi16(8); assert_eq_m128i(r, e); } From 7c1c2a8fa859904e56790397023adea2d80f0044 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Fri, 5 Mar 2021 01:04:24 +0100 Subject: [PATCH 116/123] convert `_mm512_mask_alignr_epi8` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index d0dee28c95..0b7e564eb4 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -9056,21 +9056,16 @@ pub unsafe fn _mm512_alignr_epi8(a: __m512i, b: __m512i, imm8: i32) -> __m512i { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_alignr_epi8&expand=264) #[inline] #[target_feature(enable = "avx512bw")] -#[cfg_attr(test, assert_instr(vpalignr, imm8 = 1))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm512_mask_alignr_epi8( +#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 1))] +#[rustc_legacy_const_generics(4)] +pub unsafe fn _mm512_mask_alignr_epi8( src: __m512i, k: __mmask64, a: __m512i, b: __m512i, - imm8: i32, ) -> __m512i { - macro_rules! 
call { - ($imm8:expr) => { - _mm512_alignr_epi8(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + static_assert_imm8!(IMM8); + let r = _mm512_alignr_epi8(a, b, IMM8); transmute(simd_select_bitmask(k, r.as_i8x64(), src.as_i8x64())) } @@ -17705,14 +17700,13 @@ mod tests { 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, ); let b = _mm512_set1_epi8(1); - let r = _mm512_mask_alignr_epi8(a, 0, a, b, 14); + let r = _mm512_mask_alignr_epi8::<14>(a, 0, a, b); assert_eq_m512i(r, a); - let r = _mm512_mask_alignr_epi8( + let r = _mm512_mask_alignr_epi8::<14>( a, 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, a, b, - 14, ); #[rustfmt::skip] let e = _mm512_set_epi8( From 7ae78308ffd6e4f72bfbd65c2c685d969b510bfc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Fri, 5 Mar 2021 01:06:07 +0100 Subject: [PATCH 117/123] convert `_mm512_maskz_alignr_epi8` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 0b7e564eb4..6438b7196b 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -9074,15 +9074,15 @@ pub unsafe fn _mm512_mask_alignr_epi8( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_alignr_epi8&expand=265) #[inline] #[target_feature(enable = "avx512bw")] -#[cfg_attr(test, assert_instr(vpalignr, imm8 = 1))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm512_maskz_alignr_epi8(k: __mmask64, a: __m512i, b: __m512i, imm8: i32) -> __m512i { - macro_rules! 
call { - ($imm8:expr) => { - _mm512_alignr_epi8(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); +#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn _mm512_maskz_alignr_epi8( + k: __mmask64, + a: __m512i, + b: __m512i, +) -> __m512i { + static_assert_imm8!(IMM8); + let r = _mm512_alignr_epi8(a, b, IMM8); let zero = _mm512_setzero_si512().as_i8x64(); transmute(simd_select_bitmask(k, r.as_i8x64(), zero)) } @@ -17728,13 +17728,12 @@ mod tests { 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, ); let b = _mm512_set1_epi8(1); - let r = _mm512_maskz_alignr_epi8(0, a, b, 14); + let r = _mm512_maskz_alignr_epi8::<14>(0, a, b); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_alignr_epi8( + let r = _mm512_maskz_alignr_epi8::<14>( 0b11111111_11111111_11111111_11111111_11111111_11111111_11111111_11111111, a, b, - 14, ); #[rustfmt::skip] let e = _mm512_set_epi8( From 4773594ebfe9a861c76f583fd32547b04eb9edef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Fri, 5 Mar 2021 01:07:00 +0100 Subject: [PATCH 118/123] convert `_mm256_mask_alignr_epi8` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 6438b7196b..82c138abf2 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -9092,21 +9092,16 @@ pub unsafe fn _mm512_maskz_alignr_epi8( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_alignr_epi8&expand=261) #[inline] #[target_feature(enable = "avx512bw,avx512vl")] -#[rustc_args_required_const(4)] -#[cfg_attr(test, assert_instr(vpalignr, imm8 = 5))] -pub unsafe fn _mm256_mask_alignr_epi8( +#[rustc_legacy_const_generics(4)] +#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 5))] +pub unsafe fn _mm256_mask_alignr_epi8( 
src: __m256i, k: __mmask32, a: __m256i, b: __m256i, - imm8: i32, ) -> __m256i { - macro_rules! call { - ($imm8:expr) => { - _mm256_alignr_epi8(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + static_assert_imm8!(IMM8); + let r = _mm256_alignr_epi8(a, b, IMM8); transmute(simd_select_bitmask(k, r.as_i8x32(), src.as_i8x32())) } @@ -17753,9 +17748,9 @@ mod tests { 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, ); let b = _mm256_set1_epi8(1); - let r = _mm256_mask_alignr_epi8(a, 0, a, b, 14); + let r = _mm256_mask_alignr_epi8::<14>(a, 0, a, b); assert_eq_m256i(r, a); - let r = _mm256_mask_alignr_epi8(a, 0b11111111_11111111_11111111_11111111, a, b, 14); + let r = _mm256_mask_alignr_epi8::<14>(a, 0b11111111_11111111_11111111_11111111, a, b); #[rustfmt::skip] let e = _mm256_set_epi8( 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, From 8cee2fb8c675235a59f397286de256a6de5da11d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Fri, 5 Mar 2021 01:08:32 +0100 Subject: [PATCH 119/123] convert `_mm256_maskz_alignr_epi8` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index 82c138abf2..ccf30f9d96 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -9110,15 +9110,15 @@ pub unsafe fn _mm256_mask_alignr_epi8( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_alignr_epi8&expand=262) #[inline] #[target_feature(enable = "avx512bw,avx512vl")] -#[rustc_args_required_const(3)] -#[cfg_attr(test, assert_instr(vpalignr, imm8 = 5))] -pub unsafe fn _mm256_maskz_alignr_epi8(k: __mmask32, a: __m256i, b: __m256i, imm8: i32) -> __m256i { - macro_rules! 
call { - ($imm8:expr) => { - _mm256_alignr_epi8(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); +#[rustc_legacy_const_generics(3)] +#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 5))] +pub unsafe fn _mm256_maskz_alignr_epi8( + k: __mmask32, + a: __m256i, + b: __m256i, +) -> __m256i { + static_assert_imm8!(IMM8); + let r = _mm256_alignr_epi8(a, b, IMM8); transmute(simd_select_bitmask( k, r.as_i8x32(), @@ -17767,9 +17767,9 @@ mod tests { 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, ); let b = _mm256_set1_epi8(1); - let r = _mm256_maskz_alignr_epi8(0, a, b, 14); + let r = _mm256_maskz_alignr_epi8::<14>(0, a, b); assert_eq_m256i(r, _mm256_setzero_si256()); - let r = _mm256_maskz_alignr_epi8(0b11111111_11111111_11111111_11111111, a, b, 14); + let r = _mm256_maskz_alignr_epi8::<14>(0b11111111_11111111_11111111_11111111, a, b); #[rustfmt::skip] let e = _mm256_set_epi8( 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, From 6ba71df63923dff527fb54da7c8ff74e07e2cb6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Fri, 5 Mar 2021 01:09:23 +0100 Subject: [PATCH 120/123] convert `_mm_mask_alignr_epi8` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index ccf30f9d96..a98c445f38 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -9131,21 +9131,16 @@ pub unsafe fn _mm256_maskz_alignr_epi8( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_alignr_epi8&expand=258) #[inline] #[target_feature(enable = "avx512bw,avx512vl")] -#[rustc_args_required_const(4)] -#[cfg_attr(test, assert_instr(vpalignr, imm8 = 5))] -pub unsafe fn _mm_mask_alignr_epi8( +#[rustc_legacy_const_generics(4)] +#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 5))] +pub unsafe fn _mm_mask_alignr_epi8( src: __m128i, 
k: __mmask16, a: __m128i, b: __m128i, - imm8: i32, ) -> __m128i { - macro_rules! call { - ($imm8:expr) => { - _mm_alignr_epi8(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); + static_assert_imm8!(IMM8); + let r = _mm_alignr_epi8(a, b, IMM8); transmute(simd_select_bitmask(k, r.as_i8x16(), src.as_i8x16())) } @@ -17782,9 +17777,9 @@ mod tests { unsafe fn test_mm_mask_alignr_epi8() { let a = _mm_set_epi8(1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0); let b = _mm_set1_epi8(1); - let r = _mm_mask_alignr_epi8(a, 0, a, b, 14); + let r = _mm_mask_alignr_epi8::<14>(a, 0, a, b); assert_eq_m128i(r, a); - let r = _mm_mask_alignr_epi8(a, 0b11111111_11111111, a, b, 14); + let r = _mm_mask_alignr_epi8::<14>(a, 0b11111111_11111111, a, b); let e = _mm_set_epi8(0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1); assert_eq_m128i(r, e); } From 310211182009e6135519072d9ee110a29dd6d80f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Fri, 5 Mar 2021 01:10:25 +0100 Subject: [PATCH 121/123] convert `_mm_maskz_alignr_epi8` to const generics --- crates/core_arch/src/x86/avx512bw.rs | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index a98c445f38..2128a828ff 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -9149,15 +9149,15 @@ pub unsafe fn _mm_mask_alignr_epi8( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_alignr_epi8&expand=259) #[inline] #[target_feature(enable = "avx512bw,avx512vl")] -#[rustc_args_required_const(3)] -#[cfg_attr(test, assert_instr(vpalignr, imm8 = 5))] -pub unsafe fn _mm_maskz_alignr_epi8(k: __mmask16, a: __m128i, b: __m128i, imm8: i32) -> __m128i { - macro_rules! 
call { - ($imm8:expr) => { - _mm_alignr_epi8(a, b, $imm8) - }; - } - let r = constify_imm8_sae!(imm8, call); +#[rustc_legacy_const_generics(3)] +#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 5))] +pub unsafe fn _mm_maskz_alignr_epi8( + k: __mmask16, + a: __m128i, + b: __m128i, +) -> __m128i { + static_assert_imm8!(IMM8); + let r = _mm_alignr_epi8(a, b, IMM8); let zero = _mm_setzero_si128().as_i8x16(); transmute(simd_select_bitmask(k, r.as_i8x16(), zero)) } @@ -17788,9 +17788,9 @@ mod tests { unsafe fn test_mm_maskz_alignr_epi8() { let a = _mm_set_epi8(1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0); let b = _mm_set1_epi8(1); - let r = _mm_maskz_alignr_epi8(0, a, b, 14); + let r = _mm_maskz_alignr_epi8::<14>(0, a, b); assert_eq_m128i(r, _mm_setzero_si128()); - let r = _mm_maskz_alignr_epi8(0b11111111_11111111, a, b, 14); + let r = _mm_maskz_alignr_epi8::<14>(0b11111111_11111111, a, b); let e = _mm_set_epi8(0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1); assert_eq_m128i(r, e); } From 47c78f7dbc7ce8b92aa164787894f2e15c362af4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Thu, 4 Mar 2021 21:44:53 +0100 Subject: [PATCH 122/123] convert `_xabort` to const generics --- crates/core_arch/src/x86/rtm.rs | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/crates/core_arch/src/x86/rtm.rs b/crates/core_arch/src/x86/rtm.rs index 7cb1cc09bd..dab73cde9e 100644 --- a/crates/core_arch/src/x86/rtm.rs +++ b/crates/core_arch/src/x86/rtm.rs @@ -76,15 +76,11 @@ pub unsafe fn _xend() { /// [Intel's documentation](https://software.intel.com/en-us/cpp-compiler-developer-guide-and-reference-xabort). #[inline] #[target_feature(enable = "rtm")] -#[cfg_attr(test, assert_instr(xabort, imm8 = 0x0))] -#[rustc_args_required_const(0)] -pub unsafe fn _xabort(imm8: u32) { - macro_rules! 
call { - ($imm8:expr) => { - x86_xabort($imm8) - }; - } - constify_imm8!(imm8, call) +#[cfg_attr(test, assert_instr(xabort, IMM8 = 0x0))] +#[rustc_legacy_const_generics(0)] +pub unsafe fn _xabort() { + static_assert_imm_u8!(IMM8); + x86_xabort(IMM8 as i8) } /// Queries whether the processor is executing in a transactional region identified by restricted @@ -130,14 +126,14 @@ mod tests { unsafe fn test_xabort() { const ABORT_CODE: u32 = 42; // aborting outside a transactional region does nothing - _xabort(ABORT_CODE); + _xabort::(); for _ in 0..10 { let mut x = 0; let code = rtm::_xbegin(); if code == _XBEGIN_STARTED { x += 1; - rtm::_xabort(ABORT_CODE); + rtm::_xabort::(); } else if code & _XABORT_EXPLICIT != 0 { let test_abort_code = rtm::_xabort_code(code); assert_eq!(test_abort_code, ABORT_CODE); From cf0c158e15e0b5ef619be41e303d09d3c0a70f1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20Rakic?= Date: Fri, 5 Mar 2021 03:56:21 +0100 Subject: [PATCH 123/123] temporarily disable WASM CI The LLVM12 upgrade in rustc may be causing issues --- .github/workflows/main.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index c7cec5a858..615a121b7a 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -77,7 +77,7 @@ jobs: - mips64-unknown-linux-gnuabi64 - mips64el-unknown-linux-gnuabi64 - s390x-unknown-linux-gnu - - wasm32-wasi + # - wasm32-wasi - i586-unknown-linux-gnu - x86_64-linux-android - arm-linux-androideabi @@ -131,8 +131,8 @@ jobs: disable_assert_instr: true - target: s390x-unknown-linux-gnu os: ubuntu-latest - - target: wasm32-wasi - os: ubuntu-latest + # - target: wasm32-wasi + # os: ubuntu-latest - target: aarch64-apple-darwin os: macos-latest norun: true