diff --git a/crates/core_arch/src/x86/avx2.rs b/crates/core_arch/src/x86/avx2.rs
index 91c10638e0..3e1530bbb2 100644
--- a/crates/core_arch/src/x86/avx2.rs
+++ b/crates/core_arch/src/x86/avx2.rs
@@ -2778,7 +2778,12 @@ pub fn _mm256_bslli_epi128(a: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpsllvd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i {
-    unsafe { transmute(psllvd(a.as_i32x4(), count.as_i32x4())) }
+    unsafe {
+        let count = count.as_u32x4();
+        let good: u32x4 = simd_lt(count, u32x4::splat(32));
+        let count = simd_select(good, count, u32x4::ZERO);
+        simd_select(good, simd_shl(a.as_u32x4(), count), u32x4::ZERO).as_m128i()
+    }
 }

 /// Shifts packed 32-bit integers in `a` left by the amount
@@ -2791,7 +2796,12 @@ pub fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(vpsllvd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i {
-    unsafe { transmute(psllvd256(a.as_i32x8(), count.as_i32x8())) }
+    unsafe {
+        let count = count.as_u32x8();
+        let good: u32x8 = simd_lt(count, u32x8::splat(32));
+        let count = simd_select(good, count, u32x8::ZERO);
+        simd_select(good, simd_shl(a.as_u32x8(), count), u32x8::ZERO).as_m256i()
+    }
 }

 /// Shifts packed 64-bit integers in `a` left by the amount
@@ -2804,7 +2814,12 @@ pub fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpsllvq))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i {
-    unsafe { transmute(psllvq(a.as_i64x2(), count.as_i64x2())) }
+    unsafe {
+        let count = count.as_u64x2();
+        let good: u64x2 = simd_lt(count, u64x2::splat(64));
+        let count = simd_select(good, count, u64x2::ZERO);
+        simd_select(good, simd_shl(a.as_u64x2(), count), u64x2::ZERO).as_m128i()
+    }
 }

 /// Shifts packed 64-bit integers in `a` left by the amount
@@ -2817,7 +2832,12 @@ pub fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(vpsllvq))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i {
-    unsafe { transmute(psllvq256(a.as_i64x4(), count.as_i64x4())) }
+    unsafe {
+        let count = count.as_u64x4();
+        let good: u64x4 = simd_lt(count, u64x4::splat(64));
+        let count = simd_select(good, count, u64x4::ZERO);
+        simd_select(good, simd_shl(a.as_u64x4(), count), u64x4::ZERO).as_m256i()
+    }
 }

 /// Shifts packed 16-bit integers in `a` right by `count` while
@@ -2881,7 +2901,12 @@ pub fn _mm256_srai_epi32(a: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpsravd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i {
-    unsafe { transmute(psravd(a.as_i32x4(), count.as_i32x4())) }
+    unsafe {
+        let count = count.as_u32x4();
+        let good: u32x4 = simd_lt(count, u32x4::splat(32));
+        let count = simd_select(good, transmute(count), i32x4::splat(31));
+        simd_shr(a.as_i32x4(), count).as_m128i()
+    }
 }

 /// Shifts packed 32-bit integers in `a` right by the amount specified by the
@@ -2893,7 +2918,12 @@ pub fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(vpsravd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i {
-    unsafe { transmute(psravd256(a.as_i32x8(), count.as_i32x8())) }
+    unsafe {
+        let count = count.as_u32x8();
+        let good: u32x8 = simd_lt(count, u32x8::splat(32));
+        let count = simd_select(good, transmute(count), i32x8::splat(31));
+        simd_shr(a.as_i32x8(), count).as_m256i()
+    }
 }

 /// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros.
@@ -3076,7 +3106,12 @@ pub fn _mm256_srli_epi64(a: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpsrlvd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i {
-    unsafe { transmute(psrlvd(a.as_i32x4(), count.as_i32x4())) }
+    unsafe {
+        let count = count.as_u32x4();
+        let good: u32x4 = simd_lt(count, u32x4::splat(32));
+        let count = simd_select(good, count, u32x4::ZERO);
+        simd_select(good, simd_shr(a.as_u32x4(), count), u32x4::ZERO).as_m128i()
+    }
 }

 /// Shifts packed 32-bit integers in `a` right by the amount specified by
@@ -3088,7 +3123,12 @@ pub fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(vpsrlvd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i {
-    unsafe { transmute(psrlvd256(a.as_i32x8(), count.as_i32x8())) }
+    unsafe {
+        let count = count.as_u32x8();
+        let good: u32x8 = simd_lt(count, u32x8::splat(32));
+        let count = simd_select(good, count, u32x8::ZERO);
+        simd_select(good, simd_shr(a.as_u32x8(), count), u32x8::ZERO).as_m256i()
+    }
 }

 /// Shifts packed 64-bit integers in `a` right by the amount specified by
@@ -3100,7 +3140,12 @@ pub fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpsrlvq))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i {
-    unsafe { transmute(psrlvq(a.as_i64x2(), count.as_i64x2())) }
+    unsafe {
+        let count = count.as_u64x2();
+        let good: u64x2 = simd_lt(count, u64x2::splat(64));
+        let count = simd_select(good, count, u64x2::ZERO);
+        simd_select(good, simd_shr(a.as_u64x2(), count), u64x2::ZERO).as_m128i()
+    }
 }

 /// Shifts packed 64-bit integers in `a` right by the amount specified by
@@ -3112,7 +3157,12 @@ pub fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(vpsrlvq))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i {
-    unsafe { transmute(psrlvq256(a.as_i64x4(), count.as_i64x4())) }
+    unsafe {
+        let count = count.as_u64x4();
+        let good: u64x4 = simd_lt(count, u64x4::splat(64));
+        let count = simd_select(good, count, u64x4::ZERO);
+        simd_select(good, simd_shr(a.as_u64x4(), count), u64x4::ZERO).as_m256i()
+    }
 }

 /// Load 256-bits of integer data from memory into dst using a non-temporal memory hint. mem_addr
@@ -3687,36 +3737,16 @@ unsafe extern "C" {
     fn pslld(a: i32x8, count: i32x4) -> i32x8;
     #[link_name = "llvm.x86.avx2.psll.q"]
     fn psllq(a: i64x4, count: i64x2) -> i64x4;
-    #[link_name = "llvm.x86.avx2.psllv.d"]
-    fn psllvd(a: i32x4, count: i32x4) -> i32x4;
-    #[link_name = "llvm.x86.avx2.psllv.d.256"]
-    fn psllvd256(a: i32x8, count: i32x8) -> i32x8;
-    #[link_name = "llvm.x86.avx2.psllv.q"]
-    fn psllvq(a: i64x2, count: i64x2) -> i64x2;
-    #[link_name = "llvm.x86.avx2.psllv.q.256"]
-    fn psllvq256(a: i64x4, count: i64x4) -> i64x4;
     #[link_name = "llvm.x86.avx2.psra.w"]
     fn psraw(a: i16x16, count: i16x8) -> i16x16;
     #[link_name = "llvm.x86.avx2.psra.d"]
     fn psrad(a: i32x8, count: i32x4) -> i32x8;
-    #[link_name = "llvm.x86.avx2.psrav.d"]
-    fn psravd(a: i32x4, count: i32x4) -> i32x4;
-    #[link_name = "llvm.x86.avx2.psrav.d.256"]
-    fn psravd256(a: i32x8, count: i32x8) -> i32x8;
     #[link_name = "llvm.x86.avx2.psrl.w"]
     fn psrlw(a: i16x16, count: i16x8) -> i16x16;
     #[link_name = "llvm.x86.avx2.psrl.d"]
     fn psrld(a: i32x8, count: i32x4) -> i32x8;
     #[link_name = "llvm.x86.avx2.psrl.q"]
     fn psrlq(a: i64x4, count: i64x2) -> i64x4;
-    #[link_name = "llvm.x86.avx2.psrlv.d"]
-    fn psrlvd(a: i32x4, count: i32x4) -> i32x4;
-    #[link_name = "llvm.x86.avx2.psrlv.d.256"]
-    fn psrlvd256(a: i32x8, count: i32x8) -> i32x8;
-    #[link_name = "llvm.x86.avx2.psrlv.q"]
-    fn psrlvq(a: i64x2, count: i64x2) -> i64x2;
-    #[link_name = "llvm.x86.avx2.psrlv.q.256"]
-    fn psrlvq256(a: i64x4, count: i64x4) -> i64x4;
     #[link_name = "llvm.x86.avx2.pshuf.b"]
     fn pshufb(a: u8x32, b: u8x32) -> u8x32;
     #[link_name = "llvm.x86.avx2.permd"]
diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs
index fadc0e2cc0..ee05da039f 100644
--- a/crates/core_arch/src/x86/avx512bw.rs
+++ b/crates/core_arch/src/x86/avx512bw.rs
@@ -6852,7 +6852,12 @@ pub fn _mm_maskz_slli_epi16(k: __mmask8, a: __m128i) -> __m128i
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpsllvw))]
 pub fn _mm512_sllv_epi16(a: __m512i, count: __m512i) -> __m512i {
-    unsafe { transmute(vpsllvw(a.as_i16x32(), count.as_i16x32())) }
+    unsafe {
+        let count = count.as_u16x32();
+        let good: u16x32 = simd_lt(count, u16x32::splat(16));
+        let count = simd_select(good, count, u16x32::ZERO);
+        simd_select(good, simd_shl(a.as_u16x32(), count), u16x32::ZERO).as_m512i()
+    }
 }

 /// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -6891,7 +6896,12 @@ pub fn _mm512_maskz_sllv_epi16(k: __mmask32, a: __m512i, count: __m512i) -> __m5
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpsllvw))]
 pub fn _mm256_sllv_epi16(a: __m256i, count: __m256i) -> __m256i {
-    unsafe { transmute(vpsllvw256(a.as_i16x16(), count.as_i16x16())) }
+    unsafe {
+        let count = count.as_u16x16();
+        let good: u16x16 = simd_lt(count, u16x16::splat(16));
+        let count = simd_select(good, count, u16x16::ZERO);
+        simd_select(good, simd_shl(a.as_u16x16(), count), u16x16::ZERO).as_m256i()
+    }
 }

 /// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -6930,7 +6940,12 @@ pub fn _mm256_maskz_sllv_epi16(k: __mmask16, a: __m256i, count: __m256i) -> __m2
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpsllvw))]
 pub fn _mm_sllv_epi16(a: __m128i, count: __m128i) -> __m128i {
-    unsafe { transmute(vpsllvw128(a.as_i16x8(), count.as_i16x8())) }
+    unsafe {
+        let count = count.as_u16x8();
+        let good: u16x8 = simd_lt(count, u16x8::splat(16));
+        let count = simd_select(good, count, u16x8::ZERO);
+        simd_select(good, simd_shl(a.as_u16x8(), count), u16x8::ZERO).as_m128i()
+    }
 }

 /// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -7188,7 +7203,12 @@ pub fn _mm_maskz_srli_epi16(k: __mmask8, a: __m128i) -> __m128i
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpsrlvw))]
 pub fn _mm512_srlv_epi16(a: __m512i, count: __m512i) -> __m512i {
-    unsafe { transmute(vpsrlvw(a.as_i16x32(), count.as_i16x32())) }
+    unsafe {
+        let count = count.as_u16x32();
+        let good: u16x32 = simd_lt(count, u16x32::splat(16));
+        let count = simd_select(good, count, u16x32::ZERO);
+        simd_select(good, simd_shr(a.as_u16x32(), count), u16x32::ZERO).as_m512i()
+    }
 }

 /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -7227,7 +7247,12 @@ pub fn _mm512_maskz_srlv_epi16(k: __mmask32, a: __m512i, count: __m512i) -> __m5
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpsrlvw))]
 pub fn _mm256_srlv_epi16(a: __m256i, count: __m256i) -> __m256i {
-    unsafe { transmute(vpsrlvw256(a.as_i16x16(), count.as_i16x16())) }
+    unsafe {
+        let count = count.as_u16x16();
+        let good: u16x16 = simd_lt(count, u16x16::splat(16));
+        let count = simd_select(good, count, u16x16::ZERO);
+        simd_select(good, simd_shr(a.as_u16x16(), count), u16x16::ZERO).as_m256i()
+    }
 }

 /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -7266,7 +7291,12 @@ pub fn _mm256_maskz_srlv_epi16(k: __mmask16, a: __m256i, count: __m256i) -> __m2
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpsrlvw))]
 pub fn _mm_srlv_epi16(a: __m128i, count: __m128i) -> __m128i {
-    unsafe { transmute(vpsrlvw128(a.as_i16x8(), count.as_i16x8())) }
+    unsafe {
+        let count = count.as_u16x8();
+        let good: u16x8 = simd_lt(count, u16x8::splat(16));
+        let count = simd_select(good, count, u16x8::ZERO);
+        simd_select(good, simd_shr(a.as_u16x8(), count), u16x8::ZERO).as_m128i()
+    }
 }

 /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -7511,7 +7541,12 @@ pub fn _mm_maskz_srai_epi16(k: __mmask8, a: __m128i) -> __m128i
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpsravw))]
 pub fn _mm512_srav_epi16(a: __m512i, count: __m512i) -> __m512i {
-    unsafe { transmute(vpsravw(a.as_i16x32(), count.as_i16x32())) }
+    unsafe {
+        let count = count.as_u16x32();
+        let good: u16x32 = simd_lt(count, u16x32::splat(16));
+        let count = simd_select(good, transmute(count), i16x32::splat(15));
+        simd_shr(a.as_i16x32(), count).as_m512i()
+    }
 }

 /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -7550,7 +7585,12 @@ pub fn _mm512_maskz_srav_epi16(k: __mmask32, a: __m512i, count: __m512i) -> __m5
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpsravw))]
 pub fn _mm256_srav_epi16(a: __m256i, count: __m256i) -> __m256i {
-    unsafe { transmute(vpsravw256(a.as_i16x16(), count.as_i16x16())) }
+    unsafe {
+        let count = count.as_u16x16();
+        let good: u16x16 = simd_lt(count, u16x16::splat(16));
+        let count = simd_select(good, transmute(count), i16x16::splat(15));
+        simd_shr(a.as_i16x16(), count).as_m256i()
+    }
 }

 /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -7589,7 +7629,12 @@ pub fn _mm256_maskz_srav_epi16(k: __mmask16, a: __m256i, count: __m256i) -> __m2
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpsravw))]
 pub fn _mm_srav_epi16(a: __m128i, count: __m128i) -> __m128i {
-    unsafe { transmute(vpsravw128(a.as_i16x8(), count.as_i16x8())) }
+    unsafe {
+        let count = count.as_u16x8();
+        let good: u16x8 = simd_lt(count, u16x8::splat(16));
+        let count = simd_select(good, transmute(count), i16x8::splat(15));
+        simd_shr(a.as_i16x8(), count).as_m128i()
+    }
 }

 /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -11645,33 +11690,12 @@ unsafe extern "C" {
     #[link_name = "llvm.x86.avx512.psll.w.512"]
     fn vpsllw(a: i16x32, count: i16x8) -> i16x32;

-    #[link_name = "llvm.x86.avx512.psllv.w.512"]
-    fn vpsllvw(a: i16x32, b: i16x32) -> i16x32;
-    #[link_name = "llvm.x86.avx512.psllv.w.256"]
-    fn vpsllvw256(a: i16x16, b: i16x16) -> i16x16;
-    #[link_name = "llvm.x86.avx512.psllv.w.128"]
-    fn vpsllvw128(a: i16x8, b: i16x8) -> i16x8;
-
     #[link_name = "llvm.x86.avx512.psrl.w.512"]
     fn vpsrlw(a: i16x32, count: i16x8) -> i16x32;

-    #[link_name = "llvm.x86.avx512.psrlv.w.512"]
-    fn vpsrlvw(a: i16x32, b: i16x32) -> i16x32;
-    #[link_name = "llvm.x86.avx512.psrlv.w.256"]
-    fn vpsrlvw256(a: i16x16, b: i16x16) -> i16x16;
-    #[link_name = "llvm.x86.avx512.psrlv.w.128"]
-    fn vpsrlvw128(a: i16x8, b: i16x8) -> i16x8;
-
     #[link_name = "llvm.x86.avx512.psra.w.512"]
     fn vpsraw(a: i16x32, count: i16x8) -> i16x32;

-    #[link_name = "llvm.x86.avx512.psrav.w.512"]
-    fn vpsravw(a: i16x32, count: i16x32) -> i16x32;
-    #[link_name = "llvm.x86.avx512.psrav.w.256"]
-    fn vpsravw256(a: i16x16, count: i16x16) -> i16x16;
-    #[link_name = "llvm.x86.avx512.psrav.w.128"]
-    fn vpsravw128(a: i16x8, count: i16x8) -> i16x8;
-
     #[link_name = "llvm.x86.avx512.vpermi2var.hi.512"]
     fn vpermi2w(a: i16x32, idx: i16x32, b: i16x32) -> i16x32;
     #[link_name = "llvm.x86.avx512.vpermi2var.hi.256"]
diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs
index b60df7dbc9..f146fc6ad9 100644
--- a/crates/core_arch/src/x86/avx512f.rs
+++ b/crates/core_arch/src/x86/avx512f.rs
@@ -20940,7 +20940,12 @@ pub fn _mm_maskz_srai_epi64(k: __mmask8, a: __m128i) -> __m128i
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpsravd))]
 pub fn _mm512_srav_epi32(a: __m512i, count: __m512i) -> __m512i {
-    unsafe { transmute(vpsravd(a.as_i32x16(), count.as_i32x16())) }
+    unsafe {
+        let count = count.as_u32x16();
+        let good: u32x16 = simd_lt(count, u32x16::splat(32));
+        let count = simd_select(good, transmute(count), i32x16::splat(31));
+        simd_shr(a.as_i32x16(), count).as_m512i()
+    }
 }

 /// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21035,7 +21040,12 @@ pub fn _mm_maskz_srav_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpsravq))]
 pub fn _mm512_srav_epi64(a: __m512i, count: __m512i) -> __m512i {
-    unsafe { transmute(vpsravq(a.as_i64x8(), count.as_i64x8())) }
+    unsafe {
+        let count = count.as_u64x8();
+        let good: u64x8 = simd_lt(count, u64x8::splat(64));
+        let count = simd_select(good, transmute(count), i64x8::splat(63));
+        simd_shr(a.as_i64x8(), count).as_m512i()
+    }
 }

 /// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21074,7 +21084,12 @@ pub fn _mm512_maskz_srav_epi64(k: __mmask8, a: __m512i, count: __m512i) -> __m51
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpsravq))]
 pub fn _mm256_srav_epi64(a: __m256i, count: __m256i) -> __m256i {
-    unsafe { transmute(vpsravq256(a.as_i64x4(), count.as_i64x4())) }
+    unsafe {
+        let count = count.as_u64x4();
+        let good: u64x4 = simd_lt(count, u64x4::splat(64));
+        let count = simd_select(good, transmute(count), i64x4::splat(63));
+        simd_shr(a.as_i64x4(), count).as_m256i()
+    }
 }

 /// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21113,7 +21128,12 @@ pub fn _mm256_maskz_srav_epi64(k: __mmask8, a: __m256i, count: __m256i) -> __m25
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpsravq))]
 pub fn _mm_srav_epi64(a: __m128i, count: __m128i) -> __m128i {
-    unsafe { transmute(vpsravq128(a.as_i64x2(), count.as_i64x2())) }
+    unsafe {
+        let count = count.as_u64x2();
+        let good: u64x2 = simd_lt(count, u64x2::splat(64));
+        let count = simd_select(good, transmute(count), i64x2::splat(63));
+        simd_shr(a.as_i64x2(), count).as_m128i()
+    }
 }

 /// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21692,7 +21712,12 @@ pub fn _mm_maskz_rorv_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpsllvd))]
 pub fn _mm512_sllv_epi32(a: __m512i, count: __m512i) -> __m512i {
-    unsafe { transmute(vpsllvd(a.as_i32x16(), count.as_i32x16())) }
+    unsafe {
+        let count = count.as_u32x16();
+        let good: u32x16 = simd_lt(count, u32x16::splat(32));
+        let count = simd_select(good, count, u32x16::ZERO);
+        simd_select(good, simd_shl(a.as_u32x16(), count), u32x16::ZERO).as_m512i()
+    }
 }

 /// Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21787,7 +21812,12 @@ pub fn _mm_maskz_sllv_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpsrlvd))]
 pub fn _mm512_srlv_epi32(a: __m512i, count: __m512i) -> __m512i {
-    unsafe { transmute(vpsrlvd(a.as_i32x16(), count.as_i32x16())) }
+    unsafe {
+        let count = count.as_u32x16();
+        let good: u32x16 = simd_lt(count, u32x16::splat(32));
+        let count = simd_select(good, count, u32x16::ZERO);
+        simd_select(good, simd_shr(a.as_u32x16(), count), u32x16::ZERO).as_m512i()
+    }
 }

 /// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21882,7 +21912,12 @@ pub fn _mm_maskz_srlv_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpsllvq))]
 pub fn _mm512_sllv_epi64(a: __m512i, count: __m512i) -> __m512i {
-    unsafe { transmute(vpsllvq(a.as_i64x8(), count.as_i64x8())) }
+    unsafe {
+        let count = count.as_u64x8();
+        let good: u64x8 = simd_lt(count, u64x8::splat(64));
+        let count = simd_select(good, count, u64x8::ZERO);
+        simd_select(good, simd_shl(a.as_u64x8(), count), u64x8::ZERO).as_m512i()
+    }
 }

 /// Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21977,7 +22012,12 @@ pub fn _mm_maskz_sllv_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpsrlvq))]
 pub fn _mm512_srlv_epi64(a: __m512i, count: __m512i) -> __m512i {
-    unsafe { transmute(vpsrlvq(a.as_i64x8(), count.as_i64x8())) }
+    unsafe {
+        let count = count.as_u64x8();
+        let good: u64x8 = simd_lt(count, u64x8::splat(64));
+        let count = simd_select(good, count, u64x8::ZERO);
+        simd_select(good, simd_shr(a.as_u64x8(), count), u64x8::ZERO).as_m512i()
+    }
 }

 /// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -42833,15 +42873,6 @@ unsafe extern "C" {
     #[link_name = "llvm.x86.avx512.mask.cmp.pd.128"]
     fn vcmppd128(a: f64x2, b: f64x2, op: i32, m: i8) -> i8;

-    #[link_name = "llvm.x86.avx512.psllv.d.512"]
-    fn vpsllvd(a: i32x16, b: i32x16) -> i32x16;
-    #[link_name = "llvm.x86.avx512.psrlv.d.512"]
-    fn vpsrlvd(a: i32x16, b: i32x16) -> i32x16;
-    #[link_name = "llvm.x86.avx512.psllv.q.512"]
-    fn vpsllvq(a: i64x8, b: i64x8) -> i64x8;
-    #[link_name = "llvm.x86.avx512.psrlv.q.512"]
-    fn vpsrlvq(a: i64x8, b: i64x8) -> i64x8;
-
     #[link_name = "llvm.x86.avx512.psll.d.512"]
     fn vpslld(a: i32x16, count: i32x4) -> i32x16;
     #[link_name = "llvm.x86.avx512.psrl.d.512"]
@@ -42861,16 +42892,6 @@ unsafe extern "C" {
     #[link_name = "llvm.x86.avx512.psra.q.128"]
     fn vpsraq128(a: i64x2, count: i64x2) -> i64x2;

-    #[link_name = "llvm.x86.avx512.psrav.d.512"]
-    fn vpsravd(a: i32x16, count: i32x16) -> i32x16;
-
-    #[link_name = "llvm.x86.avx512.psrav.q.512"]
-    fn vpsravq(a: i64x8, count: i64x8) -> i64x8;
-    #[link_name = "llvm.x86.avx512.psrav.q.256"]
-    fn vpsravq256(a: i64x4, count: i64x4) -> i64x4;
-    #[link_name = "llvm.x86.avx512.psrav.q.128"]
-    fn vpsravq128(a: i64x2, count: i64x2) -> i64x2;
-
     #[link_name = "llvm.x86.avx512.vpermilvar.ps.512"]
     fn vpermilps(a: f32x16, b: i32x16) -> f32x16;
     #[link_name = "llvm.x86.avx512.vpermilvar.pd.512"]
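
Note (not part of the patch): the rewritten sllv/srlv bodies clamp an out-of-range count by zeroing the whole lane, while the srav bodies saturate the count to lane_bits - 1 so the sign bit fills the lane; the clamping also keeps the generic simd_shl/simd_shr from ever seeing a shift amount of lane width or more, which would otherwise be undefined. A minimal scalar sketch of that per-lane rule, using hypothetical helper names purely for illustration:

// Illustrative scalar model of one 32-bit lane; the vector code above applies
// the same rule to every lane via simd_lt/simd_select.
fn sllv_lane(a: u32, count: u32) -> u32 {
    // Logical shifts (sllv/srlv): counts of 32 or more produce 0.
    if count < 32 { a << count } else { 0 }
}

fn srav_lane(a: i32, count: u32) -> i32 {
    // Arithmetic shift (srav): counts of 32 or more behave like a shift by 31,
    // replicating the sign bit across the lane.
    a >> count.min(31)
}

fn main() {
    assert_eq!(sllv_lane(1, 31), 1 << 31);
    assert_eq!(sllv_lane(1, 32), 0); // out-of-range count zeroes the lane
    assert_eq!(srav_lane(-8, 64), -1); // saturated to 31, sign bit fills the lane
}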