ppv_lite86/x86_64/sse2.rs

1use crate::soft::{x2, x4};
2use crate::types::*;
3use crate::vec128_storage;
4use crate::x86_64::Avx2Machine;
5use crate::x86_64::SseMachine as Machine86;
6use crate::x86_64::{NoS3, NoS4, YesS3, YesS4};
7use core::arch::x86_64::*;
8use core::marker::PhantomData;
9use core::ops::{
10    Add, AddAssign, BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign, Not,
11};
12use zerocopy::{transmute, AsBytes, FromBytes, FromZeroes};
13
// Generates an impl of a lane-wise binary-operator trait (BitAnd, Add, ...)
// for one of the SSE2 vector wrappers by delegating to a named intrinsic.
macro_rules! impl_binop {
    ($vec:ident, $trait:ident, $fn:ident, $impl_fn:ident) => {
        impl<S3, S4, NI> $trait for $vec<S3, S4, NI> {
            type Output = Self;
            #[inline(always)]
            fn $fn(self, rhs: Self) -> Self::Output {
                // Delegate to the intrinsic supplied at the macro call site.
                Self::new(unsafe { $impl_fn(self.x, rhs.x) })
            }
        }
    };
}

// Generates the matching `*Assign` trait impl in terms of the non-assigning
// operator (e.g. `AddAssign` via `add`).
macro_rules! impl_binop_assign {
    ($vec:ident, $trait:ident, $fn_assign:ident, $fn:ident) => {
        impl<S3, S4, NI> $trait for $vec<S3, S4, NI>
        where
            $vec<S3, S4, NI>: Copy,
        {
            #[inline(always)]
            fn $fn_assign(&mut self, rhs: Self) {
                *self = self.$fn(rhs);
            }
        }
    };
}
39
// Defines one SSE2-backed vector type: a #[repr(transparent)] wrapper around
// a single `__m128i`, together with its storage conversion, byte-order I/O,
// `Default`, `Not`, the shared bitwise operators, and `AndNot`.
macro_rules! def_vec {
    ($vec:ident, $word:ident) => {
        #[allow(non_camel_case_types)]
        #[derive(Copy, Clone, FromBytes, AsBytes, FromZeroes)]
        #[repr(transparent)]
        pub struct $vec<S3, S4, NI> {
            x: __m128i,
            // Zero-sized capability markers; S3/S4/NI match the YesS3/NoS3,
            // YesS4/NoS4 tokens imported from crate::x86_64 (presumably
            // SSSE3 / SSE4.1 / NI feature flags -- confirm there).
            s3: PhantomData<S3>,
            s4: PhantomData<S4>,
            ni: PhantomData<NI>,
        }

        impl<S3, S4, NI> Store<vec128_storage> for $vec<S3, S4, NI> {
            #[inline(always)]
            unsafe fn unpack(x: vec128_storage) -> Self {
                // Reads the `sse2` variant of the storage union.
                Self::new(x.sse2)
            }
        }
        impl<S3, S4, NI> From<$vec<S3, S4, NI>> for vec128_storage {
            #[inline(always)]
            fn from(x: $vec<S3, S4, NI>) -> Self {
                vec128_storage { sse2: x.x }
            }
        }
        impl<S3, S4, NI> $vec<S3, S4, NI> {
            // Internal constructor: wrap a raw __m128i value.
            #[inline(always)]
            fn new(x: __m128i) -> Self {
                $vec {
                    x,
                    s3: PhantomData,
                    s4: PhantomData,
                    ni: PhantomData,
                }
            }
        }

        // Unaligned 16-byte loads/stores; the `_be` variants byte-swap via
        // the type's `BSwap` impl around a little-endian load/store.
        impl<S3, S4, NI> StoreBytes for $vec<S3, S4, NI>
        where
            Self: BSwap,
        {
            #[inline(always)]
            unsafe fn unsafe_read_le(input: &[u8]) -> Self {
                assert_eq!(input.len(), 16);
                Self::new(_mm_loadu_si128(input.as_ptr() as *const _))
            }
            #[inline(always)]
            unsafe fn unsafe_read_be(input: &[u8]) -> Self {
                assert_eq!(input.len(), 16);
                Self::new(_mm_loadu_si128(input.as_ptr() as *const _)).bswap()
            }
            #[inline(always)]
            fn write_le(self, out: &mut [u8]) {
                assert_eq!(out.len(), 16);
                unsafe { _mm_storeu_si128(out.as_mut_ptr() as *mut _, self.x) }
            }
            #[inline(always)]
            fn write_be(self, out: &mut [u8]) {
                assert_eq!(out.len(), 16);
                let x = self.bswap().x;
                unsafe {
                    _mm_storeu_si128(out.as_mut_ptr() as *mut _, x);
                }
            }
        }

        impl<S3, S4, NI> Default for $vec<S3, S4, NI> {
            #[inline(always)]
            fn default() -> Self {
                // All-zero vector.
                Self::new(unsafe { _mm_setzero_si128() })
            }
        }

        impl<S3, S4, NI> Not for $vec<S3, S4, NI> {
            type Output = Self;
            #[inline(always)]
            fn not(self) -> Self::Output {
                // !x == x ^ all-ones.
                unsafe {
                    let ff = _mm_set1_epi64x(-1i64);
                    self ^ Self::new(ff)
                }
            }
        }

        impl<S3: Copy, S4: Copy, NI: Copy> BitOps0 for $vec<S3, S4, NI> {}
        impl_binop!($vec, BitAnd, bitand, _mm_and_si128);
        impl_binop!($vec, BitOr, bitor, _mm_or_si128);
        impl_binop!($vec, BitXor, bitxor, _mm_xor_si128);
        impl_binop_assign!($vec, BitAndAssign, bitand_assign, bitand);
        impl_binop_assign!($vec, BitOrAssign, bitor_assign, bitor);
        impl_binop_assign!($vec, BitXorAssign, bitxor_assign, bitxor);
        impl<S3: Copy, S4: Copy, NI: Copy> AndNot for $vec<S3, S4, NI> {
            type Output = Self;
            #[inline(always)]
            fn andnot(self, rhs: Self) -> Self {
                // `_mm_andnot_si128(a, b)` computes (!a) & b, i.e. !self & rhs.
                Self::new(unsafe { _mm_andnot_si128(self.x, rhs.x) })
            }
        }
    };
}
139
// Marker-impl helpers: a type gets BitOps32/64/128 once it can rotate words
// of the corresponding width; each wider tier also emits the narrower ones.
macro_rules! impl_bitops32 {
    ($vec:ident) => {
        impl<S3: Copy, S4: Copy, NI: Copy> BitOps32 for $vec<S3, S4, NI> where
            $vec<S3, S4, NI>: RotateEachWord32
        {
        }
    };
}

macro_rules! impl_bitops64 {
    ($vec:ident) => {
        impl_bitops32!($vec);
        impl<S3: Copy, S4: Copy, NI: Copy> BitOps64 for $vec<S3, S4, NI> where
            $vec<S3, S4, NI>: RotateEachWord64 + RotateEachWord32
        {
        }
    };
}

macro_rules! impl_bitops128 {
    ($vec:ident) => {
        impl_bitops64!($vec);
        impl<S3: Copy, S4: Copy, NI: Copy> BitOps128 for $vec<S3, S4, NI> where
            $vec<S3, S4, NI>: RotateEachWord128
        {
        }
    };
}
168
// Right-rotate every 32-bit lane with a single SSSE3 byte shuffle; only
// usable for rotations that are whole multiples of 8 bits. The two 64-bit
// constants form the pshufb byte-permutation control.
macro_rules! rotr_32_s3 {
    ($name:ident, $k0:expr, $k1:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) })
        }
    };
}
// Right-rotate every 32-bit lane by $i bits via the classic shift-and-OR:
// (x >> i) | (x << (32 - i)).
macro_rules! rotr_32 {
    ($name:ident, $i:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe {
                _mm_or_si128(
                    _mm_srli_epi32(self.x, $i as i32),
                    _mm_slli_epi32(self.x, 32 - $i as i32),
                )
            })
        }
    };
}
// 32-bit lane rotations when SSSE3 is available: byte-multiple rotations
// (8/16/24) become a single pshufb; the rest use shift+OR.
impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<YesS3, S4, NI> {
    rotr_32!(rotate_each_word_right7, 7);
    rotr_32_s3!(
        rotate_each_word_right8,
        0x0c0f_0e0d_080b_0a09,
        0x0407_0605_0003_0201
    );
    rotr_32!(rotate_each_word_right11, 11);
    rotr_32!(rotate_each_word_right12, 12);
    rotr_32_s3!(
        rotate_each_word_right16,
        0x0d0c_0f0e_0908_0b0a,
        0x0504_0706_0100_0302
    );
    rotr_32!(rotate_each_word_right20, 20);
    rotr_32_s3!(
        rotate_each_word_right24,
        0x0e0d_0c0f_0a09_080b,
        0x0605_0407_0201_0003
    );
    rotr_32!(rotate_each_word_right25, 25);
}
// SSE2-only fallback: shift+OR throughout, except the 16-bit rotation which
// is a 16-bit lane swap (see swap16_s2 below).
impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<NoS3, S4, NI> {
    rotr_32!(rotate_each_word_right7, 7);
    rotr_32!(rotate_each_word_right8, 8);
    rotr_32!(rotate_each_word_right11, 11);
    rotr_32!(rotate_each_word_right12, 12);
    #[inline(always)]
    fn rotate_each_word_right16(self) -> Self {
        Self::new(swap16_s2(self.x))
    }
    rotr_32!(rotate_each_word_right20, 20);
    rotr_32!(rotate_each_word_right24, 24);
    rotr_32!(rotate_each_word_right25, 25);
}
225
// 64-bit-lane analogue of rotr_32_s3: byte-multiple rotation as one pshufb.
macro_rules! rotr_64_s3 {
    ($name:ident, $k0:expr, $k1:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) })
        }
    };
}
// (x >> i) | (x << (64 - i)) on each 64-bit lane.
macro_rules! rotr_64 {
    ($name:ident, $i:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe {
                _mm_or_si128(
                    _mm_srli_epi64(self.x, $i as i32),
                    _mm_slli_epi64(self.x, 64 - $i as i32),
                )
            })
        }
    };
}
// 64-bit lane rotations with SSSE3: byte-multiple rotations via pshufb,
// others via shift+OR.
impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<YesS3, S4, NI> {
    rotr_64!(rotate_each_word_right7, 7);
    rotr_64_s3!(
        rotate_each_word_right8,
        0x080f_0e0d_0c0b_0a09,
        0x0007_0605_0403_0201
    );
    rotr_64!(rotate_each_word_right11, 11);
    rotr_64!(rotate_each_word_right12, 12);
    rotr_64_s3!(
        rotate_each_word_right16,
        0x0908_0f0e_0d0c_0b0a,
        0x0100_0706_0504_0302
    );
    rotr_64!(rotate_each_word_right20, 20);
    rotr_64_s3!(
        rotate_each_word_right24,
        0x0a09_080f_0e0d_0c0b,
        0x0201_0007_0605_0403
    );
    rotr_64!(rotate_each_word_right25, 25);
}
// SSE2-only fallback: shift+OR, except the 16-bit rotation via swap16_s2.
impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<NoS3, S4, NI> {
    rotr_64!(rotate_each_word_right7, 7);
    rotr_64!(rotate_each_word_right8, 8);
    rotr_64!(rotate_each_word_right11, 11);
    rotr_64!(rotate_each_word_right12, 12);
    #[inline(always)]
    fn rotate_each_word_right16(self) -> Self {
        Self::new(swap16_s2(self.x))
    }
    rotr_64!(rotate_each_word_right20, 20);
    rotr_64!(rotate_each_word_right24, 24);
    rotr_64!(rotate_each_word_right25, 25);
}
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u64x2_sse2<S3, S4, NI> {
    #[inline(always)]
    fn rotate_each_word_right32(self) -> Self {
        // 0b10110001 swaps the two 32-bit halves of each 64-bit lane,
        // which is exactly a 32-bit rotation of that lane.
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b10110001) })
    }
}
288
// "Rotation" helper for the single 128-bit word of u128x1.
// NOTE(review): `_mm_srli_si128` / `_mm_slli_si128` shift by *bytes* and take
// an immediate in 0..=15 (larger counts zero the register), so `$i` here is
// not a bit count and `128 - $i` is far out of range. These do not look like
// genuine $i-bit rotates; the TODOs below already flag this code as rough.
// Confirm any call sites before relying on these methods.
macro_rules! rotr_128 {
    ($name:ident, $i:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe {
                _mm_or_si128(
                    _mm_srli_si128(self.x, $i as i32),
                    _mm_slli_si128(self.x, 128 - $i as i32),
                )
            })
        }
    };
}
// TODO: completely unoptimized
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord32 for u128x1_sse2<S3, S4, NI> {
    rotr_128!(rotate_each_word_right7, 7);
    rotr_128!(rotate_each_word_right8, 8);
    rotr_128!(rotate_each_word_right11, 11);
    rotr_128!(rotate_each_word_right12, 12);
    rotr_128!(rotate_each_word_right16, 16);
    rotr_128!(rotate_each_word_right20, 20);
    rotr_128!(rotate_each_word_right24, 24);
    rotr_128!(rotate_each_word_right25, 25);
}
// TODO: completely unoptimized
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u128x1_sse2<S3, S4, NI> {
    rotr_128!(rotate_each_word_right32, 32);
}
// Empty impl: no methods are supplied for RotateEachWord128 here.
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord128 for u128x1_sse2<S3, S4, NI> {}
318
// Instantiate the three lane widths; all share the same 128-bit layout.
def_vec!(u32x4_sse2, u32);
def_vec!(u64x2_sse2, u64);
def_vec!(u128x1_sse2, u128);
322
// Lane access for u32x4 with SSE4.1: use 64-bit extract/insert to move two
// 32-bit lanes at a time.
impl<S3, NI> MultiLane<[u32; 4]> for u32x4_sse2<S3, YesS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u32; 4] {
        unsafe {
            // Low and high 64-bit halves, each split into two u32 lanes.
            let x = _mm_cvtsi128_si64(self.x) as u64;
            let y = _mm_extract_epi64(self.x, 1) as u64;
            [x as u32, (x >> 32) as u32, y as u32, (y >> 32) as u32]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u32; 4]) -> Self {
        unsafe {
            // Pack lane pairs into 64-bit halves, then insert the high half.
            let mut x = _mm_cvtsi64_si128((xs[0] as u64 | ((xs[1] as u64) << 32)) as i64);
            x = _mm_insert_epi64(x, (xs[2] as u64 | ((xs[3] as u64) << 32)) as i64, 1);
            Self::new(x)
        }
    }
}
// SSE2-only variant: reach the high half with shuffles/shifts instead of
// SSE4.1 extract/insert.
impl<S3, NI> MultiLane<[u32; 4]> for u32x4_sse2<S3, NoS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u32; 4] {
        unsafe {
            let x = _mm_cvtsi128_si64(self.x) as u64;
            // 0b11101110 copies lanes 2,3 into the low half before the read.
            let y = _mm_cvtsi128_si64(_mm_shuffle_epi32(self.x, 0b11101110)) as u64;
            [x as u32, (x >> 32) as u32, y as u32, (y >> 32) as u32]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u32; 4]) -> Self {
        unsafe {
            let x = (xs[0] as u64 | ((xs[1] as u64) << 32)) as i64;
            let y = (xs[2] as u64 | ((xs[3] as u64) << 32)) as i64;
            let x = _mm_cvtsi64_si128(x);
            // Shift the high half up 8 bytes, then OR the halves together.
            let y = _mm_slli_si128(_mm_cvtsi64_si128(y), 8);
            Self::new(_mm_or_si128(x, y))
        }
    }
}
// 64-bit lane access with SSE4.1 extract/insert.
impl<S3, NI> MultiLane<[u64; 2]> for u64x2_sse2<S3, YesS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u64; 2] {
        unsafe {
            [
                _mm_cvtsi128_si64(self.x) as u64,
                _mm_extract_epi64(self.x, 1) as u64,
            ]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u64; 2]) -> Self {
        unsafe {
            let mut x = _mm_cvtsi64_si128(xs[0] as i64);
            x = _mm_insert_epi64(x, xs[1] as i64, 1);
            Self::new(x)
        }
    }
}
// SSE2-only variant: move the high lane with 8-byte shifts instead.
impl<S3, NI> MultiLane<[u64; 2]> for u64x2_sse2<S3, NoS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u64; 2] {
        unsafe {
            [
                _mm_cvtsi128_si64(self.x) as u64,
                // Shift the high lane down 8 bytes to read it from lane 0.
                _mm_cvtsi128_si64(_mm_srli_si128(self.x, 8)) as u64,
            ]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u64; 2]) -> Self {
        unsafe {
            let x = _mm_cvtsi64_si128(xs[0] as i64);
            let y = _mm_slli_si128(_mm_cvtsi64_si128(xs[1] as i64), 8);
            Self::new(_mm_or_si128(x, y))
        }
    }
}
// Scalar access for the 128-bit view is not provided (both methods panic).
impl<S3, S4, NI> MultiLane<[u128; 1]> for u128x1_sse2<S3, S4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u128; 1] {
        unimplemented!()
    }
    #[inline(always)]
    fn from_lanes(xs: [u128; 1]) -> Self {
        unimplemented!("{:?}", xs)
    }
}
409
410impl<S3, S4, NI> MultiLane<[u64; 4]> for u64x4_sse2<S3, S4, NI>
411where
412    u64x2_sse2<S3, S4, NI>: MultiLane<[u64; 2]> + Copy,
413{
414    #[inline(always)]
415    fn to_lanes(self) -> [u64; 4] {
416        let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes());
417        [a[0], a[1], b[0], b[1]]
418    }
419    #[inline(always)]
420    fn from_lanes(xs: [u64; 4]) -> Self {
421        let (a, b) = (
422            u64x2_sse2::from_lanes([xs[0], xs[1]]),
423            u64x2_sse2::from_lanes([xs[2], xs[3]]),
424        );
425        x2::new([a, b])
426    }
427}
428
// Reinterpreting conversions between the 128-bit views: same bits, new type.
macro_rules! impl_into {
    ($from:ident, $to:ident) => {
        impl<S3, S4, NI> From<$from<S3, S4, NI>> for $to<S3, S4, NI> {
            #[inline(always)]
            fn from(x: $from<S3, S4, NI>) -> Self {
                $to::new(x.x)
            }
        }
    };
}

// A u128x1 can be viewed as four u32 lanes or two u64 lanes.
impl_into!(u128x1_sse2, u32x4_sse2);
impl_into!(u128x1_sse2, u64x2_sse2);

// Attach the bit-ops marker tiers to each lane width.
impl_bitops32!(u32x4_sse2);
impl_bitops64!(u64x2_sse2);
impl_bitops128!(u128x1_sse2);
446
// Arithmetic markers plus wrapping lane-wise addition for the 32- and 64-bit
// lane types; no Add impl is provided for u128x1 here.
impl<S3: Copy, S4: Copy, NI: Copy> ArithOps for u32x4_sse2<S3, S4, NI> where
    u32x4_sse2<S3, S4, NI>: BSwap
{
}
impl<S3: Copy, S4: Copy, NI: Copy> ArithOps for u64x2_sse2<S3, S4, NI> where
    u64x2_sse2<S3, S4, NI>: BSwap
{
}
impl_binop!(u32x4_sse2, Add, add, _mm_add_epi32);
impl_binop!(u64x2_sse2, Add, add, _mm_add_epi64);
impl_binop_assign!(u32x4_sse2, AddAssign, add_assign, add);
impl_binop_assign!(u64x2_sse2, AddAssign, add_assign, add);
459
// Marker impls wiring the base vector types into the generic SSE machine;
// the where-clauses just assert that every required operation exists.
impl<S3: Copy, S4: Copy, NI: Copy> u32x4<Machine86<S3, S4, NI>> for u32x4_sse2<S3, S4, NI>
where
    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap + MultiLane<[u32; 4]> + Vec4<u32>,
    Machine86<S3, S4, NI>: Machine,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x2<Machine86<S3, S4, NI>> for u64x2_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>:
        RotateEachWord64 + RotateEachWord32 + BSwap + MultiLane<[u64; 2]> + Vec2<u64>,
    Machine86<S3, S4, NI>: Machine,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u128x1<Machine86<S3, S4, NI>> for u128x1_sse2<S3, S4, NI>
where
    u128x1_sse2<S3, S4, NI>: Swap64 + RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u128x1_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4>,
    u128x1_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2>,
{
}

// Same wiring for the AVX2 machine's 128-bit types (always YesS3/YesS4).
// NOTE(review): the bounds are phrased via Machine86 rather than
// Avx2Machine; presumably the associated types coincide -- confirm in
// crate::x86_64.
impl<NI: Copy> u32x4<Avx2Machine<NI>> for u32x4_sse2<YesS3, YesS4, NI>
where
    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap + MultiLane<[u32; 4]> + Vec4<u32>,
    Machine86<YesS3, YesS4, NI>: Machine,
{
}
impl<NI: Copy> u64x2<Avx2Machine<NI>> for u64x2_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>:
        RotateEachWord64 + RotateEachWord32 + BSwap + MultiLane<[u64; 2]> + Vec2<u64>,
    Machine86<YesS3, YesS4, NI>: Machine,
{
}
impl<NI: Copy> u128x1<Avx2Machine<NI>> for u128x1_sse2<YesS3, YesS4, NI>
where
    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<YesS3, YesS4, NI>: Machine,
    u128x1_sse2<YesS3, YesS4, NI>: Into<<Machine86<YesS3, YesS4, NI> as Machine>::u32x4>,
    u128x1_sse2<YesS3, YesS4, NI>: Into<<Machine86<YesS3, YesS4, NI> as Machine>::u64x2>,
{
}
503
impl<S3, S4, NI> UnsafeFrom<[u32; 4]> for u32x4_sse2<S3, S4, NI> {
    // Build a vector from four lanes; `_mm_set_epi32` takes its arguments
    // high-lane-first, hence the reversed order.
    #[inline(always)]
    unsafe fn unsafe_from(xs: [u32; 4]) -> Self {
        Self::new(_mm_set_epi32(
            xs[3] as i32,
            xs[2] as i32,
            xs[1] as i32,
            xs[0] as i32,
        ))
    }
}
515
// Indexed lane access with SSE4.1: `_mm_insert_epi32` requires a constant
// lane index, hence the match over `i`.
impl<S3, NI> Vec4<u32> for u32x4_sse2<S3, YesS4, NI>
where
    Self: MultiLane<[u32; 4]>,
{
    #[inline(always)]
    fn extract(self, i: u32) -> u32 {
        self.to_lanes()[i as usize]
    }
    #[inline(always)]
    fn insert(self, v: u32, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => _mm_insert_epi32(self.x, v as i32, 0),
                1 => _mm_insert_epi32(self.x, v as i32, 1),
                2 => _mm_insert_epi32(self.x, v as i32, 2),
                3 => _mm_insert_epi32(self.x, v as i32, 3),
                _ => unreachable!(),
            }
        })
    }
}
// SSE2-only insert: drop the replaced lane with shuffles/shifts, OR the new
// value into lane 0, then permute the lanes back into order.
impl<S3, NI> Vec4<u32> for u32x4_sse2<S3, NoS4, NI>
where
    Self: MultiLane<[u32; 4]>,
{
    #[inline(always)]
    fn extract(self, i: u32) -> u32 {
        self.to_lanes()[i as usize]
    }
    #[inline(always)]
    fn insert(self, v: u32, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => {
                    // Clear lane 0 (andnot against a lane-0 all-ones mask),
                    // then OR in the new value.
                    let x = _mm_andnot_si128(_mm_cvtsi32_si128(-1), self.x);
                    _mm_or_si128(x, _mm_cvtsi32_si128(v as i32))
                }
                1 => {
                    let mut x = _mm_shuffle_epi32(self.x, 0b0111_1000);
                    x = _mm_slli_si128(x, 4);
                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
                    _mm_shuffle_epi32(x, 0b1110_0001)
                }
                2 => {
                    let mut x = _mm_shuffle_epi32(self.x, 0b1011_0100);
                    x = _mm_slli_si128(x, 4);
                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
                    _mm_shuffle_epi32(x, 0b1100_1001)
                }
                3 => {
                    let mut x = _mm_slli_si128(self.x, 4);
                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
                    _mm_shuffle_epi32(x, 0b0011_1001)
                }
                _ => unreachable!(),
            }
        })
    }
}
575
// For this single-register type the "lane words" shuffles are the same
// operation as the plain word shuffles, so just delegate.
impl<S3, S4, NI> LaneWords4 for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    fn shuffle_lane_words2301(self) -> Self {
        self.shuffle2301()
    }
    #[inline(always)]
    fn shuffle_lane_words1230(self) -> Self {
        self.shuffle1230()
    }
    #[inline(always)]
    fn shuffle_lane_words3012(self) -> Self {
        self.shuffle3012()
    }
}

// `_mm_shuffle_epi32` control: result lane k takes source lane (imm>>2k)&3.
impl<S3, S4, NI> Words4 for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    fn shuffle2301(self) -> Self {
        // Output lanes [2,3,0,1]: swaps the two 64-bit halves.
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
    }
    #[inline(always)]
    fn shuffle1230(self) -> Self {
        // Output lanes [3,0,1,2]: each lane moves up one slot (wrapping).
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b1001_0011) })
    }
    #[inline(always)]
    fn shuffle3012(self) -> Self {
        // Output lanes [1,2,3,0]: each lane moves down one slot (wrapping).
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b0011_1001) })
    }
}
605
// Word shuffles over the x2 pair of u64x2 registers. With SSSE3,
// cross-register moves use `_mm_alignr_epi8` (concatenate two registers and
// extract 16 bytes at a byte offset).
impl<S4, NI> Words4 for u64x4_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn shuffle2301(self) -> Self {
        // Swap the two 128-bit halves.
        x2::new([u64x2_sse2::new(self.0[1].x), u64x2_sse2::new(self.0[0].x)])
    }
    #[inline(always)]
    fn shuffle3012(self) -> Self {
        unsafe {
            x2::new([
                u64x2_sse2::new(_mm_alignr_epi8(self.0[1].x, self.0[0].x, 8)),
                u64x2_sse2::new(_mm_alignr_epi8(self.0[0].x, self.0[1].x, 8)),
            ])
        }
    }
    #[inline(always)]
    fn shuffle1230(self) -> Self {
        unsafe {
            x2::new([
                u64x2_sse2::new(_mm_alignr_epi8(self.0[0].x, self.0[1].x, 8)),
                u64x2_sse2::new(_mm_alignr_epi8(self.0[1].x, self.0[0].x, 8)),
            ])
        }
    }
}
// SSE2-only fallback: build each crossed register from 8-byte shifts + OR.
impl<S4, NI> Words4 for u64x4_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn shuffle2301(self) -> Self {
        x2::new([u64x2_sse2::new(self.0[1].x), u64x2_sse2::new(self.0[0].x)])
    }
    #[inline(always)]
    fn shuffle3012(self) -> Self {
        unsafe {
            // a/c: high 64-bit lane moved down; b/d: low lane moved up.
            let a = _mm_srli_si128(self.0[0].x, 8);
            let b = _mm_slli_si128(self.0[0].x, 8);
            let c = _mm_srli_si128(self.0[1].x, 8);
            let d = _mm_slli_si128(self.0[1].x, 8);
            let da = _mm_or_si128(d, a);
            let bc = _mm_or_si128(b, c);
            x2::new([u64x2_sse2::new(da), u64x2_sse2::new(bc)])
        }
    }
    #[inline(always)]
    fn shuffle1230(self) -> Self {
        unsafe {
            // Same building blocks as shuffle3012, halves emitted swapped.
            let a = _mm_srli_si128(self.0[0].x, 8);
            let b = _mm_slli_si128(self.0[0].x, 8);
            let c = _mm_srli_si128(self.0[1].x, 8);
            let d = _mm_slli_si128(self.0[1].x, 8);
            let da = _mm_or_si128(d, a);
            let bc = _mm_or_si128(b, c);
            x2::new([u64x2_sse2::new(bc), u64x2_sse2::new(da)])
        }
    }
}
660
impl<S3, S4, NI> UnsafeFrom<[u64; 2]> for u64x2_sse2<S3, S4, NI> {
    // `_mm_set_epi64x` takes the high element first.
    #[inline(always)]
    unsafe fn unsafe_from(xs: [u64; 2]) -> Self {
        Self::new(_mm_set_epi64x(xs[1] as i64, xs[0] as i64))
    }
}

// Indexed 64-bit lane access with SSE4.1 extract/insert (constant index
// required, hence the match).
impl<S3, NI> Vec2<u64> for u64x2_sse2<S3, YesS4, NI> {
    #[inline(always)]
    fn extract(self, i: u32) -> u64 {
        unsafe {
            match i {
                0 => _mm_cvtsi128_si64(self.x) as u64,
                1 => _mm_extract_epi64(self.x, 1) as u64,
                _ => unreachable!(),
            }
        }
    }
    #[inline(always)]
    fn insert(self, x: u64, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => _mm_insert_epi64(self.x, x as i64, 0),
                1 => _mm_insert_epi64(self.x, x as i64, 1),
                _ => unreachable!(),
            }
        })
    }
}
// SSE2-only variant.
impl<S3, NI> Vec2<u64> for u64x2_sse2<S3, NoS4, NI> {
    #[inline(always)]
    fn extract(self, i: u32) -> u64 {
        unsafe {
            match i {
                0 => _mm_cvtsi128_si64(self.x) as u64,
                // Copy the high 64 bits into the low lane, then read them.
                1 => _mm_cvtsi128_si64(_mm_shuffle_epi32(self.x, 0b11101110)) as u64,
                _ => unreachable!(),
            }
        }
    }
    #[inline(always)]
    fn insert(self, x: u64, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                // Clear the low lane (andnot an all-ones low-lane mask),
                // then OR in the new value.
                0 => _mm_or_si128(
                    _mm_andnot_si128(_mm_cvtsi64_si128(-1), self.x),
                    _mm_cvtsi64_si128(x as i64),
                ),
                // `_mm_move_epi64` keeps only the low 64 bits; OR the new
                // value into the (now zero) high half.
                1 => _mm_or_si128(
                    _mm_move_epi64(self.x),
                    _mm_slli_si128(_mm_cvtsi64_si128(x as i64), 8),
                ),
                _ => unreachable!(),
            }
        })
    }
}
718
// Byte-swap each 32-bit lane with a single SSSE3 byte shuffle.
impl<S4, NI> BSwap for u32x4_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe {
            let k = _mm_set_epi64x(0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}
// SSE2-only 32-bit byte swap: widen each byte to a 16-bit lane (interleave
// with zero), reverse the 16-bit units within each 64-bit half, then repack
// with a saturating pack (inputs are bytes, so saturation never fires).
#[inline(always)]
fn bswap32_s2(x: __m128i) -> __m128i {
    unsafe {
        let mut y = _mm_unpacklo_epi8(x, _mm_setzero_si128());
        y = _mm_shufflehi_epi16(y, 0b0001_1011);
        y = _mm_shufflelo_epi16(y, 0b0001_1011);
        let mut z = _mm_unpackhi_epi8(x, _mm_setzero_si128());
        z = _mm_shufflehi_epi16(z, 0b0001_1011);
        z = _mm_shufflelo_epi16(z, 0b0001_1011);
        _mm_packus_epi16(y, z)
    }
}
impl<S4, NI> BSwap for u32x4_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(bswap32_s2(self.x))
    }
}

// Byte-swap each 64-bit lane with a single SSSE3 byte shuffle.
impl<S4, NI> BSwap for u64x2_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe {
            let k = _mm_set_epi64x(0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}
// SSE2 fallback: swap the 32-bit halves of each 64-bit lane, then byte-swap
// each 32-bit lane.
impl<S4, NI> BSwap for u64x2_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe { bswap32_s2(_mm_shuffle_epi32(self.x, 0b1011_0001)) })
    }
}

// Byte-swap the whole 128-bit word (full byte reversal) via SSSE3 shuffle.
impl<S4, NI> BSwap for u128x1_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe {
            let k = _mm_set_epi64x(0x0f0e_0d0c_0b0a_0908, 0x0706_0504_0302_0100);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}
// No SSE2-only 128-bit byte swap is provided (panics if called).
impl<S4, NI> BSwap for u128x1_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        unimplemented!()
    }
}
778
// Swap adjacent $i-bit groups: the high group of each pair is masked ($k
// selects the high bits) and shifted down, the low group shifted up and
// masked, then the halves are ORed back together. E.g. $i=1, $k=0xaa swaps
// neighboring bits.
macro_rules! swapi {
    ($x:expr, $i:expr, $k:expr) => {
        unsafe {
            const K: u8 = $k;
            let k = _mm_set1_epi8(K as i8);
            u128x1_sse2::new(_mm_or_si128(
                _mm_srli_epi16(_mm_and_si128($x.x, k), $i),
                _mm_and_si128(_mm_slli_epi16($x.x, $i), k),
            ))
        }
    };
}
// Swap adjacent 16-bit lanes within each 64-bit half (SSE2-only helper,
// also used for the 16-bit rotations above).
#[inline(always)]
fn swap16_s2(x: __m128i) -> __m128i {
    unsafe { _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, 0b1011_0001), 0b1011_0001) }
}
// Bit-group swaps with SSSE3 available: the byte-level swaps (swap8/swap16)
// are single pshufb shuffles.
impl<S4, NI> Swap64 for u128x1_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn swap1(self) -> Self {
        swapi!(self, 1, 0xaa)
    }
    #[inline(always)]
    fn swap2(self) -> Self {
        swapi!(self, 2, 0xcc)
    }
    #[inline(always)]
    fn swap4(self) -> Self {
        swapi!(self, 4, 0xf0)
    }
    #[inline(always)]
    fn swap8(self) -> Self {
        u128x1_sse2::new(unsafe {
            let k = _mm_set_epi64x(0x0e0f_0c0d_0a0b_0809, 0x0607_0405_0203_0001);
            _mm_shuffle_epi8(self.x, k)
        })
    }
    #[inline(always)]
    fn swap16(self) -> Self {
        u128x1_sse2::new(unsafe {
            let k = _mm_set_epi64x(0x0d0c_0f0e_0908_0b0a, 0x0504_0706_0100_0302);
            _mm_shuffle_epi8(self.x, k)
        })
    }
    #[inline(always)]
    fn swap32(self) -> Self {
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) })
    }
    #[inline(always)]
    fn swap64(self) -> Self {
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
    }
}
// SSE2-only variant: swap8 uses 16-bit shifts, swap16 the shuffle helper;
// the sub-byte swaps and the 32/64-bit shuffles match the SSSE3 version.
impl<S4, NI> Swap64 for u128x1_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn swap1(self) -> Self {
        swapi!(self, 1, 0xaa)
    }
    #[inline(always)]
    fn swap2(self) -> Self {
        swapi!(self, 2, 0xcc)
    }
    #[inline(always)]
    fn swap4(self) -> Self {
        swapi!(self, 4, 0xf0)
    }
    #[inline(always)]
    fn swap8(self) -> Self {
        u128x1_sse2::new(unsafe {
            _mm_or_si128(_mm_slli_epi16(self.x, 8), _mm_srli_epi16(self.x, 8))
        })
    }
    #[inline(always)]
    fn swap16(self) -> Self {
        u128x1_sse2::new(swap16_s2(self.x))
    }
    #[inline(always)]
    fn swap32(self) -> Self {
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) })
    }
    #[inline(always)]
    fn swap64(self) -> Self {
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
    }
}
863
// Marker tags distinguishing x2 compositions that share a representation:
// e.g. `u64x2x2_sse2` (tag G0) and `u64x4_sse2` (tag G1) below are both
// x2<u64x2_sse2, _> but are distinct types.
#[derive(Copy, Clone)]
pub struct G0;
#[derive(Copy, Clone)]
pub struct G1;

// Composite widths built from the 128-bit base types via soft x2/x4.
#[allow(non_camel_case_types)]
pub type u32x4x2_sse2<S3, S4, NI> = x2<u32x4_sse2<S3, S4, NI>, G0>;
#[allow(non_camel_case_types)]
pub type u64x2x2_sse2<S3, S4, NI> = x2<u64x2_sse2<S3, S4, NI>, G0>;
#[allow(non_camel_case_types)]
pub type u64x4_sse2<S3, S4, NI> = x2<u64x2_sse2<S3, S4, NI>, G1>;
#[allow(non_camel_case_types)]
pub type u128x2_sse2<S3, S4, NI> = x2<u128x1_sse2<S3, S4, NI>, G0>;

#[allow(non_camel_case_types)]
pub type u32x4x4_sse2<S3, S4, NI> = x4<u32x4_sse2<S3, S4, NI>>;
#[allow(non_camel_case_types)]
pub type u64x2x4_sse2<S3, S4, NI> = x4<u64x2_sse2<S3, S4, NI>>;
#[allow(non_camel_case_types)]
pub type u128x4_sse2<S3, S4, NI> = x4<u128x1_sse2<S3, S4, NI>>;

impl<S3, S4, NI> Vector<[u32; 16]> for u32x4x4_sse2<S3, S4, NI> {
    // zerocopy's `transmute!` reinterprets the 4 x 128-bit value as 16 u32s.
    #[inline(always)]
    fn to_scalars(self) -> [u32; 16] {
        transmute!(self)
    }
}
891
// Marker impls wiring the composite types into the generic SSE machine; the
// where-clauses assert the capabilities required of the component vectors.
impl<S3: Copy, S4: Copy, NI: Copy> u32x4x2<Machine86<S3, S4, NI>> for u32x4x2_sse2<S3, S4, NI>
where
    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u32x4x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 2]>,
    u32x4x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u32x4>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x2x2<Machine86<S3, S4, NI>> for u64x2x2_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u64x2x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u64x2; 2]>,
    u64x2x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u64x2>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x4<Machine86<S3, S4, NI>> for u64x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u64x4_sse2<S3, S4, NI>: MultiLane<[u64; 4]> + Vec4<u64> + Words4,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u128x2<Machine86<S3, S4, NI>> for u128x2_sse2<S3, S4, NI>
where
    u128x1_sse2<S3, S4, NI>: Swap64 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u128x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u128x1; 2]>,
    u128x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u128x1>,
    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4x2>,
    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2x2>,
    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x4>,
{
}

// Same wiring for the AVX2 machine; the components are fixed to
// YesS3/YesS4 and the bounds here do reference Avx2Machine directly.
impl<NI: Copy> u32x4x2<Avx2Machine<NI>> for u32x4x2_sse2<YesS3, YesS4, NI>
where
    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u32x4x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u32x4; 2]>,
    u32x4x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u32x4>,
{
}
impl<NI: Copy> u64x2x2<Avx2Machine<NI>> for u64x2x2_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u64x2x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u64x2; 2]>,
    u64x2x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u64x2>,
{
}
impl<NI: Copy> u64x4<Avx2Machine<NI>> for u64x4_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u64x4_sse2<YesS3, YesS4, NI>: MultiLane<[u64; 4]> + Vec4<u64> + Words4,
{
}
impl<NI: Copy> u128x2<Avx2Machine<NI>> for u128x2_sse2<YesS3, YesS4, NI>
where
    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + BSwap,
    Avx2Machine<NI>: Machine,
    u128x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u128x1; 2]>,
    u128x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u128x1>,
    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u32x4x2>,
    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x2x2>,
    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x4>,
{
}
961
962impl<S3, S4, NI> Vec4<u64> for u64x4_sse2<S3, S4, NI>
963where
964    u64x2_sse2<S3, S4, NI>: Copy + Vec2<u64>,
965{
966    #[inline(always)]
967    fn extract(self, i: u32) -> u64 {
968        match i {
969            0 => self.0[0].extract(0),
970            1 => self.0[0].extract(1),
971            2 => self.0[1].extract(0),
972            3 => self.0[1].extract(1),
973            _ => panic!(),
974        }
975    }
976    #[inline(always)]
977    fn insert(mut self, w: u64, i: u32) -> Self {
978        match i {
979            0 => self.0[0] = self.0[0].insert(w, 0),
980            1 => self.0[0] = self.0[0].insert(w, 1),
981            2 => self.0[1] = self.0[1].insert(w, 0),
982            3 => self.0[1] = self.0[1].insert(w, 1),
983            _ => panic!(),
984        };
985        self
986    }
987}
988
// Empty marker impls for the 4-lane wide-vector traits, on both the generic
// SSE machine and the AVX2 machine; all behavior is supplied by the bounds.
impl<S3: Copy, S4: Copy, NI: Copy> u32x4x4<Machine86<S3, S4, NI>> for u32x4x4_sse2<S3, S4, NI>
where
    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u32x4x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 4]>,
    u32x4x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u32x4>,
    u32x4x4_sse2<S3, S4, NI>: Vec4Ext<<Machine86<S3, S4, NI> as Machine>::u32x4>,
    u32x4x4_sse2<S3, S4, NI>: Vector<[u32; 16]>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x2x4<Machine86<S3, S4, NI>> for u64x2x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u64x2x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u64x2; 4]>,
    u64x2x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u64x2>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u128x4<Machine86<S3, S4, NI>> for u128x4_sse2<S3, S4, NI>
where
    u128x1_sse2<S3, S4, NI>: Swap64 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u128x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u128x1; 4]>,
    u128x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u128x1>,
    u128x4_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4x4>,
    u128x4_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2x4>,
{
}

// AVX2-machine versions, again restricted to the `YesS3`/`YesS4` tier.
impl<NI: Copy> u64x2x4<Avx2Machine<NI>> for u64x2x4_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u64x2x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u64x2; 4]>,
    u64x2x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u64x2>,
{
}
impl<NI: Copy> u128x4<Avx2Machine<NI>> for u128x4_sse2<YesS3, YesS4, NI>
where
    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + BSwap,
    Avx2Machine<NI>: Machine,
    u128x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u128x1; 4]>,
    u128x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u128x1>,
    u128x4_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u32x4x4>,
    u128x4_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x2x4>,
{
}
1036
/// Generates lane-wise `From` conversions between the software-composed
/// `x2<…>` and `x4<…>` wrappers of two 128-bit vector types: each element is
/// converted with `$to::from`, preserving element order.
macro_rules! impl_into_x {
    ($from:ident, $to:ident) => {
        // `Gf`/`Gt` are the (phantom) group tags of the source and target
        // `x2` wrappers; the conversion is tag-agnostic.
        impl<S3: Copy, S4: Copy, NI: Copy, Gf, Gt> From<x2<$from<S3, S4, NI>, Gf>>
            for x2<$to<S3, S4, NI>, Gt>
        {
            #[inline(always)]
            fn from(x: x2<$from<S3, S4, NI>, Gf>) -> Self {
                x2::new([$to::from(x.0[0]), $to::from(x.0[1])])
            }
        }
        impl<S3: Copy, S4: Copy, NI: Copy> From<x4<$from<S3, S4, NI>>> for x4<$to<S3, S4, NI>> {
            #[inline(always)]
            fn from(x: x4<$from<S3, S4, NI>>) -> Self {
                x4::new([
                    $to::from(x.0[0]),
                    $to::from(x.0[1]),
                    $to::from(x.0[2]),
                    $to::from(x.0[3]),
                ])
            }
        }
    };
}
// u128 lanes can be reinterpreted as u64x2 or u32x4 lanes.
impl_into_x!(u128x1_sse2, u64x2_sse2);
impl_into_x!(u128x1_sse2, u32x4_sse2);
1062
1063///// Debugging
1064
1065use core::fmt::{Debug, Formatter, Result};
1066
1067impl<W: PartialEq, G> PartialEq for x2<W, G> {
1068    #[inline(always)]
1069    fn eq(&self, rhs: &Self) -> bool {
1070        self.0[0] == rhs.0[0] && self.0[1] == rhs.0[1]
1071    }
1072}
1073
/// 128-bit equality check using the SSE4.1 64-bit compare.
///
/// `_mm_cmpeq_epi64` yields all-ones in each 64-bit lane that matches. The
/// dword shuffle with control `0b1100_0110` places one dword taken from each
/// 64-bit lane into the low 64 bits of `q`, so a single `_mm_cvtsi128_si64`
/// extraction is `-1` iff *both* lanes compared equal.
#[allow(unused)]
#[inline(always)]
unsafe fn eq128_s4(x: __m128i, y: __m128i) -> bool {
    let q = _mm_shuffle_epi32(_mm_cmpeq_epi64(x, y), 0b1100_0110);
    _mm_cvtsi128_si64(q) == -1
}
1080
/// 128-bit equality check using only SSE2 instructions.
#[inline(always)]
unsafe fn eq128_s2(x: __m128i, y: __m128i) -> bool {
    // Per-dword compare produces 0x0000_0000 / 0xffff_ffff lanes; gathering
    // one sign bit per byte yields 0xffff exactly when every lane matched.
    let mask = _mm_movemask_epi8(_mm_cmpeq_epi32(x, y));
    mask == 0xffff
}
1088
impl<S3, S4, NI> PartialEq for u32x4_sse2<S3, S4, NI> {
    /// Exact 128-bit comparison via the SSE2-only helper above.
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        unsafe { eq128_s2(self.x, rhs.x) }
    }
}
1095impl<S3, S4, NI> Debug for u32x4_sse2<S3, S4, NI>
1096where
1097    Self: Copy + MultiLane<[u32; 4]>,
1098{
1099    #[cold]
1100    fn fmt(&self, fmt: &mut Formatter) -> Result {
1101        fmt.write_fmt(format_args!("{:08x?}", &self.to_lanes()))
1102    }
1103}
1104
impl<S3, S4, NI> PartialEq for u64x2_sse2<S3, S4, NI> {
    /// Exact 128-bit comparison; the dword-granular helper is sufficient
    /// because it checks all 128 bits regardless of lane width.
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        unsafe { eq128_s2(self.x, rhs.x) }
    }
}
1111impl<S3, S4, NI> Debug for u64x2_sse2<S3, S4, NI>
1112where
1113    Self: Copy + MultiLane<[u64; 2]>,
1114{
1115    #[cold]
1116    fn fmt(&self, fmt: &mut Formatter) -> Result {
1117        fmt.write_fmt(format_args!("{:016x?}", &self.to_lanes()))
1118    }
1119}
1120
1121impl<S3, S4, NI> Debug for u64x4_sse2<S3, S4, NI>
1122where
1123    u64x2_sse2<S3, S4, NI>: Copy + MultiLane<[u64; 2]>,
1124{
1125    #[cold]
1126    fn fmt(&self, fmt: &mut Formatter) -> Result {
1127        let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes());
1128        fmt.write_fmt(format_args!("{:016x?}", &[a[0], a[1], b[0], b[1]]))
1129    }
1130}
1131
1132#[cfg(test)]
1133#[cfg(target_arch = "x86_64")]
1134mod test {
1135    use super::*;
1136    use crate::x86_64::{SSE2, SSE41, SSSE3};
1137    use crate::Machine;
1138
1139    #[test]
1140    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
1141    fn test_bswap32_s2_vs_s3() {
1142        let xs = [0x0f0e_0d0c, 0x0b0a_0908, 0x0706_0504, 0x0302_0100];
1143        let ys = [0x0c0d_0e0f, 0x0809_0a0b, 0x0405_0607, 0x0001_0203];
1144
1145        let s2 = unsafe { SSE2::instance() };
1146        let s3 = unsafe { SSSE3::instance() };
1147
1148        let x_s2 = {
1149            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
1150            x_s2.bswap()
1151        };
1152
1153        let x_s3 = {
1154            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
1155            x_s3.bswap()
1156        };
1157
1158        assert_eq!(x_s2, transmute!(x_s3));
1159        assert_eq!(x_s2, s2.vec(ys));
1160    }
1161
1162    #[test]
1163    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
1164    fn test_bswap64_s2_vs_s3() {
1165        let xs = [0x0f0e_0d0c_0b0a_0908, 0x0706_0504_0302_0100];
1166        let ys = [0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607];
1167
1168        let s2 = unsafe { SSE2::instance() };
1169        let s3 = unsafe { SSSE3::instance() };
1170
1171        let x_s2 = {
1172            let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
1173            x_s2.bswap()
1174        };
1175
1176        let x_s3 = {
1177            let x_s3: <SSSE3 as Machine>::u64x2 = s3.vec(xs);
1178            x_s3.bswap()
1179        };
1180
1181        assert_eq!(x_s2, s2.vec(ys));
1182        assert_eq!(x_s3, transmute!(x_s3));
1183    }
1184
1185    #[test]
1186    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
1187    fn test_shuffle32_s2_vs_s3() {
1188        let xs = [0x0, 0x1, 0x2, 0x3];
1189        let ys = [0x2, 0x3, 0x0, 0x1];
1190        let zs = [0x1, 0x2, 0x3, 0x0];
1191
1192        let s2 = unsafe { SSE2::instance() };
1193        let s3 = unsafe { SSSE3::instance() };
1194
1195        let x_s2 = {
1196            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
1197            x_s2.shuffle2301()
1198        };
1199        let x_s3 = {
1200            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
1201            x_s3.shuffle2301()
1202        };
1203        assert_eq!(x_s2, s2.vec(ys));
1204        assert_eq!(x_s3, transmute!(x_s3));
1205
1206        let x_s2 = {
1207            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
1208            x_s2.shuffle3012()
1209        };
1210        let x_s3 = {
1211            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
1212            x_s3.shuffle3012()
1213        };
1214        assert_eq!(x_s2, s2.vec(zs));
1215        assert_eq!(x_s3, transmute!(x_s3));
1216
1217        let x_s2 = x_s2.shuffle1230();
1218        let x_s3 = x_s3.shuffle1230();
1219        assert_eq!(x_s2, s2.vec(xs));
1220        assert_eq!(x_s3, transmute!(x_s3));
1221    }
1222
1223    #[test]
1224    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
1225    fn test_shuffle64_s2_vs_s3() {
1226        let xs = [0x0, 0x1, 0x2, 0x3];
1227        let ys = [0x2, 0x3, 0x0, 0x1];
1228        let zs = [0x1, 0x2, 0x3, 0x0];
1229
1230        let s2 = unsafe { SSE2::instance() };
1231        let s3 = unsafe { SSSE3::instance() };
1232
1233        let x_s2 = {
1234            let x_s2: <SSE2 as Machine>::u64x4 = s2.vec(xs);
1235            x_s2.shuffle2301()
1236        };
1237        let x_s3 = {
1238            let x_s3: <SSSE3 as Machine>::u64x4 = s3.vec(xs);
1239            x_s3.shuffle2301()
1240        };
1241        assert_eq!(x_s2, s2.vec(ys));
1242        assert_eq!(x_s3, transmute!(x_s3));
1243
1244        let x_s2 = {
1245            let x_s2: <SSE2 as Machine>::u64x4 = s2.vec(xs);
1246            x_s2.shuffle3012()
1247        };
1248        let x_s3 = {
1249            let x_s3: <SSSE3 as Machine>::u64x4 = s3.vec(xs);
1250            x_s3.shuffle3012()
1251        };
1252        assert_eq!(x_s2, s2.vec(zs));
1253        assert_eq!(x_s3, transmute!(x_s3));
1254
1255        let x_s2 = x_s2.shuffle1230();
1256        let x_s3 = x_s3.shuffle1230();
1257        assert_eq!(x_s2, s2.vec(xs));
1258        assert_eq!(x_s3, transmute!(x_s3));
1259    }
1260
1261    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
1262    #[test]
1263    fn test_lanes_u32x4() {
1264        let xs = [0x1, 0x2, 0x3, 0x4];
1265
1266        let s2 = unsafe { SSE2::instance() };
1267        let s3 = unsafe { SSSE3::instance() };
1268        let s4 = unsafe { SSE41::instance() };
1269
1270        {
1271            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
1272            let y_s2 = <SSE2 as Machine>::u32x4::from_lanes(xs);
1273            assert_eq!(x_s2, y_s2);
1274            assert_eq!(xs, y_s2.to_lanes());
1275        }
1276
1277        {
1278            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
1279            let y_s3 = <SSSE3 as Machine>::u32x4::from_lanes(xs);
1280            assert_eq!(x_s3, y_s3);
1281            assert_eq!(xs, y_s3.to_lanes());
1282        }
1283
1284        {
1285            let x_s4: <SSE41 as Machine>::u32x4 = s4.vec(xs);
1286            let y_s4 = <SSE41 as Machine>::u32x4::from_lanes(xs);
1287            assert_eq!(x_s4, y_s4);
1288            assert_eq!(xs, y_s4.to_lanes());
1289        }
1290    }
1291
1292    #[test]
1293    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
1294    fn test_lanes_u64x2() {
1295        let xs = [0x1, 0x2];
1296
1297        let s2 = unsafe { SSE2::instance() };
1298        let s3 = unsafe { SSSE3::instance() };
1299        let s4 = unsafe { SSE41::instance() };
1300
1301        {
1302            let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
1303            let y_s2 = <SSE2 as Machine>::u64x2::from_lanes(xs);
1304            assert_eq!(x_s2, y_s2);
1305            assert_eq!(xs, y_s2.to_lanes());
1306        }
1307
1308        {
1309            let x_s3: <SSSE3 as Machine>::u64x2 = s3.vec(xs);
1310            let y_s3 = <SSSE3 as Machine>::u64x2::from_lanes(xs);
1311            assert_eq!(x_s3, y_s3);
1312            assert_eq!(xs, y_s3.to_lanes());
1313        }
1314
1315        {
1316            let x_s4: <SSE41 as Machine>::u64x2 = s4.vec(xs);
1317            let y_s4 = <SSE41 as Machine>::u64x2::from_lanes(xs);
1318            assert_eq!(x_s4, y_s4);
1319            assert_eq!(xs, y_s4.to_lanes());
1320        }
1321    }
1322
1323    #[test]
1324    fn test_vec4_u32x4_s2() {
1325        let xs = [1, 2, 3, 4];
1326        let s2 = unsafe { SSE2::instance() };
1327        let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
1328        assert_eq!(x_s2.extract(0), 1);
1329        assert_eq!(x_s2.extract(1), 2);
1330        assert_eq!(x_s2.extract(2), 3);
1331        assert_eq!(x_s2.extract(3), 4);
1332        assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2, 3, 4]));
1333        assert_eq!(x_s2.insert(0xf, 1), s2.vec([1, 0xf, 3, 4]));
1334        assert_eq!(x_s2.insert(0xf, 2), s2.vec([1, 2, 0xf, 4]));
1335        assert_eq!(x_s2.insert(0xf, 3), s2.vec([1, 2, 3, 0xf]));
1336    }
1337
1338    #[test]
1339    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
1340    fn test_vec4_u32x4_s4() {
1341        let xs = [1, 2, 3, 4];
1342        let s4 = unsafe { SSE41::instance() };
1343        let x_s4: <SSE41 as Machine>::u32x4 = s4.vec(xs);
1344        assert_eq!(x_s4.extract(0), 1);
1345        assert_eq!(x_s4.extract(1), 2);
1346        assert_eq!(x_s4.extract(2), 3);
1347        assert_eq!(x_s4.extract(3), 4);
1348        assert_eq!(x_s4.insert(0xf, 0), s4.vec([0xf, 2, 3, 4]));
1349        assert_eq!(x_s4.insert(0xf, 1), s4.vec([1, 0xf, 3, 4]));
1350        assert_eq!(x_s4.insert(0xf, 2), s4.vec([1, 2, 0xf, 4]));
1351        assert_eq!(x_s4.insert(0xf, 3), s4.vec([1, 2, 3, 0xf]));
1352    }
1353
1354    #[test]
1355    fn test_vec2_u64x2_s2() {
1356        let xs = [0x1, 0x2];
1357        let s2 = unsafe { SSE2::instance() };
1358        let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
1359        assert_eq!(x_s2.extract(0), 1);
1360        assert_eq!(x_s2.extract(1), 2);
1361        assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2]));
1362        assert_eq!(x_s2.insert(0xf, 1), s2.vec([1, 0xf]));
1363    }
1364
1365    #[test]
1366    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
1367    fn test_vec4_u64x2_s4() {
1368        let xs = [0x1, 0x2];
1369        let s4 = unsafe { SSE41::instance() };
1370        let x_s4: <SSE41 as Machine>::u64x2 = s4.vec(xs);
1371        assert_eq!(x_s4.extract(0), 1);
1372        assert_eq!(x_s4.extract(1), 2);
1373        assert_eq!(x_s4.insert(0xf, 0), s4.vec([0xf, 2]));
1374        assert_eq!(x_s4.insert(0xf, 1), s4.vec([1, 0xf]));
1375    }
1376}
1377
1378pub mod avx2 {
1379    #![allow(non_camel_case_types)]
1380    use crate::soft::{x2, x4};
1381    use crate::types::*;
1382    use crate::x86_64::sse2::{u128x1_sse2, u32x4_sse2, G0};
1383    use crate::x86_64::{vec256_storage, vec512_storage, Avx2Machine, YesS3, YesS4};
1384    use core::arch::x86_64::*;
1385    use core::marker::PhantomData;
1386    use core::ops::*;
1387    use zerocopy::{transmute, AsBytes, FromBytes, FromZeroes};
1388
    /// Two `u32x4` lanes packed into a single 256-bit AVX2 register.
    #[derive(Copy, Clone, FromBytes, AsBytes, FromZeroes)]
    #[repr(transparent)]
    pub struct u32x4x2_avx2<NI> {
        x: __m256i,
        ni: PhantomData<NI>,
    }

    impl<NI> u32x4x2_avx2<NI> {
        /// Wraps a raw 256-bit register value.
        #[inline(always)]
        fn new(x: __m256i) -> Self {
            Self { x, ni: PhantomData }
        }
    }
1402
    // Marker impl: the `u32x4x2` capabilities come from the impls below.
    impl<NI> u32x4x2<Avx2Machine<NI>> for u32x4x2_avx2<NI> where NI: Copy {}
    impl<NI> Store<vec256_storage> for u32x4x2_avx2<NI> {
        /// Reinterprets generic 256-bit storage (union field `avx`) as this
        /// vector type.
        #[inline(always)]
        unsafe fn unpack(p: vec256_storage) -> Self {
            Self::new(p.avx)
        }
    }
    impl<NI> StoreBytes for u32x4x2_avx2<NI> {
        /// Loads exactly 32 little-endian bytes (panics on any other length).
        #[inline(always)]
        unsafe fn unsafe_read_le(input: &[u8]) -> Self {
            assert_eq!(input.len(), 32);
            Self::new(_mm256_loadu_si256(input.as_ptr() as *const _))
        }
        /// Big-endian load: LE load followed by a per-word byte swap.
        #[inline(always)]
        unsafe fn unsafe_read_be(input: &[u8]) -> Self {
            Self::unsafe_read_le(input).bswap()
        }
        /// Stores exactly 32 little-endian bytes (panics on any other length).
        #[inline(always)]
        fn write_le(self, out: &mut [u8]) {
            unsafe {
                assert_eq!(out.len(), 32);
                _mm256_storeu_si256(out.as_mut_ptr() as *mut _, self.x)
            }
        }
        /// Big-endian store: byte swap followed by an LE store.
        #[inline(always)]
        fn write_be(self, out: &mut [u8]) {
            self.bswap().write_le(out)
        }
    }
    impl<NI> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 2]> for u32x4x2_avx2<NI> {
        /// Splits the 256-bit register into its two 128-bit halves
        /// (element 0 = low half).
        #[inline(always)]
        fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 2] {
            unsafe {
                [
                    u32x4_sse2::new(_mm256_extracti128_si256(self.x, 0)),
                    u32x4_sse2::new(_mm256_extracti128_si256(self.x, 1)),
                ]
            }
        }
        /// Packs two 128-bit halves back into one register
        /// (`setr` places `x[0]` in the low half).
        #[inline(always)]
        fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 2]) -> Self {
            Self::new(unsafe { _mm256_setr_m128i(x[0].x, x[1].x) })
        }
    }
    impl<NI> Vec2<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x2_avx2<NI> {
        /// Extracts 128-bit lane `i` (0 = low half, 1 = high half); panics
        /// otherwise. The match keeps the intrinsic's lane index a literal,
        /// as the immediate operand must be a compile-time constant.
        #[inline(always)]
        fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> {
            unsafe {
                match i {
                    0 => u32x4_sse2::new(_mm256_extracti128_si256(self.x, 0)),
                    1 => u32x4_sse2::new(_mm256_extracti128_si256(self.x, 1)),
                    _ => panic!(),
                }
            }
        }
        /// Returns a copy with 128-bit lane `i` (0 or 1) replaced by `w`;
        /// panics otherwise.
        #[inline(always)]
        fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self {
            Self::new(unsafe {
                match i {
                    0 => _mm256_inserti128_si256(self.x, w.x, 0),
                    1 => _mm256_inserti128_si256(self.x, w.x, 1),
                    _ => panic!(),
                }
            })
        }
    }
    impl<NI> BitOps32 for u32x4x2_avx2<NI> where NI: Copy {}
    impl<NI> ArithOps for u32x4x2_avx2<NI> where NI: Copy {}
    /// Emits a method that permutes bytes with `vpshufb`. `$k0`/`$k1` are the
    /// two 64-bit halves of the 16-byte shuffle pattern; passing them twice
    /// to `_mm256_set_epi64x` repeats the pattern in both 128-bit lanes.
    macro_rules! shuf_lane_bytes {
        ($name:ident, $k0:expr, $k1:expr) => {
            #[inline(always)]
            fn $name(self) -> Self {
                Self::new(unsafe {
                    _mm256_shuffle_epi8(self.x, _mm256_set_epi64x($k0, $k1, $k0, $k1))
                })
            }
        };
    }
    /// Emits a per-32-bit-word right-rotation: `(x >> i) | (x << (32 - i))`.
    macro_rules! rotr_32 {
        ($name:ident, $i:expr) => {
            #[inline(always)]
            fn $name(self) -> Self {
                Self::new(unsafe {
                    _mm256_or_si256(
                        _mm256_srli_epi32(self.x, $i as i32),
                        _mm256_slli_epi32(self.x, 32 - $i as i32),
                    )
                })
            }
        };
    }
    impl<NI: Copy> RotateEachWord32 for u32x4x2_avx2<NI> {
        // Byte-multiple rotations (8/16/24) use a single byte shuffle; the
        // remaining amounts use the two-shift-plus-or form.
        rotr_32!(rotate_each_word_right7, 7);
        shuf_lane_bytes!(
            rotate_each_word_right8,
            0x0c0f_0e0d_080b_0a09,
            0x0407_0605_0003_0201
        );
        rotr_32!(rotate_each_word_right11, 11);
        rotr_32!(rotate_each_word_right12, 12);
        shuf_lane_bytes!(
            rotate_each_word_right16,
            0x0d0c_0f0e_0908_0b0a,
            0x0504_0706_0100_0302
        );
        rotr_32!(rotate_each_word_right20, 20);
        shuf_lane_bytes!(
            rotate_each_word_right24,
            0x0e0d_0c0f_0a09_080b,
            0x0605_0407_0201_0003
        );
        rotr_32!(rotate_each_word_right25, 25);
    }
    impl<NI> BitOps0 for u32x4x2_avx2<NI> where NI: Copy {}
    impl<NI> From<u32x4x2_avx2<NI>> for vec256_storage {
        /// Stores the raw register into the generic 256-bit storage union.
        #[inline(always)]
        fn from(x: u32x4x2_avx2<NI>) -> Self {
            Self { avx: x.x }
        }
    }
1523
    /// Derives a `*Assign` operator from the corresponding binary operator.
    macro_rules! impl_assign {
        ($vec:ident, $Assign:ident, $assign_fn:ident, $bin_fn:ident) => {
            impl<NI> $Assign for $vec<NI>
            where
                NI: Copy,
            {
                #[inline(always)]
                fn $assign_fn(&mut self, rhs: Self) {
                    *self = self.$bin_fn(rhs);
                }
            }
        };
    }
    impl_assign!(u32x4x2_avx2, BitXorAssign, bitxor_assign, bitxor);
    impl_assign!(u32x4x2_avx2, BitOrAssign, bitor_assign, bitor);
    impl_assign!(u32x4x2_avx2, BitAndAssign, bitand_assign, bitand);
    impl_assign!(u32x4x2_avx2, AddAssign, add_assign, add);

    /// Implements a lane-wise binary operator via one AVX2 intrinsic.
    macro_rules! impl_bitop {
        ($vec:ident, $Op:ident, $op_fn:ident, $impl_fn:ident) => {
            impl<NI> $Op for $vec<NI> {
                type Output = Self;
                #[inline(always)]
                fn $op_fn(self, rhs: Self) -> Self::Output {
                    Self::new(unsafe { $impl_fn(self.x, rhs.x) })
                }
            }
        };
    }
    impl_bitop!(u32x4x2_avx2, BitXor, bitxor, _mm256_xor_si256);
    impl_bitop!(u32x4x2_avx2, BitOr, bitor, _mm256_or_si256);
    impl_bitop!(u32x4x2_avx2, BitAnd, bitand, _mm256_and_si256);
    impl_bitop!(u32x4x2_avx2, AndNot, andnot, _mm256_andnot_si256);
    impl_bitop!(u32x4x2_avx2, Add, add, _mm256_add_epi32);
1558
1559    impl<NI> Not for u32x4x2_avx2<NI> {
1560        type Output = Self;
1561        #[inline(always)]
1562        fn not(self) -> Self::Output {
1563            unsafe {
1564                let f = _mm256_set1_epi8(-0x7f);
1565                Self::new(f) ^ self
1566            }
1567        }
1568    }
1569
    impl<NI> BSwap for u32x4x2_avx2<NI> {
        // Byte-shuffle pattern that reverses the 4 bytes within each 32-bit
        // word of both 128-bit lanes.
        shuf_lane_bytes!(bswap, 0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203);
    }

    impl<NI> From<x2<u128x1_sse2<YesS3, YesS4, NI>, G0>> for u32x4x2_avx2<NI>
    where
        NI: Copy,
    {
        /// Packs a software pair of 128-bit SSE values into one AVX2 register
        /// (`setr` puts element 0 in the low 128 bits).
        #[inline(always)]
        fn from(x: x2<u128x1_sse2<YesS3, YesS4, NI>, G0>) -> Self {
            Self::new(unsafe { _mm256_setr_m128i(x.0[0].x, x.0[1].x) })
        }
    }
1583
    impl<NI> LaneWords4 for u32x4x2_avx2<NI> {
        // `_mm256_shuffle_epi32` permutes dwords within each 128-bit lane
        // independently; the immediates encode the word rotations named in
        // the method (e.g. 0b1001_0011 selects dwords [3,0,1,2] per lane).
        #[inline(always)]
        fn shuffle_lane_words1230(self) -> Self {
            Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b1001_0011) })
        }
        #[inline(always)]
        fn shuffle_lane_words2301(self) -> Self {
            Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b0100_1110) })
        }
        #[inline(always)]
        fn shuffle_lane_words3012(self) -> Self {
            Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b0011_1001) })
        }
    }
1598
1599    ///////////////////////////////////////////////////////////////////////////////////////////
1600
    /// 512 bits of `u32` state, modeled as a software pair of AVX2 registers.
    pub type u32x4x4_avx2<NI> = x2<u32x4x2_avx2<NI>, G0>;
    impl<NI: Copy> u32x4x4<Avx2Machine<NI>> for u32x4x4_avx2<NI> {}
1603
    impl<NI: Copy> Store<vec512_storage> for u32x4x4_avx2<NI> {
        /// Unpacks 512-bit storage as two 256-bit AVX2 halves.
        #[inline(always)]
        unsafe fn unpack(p: vec512_storage) -> Self {
            Self::new([
                u32x4x2_avx2::unpack(p.avx[0]),
                u32x4x2_avx2::unpack(p.avx[1]),
            ])
        }
    }
    impl<NI: Copy> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 4]> for u32x4x4_avx2<NI> {
        /// Flattens the two 256-bit halves into four 128-bit lanes in order.
        #[inline(always)]
        fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 4] {
            let [a, b] = self.0[0].to_lanes();
            let [c, d] = self.0[1].to_lanes();
            [a, b, c, d]
        }
        /// Repacks four 128-bit lanes into two 256-bit halves.
        #[inline(always)]
        fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 4]) -> Self {
            let ab = u32x4x2_avx2::from_lanes([x[0], x[1]]);
            let cd = u32x4x2_avx2::from_lanes([x[2], x[3]]);
            Self::new([ab, cd])
        }
    }
1627    impl<NI: Copy> Vec4<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> {
1628        #[inline(always)]
1629        fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> {
1630            match i {
1631                0 => self.0[0].extract(0),
1632                1 => self.0[0].extract(1),
1633                2 => self.0[1].extract(0),
1634                3 => self.0[1].extract(1),
1635                _ => panic!(),
1636            }
1637        }
1638        #[inline(always)]
1639        fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self {
1640            Self::new(match i {
1641                0 | 1 => [self.0[0].insert(w, i), self.0[1]],
1642                2 | 3 => [self.0[0], self.0[1].insert(w, i - 2)],
1643                _ => panic!(),
1644            })
1645        }
1646    }
    impl<NI: Copy> Vec4Ext<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> {
        /// 4x4 transpose of 128-bit cells across four vectors, done with
        /// `vperm2i128` only: control 0x20 gathers the two low 128-bit halves
        /// of its operands, 0x31 the two high halves.
        #[inline(always)]
        fn transpose4(a: Self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) {
            /*
             * a00:a01 a10:a11
             * b00:b01 b10:b11
             * c00:c01 c10:c11
             * d00:d01 d10:d11
             *       =>
             * a00:b00 c00:d00
             * a01:b01 c01:d01
             * a10:b10 c10:d10
             * a11:b11 c11:d11
             */
            unsafe {
                let ab00 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[0].x, b.0[0].x, 0x20));
                let ab01 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[0].x, b.0[0].x, 0x31));
                let ab10 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[1].x, b.0[1].x, 0x20));
                let ab11 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[1].x, b.0[1].x, 0x31));
                let cd00 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[0].x, d.0[0].x, 0x20));
                let cd01 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[0].x, d.0[0].x, 0x31));
                let cd10 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[1].x, d.0[1].x, 0x20));
                let cd11 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[1].x, d.0[1].x, 0x31));
                (
                    Self::new([ab00, cd00]),
                    Self::new([ab01, cd01]),
                    Self::new([ab10, cd10]),
                    Self::new([ab11, cd11]),
                )
            }
        }
    }
    impl<NI: Copy> Vector<[u32; 16]> for u32x4x4_avx2<NI> {
        /// Reinterprets the 512 bits of state as 16 `u32` scalars via a
        /// size-preserving `zerocopy` transmute.
        #[inline(always)]
        fn to_scalars(self) -> [u32; 16] {
            transmute!(self)
        }
    }
    impl<NI: Copy> From<u32x4x4_avx2<NI>> for vec512_storage {
        /// Stores both 256-bit halves into generic 512-bit storage.
        #[inline(always)]
        fn from(x: u32x4x4_avx2<NI>) -> Self {
            Self {
                avx: [
                    vec256_storage { avx: x.0[0].x },
                    vec256_storage { avx: x.0[1].x },
                ],
            }
        }
    }
    impl<NI: Copy> From<x4<u128x1_sse2<YesS3, YesS4, NI>>> for u32x4x4_avx2<NI> {
        /// Packs four software 128-bit SSE values into two AVX2 registers
        /// (`setr` puts the lower-indexed element in the low half).
        #[inline(always)]
        fn from(x: x4<u128x1_sse2<YesS3, YesS4, NI>>) -> Self {
            Self::new(unsafe {
                [
                    u32x4x2_avx2::new(_mm256_setr_m128i(x.0[0].x, x.0[1].x)),
                    u32x4x2_avx2::new(_mm256_setr_m128i(x.0[2].x, x.0[3].x)),
                ]
            })
        }
    }
1707}