1use crate::soft::{x2, x4};
2use crate::types::*;
3use crate::vec128_storage;
4use crate::x86_64::Avx2Machine;
5use crate::x86_64::SseMachine as Machine86;
6use crate::x86_64::{NoS3, NoS4, YesS3, YesS4};
7use core::arch::x86_64::*;
8use core::marker::PhantomData;
9use core::ops::{
10 Add, AddAssign, BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign, Not,
11};
12use zerocopy::{transmute, AsBytes, FromBytes, FromZeroes};
13
/// Expands to a binary-operator impl that forwards `$trait::$fn` directly to
/// the named SSE intrinsic on the wrapped `__m128i` values.
macro_rules! impl_binop {
    ($vec:ident, $trait:ident, $fn:ident, $intrinsic:ident) => {
        impl<S3, S4, NI> $trait for $vec<S3, S4, NI> {
            type Output = Self;
            #[inline(always)]
            fn $fn(self, rhs: Self) -> Self::Output {
                let raw = unsafe { $intrinsic(self.x, rhs.x) };
                Self::new(raw)
            }
        }
    };
}
25
/// Compound-assignment twin of `impl_binop`: `a op= b` delegates to the plain
/// operator (`$fn`) and writes the result back through `*self`.
macro_rules! impl_binop_assign {
    ($vec:ident, $trait:ident, $fn_assign:ident, $fn:ident) => {
        impl<S3, S4, NI> $trait for $vec<S3, S4, NI>
        where
            $vec<S3, S4, NI>: Copy,
        {
            #[inline(always)]
            fn $fn_assign(&mut self, rhs: Self) {
                let updated = self.$fn(rhs);
                *self = updated;
            }
        }
    };
}
39
/// Defines one SSE2-backed vector type wrapping a single `__m128i`, plus its
/// storage conversions, constructor, byte-level load/store, `Default`, `Not`,
/// the bitwise operators, and `AndNot`.
/// `$word` names the scalar lane type; it is currently unused in the body.
macro_rules! def_vec {
    ($vec:ident, $word:ident) => {
        #[allow(non_camel_case_types)]
        #[derive(Copy, Clone, FromBytes, AsBytes, FromZeroes)]
        #[repr(transparent)]
        pub struct $vec<S3, S4, NI> {
            x: __m128i,
            // Zero-sized markers recording SSSE3/SSE4.1/"NI" availability at
            // the type level; layout is exactly `__m128i` (repr(transparent)).
            s3: PhantomData<S3>,
            s4: PhantomData<S4>,
            ni: PhantomData<NI>,
        }

        impl<S3, S4, NI> Store<vec128_storage> for $vec<S3, S4, NI> {
            // Reads the `sse2` variant of the storage union; caller must
            // uphold whatever invariant `Store::unpack` documents.
            #[inline(always)]
            unsafe fn unpack(x: vec128_storage) -> Self {
                Self::new(x.sse2)
            }
        }
        impl<S3, S4, NI> From<$vec<S3, S4, NI>> for vec128_storage {
            #[inline(always)]
            fn from(x: $vec<S3, S4, NI>) -> Self {
                vec128_storage { sse2: x.x }
            }
        }
        impl<S3, S4, NI> $vec<S3, S4, NI> {
            // Internal constructor: wrap a raw `__m128i`.
            #[inline(always)]
            fn new(x: __m128i) -> Self {
                $vec {
                    x,
                    s3: PhantomData,
                    s4: PhantomData,
                    ni: PhantomData,
                }
            }
        }

        impl<S3, S4, NI> StoreBytes for $vec<S3, S4, NI>
        where
            Self: BSwap,
        {
            // Unaligned 16-byte little-endian load; asserts the exact length.
            #[inline(always)]
            unsafe fn unsafe_read_le(input: &[u8]) -> Self {
                assert_eq!(input.len(), 16);
                Self::new(_mm_loadu_si128(input.as_ptr() as *const _))
            }
            // Big-endian load = little-endian load followed by a per-word
            // byte swap (hence the `Self: BSwap` bound).
            #[inline(always)]
            unsafe fn unsafe_read_be(input: &[u8]) -> Self {
                assert_eq!(input.len(), 16);
                Self::new(_mm_loadu_si128(input.as_ptr() as *const _)).bswap()
            }
            #[inline(always)]
            fn write_le(self, out: &mut [u8]) {
                assert_eq!(out.len(), 16);
                unsafe { _mm_storeu_si128(out.as_mut_ptr() as *mut _, self.x) }
            }
            #[inline(always)]
            fn write_be(self, out: &mut [u8]) {
                assert_eq!(out.len(), 16);
                let x = self.bswap().x;
                unsafe {
                    _mm_storeu_si128(out.as_mut_ptr() as *mut _, x);
                }
            }
        }

        impl<S3, S4, NI> Default for $vec<S3, S4, NI> {
            // All-zero vector.
            #[inline(always)]
            fn default() -> Self {
                Self::new(unsafe { _mm_setzero_si128() })
            }
        }

        impl<S3, S4, NI> Not for $vec<S3, S4, NI> {
            type Output = Self;
            // Bitwise complement via XOR with all-ones.
            #[inline(always)]
            fn not(self) -> Self::Output {
                unsafe {
                    let ff = _mm_set1_epi64x(-1i64);
                    self ^ Self::new(ff)
                }
            }
        }

        impl<S3: Copy, S4: Copy, NI: Copy> BitOps0 for $vec<S3, S4, NI> {}
        impl_binop!($vec, BitAnd, bitand, _mm_and_si128);
        impl_binop!($vec, BitOr, bitor, _mm_or_si128);
        impl_binop!($vec, BitXor, bitxor, _mm_xor_si128);
        impl_binop_assign!($vec, BitAndAssign, bitand_assign, bitand);
        impl_binop_assign!($vec, BitOrAssign, bitor_assign, bitor);
        impl_binop_assign!($vec, BitXorAssign, bitxor_assign, bitxor);
        impl<S3: Copy, S4: Copy, NI: Copy> AndNot for $vec<S3, S4, NI> {
            type Output = Self;
            // Matches `_mm_andnot_si128` semantics: `!self & rhs`.
            #[inline(always)]
            fn andnot(self, rhs: Self) -> Self {
                Self::new(unsafe { _mm_andnot_si128(self.x, rhs.x) })
            }
        }
    };
}
139
/// Marker impl: a vector gains `BitOps32` once it can rotate its 32-bit words.
macro_rules! impl_bitops32 {
    ($ty:ident) => {
        impl<S3: Copy, S4: Copy, NI: Copy> BitOps32 for $ty<S3, S4, NI> where
            $ty<S3, S4, NI>: RotateEachWord32
        {
        }
    };
}
148
/// Marker impl: `BitOps64` needs both 64- and 32-bit word rotates, and also
/// pulls in the 32-bit marker via `impl_bitops32!`.
macro_rules! impl_bitops64 {
    ($ty:ident) => {
        impl_bitops32!($ty);
        impl<S3: Copy, S4: Copy, NI: Copy> BitOps64 for $ty<S3, S4, NI> where
            $ty<S3, S4, NI>: RotateEachWord64 + RotateEachWord32
        {
        }
    };
}
158
/// Marker impl: `BitOps128` additionally requires 128-bit word rotates, on
/// top of everything `impl_bitops64!` provides.
macro_rules! impl_bitops128 {
    ($ty:ident) => {
        impl_bitops64!($ty);
        impl<S3: Copy, S4: Copy, NI: Copy> BitOps128 for $ty<S3, S4, NI> where
            $ty<S3, S4, NI>: RotateEachWord128
        {
        }
    };
}
168
/// 32-bit word rotate for byte-multiple amounts (8/16/24) as a single SSSE3
/// `pshufb`; `$k0`/`$k1` are the high/low 64-bit halves of the byte-index mask.
macro_rules! rotr_32_s3 {
    ($name:ident, $k0:expr, $k1:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) })
        }
    };
}
/// Generic 32-bit word rotate-right by `$i` bits:
/// `(x >> i) | (x << (32 - i))` on every lane.
macro_rules! rotr_32 {
    ($name:ident, $i:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe {
                _mm_or_si128(
                    _mm_srli_epi32(self.x, $i as i32),
                    _mm_slli_epi32(self.x, 32 - $i as i32),
                )
            })
        }
    };
}
// With SSSE3 available: byte-aligned rotate amounts (8/16/24) use a single
// pshufb byte permutation; the others fall back to the shift-and-or form.
impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<YesS3, S4, NI> {
    rotr_32!(rotate_each_word_right7, 7);
    rotr_32_s3!(
        rotate_each_word_right8,
        0x0c0f_0e0d_080b_0a09,
        0x0407_0605_0003_0201
    );
    rotr_32!(rotate_each_word_right11, 11);
    rotr_32!(rotate_each_word_right12, 12);
    rotr_32_s3!(
        rotate_each_word_right16,
        0x0d0c_0f0e_0908_0b0a,
        0x0504_0706_0100_0302
    );
    rotr_32!(rotate_each_word_right20, 20);
    rotr_32_s3!(
        rotate_each_word_right24,
        0x0e0d_0c0f_0a09_080b,
        0x0605_0407_0201_0003
    );
    rotr_32!(rotate_each_word_right25, 25);
}
// Without SSSE3: everything is shift-and-or, except rotate-by-16, which is a
// pair of 16-bit-lane shuffles (swap16_s2) available in plain SSE2.
impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<NoS3, S4, NI> {
    rotr_32!(rotate_each_word_right7, 7);
    rotr_32!(rotate_each_word_right8, 8);
    rotr_32!(rotate_each_word_right11, 11);
    rotr_32!(rotate_each_word_right12, 12);
    #[inline(always)]
    fn rotate_each_word_right16(self) -> Self {
        Self::new(swap16_s2(self.x))
    }
    rotr_32!(rotate_each_word_right20, 20);
    rotr_32!(rotate_each_word_right24, 24);
    rotr_32!(rotate_each_word_right25, 25);
}
225
/// 64-bit word rotate for byte-multiple amounts via one SSSE3 `pshufb`;
/// `$k0`/`$k1` are the high/low halves of the byte-index mask.
macro_rules! rotr_64_s3 {
    ($name:ident, $k0:expr, $k1:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) })
        }
    };
}
/// Generic 64-bit word rotate-right by `$i` bits:
/// `(x >> i) | (x << (64 - i))` on each 64-bit lane.
macro_rules! rotr_64 {
    ($name:ident, $i:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe {
                _mm_or_si128(
                    _mm_srli_epi64(self.x, $i as i32),
                    _mm_slli_epi64(self.x, 64 - $i as i32),
                )
            })
        }
    };
}
// 64-bit lanes, SSSE3 path: byte-aligned amounts (8/16/24) via pshufb,
// the rest via the shift-and-or macro.
impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<YesS3, S4, NI> {
    rotr_64!(rotate_each_word_right7, 7);
    rotr_64_s3!(
        rotate_each_word_right8,
        0x080f_0e0d_0c0b_0a09,
        0x0007_0605_0403_0201
    );
    rotr_64!(rotate_each_word_right11, 11);
    rotr_64!(rotate_each_word_right12, 12);
    rotr_64_s3!(
        rotate_each_word_right16,
        0x0908_0f0e_0d0c_0b0a,
        0x0100_0706_0504_0302
    );
    rotr_64!(rotate_each_word_right20, 20);
    rotr_64_s3!(
        rotate_each_word_right24,
        0x0a09_080f_0e0d_0c0b,
        0x0201_0007_0605_0403
    );
    rotr_64!(rotate_each_word_right25, 25);
}
// 64-bit lanes, SSE2-only path: shift-and-or everywhere except rotate-by-16,
// which is the 16-bit-lane pair swap (swap16_s2).
impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<NoS3, S4, NI> {
    rotr_64!(rotate_each_word_right7, 7);
    rotr_64!(rotate_each_word_right8, 8);
    rotr_64!(rotate_each_word_right11, 11);
    rotr_64!(rotate_each_word_right12, 12);
    #[inline(always)]
    fn rotate_each_word_right16(self) -> Self {
        Self::new(swap16_s2(self.x))
    }
    rotr_64!(rotate_each_word_right20, 20);
    rotr_64!(rotate_each_word_right24, 24);
    rotr_64!(rotate_each_word_right25, 25);
}
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u64x2_sse2<S3, S4, NI> {
    #[inline(always)]
    fn rotate_each_word_right32(self) -> Self {
        // Rotating a 64-bit word by 32 just swaps its two 32-bit halves:
        // dword permutation control 0b10_11_00_01 = (d1,d0,d3,d2).
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b10110001) })
    }
}
288
/// Whole-vector "rotate" used to satisfy `RotateEachWord32/64` for the
/// one-word-per-vector `u128x1_sse2` type.
///
/// NOTE(review): `_mm_srli_si128`/`_mm_slli_si128` shift by *bytes*, not
/// bits, and their immediate must be in 0..=15 — `128 - $i` is out of range
/// (the intrinsic defines over-large counts as producing zero). So for the
/// bit counts used here this does not compute a 128-bit bit-rotation. These
/// impls look like they exist only to satisfy trait bounds; confirm no call
/// site actually exercises them before relying on the result.
macro_rules! rotr_128 {
    ($name:ident, $i:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe {
                _mm_or_si128(
                    _mm_srli_si128(self.x, $i as i32),
                    _mm_slli_si128(self.x, 128 - $i as i32),
                )
            })
        }
    };
}
302impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord32 for u128x1_sse2<S3, S4, NI> {
304 rotr_128!(rotate_each_word_right7, 7);
305 rotr_128!(rotate_each_word_right8, 8);
306 rotr_128!(rotate_each_word_right11, 11);
307 rotr_128!(rotate_each_word_right12, 12);
308 rotr_128!(rotate_each_word_right16, 16);
309 rotr_128!(rotate_each_word_right20, 20);
310 rotr_128!(rotate_each_word_right24, 24);
311 rotr_128!(rotate_each_word_right25, 25);
312}
313impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u128x1_sse2<S3, S4, NI> {
315 rotr_128!(rotate_each_word_right32, 32);
316}
317impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord128 for u128x1_sse2<S3, S4, NI> {}
318
// Instantiate the three SSE2-backed vector types (4x32, 2x64, 1x128 lanes).
def_vec!(u32x4_sse2, u32);
def_vec!(u64x2_sse2, u64);
def_vec!(u128x1_sse2, u128);
322
// SSE4.1 path: use pextrq/pinsrq to move 64-bit halves directly between the
// vector and general-purpose registers, splitting/packing u32 pairs in scalar.
impl<S3, NI> MultiLane<[u32; 4]> for u32x4_sse2<S3, YesS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u32; 4] {
        unsafe {
            let x = _mm_cvtsi128_si64(self.x) as u64;
            let y = _mm_extract_epi64(self.x, 1) as u64;
            [x as u32, (x >> 32) as u32, y as u32, (y >> 32) as u32]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u32; 4]) -> Self {
        unsafe {
            // Pack lanes 0/1 into the low qword, 2/3 into the high qword.
            let mut x = _mm_cvtsi64_si128((xs[0] as u64 | ((xs[1] as u64) << 32)) as i64);
            x = _mm_insert_epi64(x, (xs[2] as u64 | ((xs[3] as u64) << 32)) as i64, 1);
            Self::new(x)
        }
    }
}
// SSE2-only path: no pextrq/pinsrq, so the high qword is reached by moving it
// to the low position with a dword shuffle / byte shift first.
impl<S3, NI> MultiLane<[u32; 4]> for u32x4_sse2<S3, NoS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u32; 4] {
        unsafe {
            let x = _mm_cvtsi128_si64(self.x) as u64;
            // 0b11101110 replicates the high qword into the low position.
            let y = _mm_cvtsi128_si64(_mm_shuffle_epi32(self.x, 0b11101110)) as u64;
            [x as u32, (x >> 32) as u32, y as u32, (y >> 32) as u32]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u32; 4]) -> Self {
        unsafe {
            let x = (xs[0] as u64 | ((xs[1] as u64) << 32)) as i64;
            let y = (xs[2] as u64 | ((xs[3] as u64) << 32)) as i64;
            let x = _mm_cvtsi64_si128(x);
            // Shift the high pair up by 8 bytes and merge.
            let y = _mm_slli_si128(_mm_cvtsi64_si128(y), 8);
            Self::new(_mm_or_si128(x, y))
        }
    }
}
// SSE4.1 path: direct 64-bit lane extract/insert (pextrq/pinsrq).
impl<S3, NI> MultiLane<[u64; 2]> for u64x2_sse2<S3, YesS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u64; 2] {
        unsafe {
            [
                _mm_cvtsi128_si64(self.x) as u64,
                _mm_extract_epi64(self.x, 1) as u64,
            ]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u64; 2]) -> Self {
        unsafe {
            let mut x = _mm_cvtsi64_si128(xs[0] as i64);
            x = _mm_insert_epi64(x, xs[1] as i64, 1);
            Self::new(x)
        }
    }
}
// SSE2-only path: reach the high qword with byte shifts instead of pextrq,
// and assemble via shift + or instead of pinsrq.
impl<S3, NI> MultiLane<[u64; 2]> for u64x2_sse2<S3, NoS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u64; 2] {
        unsafe {
            [
                _mm_cvtsi128_si64(self.x) as u64,
                _mm_cvtsi128_si64(_mm_srli_si128(self.x, 8)) as u64,
            ]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u64; 2]) -> Self {
        unsafe {
            let x = _mm_cvtsi64_si128(xs[0] as i64);
            let y = _mm_slli_si128(_mm_cvtsi64_si128(xs[1] as i64), 8);
            Self::new(_mm_or_si128(x, y))
        }
    }
}
impl<S3, S4, NI> MultiLane<[u128; 1]> for u128x1_sse2<S3, S4, NI> {
    // Lane conversion for the 128-bit type is deliberately left
    // unimplemented; both methods panic if reached.
    #[inline(always)]
    fn to_lanes(self) -> [u128; 1] {
        unimplemented!()
    }
    #[inline(always)]
    fn from_lanes(xs: [u128; 1]) -> Self {
        unimplemented!("{:?}", xs)
    }
}
409
410impl<S3, S4, NI> MultiLane<[u64; 4]> for u64x4_sse2<S3, S4, NI>
411where
412 u64x2_sse2<S3, S4, NI>: MultiLane<[u64; 2]> + Copy,
413{
414 #[inline(always)]
415 fn to_lanes(self) -> [u64; 4] {
416 let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes());
417 [a[0], a[1], b[0], b[1]]
418 }
419 #[inline(always)]
420 fn from_lanes(xs: [u64; 4]) -> Self {
421 let (a, b) = (
422 u64x2_sse2::from_lanes([xs[0], xs[1]]),
423 u64x2_sse2::from_lanes([xs[2], xs[3]]),
424 );
425 x2::new([a, b])
426 }
427}
428
/// Generates a zero-cost `From` between two vector types that share the same
/// `__m128i` representation (pure reinterpretation of the wrapped register).
macro_rules! impl_into {
    ($src:ident, $dst:ident) => {
        impl<S3, S4, NI> From<$src<S3, S4, NI>> for $dst<S3, S4, NI> {
            #[inline(always)]
            fn from(x: $src<S3, S4, NI>) -> Self {
                $dst::new(x.x)
            }
        }
    };
}
439
// A 128-bit value reinterprets freely as 4x32 or 2x64 (same register).
impl_into!(u128x1_sse2, u32x4_sse2);
impl_into!(u128x1_sse2, u64x2_sse2);

// Wire up the BitOps marker hierarchy for each width.
impl_bitops32!(u32x4_sse2);
impl_bitops64!(u64x2_sse2);
impl_bitops128!(u128x1_sse2);
446
// Empty marker impls: `ArithOps` only bundles bounds (Add/BSwap etc.);
// the actual `Add` impls follow below via `impl_binop!`.
impl<S3: Copy, S4: Copy, NI: Copy> ArithOps for u32x4_sse2<S3, S4, NI> where
    u32x4_sse2<S3, S4, NI>: BSwap
{
}
impl<S3: Copy, S4: Copy, NI: Copy> ArithOps for u64x2_sse2<S3, S4, NI> where
    u64x2_sse2<S3, S4, NI>: BSwap
{
}
// Lane-wise wrapping addition at the appropriate lane width.
impl_binop!(u32x4_sse2, Add, add, _mm_add_epi32);
impl_binop!(u64x2_sse2, Add, add, _mm_add_epi64);
impl_binop_assign!(u32x4_sse2, AddAssign, add_assign, add);
impl_binop_assign!(u64x2_sse2, AddAssign, add_assign, add);
459
// Register the concrete SSE2 vectors as the `Machine86` machine's vector
// types. The impls are empty: the u32x4/u64x2/u128x1 traits just bundle the
// capability bounds listed in the where-clauses.
impl<S3: Copy, S4: Copy, NI: Copy> u32x4<Machine86<S3, S4, NI>> for u32x4_sse2<S3, S4, NI>
where
    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap + MultiLane<[u32; 4]> + Vec4<u32>,
    Machine86<S3, S4, NI>: Machine,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x2<Machine86<S3, S4, NI>> for u64x2_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>:
        RotateEachWord64 + RotateEachWord32 + BSwap + MultiLane<[u64; 2]> + Vec2<u64>,
    Machine86<S3, S4, NI>: Machine,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u128x1<Machine86<S3, S4, NI>> for u128x1_sse2<S3, S4, NI>
where
    u128x1_sse2<S3, S4, NI>: Swap64 + RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u128x1_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4>,
    u128x1_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2>,
{
}
481
// Same vector registration for the AVX2 machine, pinned to the
// YesS3/YesS4 feature combination (AVX2 implies SSSE3 and SSE4.1).
// Note the bounds are phrased via Machine86<YesS3, YesS4, NI> as written.
impl<NI: Copy> u32x4<Avx2Machine<NI>> for u32x4_sse2<YesS3, YesS4, NI>
where
    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap + MultiLane<[u32; 4]> + Vec4<u32>,
    Machine86<YesS3, YesS4, NI>: Machine,
{
}
impl<NI: Copy> u64x2<Avx2Machine<NI>> for u64x2_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>:
        RotateEachWord64 + RotateEachWord32 + BSwap + MultiLane<[u64; 2]> + Vec2<u64>,
    Machine86<YesS3, YesS4, NI>: Machine,
{
}
impl<NI: Copy> u128x1<Avx2Machine<NI>> for u128x1_sse2<YesS3, YesS4, NI>
where
    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<YesS3, YesS4, NI>: Machine,
    u128x1_sse2<YesS3, YesS4, NI>: Into<<Machine86<YesS3, YesS4, NI> as Machine>::u32x4>,
    u128x1_sse2<YesS3, YesS4, NI>: Into<<Machine86<YesS3, YesS4, NI> as Machine>::u64x2>,
{
}
503
impl<S3, S4, NI> UnsafeFrom<[u32; 4]> for u32x4_sse2<S3, S4, NI> {
    // `_mm_set_epi32` takes arguments high-lane-first, so listing
    // xs[3]..xs[0] puts xs[i] into lane i.
    #[inline(always)]
    unsafe fn unsafe_from(xs: [u32; 4]) -> Self {
        Self::new(_mm_set_epi32(
            xs[3] as i32,
            xs[2] as i32,
            xs[1] as i32,
            xs[0] as i32,
        ))
    }
}
515
// SSE4.1 path for lane get/set.
impl<S3, NI> Vec4<u32> for u32x4_sse2<S3, YesS4, NI>
where
    Self: MultiLane<[u32; 4]>,
{
    #[inline(always)]
    fn extract(self, i: u32) -> u32 {
        self.to_lanes()[i as usize]
    }
    #[inline(always)]
    fn insert(self, v: u32, i: u32) -> Self {
        // `_mm_insert_epi32` requires a const immediate lane index, so the
        // runtime index is dispatched through an exhaustive match.
        Self::new(unsafe {
            match i {
                0 => _mm_insert_epi32(self.x, v as i32, 0),
                1 => _mm_insert_epi32(self.x, v as i32, 1),
                2 => _mm_insert_epi32(self.x, v as i32, 2),
                3 => _mm_insert_epi32(self.x, v as i32, 3),
                _ => unreachable!(),
            }
        })
    }
}
// SSE2-only path: no pinsrd, so each arm rearranges the other three lanes,
// ORs the new value in at lane 0, then permutes the lanes back into place.
impl<S3, NI> Vec4<u32> for u32x4_sse2<S3, NoS4, NI>
where
    Self: MultiLane<[u32; 4]>,
{
    #[inline(always)]
    fn extract(self, i: u32) -> u32 {
        self.to_lanes()[i as usize]
    }
    #[inline(always)]
    fn insert(self, v: u32, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => {
                    // Clear lane 0 (andnot with a lane-0 all-ones mask), then OR v in.
                    let x = _mm_andnot_si128(_mm_cvtsi32_si128(-1), self.x);
                    _mm_or_si128(x, _mm_cvtsi32_si128(v as i32))
                }
                1 => {
                    let mut x = _mm_shuffle_epi32(self.x, 0b0111_1000);
                    x = _mm_slli_si128(x, 4);
                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
                    _mm_shuffle_epi32(x, 0b1110_0001)
                }
                2 => {
                    let mut x = _mm_shuffle_epi32(self.x, 0b1011_0100);
                    x = _mm_slli_si128(x, 4);
                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
                    _mm_shuffle_epi32(x, 0b1100_1001)
                }
                3 => {
                    let mut x = _mm_slli_si128(self.x, 4);
                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
                    _mm_shuffle_epi32(x, 0b0011_1001)
                }
                _ => unreachable!(),
            }
        })
    }
}
575
576impl<S3, S4, NI> LaneWords4 for u32x4_sse2<S3, S4, NI> {
577 #[inline(always)]
578 fn shuffle_lane_words2301(self) -> Self {
579 self.shuffle2301()
580 }
581 #[inline(always)]
582 fn shuffle_lane_words1230(self) -> Self {
583 self.shuffle1230()
584 }
585 #[inline(always)]
586 fn shuffle_lane_words3012(self) -> Self {
587 self.shuffle3012()
588 }
589}
590
impl<S3, S4, NI> Words4 for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    fn shuffle2301(self) -> Self {
        // Control 0b01_00_11_10: swaps the two 64-bit halves of the vector.
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
    }
    #[inline(always)]
    fn shuffle1230(self) -> Self {
        // Control 0b10_01_00_11: rotates the four words by one lane.
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b1001_0011) })
    }
    #[inline(always)]
    fn shuffle3012(self) -> Self {
        // Control 0b00_11_10_01: rotates the four words one lane the
        // opposite way from shuffle1230.
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b0011_1001) })
    }
}
605
// Word shuffles for a 4x64 vector stored as two 2x64 halves, SSSE3 path:
// cross-half word rotations use `palignr` (concatenate two registers and
// extract a byte-shifted window).
impl<S4, NI> Words4 for u64x4_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn shuffle2301(self) -> Self {
        // Swapping word pairs across the middle is just swapping the halves.
        x2::new([u64x2_sse2::new(self.0[1].x), u64x2_sse2::new(self.0[0].x)])
    }
    #[inline(always)]
    fn shuffle3012(self) -> Self {
        unsafe {
            x2::new([
                u64x2_sse2::new(_mm_alignr_epi8(self.0[1].x, self.0[0].x, 8)),
                u64x2_sse2::new(_mm_alignr_epi8(self.0[0].x, self.0[1].x, 8)),
            ])
        }
    }
    #[inline(always)]
    fn shuffle1230(self) -> Self {
        unsafe {
            x2::new([
                u64x2_sse2::new(_mm_alignr_epi8(self.0[0].x, self.0[1].x, 8)),
                u64x2_sse2::new(_mm_alignr_epi8(self.0[1].x, self.0[0].x, 8)),
            ])
        }
    }
}
// SSE2-only path: emulate `palignr` by splitting each half into its low and
// high qwords with byte shifts and recombining with OR.
impl<S4, NI> Words4 for u64x4_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn shuffle2301(self) -> Self {
        x2::new([u64x2_sse2::new(self.0[1].x), u64x2_sse2::new(self.0[0].x)])
    }
    #[inline(always)]
    fn shuffle3012(self) -> Self {
        unsafe {
            // a = high qword of half0, b = low qword of half0 (moved up),
            // c = high qword of half1, d = low qword of half1 (moved up).
            let a = _mm_srli_si128(self.0[0].x, 8);
            let b = _mm_slli_si128(self.0[0].x, 8);
            let c = _mm_srli_si128(self.0[1].x, 8);
            let d = _mm_slli_si128(self.0[1].x, 8);
            let da = _mm_or_si128(d, a);
            let bc = _mm_or_si128(b, c);
            x2::new([u64x2_sse2::new(da), u64x2_sse2::new(bc)])
        }
    }
    #[inline(always)]
    fn shuffle1230(self) -> Self {
        unsafe {
            // Same decomposition as shuffle3012, with the halves swapped in
            // the result.
            let a = _mm_srli_si128(self.0[0].x, 8);
            let b = _mm_slli_si128(self.0[0].x, 8);
            let c = _mm_srli_si128(self.0[1].x, 8);
            let d = _mm_slli_si128(self.0[1].x, 8);
            let da = _mm_or_si128(d, a);
            let bc = _mm_or_si128(b, c);
            x2::new([u64x2_sse2::new(bc), u64x2_sse2::new(da)])
        }
    }
}
660
impl<S3, S4, NI> UnsafeFrom<[u64; 2]> for u64x2_sse2<S3, S4, NI> {
    // `_mm_set_epi64x` takes (high, low), so xs[i] lands in lane i.
    #[inline(always)]
    unsafe fn unsafe_from(xs: [u64; 2]) -> Self {
        Self::new(_mm_set_epi64x(xs[1] as i64, xs[0] as i64))
    }
}
667
// SSE4.1 path: pextrq/pinsrq need const immediates, hence the matches.
impl<S3, NI> Vec2<u64> for u64x2_sse2<S3, YesS4, NI> {
    #[inline(always)]
    fn extract(self, i: u32) -> u64 {
        unsafe {
            match i {
                0 => _mm_cvtsi128_si64(self.x) as u64,
                1 => _mm_extract_epi64(self.x, 1) as u64,
                _ => unreachable!(),
            }
        }
    }
    #[inline(always)]
    fn insert(self, x: u64, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => _mm_insert_epi64(self.x, x as i64, 0),
                1 => _mm_insert_epi64(self.x, x as i64, 1),
                _ => unreachable!(),
            }
        })
    }
}
// SSE2-only path: emulate 64-bit lane extract/insert with shuffles, shifts
// and masking.
impl<S3, NI> Vec2<u64> for u64x2_sse2<S3, NoS4, NI> {
    #[inline(always)]
    fn extract(self, i: u32) -> u64 {
        unsafe {
            match i {
                0 => _mm_cvtsi128_si64(self.x) as u64,
                // Move the high qword down first (0b11101110 duplicates it).
                1 => _mm_cvtsi128_si64(_mm_shuffle_epi32(self.x, 0b11101110)) as u64,
                _ => unreachable!(),
            }
        }
    }
    #[inline(always)]
    fn insert(self, x: u64, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                // Clear lane 0 (andnot with a low-qword all-ones mask), OR x in.
                0 => _mm_or_si128(
                    _mm_andnot_si128(_mm_cvtsi64_si128(-1), self.x),
                    _mm_cvtsi64_si128(x as i64),
                ),
                // Keep only the low qword (`_mm_move_epi64` zeroes the high
                // half), then OR x shifted into the high qword.
                1 => _mm_or_si128(
                    _mm_move_epi64(self.x),
                    _mm_slli_si128(_mm_cvtsi64_si128(x as i64), 8),
                ),
                _ => unreachable!(),
            }
        })
    }
}
718
impl<S4, NI> BSwap for u32x4_sse2<YesS3, S4, NI> {
    // Reverse the 4 bytes of each 32-bit word in one pshufb.
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe {
            let k = _mm_set_epi64x(0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}
/// Byte-swap each 32-bit word using only SSE2: widen bytes to 16-bit lanes,
/// reverse the four 16-bit lanes within each half with shufflelo/hi
/// (control 0b00_01_10_11 = full reversal), then re-pack to bytes.
#[inline(always)]
fn bswap32_s2(x: __m128i) -> __m128i {
    unsafe {
        let mut y = _mm_unpacklo_epi8(x, _mm_setzero_si128());
        y = _mm_shufflehi_epi16(y, 0b0001_1011);
        y = _mm_shufflelo_epi16(y, 0b0001_1011);
        let mut z = _mm_unpackhi_epi8(x, _mm_setzero_si128());
        z = _mm_shufflehi_epi16(z, 0b0001_1011);
        z = _mm_shufflelo_epi16(z, 0b0001_1011);
        // Values are zero-extended bytes, so the saturating pack is lossless.
        _mm_packus_epi16(y, z)
    }
}
impl<S4, NI> BSwap for u32x4_sse2<NoS3, S4, NI> {
    // SSE2-only fallback; see bswap32_s2.
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(bswap32_s2(self.x))
    }
}
746
impl<S4, NI> BSwap for u64x2_sse2<YesS3, S4, NI> {
    // Reverse the 8 bytes of each 64-bit word in one pshufb.
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe {
            let k = _mm_set_epi64x(0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}
impl<S4, NI> BSwap for u64x2_sse2<NoS3, S4, NI> {
    // 64-bit byteswap = swap the two dwords of each qword (0b10_11_00_01),
    // then byteswap each dword with the SSE2 helper.
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe { bswap32_s2(_mm_shuffle_epi32(self.x, 0b1011_0001)) })
    }
}
762
impl<S4, NI> BSwap for u128x1_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe {
            // NOTE(review): this mask is the identity byte permutation
            // (index i selects byte i), so this `bswap` is a no-op rather
            // than a 16-byte reversal. Presumably never exercised (the NoS3
            // twin is `unimplemented!()`) — confirm before relying on it.
            let k = _mm_set_epi64x(0x0f0e_0d0c_0b0a_0908, 0x0706_0504_0302_0100);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}
impl<S4, NI> BSwap for u128x1_sse2<NoS3, S4, NI> {
    // No SSE2-only 128-bit byteswap is provided; panics if reached.
    #[inline(always)]
    fn bswap(self) -> Self {
        unimplemented!()
    }
}
778
/// Swaps adjacent `$i`-bit groups in every byte: `$k` masks the high group of
/// each pair (e.g. 0xaa/0xcc/0xf0 for 1/2/4-bit groups). 16-bit shifts are
/// used because SSE has no 8-bit shifts; the masking keeps bits from crossing
/// byte boundaries.
macro_rules! swapi {
    ($x:expr, $i:expr, $k:expr) => {
        unsafe {
            const K: u8 = $k;
            let k = _mm_set1_epi8(K as i8);
            u128x1_sse2::new(_mm_or_si128(
                _mm_srli_epi16(_mm_and_si128($x.x, k), $i),
                _mm_and_si128(_mm_slli_epi16($x.x, $i), k),
            ))
        }
    };
}
/// Swaps each pair of adjacent 16-bit lanes using only SSE2
/// (shufflelo+shufflehi with control 0b10_11_00_01).
#[inline(always)]
fn swap16_s2(x: __m128i) -> __m128i {
    unsafe { _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, 0b1011_0001), 0b1011_0001) }
}
// swapN = swap adjacent N-bit groups across the whole 128-bit value.
// SSSE3 path: byte-level and wider swaps use a single pshufb / dword shuffle.
impl<S4, NI> Swap64 for u128x1_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn swap1(self) -> Self {
        swapi!(self, 1, 0xaa)
    }
    #[inline(always)]
    fn swap2(self) -> Self {
        swapi!(self, 2, 0xcc)
    }
    #[inline(always)]
    fn swap4(self) -> Self {
        swapi!(self, 4, 0xf0)
    }
    #[inline(always)]
    fn swap8(self) -> Self {
        // pshufb mask exchanging each adjacent byte pair.
        u128x1_sse2::new(unsafe {
            let k = _mm_set_epi64x(0x0e0f_0c0d_0a0b_0809, 0x0607_0405_0203_0001);
            _mm_shuffle_epi8(self.x, k)
        })
    }
    #[inline(always)]
    fn swap16(self) -> Self {
        // pshufb mask exchanging each adjacent 16-bit pair.
        u128x1_sse2::new(unsafe {
            let k = _mm_set_epi64x(0x0d0c_0f0e_0908_0b0a, 0x0504_0706_0100_0302);
            _mm_shuffle_epi8(self.x, k)
        })
    }
    #[inline(always)]
    fn swap32(self) -> Self {
        // Swap adjacent dwords: control 0b10_11_00_01.
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) })
    }
    #[inline(always)]
    fn swap64(self) -> Self {
        // Swap the two qwords: control 0b01_00_11_10.
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
    }
}
// SSE2-only path: no pshufb, so swap8 uses 16-bit shifts and swap16 the
// 16-bit lane shuffle helper; the rest match the SSSE3 path.
impl<S4, NI> Swap64 for u128x1_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn swap1(self) -> Self {
        swapi!(self, 1, 0xaa)
    }
    #[inline(always)]
    fn swap2(self) -> Self {
        swapi!(self, 2, 0xcc)
    }
    #[inline(always)]
    fn swap4(self) -> Self {
        swapi!(self, 4, 0xf0)
    }
    #[inline(always)]
    fn swap8(self) -> Self {
        // Swap the bytes within every 16-bit lane via shift-and-or.
        u128x1_sse2::new(unsafe {
            _mm_or_si128(_mm_slli_epi16(self.x, 8), _mm_srli_epi16(self.x, 8))
        })
    }
    #[inline(always)]
    fn swap16(self) -> Self {
        u128x1_sse2::new(swap16_s2(self.x))
    }
    #[inline(always)]
    fn swap32(self) -> Self {
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) })
    }
    #[inline(always)]
    fn swap64(self) -> Self {
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
    }
}
863
/// Type-level tag used to keep structurally-identical `x2` wrappers distinct.
#[derive(Copy, Clone)]
pub struct G0;
/// Second distinct tag; see `G0`.
#[derive(Copy, Clone)]
pub struct G1;
868
// Wider vectors are built from 128-bit parts via the generic x2/x4 wrappers.
// The G0/G1 tags keep e.g. u64x2x2 (a pair of 2-lane vectors) and u64x4
// (one 4-lane vector) as distinct types despite identical representation.
#[allow(non_camel_case_types)]
pub type u32x4x2_sse2<S3, S4, NI> = x2<u32x4_sse2<S3, S4, NI>, G0>;
#[allow(non_camel_case_types)]
pub type u64x2x2_sse2<S3, S4, NI> = x2<u64x2_sse2<S3, S4, NI>, G0>;
#[allow(non_camel_case_types)]
pub type u64x4_sse2<S3, S4, NI> = x2<u64x2_sse2<S3, S4, NI>, G1>;
#[allow(non_camel_case_types)]
pub type u128x2_sse2<S3, S4, NI> = x2<u128x1_sse2<S3, S4, NI>, G0>;

#[allow(non_camel_case_types)]
pub type u32x4x4_sse2<S3, S4, NI> = x4<u32x4_sse2<S3, S4, NI>>;
#[allow(non_camel_case_types)]
pub type u64x2x4_sse2<S3, S4, NI> = x4<u64x2_sse2<S3, S4, NI>>;
#[allow(non_camel_case_types)]
pub type u128x4_sse2<S3, S4, NI> = x4<u128x1_sse2<S3, S4, NI>>;
884
impl<S3, S4, NI> Vector<[u32; 16]> for u32x4x4_sse2<S3, S4, NI> {
    // Reinterpret the four __m128i as 16 u32s; zerocopy's transmute! checks
    // size/alignment compatibility at compile time.
    #[inline(always)]
    fn to_scalars(self) -> [u32; 16] {
        transmute!(self)
    }
}
891
// Register the x2-based wide types with the Machine86 interface. All impls
// are empty: the traits only bundle the bounds in the where-clauses.
impl<S3: Copy, S4: Copy, NI: Copy> u32x4x2<Machine86<S3, S4, NI>> for u32x4x2_sse2<S3, S4, NI>
where
    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u32x4x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 2]>,
    u32x4x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u32x4>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x2x2<Machine86<S3, S4, NI>> for u64x2x2_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u64x2x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u64x2; 2]>,
    u64x2x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u64x2>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x4<Machine86<S3, S4, NI>> for u64x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u64x4_sse2<S3, S4, NI>: MultiLane<[u64; 4]> + Vec4<u64> + Words4,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u128x2<Machine86<S3, S4, NI>> for u128x2_sse2<S3, S4, NI>
where
    u128x1_sse2<S3, S4, NI>: Swap64 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u128x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u128x1; 2]>,
    u128x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u128x1>,
    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4x2>,
    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2x2>,
    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x4>,
{
}
926
// Same wide-type registration for the AVX2 machine, pinned to YesS3/YesS4.
impl<NI: Copy> u32x4x2<Avx2Machine<NI>> for u32x4x2_sse2<YesS3, YesS4, NI>
where
    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u32x4x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u32x4; 2]>,
    u32x4x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u32x4>,
{
}
impl<NI: Copy> u64x2x2<Avx2Machine<NI>> for u64x2x2_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u64x2x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u64x2; 2]>,
    u64x2x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u64x2>,
{
}
impl<NI: Copy> u64x4<Avx2Machine<NI>> for u64x4_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u64x4_sse2<YesS3, YesS4, NI>: MultiLane<[u64; 4]> + Vec4<u64> + Words4,
{
}
impl<NI: Copy> u128x2<Avx2Machine<NI>> for u128x2_sse2<YesS3, YesS4, NI>
where
    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + BSwap,
    Avx2Machine<NI>: Machine,
    u128x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u128x1; 2]>,
    u128x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u128x1>,
    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u32x4x2>,
    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x2x2>,
    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x4>,
{
}
961
962impl<S3, S4, NI> Vec4<u64> for u64x4_sse2<S3, S4, NI>
963where
964 u64x2_sse2<S3, S4, NI>: Copy + Vec2<u64>,
965{
966 #[inline(always)]
967 fn extract(self, i: u32) -> u64 {
968 match i {
969 0 => self.0[0].extract(0),
970 1 => self.0[0].extract(1),
971 2 => self.0[1].extract(0),
972 3 => self.0[1].extract(1),
973 _ => panic!(),
974 }
975 }
976 #[inline(always)]
977 fn insert(mut self, w: u64, i: u32) -> Self {
978 match i {
979 0 => self.0[0] = self.0[0].insert(w, 0),
980 1 => self.0[0] = self.0[0].insert(w, 1),
981 2 => self.0[1] = self.0[1].insert(w, 0),
982 3 => self.0[1] = self.0[1].insert(w, 1),
983 _ => panic!(),
984 };
985 self
986 }
987}
988
// Register the x4-based wide types with the Machine86 interface (empty
// marker impls; traits bundle the listed bounds).
impl<S3: Copy, S4: Copy, NI: Copy> u32x4x4<Machine86<S3, S4, NI>> for u32x4x4_sse2<S3, S4, NI>
where
    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u32x4x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 4]>,
    u32x4x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u32x4>,
    u32x4x4_sse2<S3, S4, NI>: Vec4Ext<<Machine86<S3, S4, NI> as Machine>::u32x4>,
    u32x4x4_sse2<S3, S4, NI>: Vector<[u32; 16]>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x2x4<Machine86<S3, S4, NI>> for u64x2x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u64x2x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u64x2; 4]>,
    u64x2x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u64x2>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u128x4<Machine86<S3, S4, NI>> for u128x4_sse2<S3, S4, NI>
where
    u128x1_sse2<S3, S4, NI>: Swap64 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u128x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u128x1; 4]>,
    u128x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u128x1>,
    u128x4_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4x4>,
    u128x4_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2x4>,
{
}
1017
// x4-based wide-type registration for the AVX2 machine (YesS3/YesS4 only).
impl<NI: Copy> u64x2x4<Avx2Machine<NI>> for u64x2x4_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u64x2x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u64x2; 4]>,
    u64x2x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u64x2>,
{
}
impl<NI: Copy> u128x4<Avx2Machine<NI>> for u128x4_sse2<YesS3, YesS4, NI>
where
    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + BSwap,
    Avx2Machine<NI>: Machine,
    u128x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u128x1; 4]>,
    u128x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u128x1>,
    u128x4_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u32x4x4>,
    u128x4_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x2x4>,
{
}
1036
/// Generates lane-wise `From` conversions between the x2 and x4 wrappers of
/// two 128-bit vector types that already convert element-wise.
macro_rules! impl_into_x {
    ($src:ident, $dst:ident) => {
        impl<S3: Copy, S4: Copy, NI: Copy, Gf, Gt> From<x2<$src<S3, S4, NI>, Gf>>
            for x2<$dst<S3, S4, NI>, Gt>
        {
            #[inline(always)]
            fn from(x: x2<$src<S3, S4, NI>, Gf>) -> Self {
                let [a, b] = x.0;
                x2::new([$dst::from(a), $dst::from(b)])
            }
        }
        impl<S3: Copy, S4: Copy, NI: Copy> From<x4<$src<S3, S4, NI>>> for x4<$dst<S3, S4, NI>> {
            #[inline(always)]
            fn from(x: x4<$src<S3, S4, NI>>) -> Self {
                let [a, b, c, d] = x.0;
                x4::new([$dst::from(a), $dst::from(b), $dst::from(c), $dst::from(d)])
            }
        }
    };
}
1060impl_into_x!(u128x1_sse2, u64x2_sse2);
1061impl_into_x!(u128x1_sse2, u32x4_sse2);
1062
1063use core::fmt::{Debug, Formatter, Result};
1066
1067impl<W: PartialEq, G> PartialEq for x2<W, G> {
1068 #[inline(always)]
1069 fn eq(&self, rhs: &Self) -> bool {
1070 self.0[0] == rhs.0[0] && self.0[1] == rhs.0[1]
1071 }
1072}
1073
// Whole-register (128-bit) equality using SSE4.1's 64-bit lane compare.
//
// `_mm_cmpeq_epi64` yields an all-ones 64-bit lane for each equal lane. The
// shuffle control 0b1100_0110 selects 32-bit words [2, 1, 0, 3], so the low
// 64 bits of `q` contain one word from each 64-bit comparison lane; a single
// `_mm_cvtsi128_si64(q) == -1` check therefore covers both lanes at once.
#[allow(unused)]
#[inline(always)]
unsafe fn eq128_s4(x: __m128i, y: __m128i) -> bool {
    let q = _mm_shuffle_epi32(_mm_cmpeq_epi64(x, y), 0b1100_0110);
    _mm_cvtsi128_si64(q) == -1
}
1080
// Whole-register (128-bit) equality using only SSE2.
//
// `_mm_cmpeq_epi32` yields an all-ones 32-bit lane for each equal lane. The
// high and low 64-bit halves of the mask are extracted separately (the high
// half via an 8-byte right shift) and ANDed; the result is -1 only when all
// four lanes compared equal.
#[inline(always)]
unsafe fn eq128_s2(x: __m128i, y: __m128i) -> bool {
    let q = _mm_cmpeq_epi32(x, y);
    let p = _mm_cvtsi128_si64(_mm_srli_si128(q, 8));
    let q = _mm_cvtsi128_si64(q);
    (p & q) == -1
}
1088
impl<S3, S4, NI> PartialEq for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        // Defer to the SSE2-only whole-register comparison helper.
        unsafe { eq128_s2(self.x, rhs.x) }
    }
}
1095impl<S3, S4, NI> Debug for u32x4_sse2<S3, S4, NI>
1096where
1097 Self: Copy + MultiLane<[u32; 4]>,
1098{
1099 #[cold]
1100 fn fmt(&self, fmt: &mut Formatter) -> Result {
1101 fmt.write_fmt(format_args!("{:08x?}", &self.to_lanes()))
1102 }
1103}
1104
impl<S3, S4, NI> PartialEq for u64x2_sse2<S3, S4, NI> {
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        // The 32-bit lane compare is sufficient here: two 64-bit lanes are
        // equal exactly when all four of their 32-bit words are equal.
        unsafe { eq128_s2(self.x, rhs.x) }
    }
}
1111impl<S3, S4, NI> Debug for u64x2_sse2<S3, S4, NI>
1112where
1113 Self: Copy + MultiLane<[u64; 2]>,
1114{
1115 #[cold]
1116 fn fmt(&self, fmt: &mut Formatter) -> Result {
1117 fmt.write_fmt(format_args!("{:016x?}", &self.to_lanes()))
1118 }
1119}
1120
1121impl<S3, S4, NI> Debug for u64x4_sse2<S3, S4, NI>
1122where
1123 u64x2_sse2<S3, S4, NI>: Copy + MultiLane<[u64; 2]>,
1124{
1125 #[cold]
1126 fn fmt(&self, fmt: &mut Formatter) -> Result {
1127 let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes());
1128 fmt.write_fmt(format_args!("{:016x?}", &[a[0], a[1], b[0], b[1]]))
1129 }
1130}
1131
#[cfg(test)]
#[cfg(target_arch = "x86_64")]
mod test {
    use super::*;
    use crate::x86_64::{SSE2, SSE41, SSSE3};
    use crate::Machine;

    // NOTE(review): several assertions below previously read
    // `assert_eq!(x_s3, transmute!(x_s3))` — a value compared against a
    // transmute of itself, which is trivially true and verified nothing about
    // the SSSE3 code path. They now check the SSSE3 result against the
    // expected lane values, mirroring the SSE2 checks.

    #[test]
    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
    fn test_bswap32_s2_vs_s3() {
        let xs = [0x0f0e_0d0c, 0x0b0a_0908, 0x0706_0504, 0x0302_0100];
        let ys = [0x0c0d_0e0f, 0x0809_0a0b, 0x0405_0607, 0x0001_0203];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
            x_s2.bswap()
        };

        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
            x_s3.bswap()
        };

        // The SSE2 and SSSE3 bswap paths must agree, and the SSE2 result must
        // match the expected byte-swapped lanes.
        assert_eq!(x_s2, transmute!(x_s3));
        assert_eq!(x_s2, s2.vec(ys));
    }

    #[test]
    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
    fn test_bswap64_s2_vs_s3() {
        let xs = [0x0f0e_0d0c_0b0a_0908, 0x0706_0504_0302_0100];
        let ys = [0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
            x_s2.bswap()
        };

        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u64x2 = s3.vec(xs);
            x_s3.bswap()
        };

        // Both implementations must produce the expected byte-swapped lanes.
        assert_eq!(x_s2, s2.vec(ys));
        assert_eq!(x_s3, s3.vec(ys));
    }

    #[test]
    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
    fn test_shuffle32_s2_vs_s3() {
        let xs = [0x0, 0x1, 0x2, 0x3];
        let ys = [0x2, 0x3, 0x0, 0x1];
        let zs = [0x1, 0x2, 0x3, 0x0];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
            x_s2.shuffle2301()
        };
        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
            x_s3.shuffle2301()
        };
        assert_eq!(x_s2, s2.vec(ys));
        assert_eq!(x_s3, s3.vec(ys));

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
            x_s2.shuffle3012()
        };
        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
            x_s3.shuffle3012()
        };
        assert_eq!(x_s2, s2.vec(zs));
        assert_eq!(x_s3, s3.vec(zs));

        // shuffle1230 undoes shuffle3012, recovering the original lanes.
        let x_s2 = x_s2.shuffle1230();
        let x_s3 = x_s3.shuffle1230();
        assert_eq!(x_s2, s2.vec(xs));
        assert_eq!(x_s3, s3.vec(xs));
    }

    #[test]
    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
    fn test_shuffle64_s2_vs_s3() {
        let xs = [0x0, 0x1, 0x2, 0x3];
        let ys = [0x2, 0x3, 0x0, 0x1];
        let zs = [0x1, 0x2, 0x3, 0x0];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u64x4 = s2.vec(xs);
            x_s2.shuffle2301()
        };
        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u64x4 = s3.vec(xs);
            x_s3.shuffle2301()
        };
        assert_eq!(x_s2, s2.vec(ys));
        assert_eq!(x_s3, s3.vec(ys));

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u64x4 = s2.vec(xs);
            x_s2.shuffle3012()
        };
        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u64x4 = s3.vec(xs);
            x_s3.shuffle3012()
        };
        assert_eq!(x_s2, s2.vec(zs));
        assert_eq!(x_s3, s3.vec(zs));

        // shuffle1230 undoes shuffle3012, recovering the original lanes.
        let x_s2 = x_s2.shuffle1230();
        let x_s3 = x_s3.shuffle1230();
        assert_eq!(x_s2, s2.vec(xs));
        assert_eq!(x_s3, s3.vec(xs));
    }

    #[test]
    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
    fn test_lanes_u32x4() {
        let xs = [0x1, 0x2, 0x3, 0x4];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };
        let s4 = unsafe { SSE41::instance() };

        // `vec` and `from_lanes` must agree, and `to_lanes` must round-trip,
        // on every feature level.
        {
            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
            let y_s2 = <SSE2 as Machine>::u32x4::from_lanes(xs);
            assert_eq!(x_s2, y_s2);
            assert_eq!(xs, y_s2.to_lanes());
        }

        {
            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
            let y_s3 = <SSSE3 as Machine>::u32x4::from_lanes(xs);
            assert_eq!(x_s3, y_s3);
            assert_eq!(xs, y_s3.to_lanes());
        }

        {
            let x_s4: <SSE41 as Machine>::u32x4 = s4.vec(xs);
            let y_s4 = <SSE41 as Machine>::u32x4::from_lanes(xs);
            assert_eq!(x_s4, y_s4);
            assert_eq!(xs, y_s4.to_lanes());
        }
    }

    #[test]
    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
    fn test_lanes_u64x2() {
        let xs = [0x1, 0x2];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };
        let s4 = unsafe { SSE41::instance() };

        {
            let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
            let y_s2 = <SSE2 as Machine>::u64x2::from_lanes(xs);
            assert_eq!(x_s2, y_s2);
            assert_eq!(xs, y_s2.to_lanes());
        }

        {
            let x_s3: <SSSE3 as Machine>::u64x2 = s3.vec(xs);
            let y_s3 = <SSSE3 as Machine>::u64x2::from_lanes(xs);
            assert_eq!(x_s3, y_s3);
            assert_eq!(xs, y_s3.to_lanes());
        }

        {
            let x_s4: <SSE41 as Machine>::u64x2 = s4.vec(xs);
            let y_s4 = <SSE41 as Machine>::u64x2::from_lanes(xs);
            assert_eq!(x_s4, y_s4);
            assert_eq!(xs, y_s4.to_lanes());
        }
    }

    #[test]
    fn test_vec4_u32x4_s2() {
        let xs = [1, 2, 3, 4];
        let s2 = unsafe { SSE2::instance() };
        let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
        assert_eq!(x_s2.extract(0), 1);
        assert_eq!(x_s2.extract(1), 2);
        assert_eq!(x_s2.extract(2), 3);
        assert_eq!(x_s2.extract(3), 4);
        assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2, 3, 4]));
        assert_eq!(x_s2.insert(0xf, 1), s2.vec([1, 0xf, 3, 4]));
        assert_eq!(x_s2.insert(0xf, 2), s2.vec([1, 2, 0xf, 4]));
        assert_eq!(x_s2.insert(0xf, 3), s2.vec([1, 2, 3, 0xf]));
    }

    #[test]
    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
    fn test_vec4_u32x4_s4() {
        let xs = [1, 2, 3, 4];
        let s4 = unsafe { SSE41::instance() };
        let x_s4: <SSE41 as Machine>::u32x4 = s4.vec(xs);
        assert_eq!(x_s4.extract(0), 1);
        assert_eq!(x_s4.extract(1), 2);
        assert_eq!(x_s4.extract(2), 3);
        assert_eq!(x_s4.extract(3), 4);
        assert_eq!(x_s4.insert(0xf, 0), s4.vec([0xf, 2, 3, 4]));
        assert_eq!(x_s4.insert(0xf, 1), s4.vec([1, 0xf, 3, 4]));
        assert_eq!(x_s4.insert(0xf, 2), s4.vec([1, 2, 0xf, 4]));
        assert_eq!(x_s4.insert(0xf, 3), s4.vec([1, 2, 3, 0xf]));
    }

    #[test]
    fn test_vec2_u64x2_s2() {
        let xs = [0x1, 0x2];
        let s2 = unsafe { SSE2::instance() };
        let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
        assert_eq!(x_s2.extract(0), 1);
        assert_eq!(x_s2.extract(1), 2);
        assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2]));
        assert_eq!(x_s2.insert(0xf, 1), s2.vec([1, 0xf]));
    }

    #[test]
    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
    fn test_vec4_u64x2_s4() {
        let xs = [0x1, 0x2];
        let s4 = unsafe { SSE41::instance() };
        let x_s4: <SSE41 as Machine>::u64x2 = s4.vec(xs);
        assert_eq!(x_s4.extract(0), 1);
        assert_eq!(x_s4.extract(1), 2);
        assert_eq!(x_s4.insert(0xf, 0), s4.vec([0xf, 2]));
        assert_eq!(x_s4.insert(0xf, 1), s4.vec([1, 0xf]));
    }
}
1377
1378pub mod avx2 {
1379 #![allow(non_camel_case_types)]
1380 use crate::soft::{x2, x4};
1381 use crate::types::*;
1382 use crate::x86_64::sse2::{u128x1_sse2, u32x4_sse2, G0};
1383 use crate::x86_64::{vec256_storage, vec512_storage, Avx2Machine, YesS3, YesS4};
1384 use core::arch::x86_64::*;
1385 use core::marker::PhantomData;
1386 use core::ops::*;
1387 use zerocopy::{transmute, AsBytes, FromBytes, FromZeroes};
1388
1389 #[derive(Copy, Clone, FromBytes, AsBytes, FromZeroes)]
1390 #[repr(transparent)]
1391 pub struct u32x4x2_avx2<NI> {
1392 x: __m256i,
1393 ni: PhantomData<NI>,
1394 }
1395
1396 impl<NI> u32x4x2_avx2<NI> {
1397 #[inline(always)]
1398 fn new(x: __m256i) -> Self {
1399 Self { x, ni: PhantomData }
1400 }
1401 }
1402
1403 impl<NI> u32x4x2<Avx2Machine<NI>> for u32x4x2_avx2<NI> where NI: Copy {}
1404 impl<NI> Store<vec256_storage> for u32x4x2_avx2<NI> {
1405 #[inline(always)]
1406 unsafe fn unpack(p: vec256_storage) -> Self {
1407 Self::new(p.avx)
1408 }
1409 }
1410 impl<NI> StoreBytes for u32x4x2_avx2<NI> {
1411 #[inline(always)]
1412 unsafe fn unsafe_read_le(input: &[u8]) -> Self {
1413 assert_eq!(input.len(), 32);
1414 Self::new(_mm256_loadu_si256(input.as_ptr() as *const _))
1415 }
1416 #[inline(always)]
1417 unsafe fn unsafe_read_be(input: &[u8]) -> Self {
1418 Self::unsafe_read_le(input).bswap()
1419 }
1420 #[inline(always)]
1421 fn write_le(self, out: &mut [u8]) {
1422 unsafe {
1423 assert_eq!(out.len(), 32);
1424 _mm256_storeu_si256(out.as_mut_ptr() as *mut _, self.x)
1425 }
1426 }
1427 #[inline(always)]
1428 fn write_be(self, out: &mut [u8]) {
1429 self.bswap().write_le(out)
1430 }
1431 }
1432 impl<NI> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 2]> for u32x4x2_avx2<NI> {
1433 #[inline(always)]
1434 fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 2] {
1435 unsafe {
1436 [
1437 u32x4_sse2::new(_mm256_extracti128_si256(self.x, 0)),
1438 u32x4_sse2::new(_mm256_extracti128_si256(self.x, 1)),
1439 ]
1440 }
1441 }
1442 #[inline(always)]
1443 fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 2]) -> Self {
1444 Self::new(unsafe { _mm256_setr_m128i(x[0].x, x[1].x) })
1445 }
1446 }
1447 impl<NI> Vec2<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x2_avx2<NI> {
1448 #[inline(always)]
1449 fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> {
1450 unsafe {
1451 match i {
1452 0 => u32x4_sse2::new(_mm256_extracti128_si256(self.x, 0)),
1453 1 => u32x4_sse2::new(_mm256_extracti128_si256(self.x, 1)),
1454 _ => panic!(),
1455 }
1456 }
1457 }
1458 #[inline(always)]
1459 fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self {
1460 Self::new(unsafe {
1461 match i {
1462 0 => _mm256_inserti128_si256(self.x, w.x, 0),
1463 1 => _mm256_inserti128_si256(self.x, w.x, 1),
1464 _ => panic!(),
1465 }
1466 })
1467 }
1468 }
1469 impl<NI> BitOps32 for u32x4x2_avx2<NI> where NI: Copy {}
1470 impl<NI> ArithOps for u32x4x2_avx2<NI> where NI: Copy {}
1471 macro_rules! shuf_lane_bytes {
1472 ($name:ident, $k0:expr, $k1:expr) => {
1473 #[inline(always)]
1474 fn $name(self) -> Self {
1475 Self::new(unsafe {
1476 _mm256_shuffle_epi8(self.x, _mm256_set_epi64x($k0, $k1, $k0, $k1))
1477 })
1478 }
1479 };
1480 }
1481 macro_rules! rotr_32 {
1482 ($name:ident, $i:expr) => {
1483 #[inline(always)]
1484 fn $name(self) -> Self {
1485 Self::new(unsafe {
1486 _mm256_or_si256(
1487 _mm256_srli_epi32(self.x, $i as i32),
1488 _mm256_slli_epi32(self.x, 32 - $i as i32),
1489 )
1490 })
1491 }
1492 };
1493 }
1494 impl<NI: Copy> RotateEachWord32 for u32x4x2_avx2<NI> {
1495 rotr_32!(rotate_each_word_right7, 7);
1496 shuf_lane_bytes!(
1497 rotate_each_word_right8,
1498 0x0c0f_0e0d_080b_0a09,
1499 0x0407_0605_0003_0201
1500 );
1501 rotr_32!(rotate_each_word_right11, 11);
1502 rotr_32!(rotate_each_word_right12, 12);
1503 shuf_lane_bytes!(
1504 rotate_each_word_right16,
1505 0x0d0c_0f0e_0908_0b0a,
1506 0x0504_0706_0100_0302
1507 );
1508 rotr_32!(rotate_each_word_right20, 20);
1509 shuf_lane_bytes!(
1510 rotate_each_word_right24,
1511 0x0e0d_0c0f_0a09_080b,
1512 0x0605_0407_0201_0003
1513 );
1514 rotr_32!(rotate_each_word_right25, 25);
1515 }
1516 impl<NI> BitOps0 for u32x4x2_avx2<NI> where NI: Copy {}
1517 impl<NI> From<u32x4x2_avx2<NI>> for vec256_storage {
1518 #[inline(always)]
1519 fn from(x: u32x4x2_avx2<NI>) -> Self {
1520 Self { avx: x.x }
1521 }
1522 }
1523
1524 macro_rules! impl_assign {
1525 ($vec:ident, $Assign:ident, $assign_fn:ident, $bin_fn:ident) => {
1526 impl<NI> $Assign for $vec<NI>
1527 where
1528 NI: Copy,
1529 {
1530 #[inline(always)]
1531 fn $assign_fn(&mut self, rhs: Self) {
1532 *self = self.$bin_fn(rhs);
1533 }
1534 }
1535 };
1536 }
1537 impl_assign!(u32x4x2_avx2, BitXorAssign, bitxor_assign, bitxor);
1538 impl_assign!(u32x4x2_avx2, BitOrAssign, bitor_assign, bitor);
1539 impl_assign!(u32x4x2_avx2, BitAndAssign, bitand_assign, bitand);
1540 impl_assign!(u32x4x2_avx2, AddAssign, add_assign, add);
1541
1542 macro_rules! impl_bitop {
1543 ($vec:ident, $Op:ident, $op_fn:ident, $impl_fn:ident) => {
1544 impl<NI> $Op for $vec<NI> {
1545 type Output = Self;
1546 #[inline(always)]
1547 fn $op_fn(self, rhs: Self) -> Self::Output {
1548 Self::new(unsafe { $impl_fn(self.x, rhs.x) })
1549 }
1550 }
1551 };
1552 }
1553 impl_bitop!(u32x4x2_avx2, BitXor, bitxor, _mm256_xor_si256);
1554 impl_bitop!(u32x4x2_avx2, BitOr, bitor, _mm256_or_si256);
1555 impl_bitop!(u32x4x2_avx2, BitAnd, bitand, _mm256_and_si256);
1556 impl_bitop!(u32x4x2_avx2, AndNot, andnot, _mm256_andnot_si256);
1557 impl_bitop!(u32x4x2_avx2, Add, add, _mm256_add_epi32);
1558
1559 impl<NI> Not for u32x4x2_avx2<NI> {
1560 type Output = Self;
1561 #[inline(always)]
1562 fn not(self) -> Self::Output {
1563 unsafe {
1564 let f = _mm256_set1_epi8(-0x7f);
1565 Self::new(f) ^ self
1566 }
1567 }
1568 }
1569
1570 impl<NI> BSwap for u32x4x2_avx2<NI> {
1571 shuf_lane_bytes!(bswap, 0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203);
1572 }
1573
1574 impl<NI> From<x2<u128x1_sse2<YesS3, YesS4, NI>, G0>> for u32x4x2_avx2<NI>
1575 where
1576 NI: Copy,
1577 {
1578 #[inline(always)]
1579 fn from(x: x2<u128x1_sse2<YesS3, YesS4, NI>, G0>) -> Self {
1580 Self::new(unsafe { _mm256_setr_m128i(x.0[0].x, x.0[1].x) })
1581 }
1582 }
1583
1584 impl<NI> LaneWords4 for u32x4x2_avx2<NI> {
1585 #[inline(always)]
1586 fn shuffle_lane_words1230(self) -> Self {
1587 Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b1001_0011) })
1588 }
1589 #[inline(always)]
1590 fn shuffle_lane_words2301(self) -> Self {
1591 Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b0100_1110) })
1592 }
1593 #[inline(always)]
1594 fn shuffle_lane_words3012(self) -> Self {
1595 Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b0011_1001) })
1596 }
1597 }
1598
1599 pub type u32x4x4_avx2<NI> = x2<u32x4x2_avx2<NI>, G0>;
1602 impl<NI: Copy> u32x4x4<Avx2Machine<NI>> for u32x4x4_avx2<NI> {}
1603
1604 impl<NI: Copy> Store<vec512_storage> for u32x4x4_avx2<NI> {
1605 #[inline(always)]
1606 unsafe fn unpack(p: vec512_storage) -> Self {
1607 Self::new([
1608 u32x4x2_avx2::unpack(p.avx[0]),
1609 u32x4x2_avx2::unpack(p.avx[1]),
1610 ])
1611 }
1612 }
1613 impl<NI: Copy> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 4]> for u32x4x4_avx2<NI> {
1614 #[inline(always)]
1615 fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 4] {
1616 let [a, b] = self.0[0].to_lanes();
1617 let [c, d] = self.0[1].to_lanes();
1618 [a, b, c, d]
1619 }
1620 #[inline(always)]
1621 fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 4]) -> Self {
1622 let ab = u32x4x2_avx2::from_lanes([x[0], x[1]]);
1623 let cd = u32x4x2_avx2::from_lanes([x[2], x[3]]);
1624 Self::new([ab, cd])
1625 }
1626 }
1627 impl<NI: Copy> Vec4<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> {
1628 #[inline(always)]
1629 fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> {
1630 match i {
1631 0 => self.0[0].extract(0),
1632 1 => self.0[0].extract(1),
1633 2 => self.0[1].extract(0),
1634 3 => self.0[1].extract(1),
1635 _ => panic!(),
1636 }
1637 }
1638 #[inline(always)]
1639 fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self {
1640 Self::new(match i {
1641 0 | 1 => [self.0[0].insert(w, i), self.0[1]],
1642 2 | 3 => [self.0[0], self.0[1].insert(w, i - 2)],
1643 _ => panic!(),
1644 })
1645 }
1646 }
1647 impl<NI: Copy> Vec4Ext<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> {
1648 #[inline(always)]
1649 fn transpose4(a: Self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) {
1650 unsafe {
1662 let ab00 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[0].x, b.0[0].x, 0x20));
1663 let ab01 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[0].x, b.0[0].x, 0x31));
1664 let ab10 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[1].x, b.0[1].x, 0x20));
1665 let ab11 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[1].x, b.0[1].x, 0x31));
1666 let cd00 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[0].x, d.0[0].x, 0x20));
1667 let cd01 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[0].x, d.0[0].x, 0x31));
1668 let cd10 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[1].x, d.0[1].x, 0x20));
1669 let cd11 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[1].x, d.0[1].x, 0x31));
1670 (
1671 Self::new([ab00, cd00]),
1672 Self::new([ab01, cd01]),
1673 Self::new([ab10, cd10]),
1674 Self::new([ab11, cd11]),
1675 )
1676 }
1677 }
1678 }
1679 impl<NI: Copy> Vector<[u32; 16]> for u32x4x4_avx2<NI> {
1680 #[inline(always)]
1681 fn to_scalars(self) -> [u32; 16] {
1682 transmute!(self)
1683 }
1684 }
1685 impl<NI: Copy> From<u32x4x4_avx2<NI>> for vec512_storage {
1686 #[inline(always)]
1687 fn from(x: u32x4x4_avx2<NI>) -> Self {
1688 Self {
1689 avx: [
1690 vec256_storage { avx: x.0[0].x },
1691 vec256_storage { avx: x.0[1].x },
1692 ],
1693 }
1694 }
1695 }
1696 impl<NI: Copy> From<x4<u128x1_sse2<YesS3, YesS4, NI>>> for u32x4x4_avx2<NI> {
1697 #[inline(always)]
1698 fn from(x: x4<u128x1_sse2<YesS3, YesS4, NI>>) -> Self {
1699 Self::new(unsafe {
1700 [
1701 u32x4x2_avx2::new(_mm256_setr_m128i(x.0[0].x, x.0[1].x)),
1702 u32x4x2_avx2::new(_mm256_setr_m128i(x.0[2].x, x.0[3].x)),
1703 ]
1704 })
1705 }
1706 }
1707}