ppv_lite86/x86_64/
mod.rs

1// crate minimums: sse2, x86_64
2
3use crate::types::*;
4use core::arch::x86_64::{__m128i, __m256i};
5use zerocopy::{AsBytes, FromBytes, FromZeroes};
6
7mod sse2;
8
/// Type-level flag: SSSE3 is available (used as the `S3` parameter of `SseMachine`).
#[derive(Clone, Copy)]
pub struct YesS3;
/// Type-level flag: SSSE3 is not assumed.
#[derive(Clone, Copy)]
pub struct NoS3;

/// Type-level flag: SSE4.1 is available (used as the `S4` parameter of `SseMachine`).
#[derive(Clone, Copy)]
pub struct YesS4;
/// Type-level flag: SSE4.1 is not assumed.
#[derive(Clone, Copy)]
pub struct NoS4;

/// Type-level flag, positive half of the `A1` pair.
// NOTE(review): presumably "AVX available"; not referenced in this part of the file - confirm.
#[derive(Clone, Copy)]
pub struct YesA1;
/// Type-level flag, negative half of the `A1` pair.
#[derive(Clone, Copy)]
pub struct NoA1;

/// Type-level flag, positive half of the `A2` pair.
// NOTE(review): presumably "AVX2 available"; not referenced in this part of the file - confirm.
#[derive(Clone, Copy)]
pub struct YesA2;
/// Type-level flag, negative half of the `A2` pair.
#[derive(Clone, Copy)]
pub struct NoA2;

/// Type-level flag, positive half of the `NI` pair (the `NI` parameter of the machine types).
// NOTE(review): presumably AES-NI; every alias in this file currently uses `NoNI` - confirm.
#[derive(Clone, Copy)]
pub struct YesNI;
/// Type-level flag, negative half of the `NI` pair.
#[derive(Clone, Copy)]
pub struct NoNI;
33
34use core::marker::PhantomData;
35
36#[derive(Copy, Clone)]
37pub struct SseMachine<S3, S4, NI>(PhantomData<(S3, S4, NI)>);
38impl<S3: Copy, S4: Copy, NI: Copy> Machine for SseMachine<S3, S4, NI>
39where
40    sse2::u128x1_sse2<S3, S4, NI>: Swap64,
41    sse2::u64x2_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>,
42    sse2::u32x4_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>,
43    sse2::u64x4_sse2<S3, S4, NI>: BSwap + Words4,
44    sse2::u128x1_sse2<S3, S4, NI>: BSwap,
45    sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x2x2_sse2<S3, S4, NI>>,
46    sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x4_sse2<S3, S4, NI>>,
47    sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u32x4x2_sse2<S3, S4, NI>>,
48    sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u64x2x4_sse2<S3, S4, NI>>,
49    sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u32x4x4_sse2<S3, S4, NI>>,
50{
51    type u32x4 = sse2::u32x4_sse2<S3, S4, NI>;
52    type u64x2 = sse2::u64x2_sse2<S3, S4, NI>;
53    type u128x1 = sse2::u128x1_sse2<S3, S4, NI>;
54
55    type u32x4x2 = sse2::u32x4x2_sse2<S3, S4, NI>;
56    type u64x2x2 = sse2::u64x2x2_sse2<S3, S4, NI>;
57    type u64x4 = sse2::u64x4_sse2<S3, S4, NI>;
58    type u128x2 = sse2::u128x2_sse2<S3, S4, NI>;
59
60    type u32x4x4 = sse2::u32x4x4_sse2<S3, S4, NI>;
61    type u64x2x4 = sse2::u64x2x4_sse2<S3, S4, NI>;
62    type u128x4 = sse2::u128x4_sse2<S3, S4, NI>;
63
64    #[inline(always)]
65    unsafe fn instance() -> Self {
66        SseMachine(PhantomData)
67    }
68}
69
70#[derive(Copy, Clone)]
71pub struct Avx2Machine<NI>(PhantomData<NI>);
72impl<NI: Copy> Machine for Avx2Machine<NI>
73where
74    sse2::u128x1_sse2<YesS3, YesS4, NI>: BSwap + Swap64,
75    sse2::u64x2_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>,
76    sse2::u32x4_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>,
77    sse2::u64x4_sse2<YesS3, YesS4, NI>: BSwap + Words4,
78{
79    type u32x4 = sse2::u32x4_sse2<YesS3, YesS4, NI>;
80    type u64x2 = sse2::u64x2_sse2<YesS3, YesS4, NI>;
81    type u128x1 = sse2::u128x1_sse2<YesS3, YesS4, NI>;
82
83    type u32x4x2 = sse2::avx2::u32x4x2_avx2<NI>;
84    type u64x2x2 = sse2::u64x2x2_sse2<YesS3, YesS4, NI>;
85    type u64x4 = sse2::u64x4_sse2<YesS3, YesS4, NI>;
86    type u128x2 = sse2::u128x2_sse2<YesS3, YesS4, NI>;
87
88    type u32x4x4 = sse2::avx2::u32x4x4_avx2<NI>;
89    type u64x2x4 = sse2::u64x2x4_sse2<YesS3, YesS4, NI>;
90    type u128x4 = sse2::u128x4_sse2<YesS3, YesS4, NI>;
91
92    #[inline(always)]
93    unsafe fn instance() -> Self {
94        Avx2Machine(PhantomData)
95    }
96}
97
98pub type SSE2 = SseMachine<NoS3, NoS4, NoNI>;
99pub type SSSE3 = SseMachine<YesS3, NoS4, NoNI>;
100pub type SSE41 = SseMachine<YesS3, YesS4, NoNI>;
101/// AVX but not AVX2: only 128-bit integer operations, but use VEX versions of everything
102/// to avoid expensive SSE/VEX conflicts.
103pub type AVX = SseMachine<YesS3, YesS4, NoNI>;
104pub type AVX2 = Avx2Machine<NoNI>;
105
106/// Generic wrapper for unparameterized storage of any of the possible impls.
107/// Converting into and out of this type should be essentially free, although it may be more
108/// aligned than a particular impl requires.
109#[allow(non_camel_case_types)]
110#[derive(Copy, Clone, FromBytes, AsBytes, FromZeroes)]
111#[repr(C)]
112pub union vec128_storage {
113    u32x4: [u32; 4],
114    u64x2: [u64; 2],
115    u128x1: [u128; 1],
116    sse2: __m128i,
117}
118impl Store<vec128_storage> for vec128_storage {
119    #[inline(always)]
120    unsafe fn unpack(p: vec128_storage) -> Self {
121        p
122    }
123}
124impl<'a> From<&'a vec128_storage> for &'a [u32; 4] {
125    #[inline(always)]
126    fn from(x: &'a vec128_storage) -> Self {
127        unsafe { &x.u32x4 }
128    }
129}
130impl From<[u32; 4]> for vec128_storage {
131    #[inline(always)]
132    fn from(u32x4: [u32; 4]) -> Self {
133        vec128_storage { u32x4 }
134    }
135}
136impl Default for vec128_storage {
137    #[inline(always)]
138    fn default() -> Self {
139        vec128_storage { u128x1: [0] }
140    }
141}
142impl Eq for vec128_storage {}
143impl PartialEq for vec128_storage {
144    #[inline(always)]
145    fn eq(&self, rhs: &Self) -> bool {
146        unsafe { self.u128x1 == rhs.u128x1 }
147    }
148}
149
150#[allow(non_camel_case_types)]
151#[derive(Copy, Clone)]
152pub union vec256_storage {
153    u32x8: [u32; 8],
154    u64x4: [u64; 4],
155    u128x2: [u128; 2],
156    sse2: [vec128_storage; 2],
157    avx: __m256i,
158}
159impl From<[u64; 4]> for vec256_storage {
160    #[inline(always)]
161    fn from(u64x4: [u64; 4]) -> Self {
162        vec256_storage { u64x4 }
163    }
164}
165impl Default for vec256_storage {
166    #[inline(always)]
167    fn default() -> Self {
168        vec256_storage { u128x2: [0, 0] }
169    }
170}
171impl vec256_storage {
172    #[inline(always)]
173    pub fn new128(xs: [vec128_storage; 2]) -> Self {
174        Self { sse2: xs }
175    }
176    #[inline(always)]
177    pub fn split128(self) -> [vec128_storage; 2] {
178        unsafe { self.sse2 }
179    }
180}
181impl Eq for vec256_storage {}
182impl PartialEq for vec256_storage {
183    #[inline(always)]
184    fn eq(&self, rhs: &Self) -> bool {
185        unsafe { self.sse2 == rhs.sse2 }
186    }
187}
188
189#[allow(non_camel_case_types)]
190#[derive(Copy, Clone)]
191pub union vec512_storage {
192    u32x16: [u32; 16],
193    u64x8: [u64; 8],
194    u128x4: [u128; 4],
195    sse2: [vec128_storage; 4],
196    avx: [vec256_storage; 2],
197}
198impl Default for vec512_storage {
199    #[inline(always)]
200    fn default() -> Self {
201        vec512_storage {
202            u128x4: [0, 0, 0, 0],
203        }
204    }
205}
206impl vec512_storage {
207    #[inline(always)]
208    pub fn new128(xs: [vec128_storage; 4]) -> Self {
209        Self { sse2: xs }
210    }
211    #[inline(always)]
212    pub fn split128(self) -> [vec128_storage; 4] {
213        unsafe { self.sse2 }
214    }
215}
216impl Eq for vec512_storage {}
217impl PartialEq for vec512_storage {
218    #[inline(always)]
219    fn eq(&self, rhs: &Self) -> bool {
220        unsafe { self.avx == rhs.avx }
221    }
222}
223
224macro_rules! impl_into {
225    ($storage:ident, $array:ty, $name:ident) => {
226        impl From<$storage> for $array {
227            #[inline(always)]
228            fn from(vec: $storage) -> Self {
229                unsafe { vec.$name }
230            }
231        }
232    };
233}
234impl_into!(vec128_storage, [u32; 4], u32x4);
235impl_into!(vec128_storage, [u64; 2], u64x2);
236impl_into!(vec128_storage, [u128; 1], u128x1);
237impl_into!(vec256_storage, [u32; 8], u32x8);
238impl_into!(vec256_storage, [u64; 4], u64x4);
239impl_into!(vec256_storage, [u128; 2], u128x2);
240impl_into!(vec512_storage, [u32; 16], u32x16);
241impl_into!(vec512_storage, [u64; 8], u64x8);
242impl_into!(vec512_storage, [u128; 4], u128x4);
243
/// Generate the full set of optimized implementations to take advantage of the most important
/// hardware feature sets.
///
/// This dispatcher is suitable for maximizing throughput.
///
/// Invocation shape (visibility marker and return type are optional):
///
/// ```ignore
/// dispatch!(m, M, {
///     [pub] fn name(arg: Ty) -> Ret { /* body using `m: M` where `M: Machine` */ }
/// });
/// ```
///
/// With the `std` feature the generated function picks AVX2, AVX, SSE4.1, SSSE3 or SSE2 at run
/// time (in that order of preference). Without it, the choice is made from the compile-time
/// `target_feature` set.
///
/// Note that the `cfg(feature = "std")` attributes are evaluated in the crate where the macro
/// is expanded.
#[macro_export]
macro_rules! dispatch {
    // Form with an explicit return type.
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
        // Run-time feature detection.
        #[cfg(feature = "std")]
        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            use std::arch::x86_64::*;
            // The user-supplied body, generic over the machine; it is force-inlined into each
            // feature-specific wrapper below so that it is compiled once per feature set.
            #[inline(always)]
            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            #[target_feature(enable = "sse2")]
            unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
            }
            #[target_feature(enable = "ssse3")]
            unsafe fn impl_ssse3($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
            }
            #[target_feature(enable = "sse4.1")]
            #[target_feature(enable = "ssse3")]
            unsafe fn impl_sse41($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
            }
            #[target_feature(enable = "avx")]
            #[target_feature(enable = "sse4.1")]
            #[target_feature(enable = "ssse3")]
            unsafe fn impl_avx($($arg: $argty),*) -> $ret {
                let out = fn_impl($crate::x86_64::AVX::instance(), $($arg),*);
                // Clear the upper halves of the YMM registers before returning to the caller.
                _mm256_zeroupper();
                out
            }
            #[target_feature(enable = "avx2")]
            unsafe fn impl_avx2($($arg: $argty),*) -> $ret {
                let out = fn_impl($crate::x86_64::AVX2::instance(), $($arg),*);
                // Clear the upper halves of the YMM registers before returning to the caller.
                _mm256_zeroupper();
                out
            }
            // Most capable feature set first.
            unsafe {
                if is_x86_feature_detected!("avx2") {
                    return impl_avx2($($arg),*);
                }
                if is_x86_feature_detected!("avx") {
                    return impl_avx($($arg),*);
                }
                if is_x86_feature_detected!("sse4.1") {
                    return impl_sse41($($arg),*);
                }
                if is_x86_feature_detected!("ssse3") {
                    return impl_ssse3($($arg),*);
                }
                if is_x86_feature_detected!("sse2") {
                    return impl_sse2($($arg),*);
                }
                unimplemented!()
            }
        }
        // Compile-time selection from the enabled target features; SSE2 is the fallback.
        #[cfg(not(feature = "std"))]
        #[inline(always)]
        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            unsafe {
                if cfg!(target_feature = "avx2") {
                    fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
                } else if cfg!(target_feature = "avx") {
                    fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
                } else if cfg!(target_feature = "sse4.1") {
                    fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
                } else if cfg!(target_feature = "ssse3") {
                    fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
                } else {
                    fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
                }
            }
        }
    };
    // Form without a return type: forwards to the arm above with `-> ()`.
    ($mach:ident, $MTy:ident, { $([$pub:tt $(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
        dispatch!($mach, $MTy, {
            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
        });
    }
}
324
/// Generate only the basic implementations necessary to be able to operate efficiently on 128-bit
/// vectors on this platform. For x86-64, that would mean SSE2 and AVX.
///
/// This dispatcher is suitable for vector operations that do not benefit from advanced hardware
/// features (e.g. because they are done infrequently), so minimizing their contribution to code
/// size is more important.
///
/// The invocation syntax is the same as for `dispatch!`:
///
/// ```ignore
/// dispatch_light128!(m, M, {
///     [pub] fn name(arg: Ty) -> Ret { /* body using `m: M` where `M: Machine` */ }
/// });
/// ```
///
/// Note that the `cfg(feature = "std")` attributes are evaluated in the crate where the macro
/// is expanded.
#[macro_export]
macro_rules! dispatch_light128 {
    // Form with an explicit return type.
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
        // Run-time feature detection: only an AVX and an SSE2 copy of the body are generated.
        #[cfg(feature = "std")]
        $($pub $(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            use std::arch::x86_64::*;
            // The user-supplied body, generic over the machine.
            #[inline(always)]
            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            #[target_feature(enable = "sse2")]
            unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
            }
            #[target_feature(enable = "avx")]
            unsafe fn impl_avx($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
            }
            // Prefer AVX, fall back to SSE2.
            unsafe {
                if is_x86_feature_detected!("avx") {
                    return impl_avx($($arg),*);
                }
                if is_x86_feature_detected!("sse2") {
                    return impl_sse2($($arg),*);
                }
                unimplemented!()
            }
        }
        // Compile-time selection from the enabled target features; SSE2 is the fallback.
        #[cfg(not(feature = "std"))]
        #[inline(always)]
        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            unsafe {
                if cfg!(target_feature = "avx2") {
                    fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
                } else if cfg!(target_feature = "avx") {
                    fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
                } else if cfg!(target_feature = "sse4.1") {
                    fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
                } else if cfg!(target_feature = "ssse3") {
                    fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
                } else {
                    fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
                }
            }
        }
    };
    // Form without a return type: forwards to the arm above with `-> ()`.
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
        dispatch_light128!($mach, $MTy, {
            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
        });
    }
}
382
/// Generate only the basic implementations necessary to be able to operate efficiently on 256-bit
/// vectors on this platform. For x86-64, that would mean SSE2, AVX, and AVX2.
///
/// This dispatcher is suitable for vector operations that do not benefit from advanced hardware
/// features (e.g. because they are done infrequently), so minimizing their contribution to code
/// size is more important.
///
/// The invocation syntax is the same as for `dispatch!`:
///
/// ```ignore
/// dispatch_light256!(m, M, {
///     [pub] fn name(arg: Ty) -> Ret { /* body using `m: M` where `M: Machine` */ }
/// });
/// ```
///
/// NOTE(review): despite the summary above, the run-time (`std`) path currently instantiates
/// only the AVX and SSE2 machines. AVX2 is selected only on the compile-time path when the
/// `avx2` target feature is enabled.
///
/// Note that the `cfg(feature = "std")` attributes are evaluated in the crate where the macro
/// is expanded.
#[macro_export]
macro_rules! dispatch_light256 {
    // Form with an explicit return type.
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
        // Run-time feature detection.
        // The visibility is re-emitted WITHOUT the surrounding brackets: emitting `[pub] fn ...`
        // is not valid item syntax and breaks every invocation that passes a visibility marker.
        #[cfg(feature = "std")]
        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            // The user-supplied body, generic over the machine.
            #[inline(always)]
            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            use std::arch::x86_64::*;
            #[target_feature(enable = "avx")]
            unsafe fn impl_avx($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
            }
            #[target_feature(enable = "sse2")]
            unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
            }
            // Prefer AVX, fall back to SSE2.
            unsafe {
                if is_x86_feature_detected!("avx") {
                    impl_avx($($arg),*)
                } else if is_x86_feature_detected!("sse2") {
                    impl_sse2($($arg),*)
                } else {
                    unimplemented!()
                }
            }
        }
        // Compile-time selection from the enabled target features; SSE2 is the fallback.
        #[cfg(not(feature = "std"))]
        #[inline(always)]
        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            unsafe {
                if cfg!(target_feature = "avx2") {
                    fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
                } else if cfg!(target_feature = "avx") {
                    fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
                } else if cfg!(target_feature = "sse4.1") {
                    fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
                } else if cfg!(target_feature = "ssse3") {
                    fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
                } else {
                    fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
                }
            }
        }
    };
    // Form without a return type: forwards to the arm above with `-> ()`.
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
        dispatch_light256!($mach, $MTy, {
            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
        });
    }
}