1use crate::types::*;
4use core::arch::x86_64::{__m128i, __m256i};
5use zerocopy::{AsBytes, FromBytes, FromZeroes};
6
7mod sse2;
8
/// Type-level marker for the "enabled" state of the `S3` parameter of `SseMachine`.
/// The `SSSE3` alias in this file is the first feature level that selects it.
#[derive(Copy, Clone)]
pub struct YesS3;
/// Type-level marker for the "disabled" state of the `S3` parameter of `SseMachine`.
#[derive(Copy, Clone)]
pub struct NoS3;

/// Type-level marker for the "enabled" state of the `S4` parameter of `SseMachine`.
/// The `SSE41` alias in this file is the first feature level that selects it.
#[derive(Copy, Clone)]
pub struct YesS4;
/// Type-level marker for the "disabled" state of the `S4` parameter of `SseMachine`.
#[derive(Copy, Clone)]
pub struct NoS4;

/// "Enabled" half of the `A1` marker pair.
/// NOTE(review): not referenced anywhere else in the visible part of this file.
#[derive(Copy, Clone)]
pub struct YesA1;
/// "Disabled" half of the `A1` marker pair.
#[derive(Copy, Clone)]
pub struct NoA1;

/// "Enabled" half of the `A2` marker pair.
/// NOTE(review): not referenced anywhere else in the visible part of this file.
#[derive(Copy, Clone)]
pub struct YesA2;
/// "Disabled" half of the `A2` marker pair.
#[derive(Copy, Clone)]
pub struct NoA2;

/// Type-level marker for the "enabled" state of the `NI` parameter of the machine types.
/// NOTE(review): none of the aliases in this file select it; its exact meaning is defined
/// by the `sse2` module — confirm there.
#[derive(Copy, Clone)]
pub struct YesNI;
/// Type-level marker for the "disabled" state of the `NI` parameter of the machine types.
#[derive(Copy, Clone)]
pub struct NoNI;
33
34use core::marker::PhantomData;
35
36#[derive(Copy, Clone)]
37pub struct SseMachine<S3, S4, NI>(PhantomData<(S3, S4, NI)>);
38impl<S3: Copy, S4: Copy, NI: Copy> Machine for SseMachine<S3, S4, NI>
39where
40 sse2::u128x1_sse2<S3, S4, NI>: Swap64,
41 sse2::u64x2_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>,
42 sse2::u32x4_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>,
43 sse2::u64x4_sse2<S3, S4, NI>: BSwap + Words4,
44 sse2::u128x1_sse2<S3, S4, NI>: BSwap,
45 sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x2x2_sse2<S3, S4, NI>>,
46 sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x4_sse2<S3, S4, NI>>,
47 sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u32x4x2_sse2<S3, S4, NI>>,
48 sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u64x2x4_sse2<S3, S4, NI>>,
49 sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u32x4x4_sse2<S3, S4, NI>>,
50{
51 type u32x4 = sse2::u32x4_sse2<S3, S4, NI>;
52 type u64x2 = sse2::u64x2_sse2<S3, S4, NI>;
53 type u128x1 = sse2::u128x1_sse2<S3, S4, NI>;
54
55 type u32x4x2 = sse2::u32x4x2_sse2<S3, S4, NI>;
56 type u64x2x2 = sse2::u64x2x2_sse2<S3, S4, NI>;
57 type u64x4 = sse2::u64x4_sse2<S3, S4, NI>;
58 type u128x2 = sse2::u128x2_sse2<S3, S4, NI>;
59
60 type u32x4x4 = sse2::u32x4x4_sse2<S3, S4, NI>;
61 type u64x2x4 = sse2::u64x2x4_sse2<S3, S4, NI>;
62 type u128x4 = sse2::u128x4_sse2<S3, S4, NI>;
63
64 #[inline(always)]
65 unsafe fn instance() -> Self {
66 SseMachine(PhantomData)
67 }
68}
69
70#[derive(Copy, Clone)]
71pub struct Avx2Machine<NI>(PhantomData<NI>);
72impl<NI: Copy> Machine for Avx2Machine<NI>
73where
74 sse2::u128x1_sse2<YesS3, YesS4, NI>: BSwap + Swap64,
75 sse2::u64x2_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>,
76 sse2::u32x4_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>,
77 sse2::u64x4_sse2<YesS3, YesS4, NI>: BSwap + Words4,
78{
79 type u32x4 = sse2::u32x4_sse2<YesS3, YesS4, NI>;
80 type u64x2 = sse2::u64x2_sse2<YesS3, YesS4, NI>;
81 type u128x1 = sse2::u128x1_sse2<YesS3, YesS4, NI>;
82
83 type u32x4x2 = sse2::avx2::u32x4x2_avx2<NI>;
84 type u64x2x2 = sse2::u64x2x2_sse2<YesS3, YesS4, NI>;
85 type u64x4 = sse2::u64x4_sse2<YesS3, YesS4, NI>;
86 type u128x2 = sse2::u128x2_sse2<YesS3, YesS4, NI>;
87
88 type u32x4x4 = sse2::avx2::u32x4x4_avx2<NI>;
89 type u64x2x4 = sse2::u64x2x4_sse2<YesS3, YesS4, NI>;
90 type u128x4 = sse2::u128x4_sse2<YesS3, YesS4, NI>;
91
92 #[inline(always)]
93 unsafe fn instance() -> Self {
94 Avx2Machine(PhantomData)
95 }
96}
97
/// Baseline machine: `SseMachine` with every feature marker set to its `No*` variant.
pub type SSE2 = SseMachine<NoS3, NoS4, NoNI>;
/// `SseMachine` with only the `S3` marker enabled.
pub type SSSE3 = SseMachine<YesS3, NoS4, NoNI>;
/// `SseMachine` with the `S3` and `S4` markers enabled.
pub type SSE41 = SseMachine<YesS3, YesS4, NoNI>;
/// Expands to exactly the same type as `SSE41`; the dispatch macros in this file use it for
/// the code path compiled with `#[target_feature(enable = "avx")]`.
pub type AVX = SseMachine<YesS3, YesS4, NoNI>;
/// `Avx2Machine` with the `NI` marker disabled.
pub type AVX2 = Avx2Machine<NoNI>;
105
/// Untyped 128-bit storage: one 16-byte value viewable as `[u32; 4]`, `[u64; 2]`,
/// `[u128; 1]` or an `__m128i`.
///
/// The union is `#[repr(C)]` and derives the `zerocopy` byte-conversion traits.
// The lower-case name mirrors the vector type names used throughout this file.
#[allow(non_camel_case_types)]
#[derive(Copy, Clone, FromBytes, AsBytes, FromZeroes)]
#[repr(C)]
pub union vec128_storage {
    /// View as four 32-bit words.
    u32x4: [u32; 4],
    /// View as two 64-bit words.
    u64x2: [u64; 2],
    /// View as a single 128-bit word.
    u128x1: [u128; 1],
    /// View as the SSE2 register type.
    sse2: __m128i,
}
118impl Store<vec128_storage> for vec128_storage {
119 #[inline(always)]
120 unsafe fn unpack(p: vec128_storage) -> Self {
121 p
122 }
123}
124impl<'a> From<&'a vec128_storage> for &'a [u32; 4] {
125 #[inline(always)]
126 fn from(x: &'a vec128_storage) -> Self {
127 unsafe { &x.u32x4 }
128 }
129}
130impl From<[u32; 4]> for vec128_storage {
131 #[inline(always)]
132 fn from(u32x4: [u32; 4]) -> Self {
133 vec128_storage { u32x4 }
134 }
135}
136impl Default for vec128_storage {
137 #[inline(always)]
138 fn default() -> Self {
139 vec128_storage { u128x1: [0] }
140 }
141}
142impl Eq for vec128_storage {}
143impl PartialEq for vec128_storage {
144 #[inline(always)]
145 fn eq(&self, rhs: &Self) -> bool {
146 unsafe { self.u128x1 == rhs.u128x1 }
147 }
148}
149
/// Untyped 256-bit storage: one 32-byte value viewable as `[u32; 8]`, `[u64; 4]`,
/// `[u128; 2]`, two `vec128_storage` halves, or an `__m256i`.
///
/// NOTE(review): unlike `vec128_storage`, this union carries no `#[repr(C)]` — confirm that
/// is intentional.
// The lower-case name mirrors the vector type names used throughout this file.
#[allow(non_camel_case_types)]
#[derive(Copy, Clone)]
pub union vec256_storage {
    /// View as eight 32-bit words.
    u32x8: [u32; 8],
    /// View as four 64-bit words.
    u64x4: [u64; 4],
    /// View as two 128-bit words.
    u128x2: [u128; 2],
    /// View as two 128-bit storage values.
    sse2: [vec128_storage; 2],
    /// View as the AVX register type.
    avx: __m256i,
}
159impl From<[u64; 4]> for vec256_storage {
160 #[inline(always)]
161 fn from(u64x4: [u64; 4]) -> Self {
162 vec256_storage { u64x4 }
163 }
164}
165impl Default for vec256_storage {
166 #[inline(always)]
167 fn default() -> Self {
168 vec256_storage { u128x2: [0, 0] }
169 }
170}
171impl vec256_storage {
172 #[inline(always)]
173 pub fn new128(xs: [vec128_storage; 2]) -> Self {
174 Self { sse2: xs }
175 }
176 #[inline(always)]
177 pub fn split128(self) -> [vec128_storage; 2] {
178 unsafe { self.sse2 }
179 }
180}
181impl Eq for vec256_storage {}
182impl PartialEq for vec256_storage {
183 #[inline(always)]
184 fn eq(&self, rhs: &Self) -> bool {
185 unsafe { self.sse2 == rhs.sse2 }
186 }
187}
188
/// Untyped 512-bit storage: one 64-byte value viewable as `[u32; 16]`, `[u64; 8]`,
/// `[u128; 4]`, four `vec128_storage` values, or two `vec256_storage` values.
///
/// NOTE(review): unlike `vec128_storage`, this union carries no `#[repr(C)]` — confirm that
/// is intentional.
// The lower-case name mirrors the vector type names used throughout this file.
#[allow(non_camel_case_types)]
#[derive(Copy, Clone)]
pub union vec512_storage {
    /// View as sixteen 32-bit words.
    u32x16: [u32; 16],
    /// View as eight 64-bit words.
    u64x8: [u64; 8],
    /// View as four 128-bit words.
    u128x4: [u128; 4],
    /// View as four 128-bit storage values.
    sse2: [vec128_storage; 4],
    /// View as two 256-bit storage values.
    avx: [vec256_storage; 2],
}
198impl Default for vec512_storage {
199 #[inline(always)]
200 fn default() -> Self {
201 vec512_storage {
202 u128x4: [0, 0, 0, 0],
203 }
204 }
205}
206impl vec512_storage {
207 #[inline(always)]
208 pub fn new128(xs: [vec128_storage; 4]) -> Self {
209 Self { sse2: xs }
210 }
211 #[inline(always)]
212 pub fn split128(self) -> [vec128_storage; 4] {
213 unsafe { self.sse2 }
214 }
215}
216impl Eq for vec512_storage {}
217impl PartialEq for vec512_storage {
218 #[inline(always)]
219 fn eq(&self, rhs: &Self) -> bool {
220 unsafe { self.avx == rhs.avx }
221 }
222}
223
// Generates `impl From<$union> for $lanes` that returns the union's `$field` view by value.
macro_rules! impl_into {
    ($union:ident, $lanes:ty, $field:ident) => {
        impl From<$union> for $lanes {
            #[inline(always)]
            fn from(storage: $union) -> Self {
                // SAFETY: relies on `$field` being valid to read whichever field was written
                // last. NOTE(review): holds for the storage unions in this file, whose fields
                // are all plain integer/SIMD data of one size.
                unsafe { storage.$field }
            }
        }
    };
}
// By-value conversions from each storage union into each of its plain integer array views.
impl_into!(vec128_storage, [u32; 4], u32x4);
impl_into!(vec128_storage, [u64; 2], u64x2);
impl_into!(vec128_storage, [u128; 1], u128x1);
impl_into!(vec256_storage, [u32; 8], u32x8);
impl_into!(vec256_storage, [u64; 4], u64x4);
impl_into!(vec256_storage, [u128; 2], u128x2);
impl_into!(vec512_storage, [u32; 16], u32x16);
impl_into!(vec512_storage, [u64; 8], u64x8);
impl_into!(vec512_storage, [u128; 4], u128x4);
243
/// Defines a function whose body is compiled once per x86-64 feature level and which picks
/// the best level for the running CPU.
///
/// Invocation shape:
/// `dispatch!(m, M, { [pub] fn name(arg: Ty, ...) -> Ret { body } })`.
///
/// * `m` is the identifier under which `body` sees the machine value, and `M` is the name
///   of the generic machine type parameter (bounded by `$crate::Machine`).
/// * The optional visibility is written in brackets, e.g. `[pub]` or `[pub(crate)]`.
/// * The return type may be omitted, in which case `()` is used.
///
/// With the `std` feature the level is chosen at run time with `is_x86_feature_detected!`,
/// trying AVX2, AVX, SSE4.1, SSSE3 and SSE2 in that order. Without it the level is chosen at
/// compile time with `cfg!(target_feature = ...)`, falling back to SSE2.
#[macro_export]
macro_rules! dispatch {
    ($machine:ident, $M:ident, { $([$vis:tt$(($scope:tt))*])* fn $func:ident($($param:ident: $param_ty:ty),* $(,)*) -> $out:ty $code:block }) => {
        #[cfg(feature = "std")]
        $($vis$(($scope))*)* fn $func($($param: $param_ty),*) -> $out {
            use std::arch::x86_64::*;
            // The caller's body, generic over the machine; force-inlined into each
            // `#[target_feature]` wrapper below so it is compiled with that wrapper's features.
            #[inline(always)]
            fn fn_impl<$M: $crate::Machine>($machine: $M, $($param: $param_ty),*) -> $out $code
            #[target_feature(enable = "sse2")]
            unsafe fn impl_sse2($($param: $param_ty),*) -> $out {
                fn_impl($crate::x86_64::SSE2::instance(), $($param),*)
            }
            #[target_feature(enable = "ssse3")]
            unsafe fn impl_ssse3($($param: $param_ty),*) -> $out {
                fn_impl($crate::x86_64::SSSE3::instance(), $($param),*)
            }
            #[target_feature(enable = "sse4.1")]
            #[target_feature(enable = "ssse3")]
            unsafe fn impl_sse41($($param: $param_ty),*) -> $out {
                fn_impl($crate::x86_64::SSE41::instance(), $($param),*)
            }
            #[target_feature(enable = "avx")]
            #[target_feature(enable = "sse4.1")]
            #[target_feature(enable = "ssse3")]
            unsafe fn impl_avx($($param: $param_ty),*) -> $out {
                let result = fn_impl($crate::x86_64::AVX::instance(), $($param),*);
                // Zero the upper halves of the YMM registers before returning.
                _mm256_zeroupper();
                result
            }
            #[target_feature(enable = "avx2")]
            unsafe fn impl_avx2($($param: $param_ty),*) -> $out {
                let result = fn_impl($crate::x86_64::AVX2::instance(), $($param),*);
                // Zero the upper halves of the YMM registers before returning.
                _mm256_zeroupper();
                result
            }
            // Each wrapper is only called after its feature has been detected at run time.
            unsafe {
                if is_x86_feature_detected!("avx2") {
                    impl_avx2($($param),*)
                } else if is_x86_feature_detected!("avx") {
                    impl_avx($($param),*)
                } else if is_x86_feature_detected!("sse4.1") {
                    impl_sse41($($param),*)
                } else if is_x86_feature_detected!("ssse3") {
                    impl_ssse3($($param),*)
                } else if is_x86_feature_detected!("sse2") {
                    impl_sse2($($param),*)
                } else {
                    unimplemented!()
                }
            }
        }
        #[cfg(not(feature = "std"))]
        #[inline(always)]
        $($vis$(($scope))*)* fn $func($($param: $param_ty),*) -> $out {
            unsafe fn fn_impl<$M: $crate::Machine>($machine: $M, $($param: $param_ty),*) -> $out $code
            // No run-time detection here: use the best level enabled at compile time.
            unsafe {
                if cfg!(target_feature = "avx2") {
                    fn_impl($crate::x86_64::AVX2::instance(), $($param),*)
                } else if cfg!(target_feature = "avx") {
                    fn_impl($crate::x86_64::AVX::instance(), $($param),*)
                } else if cfg!(target_feature = "sse4.1") {
                    fn_impl($crate::x86_64::SSE41::instance(), $($param),*)
                } else if cfg!(target_feature = "ssse3") {
                    fn_impl($crate::x86_64::SSSE3::instance(), $($param),*)
                } else {
                    fn_impl($crate::x86_64::SSE2::instance(), $($param),*)
                }
            }
        }
    };
    // No return type given: re-enter the arm above with `-> ()`.
    ($machine:ident, $M:ident, { $([$vis:tt $(($scope:tt))*])* fn $func:ident($($param:ident: $param_ty:ty),* $(,)*) $code:block }) => {
        dispatch!($machine, $M, {
            $([$vis $(($scope))*])* fn $func($($param: $param_ty),*) -> () $code
        });
    }
}
324
/// Like `dispatch!`, but with the `std` feature only two run-time code paths are generated:
/// AVX and SSE2.
///
/// Invocation shape:
/// `dispatch_light128!(m, M, { [pub] fn name(arg: Ty, ...) -> Ret { body } })`.
///
/// * `m` is the identifier under which `body` sees the machine value, and `M` is the name
///   of the generic machine type parameter (bounded by `$crate::Machine`).
/// * The optional visibility is written in brackets, e.g. `[pub]` or `[pub(crate)]`.
/// * The return type may be omitted, in which case `()` is used.
///
/// Without the `std` feature the level is chosen at compile time with
/// `cfg!(target_feature = ...)` among AVX2, AVX, SSE4.1, SSSE3 and SSE2.
#[macro_export]
macro_rules! dispatch_light128 {
    ($machine:ident, $M:ident, { $([$vis:tt$(($scope:tt))*])* fn $func:ident($($param:ident: $param_ty:ty),* $(,)*) -> $out:ty $code:block }) => {
        #[cfg(feature = "std")]
        $($vis $(($scope))*)* fn $func($($param: $param_ty),*) -> $out {
            use std::arch::x86_64::*;
            // The caller's body, generic over the machine; force-inlined into each
            // `#[target_feature]` wrapper below so it is compiled with that wrapper's features.
            #[inline(always)]
            fn fn_impl<$M: $crate::Machine>($machine: $M, $($param: $param_ty),*) -> $out $code
            #[target_feature(enable = "sse2")]
            unsafe fn impl_sse2($($param: $param_ty),*) -> $out {
                fn_impl($crate::x86_64::SSE2::instance(), $($param),*)
            }
            #[target_feature(enable = "avx")]
            unsafe fn impl_avx($($param: $param_ty),*) -> $out {
                fn_impl($crate::x86_64::AVX::instance(), $($param),*)
            }
            // Each wrapper is only called after its feature has been detected at run time.
            unsafe {
                if is_x86_feature_detected!("avx") {
                    impl_avx($($param),*)
                } else if is_x86_feature_detected!("sse2") {
                    impl_sse2($($param),*)
                } else {
                    unimplemented!()
                }
            }
        }
        #[cfg(not(feature = "std"))]
        #[inline(always)]
        $($vis$(($scope))*)* fn $func($($param: $param_ty),*) -> $out {
            unsafe fn fn_impl<$M: $crate::Machine>($machine: $M, $($param: $param_ty),*) -> $out $code
            // No run-time detection here: use the best level enabled at compile time.
            unsafe {
                if cfg!(target_feature = "avx2") {
                    fn_impl($crate::x86_64::AVX2::instance(), $($param),*)
                } else if cfg!(target_feature = "avx") {
                    fn_impl($crate::x86_64::AVX::instance(), $($param),*)
                } else if cfg!(target_feature = "sse4.1") {
                    fn_impl($crate::x86_64::SSE41::instance(), $($param),*)
                } else if cfg!(target_feature = "ssse3") {
                    fn_impl($crate::x86_64::SSSE3::instance(), $($param),*)
                } else {
                    fn_impl($crate::x86_64::SSE2::instance(), $($param),*)
                }
            }
        }
    };
    // No return type given: re-enter the arm above with `-> ()`.
    ($machine:ident, $M:ident, { $([$vis:tt$(($scope:tt))*])* fn $func:ident($($param:ident: $param_ty:ty),* $(,)*) $code:block }) => {
        dispatch_light128!($machine, $M, {
            $([$vis $(($scope))*])* fn $func($($param: $param_ty),*) -> () $code
        });
    }
}
382
/// Like `dispatch!`, but with the `std` feature only two run-time code paths are generated:
/// AVX and SSE2. In this file the generated code is the same as `dispatch_light128!`'s.
///
/// Invocation shape:
/// `dispatch_light256!(m, M, { [pub] fn name(arg: Ty, ...) -> Ret { body } })`.
///
/// * `m` is the identifier under which `body` sees the machine value, and `M` is the name
///   of the generic machine type parameter (bounded by `$crate::Machine`).
/// * The optional visibility is written in brackets, e.g. `[pub]` or `[pub(crate)]`.
/// * The return type may be omitted, in which case `()` is used.
///
/// Without the `std` feature the level is chosen at compile time with
/// `cfg!(target_feature = ...)` among AVX2, AVX, SSE4.1, SSSE3 and SSE2.
#[macro_export]
macro_rules! dispatch_light256 {
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
        #[cfg(feature = "std")]
        // Emit only the visibility tokens: the `[...]` around them is input syntax, and
        // re-emitting the brackets would produce `[pub] fn ...`, which is not a valid item.
        $($pub $(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            // The caller's body, generic over the machine; force-inlined into each
            // `#[target_feature]` wrapper below so it is compiled with that wrapper's features.
            #[inline(always)]
            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            use std::arch::x86_64::*;
            #[target_feature(enable = "avx")]
            unsafe fn impl_avx($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
            }
            #[target_feature(enable = "sse2")]
            unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
            }
            // Each wrapper is only called after its feature has been detected at run time.
            unsafe {
                if is_x86_feature_detected!("avx") {
                    impl_avx($($arg),*)
                } else if is_x86_feature_detected!("sse2") {
                    impl_sse2($($arg),*)
                } else {
                    unimplemented!()
                }
            }
        }
        #[cfg(not(feature = "std"))]
        #[inline(always)]
        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            // No run-time detection here: use the best level enabled at compile time.
            unsafe {
                if cfg!(target_feature = "avx2") {
                    fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
                } else if cfg!(target_feature = "avx") {
                    fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
                } else if cfg!(target_feature = "sse4.1") {
                    fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
                } else if cfg!(target_feature = "ssse3") {
                    fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
                } else {
                    fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
                }
            }
        }
    };
    // No return type given: re-enter the arm above with `-> ()`.
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
        dispatch_light256!($mach, $MTy, {
            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
        });
    }
}