simdutf8/implementation/x86/avx2.rs

//! Contains the x86-64/x86 AVX2 UTF-8 validation implementation.

#![allow(clippy::too_many_arguments)]

#[cfg(target_arch = "x86")]
use core::arch::x86::{
    __m256i, _mm256_alignr_epi8, _mm256_and_si256, _mm256_cmpgt_epi8, _mm256_loadu_si256,
    _mm256_movemask_epi8, _mm256_or_si256, _mm256_permute2x128_si256, _mm256_set1_epi8,
    _mm256_setr_epi8, _mm256_setzero_si256, _mm256_shuffle_epi8, _mm256_srli_epi16,
    _mm256_subs_epu8, _mm256_testz_si256, _mm256_xor_si256, _mm_prefetch, _MM_HINT_T0,
};
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::{
    __m256i, _mm256_alignr_epi8, _mm256_and_si256, _mm256_cmpgt_epi8, _mm256_loadu_si256,
    _mm256_movemask_epi8, _mm256_or_si256, _mm256_permute2x128_si256, _mm256_set1_epi8,
    _mm256_setr_epi8, _mm256_setzero_si256, _mm256_shuffle_epi8, _mm256_srli_epi16,
    _mm256_subs_epu8, _mm256_testz_si256, _mm256_xor_si256, _mm_prefetch, _MM_HINT_T0,
};

use crate::implementation::helpers::Utf8CheckAlgorithm;
// AVX2 SIMD primitives

type SimdU8Value = crate::implementation::helpers::SimdU8Value<__m256i>;

impl SimdU8Value {
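    // Build a vector from 32 byte values; the first argument lands in the
    // lowest byte (`_mm256_setr_epi8` ordering). The name reflects the shared
    // interface across implementations: narrower (128-bit) variants drop the
    // leading arguments, while this 256-bit version has room for all 32.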
    #[target_feature(enable = "avx2")]
    #[inline]
    unsafe fn from_32_cut_off_leading(
        v0: u8,
        v1: u8,
        v2: u8,
        v3: u8,
        v4: u8,
        v5: u8,
        v6: u8,
        v7: u8,
        v8: u8,
        v9: u8,
        v10: u8,
        v11: u8,
        v12: u8,
        v13: u8,
        v14: u8,
        v15: u8,
        v16: u8,
        v17: u8,
        v18: u8,
        v19: u8,
        v20: u8,
        v21: u8,
        v22: u8,
        v23: u8,
        v24: u8,
        v25: u8,
        v26: u8,
        v27: u8,
        v28: u8,
        v29: u8,
        v30: u8,
        v31: u8,
    ) -> Self {
        #[allow(clippy::cast_possible_wrap)]
        Self::from(_mm256_setr_epi8(
            v0 as i8, v1 as i8, v2 as i8, v3 as i8, v4 as i8, v5 as i8, v6 as i8, v7 as i8,
            v8 as i8, v9 as i8, v10 as i8, v11 as i8, v12 as i8, v13 as i8, v14 as i8, v15 as i8,
            v16 as i8, v17 as i8, v18 as i8, v19 as i8, v20 as i8, v21 as i8, v22 as i8, v23 as i8,
            v24 as i8, v25 as i8, v26 as i8, v27 as i8, v28 as i8, v29 as i8, v30 as i8, v31 as i8,
        ))
    }

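    // Broadcast the same 16 bytes into both 128-bit lanes, e.g. to serve as a
    // per-lane table for `lookup_16`.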
    #[target_feature(enable = "avx2")]
    #[inline]
    unsafe fn repeat_16(
        v0: u8,
        v1: u8,
        v2: u8,
        v3: u8,
        v4: u8,
        v5: u8,
        v6: u8,
        v7: u8,
        v8: u8,
        v9: u8,
        v10: u8,
        v11: u8,
        v12: u8,
        v13: u8,
        v14: u8,
        v15: u8,
    ) -> Self {
        #[allow(clippy::cast_possible_wrap)]
        Self::from_32_cut_off_leading(
            v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v0, v1, v2, v3,
            v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
        )
    }

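    // Unaligned 32-byte load; `_mm256_loadu_si256` has no alignment
    // requirement, which is why the pointer cast is allowed.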
    #[target_feature(enable = "avx2")]
    #[inline]
    unsafe fn load_from(ptr: *const u8) -> Self {
        #[allow(clippy::cast_ptr_alignment)]
        Self::from(_mm256_loadu_si256(ptr.cast::<__m256i>()))
    }

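    // 16-entry table lookup via `_mm256_shuffle_epi8` (vpshufb): the low
    // nibble of each byte of `self` selects a table entry. The shuffle works
    // within each 128-bit lane, hence the table is repeated into both lanes.
    // The validation tables here are indexed by 4-bit nibbles, so vpshufb's
    // zeroing of bytes with the high bit set does not come into play.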
    #[target_feature(enable = "avx2")]
    #[inline]
    unsafe fn lookup_16(
        self,
        v0: u8,
        v1: u8,
        v2: u8,
        v3: u8,
        v4: u8,
        v5: u8,
        v6: u8,
        v7: u8,
        v8: u8,
        v9: u8,
        v10: u8,
        v11: u8,
        v12: u8,
        v13: u8,
        v14: u8,
        v15: u8,
    ) -> Self {
        Self::from(_mm256_shuffle_epi8(
            Self::repeat_16(
                v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
            )
            .0,
            self.0,
        ))
    }

    #[target_feature(enable = "avx2")]
    #[inline]
    unsafe fn splat(val: u8) -> Self {
        #[allow(clippy::cast_possible_wrap)]
        Self::from(_mm256_set1_epi8(val as i8))
    }

    #[target_feature(enable = "avx2")]
    #[inline]
    unsafe fn splat0() -> Self {
        Self::from(_mm256_setzero_si256())
    }

    #[target_feature(enable = "avx2")]
    #[inline]
    unsafe fn or(self, b: Self) -> Self {
        Self::from(_mm256_or_si256(self.0, b.0))
    }

    #[target_feature(enable = "avx2")]
    #[inline]
    unsafe fn and(self, b: Self) -> Self {
        Self::from(_mm256_and_si256(self.0, b.0))
    }

    #[target_feature(enable = "avx2")]
    #[inline]
    unsafe fn xor(self, b: Self) -> Self {
        Self::from(_mm256_xor_si256(self.0, b.0))
    }

    #[target_feature(enable = "avx2")]
    #[inline]
    unsafe fn saturating_sub(self, b: Self) -> Self {
        Self::from(_mm256_subs_epu8(self.0, b.0))
    }

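    // Logical right shift of every byte by 4. AVX2 has no per-byte shift, so
    // this shifts 16-bit words instead and masks off the bits that bleed in
    // from the neighboring byte.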
    // ugly but shr<N> requires const generics
    #[target_feature(enable = "avx2")]
    #[inline]
    unsafe fn shr4(self) -> Self {
        Self::from(_mm256_srli_epi16(self.0, 4)).and(Self::splat(0xFF >> 4))
    }

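    // Each result byte is the input byte one position earlier in the stream.
    // `_mm256_alignr_epi8` only shifts within each 128-bit lane, so the
    // `_mm256_permute2x128_si256` with immediate 0x21 first assembles
    // [prev.high, self.low] to provide the bytes that cross lane boundaries.
    // `prev2` and `prev3` below use the same trick with larger offsets.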
    // ugly but prev<N> requires const generics
    #[target_feature(enable = "avx2")]
    #[inline]
    unsafe fn prev1(self, prev: Self) -> Self {
        Self::from(_mm256_alignr_epi8(
            self.0,
            _mm256_permute2x128_si256(prev.0, self.0, 0x21),
            16 - 1,
        ))
    }

    // ugly but prev<N> requires const generics
    #[target_feature(enable = "avx2")]
    #[inline]
    unsafe fn prev2(self, prev: Self) -> Self {
        Self::from(_mm256_alignr_epi8(
            self.0,
            _mm256_permute2x128_si256(prev.0, self.0, 0x21),
            16 - 2,
        ))
    }

    // ugly but prev<N> requires const generics
    #[target_feature(enable = "avx2")]
    #[inline]
    unsafe fn prev3(self, prev: Self) -> Self {
        Self::from(_mm256_alignr_epi8(
            self.0,
            _mm256_permute2x128_si256(prev.0, self.0, 0x21),
            16 - 3,
        ))
    }

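    // `_mm256_cmpgt_epi8` compares *signed* bytes; callers must arrange the
    // operands so that signed semantics give the intended result (see
    // `must_be_2_3_continuation` below).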
    #[target_feature(enable = "avx2")]
    #[inline]
    unsafe fn signed_gt(self, other: Self) -> Self {
        Self::from(_mm256_cmpgt_epi8(self.0, other.0))
    }

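    // `_mm256_testz_si256(a, a)` returns 1 exactly when all bits of `a` are
    // zero, so anything other than 1 means at least one bit is set.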
    #[target_feature(enable = "avx2")]
    #[inline]
    unsafe fn any_bit_set(self) -> bool {
        _mm256_testz_si256(self.0, self.0) != 1
    }

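    // The movemask gathers the high bit of every byte; a result of zero means
    // no byte has bit 7 set, i.e. all 32 bytes are ASCII.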
    #[target_feature(enable = "avx2")]
    #[inline]
    unsafe fn is_ascii(self) -> bool {
        _mm256_movemask_epi8(self.0) == 0
    }
}

impl From<__m256i> for SimdU8Value {
    #[inline]
    fn from(val: __m256i) -> Self {
        Self(val)
    }
}

impl Utf8CheckAlgorithm<SimdU8Value> {
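    // A byte must be a continuation if the byte two positions back is a
    // three-byte lead (>= 0xE0) or the byte three positions back is a
    // four-byte lead (>= 0xF0). The saturating subtractions leave a non-zero
    // byte exactly in those cases, and since the remainders are at most 0x20
    // the signed compare against zero is safe.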
    #[target_feature(enable = "avx2")]
    #[inline]
    unsafe fn must_be_2_3_continuation(prev2: SimdU8Value, prev3: SimdU8Value) -> SimdU8Value {
        let is_third_byte = prev2.saturating_sub(SimdU8Value::splat(0b1110_0000 - 1));
        let is_fourth_byte = prev3.saturating_sub(SimdU8Value::splat(0b1111_0000 - 1));

        is_third_byte
            .or(is_fourth_byte)
            .signed_gt(SimdU8Value::splat0())
    }
}

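// Prefetch the given address into all cache levels (T0 hint).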
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn simd_prefetch(ptr: *const u8) {
    _mm_prefetch(ptr.cast::<i8>(), _MM_HINT_T0);
}

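// `PREFETCH` enables prefetching in the macro-generated validation loop, and
// `TempSimdChunkA32` supplies the 32-byte-aligned scratch buffer used for the
// final partial chunk. The macros below instantiate the shared 256-bit input
// type and the validation driver for the "avx2" target feature.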
const PREFETCH: bool = true;
use crate::implementation::helpers::TempSimdChunkA32 as TempSimdChunk;
simd_input_256_bit!("avx2");
algorithm_simd!("avx2");