#![allow(clippy::too_many_arguments)]

#[cfg(target_arch = "x86")]
use core::arch::x86::{
    __m256i, _mm256_alignr_epi8, _mm256_and_si256, _mm256_cmpgt_epi8, _mm256_loadu_si256,
    _mm256_movemask_epi8, _mm256_or_si256, _mm256_permute2x128_si256, _mm256_set1_epi8,
    _mm256_setr_epi8, _mm256_setzero_si256, _mm256_shuffle_epi8, _mm256_srli_epi16,
    _mm256_subs_epu8, _mm256_testz_si256, _mm256_xor_si256, _mm_prefetch, _MM_HINT_T0,
};
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::{
    __m256i, _mm256_alignr_epi8, _mm256_and_si256, _mm256_cmpgt_epi8, _mm256_loadu_si256,
    _mm256_movemask_epi8, _mm256_or_si256, _mm256_permute2x128_si256, _mm256_set1_epi8,
    _mm256_setr_epi8, _mm256_setzero_si256, _mm256_shuffle_epi8, _mm256_srli_epi16,
    _mm256_subs_epu8, _mm256_testz_si256, _mm256_xor_si256, _mm_prefetch, _MM_HINT_T0,
};

use crate::implementation::helpers::Utf8CheckAlgorithm;

type SimdU8Value = crate::implementation::helpers::SimdU8Value<__m256i>;

impl SimdU8Value {
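    // Builds the vector from 32 explicit byte values in memory order; the casts to `i8`
    // exist only because `_mm256_setr_epi8` takes signed arguments.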
    #[target_feature(enable = "avx2")]
    #[inline]
    unsafe fn from_32_cut_off_leading(
        v0: u8,
        v1: u8,
        v2: u8,
        v3: u8,
        v4: u8,
        v5: u8,
        v6: u8,
        v7: u8,
        v8: u8,
        v9: u8,
        v10: u8,
        v11: u8,
        v12: u8,
        v13: u8,
        v14: u8,
        v15: u8,
        v16: u8,
        v17: u8,
        v18: u8,
        v19: u8,
        v20: u8,
        v21: u8,
        v22: u8,
        v23: u8,
        v24: u8,
        v25: u8,
        v26: u8,
        v27: u8,
        v28: u8,
        v29: u8,
        v30: u8,
        v31: u8,
    ) -> Self {
        #[allow(clippy::cast_possible_wrap)]
        Self::from(_mm256_setr_epi8(
            v0 as i8, v1 as i8, v2 as i8, v3 as i8, v4 as i8, v5 as i8, v6 as i8, v7 as i8,
            v8 as i8, v9 as i8, v10 as i8, v11 as i8, v12 as i8, v13 as i8, v14 as i8, v15 as i8,
            v16 as i8, v17 as i8, v18 as i8, v19 as i8, v20 as i8, v21 as i8, v22 as i8, v23 as i8,
            v24 as i8, v25 as i8, v26 as i8, v27 as i8, v28 as i8, v29 as i8, v30 as i8, v31 as i8,
        ))
    }

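    // Repeats the given 16 bytes in both 128-bit lanes of the vector.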
    #[target_feature(enable = "avx2")]
    #[inline]
    unsafe fn repeat_16(
        v0: u8,
        v1: u8,
        v2: u8,
        v3: u8,
        v4: u8,
        v5: u8,
        v6: u8,
        v7: u8,
        v8: u8,
        v9: u8,
        v10: u8,
        v11: u8,
        v12: u8,
        v13: u8,
        v14: u8,
        v15: u8,
    ) -> Self {
        #[allow(clippy::cast_possible_wrap)]
        Self::from_32_cut_off_leading(
            v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v0, v1, v2, v3,
            v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
        )
    }

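    // Unaligned 32-byte load; `_mm256_loadu_si256` has no alignment requirement.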
    #[target_feature(enable = "avx2")]
    #[inline]
    unsafe fn load_from(ptr: *const u8) -> Self {
        #[allow(clippy::cast_ptr_alignment)]
        Self::from(_mm256_loadu_si256(ptr.cast::<__m256i>()))
    }

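    // Per-byte table lookup: `_mm256_shuffle_epi8` uses the low nibble of each byte of
    // `self` as an index into the 16-entry table repeated in both lanes (bytes with the
    // high bit set select zero instead).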
    #[target_feature(enable = "avx2")]
    #[inline]
    unsafe fn lookup_16(
        self,
        v0: u8,
        v1: u8,
        v2: u8,
        v3: u8,
        v4: u8,
        v5: u8,
        v6: u8,
        v7: u8,
        v8: u8,
        v9: u8,
        v10: u8,
        v11: u8,
        v12: u8,
        v13: u8,
        v14: u8,
        v15: u8,
    ) -> Self {
        Self::from(_mm256_shuffle_epi8(
            Self::repeat_16(
                v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
            )
            .0,
            self.0,
        ))
    }

    #[target_feature(enable = "avx2")]
    #[inline]
    unsafe fn splat(val: u8) -> Self {
        #[allow(clippy::cast_possible_wrap)]
        Self::from(_mm256_set1_epi8(val as i8))
    }

    #[target_feature(enable = "avx2")]
    #[inline]
    unsafe fn splat0() -> Self {
        Self::from(_mm256_setzero_si256())
    }

    #[target_feature(enable = "avx2")]
    #[inline]
    unsafe fn or(self, b: Self) -> Self {
        Self::from(_mm256_or_si256(self.0, b.0))
    }

    #[target_feature(enable = "avx2")]
    #[inline]
    unsafe fn and(self, b: Self) -> Self {
        Self::from(_mm256_and_si256(self.0, b.0))
    }

    #[target_feature(enable = "avx2")]
    #[inline]
    unsafe fn xor(self, b: Self) -> Self {
        Self::from(_mm256_xor_si256(self.0, b.0))
    }

    #[target_feature(enable = "avx2")]
    #[inline]
    unsafe fn saturating_sub(self, b: Self) -> Self {
        Self::from(_mm256_subs_epu8(self.0, b.0))
    }

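    // Shifts every byte right by four bits. AVX2 has no 8-bit shift, so shift the 16-bit
    // lanes and mask off the bits that crossed over from the neighboring byte.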
    #[target_feature(enable = "avx2")]
    #[inline]
    unsafe fn shr4(self) -> Self {
        Self::from(_mm256_srli_epi16(self.0, 4)).and(Self::splat(0xFF >> 4))
    }

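    // Shifts the 32-byte window back by one byte, pulling in the last byte of `prev`.
    // The permute places `prev`'s upper half next to `self`'s lower half so that
    // `_mm256_alignr_epi8` can align across the 128-bit lane boundary.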
    #[target_feature(enable = "avx2")]
    #[inline]
    unsafe fn prev1(self, prev: Self) -> Self {
        Self::from(_mm256_alignr_epi8(
            self.0,
            _mm256_permute2x128_si256(prev.0, self.0, 0x21),
            16 - 1,
        ))
    }

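    // Same as `prev1`, but shifted back by two bytes.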
    #[target_feature(enable = "avx2")]
    #[inline]
    unsafe fn prev2(self, prev: Self) -> Self {
        Self::from(_mm256_alignr_epi8(
            self.0,
            _mm256_permute2x128_si256(prev.0, self.0, 0x21),
            16 - 2,
        ))
    }

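    // Same as `prev1`, but shifted back by three bytes.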
    #[target_feature(enable = "avx2")]
    #[inline]
    unsafe fn prev3(self, prev: Self) -> Self {
        Self::from(_mm256_alignr_epi8(
            self.0,
            _mm256_permute2x128_si256(prev.0, self.0, 0x21),
            16 - 3,
        ))
    }

    #[target_feature(enable = "avx2")]
    #[inline]
    unsafe fn signed_gt(self, other: Self) -> Self {
        Self::from(_mm256_cmpgt_epi8(self.0, other.0))
    }

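    // `_mm256_testz_si256(v, v)` returns 1 exactly when the vector is all zeros.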
    #[target_feature(enable = "avx2")]
    #[inline]
    unsafe fn any_bit_set(self) -> bool {
        _mm256_testz_si256(self.0, self.0) != 1
    }

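    // `_mm256_movemask_epi8` gathers the high bit of every byte; a zero mask means no
    // byte is >= 0x80, i.e. the whole vector is ASCII.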
    #[target_feature(enable = "avx2")]
    #[inline]
    unsafe fn is_ascii(self) -> bool {
        _mm256_movemask_epi8(self.0) == 0
    }
}

impl From<__m256i> for SimdU8Value {
    #[inline]
    fn from(val: __m256i) -> Self {
        Self(val)
    }
}

impl Utf8CheckAlgorithm<SimdU8Value> {
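    // A byte two positions back that is >= 0xE0 starts a three-byte sequence, and a byte
    // three positions back that is >= 0xF0 starts a four-byte sequence. The saturating
    // subtractions are non-zero (and positive as i8) exactly in those cases, so the final
    // signed compare against zero marks positions that must be continuation bytes.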
    #[target_feature(enable = "avx2")]
    #[inline]
    unsafe fn must_be_2_3_continuation(prev2: SimdU8Value, prev3: SimdU8Value) -> SimdU8Value {
        let is_third_byte = prev2.saturating_sub(SimdU8Value::splat(0b1110_0000 - 1));
        let is_fourth_byte = prev3.saturating_sub(SimdU8Value::splat(0b1111_0000 - 1));

        is_third_byte
            .or(is_fourth_byte)
            .signed_gt(SimdU8Value::splat0())
    }
}

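// Prefetches the cache line at `ptr` into all cache levels (`_MM_HINT_T0`).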
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn simd_prefetch(ptr: *const u8) {
    _mm_prefetch(ptr.cast::<i8>(), _MM_HINT_T0);
}

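// Instantiate the shared 256-bit SIMD input type and validation algorithm for this
// AVX2 implementation, with software prefetching enabled.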
const PREFETCH: bool = true;
use crate::implementation::helpers::TempSimdChunkA32 as TempSimdChunk;
simd_input_256_bit!("avx2");
algorithm_simd!("avx2");