// simdutf8/implementation/x86/sse42.rs

1//! Contains the x86-64/x86 SSE4.2 UTF-8 validation implementation.
2
3#![allow(clippy::too_many_arguments)]
4
5#[cfg(target_arch = "x86")]
6use core::arch::x86::{
7    __m128i, _mm_alignr_epi8, _mm_and_si128, _mm_cmpgt_epi8, _mm_loadu_si128, _mm_movemask_epi8,
8    _mm_or_si128, _mm_prefetch, _mm_set1_epi8, _mm_setr_epi8, _mm_setzero_si128, _mm_shuffle_epi8,
9    _mm_srli_epi16, _mm_subs_epu8, _mm_testz_si128, _mm_xor_si128, _MM_HINT_T0,
10};
11#[cfg(target_arch = "x86_64")]
12use core::arch::x86_64::{
13    __m128i, _mm_alignr_epi8, _mm_and_si128, _mm_cmpgt_epi8, _mm_loadu_si128, _mm_movemask_epi8,
14    _mm_or_si128, _mm_prefetch, _mm_set1_epi8, _mm_setr_epi8, _mm_setzero_si128, _mm_shuffle_epi8,
15    _mm_srli_epi16, _mm_subs_epu8, _mm_testz_si128, _mm_xor_si128, _MM_HINT_T0,
16};
17
18use crate::implementation::helpers::Utf8CheckAlgorithm;
19
20// SSE 4.2 SIMD primitives
21
/// The generic `SimdU8Value` helper wrapper instantiated with the 128-bit `__m128i`
/// vector type; all SSE 4.2 primitives in this file are implemented on this alias.
type SimdU8Value = crate::implementation::helpers::SimdU8Value<__m128i>;
23
24impl SimdU8Value {
25    #[target_feature(enable = "sse4.2")]
26    #[inline]
27    unsafe fn from_32_cut_off_leading(
28        _v0: u8,
29        _v1: u8,
30        _v2: u8,
31        _v3: u8,
32        _v4: u8,
33        _v5: u8,
34        _v6: u8,
35        _v7: u8,
36        _v8: u8,
37        _v9: u8,
38        _v10: u8,
39        _v11: u8,
40        _v12: u8,
41        _v13: u8,
42        _v14: u8,
43        _v15: u8,
44        v16: u8,
45        v17: u8,
46        v18: u8,
47        v19: u8,
48        v20: u8,
49        v21: u8,
50        v22: u8,
51        v23: u8,
52        v24: u8,
53        v25: u8,
54        v26: u8,
55        v27: u8,
56        v28: u8,
57        v29: u8,
58        v30: u8,
59        v31: u8,
60    ) -> Self {
61        #[allow(clippy::cast_possible_wrap)]
62        Self::from(_mm_setr_epi8(
63            v16 as i8, v17 as i8, v18 as i8, v19 as i8, v20 as i8, v21 as i8, v22 as i8, v23 as i8,
64            v24 as i8, v25 as i8, v26 as i8, v27 as i8, v28 as i8, v29 as i8, v30 as i8, v31 as i8,
65        ))
66    }
67
68    #[target_feature(enable = "sse4.2")]
69    #[inline]
70    unsafe fn repeat_16(
71        v0: u8,
72        v1: u8,
73        v2: u8,
74        v3: u8,
75        v4: u8,
76        v5: u8,
77        v6: u8,
78        v7: u8,
79        v8: u8,
80        v9: u8,
81        v10: u8,
82        v11: u8,
83        v12: u8,
84        v13: u8,
85        v14: u8,
86        v15: u8,
87    ) -> Self {
88        #[allow(clippy::cast_possible_wrap)]
89        Self::from(_mm_setr_epi8(
90            v0 as i8, v1 as i8, v2 as i8, v3 as i8, v4 as i8, v5 as i8, v6 as i8, v7 as i8,
91            v8 as i8, v9 as i8, v10 as i8, v11 as i8, v12 as i8, v13 as i8, v14 as i8, v15 as i8,
92        ))
93    }
94
95    #[target_feature(enable = "sse4.2")]
96    #[inline]
97    unsafe fn load_from(ptr: *const u8) -> Self {
98        #[allow(clippy::cast_ptr_alignment)]
99        Self::from(_mm_loadu_si128(ptr.cast::<__m128i>()))
100    }
101
102    #[target_feature(enable = "sse4.2")]
103    #[inline]
104    unsafe fn lookup_16(
105        self,
106        v0: u8,
107        v1: u8,
108        v2: u8,
109        v3: u8,
110        v4: u8,
111        v5: u8,
112        v6: u8,
113        v7: u8,
114        v8: u8,
115        v9: u8,
116        v10: u8,
117        v11: u8,
118        v12: u8,
119        v13: u8,
120        v14: u8,
121        v15: u8,
122    ) -> Self {
123        Self::from(_mm_shuffle_epi8(
124            Self::repeat_16(
125                v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
126            )
127            .0,
128            self.0,
129        ))
130    }
131
132    #[target_feature(enable = "sse4.2")]
133    #[inline]
134    unsafe fn splat(val: u8) -> Self {
135        #[allow(clippy::cast_possible_wrap)]
136        Self::from(_mm_set1_epi8(val as i8))
137    }
138
139    #[target_feature(enable = "sse4.2")]
140    #[inline]
141    unsafe fn splat0() -> Self {
142        Self::from(_mm_setzero_si128())
143    }
144
145    #[target_feature(enable = "sse4.2")]
146    #[inline]
147    unsafe fn or(self, b: Self) -> Self {
148        Self::from(_mm_or_si128(self.0, b.0))
149    }
150
151    #[target_feature(enable = "sse4.2")]
152    #[inline]
153    unsafe fn and(self, b: Self) -> Self {
154        Self::from(_mm_and_si128(self.0, b.0))
155    }
156
157    #[target_feature(enable = "sse4.2")]
158    #[inline]
159    unsafe fn xor(self, b: Self) -> Self {
160        Self::from(_mm_xor_si128(self.0, b.0))
161    }
162
163    #[target_feature(enable = "sse4.2")]
164    #[inline]
165    unsafe fn saturating_sub(self, b: Self) -> Self {
166        Self::from(_mm_subs_epu8(self.0, b.0))
167    }
168
169    // ugly but shr<N> requires const generics
170    #[target_feature(enable = "sse4.2")]
171    #[inline]
172    unsafe fn shr4(self) -> Self {
173        Self::from(_mm_srli_epi16(self.0, 4)).and(Self::splat(0xFF >> 4))
174    }
175
176    // ugly but prev<N> requires const generics
177    #[target_feature(enable = "sse4.2")]
178    #[inline]
179    unsafe fn prev1(self, prev: Self) -> Self {
180        Self::from(_mm_alignr_epi8(self.0, prev.0, 16 - 1))
181    }
182
183    // ugly but prev<N> requires const generics
184    #[target_feature(enable = "sse4.2")]
185    #[inline]
186    unsafe fn prev2(self, prev: Self) -> Self {
187        Self::from(_mm_alignr_epi8(self.0, prev.0, 16 - 2))
188    }
189
190    // ugly but prev<N> requires const generics
191    #[target_feature(enable = "sse4.2")]
192    #[inline]
193    unsafe fn prev3(self, prev: Self) -> Self {
194        Self::from(_mm_alignr_epi8(self.0, prev.0, 16 - 3))
195    }
196
197    #[target_feature(enable = "sse4.2")]
198    #[inline]
199    unsafe fn signed_gt(self, other: Self) -> Self {
200        Self::from(_mm_cmpgt_epi8(self.0, other.0))
201    }
202
203    #[target_feature(enable = "sse4.2")]
204    #[inline]
205    unsafe fn any_bit_set(self) -> bool {
206        _mm_testz_si128(self.0, self.0) != 1
207    }
208
209    #[target_feature(enable = "sse4.2")]
210    #[inline]
211    unsafe fn is_ascii(self) -> bool {
212        _mm_movemask_epi8(self.0) == 0
213    }
214}
215
216impl From<__m128i> for SimdU8Value {
217    #[inline]
218    fn from(val: __m128i) -> Self {
219        Self(val)
220    }
221}
222
223impl Utf8CheckAlgorithm<SimdU8Value> {
224    #[target_feature(enable = "sse4.2")]
225    #[inline]
226    unsafe fn must_be_2_3_continuation(prev2: SimdU8Value, prev3: SimdU8Value) -> SimdU8Value {
227        let is_third_byte = prev2.saturating_sub(SimdU8Value::splat(0b1110_0000 - 1));
228        let is_fourth_byte = prev3.saturating_sub(SimdU8Value::splat(0b1111_0000 - 1));
229
230        is_third_byte
231            .or(is_fourth_byte)
232            .signed_gt(SimdU8Value::splat0())
233    }
234}
235
/// Issues a prefetch hint with the `_MM_HINT_T0` locality for the memory at `ptr`.
///
/// # Safety
///
/// The CPU must support SSE 4.2.
#[target_feature(enable = "sse4.2")]
#[inline]
unsafe fn simd_prefetch(ptr: *const u8) {
    // The intrinsic expects a `*const i8`; only the pointee type changes.
    let address = ptr.cast::<i8>();
    _mm_prefetch(address, _MM_HINT_T0);
}
241
// Compile-time flag, set to `false` for this implementation. It is not referenced
// directly in this file; presumably the macro expansions below read it to decide
// whether `simd_prefetch` gets called — TODO confirm in the macro definitions.
const PREFETCH: bool = false;
// Makes the `TempSimdChunkA16` helper available under the name `TempSimdChunk`.
// NOTE(review): presumably the macros below refer to it by this alias — confirm.
use crate::implementation::helpers::TempSimdChunkA16 as TempSimdChunk;
// Macro invocations parameterized with the "sse4.2" feature string; the macros are
// defined outside this file. NOTE(review): they presumably generate the 128-bit input
// type and the validation algorithm on top of the primitives above — confirm.
simd_input_128_bit!("sse4.2");
algorithm_simd!("sse4.2");