simdutf8/implementation/
helpers.rs

1type Utf8ErrorCompat = crate::compat::Utf8Error;
2
3#[inline]
4pub(crate) fn validate_utf8_at_offset(input: &[u8], offset: usize) -> Result<(), Utf8ErrorCompat> {
5    #[allow(clippy::cast_possible_truncation)]
6    match core::str::from_utf8(&input[offset..]) {
7        Ok(_) => Ok(()),
8        Err(err) => Err(Utf8ErrorCompat {
9            valid_up_to: err.valid_up_to() + offset,
10            error_len: err.error_len().map(|len| {
11                // never truncates since std::str::err::Utf8Error::error_len() never returns value larger than 4
12                len as u8
13            }),
14        }),
15    }
16}
17
18#[cold]
19#[allow(dead_code)]
20#[allow(clippy::unwrap_used)]
21pub(crate) fn get_compat_error(input: &[u8], failing_block_pos: usize) -> Utf8ErrorCompat {
22    let offset = if failing_block_pos == 0 {
23        // Error must be in this block since it is the first.
24        0
25    } else {
26        // The previous block is OK except for a possible continuation over the block boundary.
27        // We go backwards over the last three bytes of the previous block and find the
28        // last non-continuation byte as a starting point for an std validation. If the last
29        // three bytes are all continuation bytes then the previous block ends with a four byte
30        // UTF-8 codepoint, is thus complete and valid UTF-8. We start the check with the
31        // current block in that case.
32        (1..=3)
33            .find(|i| input[failing_block_pos - i] >> 6 != 0b10)
34            .map_or(failing_block_pos, |i| failing_block_pos - i)
35    };
36    // UNWRAP: safe because the SIMD UTF-8 validation found an error
37    validate_utf8_at_offset(input, offset).unwrap_err()
38}
39
/// Copies `len` bytes from `src` to `dest` without any alignment requirement.
///
/// The copy is performed as at most one 32-byte, one 16-byte and one 8-byte
/// step (each built from unaligned `u64` transfers), followed by a
/// byte-by-byte loop for whatever remains. Lengths of 64 or more are still
/// copied correctly, but everything beyond the first 56 bytes goes through the
/// byte loop.
///
/// # Safety
///
/// * `src` must be valid for reads of `len` bytes.
/// * `dest` must be valid for writes of `len` bytes.
/// * As the name states, the two regions are expected not to overlap; the
///   word-wise copy does not provide `memmove` semantics.
#[allow(dead_code)]
pub(crate) unsafe fn memcpy_unaligned_nonoverlapping_inline_opt_lt_64(
    mut src: *const u8,
    mut dest: *mut u8,
    mut len: usize,
) {
    // This gets properly auto-vectorized on AVX 2 and SSE 4.2
    /// Moves eight bytes from `*from` to `*to` and advances both pointers by eight.
    #[inline]
    #[allow(clippy::cast_ptr_alignment)]
    unsafe fn copy_word(from: &mut *const u8, to: &mut *mut u8) {
        let word = core::ptr::read_unaligned((*from).cast::<u64>());
        core::ptr::write_unaligned((*to).cast::<u64>(), word);
        *from = (*from).add(8);
        *to = (*to).add(8);
    }

    if len >= 32 {
        copy_word(&mut src, &mut dest);
        copy_word(&mut src, &mut dest);
        copy_word(&mut src, &mut dest);
        copy_word(&mut src, &mut dest);
        len -= 32;
    }
    if len >= 16 {
        copy_word(&mut src, &mut dest);
        copy_word(&mut src, &mut dest);
        len -= 16;
    }
    if len >= 8 {
        copy_word(&mut src, &mut dest);
        len -= 8;
    }
    // Whatever is left is copied one byte at a time.
    for _ in 0..len {
        *dest = *src;
        src = src.add(1);
        dest = dest.add(1);
    }
}
78
/// Size in bytes of one SIMD chunk; also the length of the temporary chunk
/// buffers declared in this file.
pub(crate) const SIMD_CHUNK_SIZE: usize = 64;
80
/// State of a UTF-8 check, generic over the value type `T` it operates on.
///
/// The struct uses C field layout and is aligned to 32 bytes.
///
/// NOTE(review): the field meanings below are inferred from their names only;
/// confirm against the implementations that use this type.
#[allow(dead_code)]
#[repr(C, align(32))]
pub(crate) struct Utf8CheckAlgorithm<T> {
    /// Presumably the previously processed input value.
    pub(crate) prev: T,
    /// Presumably marks a sequence left incomplete at the end of the processed input.
    pub(crate) incomplete: T,
    /// Presumably the accumulated error state.
    pub(crate) error: T,
}
88
89#[repr(C, align(16))]
90#[allow(dead_code)]
91pub(crate) struct TempSimdChunkA16(pub(crate) [u8; SIMD_CHUNK_SIZE]);
92
93#[allow(dead_code)]
94impl TempSimdChunkA16 {
95    #[inline]
96    pub(crate) const fn new() -> Self {
97        Self([0; SIMD_CHUNK_SIZE])
98    }
99}
100
101#[repr(C, align(32))]
102#[allow(dead_code)]
103pub(crate) struct TempSimdChunkA32(pub(crate) [u8; SIMD_CHUNK_SIZE]);
104
105#[allow(dead_code)]
106impl TempSimdChunkA32 {
107    #[inline]
108    pub(crate) const fn new() -> Self {
109        Self([0; SIMD_CHUNK_SIZE])
110    }
111}
112
/// Copyable newtype wrapper around a single value of type `T`.
///
/// NOTE(review): presumably `T` is a platform-specific SIMD vector of `u8`
/// lanes — confirm against the per-architecture implementations.
#[allow(dead_code)]
#[derive(Copy, Clone)]
pub(crate) struct SimdU8Value<T: Copy>(pub(crate) T);