simdutf8/
basic.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
//! The `basic` API flavor provides barebones UTF-8 checking at the highest speed.
//!
//! It is fastest on valid UTF-8, but only checks for errors after processing the whole byte sequence
//! and does not provide detailed information if the data is not valid UTF-8. [`Utf8Error`] is a zero-sized error struct.
//!
//! If you need detailed error information use the functions from the [`crate::compat`] module instead.

use core::str::{from_utf8_unchecked, from_utf8_unchecked_mut};

use crate::implementation::validate_utf8_basic;

/// Simple zero-sized UTF-8 error.
///
/// No information is provided where the error occured or how long the invalid byte
/// byte sequence is.
#[derive(Copy, Eq, PartialEq, Clone, Debug)]
pub struct Utf8Error;

impl core::fmt::Display for Utf8Error {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        f.write_str("invalid utf-8 sequence")
    }
}

#[cfg(feature = "std")]
impl std::error::Error for Utf8Error {}

/// Analogue to [`std::str::from_utf8()`].
///
/// Checks if the passed byte sequence is valid UTF-8 and returns an
/// [`std::str`] reference to the passed byte slice wrapped in `Ok()` if it is.
///
/// # Errors
/// Will return the zero-sized Err([`Utf8Error`]) on if the input contains invalid UTF-8.
#[inline]
pub fn from_utf8(input: &[u8]) -> Result<&str, Utf8Error> {
    unsafe {
        validate_utf8_basic(input)?;
        Ok(from_utf8_unchecked(input))
    }
}

/// Analogue to [`std::str::from_utf8_mut()`].
///
/// Checks if the passed mutable byte sequence is valid UTF-8 and returns a mutable
/// [`std::str`] reference to the passed byte slice wrapped in `Ok()` if it is.
///
/// # Errors
/// Will return the zero-sized Err([`Utf8Error`]) on if the input contains invalid UTF-8.
#[inline]
pub fn from_utf8_mut(input: &mut [u8]) -> Result<&mut str, Utf8Error> {
    unsafe {
        validate_utf8_basic(input)?;
        Ok(from_utf8_unchecked_mut(input))
    }
}

/// Allows direct access to the platform-specific unsafe validation implementations.
#[cfg(feature = "public_imp")]
pub mod imp {
    use crate::basic;

    /// A low-level interfacne for streaming validation of UTF-8 data. It is meant to be integrated
    /// in high-performance data processing pipelines.
    ///
    /// Data can be streamed in arbitrarily-sized chunks using the [`Self::update()`] method. There is
    /// no way to find out if the input so far was valid UTF-8 during the validation. Only when
    /// the validation is completed with the [`Self::finalize()`] method the result of the validation is
    /// returned. Use [`ChunkedUtf8Validator`] if possible for highest performance.
    ///
    /// This implementation requires CPU SIMD features specified by the module it resides in.
    /// It is undefined behavior to use it if the required CPU features are not available which
    /// is why all trait methods are `unsafe`.
    ///
    /// General usage:
    /// ```rust
    /// use simdutf8::basic::imp::Utf8Validator;
    /// use std::io::{stdin, Read, Result};
    ///
    /// # #[cfg(target_arch = "x86_64")]
    /// fn main() -> Result<()> {
    ///     unsafe {
    ///         if !std::is_x86_feature_detected!("avx2") {
    ///             panic!("This example only works with CPUs supporting AVX 2");
    ///         }
    ///
    ///         let mut validator = simdutf8::basic::imp::x86::avx2::Utf8ValidatorImp::new();
    ///         let mut buf = vec![0; 8192];
    ///         loop {
    ///             let bytes_read = stdin().read(buf.as_mut())?;
    ///             if bytes_read == 0 {
    ///                 break;
    ///             }
    ///             validator.update(&buf);
    ///         }
    ///
    ///         if validator.finalize().is_ok() {
    ///             println!("Input is valid UTF-8");
    ///         } else {
    ///             println!("Input is not valid UTF-8");
    ///         }
    ///     }
    ///
    ///     Ok(())
    /// }
    ///
    /// # #[cfg(not(target_arch = "x86_64"))]
    /// # fn main() { }
    /// ```
    ///
    pub trait Utf8Validator {
        /// Creates a new validator.
        ///
        /// # Safety
        /// This implementation requires CPU SIMD features specified by the module it resides in.
        /// It is undefined behavior to call it if the required CPU features are not available.
        #[must_use]
        unsafe fn new() -> Self
        where
            Self: Sized;

        /// Updates the validator with `input`.
        ///
        /// # Safety
        /// This implementation requires CPU SIMD features specified by the module it resides in.
        /// It is undefined behavior to call it if the required CPU features are not available.
        unsafe fn update(&mut self, input: &[u8]);

        /// Finishes the validation and returns `Ok(())` if the input was valid UTF-8.
        ///
        /// # Errors
        /// A [`basic::Utf8Error`] is returned if the input was not valid UTF-8. No
        /// further information about the location of the error is provided.
        ///
        /// # Safety
        /// This implementation requires CPU SIMD features specified by the module it resides in.
        /// It is undefined behavior to call it if the required CPU features are not available.
        unsafe fn finalize(self) -> core::result::Result<(), basic::Utf8Error>;
    }

    /// Like [`Utf8Validator`] this low-level API is for streaming validation of UTF-8 data.
    /// It has additional restrictions imposed on how the input is passed in to allow
    /// validation with as little overhead as possible.
    ///
    /// To feed it data you need to call the [`Self::update_from_chunks()`] method which takes slices which
    /// have to be a multiple of 64 bytes long. The method will panic otherwise.  There is
    /// no way to find out if the input so far was valid UTF-8 during the validation. Only when
    /// the validation is completed with the [`Self::finalize()`] method the result of the validation is
    /// returned.
    ///
    /// The `Self::finalize()` method can be fed the rest of the data. There is no restriction on the
    /// data passed to it.
    ///
    /// This implementation requires CPU SIMD features specified by the module it resides in.
    /// It is undefined behavior to use it if the required CPU features are not available which
    /// is why all trait methods are `unsafe`.
    pub trait ChunkedUtf8Validator {
        /// Creates a new validator.
        ///
        /// # Safety
        /// This implementation requires CPU SIMD features specified by the module it resides in.
        /// It is undefined behavior to call it if the required CPU features are not available.
        #[must_use]
        unsafe fn new() -> Self
        where
            Self: Sized;

        /// Updates the validator with `input`.
        ///
        /// # Panics
        /// If `input.len()` is not a multiple of 64.
        ///
        /// # Safety
        /// This implementation requires CPU SIMD features specified by the module it resides in.
        /// It is undefined behavior to call it if the required CPU features are not available.
        unsafe fn update_from_chunks(&mut self, input: &[u8]);

        /// Updates the validator with remaining input if any. There is no restriction on the
        /// data provided.
        ///
        /// Finishes the validation and returns `Ok(())` if the input was valid UTF-8.
        ///
        /// # Errors
        /// A [`basic::Utf8Error`] is returned if the input was not valid UTF-8. No
        /// further information about the location of the error is provided.
        ///
        /// # Safety
        /// This implementation requires CPU SIMD features specified by the module it resides in.
        /// It is undefined behavior to call it if the required CPU features are not available.
        unsafe fn finalize(
            self,
            remaining_input: core::option::Option<&[u8]>,
        ) -> core::result::Result<(), basic::Utf8Error>;
    }

    /// Includes the x86/x86-64 SIMD implementations.
    #[cfg(all(any(target_arch = "x86", target_arch = "x86_64")))]
    pub mod x86 {
        /// Includes the validation implementation for AVX 2-compatible CPUs.
        ///
        /// Using the provided functionality on CPUs which do not support AVX 2 is undefined
        /// behavior and will very likely cause a crash.
        pub mod avx2 {
            pub use crate::implementation::x86::avx2::validate_utf8_basic as validate_utf8;
            pub use crate::implementation::x86::avx2::ChunkedUtf8ValidatorImp;
            pub use crate::implementation::x86::avx2::Utf8ValidatorImp;
        }
        /// Includes the validation implementation for SSE 4.2-compatible CPUs.
        ///
        /// Using the provided functionality on CPUs which do not support SSE 4.2 is undefined
        /// behavior and will very likely cause a crash.
        pub mod sse42 {
            pub use crate::implementation::x86::sse42::validate_utf8_basic as validate_utf8;
            pub use crate::implementation::x86::sse42::ChunkedUtf8ValidatorImp;
            pub use crate::implementation::x86::sse42::Utf8ValidatorImp;
        }
    }

    /// Includes the aarch64 SIMD implementations.
    #[cfg(all(feature = "aarch64_neon", target_arch = "aarch64"))]
    pub mod aarch64 {
        /// Includes the Neon-based validation implementation for aarch64 CPUs.
        ///
        /// Should be supported on all ARM64 CPUSs. If it is not supported by the operating
        /// system using it is undefined behavior and will likely cause a crash.
        pub mod neon {
            pub use crate::implementation::aarch64::neon::validate_utf8_basic as validate_utf8;
            pub use crate::implementation::aarch64::neon::ChunkedUtf8ValidatorImp;
            pub use crate::implementation::aarch64::neon::Utf8ValidatorImp;
        }
    }

    /// Includes the wasm32 SIMD implementations.
    #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
    pub mod wasm32 {
        /// Includes the simd128-based validation implementation for WASM runtimes.
        ///
        /// Using the provided functionality on WASM runtimes that do not support SIMD
        /// instructions will likely cause a crash.
        pub mod simd128 {
            pub use crate::implementation::wasm32::simd128::validate_utf8_basic as validate_utf8;
            pub use crate::implementation::wasm32::simd128::ChunkedUtf8ValidatorImp;
            pub use crate::implementation::wasm32::simd128::Utf8ValidatorImp;
        }
    }
}