simdutf8/
basic.rs

1//! The `basic` API flavor provides barebones UTF-8 checking at the highest speed.
2//!
3//! It is fastest on valid UTF-8, but only checks for errors after processing the whole byte sequence
4//! and does not provide detailed information if the data is not valid UTF-8. [`Utf8Error`] is a zero-sized error struct.
5//!
6//! If you need detailed error information use the functions from the [`crate::compat`] module instead.
7
8use core::str::{from_utf8_unchecked, from_utf8_unchecked_mut};
9
10use crate::implementation::validate_utf8_basic;
11
/// Zero-sized error signalling that the input was not valid UTF-8.
///
/// It carries no information about where the error occurred or how long the
/// invalid byte sequence is.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct Utf8Error;
18
19impl core::fmt::Display for Utf8Error {
20    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
21        f.write_str("invalid utf-8 sequence")
22    }
23}
24
// `std::error::Error` lives in `std`, so this impl is only compiled when the `std`
// feature is enabled. The empty body relies on the trait's default method implementations.
#[cfg(feature = "std")]
impl std::error::Error for Utf8Error {}
27
28/// Analogue to [`std::str::from_utf8()`].
29///
30/// Checks if the passed byte sequence is valid UTF-8 and returns an
31/// [`std::str`] reference to the passed byte slice wrapped in `Ok()` if it is.
32///
33/// # Errors
34/// Will return the zero-sized Err([`Utf8Error`]) on if the input contains invalid UTF-8.
35#[inline]
36pub fn from_utf8(input: &[u8]) -> Result<&str, Utf8Error> {
37    unsafe {
38        validate_utf8_basic(input)?;
39        Ok(from_utf8_unchecked(input))
40    }
41}
42
43/// Analogue to [`std::str::from_utf8_mut()`].
44///
45/// Checks if the passed mutable byte sequence is valid UTF-8 and returns a mutable
46/// [`std::str`] reference to the passed byte slice wrapped in `Ok()` if it is.
47///
48/// # Errors
49/// Will return the zero-sized Err([`Utf8Error`]) on if the input contains invalid UTF-8.
50#[inline]
51pub fn from_utf8_mut(input: &mut [u8]) -> Result<&mut str, Utf8Error> {
52    unsafe {
53        validate_utf8_basic(input)?;
54        Ok(from_utf8_unchecked_mut(input))
55    }
56}
57
58/// Allows direct access to the platform-specific unsafe validation implementations.
59#[cfg(feature = "public_imp")]
60pub mod imp {
61    use crate::basic;
62
63    /// A low-level interface for streaming validation of UTF-8 data. It is meant to be integrated
64    /// in high-performance data processing pipelines.
65    ///
66    /// Data can be streamed in arbitrarily-sized chunks using the [`Self::update()`] method. There is
67    /// no way to find out if the input so far was valid UTF-8 during the validation. Only when
68    /// the validation is completed with the [`Self::finalize()`] method the result of the validation is
69    /// returned. Use [`ChunkedUtf8Validator`] if possible for highest performance.
70    ///
71    /// This implementation requires CPU SIMD features specified by the module it resides in.
72    /// It is undefined behavior to use it if the required CPU features are not available which
73    /// is why all trait methods are `unsafe`.
74    ///
75    /// General usage:
76    /// ```rust
77    /// use simdutf8::basic::imp::Utf8Validator;
78    /// use std::io::{stdin, Read, Result};
79    ///
80    /// # #[cfg(target_arch = "x86_64")]
81    /// fn main() -> Result<()> {
82    ///     unsafe {
83    ///         if !std::is_x86_feature_detected!("avx2") {
84    ///             panic!("This example only works with CPUs supporting AVX 2");
85    ///         }
86    ///
87    ///         let mut validator = simdutf8::basic::imp::x86::avx2::Utf8ValidatorImp::new();
88    ///         let mut buf = vec![0; 8192];
89    ///         loop {
90    ///             let bytes_read = stdin().read(buf.as_mut())?;
91    ///             if bytes_read == 0 {
92    ///                 break;
93    ///             }
94    ///             validator.update(&buf);
95    ///         }
96    ///
97    ///         if validator.finalize().is_ok() {
98    ///             println!("Input is valid UTF-8");
99    ///         } else {
100    ///             println!("Input is not valid UTF-8");
101    ///         }
102    ///     }
103    ///
104    ///     Ok(())
105    /// }
106    ///
107    /// # #[cfg(not(target_arch = "x86_64"))]
108    /// # fn main() { }
109    /// ```
110    ///
111    pub trait Utf8Validator {
112        /// Creates a new validator.
113        ///
114        /// # Safety
115        /// This implementation requires CPU SIMD features specified by the module it resides in.
116        /// It is undefined behavior to call it if the required CPU features are not available.
117        #[must_use]
118        unsafe fn new() -> Self
119        where
120            Self: Sized;
121
122        /// Updates the validator with `input`.
123        ///
124        /// # Safety
125        /// This implementation requires CPU SIMD features specified by the module it resides in.
126        /// It is undefined behavior to call it if the required CPU features are not available.
127        unsafe fn update(&mut self, input: &[u8]);
128
129        /// Finishes the validation and returns `Ok(())` if the input was valid UTF-8.
130        ///
131        /// # Errors
132        /// A [`basic::Utf8Error`] is returned if the input was not valid UTF-8. No
133        /// further information about the location of the error is provided.
134        ///
135        /// # Safety
136        /// This implementation requires CPU SIMD features specified by the module it resides in.
137        /// It is undefined behavior to call it if the required CPU features are not available.
138        unsafe fn finalize(self) -> core::result::Result<(), basic::Utf8Error>;
139    }
140
    /// Like [`Utf8Validator`], this low-level API is for streaming validation of UTF-8 data.
    ///
    /// It has additional restrictions imposed on how the input is passed in to allow
    /// validation with as little overhead as possible.
    ///
    /// To feed it data you need to call the [`Self::update_from_chunks()`] method, which takes
    /// slices whose length has to be a multiple of 64 bytes. The method will panic otherwise.
    /// There is no way to find out during the validation whether the input so far was valid
    /// UTF-8. The result is only returned once the validation is completed with the
    /// [`Self::finalize()`] method.
    ///
    /// The [`Self::finalize()`] method can be fed the rest of the data. There is no restriction
    /// on the data passed to it.
    ///
    /// This implementation requires CPU SIMD features specified by the module it resides in.
    /// It is undefined behavior to use it if the required CPU features are not available, which
    /// is why all trait methods are `unsafe`.
    pub trait ChunkedUtf8Validator {
        /// Creates a new validator.
        ///
        /// # Safety
        /// This implementation requires CPU SIMD features specified by the module it resides in.
        /// It is undefined behavior to call it if the required CPU features are not available.
        #[must_use]
        unsafe fn new() -> Self
        where
            Self: Sized;

        /// Updates the validator with `input`.
        ///
        /// # Panics
        /// If `input.len()` is not a multiple of 64.
        ///
        /// # Safety
        /// This implementation requires CPU SIMD features specified by the module it resides in.
        /// It is undefined behavior to call it if the required CPU features are not available.
        unsafe fn update_from_chunks(&mut self, input: &[u8]);

        /// Updates the validator with the remaining input, if any, and finishes the validation.
        /// There is no restriction on the data provided in `remaining_input`.
        ///
        /// Returns `Ok(())` if the input was valid UTF-8.
        ///
        /// # Errors
        /// A [`basic::Utf8Error`] is returned if the input was not valid UTF-8. No
        /// further information about the location of the error is provided.
        ///
        /// # Safety
        /// This implementation requires CPU SIMD features specified by the module it resides in.
        /// It is undefined behavior to call it if the required CPU features are not available.
        unsafe fn finalize(
            self,
            remaining_input: core::option::Option<&[u8]>,
        ) -> core::result::Result<(), basic::Utf8Error>;
    }
196
197    /// Includes the x86/x86-64 SIMD implementations.
198    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
199    pub mod x86 {
200        /// Includes the validation implementation for AVX 2-compatible CPUs.
201        ///
202        /// Using the provided functionality on CPUs which do not support AVX 2 is undefined
203        /// behavior and will very likely cause a crash.
204        pub mod avx2 {
205            pub use crate::implementation::x86::avx2::validate_utf8_basic as validate_utf8;
206            pub use crate::implementation::x86::avx2::ChunkedUtf8ValidatorImp;
207            pub use crate::implementation::x86::avx2::Utf8ValidatorImp;
208        }
209        /// Includes the validation implementation for SSE 4.2-compatible CPUs.
210        ///
211        /// Using the provided functionality on CPUs which do not support SSE 4.2 is undefined
212        /// behavior and will very likely cause a crash.
213        pub mod sse42 {
214            pub use crate::implementation::x86::sse42::validate_utf8_basic as validate_utf8;
215            pub use crate::implementation::x86::sse42::ChunkedUtf8ValidatorImp;
216            pub use crate::implementation::x86::sse42::Utf8ValidatorImp;
217        }
218    }
219
220    /// Includes the aarch64 SIMD implementations.
221    #[cfg(all(feature = "aarch64_neon", target_arch = "aarch64"))]
222    pub mod aarch64 {
223        /// Includes the Neon-based validation implementation for aarch64 CPUs.
224        ///
225        /// Should be supported on all ARM64 CPUSs. If it is not supported by the operating
226        /// system using it is undefined behavior and will likely cause a crash.
227        pub mod neon {
228            pub use crate::implementation::aarch64::neon::validate_utf8_basic as validate_utf8;
229            pub use crate::implementation::aarch64::neon::ChunkedUtf8ValidatorImp;
230            pub use crate::implementation::aarch64::neon::Utf8ValidatorImp;
231        }
232    }
233
234    /// Includes the wasm32 SIMD implementations.
235    #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
236    pub mod wasm32 {
237        /// Includes the simd128-based validation implementation for WASM runtimes.
238        ///
239        /// Using the provided functionality on WASM runtimes that do not support SIMD
240        /// instructions will likely cause a crash.
241        pub mod simd128 {
242            pub use crate::implementation::wasm32::simd128::validate_utf8_basic as validate_utf8;
243            pub use crate::implementation::wasm32::simd128::ChunkedUtf8ValidatorImp;
244            pub use crate::implementation::wasm32::simd128::Utf8ValidatorImp;
245        }
246    }
247}