simdutf8/basic.rs
1//! The `basic` API flavor provides barebones UTF-8 checking at the highest speed.
2//!
3//! It is fastest on valid UTF-8, but only checks for errors after processing the whole byte sequence
4//! and does not provide detailed information if the data is not valid UTF-8. [`Utf8Error`] is a zero-sized error struct.
5//!
6//! If you need detailed error information use the functions from the [`crate::compat`] module instead.
7
8use core::str::{from_utf8_unchecked, from_utf8_unchecked_mut};
9
10use crate::implementation::validate_utf8_basic;
11
/// Zero-sized error signalling that a byte sequence is not valid UTF-8.
///
/// It carries no payload: neither the position at which the error occurred nor the
/// length of the invalid byte sequence is reported.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct Utf8Error;
18
19impl core::fmt::Display for Utf8Error {
20 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
21 f.write_str("invalid utf-8 sequence")
22 }
23}
24
// Only compiled when the `std` feature is enabled, because the impl names
// `std::error::Error`. The body is empty, so every trait method keeps its
// default implementation.
#[cfg(feature = "std")]
impl std::error::Error for Utf8Error {}
27
28/// Analogue to [`std::str::from_utf8()`].
29///
30/// Checks if the passed byte sequence is valid UTF-8 and returns an
31/// [`std::str`] reference to the passed byte slice wrapped in `Ok()` if it is.
32///
33/// # Errors
34/// Will return the zero-sized Err([`Utf8Error`]) on if the input contains invalid UTF-8.
35#[inline]
36pub fn from_utf8(input: &[u8]) -> Result<&str, Utf8Error> {
37 unsafe {
38 validate_utf8_basic(input)?;
39 Ok(from_utf8_unchecked(input))
40 }
41}
42
43/// Analogue to [`std::str::from_utf8_mut()`].
44///
45/// Checks if the passed mutable byte sequence is valid UTF-8 and returns a mutable
46/// [`std::str`] reference to the passed byte slice wrapped in `Ok()` if it is.
47///
48/// # Errors
49/// Will return the zero-sized Err([`Utf8Error`]) on if the input contains invalid UTF-8.
50#[inline]
51pub fn from_utf8_mut(input: &mut [u8]) -> Result<&mut str, Utf8Error> {
52 unsafe {
53 validate_utf8_basic(input)?;
54 Ok(from_utf8_unchecked_mut(input))
55 }
56}
57
/// Allows direct access to the platform-specific unsafe validation implementations.
#[cfg(feature = "public_imp")]
pub mod imp {
    use crate::basic;

    /// A low-level interface for streaming validation of UTF-8 data. It is meant to be integrated
    /// in high-performance data processing pipelines.
    ///
    /// Data can be streamed in arbitrarily-sized chunks using the [`Self::update()`] method. There is
    /// no way to find out if the input so far was valid UTF-8 during the validation. Only when
    /// the validation is completed with the [`Self::finalize()`] method the result of the validation is
    /// returned. Use [`ChunkedUtf8Validator`] if possible for highest performance.
    ///
    /// This implementation requires CPU SIMD features specified by the module it resides in.
    /// It is undefined behavior to use it if the required CPU features are not available which
    /// is why all trait methods are `unsafe`.
    ///
    /// General usage:
    /// ```rust
    /// use simdutf8::basic::imp::Utf8Validator;
    /// use std::io::{stdin, Read, Result};
    ///
    /// # #[cfg(target_arch = "x86_64")]
    /// fn main() -> Result<()> {
    ///     unsafe {
    ///         if !std::is_x86_feature_detected!("avx2") {
    ///             panic!("This example only works with CPUs supporting AVX 2");
    ///         }
    ///
    ///         let mut validator = simdutf8::basic::imp::x86::avx2::Utf8ValidatorImp::new();
    ///         let mut buf = vec![0; 8192];
    ///         loop {
    ///             let bytes_read = stdin().read(buf.as_mut())?;
    ///             if bytes_read == 0 {
    ///                 break;
    ///             }
    ///             // Only the first `bytes_read` bytes were filled by this read; the
    ///             // rest of `buf` holds stale data and must not be validated.
    ///             validator.update(&buf[..bytes_read]);
    ///         }
    ///
    ///         if validator.finalize().is_ok() {
    ///             println!("Input is valid UTF-8");
    ///         } else {
    ///             println!("Input is not valid UTF-8");
    ///         }
    ///     }
    ///
    ///     Ok(())
    /// }
    ///
    /// # #[cfg(not(target_arch = "x86_64"))]
    /// # fn main() { }
    /// ```
    pub trait Utf8Validator {
        /// Creates a new validator.
        ///
        /// # Safety
        /// This implementation requires CPU SIMD features specified by the module it resides in.
        /// It is undefined behavior to call it if the required CPU features are not available.
        #[must_use]
        unsafe fn new() -> Self
        where
            Self: Sized;

        /// Updates the validator with `input`.
        ///
        /// # Safety
        /// This implementation requires CPU SIMD features specified by the module it resides in.
        /// It is undefined behavior to call it if the required CPU features are not available.
        unsafe fn update(&mut self, input: &[u8]);

        /// Finishes the validation and returns `Ok(())` if the input was valid UTF-8.
        ///
        /// # Errors
        /// A [`basic::Utf8Error`] is returned if the input was not valid UTF-8. No
        /// further information about the location of the error is provided.
        ///
        /// # Safety
        /// This implementation requires CPU SIMD features specified by the module it resides in.
        /// It is undefined behavior to call it if the required CPU features are not available.
        unsafe fn finalize(self) -> core::result::Result<(), basic::Utf8Error>;
    }

    /// Like [`Utf8Validator`] this low-level API is for streaming validation of UTF-8 data.
    ///
    /// It has additional restrictions imposed on how the input is passed in to allow
    /// validation with as little overhead as possible.
    ///
    /// To feed it data you need to call the [`Self::update_from_chunks()`] method which takes slices which
    /// have to be a multiple of 64 bytes long. The method will panic otherwise. There is
    /// no way to find out if the input so far was valid UTF-8 during the validation. Only when
    /// the validation is completed with the [`Self::finalize()`] method the result of the validation is
    /// returned.
    ///
    /// The `Self::finalize()` method can be fed the rest of the data. There is no restriction on the
    /// data passed to it.
    ///
    /// This implementation requires CPU SIMD features specified by the module it resides in.
    /// It is undefined behavior to use it if the required CPU features are not available which
    /// is why all trait methods are `unsafe`.
    pub trait ChunkedUtf8Validator {
        /// Creates a new validator.
        ///
        /// # Safety
        /// This implementation requires CPU SIMD features specified by the module it resides in.
        /// It is undefined behavior to call it if the required CPU features are not available.
        #[must_use]
        unsafe fn new() -> Self
        where
            Self: Sized;

        /// Updates the validator with `input`.
        ///
        /// # Panics
        /// If `input.len()` is not a multiple of 64.
        ///
        /// # Safety
        /// This implementation requires CPU SIMD features specified by the module it resides in.
        /// It is undefined behavior to call it if the required CPU features are not available.
        unsafe fn update_from_chunks(&mut self, input: &[u8]);

        /// Updates the validator with remaining input if any. There is no restriction on the
        /// data provided.
        ///
        /// Finishes the validation and returns `Ok(())` if the input was valid UTF-8.
        ///
        /// # Errors
        /// A [`basic::Utf8Error`] is returned if the input was not valid UTF-8. No
        /// further information about the location of the error is provided.
        ///
        /// # Safety
        /// This implementation requires CPU SIMD features specified by the module it resides in.
        /// It is undefined behavior to call it if the required CPU features are not available.
        unsafe fn finalize(
            self,
            remaining_input: core::option::Option<&[u8]>,
        ) -> core::result::Result<(), basic::Utf8Error>;
    }

    /// Includes the x86/x86-64 SIMD implementations.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub mod x86 {
        /// Includes the validation implementation for AVX 2-compatible CPUs.
        ///
        /// Using the provided functionality on CPUs which do not support AVX 2 is undefined
        /// behavior and will very likely cause a crash.
        pub mod avx2 {
            pub use crate::implementation::x86::avx2::validate_utf8_basic as validate_utf8;
            pub use crate::implementation::x86::avx2::ChunkedUtf8ValidatorImp;
            pub use crate::implementation::x86::avx2::Utf8ValidatorImp;
        }
        /// Includes the validation implementation for SSE 4.2-compatible CPUs.
        ///
        /// Using the provided functionality on CPUs which do not support SSE 4.2 is undefined
        /// behavior and will very likely cause a crash.
        pub mod sse42 {
            pub use crate::implementation::x86::sse42::validate_utf8_basic as validate_utf8;
            pub use crate::implementation::x86::sse42::ChunkedUtf8ValidatorImp;
            pub use crate::implementation::x86::sse42::Utf8ValidatorImp;
        }
    }

    /// Includes the aarch64 SIMD implementations.
    #[cfg(all(feature = "aarch64_neon", target_arch = "aarch64"))]
    pub mod aarch64 {
        /// Includes the Neon-based validation implementation for aarch64 CPUs.
        ///
        /// Should be supported on all ARM64 CPUs. If it is not supported by the operating
        /// system using it is undefined behavior and will likely cause a crash.
        pub mod neon {
            pub use crate::implementation::aarch64::neon::validate_utf8_basic as validate_utf8;
            pub use crate::implementation::aarch64::neon::ChunkedUtf8ValidatorImp;
            pub use crate::implementation::aarch64::neon::Utf8ValidatorImp;
        }
    }

    /// Includes the wasm32 SIMD implementations.
    #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
    pub mod wasm32 {
        /// Includes the simd128-based validation implementation for WASM runtimes.
        ///
        /// Using the provided functionality on WASM runtimes that do not support SIMD
        /// instructions will likely cause a crash.
        pub mod simd128 {
            pub use crate::implementation::wasm32::simd128::validate_utf8_basic as validate_utf8;
            pub use crate::implementation::wasm32::simd128::ChunkedUtf8ValidatorImp;
            pub use crate::implementation::wasm32::simd128::Utf8ValidatorImp;
        }
    }
}