simdutf8/
compat.rs

1//! The `compat` API flavor provides full compatibility with [`std::str::from_utf8()`] and detailed validation errors.
2//!
//! In particular, [`from_utf8()`]
//! returns a [`Utf8Error`], which has the [`valid_up_to()`](Utf8Error#method.valid_up_to) and
5//! [`error_len()`](Utf8Error#method.error_len) methods. The first is useful for verification of streamed data. The
6//! second is useful e.g. for replacing invalid byte sequences with a replacement character.
7//!
8//! The functions in this module also fail early: errors are checked on-the-fly as the string is processed and once
9//! an invalid UTF-8 sequence is encountered, it returns without processing the rest of the data.
10//! This comes at a slight performance penalty compared to the [`crate::basic`] module if the input is valid UTF-8.
11
12use core::fmt::Display;
13use core::fmt::Formatter;
14
15use core::str::{from_utf8_unchecked, from_utf8_unchecked_mut};
16
17use crate::implementation::validate_utf8_compat;
18
/// Validation error type designed as a drop-in counterpart of [`std::str::Utf8Error`].
///
/// It records where in the input the validation error was encountered and how long the
/// invalid UTF-8 sequence is.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Utf8Error {
    /// Index value handed out by the `valid_up_to()` accessor.
    pub(crate) valid_up_to: usize,
    /// Length in bytes of the invalid sequence, kept as a `u8`; `None` is reported as an
    /// incomplete sequence by the `Display` implementation.
    pub(crate) error_len: Option<u8>,
}
28
29impl Utf8Error {
30    /// Analogue to [`std::str::Utf8Error::valid_up_to()`](std::str::Utf8Error#method.valid_up_to).
31    ///
32    /// ...
33    #[inline]
34    #[must_use]
35    pub fn valid_up_to(&self) -> usize {
36        self.valid_up_to
37    }
38
39    /// Analogue to [`std::str::Utf8Error::error_len()`](std::str::Utf8Error#method.error_len).
40    ///
41    /// ...
42    #[inline]
43    #[must_use]
44    pub fn error_len(&self) -> Option<usize> {
45        self.error_len.map(|len| len as usize)
46    }
47}
48
49impl Display for Utf8Error {
50    fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result {
51        if let Some(error_len) = self.error_len {
52            write!(
53                f,
54                "invalid utf-8 sequence of {} bytes from index {}",
55                error_len, self.valid_up_to
56            )
57        } else {
58            write!(
59                f,
60                "incomplete utf-8 byte sequence from index {}",
61                self.valid_up_to
62            )
63        }
64    }
65}
66
#[cfg(feature = "std")]
impl std::error::Error for Utf8Error {
    // Intentionally empty: only compiled with the `std` feature, and every provided
    // method of the trait keeps its default implementation.
}
69
70/// Analogue to [`std::str::from_utf8()`].
71///
72/// Checks if the passed byte sequence is valid UTF-8 and returns an
73/// [`std::str`] reference to the passed byte slice wrapped in `Ok()` if it is.
74///
75/// # Errors
76/// Will return Err([`Utf8Error`]) on if the input contains invalid UTF-8 with
77/// detailed error information.
78#[inline]
79pub fn from_utf8(input: &[u8]) -> Result<&str, Utf8Error> {
80    unsafe {
81        validate_utf8_compat(input)?;
82        Ok(from_utf8_unchecked(input))
83    }
84}
85
86/// Analogue to [`std::str::from_utf8_mut()`].
87///
88/// Checks if the passed mutable byte sequence is valid UTF-8 and returns a mutable
89/// [`std::str`] reference to the passed byte slice wrapped in `Ok()` if it is.
90///
91/// # Errors
92/// Will return Err([`Utf8Error`]) on if the input contains invalid UTF-8 with
93/// detailed error information.
94#[inline]
95pub fn from_utf8_mut(input: &mut [u8]) -> Result<&mut str, Utf8Error> {
96    unsafe {
97        validate_utf8_compat(input)?;
98        Ok(from_utf8_unchecked_mut(input))
99    }
100}
101
/// Direct access to the platform-specific unsafe validation implementations.
///
/// Only compiled when the `public_imp` feature is enabled. Each leaf module re-exports the
/// `validate_utf8_compat` function of the matching implementation under the name
/// `validate_utf8`.
#[cfg(feature = "public_imp")]
pub mod imp {
    /// SIMD implementations for x86 and x86-64 targets.
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    pub mod x86 {
        /// Validation implementation for AVX 2-compatible CPUs.
        pub mod avx2 {
            pub use crate::implementation::x86::avx2::validate_utf8_compat as validate_utf8;
        }

        /// Validation implementation for SSE 4.2-compatible CPUs.
        pub mod sse42 {
            pub use crate::implementation::x86::sse42::validate_utf8_compat as validate_utf8;
        }
    }

    /// SIMD implementations for aarch64 targets (requires the `aarch64_neon` feature).
    #[cfg(all(target_arch = "aarch64", feature = "aarch64_neon"))]
    pub mod aarch64 {
        /// Validation implementation based on Neon SIMD.
        pub mod neon {
            pub use crate::implementation::aarch64::neon::validate_utf8_compat as validate_utf8;
        }
    }

    /// SIMD implementations for wasm32 targets built with the `simd128` target feature.
    #[cfg(all(target_feature = "simd128", target_arch = "wasm32"))]
    pub mod wasm32 {
        /// Validation implementation based on WASM simd128.
        pub mod simd128 {
            pub use crate::implementation::wasm32::simd128::validate_utf8_compat as validate_utf8;
        }
    }
}