simdutf8/compat.rs
//! The `compat` API flavor provides full compatibility with [`std::str::from_utf8()`] and detailed validation errors.
//!
//! In particular, [`from_utf8()`]
//! returns a [`Utf8Error`], which has the [`valid_up_to()`](Utf8Error#method.valid_up_to) and
//! [`error_len()`](Utf8Error#method.error_len) methods. The first is useful for verification of streamed data. The
//! second is useful e.g. for replacing invalid byte sequences with a replacement character.
//!
//! The functions in this module also fail early: errors are checked on-the-fly as the string is processed, and once
//! an invalid UTF-8 sequence is encountered, they return without processing the rest of the data.
//! This comes at a slight performance penalty compared to the [`crate::basic`] module if the input is valid UTF-8.
11
12use core::fmt::Display;
13use core::fmt::Formatter;
14
15use core::str::{from_utf8_unchecked, from_utf8_unchecked_mut};
16
17use crate::implementation::validate_utf8_compat;
18
/// UTF-8 error information compatible with [`std::str::Utf8Error`].
///
/// Contains information on the location of the encountered validation error and the length of the
/// invalid UTF-8 sequence.
#[derive(Copy, Eq, PartialEq, Clone, Debug)]
pub struct Utf8Error {
    /// Byte index handed out unchanged by `Utf8Error::valid_up_to()`.
    pub(crate) valid_up_to: usize,
    /// Length in bytes of the invalid sequence, handed out (widened to `usize`) by
    /// `Utf8Error::error_len()`. `None` is rendered by the `Display` impl as an
    /// "incomplete utf-8 byte sequence".
    pub(crate) error_len: Option<u8>,
}
28
29impl Utf8Error {
30 /// Analogue to [`std::str::Utf8Error::valid_up_to()`](std::str::Utf8Error#method.valid_up_to).
31 ///
32 /// ...
33 #[inline]
34 #[must_use]
35 pub fn valid_up_to(&self) -> usize {
36 self.valid_up_to
37 }
38
39 /// Analogue to [`std::str::Utf8Error::error_len()`](std::str::Utf8Error#method.error_len).
40 ///
41 /// ...
42 #[inline]
43 #[must_use]
44 pub fn error_len(&self) -> Option<usize> {
45 self.error_len.map(|len| len as usize)
46 }
47}
48
49impl Display for Utf8Error {
50 fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result {
51 if let Some(error_len) = self.error_len {
52 write!(
53 f,
54 "invalid utf-8 sequence of {} bytes from index {}",
55 error_len, self.valid_up_to
56 )
57 } else {
58 write!(
59 f,
60 "incomplete utf-8 byte sequence from index {}",
61 self.valid_up_to
62 )
63 }
64 }
65}
66
// `std::error::Error` is only implemented when the `std` feature is enabled; the impl
// relies entirely on the trait's default methods.
#[cfg(feature = "std")]
impl std::error::Error for Utf8Error {}
69
70/// Analogue to [`std::str::from_utf8()`].
71///
72/// Checks if the passed byte sequence is valid UTF-8 and returns an
73/// [`std::str`] reference to the passed byte slice wrapped in `Ok()` if it is.
74///
75/// # Errors
76/// Will return Err([`Utf8Error`]) on if the input contains invalid UTF-8 with
77/// detailed error information.
78#[inline]
79pub fn from_utf8(input: &[u8]) -> Result<&str, Utf8Error> {
80 unsafe {
81 validate_utf8_compat(input)?;
82 Ok(from_utf8_unchecked(input))
83 }
84}
85
86/// Analogue to [`std::str::from_utf8_mut()`].
87///
88/// Checks if the passed mutable byte sequence is valid UTF-8 and returns a mutable
89/// [`std::str`] reference to the passed byte slice wrapped in `Ok()` if it is.
90///
91/// # Errors
92/// Will return Err([`Utf8Error`]) on if the input contains invalid UTF-8 with
93/// detailed error information.
94#[inline]
95pub fn from_utf8_mut(input: &mut [u8]) -> Result<&mut str, Utf8Error> {
96 unsafe {
97 validate_utf8_compat(input)?;
98 Ok(from_utf8_unchecked_mut(input))
99 }
100}
101
/// Allows direct access to the platform-specific unsafe validation implementations.
///
/// Only compiled when the `public_imp` feature is enabled. Each leaf module re-exports
/// the `validate_utf8_compat` function of the matching implementation under the name
/// `validate_utf8`.
#[cfg(feature = "public_imp")]
pub mod imp {
    /// Includes the x86/x86-64 SIMD implementations.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub mod x86 {
        /// Includes the validation implementation for AVX 2-compatible CPUs.
        pub mod avx2 {
            pub use crate::implementation::x86::avx2::validate_utf8_compat as validate_utf8;
        }
        /// Includes the validation implementation for SSE 4.2-compatible CPUs.
        pub mod sse42 {
            pub use crate::implementation::x86::sse42::validate_utf8_compat as validate_utf8;
        }
    }

    /// Includes the aarch64 SIMD implementations.
    ///
    /// Requires the `aarch64_neon` feature in addition to an aarch64 target.
    #[cfg(all(feature = "aarch64_neon", target_arch = "aarch64"))]
    pub mod aarch64 {
        /// Includes the validation implementation for Neon SIMD.
        pub mod neon {
            pub use crate::implementation::aarch64::neon::validate_utf8_compat as validate_utf8;
        }
    }

    /// Includes the wasm32 SIMD implementations.
    ///
    /// Requires a wasm32 target built with the `simd128` target feature.
    #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
    pub mod wasm32 {
        /// Includes the validation implementation for WASM simd128.
        pub mod simd128 {
            pub use crate::implementation::wasm32::simd128::validate_utf8_compat as validate_utf8;
        }
    }
}
135}