simdutf8/compat.rs
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136
//! The `compat` API flavor provides full compatibility with [`std::str::from_utf8()`] and detailed validation errors.
//!
//! In particular, [`from_utf8()`]
//! returns an [`Utf8Error`], which has the [`valid_up_to()`](Utf8Error#method.valid_up_to) and
//! [`error_len()`](Utf8Error#method.error_len) methods. The first is useful for verification of streamed data. The
//! second is useful e.g. for replacing invalid byte sequences with a replacement character.
//!
//! The functions in this module also fail early: errors are checked on-the-fly as the string is processed and once
//! an invalid UTF-8 sequence is encountered, they return without processing the rest of the data.
//! This comes at a slight performance penalty compared to the [`crate::basic`] module if the input is valid UTF-8.
use core::fmt::Display;
use core::fmt::Formatter;
use core::str::{from_utf8_unchecked, from_utf8_unchecked_mut};
use crate::implementation::validate_utf8_compat;
/// Validation error that is API-compatible with [`std::str::Utf8Error`].
///
/// Stores the position at which validation failed together with the length of
/// the invalid UTF-8 sequence found there, if that length is known.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Utf8Error {
    /// Index returned by `valid_up_to()`.
    pub(crate) valid_up_to: usize,
    /// Length returned (widened to `usize`) by `error_len()`. `None` is rendered by
    /// the `Display` impl as an "incomplete" sequence.
    pub(crate) error_len: Option<u8>,
}
impl Utf8Error {
    /// Analogue to [`std::str::Utf8Error::valid_up_to()`](std::str::Utf8Error#method.valid_up_to).
    ///
    /// Returns the `valid_up_to` index recorded in this error, unchanged. As the
    /// module documentation notes, this is useful for verification of streamed data.
    #[must_use]
    #[inline]
    #[allow(clippy::missing_const_for_fn)] // would not provide any benefit
    pub fn valid_up_to(&self) -> usize {
        self.valid_up_to
    }

    /// Analogue to [`std::str::Utf8Error::error_len()`](std::str::Utf8Error#method.error_len).
    ///
    /// Returns the recorded length of the invalid sequence widened to `usize`, or
    /// `None` when no length was recorded (the `Display` impl describes that case
    /// as an incomplete byte sequence). As the module documentation notes, this is
    /// useful e.g. for replacing invalid byte sequences with a replacement character.
    #[must_use]
    #[inline]
    pub fn error_len(&self) -> Option<usize> {
        // `u8 -> usize` is lossless, so the infallible `From` conversion is used.
        self.error_len.map(usize::from)
    }
}
/// Human-readable rendering of the error.
///
/// Two message shapes exist: one for an invalid sequence of known length and one
/// for an incomplete sequence (no length recorded).
impl Display for Utf8Error {
    fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result {
        let Self {
            valid_up_to,
            error_len,
        } = *self;
        match error_len {
            Some(len) => write!(
                f,
                "invalid utf-8 sequence of {} bytes from index {}",
                len, valid_up_to
            ),
            None => write!(
                f,
                "incomplete utf-8 byte sequence from index {}",
                valid_up_to
            ),
        }
    }
}
// With the `std` feature enabled, `Utf8Error` also implements `std::error::Error`.
// The impl body is empty, so only the trait's default methods are provided; the
// message text comes from the `Display` impl of `Utf8Error`.
#[cfg(feature = "std")]
impl std::error::Error for Utf8Error {}
/// Analogue to [`std::str::from_utf8()`].
///
/// Checks if the passed byte sequence is valid UTF-8 and returns an
/// [`std::str`] reference to the passed byte slice wrapped in `Ok()` if it is.
///
/// # Errors
/// Will return Err([`Utf8Error`]) if the input contains invalid UTF-8, with
/// detailed error information.
#[inline]
pub fn from_utf8(input: &[u8]) -> Result<&str, Utf8Error> {
    // SAFETY: `from_utf8_unchecked` is only reached after `validate_utf8_compat`
    // returned `Ok` for the very same `input`; the `?` returns early otherwise.
    // NOTE(review): `validate_utf8_compat` is called inside the `unsafe` block; its
    // own contract lives in `crate::implementation` and is not visible here — confirm.
    unsafe {
        validate_utf8_compat(input)?;
        Ok(from_utf8_unchecked(input))
    }
}
/// Analogue to [`std::str::from_utf8_mut()`].
///
/// Validates the passed mutable byte slice as UTF-8. On success the very same
/// slice is handed back as a mutable [`std::str`] reference wrapped in `Ok()`.
///
/// # Errors
/// Will return Err([`Utf8Error`]) if the input contains invalid UTF-8, with
/// detailed error information.
#[inline]
pub fn from_utf8_mut(input: &mut [u8]) -> Result<&mut str, Utf8Error> {
    // SAFETY: `from_utf8_unchecked_mut` is only reached in the `Ok` arm, i.e. after
    // `validate_utf8_compat` accepted the very same `input`.
    // NOTE(review): `validate_utf8_compat` is called inside the `unsafe` block; its
    // own contract lives in `crate::implementation` and is not visible here — confirm.
    unsafe {
        match validate_utf8_compat(input) {
            Ok(()) => Ok(from_utf8_unchecked_mut(input)),
            Err(err) => Err(err),
        }
    }
}
/// Allows direct access to the platform-specific unsafe validation implementations.
///
/// Only compiled when the `public_imp` feature is enabled. Every leaf module
/// re-exports its platform's `validate_utf8_compat` function under the uniform
/// name `validate_utf8`. The architecture submodules are additionally gated by
/// `cfg`, so only those matching the compilation target exist.
#[cfg(feature = "public_imp")]
pub mod imp {
    /// Includes the x86/x86-64 SIMD implementations.
    // A single predicate needs no `all(...)` wrapper (clippy `non_minimal_cfg`).
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub mod x86 {
        /// Includes the validation implementation for AVX 2-compatible CPUs.
        pub mod avx2 {
            pub use crate::implementation::x86::avx2::validate_utf8_compat as validate_utf8;
        }
        /// Includes the validation implementation for SSE 4.2-compatible CPUs.
        pub mod sse42 {
            pub use crate::implementation::x86::sse42::validate_utf8_compat as validate_utf8;
        }
    }

    /// Includes the aarch64 SIMD implementations.
    ///
    /// Requires the `aarch64_neon` feature in addition to an aarch64 target.
    #[cfg(all(feature = "aarch64_neon", target_arch = "aarch64"))]
    pub mod aarch64 {
        /// Includes the validation implementation for Neon SIMD.
        pub mod neon {
            pub use crate::implementation::aarch64::neon::validate_utf8_compat as validate_utf8;
        }
    }

    /// Includes the wasm32 SIMD implementations.
    ///
    /// Requires the `simd128` target feature to be enabled at compile time.
    #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
    pub mod wasm32 {
        /// Includes the validation implementation for WASM simd128.
        pub mod simd128 {
            pub use crate::implementation::wasm32::simd128::validate_utf8_compat as validate_utf8;
        }
    }
}