spinoso_string/enc/utf8/
owned.rs

1use alloc::collections::TryReserveError;
2use alloc::vec::Vec;
3
4use scolapasta_strbuf::Buf;
5
6use super::Utf8Str;
7use crate::case_folding::CaseFoldingEffect;
8use crate::chars::ConventionallyUtf8;
9use crate::codepoints::InvalidCodepointError;
10use crate::enc::utf8::case_change;
11use crate::iter::IntoIter;
12
13mod eq;
14mod impls;
15#[cfg(feature = "std")]
16mod io;
17
18#[repr(transparent)]
19#[derive(Hash, PartialEq, Eq, PartialOrd, Ord)]
20pub struct Utf8String {
21    inner: Buf,
22}
23
24// Constructors
25impl Utf8String {
26    #[inline]
27    pub const fn new(buf: Buf) -> Self {
28        Self { inner: buf }
29    }
30
31    #[inline]
32    pub fn empty() -> Self {
33        Self { inner: Buf::new() }
34    }
35}
36
37// Raw
38impl Utf8String {
39    #[inline]
40    #[must_use]
41    pub(crate) fn into_buf(self) -> Buf {
42        self.inner
43    }
44
45    #[inline]
46    #[must_use]
47    pub fn as_utf8_str(&self) -> &Utf8Str {
48        Utf8Str::from_bytes(self.inner.as_slice())
49    }
50
51    #[inline]
52    #[must_use]
53    pub fn as_mut_utf8_str(&mut self) -> &mut Utf8Str {
54        Utf8Str::from_bytes_mut(self.inner.as_mut_slice())
55    }
56}
57
58// Core Iterators
59impl Utf8String {
60    #[inline]
61    #[must_use]
62    pub fn into_iter(self) -> IntoIter {
63        IntoIter::from_vec(self.inner.into_inner())
64    }
65}
66
67// Size and Capacity
68impl Utf8String {
69    #[inline]
70    pub unsafe fn set_len(&mut self, len: usize) {
71        // SAFETY: The caller must uphold the documented safety contract, which
72        // is the same as the owned UTF-8 str's inner buffer.
73        unsafe {
74            self.inner.set_len(len);
75        }
76    }
77
78    #[inline]
79    #[must_use]
80    pub fn capacity(&self) -> usize {
81        self.inner.capacity()
82    }
83
84    #[inline]
85    pub fn clear(&mut self) {
86        self.inner.clear();
87    }
88
89    #[inline]
90    pub fn truncate(&mut self, len: usize) {
91        self.inner.truncate(len);
92    }
93}
94
95// Memory management
96impl Utf8String {
97    #[inline]
98    pub fn reserve(&mut self, additional: usize) {
99        self.inner.reserve(additional);
100    }
101
102    #[inline]
103    pub fn try_reserve(&mut self, additional: usize) -> Result<(), TryReserveError> {
104        self.inner.try_reserve(additional)
105    }
106
107    #[inline]
108    pub fn reserve_exact(&mut self, additional: usize) {
109        self.inner.reserve_exact(additional);
110    }
111
112    #[inline]
113    pub fn try_reserve_exact(&mut self, additional: usize) -> Result<(), TryReserveError> {
114        self.inner.try_reserve_exact(additional)
115    }
116
117    #[inline]
118    pub fn shrink_to_fit(&mut self) {
119        self.inner.shrink_to_fit();
120    }
121
122    #[inline]
123    pub fn shrink_to(&mut self, min_capacity: usize) {
124        self.inner.shrink_to(min_capacity);
125    }
126}
127
128// Pushing and popping bytes, codepoints, and strings.
129impl Utf8String {
130    #[inline]
131    pub fn push_byte(&mut self, byte: u8) {
132        self.inner.push_byte(byte);
133    }
134
135    #[inline]
136    pub fn try_push_codepoint(&mut self, codepoint: i64) -> Result<(), InvalidCodepointError> {
137        let codepoint = if let Ok(codepoint) = u32::try_from(codepoint) {
138            codepoint
139        } else {
140            return Err(InvalidCodepointError::codepoint_out_of_range(codepoint));
141        };
142        if let Ok(ch) = char::try_from(codepoint) {
143            self.push_char(ch);
144            Ok(())
145        } else {
146            Err(InvalidCodepointError::invalid_utf8_codepoint(codepoint))
147        }
148    }
149
150    #[inline]
151    pub fn try_push_int(&mut self, int: i64) -> Result<(), InvalidCodepointError> {
152        self.try_push_codepoint(int)
153    }
154
155    #[inline]
156    pub fn push_char(&mut self, ch: char) {
157        self.inner.push_char(ch);
158    }
159
160    #[inline]
161    pub fn push_str(&mut self, s: &str) {
162        self.inner.push_str(s);
163    }
164
165    #[inline]
166    pub fn extend_from_slice(&mut self, other: &[u8]) {
167        self.inner.extend_from_slice(other);
168    }
169}
170
171// Casing
172//
173// TODO: Use `roe` for case changing operations. UTF-8 case changing needs to be
174// parameterized on the case folding strategy to account for e.g. Turkic or
175// ASCII-only modes.
176//
177// https://github.com/artichoke/artichoke/issues/1723
178impl Utf8String {
179    #[inline]
180    pub fn make_capitalized(&mut self) -> CaseFoldingEffect {
181        let (replacement, effect) = case_change::to_utf8_capitalized(self.as_bytes());
182        // Replace the old buffer with the new swapped buffer
183        self.inner = replacement.into();
184        effect
185    }
186
187    #[inline]
188    pub fn make_lowercase(&mut self) -> CaseFoldingEffect {
189        let (replacement, effect) = case_change::to_utf8_lowercase(self.as_bytes());
190        // Replace the old buffer with the new swapped buffer
191        self.inner = replacement.into();
192        effect
193    }
194
195    #[inline]
196    pub fn make_uppercase(&mut self) -> CaseFoldingEffect {
197        let (replacement, effect) = case_change::to_utf8_uppercase(self.as_bytes());
198        // Replace the old buffer with the new swapped buffer
199        self.inner = replacement.into();
200        effect
201    }
202
203    #[inline]
204    pub fn make_swapcase(&mut self) -> CaseFoldingEffect {
205        let (replacement, effect) = case_change::to_utf8_swapcase(self.as_bytes());
206        // Replace the old buffer with the new swapped buffer
207        self.inner = replacement.into();
208        effect
209    }
210}
211
212// Reversing
213impl Utf8String {
214    #[inline]
215    pub fn reverse(&mut self) {
216        // Fast path when all characters are one byte wide.
217        if self.is_ascii_only() {
218            self.inner.reverse();
219            return;
220        }
221        // FIXME: this allocation can go away if `ConventionallyUtf8` impls
222        // `DoubleEndedIterator`.
223        let chars = ConventionallyUtf8::from(&self.inner[..]).collect::<Vec<_>>();
224        // Use a `Vec` here instead of a `Buf` to ensure at most one alloc
225        // fix-up happens instead of alloc fix-ups being O(chars).
226        let mut replacement = Vec::with_capacity(self.inner.len());
227        for &bytes in chars.iter().rev() {
228            replacement.extend_from_slice(bytes);
229        }
230        self.inner = replacement.into();
231    }
232}
233
#[cfg(test)]
mod tests {
    use bstr::ByteSlice;

    use super::Utf8String;

    /// Pure ASCII input takes the byte-wise fast path.
    #[test]
    fn reverse_ascii() {
        let mut string = Utf8String::from("1234");
        string.reverse();
        assert_eq!(string, "4321");
    }

    /// Invalid bytes after ASCII content are reversed one byte at a time.
    #[test]
    fn reverse_ascii_with_invalid_utf8() {
        let mut string = Utf8String::from(b"1234\xFF\xFE");
        string.reverse();

        let expected = b"\xFE\xFF4321".as_bstr();
        assert_eq!(string, expected);
    }

    /// Multibyte characters keep their internal byte order when reversed.
    #[test]
    fn reverse_multibyte() {
        // ```console
        // [3.2.2] > "怎么样".reverse
        // => "样么怎"
        // ```
        let mut string = Utf8String::from("怎么样");
        string.reverse();
        assert_eq!(string, "样么怎");
    }

    /// Invalid trailing bytes move to the front, each as its own character.
    #[test]
    fn reverse_multibyte_with_invalid_utf8() {
        // ```console
        // [3.2.2] > "怎么样\xFF\xFE".reverse
        // => "\xFE\xFF样么怎"
        // ```
        let mut string = Utf8String::from("怎么样");
        string.extend_from_slice(b"\xFF\xFE");
        string.reverse();

        let expected = [&b"\xFE\xFF"[..], "样么怎".as_bytes()].concat();
        assert_eq!(string, expected.as_bstr());
    }

    /// A truncated 4-byte sequence is reversed byte by byte, while a complete
    /// replacement character stays intact.
    #[test]
    fn reverse_replacement_char_with_invalid_utf8_prefix() {
        // The Unicode replacement char has the following byte contents:
        //
        // ```console
        // [3.2.2] > puts "�".b.inspect
        // "\xEF\xBF\xBD"
        // ```
        //
        // `\xF0\x9F\x87` is a valid UTF-8 prefix for a 4 byte sequence but is
        // not itself a valid byte sequence. We expect these 3 bytes to be
        // treated as 3 characters.
        let mut string = Utf8String::from(b"abc\xF0\x9F\x87def\xEF\xBF\xBD");
        string.reverse();

        let expected = b"\xEF\xBF\xBDfed\x87\x9F\xF0cba".as_bstr();
        assert_eq!(string, expected);
    }
}