spinoso_string/enc/utf8/
borrowed.rs

1use alloc::boxed::Box;
2use core::ops::Range;
3use core::slice::SliceIndex;
4
5use bstr::ByteSlice;
6
7use crate::iter::{Bytes, Iter, IterMut};
8use crate::ord::OrdError;
9
10mod codepoints;
11mod eq;
12mod impls;
13
14pub use codepoints::Codepoints;
15
/// A borrowed, dynamically sized byte string that is conventionally, but not
/// necessarily, UTF-8 encoded.
///
/// `Utf8Str` is a `repr(transparent)` wrapper around `[u8]`. The constructors
/// in this module rely on that layout to reinterpret `&[u8]`, `&mut [u8]` and
/// `Box<[u8]>` as the corresponding `Utf8Str` pointer types without copying.
#[repr(transparent)]
pub struct Utf8Str {
    // The raw bytes of the string. Construction performs no validation, so
    // these bytes may contain invalid UTF-8 sequences.
    bytes: [u8],
}
20
21impl Utf8Str {
22    #[inline]
23    #[must_use]
24    pub fn new<B: ?Sized + AsRef<[u8]>>(bytes: &B) -> &Utf8Str {
25        Utf8Str::from_bytes(bytes.as_ref())
26    }
27
28    #[inline]
29    #[must_use]
30    pub(crate) fn new_mut<B: ?Sized + AsMut<[u8]>>(bytes: &mut B) -> &mut Utf8Str {
31        Utf8Str::from_bytes_mut(bytes.as_mut())
32    }
33
34    #[inline]
35    #[must_use]
36    pub const fn empty() -> &'static Utf8Str {
37        Utf8Str::from_bytes(b"")
38    }
39
40    #[inline]
41    #[must_use]
42    pub const fn from_bytes(slice: &[u8]) -> &Utf8Str {
43        // SAFETY: `Utf8Str` is a `repr(transparent)` wrapper around `[u8]`.
44        unsafe {
45            let ptr: *const [u8] = slice;
46            let ptr = ptr as *const Utf8Str;
47            &*ptr
48        }
49    }
50
51    #[inline]
52    #[must_use]
53    pub fn from_bytes_mut(slice: &mut [u8]) -> &mut Utf8Str {
54        // SAFETY: `Utf8Str` is a `repr(transparent)` wrapper around `[u8]`.
55        unsafe {
56            let ptr: *mut [u8] = slice;
57            let ptr = ptr as *mut Utf8Str;
58            &mut *ptr
59        }
60    }
61
62    #[inline]
63    pub fn from_boxed_bytes(slice: Box<[u8]>) -> Box<Utf8Str> {
64        // SAFETY: `Utf8Str` is a `repr(transparent)` wrapper around `[u8]`.
65        unsafe { Box::from_raw(Box::into_raw(slice) as _) }
66    }
67
68    #[inline]
69    pub fn into_boxed_bytes(slice: Box<Utf8Str>) -> Box<[u8]> {
70        // SAFETY: `Utf8Str` is a `repr(transparent)` wrapper around `[u8]`.
71        unsafe { Box::from_raw(Box::into_raw(slice) as _) }
72    }
73
74    #[inline]
75    #[must_use]
76    pub const fn as_bytes(&self) -> &[u8] {
77        &self.bytes
78    }
79
80    #[inline]
81    #[must_use]
82    pub fn as_bytes_mut(&mut self) -> &mut [u8] {
83        &mut self.bytes
84    }
85}
86
87// Raw
88impl Utf8Str {
89    #[inline]
90    #[must_use]
91    pub fn as_ptr(&self) -> *const u8 {
92        self.as_bytes().as_ptr()
93    }
94
95    #[inline]
96    #[must_use]
97    pub fn as_mut_ptr(&mut self) -> *mut u8 {
98        self.as_bytes_mut().as_mut_ptr()
99    }
100}
101
102// Core Iterators
103impl Utf8Str {
104    #[inline]
105    #[must_use]
106    pub fn iter(&self) -> Iter<'_> {
107        Iter::from_slice(self.as_bytes())
108    }
109
110    #[inline]
111    #[must_use]
112    pub fn iter_mut(&mut self) -> IterMut<'_> {
113        IterMut::from_mut_slice(self.as_bytes_mut())
114    }
115
116    #[inline]
117    #[must_use]
118    pub fn bytes(&self) -> Bytes<'_> {
119        Bytes::from_slice(self.as_bytes())
120    }
121}
122
123// Size and Capacity
124impl Utf8Str {
125    #[inline]
126    #[must_use]
127    pub fn len(&self) -> usize {
128        self.as_bytes().len()
129    }
130
131    #[inline]
132    #[must_use]
133    pub fn is_empty(&self) -> bool {
134        self.as_bytes().is_empty()
135    }
136}
137
138// Character-oriented APIs
139impl Utf8Str {
140    #[must_use]
141    pub fn chr(&self) -> &Utf8Str {
142        let slice = self.as_bytes();
143        let prefix = match bstr::decode_utf8(slice) {
144            (Some(_), size) => size,
145            (None, 0) => return Utf8Str::empty(),
146            (None, _) => 1,
147        };
148        // SAFETY: the UTF-8 decode above guarantees the prefix length is a
149        // valid slice index.
150        let s = unsafe { self.get_unchecked(..prefix) };
151        Utf8Str::from_bytes(s)
152    }
153
154    pub fn ord(&self) -> Result<u32, OrdError> {
155        let (ch, size) = bstr::decode_utf8(self.as_bytes());
156        match ch {
157            // All `char`s are valid `u32`s
158            Some(ch) => Ok(u32::from(ch)),
159            None if size == 0 => Err(OrdError::empty_string()),
160            None => Err(OrdError::invalid_utf8_byte_sequence()),
161        }
162    }
163
164    #[must_use]
165    pub fn char_len(&self) -> usize {
166        let bytes = self.as_bytes();
167
168        let tail = if let Some(idx) = bytes.find_non_ascii_byte() {
169            idx
170        } else {
171            // The entire string is ASCII bytes, so fast-path return the slice
172            // length.
173            return bytes.len();
174        };
175
176        // SAFETY: `ByteSlice::find_non_ascii_byte` guarantees that the index is
177        // in range for slicing if `Some(_)` is returned.
178        let bytes = unsafe { bytes.get_unchecked(tail..) };
179
180        // if the tail is valid UTF-8, use a fast path by delegating to SIMD
181        // `bytecount` crate.
182        if simdutf8::basic::from_utf8(bytes).is_ok() {
183            return tail + bytecount::num_chars(bytes);
184        }
185
186        // Else fallback to decoding UTF-8 in chunks using `bstr`.
187        let mut char_len = tail;
188        for chunk in bytes.utf8_chunks() {
189            char_len += bytecount::num_chars(chunk.valid().as_bytes());
190            char_len += chunk.invalid().len();
191        }
192        char_len
193    }
194
195    #[must_use]
196    pub fn get_char(&self, index: usize) -> Option<&Utf8Str> {
197        // Fast path rejection for indexes beyond bytesize, which is cheap to
198        // retrieve.
199        if index >= self.len() {
200            return None;
201        }
202
203        let slice = self.as_bytes();
204        // Fast path for trying to treat the conventionally UTF-8 string as
205        // entirely ASCII.
206        //
207        // If the string is either all ASCII or all ASCII for a prefix of the
208        // string that contains the range we wish to slice, use byte slicing
209        // like `AsciiStr` and `BinaryStr` do.
210        let consumed = match slice.find_non_ascii_byte() {
211            // The string is entirely ASCII, so we can always use byte slicing
212            // to mean char slicing.
213            None => {
214                let s = slice.get(index..=index)?;
215                return Some(Utf8Str::from_bytes(s));
216            }
217            // The first non-ASCII character occurs beyond the index we wish to
218            // retrieve, so we can use byte slicing to mean char slicing.
219            Some(idx) if idx > index => {
220                let s = slice.get(index..=index)?;
221                return Some(Utf8Str::from_bytes(s));
222            }
223            // The first `idx` characters of the `Utf8Str` end at the `idx` byte
224            // position.
225            Some(idx) => idx,
226        };
227
228        // Discard the ASCII prefix and begin a forward search with a character-
229        // at-a-time decode.
230        //
231        // SAFETY: `find_non_ascii_byte` guarantees that when `Some(idx)` is
232        // returned, `idx` is a valid position in the slice.
233        let mut slice = unsafe { slice.get_unchecked(consumed..) };
234        // Count of "characters" remaining until the `index`th character.
235        let mut remaining = index - consumed;
236
237        // This loop will terminate when either:
238        //
239        // - It counts `index` number of characters.
240        // - It consumes the entire slice when scanning for the `index`th
241        //   character.
242        //
243        // The loop will advance by at least one byte every iteration.
244        loop {
245            match bstr::decode_utf8(slice) {
246                // `decode_utf8` only returns a 0 size when the slice is empty.
247                //
248                // If we've run out of slice while trying to find the `index`th
249                // character, the lookup fails and we return `nil`.
250                (_, 0) => return None,
251
252                // The next two arms mean we've reached the `index`th character.
253                // Either return the next valid UTF-8 character byte slice or,
254                // if the next bytes are an invalid UTF-8 sequence, the next byte.
255                (Some(_), size) if remaining == 0 => {
256                    // SAFETY: `decode_utf8` guarantees that the number of bytes
257                    // returned on a successful decode can be used to slice into
258                    // the given slice.
259                    let s = unsafe { slice.get_unchecked(..size) };
260                    return Some(Utf8Str::from_bytes(s));
261                }
262                (None, _) if remaining == 0 => {
263                    // SAFETY: `decode_utf8` guarantees unsuccessful decodes
264                    // consume 0..=3 bytes and size is guaranteed to be non-zero
265                    // per the first match arm.
266                    let s = unsafe { slice.get_unchecked(..1) };
267                    return Some(Utf8Str::from_bytes(s));
268                }
269
270                // We found a single UTF-8 encoded character keep track of the
271                // count and advance the substring to continue decoding.
272                (Some(_), size) => {
273                    // SAFETY: `decode_utf8` guarantees that at least `size`
274                    // bytes exist in the slice.
275                    slice = unsafe { slice.get_unchecked(size..) };
276                    remaining -= 1;
277                }
278
279                // The next two arms handle the case where we have encountered
280                // an invalid UTF-8 byte sequence.
281                //
282                // In this case, `decode_utf8` will return slices whose length
283                // is `1..=3`. The length of this slice is the number of
284                // "characters" we can advance the loop by.
285                //
286                // If the invalid UTF-8 sequence contains more bytes than we
287                // have remaining to get to the `index`th char, then the target
288                // character is inside the invalid UTF-8 sequence.
289                (None, size) if remaining < size => {
290                    // SAFETY: `decode_utf8` guarantees that at least `size`
291                    // bytes exist in the slice and we check that `remaining` is
292                    // less than `size`.
293                    let s = unsafe { slice.get_unchecked(remaining..=remaining) };
294                    return Some(Utf8Str::from_bytes(s));
295                }
296                // If there are more characters remaining than the number of
297                // bytes yielded in the invalid UTF-8 byte sequence, count
298                // `size` bytes and advance the slice to continue decoding.
299                (None, size) => {
300                    // SAFETY: `decode_utf8` guarantees that at least `size`
301                    // bytes exist in the slice.
302                    slice = unsafe { slice.get_unchecked(size..) };
303                    remaining -= size;
304                }
305            }
306        }
307    }
308
309    #[must_use]
310    pub fn get_char_slice(&self, range: Range<usize>) -> Option<&Utf8Str> {
311        let Range { start, end } = range;
312
313        // Fast path the lookup if the end of the range is before the start.
314        if end < start {
315            // Yes, these types of ranges are allowed and they return `""`.
316            //
317            // ```
318            // [3.0.1] > "aaa"[1..0]
319            // => ""
320            // [3.0.1] > "aaa"[2..0]
321            // => ""
322            // [3.0.1] > "aaa"[2..1]
323            // => ""
324            // [3.0.1] > "💎🦀😅"[2..1]
325            // => ""
326            // [3.0.1] > "💎🦀😅"[3..0]
327            // => ""
328            // ```
329            //
330            // but only if `start` is within the string.
331            //
332            // ```
333            // [3.0.1] > "aaa"[10..4]
334            // => nil
335            // [3.0.1] > "aaa"[10..0]
336            // => nil
337            // [3.0.1] > "💎🦀😅"[10..4]
338            // => nil
339            // [3.0.1] > "💎🦀😅"[10..0]
340            // => nil
341            // [3.0.1] > "💎🦀😅"[6..0]
342            // => nil
343            // [3.0.1] > "💎🦀😅"[4..0]
344            // => nil
345            // ```
346            //
347            // attempt to short-circuit with a cheap length retrieval
348            if start > self.len() || start > self.char_len() {
349                return None;
350            }
351            return Some(Utf8Str::empty());
352        }
353
354        // If the start of the range is beyond the character count of the
355        // string, the whole lookup must fail.
356        //
357        // Slice lookups where the start is just beyond the last character index
358        // always return an empty slice.
359        //
360        // ```
361        // [3.0.1] > "aaa"[10, 0]
362        // => nil
363        // [3.0.1] > "aaa"[10, 7]
364        // => nil
365        // [3.0.1] > "aaa"[3, 7]
366        // => ""
367        // [3.0.1] > "🦀💎"[2, 0]
368        // => ""
369        // [3.0.1] > "🦀💎"[3, 1]
370        // => nil
371        // [3.0.1] > "🦀💎"[2, 1]
372        // => ""
373        // ```
374        //
375        // Fast path rejection for indexes beyond bytesize, which is cheap to
376        // retrieve.
377        if start > self.len() {
378            return None;
379        }
380        match self.char_len() {
381            char_length if start > char_length => return None,
382            char_length if start == char_length => return Some(Utf8Str::empty()),
383            _ => {}
384        }
385
386        // The span is guaranteed to at least partially overlap now.
387        match end - start {
388            // Empty substrings are present in all strings, even empty ones.
389            //
390            // ```
391            // [3.0.1] > "aaa"[""]
392            // => ""
393            // [3.0.1] > ""[""]
394            // => ""
395            // [3.0.1] > ""[0, 0]
396            // => ""
397            // [3.0.1] > "aaa"[0, 0]
398            // => ""
399            // [3.0.1] > "aaa"[2, 0]
400            // => ""
401            // [3.0.1] > "🦀💎"[1, 0]
402            // => ""
403            // [3.0.1] > "🦀💎"[2, 0]
404            // => ""
405            // ```
406            0 => return Some(Utf8Str::empty()),
407            // Delegate to the specialized single char lookup, which allows the
408            // remainder of this routine to fall back to the general case of
409            // multi-character spans.
410            //
411            // ```
412            // [3.0.1] > "abc"[2, 1]
413            // => "c"
414            // [3.0.1] > "🦀💎"[1, 1]
415            // => "💎"
416            // ```
417            1 => return self.get_char(start),
418            _ => {}
419        }
420
421        let slice = self.as_bytes();
422
423        // Fast path for trying to treat the conventionally UTF-8 string
424        // as entirely ASCII.
425        //
426        // If the string is either all ASCII or all ASCII for the subset
427        // of the string we wish to slice, fallback to byte slicing as in
428        // the ASCII and binary fast path.
429        //
430        // Perform the same saturate-to-end slicing mechanism if `end`
431        // is beyond the character length of the string.
432        let consumed = match slice.find_non_ascii_byte() {
433            // The entire string is ASCII, so byte indexing <=> char
434            // indexing.
435            None => {
436                let s = slice.get(start..end).or_else(|| slice.get(start..))?;
437                return Some(Utf8Str::from_bytes(s));
438            }
439            // The whole substring we are interested in is ASCII, so
440            // byte indexing is still valid.
441            Some(non_ascii_byte_offset) if non_ascii_byte_offset > end => {
442                let s = self.get(start..end)?;
443                return Some(Utf8Str::from_bytes(s));
444            }
445            // We turn non-ASCII somewhere inside before the substring
446            // we're interested in, so consume that much.
447            Some(non_ascii_byte_offset) if non_ascii_byte_offset <= start => non_ascii_byte_offset,
448            // This means we turn non-ASCII somewhere inside the substring.
449            // Consume up to start.
450            Some(_) => start,
451        };
452
453        // Scan for the beginning of the slice
454        let mut slice = &slice[consumed..];
455        // Count of "characters" remaining until the `start`th character.
456        let mut remaining = start - consumed;
457
458        if remaining > 0 {
459            // This loop will terminate when either:
460            //
461            // - It counts `start` number of characters.
462            // - It consumes the entire slice when scanning for the
463            //   `start`th character.
464            //
465            // The loop will advance by at least one byte every iteration.
466            loop {
467                match bstr::decode_utf8(slice) {
468                    // If we've run out of slice while trying to find the
469                    // `start`th character, the lookup fails and we return `nil`.
470                    (_, 0) => return None,
471
472                    // We found a single UTF-8 encoded character. keep track
473                    // of the count and advance the substring to continue
474                    // decoding.
475                    //
476                    // If there's only one more to go, advance and stop the
477                    // loop.
478                    (Some(_), size) if remaining == 1 => {
479                        slice = &slice[size..];
480                        break;
481                    }
482                    // Otherwise, keep track of the character we observed and
483                    // advance the slice to continue decoding.
484                    (Some(_), size) => {
485                        slice = &slice[size..];
486                        remaining -= 1;
487                    }
488
489                    // The next two arms handle the case where we have
490                    // encountered an invalid UTF-8 byte sequence.
491                    //
492                    // In this case, `decode_utf8` will return slices whose
493                    // length is `1..=3`. The length of this slice is the
494                    // number of "characters" we can advance the loop by.
495                    //
496                    // If the invalid UTF-8 sequence contains more bytes
497                    // than we have remaining to get to the `start`th char,
498                    // then we can break the loop directly.
499                    (None, size) if remaining <= size => {
500                        slice = &slice[remaining..];
501                        break;
502                    }
503                    // If there are more characters remaining than the number
504                    // of bytes yielded in the invalid UTF-8 byte sequence,
505                    // count `size` bytes and advance the slice to continue
506                    // decoding.
507                    (None, size) => {
508                        slice = &slice[size..];
509                        remaining -= size;
510                    }
511                }
512            }
513        };
514
515        // Scan the slice for the span of characters we want to return.
516        remaining = end - start;
517        // We know `remaining` is not zero because we fast-pathed that
518        // case above.
519        debug_assert!(remaining > 0);
520
521        // keep track of the start of the substring from the `start`th
522        // character.
523        let substr = slice;
524
525        // This loop will terminate when either:
526        //
527        // - It counts the next `start - end` number of characters.
528        // - It consumes the entire slice when scanning for the `end`th
529        //   character.
530        //
531        // The loop will advance by at least one byte every iteration.
532        loop {
533            match bstr::decode_utf8(slice) {
534                // If we've run out of slice while trying to find the `end`th
535                // character, saturate the slice to the end of the string.
536                (_, 0) => return Some(Utf8Str::from_bytes(substr)),
537
538                // We found a single UTF-8 encoded character. keep track
539                // of the count and advance the substring to continue
540                // decoding.
541                //
542                // If there's only one more to go, advance and stop the
543                // loop.
544                (Some(_), size) if remaining == 1 => {
545                    // Push `endth` more positive because this match has
546                    // the effect of shrinking `slice`.
547                    let endth = substr.len() - slice.len() + size;
548                    let s = &substr[..endth];
549                    return Some(Utf8Str::from_bytes(s));
550                }
551                // Otherwise, keep track of the character we observed and
552                // advance the slice to continue decoding.
553                (Some(_), size) => {
554                    slice = &slice[size..];
555                    remaining -= 1;
556                }
557
558                // The next two arms handle the case where we have
559                // encountered an invalid UTF-8 byte sequence.
560                //
561                // In this case, `decode_utf8` will return slices whose
562                // length is `1..=3`. The length of this slice is the
563                // number of "characters" we can advance the loop by.
564                //
565                // If the invalid UTF-8 sequence contains more bytes
566                // than we have remaining to get to the `end`th char,
567                // then we can break the loop directly.
568                (None, size) if remaining <= size => {
569                    // For an explanation of this arithmetic:
570                    // If we're trying to slice:
571                    //
572                    // ```
573                    // s = "a\xF0\x9F\x87"
574                    // s[0, 2]
575                    // ```
576                    //
577                    // By the time we get to this branch in this loop:
578                    //
579                    // ```
580                    // substr = "a\xF0\x9F\x87"
581                    // slice = "\xF0\x9F\x87"
582                    // remaining = 1
583                    // ```
584                    //
585                    // We want to compute `endth == 2`:
586                    //
587                    //    2   =      4       -      3      +     1
588                    let endth = substr.len() - slice.len() + remaining;
589                    let s = &substr[..endth];
590                    return Some(Utf8Str::from_bytes(s));
591                }
592                // If there are more characters remaining than the number
593                // of bytes yielded in the invalid UTF-8 byte sequence,
594                // count `size` bytes and advance the slice to continue
595                // decoding.
596                (None, size) => {
597                    slice = &slice[size..];
598                    remaining -= size;
599                }
600            }
601        }
602    }
603}
604
605// Indexing
606impl Utf8Str {
607    #[inline]
608    #[must_use]
609    pub fn get<I>(&self, index: I) -> Option<&I::Output>
610    where
611        I: SliceIndex<[u8]>,
612    {
613        self.as_bytes().get(index)
614    }
615
616    #[inline]
617    #[must_use]
618    pub fn get_mut<I>(&mut self, index: I) -> Option<&mut I::Output>
619    where
620        I: SliceIndex<[u8]>,
621    {
622        self.as_bytes_mut().get_mut(index)
623    }
624
625    #[inline]
626    #[must_use]
627    pub unsafe fn get_unchecked<I>(&self, index: I) -> &I::Output
628    where
629        I: SliceIndex<[u8]>,
630    {
631        // SAFETY: The caller must uphold the documented safety contract, which
632        // is the same as the borrowed UTF-8 str's inner slice.
633        unsafe { self.as_bytes().get_unchecked(index) }
634    }
635
636    #[inline]
637    #[must_use]
638    pub unsafe fn get_unchecked_mut<I>(&mut self, index: I) -> &mut I::Output
639    where
640        I: SliceIndex<[u8]>,
641    {
642        // SAFETY: The caller must uphold the documented safety contract, which
643        // is the same as the borrowed UTF-8 str's inner slice.
644        unsafe { self.as_bytes_mut().get_unchecked_mut(index) }
645    }
646}
647
648// Encoding
649impl Utf8Str {
650    #[must_use]
651    pub fn is_ascii_only(&self) -> bool {
652        self.as_bytes().is_ascii()
653    }
654
655    #[must_use]
656    pub fn is_valid_encoding(&self) -> bool {
657        if self.is_ascii_only() {
658            return true;
659        }
660
661        simdutf8::basic::from_utf8(self.as_bytes()).is_ok()
662    }
663}
664
665// Slicing routines
666impl Utf8Str {
667    #[inline]
668    #[must_use]
669    pub fn starts_with(&self, slice: &[u8]) -> bool {
670        self.as_bytes().starts_with(slice)
671    }
672
673    #[inline]
674    #[must_use]
675    pub fn ends_with(&self, slice: &[u8]) -> bool {
676        self.as_bytes().ends_with(slice)
677    }
678}
679
680// Searching routines
681impl Utf8Str {
682    #[must_use]
683    pub fn index(&self, needle: &[u8], offset: usize) -> Option<usize> {
684        // Decode needle
685        // Needle containing any invalid UTF-8 should never match in MRI
686        //
687        // ```console
688        // [3.2.2] > s = "abc"
689        // => "abc"
690        // [3.2.2] > s.encoding
691        // => #<Encoding:UTF-8>
692        // [3.2.2] > s.index "\xFF"
693        // => nil
694        // [3.2.2] > s = "\xFF\xFE"
695        // => "\xFF\xFE"
696        // [3.2.2] > s.encoding
697        // => #<Encoding:UTF-8>
698        // [3.2.2] > s.index "\xFF"
699        // => nil
700        // [3.2.2] > s.index "\xFF".b
701        // (irb):14:in `index': incompatible character encodings: UTF-8 and ASCII-8BIT (Encoding::CompatibilityError)
702        //         from (irb):14:in `<main>'
703        //         from /usr/local/var/rbenv/versions/3.2.2/lib/ruby/gems/3.2.0/gems/irb-1.6.2/exe/irb:11:in `<top (required)>'
704        //         from /usr/local/var/rbenv/versions/3.2.2/bin/irb:25:in `load'
705        //         from /usr/local/var/rbenv/versions/3.2.2/bin/irb:25:in `<main>'
706        // ```
707        if !Utf8Str::from_bytes(needle).is_valid_encoding() {
708            return None;
709        }
710
711        let prefix = self.get_char_slice(0..offset)?;
712        let tail = &self[prefix.len()..];
713        let index = tail.as_bytes().find(needle)?;
714
715        let s = Utf8Str::from_bytes(&tail[..index]);
716        Some(offset + s.char_len())
717    }
718
719    #[must_use]
720    pub fn rindex(&self, needle: &[u8], offset: usize) -> Option<usize> {
721        // Decode needle
722        // Needle containing any invalid UTF-8 should never match in MRI
723        //
724        // ```console
725        // [3.2.2] > s = "abc"
726        // => "abc"
727        // [3.2.2] > s.encoding
728        // => #<Encoding:UTF-8>
729        // [3.2.2] > s.rindex "\xFF"
730        // => nil
731        // [3.2.2] > s = "\xFF\xFE"
732        // => "\xFF\xFE"
733        // [3.2.2] > s.encoding
734        // => #<Encoding:UTF-8>
735        // [3.2.2] > s.rindex "\xFF"
736        // => nil
737        // [3.2.2] > s.rindex "\xFF".b
738        // (irb):7:in `rindex': incompatible character encodings: UTF-8 and ASCII-8BIT (Encoding::CompatibilityError)
739        //         from (irb):7:in `<main>'
740        //         from /usr/local/var/rbenv/versions/3.2.2/lib/ruby/gems/3.2.0/gems/irb-1.6.2/exe/irb:11:in `<top (required)>'
741        //         from /usr/local/var/rbenv/versions/3.2.2/bin/irb:25:in `load'
742        //         from /usr/local/var/rbenv/versions/3.2.2/bin/irb:25:in `<main>'
743        // ```
744        if !needle.is_utf8() {
745            return None;
746        }
747
748        let endpoint = offset.saturating_add(1);
749        let buf = self.get_char_slice(0..endpoint).unwrap_or(self);
750        let index = buf.as_bytes().rfind(needle)?;
751        let s = Utf8Str::from_bytes(&buf[..index]);
752        Some(s.char_len())
753    }
754}
755
#[cfg(test)]
mod tests {
    use alloc::string::String;
    use core::fmt::Write;

    use super::Utf8Str;

    /// Render `s` through its `Debug` implementation into a fresh `String`.
    fn debug_repr(s: &Utf8Str) -> String {
        let mut out = String::new();
        write!(&mut out, "{s:?}").unwrap();
        out
    }

    #[test]
    fn empty_is_empty() {
        let empty = Utf8Str::empty();
        assert_eq!(empty.len(), 0);
        assert_eq!(empty.as_bytes(), &[]);
    }

    #[test]
    fn default_is_empty() {
        assert_eq!(Utf8Str::empty(), <&Utf8Str>::default());
    }

    #[test]
    fn debug_is_not_empty() {
        // Empty, ASCII, multi-byte UTF-8 and invalid UTF-8 inputs must all
        // produce some debug output.
        let inputs: [&Utf8Str; 4] = [
            Utf8Str::empty(),
            Utf8Str::new("abc"),
            Utf8Str::new("🦀💎"),
            Utf8Str::new(b"\xFF\xFE"),
        ];
        for input in inputs.iter() {
            assert!(!debug_repr(input).is_empty());
        }

        assert!(debug_repr(Utf8Str::new("abc")).contains(r#""abc""#));
    }

    #[test]
    fn debug_contains_readable_byte_contents() {
        // Each case pairs an input with the quoted rendering its debug output
        // must contain.
        let cases: [(&Utf8Str, &str); 4] = [
            (Utf8Str::empty(), r#""""#),
            (Utf8Str::new("abc"), r#""abc""#),
            (Utf8Str::new("🦀💎"), r#""🦀💎""#),
            (Utf8Str::new(b"\xFF\xFE"), r#""\xff\xfe""#),
        ];
        for (input, expected) in cases.iter() {
            assert!(debug_repr(input).contains(expected));
        }
    }

    #[test]
    fn slice_indexing_is_byte_slicing() {
        let s = Utf8Str::new("a🦀b💎c");
        let byte_len = s.len();

        // Individual bytes can be copied out of the string ref.
        for pos in 0..byte_len {
            let _: u8 = s[pos];
        }

        // Slicing in the middle of multi-byte UTF-8 characters is fine, both
        // for one byte and two byte windows.
        for pos in 0..byte_len {
            let _: &[u8] = &s[pos..=pos];
        }
        for pos in 0..byte_len - 1 {
            let _: &[u8] = &s[pos..pos + 2];
        }
    }

    #[test]
    fn mut_slice_indexing_is_mut_byte_slicing() {
        let mut data = "a🦀b💎c".as_bytes().to_vec();

        // Individual bytes can be overwritten through the mutable string ref.
        let s = Utf8Str::new_mut(&mut data);
        for pos in 0..s.len() {
            let cell: &mut u8 = &mut s[pos];
            *cell = b'!';
        }
        assert_eq!(s, Utf8Str::new("!!!!!!!!!!!"));

        // Mutable slicing in the middle of (formerly) multi-byte UTF-8
        // characters is fine.
        let s = Utf8Str::new_mut(&mut data);
        for pos in 0..s.len() {
            let span: &mut [u8] = &mut s[pos..=pos];
            span.copy_from_slice(b"%");
        }
        assert_eq!(s, Utf8Str::new("%%%%%%%%%%%"));

        // Overlapping two byte windows: each later write replaces the `&` left
        // behind by the previous one, so only the final `&` survives.
        let s = Utf8Str::new_mut(&mut data);
        for pos in 0..s.len() - 1 {
            let span: &mut [u8] = &mut s[pos..pos + 2];
            span.copy_from_slice(b"^&");
        }
        assert_eq!(s, Utf8Str::new("^^^^^^^^^^&"));
    }
}
spinoso_string/enc/utf8/borrowed.rs

spinoso_string/enc/utf8/
borrowed.rs