spinoso_string/enc/utf8/borrowed.rs
1use alloc::boxed::Box;
2use core::ops::Range;
3use core::slice::SliceIndex;
4
5use bstr::ByteSlice;
6
7use crate::iter::{Bytes, Iter, IterMut};
8use crate::ord::OrdError;
9
10mod codepoints;
11mod eq;
12mod impls;
13
14pub use codepoints::Codepoints;
15
/// A borrowed, unsized view over bytes that are conventionally UTF-8.
///
/// The type is a `repr(transparent)` wrapper around `[u8]`; the constructors in
/// this file rely on that layout guarantee to cast between `[u8]` and
/// `Utf8Str` pointers. The constructors do not validate the bytes, so the
/// contents may contain invalid UTF-8 byte sequences.
#[repr(transparent)]
pub struct Utf8Str {
    // The raw, unvalidated contents of the string.
    bytes: [u8],
}
20
21impl Utf8Str {
22 #[inline]
23 #[must_use]
24 pub fn new<B: ?Sized + AsRef<[u8]>>(bytes: &B) -> &Utf8Str {
25 Utf8Str::from_bytes(bytes.as_ref())
26 }
27
28 #[inline]
29 #[must_use]
30 pub(crate) fn new_mut<B: ?Sized + AsMut<[u8]>>(bytes: &mut B) -> &mut Utf8Str {
31 Utf8Str::from_bytes_mut(bytes.as_mut())
32 }
33
34 #[inline]
35 #[must_use]
36 pub const fn empty() -> &'static Utf8Str {
37 Utf8Str::from_bytes(b"")
38 }
39
40 #[inline]
41 #[must_use]
42 pub const fn from_bytes(slice: &[u8]) -> &Utf8Str {
43 // SAFETY: `Utf8Str` is a `repr(transparent)` wrapper around `[u8]`.
44 unsafe {
45 let ptr: *const [u8] = slice;
46 let ptr = ptr as *const Utf8Str;
47 &*ptr
48 }
49 }
50
51 #[inline]
52 #[must_use]
53 pub fn from_bytes_mut(slice: &mut [u8]) -> &mut Utf8Str {
54 // SAFETY: `Utf8Str` is a `repr(transparent)` wrapper around `[u8]`.
55 unsafe {
56 let ptr: *mut [u8] = slice;
57 let ptr = ptr as *mut Utf8Str;
58 &mut *ptr
59 }
60 }
61
62 #[inline]
63 pub fn from_boxed_bytes(slice: Box<[u8]>) -> Box<Utf8Str> {
64 // SAFETY: `Utf8Str` is a `repr(transparent)` wrapper around `[u8]`.
65 unsafe { Box::from_raw(Box::into_raw(slice) as _) }
66 }
67
68 #[inline]
69 pub fn into_boxed_bytes(slice: Box<Utf8Str>) -> Box<[u8]> {
70 // SAFETY: `Utf8Str` is a `repr(transparent)` wrapper around `[u8]`.
71 unsafe { Box::from_raw(Box::into_raw(slice) as _) }
72 }
73
74 #[inline]
75 #[must_use]
76 pub const fn as_bytes(&self) -> &[u8] {
77 &self.bytes
78 }
79
80 #[inline]
81 #[must_use]
82 pub fn as_bytes_mut(&mut self) -> &mut [u8] {
83 &mut self.bytes
84 }
85}
86
87// Raw
88impl Utf8Str {
89 #[inline]
90 #[must_use]
91 pub fn as_ptr(&self) -> *const u8 {
92 self.as_bytes().as_ptr()
93 }
94
95 #[inline]
96 #[must_use]
97 pub fn as_mut_ptr(&mut self) -> *mut u8 {
98 self.as_bytes_mut().as_mut_ptr()
99 }
100}
101
102// Core Iterators
103impl Utf8Str {
104 #[inline]
105 #[must_use]
106 pub fn iter(&self) -> Iter<'_> {
107 Iter::from_slice(self.as_bytes())
108 }
109
110 #[inline]
111 #[must_use]
112 pub fn iter_mut(&mut self) -> IterMut<'_> {
113 IterMut::from_mut_slice(self.as_bytes_mut())
114 }
115
116 #[inline]
117 #[must_use]
118 pub fn bytes(&self) -> Bytes<'_> {
119 Bytes::from_slice(self.as_bytes())
120 }
121}
122
123// Size and Capacity
124impl Utf8Str {
125 #[inline]
126 #[must_use]
127 pub fn len(&self) -> usize {
128 self.as_bytes().len()
129 }
130
131 #[inline]
132 #[must_use]
133 pub fn is_empty(&self) -> bool {
134 self.as_bytes().is_empty()
135 }
136}
137
138// Character-oriented APIs
139impl Utf8Str {
140 #[must_use]
141 pub fn chr(&self) -> &Utf8Str {
142 let slice = self.as_bytes();
143 let prefix = match bstr::decode_utf8(slice) {
144 (Some(_), size) => size,
145 (None, 0) => return Utf8Str::empty(),
146 (None, _) => 1,
147 };
148 // SAFETY: the UTF-8 decode above guarantees the prefix length is a
149 // valid slice index.
150 let s = unsafe { self.get_unchecked(..prefix) };
151 Utf8Str::from_bytes(s)
152 }
153
154 pub fn ord(&self) -> Result<u32, OrdError> {
155 let (ch, size) = bstr::decode_utf8(self.as_bytes());
156 match ch {
157 // All `char`s are valid `u32`s
158 Some(ch) => Ok(u32::from(ch)),
159 None if size == 0 => Err(OrdError::empty_string()),
160 None => Err(OrdError::invalid_utf8_byte_sequence()),
161 }
162 }
163
164 #[must_use]
165 pub fn char_len(&self) -> usize {
166 let bytes = self.as_bytes();
167
168 let tail = if let Some(idx) = bytes.find_non_ascii_byte() {
169 idx
170 } else {
171 // The entire string is ASCII bytes, so fast-path return the slice
172 // length.
173 return bytes.len();
174 };
175
176 // SAFETY: `ByteSlice::find_non_ascii_byte` guarantees that the index is
177 // in range for slicing if `Some(_)` is returned.
178 let bytes = unsafe { bytes.get_unchecked(tail..) };
179
180 // if the tail is valid UTF-8, use a fast path by delegating to SIMD
181 // `bytecount` crate.
182 if simdutf8::basic::from_utf8(bytes).is_ok() {
183 return tail + bytecount::num_chars(bytes);
184 }
185
186 // Else fallback to decoding UTF-8 in chunks using `bstr`.
187 let mut char_len = tail;
188 for chunk in bytes.utf8_chunks() {
189 char_len += bytecount::num_chars(chunk.valid().as_bytes());
190 char_len += chunk.invalid().len();
191 }
192 char_len
193 }
194
    /// Returns the `index`th "character" of the string as a sub-slice, or
    /// `None` when the string has `index` or fewer characters.
    ///
    /// A character is either one successfully decoded UTF-8 scalar value
    /// (returned with all of its bytes) or a single byte taken from an invalid
    /// UTF-8 byte sequence.
    #[must_use]
    pub fn get_char(&self, index: usize) -> Option<&Utf8Str> {
        // Fast path rejection for indexes beyond bytesize, which is cheap to
        // retrieve. A string can never have more characters than bytes.
        if index >= self.len() {
            return None;
        }

        let slice = self.as_bytes();
        // Fast path for trying to treat the conventionally UTF-8 string as
        // entirely ASCII.
        //
        // If the string is either all ASCII or all ASCII for a prefix of the
        // string that contains the range we wish to slice, use byte slicing
        // like `AsciiStr` and `BinaryStr` do.
        let consumed = match slice.find_non_ascii_byte() {
            // The string is entirely ASCII, so we can always use byte slicing
            // to mean char slicing.
            None => {
                let s = slice.get(index..=index)?;
                return Some(Utf8Str::from_bytes(s));
            }
            // The first non-ASCII character occurs beyond the index we wish to
            // retrieve, so we can use byte slicing to mean char slicing.
            Some(idx) if idx > index => {
                let s = slice.get(index..=index)?;
                return Some(Utf8Str::from_bytes(s));
            }
            // The first `idx` characters of the `Utf8Str` end at the `idx` byte
            // position, so that many characters have already been accounted
            // for.
            Some(idx) => idx,
        };

        // Discard the ASCII prefix and begin a forward search with a character-
        // at-a-time decode.
        //
        // SAFETY: `find_non_ascii_byte` guarantees that when `Some(idx)` is
        // returned, `idx` is a valid position in the slice.
        let mut slice = unsafe { slice.get_unchecked(consumed..) };
        // Count of "characters" remaining until the `index`th character. This
        // cannot underflow because the match above only yields `consumed` when
        // `consumed <= index`.
        let mut remaining = index - consumed;

        // This loop will terminate when either:
        //
        // - It counts `index` number of characters.
        // - It consumes the entire slice when scanning for the `index`th
        //   character.
        //
        // The loop will advance by at least one byte every iteration.
        loop {
            match bstr::decode_utf8(slice) {
                // `decode_utf8` only returns a 0 size when the slice is empty.
                //
                // If we've run out of slice while trying to find the `index`th
                // character, the lookup fails and we return `None`.
                (_, 0) => return None,

                // The next two arms mean we've reached the `index`th character.
                // Either return the next valid UTF-8 character byte slice or,
                // if the next bytes are an invalid UTF-8 sequence, the next byte.
                (Some(_), size) if remaining == 0 => {
                    // SAFETY: `decode_utf8` guarantees that the number of bytes
                    // returned on a successful decode can be used to slice into
                    // the given slice.
                    let s = unsafe { slice.get_unchecked(..size) };
                    return Some(Utf8Str::from_bytes(s));
                }
                (None, _) if remaining == 0 => {
                    // SAFETY: `decode_utf8` guarantees unsuccessful decodes
                    // consume 0..=3 bytes and size is guaranteed to be non-zero
                    // per the first match arm, so at least one byte exists.
                    let s = unsafe { slice.get_unchecked(..1) };
                    return Some(Utf8Str::from_bytes(s));
                }

                // We found a single UTF-8 encoded character; keep track of the
                // count and advance the substring to continue decoding.
                (Some(_), size) => {
                    // SAFETY: `decode_utf8` guarantees that at least `size`
                    // bytes exist in the slice.
                    slice = unsafe { slice.get_unchecked(size..) };
                    remaining -= 1;
                }

                // The next two arms handle the case where we have encountered
                // an invalid UTF-8 byte sequence.
                //
                // In this case, `decode_utf8` will return slices whose length
                // is `1..=3`. The length of this slice is the number of
                // "characters" we can advance the loop by.
                //
                // If the invalid UTF-8 sequence contains more bytes than we
                // have remaining to get to the `index`th char, then the target
                // character is inside the invalid UTF-8 sequence.
                (None, size) if remaining < size => {
                    // SAFETY: `decode_utf8` guarantees that at least `size`
                    // bytes exist in the slice and we check that `remaining` is
                    // less than `size`.
                    let s = unsafe { slice.get_unchecked(remaining..=remaining) };
                    return Some(Utf8Str::from_bytes(s));
                }
                // If there are more characters remaining than the number of
                // bytes yielded in the invalid UTF-8 byte sequence, count
                // `size` bytes and advance the slice to continue decoding.
                (None, size) => {
                    // SAFETY: `decode_utf8` guarantees that at least `size`
                    // bytes exist in the slice.
                    slice = unsafe { slice.get_unchecked(size..) };
                    remaining -= size;
                }
            }
        }
    }
308
    /// Returns the sub-slice covering the characters in `range`, or `None`
    /// when `range.start` lies beyond the character length of the string.
    ///
    /// Characters are counted the same way as in `get_char`: one per decoded
    /// UTF-8 scalar value and one per byte of an invalid UTF-8 sequence.
    ///
    /// - A range whose `end` is before its `start` yields the empty string as
    ///   long as `start` is within the string.
    /// - A range starting exactly at the character length yields the empty
    ///   string.
    /// - A range whose `end` is beyond the character length saturates to the
    ///   end of the string.
    #[must_use]
    pub fn get_char_slice(&self, range: Range<usize>) -> Option<&Utf8Str> {
        let Range { start, end } = range;

        // Fast path the lookup if the end of the range is before the start.
        if end < start {
            // Yes, these types of ranges are allowed and they return `""`.
            //
            // ```
            // [3.0.1] > "aaa"[1..0]
            // => ""
            // [3.0.1] > "aaa"[2..0]
            // => ""
            // [3.0.1] > "aaa"[2..1]
            // => ""
            // [3.0.1] > "💎🦀😅"[2..1]
            // => ""
            // [3.0.1] > "💎🦀😅"[3..0]
            // => ""
            // ```
            //
            // but only if `start` is within the string.
            //
            // ```
            // [3.0.1] > "aaa"[10..4]
            // => nil
            // [3.0.1] > "aaa"[10..0]
            // => nil
            // [3.0.1] > "💎🦀😅"[10..4]
            // => nil
            // [3.0.1] > "💎🦀😅"[10..0]
            // => nil
            // [3.0.1] > "💎🦀😅"[6..0]
            // => nil
            // [3.0.1] > "💎🦀😅"[4..0]
            // => nil
            // ```
            //
            // Attempt to short-circuit with a cheap byte length retrieval
            // before paying for the character count.
            if start > self.len() || start > self.char_len() {
                return None;
            }
            return Some(Utf8Str::empty());
        }

        // If the start of the range is beyond the character count of the
        // string, the whole lookup must fail.
        //
        // Slice lookups where the start is just beyond the last character index
        // always return an empty slice.
        //
        // ```
        // [3.0.1] > "aaa"[10, 0]
        // => nil
        // [3.0.1] > "aaa"[10, 7]
        // => nil
        // [3.0.1] > "aaa"[3, 7]
        // => ""
        // [3.0.1] > "🦀💎"[2, 0]
        // => ""
        // [3.0.1] > "🦀💎"[3, 1]
        // => nil
        // [3.0.1] > "🦀💎"[2, 1]
        // => ""
        // ```
        //
        // Fast path rejection for indexes beyond bytesize, which is cheap to
        // retrieve.
        if start > self.len() {
            return None;
        }
        match self.char_len() {
            char_length if start > char_length => return None,
            char_length if start == char_length => return Some(Utf8Str::empty()),
            _ => {}
        }

        // The span is guaranteed to at least partially overlap now.
        //
        // `end - start` cannot underflow: the `end < start` case returned
        // above.
        match end - start {
            // Empty substrings are present in all strings, even empty ones.
            //
            // ```
            // [3.0.1] > "aaa"[""]
            // => ""
            // [3.0.1] > ""[""]
            // => ""
            // [3.0.1] > ""[0, 0]
            // => ""
            // [3.0.1] > "aaa"[0, 0]
            // => ""
            // [3.0.1] > "aaa"[2, 0]
            // => ""
            // [3.0.1] > "🦀💎"[1, 0]
            // => ""
            // [3.0.1] > "🦀💎"[2, 0]
            // => ""
            // ```
            0 => return Some(Utf8Str::empty()),
            // Delegate to the specialized single char lookup, which allows the
            // remainder of this routine to fall back to the general case of
            // multi-character spans.
            //
            // ```
            // [3.0.1] > "abc"[2, 1]
            // => "c"
            // [3.0.1] > "🦀💎"[1, 1]
            // => "💎"
            // ```
            1 => return self.get_char(start),
            _ => {}
        }

        let slice = self.as_bytes();

        // Fast path for trying to treat the conventionally UTF-8 string
        // as entirely ASCII.
        //
        // If the string is either all ASCII or all ASCII for the subset
        // of the string we wish to slice, fall back to byte slicing as in
        // the ASCII and binary fast path.
        //
        // Perform the same saturate-to-end slicing mechanism if `end`
        // is beyond the character length of the string.
        let consumed = match slice.find_non_ascii_byte() {
            // The entire string is ASCII, so byte indexing <=> char
            // indexing.
            None => {
                let s = slice.get(start..end).or_else(|| slice.get(start..))?;
                return Some(Utf8Str::from_bytes(s));
            }
            // The whole substring we are interested in is ASCII, so
            // byte indexing is still valid.
            Some(non_ascii_byte_offset) if non_ascii_byte_offset > end => {
                let s = self.get(start..end)?;
                return Some(Utf8Str::from_bytes(s));
            }
            // We turn non-ASCII somewhere before the substring we're
            // interested in, so consume that much.
            Some(non_ascii_byte_offset) if non_ascii_byte_offset <= start => non_ascii_byte_offset,
            // This means we turn non-ASCII somewhere inside the substring.
            // Consume up to start.
            Some(_) => start,
        };

        // Scan for the beginning of the slice.
        let mut slice = &slice[consumed..];
        // Count of "characters" remaining until the `start`th character.
        let mut remaining = start - consumed;

        if remaining > 0 {
            // This loop will terminate when either:
            //
            // - It counts `start` number of characters.
            // - It consumes the entire slice when scanning for the
            //   `start`th character.
            //
            // The loop will advance by at least one byte every iteration.
            loop {
                match bstr::decode_utf8(slice) {
                    // If we've run out of slice while trying to find the
                    // `start`th character, the lookup fails and we return
                    // `None`.
                    (_, 0) => return None,

                    // We found a single UTF-8 encoded character. Keep track
                    // of the count and advance the substring to continue
                    // decoding.
                    //
                    // If there's only one more to go, advance and stop the
                    // loop.
                    (Some(_), size) if remaining == 1 => {
                        slice = &slice[size..];
                        break;
                    }
                    // Otherwise, keep track of the character we observed and
                    // advance the slice to continue decoding.
                    (Some(_), size) => {
                        slice = &slice[size..];
                        remaining -= 1;
                    }

                    // The next two arms handle the case where we have
                    // encountered an invalid UTF-8 byte sequence.
                    //
                    // In this case, `decode_utf8` will return slices whose
                    // length is `1..=3`. The length of this slice is the
                    // number of "characters" we can advance the loop by.
                    //
                    // If the invalid UTF-8 sequence contains at least as many
                    // bytes as we have remaining to get to the `start`th
                    // char, then we can break the loop directly.
                    (None, size) if remaining <= size => {
                        slice = &slice[remaining..];
                        break;
                    }
                    // If there are more characters remaining than the number
                    // of bytes yielded in the invalid UTF-8 byte sequence,
                    // count `size` bytes and advance the slice to continue
                    // decoding.
                    (None, size) => {
                        slice = &slice[size..];
                        remaining -= size;
                    }
                }
            }
        };

        // Scan the slice for the span of characters we want to return.
        remaining = end - start;
        // We know `remaining` is not zero because we fast-pathed that
        // case above.
        debug_assert!(remaining > 0);

        // Keep track of the start of the substring from the `start`th
        // character.
        let substr = slice;

        // This loop will terminate when either:
        //
        // - It counts the next `end - start` number of characters.
        // - It consumes the entire slice when scanning for the `end`th
        //   character.
        //
        // The loop will advance by at least one byte every iteration.
        loop {
            match bstr::decode_utf8(slice) {
                // If we've run out of slice while trying to find the `end`th
                // character, saturate the slice to the end of the string.
                (_, 0) => return Some(Utf8Str::from_bytes(substr)),

                // We found a single UTF-8 encoded character. Keep track
                // of the count and advance the substring to continue
                // decoding.
                //
                // If there's only one more to go, advance and stop the
                // loop.
                (Some(_), size) if remaining == 1 => {
                    // Push `endth` more positive because this match has
                    // the effect of shrinking `slice`.
                    let endth = substr.len() - slice.len() + size;
                    let s = &substr[..endth];
                    return Some(Utf8Str::from_bytes(s));
                }
                // Otherwise, keep track of the character we observed and
                // advance the slice to continue decoding.
                (Some(_), size) => {
                    slice = &slice[size..];
                    remaining -= 1;
                }

                // The next two arms handle the case where we have
                // encountered an invalid UTF-8 byte sequence.
                //
                // In this case, `decode_utf8` will return slices whose
                // length is `1..=3`. The length of this slice is the
                // number of "characters" we can advance the loop by.
                //
                // If the invalid UTF-8 sequence contains at least as many
                // bytes as we have remaining to get to the `end`th char,
                // then we can return directly.
                (None, size) if remaining <= size => {
                    // For an explanation of this arithmetic:
                    // If we're trying to slice:
                    //
                    // ```
                    // s = "a\xF0\x9F\x87"
                    // s[0, 2]
                    // ```
                    //
                    // By the time we get to this branch in this loop:
                    //
                    // ```
                    // substr = "a\xF0\x9F\x87"
                    // slice = "\xF0\x9F\x87"
                    // remaining = 1
                    // ```
                    //
                    // We want to compute `endth == 2`:
                    //
                    //    2 = 4 - 3 + 1
                    let endth = substr.len() - slice.len() + remaining;
                    let s = &substr[..endth];
                    return Some(Utf8Str::from_bytes(s));
                }
                // If there are more characters remaining than the number
                // of bytes yielded in the invalid UTF-8 byte sequence,
                // count `size` bytes and advance the slice to continue
                // decoding.
                (None, size) => {
                    slice = &slice[size..];
                    remaining -= size;
                }
            }
        }
    }
603}
604
605// Indexing
606impl Utf8Str {
607 #[inline]
608 #[must_use]
609 pub fn get<I>(&self, index: I) -> Option<&I::Output>
610 where
611 I: SliceIndex<[u8]>,
612 {
613 self.as_bytes().get(index)
614 }
615
616 #[inline]
617 #[must_use]
618 pub fn get_mut<I>(&mut self, index: I) -> Option<&mut I::Output>
619 where
620 I: SliceIndex<[u8]>,
621 {
622 self.as_bytes_mut().get_mut(index)
623 }
624
625 #[inline]
626 #[must_use]
627 pub unsafe fn get_unchecked<I>(&self, index: I) -> &I::Output
628 where
629 I: SliceIndex<[u8]>,
630 {
631 // SAFETY: The caller must uphold the documented safety contract, which
632 // is the same as the borrowed UTF-8 str's inner slice.
633 unsafe { self.as_bytes().get_unchecked(index) }
634 }
635
636 #[inline]
637 #[must_use]
638 pub unsafe fn get_unchecked_mut<I>(&mut self, index: I) -> &mut I::Output
639 where
640 I: SliceIndex<[u8]>,
641 {
642 // SAFETY: The caller must uphold the documented safety contract, which
643 // is the same as the borrowed UTF-8 str's inner slice.
644 unsafe { self.as_bytes_mut().get_unchecked_mut(index) }
645 }
646}
647
648// Encoding
649impl Utf8Str {
650 #[must_use]
651 pub fn is_ascii_only(&self) -> bool {
652 self.as_bytes().is_ascii()
653 }
654
655 #[must_use]
656 pub fn is_valid_encoding(&self) -> bool {
657 if self.is_ascii_only() {
658 return true;
659 }
660
661 simdutf8::basic::from_utf8(self.as_bytes()).is_ok()
662 }
663}
664
665// Slicing routines
666impl Utf8Str {
667 #[inline]
668 #[must_use]
669 pub fn starts_with(&self, slice: &[u8]) -> bool {
670 self.as_bytes().starts_with(slice)
671 }
672
673 #[inline]
674 #[must_use]
675 pub fn ends_with(&self, slice: &[u8]) -> bool {
676 self.as_bytes().ends_with(slice)
677 }
678}
679
680// Searching routines
681impl Utf8Str {
682 #[must_use]
683 pub fn index(&self, needle: &[u8], offset: usize) -> Option<usize> {
684 // Decode needle
685 // Needle containing any invalid UTF-8 should never match in MRI
686 //
687 // ```console
688 // [3.2.2] > s = "abc"
689 // => "abc"
690 // [3.2.2] > s.encoding
691 // => #<Encoding:UTF-8>
692 // [3.2.2] > s.index "\xFF"
693 // => nil
694 // [3.2.2] > s = "\xFF\xFE"
695 // => "\xFF\xFE"
696 // [3.2.2] > s.encoding
697 // => #<Encoding:UTF-8>
698 // [3.2.2] > s.index "\xFF"
699 // => nil
700 // [3.2.2] > s.index "\xFF".b
701 // (irb):14:in `index': incompatible character encodings: UTF-8 and ASCII-8BIT (Encoding::CompatibilityError)
702 // from (irb):14:in `<main>'
703 // from /usr/local/var/rbenv/versions/3.2.2/lib/ruby/gems/3.2.0/gems/irb-1.6.2/exe/irb:11:in `<top (required)>'
704 // from /usr/local/var/rbenv/versions/3.2.2/bin/irb:25:in `load'
705 // from /usr/local/var/rbenv/versions/3.2.2/bin/irb:25:in `<main>'
706 // ```
707 if !Utf8Str::from_bytes(needle).is_valid_encoding() {
708 return None;
709 }
710
711 let prefix = self.get_char_slice(0..offset)?;
712 let tail = &self[prefix.len()..];
713 let index = tail.as_bytes().find(needle)?;
714
715 let s = Utf8Str::from_bytes(&tail[..index]);
716 Some(offset + s.char_len())
717 }
718
719 #[must_use]
720 pub fn rindex(&self, needle: &[u8], offset: usize) -> Option<usize> {
721 // Decode needle
722 // Needle containing any invalid UTF-8 should never match in MRI
723 //
724 // ```console
725 // [3.2.2] > s = "abc"
726 // => "abc"
727 // [3.2.2] > s.encoding
728 // => #<Encoding:UTF-8>
729 // [3.2.2] > s.rindex "\xFF"
730 // => nil
731 // [3.2.2] > s = "\xFF\xFE"
732 // => "\xFF\xFE"
733 // [3.2.2] > s.encoding
734 // => #<Encoding:UTF-8>
735 // [3.2.2] > s.rindex "\xFF"
736 // => nil
737 // [3.2.2] > s.rindex "\xFF".b
738 // (irb):7:in `rindex': incompatible character encodings: UTF-8 and ASCII-8BIT (Encoding::CompatibilityError)
739 // from (irb):7:in `<main>'
740 // from /usr/local/var/rbenv/versions/3.2.2/lib/ruby/gems/3.2.0/gems/irb-1.6.2/exe/irb:11:in `<top (required)>'
741 // from /usr/local/var/rbenv/versions/3.2.2/bin/irb:25:in `load'
742 // from /usr/local/var/rbenv/versions/3.2.2/bin/irb:25:in `<main>'
743 // ```
744 if !needle.is_utf8() {
745 return None;
746 }
747
748 let endpoint = offset.saturating_add(1);
749 let buf = self.get_char_slice(0..endpoint).unwrap_or(self);
750 let index = buf.as_bytes().rfind(needle)?;
751 let s = Utf8Str::from_bytes(&buf[..index]);
752 Some(s.char_len())
753 }
754}
755
#[cfg(test)]
mod tests {
    use alloc::string::String;
    use core::fmt::Write;

    use super::Utf8Str;

    /// Renders the `Debug` representation of `s` into a fresh `String`.
    fn debug_repr(s: &Utf8Str) -> String {
        let mut rendered = String::new();
        write!(&mut rendered, "{s:?}").unwrap();
        rendered
    }

    #[test]
    fn empty_is_empty() {
        let empty = Utf8Str::empty();
        assert_eq!(empty.len(), 0);
        assert_eq!(empty.as_bytes(), &[]);
    }

    #[test]
    fn default_is_empty() {
        assert_eq!(Utf8Str::empty(), <&Utf8Str>::default());
    }

    #[test]
    fn debug_is_not_empty() {
        // The empty string still renders something.
        assert!(!debug_repr(Utf8Str::empty()).is_empty());

        // ASCII content renders and is quoted.
        let ascii = debug_repr(Utf8Str::new("abc"));
        assert!(!ascii.is_empty());
        assert!(ascii.contains(r#""abc""#));

        // Multi-byte UTF-8 content renders.
        assert!(!debug_repr(Utf8Str::new("🦀💎")).is_empty());

        // Invalid UTF-8 content renders.
        assert!(!debug_repr(Utf8Str::new(b"\xFF\xFE")).is_empty());
    }

    #[test]
    fn debug_contains_readable_byte_contents() {
        let empty = debug_repr(Utf8Str::empty());
        assert!(empty.contains(r#""""#));

        let ascii = debug_repr(Utf8Str::new("abc"));
        assert!(ascii.contains(r#""abc""#));

        let emoji = debug_repr(Utf8Str::new("🦀💎"));
        assert!(emoji.contains(r#""🦀💎""#));

        let invalid = debug_repr(Utf8Str::new(b"\xFF\xFE"));
        assert!(invalid.contains(r#""\xff\xfe""#));
    }

    #[test]
    fn slice_indexing_is_byte_slicing() {
        let s = Utf8Str::new("a🦀b💎c");
        let byte_len = s.len();

        // Individual bytes can be copied out of the string ref.
        for pos in 0..byte_len {
            let _: u8 = s[pos];
        }

        // Slicing in the middle of multi-byte UTF-8 characters is fine, both
        // for one-byte and two-byte windows.
        for pos in 0..byte_len {
            let _: &[u8] = &s[pos..=pos];
        }
        for pos in 0..byte_len - 1 {
            let _: &[u8] = &s[pos..pos + 2];
        }
    }

    #[test]
    fn mut_slice_indexing_is_mut_byte_slicing() {
        let mut data = "a🦀b💎c".as_bytes().to_vec();

        // Individual bytes can be overwritten through the string ref.
        let s = Utf8Str::new_mut(&mut data);
        for pos in 0..s.len() {
            let cell: &mut u8 = &mut s[pos];
            *cell = b'!';
        }
        assert_eq!(s, Utf8Str::new("!!!!!!!!!!!"));

        // Mutably slicing in the middle of multi-byte UTF-8 characters is
        // fine.
        let s = Utf8Str::new_mut(&mut data);
        for pos in 0..s.len() {
            let span: &mut [u8] = &mut s[pos..=pos];
            span.copy_from_slice(b"%");
        }
        assert_eq!(s, Utf8Str::new("%%%%%%%%%%%"));

        // Overlapping two-byte windows: each write's second byte is
        // overwritten by the next iteration, except for the final one.
        let s = Utf8Str::new_mut(&mut data);
        for pos in 0..s.len() - 1 {
            let span: &mut [u8] = &mut s[pos..pos + 2];
            span.copy_from_slice(b"^&");
        }
        assert_eq!(s, Utf8Str::new("^^^^^^^^^^&"));
    }
}
865}