spinoso_string/enc/utf8/
mod.rs

// Generates a symmetric pair of `PartialEq` impls between `$lhs` (a type that
// exposes `as_bytes()`) and `$rhs` (a type that can be viewed as `&[u8]` via
// `as_ref()`). Both directions compare the underlying byte slices.
macro_rules! impl_partial_eq {
    ($lhs:ty, $rhs:ty) => {
        // `$lhs == $rhs`
        impl<'a, 'b> PartialEq<$rhs> for $lhs {
            #[inline]
            fn eq(&self, other: &$rhs) -> bool {
                let rhs_bytes: &[u8] = other.as_ref();
                self.as_bytes() == rhs_bytes
            }
        }

        // `$rhs == $lhs`
        impl<'a, 'b> PartialEq<$lhs> for $rhs {
            #[inline]
            fn eq(&self, other: &$lhs) -> bool {
                let own_bytes: &[u8] = self.as_ref();
                own_bytes == other.as_bytes()
            }
        }
    };
}
20
// Array flavour of `impl_partial_eq!`: the generated impls carry a
// `const N: usize` parameter so `$rhs` may mention `N` (for example a byte
// array type). Both directions compare the underlying byte slices.
macro_rules! impl_partial_eq_array {
    ($lhs:ty, $rhs:ty) => {
        // `$lhs == $rhs`
        impl<'a, 'b, const N: usize> PartialEq<$rhs> for $lhs {
            #[inline]
            fn eq(&self, other: &$rhs) -> bool {
                let rhs_bytes: &[u8] = other.as_ref();
                self.as_bytes() == rhs_bytes
            }
        }

        // `$rhs == $lhs`
        impl<'a, 'b, const N: usize> PartialEq<$lhs> for $rhs {
            #[inline]
            fn eq(&self, other: &$lhs) -> bool {
                let own_bytes: &[u8] = self.as_ref();
                own_bytes == other.as_bytes()
            }
        }
    };
}
40
41mod borrowed;
42mod case_change;
43mod inspect;
44mod owned;
45
46pub use borrowed::Codepoints;
47pub use borrowed::Utf8Str;
48pub use inspect::Inspect;
49pub use owned::Utf8String;
50
51#[cfg(test)]
52#[expect(clippy::invisible_characters, reason = "testing naughty UTF-8 strings")]
53mod tests {
54    use alloc::string::String;
55    use alloc::vec::Vec;
56    use core::str;
57
58    use super::{Utf8Str, Utf8String};
59    use crate::test::run_arbitrary;
60
61    const REPLACEMENT_CHARACTER_BYTES: [u8; 3] = [239, 191, 189];
62
63    #[test]
64    fn prop_fuzz_char_len_utf8_contents_utf8_string() {
65        run_arbitrary::<String>(|contents| {
66            let expected = contents.chars().count();
67            let s = Utf8String::from(contents);
68            assert_eq!(s.char_len(), expected);
69        });
70    }
71
72    #[test]
73    fn prop_fuzz_len_utf8_contents_utf8_string() {
74        run_arbitrary::<String>(|contents| {
75            let expected = contents.len();
76            let s = Utf8String::from(contents);
77            assert_eq!(s.len(), expected);
78        });
79    }
80
81    #[test]
82    fn prop_fuzz_char_len_binary_contents_utf8_string() {
83        run_arbitrary::<Vec<u8>>(|contents| {
84            if let Ok(utf8_contents) = str::from_utf8(&contents) {
85                let expected = utf8_contents.chars().count();
86                let s = Utf8String::from(contents);
87                assert_eq!(s.char_len(), expected);
88            } else {
89                let expected_at_most = contents.len();
90                let s = Utf8String::from(contents);
91                assert!(s.char_len() <= expected_at_most);
92            }
93        });
94    }
95
96    #[test]
97    fn prop_fuzz_len_binary_contents_utf8_string() {
98        run_arbitrary::<Vec<u8>>(|contents| {
99            let expected = contents.len();
100            let s = Utf8String::from(contents);
101            assert_eq!(s.len(), expected);
102        });
103    }
104
105    #[test]
106    fn constructs_empty_buffer() {
107        let s = Utf8String::from(Vec::new());
108        assert_eq!(0, s.len());
109    }
110
111    #[test]
112    fn char_len_empty() {
113        let s = Utf8String::from("");
114        assert_eq!(s.char_len(), 0);
115    }
116
117    #[test]
118    fn char_len_ascii() {
119        let s = Utf8String::from("Artichoke Ruby");
120        assert_eq!(s.char_len(), 14);
121    }
122
123    #[test]
124    fn char_len_emoji() {
125        let s = Utf8String::from("๐Ÿ’Ž");
126        assert_eq!(s.char_len(), 1);
127        let s = Utf8String::from("๐Ÿ’Ž๐Ÿฆ€๐ŸŽ‰");
128        assert_eq!(s.char_len(), 3);
129        let s = Utf8String::from("a๐Ÿ’Žb๐Ÿฆ€c๐ŸŽ‰d");
130        assert_eq!(s.char_len(), 7);
131        // with invalid UTF-8 bytes
132        let s = Utf8String::from(b"a\xF0\x9F\x92\x8E\xFFabc");
133        assert_eq!(s.char_len(), 6);
134    }
135
136    #[test]
137    fn char_len_unicode_replacement_character() {
138        let s = Utf8String::from("๏ฟฝ");
139        assert_eq!(s.char_len(), 1);
140        let s = Utf8String::from("๏ฟฝ๏ฟฝ๏ฟฝ");
141        assert_eq!(s.char_len(), 3);
142        let s = Utf8String::from("a๏ฟฝb๏ฟฝc๏ฟฝd");
143        assert_eq!(s.char_len(), 7);
144        let s = Utf8String::from("๏ฟฝ๐Ÿ’Žb๐Ÿฆ€c๐ŸŽ‰๏ฟฝ");
145        assert_eq!(s.char_len(), 7);
146        // with invalid UFF-8 bytes
147        let s = Utf8String::from(b"\xEF\xBF\xBD\xF0\x9F\x92\x8E\xFF\xEF\xBF\xBDab");
148        assert_eq!(s.char_len(), 6);
149        let s = Utf8String::from(REPLACEMENT_CHARACTER_BYTES);
150        assert_eq!(s.char_len(), 1);
151    }
152
153    #[test]
154    fn char_len_nul_byte() {
155        let s = Utf8String::from(b"\x00");
156        assert_eq!(s.char_len(), 1);
157        let s = Utf8String::from(b"abc\x00");
158        assert_eq!(s.char_len(), 4);
159        let s = Utf8String::from(b"abc\x00xyz");
160        assert_eq!(s.char_len(), 7);
161    }
162
163    #[test]
164    fn char_len_invalid_utf8_byte_sequences() {
165        let s = Utf8String::from(b"\x00\x00\xD8\x00");
166        assert_eq!(s.char_len(), 4);
167        let s = Utf8String::from(b"\xFF\xFE");
168        assert_eq!(s.char_len(), 2);
169    }
170
171    #[test]
172    fn char_len_binary() {
173        let bytes = &[
174            0xB3, 0x7E, 0x39, 0x70, 0x8E, 0xFD, 0xBB, 0x75, 0x62, 0x77, 0xE7, 0xDF, 0x6F, 0xF2, 0x76, 0x27, 0x81,
175            0x9A, 0x3A, 0x9D, 0xED, 0x6B, 0x4F, 0xAE, 0xC4, 0xE7, 0xA1, 0x66, 0x11, 0xF1, 0x08, 0x1C,
176        ];
177        let s = Utf8String::from(bytes);
178        assert_eq!(s.char_len(), 32);
179        // Mixed binary and ASCII
180        let bytes = &[
181            b'?', b'!', b'a', b'b', b'c', 0xFD, 0xBB, 0x75, 0x62, 0x77, 0xE7, 0xDF, 0x6F, 0xF2, 0x76, 0x27, 0x81,
182            0x9A, 0x3A, 0x9D, 0xED, 0x6B, 0x4F, 0xAE, 0xC4, 0xE7, 0xA1, 0x66, 0x11, 0xF1, 0x08, 0x1C,
183        ];
184        let s = Utf8String::from(bytes);
185        assert_eq!(s.char_len(), 32);
186    }
187
188    #[test]
189    fn char_len_mixed_ascii_emoji_invalid_bytes() {
190        // ```
191        // [2.6.3] > s = "๐Ÿฆ€abc๐Ÿ’Ž\xff"
192        // => "๐Ÿฆ€abc๐Ÿ’Ž\xFF"
193        // [2.6.3] > s.length
194        // => 6
195        // [2.6.3] > puts s.bytes.map{|b| "\\x#{b.to_s(16).upcase}"}.join
196        // \xF0\x9F\xA6\x80\x61\x62\x63\xF0\x9F\x92\x8E\xFF
197        // ```
198        let s = Utf8String::from(b"\xF0\x9F\xA6\x80\x61\x62\x63\xF0\x9F\x92\x8E\xFF");
199        assert_eq!(s.char_len(), 6);
200    }
201
202    #[test]
203    fn char_len_utf8() {
204        // https://github.com/minimaxir/big-list-of-naughty-strings/blob/894882e7/blns.txt#L147-L157
205        let s = Utf8String::from("ฮฉโ‰ˆรงโˆšโˆซหœยตโ‰คโ‰ฅรท");
206        assert_eq!(s.char_len(), 10);
207        let s = Utf8String::from("รฅรŸโˆ‚ฦ’ยฉห™โˆ†หšยฌโ€ฆรฆ");
208        assert_eq!(s.char_len(), 11);
209        let s = Utf8String::from("ล“โˆ‘ยดยฎโ€ ยฅยจห†รธฯ€โ€œโ€˜");
210        assert_eq!(s.char_len(), 12);
211        let s = Utf8String::from("ยกโ„ขยฃยขโˆžยงยถโ€ขยชยบโ€“โ‰ ");
212        assert_eq!(s.char_len(), 12);
213        let s = Utf8String::from("ยธห›ร‡โ—Šฤฑหœร‚ยฏห˜ยฟ");
214        assert_eq!(s.char_len(), 10);
215        let s = Utf8String::from("ร…รรŽรหร“ร”๏ฃฟร’รšร†โ˜ƒ");
216        assert_eq!(s.char_len(), 12);
217        let s = Utf8String::from("ล’โ€žยดโ€ฐห‡รยจห†ร˜โˆโ€โ€™");
218        assert_eq!(s.char_len(), 12);
219        let s = Utf8String::from("`โ„โ‚ฌโ€นโ€บ๏ฌ๏ฌ‚โ€กยฐยทโ€šโ€”ยฑ");
220        assert_eq!(s.char_len(), 13);
221        let s = Utf8String::from("โ…›โ…œโ…โ…ž");
222        assert_eq!(s.char_len(), 4);
223        let s = Utf8String::from("ะะ‚ะƒะ„ะ…ะ†ะ‡ะˆะ‰ะŠะ‹ะŒะะŽะะะ‘ะ’ะ“ะ”ะ•ะ–ะ—ะ˜ะ™ะšะ›ะœะะžะŸะ ะกะขะฃะคะฅะฆะงะจะฉะชะซะฌะญะฎะฏะฐะฑะฒะณะดะตะถะทะธะนะบะปะผะฝะพะฟั€ัั‚ัƒั„ั…ั†ั‡ัˆั‰ัŠั‹ัŒััŽั");
224        assert_eq!(s.char_len(), 79);
225    }
226
227    #[test]
228    fn char_len_vmware_super_string() {
229        // A super string recommended by VMware Inc. Globalization Team: can
230        // effectively cause rendering issues or character-length issues to
231        // validate product globalization readiness.
232        //
233        // https://github.com/minimaxir/big-list-of-naughty-strings/blob/894882e7/blns.txt#L202-L224
234        let s = Utf8String::from("่กจใƒใ‚A้ท—ล’รฉ๏ผข้€รœรŸยชฤ…รฑไธ‚ใ€๐ €€");
235        assert_eq!(s.char_len(), 17);
236    }
237
238    #[test]
239    fn char_len_two_byte_chars() {
240        // https://github.com/minimaxir/big-list-of-naughty-strings/blob/894882e7/blns.txt#L188-L196
241        let s = Utf8String::from("็”ฐไธญใ•ใ‚“ใซใ‚ใ’ใฆไธ‹ใ•ใ„");
242        assert_eq!(s.char_len(), 11);
243        let s = Utf8String::from("ใƒ‘ใƒผใƒ†ใ‚ฃใƒผใธ่กŒใ‹ใชใ„ใ‹");
244        assert_eq!(s.char_len(), 11);
245        let s = Utf8String::from("ๅ’Œ่ฃฝๆผข่ชž");
246        assert_eq!(s.char_len(), 4);
247        let s = Utf8String::from("้ƒจ่ฝๆ ผ");
248        assert_eq!(s.char_len(), 3);
249        let s = Utf8String::from("์‚ฌํšŒ๊ณผํ•™์› ์–ดํ•™์—ฐ๊ตฌ์†Œ");
250        assert_eq!(s.char_len(), 11);
251        let s = Utf8String::from("์ฐฆ์ฐจ๋ฅผ ํƒ€๊ณ  ์˜จ ํŽฒ์‹œ๋งจ๊ณผ ์‘›๋‹ค๋ฆฌ ๋˜ ๋ฐฉ๊ฐํ•˜");
252        assert_eq!(s.char_len(), 22);
253        let s = Utf8String::from("็คพๆœƒ็ง‘ๅญธ้™ข่ชžๅญธ็ ”็ฉถๆ‰€");
254        assert_eq!(s.char_len(), 10);
255        let s = Utf8String::from("์šธ๋ž€๋ฐ”ํ† ๋ฅด");
256        assert_eq!(s.char_len(), 5);
257        let s = Utf8String::from("๐ œŽ๐ œฑ๐ น๐ ฑ“๐ ฑธ๐ ฒ–๐ ณ");
258        assert_eq!(s.char_len(), 7);
259    }
260
261    #[test]
262    fn char_len_space_chars() {
263        // Whitespace: all the characters with category `Zs`, `Zl`, or `Zp` (in Unicode
264        // version 8.0.0), plus `U+0009 (HT)`, `U+000B (VT)`, `U+000C (FF)`, `U+0085 (NEL)`,
265        // and `U+200B` (ZERO WIDTH SPACE), which are in the C categories but are often
266        // treated as whitespace in some contexts.
267        //
268        // This file unfortunately cannot express strings containing
269        // `U+0000`, `U+000A`, or `U+000D` (`NUL`, `LF`, `CR`).
270        //
271        // The next line may appear to be blank or mojibake in some viewers.
272        //
273        // The next line may be flagged for "trailing whitespace" in some viewers.
274        //
275        // https://github.com/minimaxir/big-list-of-naughty-strings/blob/894882e7/blns.txt#L131
276        let bytes = "	 ย… แš€โ€‚โ€ƒโ€‚โ€ƒโ€„โ€…โ€†โ€‡โ€ˆโ€‰โ€Šโ€‹โ€จโ€ฉโ€ฏโŸใ€€
277";
278        let s = Utf8String::from(bytes);
279        assert_eq!(s.char_len(), 25);
280    }
281
282    #[test]
283    fn casing_utf8_string_empty() {
284        let mut s = Utf8String::from(b"");
285
286        s.make_capitalized();
287        assert_eq!(s, "");
288
289        s.make_lowercase();
290        assert_eq!(s, "");
291
292        s.make_uppercase();
293        assert_eq!(s, "");
294
295        s.make_swapcase();
296        assert_eq!(s, "");
297    }
298
299    #[test]
300    fn casing_utf8_string_ascii() {
301        let lower = Utf8String::from(b"abc");
302        let mid_upper = Utf8String::from(b"aBc");
303        let upper = Utf8String::from(b"ABC");
304        let long = Utf8String::from(b"aBC, 123, ABC, baby you and me girl");
305
306        let capitalize: fn(&Utf8String) -> Utf8String = |value: &Utf8String| {
307            let mut value = value.clone();
308            value.make_capitalized();
309            value
310        };
311        let lowercase: fn(&Utf8String) -> Utf8String = |value: &Utf8String| {
312            let mut value = value.clone();
313            value.make_lowercase();
314            value
315        };
316        let uppercase: fn(&Utf8String) -> Utf8String = |value: &Utf8String| {
317            let mut value = value.clone();
318            value.make_uppercase();
319            value
320        };
321        let swapcase: fn(&Utf8String) -> Utf8String = |value: &Utf8String| {
322            let mut value = value.clone();
323            value.make_swapcase();
324            value
325        };
326
327        assert_eq!(capitalize(&lower), "Abc");
328        assert_eq!(capitalize(&mid_upper), "Abc");
329        assert_eq!(capitalize(&upper), "Abc");
330        assert_eq!(capitalize(&long), "Abc, 123, abc, baby you and me girl");
331
332        assert_eq!(lowercase(&lower), "abc");
333        assert_eq!(lowercase(&mid_upper), "abc");
334        assert_eq!(lowercase(&upper), "abc");
335        assert_eq!(lowercase(&long), "abc, 123, abc, baby you and me girl");
336
337        assert_eq!(uppercase(&lower), "ABC");
338        assert_eq!(uppercase(&mid_upper), "ABC");
339        assert_eq!(uppercase(&upper), "ABC");
340        assert_eq!(uppercase(&long), "ABC, 123, ABC, BABY YOU AND ME GIRL");
341
342        assert_eq!(swapcase(&lower), "ABC");
343        assert_eq!(swapcase(&mid_upper), "AbC");
344        assert_eq!(swapcase(&upper), "abc");
345        assert_eq!(swapcase(&long), "Abc, 123, abc, BABY YOU AND ME GIRL");
346    }
347
348    #[test]
349    fn casing_utf8_string_utf8() {
350        // Capitalization of `รŸ` (SS) differs from MRI:
351        //
352        // ```console
353        // [2.6.3] > "รŸ".capitalize
354        // => "Ss"
355        // ```
356        let sharp_s = Utf8String::from("รŸ");
357        let tomorrow = Utf8String::from("ฮฑฯฯฮนฮฟ");
358        let year = Utf8String::from("ฮญฯ„ฮฟฯ‚");
359        // two-byte characters
360        // https://github.com/minimaxir/big-list-of-naughty-strings/blob/894882e7/blns.txt#L198-L200
361        let two_byte_chars = Utf8String::from("๐œ ๐”๐‡๐๐€๐ก๐‡๐“ ๐™๐Š๐ก๐๐“/๐๐‡๐—๐Š๐ค๐” ๐’๐‹๐— ๐’๐Œ ๐œ ๐ก๐€๐–๐‡๐ค๐“๐ ๐ฑ๐‘‚ ๐‘„ ๐”๐‡๐๐€๐ก๐‡๐“ ๐๐†๐…๐ค๐†๐š๐Š๐ก๐๐†๐“๐†");
362        // Changes length when case changes
363        // https://github.com/minimaxir/big-list-of-naughty-strings/blob/894882e7/blns.txt#L226-L232
364        let varying_length = Utf8String::from("zศบศพ");
365        // There doesn't appear to be any RTL scripts that have cases, but might as well make sure
366        let rtl = Utf8String::from("ู…ุฑุญุจุง ุงู„ุฎุฑุดูˆู");
367
368        let capitalize: fn(&Utf8String) -> Utf8String = |value: &Utf8String| {
369            let mut value = value.clone();
370            value.make_capitalized();
371            value
372        };
373        let lowercase: fn(&Utf8String) -> Utf8String = |value: &Utf8String| {
374            let mut value = value.clone();
375            value.make_lowercase();
376            value
377        };
378        let uppercase: fn(&Utf8String) -> Utf8String = |value: &Utf8String| {
379            let mut value = value.clone();
380            value.make_uppercase();
381            value
382        };
383        let swapcase: fn(&Utf8String) -> Utf8String = |value: &Utf8String| {
384            let mut value = value.clone();
385            value.make_swapcase();
386            value
387        };
388
389        assert_eq!(capitalize(&sharp_s), "SS");
390        assert_eq!(capitalize(&tomorrow), "ฮ‘ฯฯฮนฮฟ");
391        assert_eq!(capitalize(&year), "ฮˆฯ„ฮฟฯ‚");
392        assert_eq!(
393            capitalize(&two_byte_chars),
394            "๐œ ๐ผ๐ฏ๐‘…๐จ๐‘‰๐ฏ๐ป ๐‘๐ฒ๐‘‰๐‘…๐ป/๐‘…๐ฏ๐ฟ๐ฒ๐‘Œ๐ผ ๐บ๐ณ๐ฟ ๐บ๐ด ๐‘„ ๐‘‰๐จ๐พ๐ฏ๐‘Œ๐ป๐‘… ๐ฑ๐‘‚ ๐‘„ ๐ผ๐ฏ๐‘…๐จ๐‘‰๐ฏ๐ป ๐ท๐ฎ๐ญ๐‘Œ๐ฎ๐‘‚๐ฒ๐‘‰๐‘…๐ฎ๐ป๐ฎ"
395        );
396        assert_eq!(capitalize(&varying_length), "Zโฑฅโฑฆ");
397        assert_eq!(capitalize(&rtl), "ู…ุฑุญุจุง ุงู„ุฎุฑุดูˆู");
398
399        assert_eq!(lowercase(&sharp_s), "รŸ");
400        assert_eq!(lowercase(&tomorrow), "ฮฑฯฯฮนฮฟ");
401        assert_eq!(lowercase(&year), "ฮญฯ„ฮฟฯ‚");
402        assert_eq!(
403            lowercase(&two_byte_chars),
404            "๐‘„ ๐ผ๐ฏ๐‘…๐จ๐‘‰๐ฏ๐ป ๐‘๐ฒ๐‘‰๐‘…๐ป/๐‘…๐ฏ๐ฟ๐ฒ๐‘Œ๐ผ ๐บ๐ณ๐ฟ ๐บ๐ด ๐‘„ ๐‘‰๐จ๐พ๐ฏ๐‘Œ๐ป๐‘… ๐ฑ๐‘‚ ๐‘„ ๐ผ๐ฏ๐‘…๐จ๐‘‰๐ฏ๐ป ๐ท๐ฎ๐ญ๐‘Œ๐ฎ๐‘‚๐ฒ๐‘‰๐‘…๐ฎ๐ป๐ฎ"
405        );
406        assert_eq!(lowercase(&varying_length), "zโฑฅโฑฆ");
407        assert_eq!(lowercase(&rtl), "ู…ุฑุญุจุง ุงู„ุฎุฑุดูˆู");
408
409        assert_eq!(uppercase(&sharp_s), "SS");
410        assert_eq!(uppercase(&tomorrow), "ฮ‘ฮŽฮกฮ™ฮŸ");
411        assert_eq!(uppercase(&year), "ฮˆฮคฮŸฮฃ");
412        assert_eq!(
413            uppercase(&two_byte_chars),
414            "๐œ ๐”๐‡๐๐€๐ก๐‡๐“ ๐™๐Š๐ก๐๐“/๐๐‡๐—๐Š๐ค๐” ๐’๐‹๐— ๐’๐Œ ๐œ ๐ก๐€๐–๐‡๐ค๐“๐ ๐‰๐š ๐œ ๐”๐‡๐๐€๐ก๐‡๐“ ๐๐†๐…๐ค๐†๐š๐Š๐ก๐๐†๐“๐†"
415        );
416        assert_eq!(uppercase(&varying_length), "Zศบศพ");
417        assert_eq!(uppercase(&rtl), "ู…ุฑุญุจุง ุงู„ุฎุฑุดูˆู");
418
419        let sharp_s = Utf8String::from("SS");
420        let tomorrow = Utf8String::from("ฮ‘ฯฯฮนฮฟ");
421        let year = Utf8String::from("ฮˆฯ„ฮฟฯ‚");
422        // This next line is the titlecase version of the earlier two-byte string:
423        let two_byte_chars = Utf8String::from("๐œ ๐”๐ฏ๐‘…๐จ๐‘‰๐ฏ๐ป ๐™๐ฒ๐‘‰๐‘…๐ป/๐‘…๐ฏ๐ฟ๐ฒ๐‘Œ๐ผ ๐’๐ณ๐ฟ ๐’๐ด ๐œ ๐ก๐จ๐พ๐ฏ๐‘Œ๐ป๐‘… ๐‰๐‘‚ ๐œ ๐”๐ฏ๐‘…๐จ๐‘‰๐ฏ๐ป ๐๐ฎ๐ญ๐‘Œ๐ฎ๐‘‚๐ฒ๐‘‰๐‘…๐ฎ๐ป๐ฎ");
424        let varying_length = Utf8String::from("Zโฑฅโฑฆ");
425        let rtl = Utf8String::from("ู…ุฑุญุจุง ุงู„ุฎุฑุดูˆู");
426        assert_eq!(swapcase(&sharp_s), "ss", "swapcase(SS) failed");
427        assert_eq!(swapcase(&tomorrow), "ฮฑฮŽฮกฮ™ฮŸ", "swapcase(ฮ‘ฯฯฮนฮฟ) failed");
428        assert_eq!(swapcase(&year), "ฮญฮคฮŸฮฃ", "swapcase(ฮˆฯ„ฮฟฯ‚) failed");
429        assert_eq!(
430            swapcase(&two_byte_chars),
431            "๐‘„ ๐ผ๐‡๐๐€๐ก๐‡๐“ ๐‘๐Š๐ก๐๐“/๐๐‡๐—๐Š๐ค๐” ๐บ๐‹๐— ๐บ๐Œ ๐‘„ ๐‘‰๐€๐–๐‡๐ค๐“๐ ๐ฑ๐š ๐‘„ ๐ผ๐‡๐๐€๐ก๐‡๐“ ๐ท๐†๐…๐ค๐†๐š๐Š๐ก๐๐†๐“๐†"
432        );
433        assert_eq!(swapcase(&varying_length), "zศบศพ", "swapcase(Zโฑฅโฑฆ) mismatch");
434        assert_eq!(swapcase(&rtl), "ู…ุฑุญุจุง ุงู„ุฎุฑุดูˆู", "swapcase(ู…ุฑุญุจุง ุงู„ุฎุฑุดูˆู) mismatch");
435    }
436
437    #[test]
438    fn casing_utf8_string_invalid_utf8() {
439        let mut s = Utf8String::from(b"\xFF\xFE");
440
441        s.make_capitalized();
442        assert_eq!(s, &b"\xFF\xFE"[..]);
443
444        s.make_lowercase();
445        assert_eq!(s, &b"\xFF\xFE"[..]);
446
447        s.make_uppercase();
448        assert_eq!(s, &b"\xFF\xFE"[..]);
449
450        s.make_swapcase();
451        assert_eq!(s, &b"\xFF\xFE"[..]);
452    }
453
454    #[test]
455    fn casing_utf8_string_unicode_replacement_character() {
456        let mut s = Utf8String::from("๏ฟฝ");
457
458        s.make_capitalized();
459        assert_eq!(s, "๏ฟฝ");
460
461        s.make_lowercase();
462        assert_eq!(s, "๏ฟฝ");
463
464        s.make_uppercase();
465        assert_eq!(s, "๏ฟฝ");
466
467        s.make_swapcase();
468        assert_eq!(s, "๏ฟฝ");
469    }
470
471    #[test]
472    fn chr_does_not_return_more_than_one_byte_for_invalid_utf8() {
473        // ```ruby
474        // [3.0.1] > "\xF0\x9F\x87".chr
475        // => "\xF0"
476        // ```
477        //
478        // Per `bstr`:
479        //
480        // The bytes `\xF0\x9F\x87` could lead to a valid UTF-8 sequence, but 3 of them
481        // on their own are invalid. Only one replacement codepoint is substituted,
482        // which demonstrates the "substitution of maximal subparts" strategy.
483        let s = Utf8String::from(b"\xF0\x9F\x87");
484        assert_eq!(s.chr(), b"\xF0");
485    }
486
487    #[test]
488    fn get_char_slice_valid_range() {
489        let s = Utf8String::from(b"a\xF0\x9F\x92\x8E\xFF".to_vec()); // `"a๐Ÿ’Ž\xFF"`
490        assert_eq!(s.get_char_slice(0..0), Some(Utf8Str::empty()));
491        assert_eq!(s.get_char_slice(0..1), Some(Utf8Str::new(b"a")));
492        assert_eq!(s.get_char_slice(0..2), Some(Utf8Str::new("a๐Ÿ’Ž")));
493        assert_eq!(s.get_char_slice(0..3), Some(Utf8Str::new(b"a\xF0\x9F\x92\x8E\xFF")));
494        assert_eq!(s.get_char_slice(0..4), Some(Utf8Str::new(b"a\xF0\x9F\x92\x8E\xFF")));
495        assert_eq!(s.get_char_slice(1..1), Some(Utf8Str::empty()));
496        assert_eq!(s.get_char_slice(1..2), Some(Utf8Str::new("๐Ÿ’Ž")));
497        assert_eq!(s.get_char_slice(1..3), Some(Utf8Str::new(b"\xF0\x9F\x92\x8E\xFF")));
498    }
499
500    #[test]
501    #[expect(clippy::reversed_empty_ranges, reason = "testing behavior of reversed ranges")]
502    fn get_char_slice_invalid_range() {
503        let s = Utf8String::from(b"a\xF0\x9F\x92\x8E\xFF".to_vec()); // `"a๐Ÿ’Ž\xFF"`
504        assert_eq!(s.get_char_slice(4..5), None);
505        assert_eq!(s.get_char_slice(4..1), None);
506        assert_eq!(s.get_char_slice(3..1), Some(Utf8Str::empty()));
507        assert_eq!(s.get_char_slice(2..1), Some(Utf8Str::empty()));
508        assert_eq!(s.get_char_slice(7..10), None);
509        assert_eq!(s.get_char_slice(10..8), None);
510        assert_eq!(s.get_char_slice(10..5), None);
511        assert_eq!(s.get_char_slice(10..2), None);
512    }
513
514    #[test]
515    fn index_with_default_offset() {
516        let s = Utf8String::from("f๐Ÿ’Žoo");
517        assert_eq!(s.index("f".as_bytes(), 0), Some(0));
518        assert_eq!(s.index("o".as_bytes(), 0), Some(2));
519        assert_eq!(s.index("oo".as_bytes(), 0), Some(2));
520        assert_eq!(s.index("ooo".as_bytes(), 0), None);
521    }
522
523    #[test]
524    fn index_with_different_offset() {
525        let s = Utf8String::from("f๐Ÿ’Žoo");
526        assert_eq!(s.index("o".as_bytes(), 1), Some(2));
527        assert_eq!(s.index("o".as_bytes(), 2), Some(2));
528        assert_eq!(s.index("o".as_bytes(), 3), Some(3));
529        assert_eq!(s.index("o".as_bytes(), 4), None);
530    }
531
532    #[test]
533    fn rindex_with_default_offset() {
534        let s = Utf8String::from("f๐Ÿ’Žoo");
535        assert_eq!(s.rindex("f".as_bytes(), 3), Some(0));
536        assert_eq!(s.rindex("o".as_bytes(), 3), Some(3));
537        assert_eq!(s.rindex("oo".as_bytes(), 3), Some(2));
538        assert_eq!(s.rindex("ooo".as_bytes(), 3), None);
539    }
540
541    #[test]
542    fn rindex_with_different_offset() {
543        let s = Utf8String::from("f๐Ÿ’Žoo");
544        assert_eq!(s.rindex("o".as_bytes(), 4), Some(3));
545        assert_eq!(s.rindex("o".as_bytes(), 3), Some(3));
546        assert_eq!(s.rindex("o".as_bytes(), 2), Some(2));
547        assert_eq!(s.rindex("o".as_bytes(), 1), None);
548        assert_eq!(s.rindex("o".as_bytes(), 0), None);
549    }
550
551    #[test]
552    fn index_and_rindex_support_invalid_utf8_in_needle() {
553        // Invalid UTF-8 in needle
554        let needle = &"๐Ÿ’Ž".as_bytes()[..3];
555
556        assert_eq!(Utf8String::from("f๐Ÿ’Žoo").index(needle, 0), None); // FIXME: Currently `Some(1)`
557        assert_eq!(Utf8String::from("f๐Ÿ’Žoo").rindex(needle, 3), None); // FIXME: Currently `Some(1)`
558    }
559
560    #[test]
561    fn index_and_rindex_support_invalid_utf8_in_haystack() {
562        // Invalid UTF-8 in haystack
563        let mut haystack = Vec::new();
564        haystack.extend_from_slice(b"f");
565        haystack.extend_from_slice(&"๐Ÿ’Ž".as_bytes()[..2]);
566        haystack.extend_from_slice(b"oo");
567        let haystack = Utf8String::from(haystack);
568
569        assert_eq!(haystack.index("๐Ÿ’Ž".as_bytes(), 0), None);
570        assert_eq!(haystack.rindex("๐Ÿ’Ž".as_bytes(), 3), None);
571    }
572
573    #[test]
574    fn index_empties() {
575        // ```console
576        // [3.2.2] > "".index ""
577        // => 0
578        // [3.2.2] > "".index "a"
579        // => nil
580        // [3.2.2] > "a".index ""
581        // => 0
582        // ```
583        let s = Utf8String::from("");
584        assert_eq!(s.index(b"", 0), Some(0));
585
586        assert_eq!(s.index(b"a", 0), None);
587
588        let s = Utf8String::from("a");
589        assert_eq!(s.index(b"", 0), Some(0));
590    }
591
592    #[test]
593    fn rindex_empties() {
594        // ```console
595        // [3.2.2] > "".rindex ""
596        // => 0
597        // [3.2.2] > "".rindex "a"
598        // => nil
599        // [3.2.2] > "a".rindex ""
600        // => 1
601        // ```
602        let s = Utf8String::from("");
603        assert_eq!(s.rindex(b"", usize::MAX), Some(0));
604        assert_eq!(s.rindex(b"", 1), Some(0));
605        assert_eq!(s.rindex(b"", 0), Some(0));
606
607        assert_eq!(s.rindex(b"a", usize::MAX), None);
608        assert_eq!(s.rindex(b"a", 1), None);
609        assert_eq!(s.rindex(b"a", 0), None);
610
611        let s = Utf8String::from("a");
612        assert_eq!(s.rindex(b"", usize::MAX), Some(1));
613        assert_eq!(s.rindex(b"", 1), Some(1));
614    }
615}