spinoso_string/enc/utf8/
case_change.rs

1use alloc::vec::Vec;
2
3use bstr::ByteVec;
4
5use crate::case_folding::CaseFoldingEffect;
6
7/// Transform this UTF-8 buffer to "capitalized", returning a new `Vec<u8>`
8/// and a [`CaseFoldingEffect`].
9///
10/// "Capitalized" here means:
11///  - The **first** codepoint is converted to uppercase
12///  - All subsequent codepoints are converted to lowercase
13///
14/// Invalid UTF‐8 bytes are passed through unchanged. If any valid codepoint
15/// changes length or content, we mark [`CaseFoldingEffect::Changed`].
16///
17/// # Compatibility Notes
18///
19/// This function should use titlecase mapping for the initial character.
20pub fn to_utf8_capitalized(mut bytes: &[u8]) -> (Vec<u8>, CaseFoldingEffect) {
21    // This allocation assumes that in the common case, capitalizing and
22    // lower-casing `char`s do not change the length of the `String`.
23    //
24    // Use a `Vec` here instead of a `Buf` to ensure at most one alloc fix-up
25    // happens instead of alloc fix-ups being O(chars).
26    let mut replacement = Vec::with_capacity(bytes.len());
27    let mut effect = CaseFoldingEffect::Unchanged;
28
29    let (ch, size) = bstr::decode_utf8(bytes);
30    // SAFETY: bstr guarantees that the size is within the bounds of the slice.
31    let (chunk, remainder) = unsafe { bytes.split_at_unchecked(size) };
32    bytes = remainder;
33
34    if let Some(ch) = ch {
35        // Converting a UTF-8 character to uppercase may yield multiple
36        // codepoints.
37        let old = ch;
38        for ch in ch.to_uppercase() {
39            if ch != old {
40                effect = CaseFoldingEffect::Modified;
41            }
42            replacement.push_char(ch);
43        }
44    } else {
45        replacement.extend_from_slice(chunk);
46    }
47
48    while !bytes.is_empty() {
49        let (ch, size) = bstr::decode_utf8(bytes);
50        // SAFETY: bstr guarantees that the size is within the bounds of the slice.
51        let (chunk, remainder) = unsafe { bytes.split_at_unchecked(size) };
52        bytes = remainder;
53
54        if let Some(ch) = ch {
55            // Converting a UTF-8 character to lowercase may yield
56            // multiple codepoints.
57            let old = ch;
58            for ch in ch.to_lowercase() {
59                if ch != old {
60                    effect = CaseFoldingEffect::Modified;
61                }
62                replacement.push_char(ch);
63            }
64        } else {
65            replacement.extend_from_slice(chunk);
66        }
67    }
68
69    (replacement, effect)
70}
71
72/// Transform this UTF-8 buffer to lowercase, returning a new `Vec<u8>` and a
73/// [`CaseFoldingEffect`].
74///
75/// Invalid UTF‐8 bytes are passed through unchanged. If any valid codepoint
76/// changes length or content, we mark [`CaseFoldingEffect::Changed`].
77pub fn to_utf8_lowercase(mut bytes: &[u8]) -> (Vec<u8>, CaseFoldingEffect) {
78    // This allocation assumes that in the common case, lower-casing `char`s do
79    // not change the length of the `String`.
80    //
81    // Use a `Vec` here instead of a `Buf` to ensure at most one alloc fix-up
82    // happens instead of alloc fix-ups being O(chars).
83    let mut replacement = Vec::with_capacity(bytes.len());
84    let mut effect = CaseFoldingEffect::Unchanged;
85
86    while !bytes.is_empty() {
87        // Decode the next UTF-8 codepoint
88        let (ch, size) = bstr::decode_utf8(bytes);
89
90        // SAFETY: bstr guarantees that the size is within the bounds of the slice.
91        let (chunk, remainder) = unsafe { bytes.split_at_unchecked(size) };
92        bytes = remainder;
93
94        let Some(ch) = ch else {
95            // Not valid UTF-8 at this position, so pass bytes through unchanged
96            replacement.extend_from_slice(chunk);
97            continue;
98        };
99        // Converting a UTF-8 character to lowercase may yield multiple
100        // codepoints.
101        let old = ch;
102        for ch in old.to_lowercase() {
103            if ch != old {
104                effect = CaseFoldingEffect::Modified;
105            }
106            replacement.push_char(ch);
107        }
108    }
109
110    (replacement, effect)
111}
112
113/// Transform this UTF-8 buffer to uppercase, returning a new `Vec<u8>` and a
114/// [`CaseFoldingEffect`].
115///
116/// Invalid UTF‐8 bytes are passed through unchanged. If any valid codepoint
117/// changes length or content, we mark [`CaseFoldingEffect::Changed`].
118pub fn to_utf8_uppercase(mut bytes: &[u8]) -> (Vec<u8>, CaseFoldingEffect) {
119    // This allocation assumes that in the common case, upper-casing `char`s do
120    // not change the length of the `String`.
121    //
122    // Use a `Vec` here instead of a `Buf` to ensure at most one alloc fix-up
123    // happens instead of alloc fix-ups being O(chars).
124    let mut replacement = Vec::with_capacity(bytes.len());
125    let mut effect = CaseFoldingEffect::Unchanged;
126
127    while !bytes.is_empty() {
128        // Decode the next UTF-8 codepoint
129        let (ch, size) = bstr::decode_utf8(bytes);
130
131        // SAFETY: bstr guarantees that the size is within the bounds of the slice.
132        let (chunk, remainder) = unsafe { bytes.split_at_unchecked(size) };
133        bytes = remainder;
134
135        let Some(ch) = ch else {
136            // Not valid UTF-8 at this position, so pass bytes through unchanged
137            replacement.extend_from_slice(chunk);
138            continue;
139        };
140        // Converting a UTF-8 character to lowercase may yield multiple
141        // codepoints.
142        let old = ch;
143        for ch in old.to_uppercase() {
144            if ch != old {
145                effect = CaseFoldingEffect::Modified;
146            }
147            replacement.push_char(ch);
148        }
149    }
150
151    (replacement, effect)
152}
153
154/// Transform this UTF-8 buffer to "swapped case", returning a new `Vec<u8>` and a
155/// [`CaseFoldingEffect`].
156///
157/// Here "swapped case" means:
158/// - Uppercase characters are converted to lowercase
159/// - Lowercase characters are converted to uppercase
160///
161/// Invalid UTF‐8 bytes are passed through unchanged. If any valid codepoint
162/// changes length or content, we mark [`CaseFoldingEffect::Changed`].
163pub fn to_utf8_swapcase(mut bytes: &[u8]) -> (Vec<u8>, CaseFoldingEffect) {
164    let mut replacement = Vec::with_capacity(bytes.len());
165    let mut effect = CaseFoldingEffect::Unchanged;
166
167    while !bytes.is_empty() {
168        let (ch, size) = bstr::decode_utf8(bytes);
169        // SAFETY: bstr guarantees `size` is in-bounds.
170        let (chunk, remainder) = unsafe { bytes.split_at_unchecked(size) };
171        bytes = remainder;
172
173        let Some(ch) = ch else {
174            // Not valid UTF-8 at this position, so pass bytes through
175            // unchanged
176            replacement.extend_from_slice(chunk);
177            continue;
178        };
179
180        // If `ch` is uppercase, convert to lowercase; if lowercase, convert to
181        // uppercase; otherwise, push as-is.
182        //
183        // FIXME: titlecase characters are not handled correctly.
184        // FIXME: <https://github.com/artichoke/artichoke/issues/2834>
185        match ch {
186            old if old.is_lowercase() => {
187                for ch in old.to_uppercase() {
188                    if ch != old {
189                        effect = CaseFoldingEffect::Modified;
190                    }
191                    replacement.push_char(ch);
192                }
193            }
194            old if old.is_uppercase() => {
195                for ch in old.to_lowercase() {
196                    if ch != old {
197                        effect = CaseFoldingEffect::Modified;
198                    }
199                    replacement.push_char(ch);
200                }
201            }
202            old => replacement.push_char(old),
203        }
204    }
205
206    (replacement, effect)
207}
208
#[cfg(test)]
mod tests {

    use bstr::ByteSlice;

    use super::*;
    use crate::case_folding::CaseFoldingEffect;

    /// One table row: input bytes, expected output bytes, expected effect.
    type Case = (&'static [u8], &'static [u8], CaseFoldingEffect);

    /// Signature shared by every case-changing function under test.
    type Transform = fn(&[u8]) -> (Vec<u8>, CaseFoldingEffect);

    /// Apply `transform` to `input` and verify first the reported effect and
    /// then the produced bytes.
    #[track_caller]
    fn check(transform: Transform, input: &[u8], expected: &[u8], expected_effect: CaseFoldingEffect) {
        let (actual, actual_effect) = transform(input);

        // The effect is asserted before the bytes, so an effect mismatch is
        // the failure reported when both are wrong.
        assert_eq!(
            actual_effect,
            expected_effect,
            "Expected folding effect {:?} for input {:?}, got {:?} with output {:?}",
            expected_effect,
            input.as_bstr(),
            actual_effect,
            actual.as_bstr(),
        );

        assert_eq!(
            actual,
            expected,
            "Transformed result mismatch:\n  input = {:?}\n  expected = {:?}\n  actual   = {:?}",
            input.as_bstr(),
            expected.as_bstr(),
            actual.as_bstr()
        );
    }

    #[test]
    fn test_to_utf8_capitalized() {
        let table: [Case; 9] = [
            // Empty input
            (b"", b"", CaseFoldingEffect::Unchanged),
            // Plain ASCII
            (b"hello WORLD", b"Hello world", CaseFoldingEffect::Modified),
            (b"Hello world", b"Hello world", CaseFoldingEffect::Unchanged),
            (b"1234", b"1234", CaseFoldingEffect::Unchanged),
            // Leading invalid byte
            (b"\xFFabc", b"\xFFabc", CaseFoldingEffect::Unchanged),
            // Leading 'ß' expands to "SS" when upper-cased; the tail is
            // already lowercase
            ("ßtest".as_bytes(), b"SStest", CaseFoldingEffect::Modified),
            // Greek
            ("αγαπώ".as_bytes(), "Αγαπώ".as_bytes(), CaseFoldingEffect::Modified),
            // Dotted i is handled in non-Turkic mode
            ("işaret".as_bytes(), "Işaret".as_bytes(), CaseFoldingEffect::Modified),
            // Chinese has no case
            (
                "你好世界".as_bytes(),
                "你好世界".as_bytes(),
                CaseFoldingEffect::Unchanged,
            ),
        ];
        for (input, expected, effect) in table {
            check(to_utf8_capitalized, input, expected, effect);
        }
    }

    #[test]
    fn test_to_utf8_capitalized_dz_digraph() {
        let table: [Case; 3] = [
            ("DŽ".as_bytes(), "DŽ".as_bytes(), CaseFoldingEffect::Unchanged),
            ("Dž".as_bytes(), "DŽ".as_bytes(), CaseFoldingEffect::Modified),
            ("dž".as_bytes(), "DŽ".as_bytes(), CaseFoldingEffect::Modified),
        ];
        for (input, expected, effect) in table {
            check(to_utf8_capitalized, input, expected, effect);
        }
    }

    #[test]
    fn test_to_utf8_lowercase() {
        let table: [Case; 9] = [
            (b"", b"", CaseFoldingEffect::Unchanged),
            (b"HELLO", b"hello", CaseFoldingEffect::Modified),
            // 'ß' stays as it is when lower-casing; only "TEST" changes
            ("ßTEST".as_bytes(), "ßtest".as_bytes(), CaseFoldingEffect::Modified),
            (b"\xFFhello", b"\xFFhello", CaseFoldingEffect::Unchanged),
            (b"Hello world", b"hello world", CaseFoldingEffect::Modified),
            (b"hello world", b"hello world", CaseFoldingEffect::Unchanged),
            // Turkish dotted capital I lowercases to 'i' plus a combining dot
            ("İŞARET".as_bytes(), "i̇şaret".as_bytes(), CaseFoldingEffect::Modified),
            ("你好".as_bytes(), "你好".as_bytes(), CaseFoldingEffect::Unchanged),
            ("ΑΓΑΠΩ".as_bytes(), "αγαπω".as_bytes(), CaseFoldingEffect::Modified),
        ];
        for (input, expected, effect) in table {
            check(to_utf8_lowercase, input, expected, effect);
        }
    }

    #[test]
    fn test_to_utf8_lowercase_dz_digraph() {
        let table: [Case; 3] = [
            ("DŽ".as_bytes(), "dž".as_bytes(), CaseFoldingEffect::Modified),
            ("Dž".as_bytes(), "dž".as_bytes(), CaseFoldingEffect::Modified),
            ("dž".as_bytes(), "dž".as_bytes(), CaseFoldingEffect::Unchanged),
        ];
        for (input, expected, effect) in table {
            check(to_utf8_lowercase, input, expected, effect);
        }
    }

    #[test]
    fn test_to_utf8_uppercase() {
        let table: [Case; 8] = [
            (b"", b"", CaseFoldingEffect::Unchanged),
            (b"hello", b"HELLO", CaseFoldingEffect::Modified),
            ("ßtest".as_bytes(), b"SSTEST", CaseFoldingEffect::Modified),
            (b"hello world", b"HELLO WORLD", CaseFoldingEffect::Modified),
            (b"HELLO", b"HELLO", CaseFoldingEffect::Unchanged),
            // Dotted i is handled in non-Turkic mode
            ("işaret".as_bytes(), "IŞARET".as_bytes(), CaseFoldingEffect::Modified),
            ("你好".as_bytes(), "你好".as_bytes(), CaseFoldingEffect::Unchanged),
            // Greek
            ("αγαπώ".as_bytes(), "ΑΓΑΠΏ".as_bytes(), CaseFoldingEffect::Modified),
        ];
        for (input, expected, effect) in table {
            check(to_utf8_uppercase, input, expected, effect);
        }
    }

    #[test]
    fn test_to_utf8_uppercase_dz_digraph() {
        let table: [Case; 3] = [
            ("DŽ".as_bytes(), "DŽ".as_bytes(), CaseFoldingEffect::Unchanged),
            ("Dž".as_bytes(), "DŽ".as_bytes(), CaseFoldingEffect::Modified),
            ("dž".as_bytes(), "DŽ".as_bytes(), CaseFoldingEffect::Modified),
        ];
        for (input, expected, effect) in table {
            check(to_utf8_uppercase, input, expected, effect);
        }
    }

    #[test]
    fn test_to_utf8_swapcase() {
        let table: [Case; 8] = [
            (b"", b"", CaseFoldingEffect::Unchanged),
            // Plain ASCII
            (b"hEllo", b"HeLLO", CaseFoldingEffect::Modified),
            (b"1234", b"1234", CaseFoldingEffect::Unchanged),
            // 'ß' expands when upper-cased
            ("ßTEST".as_bytes(), "SStest".as_bytes(), CaseFoldingEffect::Modified),
            (b"\xFFabc", b"\xFFABC", CaseFoldingEffect::Modified),
            // Dotted i is handled in non-Turkic mode
            ("iŞARET".as_bytes(), "Işaret".as_bytes(), CaseFoldingEffect::Modified),
            // Chinese has no case, so nothing changes
            ("你好".as_bytes(), "你好".as_bytes(), CaseFoldingEffect::Unchanged),
            // Greek
            ("αγΑΠΏ".as_bytes(), "ΑΓαπώ".as_bytes(), CaseFoldingEffect::Modified),
        ];
        for (input, expected, effect) in table {
            check(to_utf8_swapcase, input, expected, effect);
        }
    }

    #[test]
    // Swapping the case of titlecase characters is not supported yet, so this
    // test is expected to fail on the titlecase digraph row.
    // See: <https://github.com/artichoke/artichoke/issues/2834>
    #[should_panic = r#"Expected folding effect Modified for input "Dž", got Unchanged with output "Dž""#]
    fn test_to_utf8_swapcase_dz_digraph() {
        let table: [Case; 4] = [
            ("DŽ".as_bytes(), "dž".as_bytes(), CaseFoldingEffect::Modified),
            ("Dž".as_bytes(), "dŽ".as_bytes(), CaseFoldingEffect::Modified),
            ("dŽ".as_bytes(), "Dž".as_bytes(), CaseFoldingEffect::Modified),
            ("dž".as_bytes(), "DŽ".as_bytes(), CaseFoldingEffect::Modified),
        ];
        for (input, expected, effect) in table {
            check(to_utf8_swapcase, input, expected, effect);
        }
    }
}