spinoso_string/enc/
binascii.rs

1//! Shared encoding and decoding logic for binary and ASCII-8BIT strings.
2
3use crate::case_folding::CaseFoldingEffect;
4
5/// Convert this ASCII or binary string to a "capitalized" form in-place,
6/// returning whether any changes were made.
7///
8/// This routine ensures the first character is upcased, while every subsequent
9/// character is downcased. The function scans only until it detects the first
10/// difference from the current capitalized form:
11///
12/// 1. If the very first byte differs once converted to uppercase, we mark
13///    `CaseFoldingEffect::Changed`, downcase the remainder of the slice in one
14///    go, and return immediately.
15/// 2. If the first byte produces no change, the function delegates to
16///    [`make_lowercase`] for the rest of the slice, again short-circuiting
17///    once a difference is found.
18///
19/// If the entire slice is already in "capitalized" form, this function returns
20/// [`CaseFoldingEffect::Unchanged`].
21///
22/// [`make_lowercase`]: crate::binascii::make_lowercase
23pub fn make_capitalized(s: &mut [u8]) -> CaseFoldingEffect {
24    // Capitalize means:
25    //  1) Upcase the first character
26    //  2) Downcase the rest
27    //
28    // Only do byte equality checks until the first difference is found.
29    // If we find a difference on the first char, we mark `Changed` and
30    // immediately do a no-check downcase on the remainder.
31    // If we find no difference for the first char, then we proceed similarly
32    // for the remainder with the same short-circuit logic.
33
34    // If the buffer is empty => Unchanged
35    let Some((first, s)) = s.split_first_mut() else {
36        return CaseFoldingEffect::Unchanged;
37    };
38
39    // Upcase the first char
40    let old_first = *first;
41    let new_first = old_first.to_ascii_uppercase();
42    *first = new_first;
43
44    if old_first != new_first {
45        // We found a difference in the first char. Now downcase the rest, no
46        // further equality checks.
47        s.make_ascii_lowercase();
48        return CaseFoldingEffect::Modified;
49    }
50
51    // If we get here, the first char had no change. For the remainder of the
52    // string, let's do a single-pass approach and delegate to
53    // `make_lowercase()`.
54    make_lowercase(s)
55}
56
57/// Convert this ASCII or binary string to lowercase in-place, returning
58/// whether any changes were made.
59///
60/// The function scans each byte until it finds a character that actually
61/// needs to be changed (e.g., `A` → `a`). Once we detect the first changed
62/// byte, we call `make_ascii_lowercase()` on the rest of the slice and
63/// immediately return `CaseFoldingEffect::Changed`.
64///
65/// If we reach the end of the string without finding any uppercase byte, we
66/// return `CaseFoldingEffect::Unchanged`.
67pub fn make_lowercase(mut s: &mut [u8]) -> CaseFoldingEffect {
68    loop {
69        // Split off the first byte. If there is no first byte (the slice is
70        // empty), then we've scanned the entire string without encountering a
71        // change, so we return `Unchanged`.
72        let Some((head, tail)) = s.split_first_mut() else {
73            return CaseFoldingEffect::Unchanged;
74        };
75        let old = *head;
76        let new = old.to_ascii_lowercase();
77        s = tail;
78
79        // Overwrite this byte with its lowercase version.
80        *head = new;
81
82        // If this byte actually changed (e.g., old was 'A', new is 'a'), then
83        // for the rest of the string we skip further comparisons and just do
84        // the direct ASCII-lowercasing.
85        if old != new {
86            s.make_ascii_lowercase();
87            return CaseFoldingEffect::Modified;
88        }
89    }
90}
91
92/// Convert this ASCII or binary string to uppercase in-place, returning
93/// whether any changes were made.
94///
95/// The function scans each byte until it finds a character that actually
96/// needs to be changed (e.g., `a` → `A`). Once we detect the first changed
97/// byte, we call `make_ascii_uppercase()` on the rest of the slice and
98/// immediately return `CaseFoldingEffect::Changed`.
99///
100/// If we reach the end of the string without finding any lowercase byte, we
101/// return `CaseFoldingEffect::Unchanged`.
102pub fn make_uppercase(mut s: &mut [u8]) -> CaseFoldingEffect {
103    loop {
104        let Some((head, tail)) = s.split_first_mut() else {
105            return CaseFoldingEffect::Unchanged;
106        };
107        let old = *head;
108        let new = old.to_ascii_uppercase();
109        s = tail;
110
111        *head = new;
112
113        if old != new {
114            s.make_ascii_uppercase();
115            return CaseFoldingEffect::Modified;
116        }
117    }
118}
119
120/// Convert this ASCII or binary string to “swapcase” in-place, returning
121/// whether any changes were made.
122///
123/// “Swapcase” means each ASCII-lowercase byte is converted to uppercase,
124/// and each ASCII-uppercase byte is converted to lowercase; any other
125/// byte is left unchanged. The algorithm short-circuits upon detecting
126/// the first modified byte:
127///
128/// 1. It scans each byte, comparing the swapped version to the original.
129/// 2. Once the first difference is found, the rest of the bytes are
130///    “swapcased” without further equality checks, and
131///    [`CaseFoldingEffect::Changed`] is returned.
132/// 3. If the entire slice is processed with no changes, returns
133///    [`CaseFoldingEffect::Unchanged`].
134pub fn make_swapcase(mut s: &mut [u8]) -> CaseFoldingEffect {
135    #[inline]
136    fn to_swapcase(b: u8) -> u8 {
137        if b.is_ascii_lowercase() {
138            b.to_ascii_uppercase()
139        } else if b.is_ascii_uppercase() {
140            b.to_ascii_lowercase()
141        } else {
142            b
143        }
144    }
145
146    loop {
147        let Some((head, tail)) = s.split_first_mut() else {
148            return CaseFoldingEffect::Unchanged;
149        };
150        let old = *head;
151        let new = to_swapcase(old);
152        s = tail;
153
154        *head = new;
155
156        if *head != old {
157            // We found a difference => do a "no-check" swapcase for the rest
158            for b in s {
159                let old = *b;
160                let new = to_swapcase(old);
161                *b = new;
162            }
163            return CaseFoldingEffect::Modified;
164        }
165    }
166}
167
#[cfg(test)]
mod tests {
    use bstr::ByteSlice;

    use super::*;
    use crate::case_folding::CaseFoldingEffect;

    /// Run `func` on a private copy of `input`, then assert both the reported
    /// folding effect and the resulting bytes.
    ///
    /// Failure messages render the input as a byte string so that non-UTF-8
    /// cases remain readable.
    #[track_caller]
    fn run_test<F>(func: F, input: &[u8], expected: &[u8], expect_fold: CaseFoldingEffect)
    where
        F: FnOnce(&mut [u8]) -> CaseFoldingEffect,
    {
        let mut data = input.to_vec();
        let fold = func(&mut data);
        assert_eq!(fold, expect_fold, "CaseFoldingEffect mismatch on {:?}", input.as_bstr());
        assert_eq!(
            data.as_bstr(),
            expected.as_bstr(),
            "Result bytes mismatch on {:?}",
            input.as_bstr()
        );
    }

    // 1) make_capitalized
    #[test]
    fn test_make_capitalized() {
        // Each test scenario:
        // input, expected output, expected folding effect
        let cases: [(&[u8], &[u8], CaseFoldingEffect); 9] = [
            // empty => no changes
            (b"", b"", CaseFoldingEffect::Unchanged),
            // numeric only => no letters to change
            (b"1234", b"1234", CaseFoldingEffect::Unchanged),
            // spaces, tabs => no letters
            (b" \t ", b" \t ", CaseFoldingEffect::Unchanged),
            // Leading control chars: `\x01` is the "first character" and is
            // unaffected by upcasing, so the remainder is lowercased and the
            // `H` becomes `h`.
            (b"\x01\x02Hello", b"\x01\x02hello", CaseFoldingEffect::Modified),
            // Leading non-ASCII byte: `\xFF` is unaffected by upcasing and the
            // remainder `abc` is already lowercase, so nothing changes.
            (b"\xFFabc", b"\xFFabc", CaseFoldingEffect::Unchanged),
            // First byte changes, tail is already lowercase.
            (b"hello world", b"Hello world", CaseFoldingEffect::Modified),
            // Already capitalized.
            (b"Hello", b"Hello", CaseFoldingEffect::Unchanged),
            // First byte changes and the tail must be lowercased as well.
            (b"hELLO WORLD", b"Hello world", CaseFoldingEffect::Modified),
            // First byte is already uppercase; only the tail changes.
            (b"HELLO", b"Hello", CaseFoldingEffect::Modified),
        ];

        for (input, expected, effect) in cases {
            run_test(make_capitalized, input, expected, effect);
        }
    }

    // 2) make_lowercase
    #[test]
    fn test_make_lowercase() {
        let cases: [(&[u8], &[u8], CaseFoldingEffect); 9] = [
            (b"", b"", CaseFoldingEffect::Unchanged),
            (b"1234", b"1234", CaseFoldingEffect::Unchanged),
            (b"HELLO", b"hello", CaseFoldingEffect::Modified),
            (b"Hello1", b"hello1", CaseFoldingEffect::Modified),
            // spaces + tab => no alpha
            (b"   \t", b"   \t", CaseFoldingEffect::Unchanged),
            // invalid ASCII
            (b"\x80\x81HI", b"\x80\x81hi", CaseFoldingEffect::Modified),
            // Greek bytes: they won't match ASCII 'A'..'Z', so no changes
            (
                b"\xce\x93\xce\xb5\xce\xb9\xce\xac",
                b"\xce\x93\xce\xb5\xce\xb9\xce\xac",
                CaseFoldingEffect::Unchanged,
            ),
            // turkic string with dotted I => no changes in ASCII
            (b"\xc4\xb0 \xc4\xb1", b"\xc4\xb0 \xc4\xb1", CaseFoldingEffect::Unchanged),
            // Chinese => no changes
            (
                b"\xe4\xbd\xa0\xe5\xa5\xbd",
                b"\xe4\xbd\xa0\xe5\xa5\xbd",
                CaseFoldingEffect::Unchanged,
            ),
        ];
        for (input, expected, effect) in cases {
            run_test(make_lowercase, input, expected, effect);
        }
    }

    // 3) make_uppercase
    #[test]
    fn test_make_uppercase() {
        let cases: [(&[u8], &[u8], CaseFoldingEffect); 9] = [
            (b"", b"", CaseFoldingEffect::Unchanged),
            (b"1234", b"1234", CaseFoldingEffect::Unchanged),
            (b"hello", b"HELLO", CaseFoldingEffect::Modified),
            (b"hEllo2", b"HELLO2", CaseFoldingEffect::Modified),
            // spaces + tab => no alpha
            (b"   \t", b"   \t", CaseFoldingEffect::Unchanged),
            // invalid ASCII
            (b"\x80\x81hi", b"\x80\x81HI", CaseFoldingEffect::Modified),
            // Greek => no ASCII changes
            (
                b"\xce\x93\xce\xb5\xce\xb9\xce\xac",
                b"\xce\x93\xce\xb5\xce\xb9\xce\xac",
                CaseFoldingEffect::Unchanged,
            ),
            // turkic => no ASCII changes
            (b"\xc4\xb0 \xc4\xb1", b"\xc4\xb0 \xc4\xb1", CaseFoldingEffect::Unchanged),
            // Chinese => no changes
            (
                b"\xe4\xbd\xa0\xe5\xa5\xbd",
                b"\xe4\xbd\xa0\xe5\xa5\xbd",
                CaseFoldingEffect::Unchanged,
            ),
        ];
        for (input, expected, effect) in cases {
            run_test(make_uppercase, input, expected, effect);
        }
    }

    // 4) make_swapcase
    #[test]
    fn test_make_swapcase() {
        let cases: [(&[u8], &[u8], CaseFoldingEffect); 10] = [
            // empty
            (b"", b"", CaseFoldingEffect::Unchanged),
            // numeric => no change
            (b"123", b"123", CaseFoldingEffect::Unchanged),
            // ASCII letters
            (b"hEllO", b"HeLLo", CaseFoldingEffect::Modified),
            (b"HELLO", b"hello", CaseFoldingEffect::Modified),
            (b"hello", b"HELLO", CaseFoldingEffect::Modified),
            // spaces => no alpha
            (b"   \t", b"   \t", CaseFoldingEffect::Unchanged),
            // invalid ASCII
            (b"\xffAB", b"\xffab", CaseFoldingEffect::Modified),
            // Greek => no ASCII changes
            (
                b"\xce\x93\xce\xb5\xce\xb9\xce\xac",
                b"\xce\x93\xce\xb5\xce\xb9\xce\xac",
                CaseFoldingEffect::Unchanged,
            ),
            // turkic => no ASCII changes
            (b"\xc4\xb0 \xc4\xb1", b"\xc4\xb0 \xc4\xb1", CaseFoldingEffect::Unchanged),
            // Chinese => no changes
            (
                b"\xe4\xbd\xa0\xe5\xa5\xbd",
                b"\xe4\xbd\xa0\xe5\xa5\xbd",
                CaseFoldingEffect::Unchanged,
            ),
        ];
        for (input, expected, effect) in cases {
            run_test(make_swapcase, input, expected, effect);
        }
    }
}