spinoso_string/enc/binascii.rs
1//! Shared encoding and decoding logic for binary and ASCII-8BIT strings.
2
3use crate::case_folding::CaseFoldingEffect;
4
5/// Convert this ASCII or binary string to a "capitalized" form in-place,
6/// returning whether any changes were made.
7///
8/// This routine ensures the first character is upcased, while every subsequent
9/// character is downcased. The function scans only until it detects the first
10/// difference from the current capitalized form:
11///
12/// 1. If the very first byte differs once converted to uppercase, we mark
13/// `CaseFoldingEffect::Changed`, downcase the remainder of the slice in one
14/// go, and return immediately.
15/// 2. If the first byte produces no change, the function delegates to
16/// [`make_lowercase`] for the rest of the slice, again short-circuiting
17/// once a difference is found.
18///
19/// If the entire slice is already in "capitalized" form, this function returns
20/// [`CaseFoldingEffect::Unchanged`].
21///
22/// [`make_lowercase`]: crate::binascii::make_lowercase
23pub fn make_capitalized(s: &mut [u8]) -> CaseFoldingEffect {
24 // Capitalize means:
25 // 1) Upcase the first character
26 // 2) Downcase the rest
27 //
28 // Only do byte equality checks until the first difference is found.
29 // If we find a difference on the first char, we mark `Changed` and
30 // immediately do a no-check downcase on the remainder.
31 // If we find no difference for the first char, then we proceed similarly
32 // for the remainder with the same short-circuit logic.
33
34 // If the buffer is empty => Unchanged
35 let Some((first, s)) = s.split_first_mut() else {
36 return CaseFoldingEffect::Unchanged;
37 };
38
39 // Upcase the first char
40 let old_first = *first;
41 let new_first = old_first.to_ascii_uppercase();
42 *first = new_first;
43
44 if old_first != new_first {
45 // We found a difference in the first char. Now downcase the rest, no
46 // further equality checks.
47 s.make_ascii_lowercase();
48 return CaseFoldingEffect::Modified;
49 }
50
51 // If we get here, the first char had no change. For the remainder of the
52 // string, let's do a single-pass approach and delegate to
53 // `make_lowercase()`.
54 make_lowercase(s)
55}
56
57/// Convert this ASCII or binary string to lowercase in-place, returning
58/// whether any changes were made.
59///
60/// The function scans each byte until it finds a character that actually
61/// needs to be changed (e.g., `A` → `a`). Once we detect the first changed
62/// byte, we call `make_ascii_lowercase()` on the rest of the slice and
63/// immediately return `CaseFoldingEffect::Changed`.
64///
65/// If we reach the end of the string without finding any uppercase byte, we
66/// return `CaseFoldingEffect::Unchanged`.
67pub fn make_lowercase(mut s: &mut [u8]) -> CaseFoldingEffect {
68 loop {
69 // Split off the first byte. If there is no first byte (the slice is
70 // empty), then we've scanned the entire string without encountering a
71 // change, so we return `Unchanged`.
72 let Some((head, tail)) = s.split_first_mut() else {
73 return CaseFoldingEffect::Unchanged;
74 };
75 let old = *head;
76 let new = old.to_ascii_lowercase();
77 s = tail;
78
79 // Overwrite this byte with its lowercase version.
80 *head = new;
81
82 // If this byte actually changed (e.g., old was 'A', new is 'a'), then
83 // for the rest of the string we skip further comparisons and just do
84 // the direct ASCII-lowercasing.
85 if old != new {
86 s.make_ascii_lowercase();
87 return CaseFoldingEffect::Modified;
88 }
89 }
90}
91
92/// Convert this ASCII or binary string to uppercase in-place, returning
93/// whether any changes were made.
94///
95/// The function scans each byte until it finds a character that actually
96/// needs to be changed (e.g., `a` → `A`). Once we detect the first changed
97/// byte, we call `make_ascii_uppercase()` on the rest of the slice and
98/// immediately return `CaseFoldingEffect::Changed`.
99///
100/// If we reach the end of the string without finding any lowercase byte, we
101/// return `CaseFoldingEffect::Unchanged`.
102pub fn make_uppercase(mut s: &mut [u8]) -> CaseFoldingEffect {
103 loop {
104 let Some((head, tail)) = s.split_first_mut() else {
105 return CaseFoldingEffect::Unchanged;
106 };
107 let old = *head;
108 let new = old.to_ascii_uppercase();
109 s = tail;
110
111 *head = new;
112
113 if old != new {
114 s.make_ascii_uppercase();
115 return CaseFoldingEffect::Modified;
116 }
117 }
118}
119
120/// Convert this ASCII or binary string to “swapcase” in-place, returning
121/// whether any changes were made.
122///
123/// “Swapcase” means each ASCII-lowercase byte is converted to uppercase,
124/// and each ASCII-uppercase byte is converted to lowercase; any other
125/// byte is left unchanged. The algorithm short-circuits upon detecting
126/// the first modified byte:
127///
128/// 1. It scans each byte, comparing the swapped version to the original.
129/// 2. Once the first difference is found, the rest of the bytes are
130/// “swapcased” without further equality checks, and
131/// [`CaseFoldingEffect::Changed`] is returned.
132/// 3. If the entire slice is processed with no changes, returns
133/// [`CaseFoldingEffect::Unchanged`].
134pub fn make_swapcase(mut s: &mut [u8]) -> CaseFoldingEffect {
135 #[inline]
136 fn to_swapcase(b: u8) -> u8 {
137 if b.is_ascii_lowercase() {
138 b.to_ascii_uppercase()
139 } else if b.is_ascii_uppercase() {
140 b.to_ascii_lowercase()
141 } else {
142 b
143 }
144 }
145
146 loop {
147 let Some((head, tail)) = s.split_first_mut() else {
148 return CaseFoldingEffect::Unchanged;
149 };
150 let old = *head;
151 let new = to_swapcase(old);
152 s = tail;
153
154 *head = new;
155
156 if *head != old {
157 // We found a difference => do a "no-check" swapcase for the rest
158 for b in s {
159 let old = *b;
160 let new = to_swapcase(old);
161 *b = new;
162 }
163 return CaseFoldingEffect::Modified;
164 }
165 }
166}
167
#[cfg(test)]
mod tests {
    use bstr::ByteSlice;

    use super::*;
    use crate::case_folding::CaseFoldingEffect::{self, Modified, Unchanged};

    /// One table-driven scenario: the input bytes, the bytes expected after
    /// folding, and the effect the folding function is expected to report.
    type Case = (&'static [u8], &'static [u8], CaseFoldingEffect);

    // Multi-byte UTF-8 fixtures. None of their bytes fall in the ASCII letter
    // ranges, so ASCII-only folding must leave them untouched.
    const GREEK: &[u8] = b"\xce\x93\xce\xb5\xce\xb9\xce\xac";
    const TURKIC: &[u8] = b"\xc4\xb0 \xc4\xb1";
    const CHINESE: &[u8] = b"\xe4\xbd\xa0\xe5\xa5\xbd";

    /// Apply `func` to a private copy of `input`, then check both the
    /// reported effect and the resulting bytes.
    #[track_caller]
    fn run_test<F>(func: F, input: &[u8], expected: &[u8], expect_fold: CaseFoldingEffect)
    where
        F: FnOnce(&mut [u8]) -> CaseFoldingEffect,
    {
        let mut buf = Vec::from(input);
        let effect = func(buf.as_mut_slice());
        assert_eq!(effect, expect_fold, "CaseFoldingEffect mismatch on {:?}", input.as_bstr());
        assert_eq!(
            buf.as_bstr(),
            expected.as_bstr(),
            "Result bytes mismatch on {:?}",
            input.as_bstr()
        );
    }

    /// Run every scenario in `cases` through `func`.
    #[track_caller]
    fn run_cases<const N: usize>(func: fn(&mut [u8]) -> CaseFoldingEffect, cases: [Case; N]) {
        for (input, expected, effect) in cases {
            run_test(func, input, expected, effect);
        }
    }

    #[test]
    fn test_make_capitalized() {
        let cases: [Case; 7] = [
            // Empty input: nothing to fold.
            (b"", b"", Unchanged),
            // Digits only: no letters to fold.
            (b"1234", b"1234", Unchanged),
            // Whitespace only: no letters to fold.
            (b" \t ", b" \t ", Unchanged),
            // A leading control byte has no uppercase form, so the first byte
            // is unchanged and the tail is lowercased, which folds the `H`.
            (b"\x01\x02Hello", b"\x01\x02hello", Modified),
            // A leading non-ASCII byte is left alone and the tail is already
            // lowercase, so nothing changes.
            (b"\xFFabc", b"\xFFabc", Unchanged),
            // Lowercase first letter is upcased.
            (b"hello world", b"Hello world", Modified),
            // Already capitalized.
            (b"Hello", b"Hello", Unchanged),
        ];
        run_cases(make_capitalized, cases);
    }

    #[test]
    fn test_make_lowercase() {
        let cases: [Case; 9] = [
            (b"", b"", Unchanged),
            (b"1234", b"1234", Unchanged),
            (b"HELLO", b"hello", Modified),
            (b"Hello1", b"hello1", Modified),
            // Whitespace only: no letters to fold.
            (b" \t", b" \t", Unchanged),
            // Non-ASCII bytes are skipped; the ASCII letters after them fold.
            (b"\x80\x81HI", b"\x80\x81hi", Modified),
            (GREEK, GREEK, Unchanged),
            (TURKIC, TURKIC, Unchanged),
            (CHINESE, CHINESE, Unchanged),
        ];
        run_cases(make_lowercase, cases);
    }

    #[test]
    fn test_make_uppercase() {
        let cases: [Case; 9] = [
            (b"", b"", Unchanged),
            (b"1234", b"1234", Unchanged),
            (b"hello", b"HELLO", Modified),
            (b"hEllo2", b"HELLO2", Modified),
            // Whitespace only: no letters to fold.
            (b" \t", b" \t", Unchanged),
            // Non-ASCII bytes are skipped; the ASCII letters after them fold.
            (b"\x80\x81hi", b"\x80\x81HI", Modified),
            (GREEK, GREEK, Unchanged),
            (TURKIC, TURKIC, Unchanged),
            (CHINESE, CHINESE, Unchanged),
        ];
        run_cases(make_uppercase, cases);
    }

    #[test]
    fn test_make_swapcase() {
        let cases: [Case; 10] = [
            (b"", b"", Unchanged),
            // Digits only: no letters to swap.
            (b"123", b"123", Unchanged),
            // Mixed, all-upper and all-lower ASCII letters.
            (b"hEllO", b"HeLLo", Modified),
            (b"HELLO", b"hello", Modified),
            (b"hello", b"HELLO", Modified),
            // Whitespace only: no letters to swap.
            (b" \t", b" \t", Unchanged),
            // Non-ASCII byte is skipped; the ASCII letters after it swap.
            (b"\xffAB", b"\xffab", Modified),
            (GREEK, GREEK, Unchanged),
            (TURKIC, TURKIC, Unchanged),
            (CHINESE, CHINESE, Unchanged),
        ];
        run_cases(make_swapcase, cases);
    }
}