1use alloc::vec::Vec;
2
3use bstr::ByteVec;
4
5use crate::case_folding::CaseFoldingEffect;
6
7pub fn to_utf8_capitalized(mut bytes: &[u8]) -> (Vec<u8>, CaseFoldingEffect) {
21 let mut replacement = Vec::with_capacity(bytes.len());
27 let mut effect = CaseFoldingEffect::Unchanged;
28
29 let (ch, size) = bstr::decode_utf8(bytes);
30 let (chunk, remainder) = unsafe { bytes.split_at_unchecked(size) };
32 bytes = remainder;
33
34 if let Some(ch) = ch {
35 let old = ch;
38 for ch in ch.to_uppercase() {
39 if ch != old {
40 effect = CaseFoldingEffect::Modified;
41 }
42 replacement.push_char(ch);
43 }
44 } else {
45 replacement.extend_from_slice(chunk);
46 }
47
48 while !bytes.is_empty() {
49 let (ch, size) = bstr::decode_utf8(bytes);
50 let (chunk, remainder) = unsafe { bytes.split_at_unchecked(size) };
52 bytes = remainder;
53
54 if let Some(ch) = ch {
55 let old = ch;
58 for ch in ch.to_lowercase() {
59 if ch != old {
60 effect = CaseFoldingEffect::Modified;
61 }
62 replacement.push_char(ch);
63 }
64 } else {
65 replacement.extend_from_slice(chunk);
66 }
67 }
68
69 (replacement, effect)
70}
71
72pub fn to_utf8_lowercase(mut bytes: &[u8]) -> (Vec<u8>, CaseFoldingEffect) {
78 let mut replacement = Vec::with_capacity(bytes.len());
84 let mut effect = CaseFoldingEffect::Unchanged;
85
86 while !bytes.is_empty() {
87 let (ch, size) = bstr::decode_utf8(bytes);
89
90 let (chunk, remainder) = unsafe { bytes.split_at_unchecked(size) };
92 bytes = remainder;
93
94 let Some(ch) = ch else {
95 replacement.extend_from_slice(chunk);
97 continue;
98 };
99 let old = ch;
102 for ch in old.to_lowercase() {
103 if ch != old {
104 effect = CaseFoldingEffect::Modified;
105 }
106 replacement.push_char(ch);
107 }
108 }
109
110 (replacement, effect)
111}
112
113pub fn to_utf8_uppercase(mut bytes: &[u8]) -> (Vec<u8>, CaseFoldingEffect) {
119 let mut replacement = Vec::with_capacity(bytes.len());
125 let mut effect = CaseFoldingEffect::Unchanged;
126
127 while !bytes.is_empty() {
128 let (ch, size) = bstr::decode_utf8(bytes);
130
131 let (chunk, remainder) = unsafe { bytes.split_at_unchecked(size) };
133 bytes = remainder;
134
135 let Some(ch) = ch else {
136 replacement.extend_from_slice(chunk);
138 continue;
139 };
140 let old = ch;
143 for ch in old.to_uppercase() {
144 if ch != old {
145 effect = CaseFoldingEffect::Modified;
146 }
147 replacement.push_char(ch);
148 }
149 }
150
151 (replacement, effect)
152}
153
154pub fn to_utf8_swapcase(mut bytes: &[u8]) -> (Vec<u8>, CaseFoldingEffect) {
164 let mut replacement = Vec::with_capacity(bytes.len());
165 let mut effect = CaseFoldingEffect::Unchanged;
166
167 while !bytes.is_empty() {
168 let (ch, size) = bstr::decode_utf8(bytes);
169 let (chunk, remainder) = unsafe { bytes.split_at_unchecked(size) };
171 bytes = remainder;
172
173 let Some(ch) = ch else {
174 replacement.extend_from_slice(chunk);
177 continue;
178 };
179
180 match ch {
186 old if old.is_lowercase() => {
187 for ch in old.to_uppercase() {
188 if ch != old {
189 effect = CaseFoldingEffect::Modified;
190 }
191 replacement.push_char(ch);
192 }
193 }
194 old if old.is_uppercase() => {
195 for ch in old.to_lowercase() {
196 if ch != old {
197 effect = CaseFoldingEffect::Modified;
198 }
199 replacement.push_char(ch);
200 }
201 }
202 old => replacement.push_char(old),
203 }
204 }
205
206 (replacement, effect)
207}
208
#[cfg(test)]
mod tests {
    use bstr::ByteSlice;

    use super::*;
    use crate::case_folding::CaseFoldingEffect;

    /// One table row: input bytes, expected output bytes, expected effect.
    type Case = (&'static [u8], &'static [u8], CaseFoldingEffect);

    /// Apply `func` to `input` and check the reported folding effect first,
    /// then the produced bytes.
    #[track_caller]
    fn run_test<F>(func: F, input: &[u8], expected: &[u8], expect_fold: CaseFoldingEffect)
    where
        F: FnOnce(&[u8]) -> (Vec<u8>, CaseFoldingEffect),
    {
        let (output, effect) = func(input);

        // The effect is asserted before the bytes; the `should_panic` test in
        // this module relies on this message being the one that fires.
        assert_eq!(
            effect,
            expect_fold,
            "Expected folding effect {:?} for input {:?}, got {:?} with output {:?}",
            expect_fold,
            input.as_bstr(),
            effect,
            output.as_bstr(),
        );

        assert_eq!(
            output,
            expected,
            "Transformed result mismatch:\n input = {:?}\n expected = {:?}\n actual = {:?}",
            input.as_bstr(),
            expected.as_bstr(),
            output.as_bstr()
        );
    }

    /// Run every row of `cases` through `func`, in order.
    #[track_caller]
    fn check_all<const N: usize>(
        func: fn(&[u8]) -> (Vec<u8>, CaseFoldingEffect),
        cases: [Case; N],
    ) {
        for (input, expected, effect) in cases {
            run_test(func, input, expected, effect);
        }
    }

    #[test]
    fn test_to_utf8_capitalized() {
        let cases: [Case; 9] = [
            // Empty input.
            (b"", b"", CaseFoldingEffect::Unchanged),
            // ASCII, mixed case.
            (b"hello WORLD", b"Hello world", CaseFoldingEffect::Modified),
            // Already capitalized.
            (b"Hello world", b"Hello world", CaseFoldingEffect::Unchanged),
            (b"1234", b"1234", CaseFoldingEffect::Unchanged),
            // Leading invalid UTF-8 byte is passed through.
            (b"\xFFabc", b"\xFFabc", CaseFoldingEffect::Unchanged),
            // Uppercasing that expands to two characters.
            ("ßtest".as_bytes(), b"SStest", CaseFoldingEffect::Modified),
            ("αγαπώ".as_bytes(), "Αγαπώ".as_bytes(), CaseFoldingEffect::Modified),
            ("işaret".as_bytes(), "Işaret".as_bytes(), CaseFoldingEffect::Modified),
            // Caseless script.
            (
                "你好世界".as_bytes(),
                "你好世界".as_bytes(),
                CaseFoldingEffect::Unchanged,
            ),
        ];
        check_all(to_utf8_capitalized, cases);
    }

    #[test]
    fn test_to_utf8_capitalized_dz_digraph() {
        let cases: [Case; 3] = [
            ("DŽ".as_bytes(), "DŽ".as_bytes(), CaseFoldingEffect::Unchanged),
            ("Dž".as_bytes(), "DŽ".as_bytes(), CaseFoldingEffect::Modified),
            ("dž".as_bytes(), "DŽ".as_bytes(), CaseFoldingEffect::Modified),
        ];
        check_all(to_utf8_capitalized, cases);
    }

    #[test]
    fn test_to_utf8_lowercase() {
        let cases: [Case; 9] = [
            (b"", b"", CaseFoldingEffect::Unchanged),
            (b"HELLO", b"hello", CaseFoldingEffect::Modified),
            ("ßTEST".as_bytes(), "ßtest".as_bytes(), CaseFoldingEffect::Modified),
            // Leading invalid UTF-8 byte is passed through.
            (b"\xFFhello", b"\xFFhello", CaseFoldingEffect::Unchanged),
            (b"Hello world", b"hello world", CaseFoldingEffect::Modified),
            (b"hello world", b"hello world", CaseFoldingEffect::Unchanged),
            ("İŞARET".as_bytes(), "i̇şaret".as_bytes(), CaseFoldingEffect::Modified),
            ("你好".as_bytes(), "你好".as_bytes(), CaseFoldingEffect::Unchanged),
            ("ΑΓΑΠΩ".as_bytes(), "αγαπω".as_bytes(), CaseFoldingEffect::Modified),
        ];
        check_all(to_utf8_lowercase, cases);
    }

    #[test]
    fn test_to_utf8_lowercase_dz_digraph() {
        let cases: [Case; 3] = [
            ("DŽ".as_bytes(), "dž".as_bytes(), CaseFoldingEffect::Modified),
            ("Dž".as_bytes(), "dž".as_bytes(), CaseFoldingEffect::Modified),
            ("dž".as_bytes(), "dž".as_bytes(), CaseFoldingEffect::Unchanged),
        ];
        check_all(to_utf8_lowercase, cases);
    }

    #[test]
    fn test_to_utf8_uppercase() {
        let cases: [Case; 8] = [
            (b"", b"", CaseFoldingEffect::Unchanged),
            (b"hello", b"HELLO", CaseFoldingEffect::Modified),
            ("ßtest".as_bytes(), b"SSTEST", CaseFoldingEffect::Modified),
            (b"hello world", b"HELLO WORLD", CaseFoldingEffect::Modified),
            (b"HELLO", b"HELLO", CaseFoldingEffect::Unchanged),
            ("işaret".as_bytes(), "IŞARET".as_bytes(), CaseFoldingEffect::Modified),
            ("你好".as_bytes(), "你好".as_bytes(), CaseFoldingEffect::Unchanged),
            ("αγαπώ".as_bytes(), "ΑΓΑΠΏ".as_bytes(), CaseFoldingEffect::Modified),
        ];
        check_all(to_utf8_uppercase, cases);
    }

    #[test]
    fn test_to_utf8_uppercase_dz_digraph() {
        let cases: [Case; 3] = [
            ("DŽ".as_bytes(), "DŽ".as_bytes(), CaseFoldingEffect::Unchanged),
            ("Dž".as_bytes(), "DŽ".as_bytes(), CaseFoldingEffect::Modified),
            ("dž".as_bytes(), "DŽ".as_bytes(), CaseFoldingEffect::Modified),
        ];
        check_all(to_utf8_uppercase, cases);
    }

    #[test]
    fn test_to_utf8_swapcase() {
        let cases: [Case; 8] = [
            (b"", b"", CaseFoldingEffect::Unchanged),
            (b"hEllo", b"HeLLO", CaseFoldingEffect::Modified),
            (b"1234", b"1234", CaseFoldingEffect::Unchanged),
            ("ßTEST".as_bytes(), "SStest".as_bytes(), CaseFoldingEffect::Modified),
            // Invalid byte is passed through; the ASCII after it is swapped.
            (b"\xFFabc", b"\xFFABC", CaseFoldingEffect::Modified),
            ("iŞARET".as_bytes(), "Işaret".as_bytes(), CaseFoldingEffect::Modified),
            ("你好".as_bytes(), "你好".as_bytes(), CaseFoldingEffect::Unchanged),
            ("αγΑΠΏ".as_bytes(), "ΑΓαπώ".as_bytes(), CaseFoldingEffect::Modified),
        ];
        check_all(to_utf8_swapcase, cases);
    }

    // Expected to panic: the second row comes back unchanged from
    // `to_utf8_swapcase`, which trips the effect assertion in `run_test`.
    #[test]
    #[should_panic = r#"Expected folding effect Modified for input "Dž", got Unchanged with output "Dž""#]
    fn test_to_utf8_swapcase_dz_digraph() {
        let cases: [Case; 4] = [
            ("DŽ".as_bytes(), "dž".as_bytes(), CaseFoldingEffect::Modified),
            ("Dž".as_bytes(), "dŽ".as_bytes(), CaseFoldingEffect::Modified),
            ("dŽ".as_bytes(), "Dž".as_bytes(), CaseFoldingEffect::Modified),
            ("dž".as_bytes(), "DŽ".as_bytes(), CaseFoldingEffect::Modified),
        ];
        check_all(to_utf8_swapcase, cases);
    }
}