// spinoso_string/enc/utf8/borrowed/codepoints.rs

use core::str::Chars;

use super::Utf8Str;
use crate::CodepointsError;

/// Iterator over the Unicode codepoints of a UTF-8 encoded string.
///
/// Each `char` produced by the wrapped [`Chars`] iterator is yielded as its
/// `u32` scalar value. Build one with `Codepoints::try_from` on a `&Utf8Str`
/// (which fails if the bytes are not valid UTF-8), or use
/// `Codepoints::default()` for an iterator that yields nothing.
#[derive(Debug, Clone)]
pub struct Codepoints<'a> {
    /// Character iterator over the validated string contents.
    inner: Chars<'a>,
}

impl<'a> TryFrom<&'a Utf8Str> for Codepoints<'a> {
    type Error = CodepointsError;

    /// Validates the whole byte content of `s` as UTF-8 and, on success,
    /// returns an iterator over its codepoints.
    ///
    /// # Errors
    ///
    /// Returns [`CodepointsError::invalid_utf8_codepoint`] if `s` contains any
    /// byte sequence that is not valid UTF-8. This mirrors MRI, which raises
    /// instead of yielding a partial result:
    ///
    /// ```text
    /// [3.2.2] > s = "abc\xFFxyz"
    /// => "abc\xFFxyz"
    /// [3.2.2] > s.encoding
    /// => #<Encoding:UTF-8>
    /// [3.2.2] > s.codepoints
    /// (irb):5:in `codepoints': invalid byte sequence in UTF-8 (ArgumentError)
    /// ```
    #[inline]
    fn try_from(s: &'a Utf8Str) -> Result<Self, Self::Error> {
        simdutf8::basic::from_utf8(s.as_bytes())
            .map(|valid| Self { inner: valid.chars() })
            // The validation error carries no detail that callers need, so it
            // is discarded in favour of the crate's own error value.
            .map_err(|_| CodepointsError::invalid_utf8_codepoint())
    }
}

/// Yields each character of the underlying string as its `u32` scalar value.
///
/// `size_hint`, `count` and `last` are forwarded to the wrapped [`Chars`]
/// iterator so that consumers such as `collect` can pre-reserve capacity and
/// so the specialised `Chars` implementations are used instead of the generic
/// `Iterator` defaults.
impl Iterator for Codepoints<'_> {
    type Item = u32;

    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        self.inner.next().map(u32::from)
    }

    /// Reports the bounds computed by the inner `Chars` iterator; mapping
    /// `char` to `u32` is one-to-one, so the bounds carry over unchanged.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.inner.size_hint()
    }

    /// Counts the remaining codepoints without converting each one to `u32`.
    #[inline]
    fn count(self) -> usize {
        self.inner.count()
    }

    /// Returns the final codepoint, if any.
    #[inline]
    fn last(self) -> Option<Self::Item> {
        self.inner.last().map(u32::from)
    }
}

impl Default for Codepoints<'_> {
    /// Creates an iterator over the empty string, which yields no codepoints.
    #[inline]
    fn default() -> Self {
        // A `'static` empty literal is valid for any requested lifetime.
        let empty: &'static str = "";
        let inner = empty.chars();
        Self { inner }
    }
}

#[cfg(test)]
mod tests {
    use alloc::vec::Vec;

    use super::*;

    /// Valid UTF-8 input yields one `u32` per character, including characters
    /// outside the Basic Multilingual Plane.
    #[test]
    fn test_valid_utf8() {
        // The trailing character is U+1F48E GEM STONE (decimal 128142), a
        // four-byte UTF-8 sequence.
        let s = Utf8Str::new("hello💎");
        let codepoints = Codepoints::try_from(s).unwrap();
        assert_eq!(codepoints.collect::<Vec<_>>(), &[104, 101, 108, 108, 111, 128_142]);
    }

    /// A stray `0xFF` byte is never valid UTF-8, so construction must fail
    /// with the dedicated error rather than produce a partial iterator.
    #[test]
    fn test_invalid_utf8() {
        let s = Utf8Str::new(b"hello\xFF");
        let err = Codepoints::try_from(s).unwrap_err();
        assert_eq!(err, CodepointsError::invalid_utf8_codepoint());
    }
}