spinoso_regexp/regexp/regex/utf8/
mod.rs

1use core::fmt;
2use core::str;
3use std::collections::HashSet;
4
5use regex::{Match, Regex, RegexBuilder};
6use scolapasta_string_escape::format_debug_escape_into;
7
8use crate::debug::Debug;
9use crate::encoding::Encoding;
10use crate::error::{ArgumentError, Error, RegexpError, SyntaxError};
11use crate::named_captures::{NamedCapture, NamedCaptures, NamedCapturesForHaystack};
12use crate::{Config, Source};
13
14mod iter;
15
16pub use iter::{CaptureIndices, Captures};
17
18#[derive(Debug, Clone)]
19pub struct Utf8 {
20    source: Source,
21    config: Config,
22    encoding: Encoding,
23    regex: Regex,
24}
25
26impl fmt::Display for Utf8 {
27    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
28        let pattern = self.config.pattern();
29        format_debug_escape_into(f, pattern)?;
30        Ok(())
31    }
32}
33
34impl Utf8 {
35    /// Construct a Regexp with a UTF-8 [`regex`] backend.
36    ///
37    /// The constructed regexp is Unicode aware. All character classes used in
38    /// patterns other than POSIX character classes support all of Unicode.
39    ///
40    /// `Utf8` regexps require their patterns and haystacks to be valid UTF-8.
41    ///
42    /// # Examples
43    ///
44    /// ```
45    /// # use spinoso_regexp::{Config, Encoding, Error, Options, Source, Utf8};
46    /// # fn example() -> Result<(), Error> {
47    /// let pattern = br"[[:alpha:]]\d+ \d+";
48    /// let source = Source::with_pattern_and_options(pattern.to_vec(), Options::default());
49    /// let config = Config::from(&source);
50    /// let regexp = Utf8::with_literal_derived_encoding(source, config, Encoding::None)?;
51    /// assert!(regexp.is_match("a123 १०೩೬".as_bytes(), None)?);
52    /// # Ok(())
53    /// # }
54    /// # example().unwrap()
55    /// ```
56    ///
57    /// # Errors
58    ///
59    /// If the pattern in the given source is not valid UTF-8, an
60    /// [`ArgumentError`] is returned. If the given source pattern fails to
61    /// parse, either a [`SyntaxError`] or [`RegexpError`] is returned depending
62    /// on the source [`Options`].
63    ///
64    /// [`regex`]: regex::Regex
65    /// [`Options`]: crate::Options
66    pub fn with_literal_derived_encoding(source: Source, config: Config, encoding: Encoding) -> Result<Self, Error> {
67        let pattern = str::from_utf8(config.pattern()).map_err(|_| ArgumentError::unsupported_pattern_encoding())?;
68        let mut builder = RegexBuilder::new(pattern);
69        builder.case_insensitive(config.options.ignore_case().is_enabled());
70        builder.multi_line(config.options.multiline().is_enabled());
71        builder.ignore_whitespace(config.options.extended().is_enabled());
72
73        let regex = match builder.build() {
74            Ok(regex) => regex,
75            Err(err) if source.options.is_literal() => {
76                return Err(SyntaxError::from(err.to_string()).into());
77            }
78            Err(err) => return Err(RegexpError::from(err.to_string()).into()),
79        };
80        let regexp = Self {
81            source,
82            config,
83            encoding,
84            regex,
85        };
86        Ok(regexp)
87    }
88
89    /// # Errors
90    ///
91    /// If the given haystack is not valid UTF-8, an error is returned.
92    pub fn captures<'a>(&self, haystack: &'a [u8]) -> Result<Option<Captures<'a>>, Error> {
93        let haystack = str::from_utf8(haystack).map_err(|_| ArgumentError::unsupported_haystack_encoding())?;
94        Ok(self.regex.captures(haystack).map(Captures::from))
95    }
96
97    pub fn capture_indices_for_name<'a, 'b>(&'a self, name: &'b [u8]) -> CaptureIndices<'a, 'b> {
98        CaptureIndices::with_name_and_iter(name, self.regex.capture_names())
99    }
100
101    /// Returns the number of captures.
102    #[must_use]
103    pub fn captures_len(&self) -> usize {
104        self.regex.captures_len()
105    }
106
107    /// The number of captures for a match of `haystack` against this regexp.
108    ///
109    /// Captures represents a group of captured strings for a single match.
110    ///
111    /// If there is a match, the returned value is always greater than 0; the
112    /// 0th capture always corresponds to the entire match.
113    ///
114    /// # Errors
115    ///
116    /// If the given haystack is not valid UTF-8, an error is returned.
117    pub fn capture_count_for_haystack(&self, haystack: &[u8]) -> Result<usize, ArgumentError> {
118        let haystack = str::from_utf8(haystack).map_err(|_| ArgumentError::unsupported_haystack_encoding())?;
119        if let Some(captures) = self.regex.captures(haystack) {
120            Ok(captures.len())
121        } else {
122            Ok(0)
123        }
124    }
125
126    /// Return the 0th capture group if `haystack` is matched by this regexp.
127    ///
128    /// The 0th capture always corresponds to the entire match.
129    ///
130    /// # Errors
131    ///
132    /// If the given haystack is not valid UTF-8, an error is returned.
133    pub fn entire_match<'a>(&self, haystack: &'a [u8]) -> Result<Option<&'a [u8]>, Error> {
134        let haystack = str::from_utf8(haystack).map_err(|_| ArgumentError::unsupported_haystack_encoding())?;
135        if let Some(captures) = self.regex.captures(haystack) {
136            let entire_match = captures.get(0);
137            Ok(entire_match.as_ref().map(Match::as_str).map(str::as_bytes))
138        } else {
139            Ok(None)
140        }
141    }
142
143    /// Returns a hash representing information about the named captures of this
144    /// `Regexp`.
145    ///
146    /// A key of the hash is a name of the named captures. A value of the hash
147    /// is an array which is list of indexes of corresponding named captures.
148    pub fn named_captures(&self) -> NamedCaptures {
149        // Use a Vec of key-value pairs because insertion order matters for spec
150        // compliance.
151        let mut map = vec![];
152        for group in self.regex.capture_names().flatten() {
153            let indices = self.capture_indices_for_name(group.as_bytes()).collect::<Vec<_>>();
154            if !indices.is_empty() {
155                map.push(NamedCapture::new(group.into(), indices));
156            }
157        }
158        map.into()
159    }
160
161    /// # Errors
162    ///
163    /// If the given haystack is not valid UTF-8, an error is returned.
164    pub fn named_captures_for_haystack(&self, haystack: &[u8]) -> Result<Option<NamedCapturesForHaystack>, Error> {
165        let haystack = str::from_utf8(haystack).map_err(|_| ArgumentError::unsupported_haystack_encoding())?;
166        let captures = if let Some(captures) = self.regex.captures(haystack) {
167            captures
168        } else {
169            return Ok(None);
170        };
171        let mut map = NamedCapturesForHaystack::with_capacity(captures.len());
172        for named_capture in self.named_captures() {
173            let (group, indices) = named_capture.into_group_and_indices();
174            let capture = indices.iter().rev().copied().find_map(|index| captures.get(index));
175            if let Some(capture) = capture {
176                map.insert(group, Some(capture.as_str().into()));
177            } else {
178                map.insert(group, None);
179            }
180        }
181        Ok(Some(map))
182    }
183
184    #[must_use]
185    pub fn names(&self) -> Vec<Vec<u8>> {
186        let mut names = vec![];
187        let mut capture_names = self.named_captures().collect::<Vec<_>>();
188        capture_names.sort_by(|left, right| {
189            let left = left.indices().iter().min().copied().unwrap_or(usize::MAX);
190            let right = right.indices().iter().min().copied().unwrap_or(usize::MAX);
191            left.cmp(&right)
192        });
193        let mut set = HashSet::with_capacity(capture_names.len());
194        for cn in capture_names {
195            let name = cn.into_group();
196            if set.contains(&name) {
197                continue;
198            }
199            names.push(name.clone());
200            set.insert(name);
201        }
202        names
203    }
204
205    /// # Errors
206    ///
207    /// If the given haystack is not valid UTF-8, an error is returned.
208    pub fn pos(&self, haystack: &[u8], at: usize) -> Result<Option<(usize, usize)>, Error> {
209        let haystack = str::from_utf8(haystack).map_err(|_| ArgumentError::unsupported_haystack_encoding())?;
210        let pos = self
211            .regex
212            .captures(haystack)
213            .and_then(|captures| captures.get(at))
214            .map(|match_pos| (match_pos.start(), match_pos.end()));
215        Ok(pos)
216    }
217
218    /// Check whether this regexp matches the given haystack starting at an offset.
219    ///
220    /// If the given offset is negative, it counts backward from the end of the
221    /// haystack.
222    ///
223    /// # Errors
224    ///
225    /// If the given haystack is not valid UTF-8, an error is returned.
226    pub fn is_match(&self, haystack: &[u8], pos: Option<i64>) -> Result<bool, Error> {
227        let haystack = str::from_utf8(haystack).map_err(|_| ArgumentError::unsupported_haystack_encoding())?;
228        let haystack_char_len = haystack.chars().count();
229        let pos = pos.unwrap_or_default();
230        let pos = if let Some(pos) = scolapasta_aref::offset_to_index(pos, haystack_char_len) {
231            pos
232        } else {
233            return Ok(false);
234        };
235        let offset = haystack.chars().take(pos).map(char::len_utf8).sum();
236        let haystack = &haystack[offset..];
237        Ok(self.regex.find(haystack).is_some())
238    }
239
240    pub fn debug(&self) -> Debug<'_> {
241        Debug::new(
242            self.source.pattern(),
243            self.source.options.as_display_modifier(),
244            self.encoding.as_modifier_str(),
245        )
246    }
247
248    #[must_use]
249    pub fn is_literal(&self) -> bool {
250        self.source.options().is_literal()
251    }
252
253    #[must_use]
254    pub fn source(&self) -> &Source {
255        &self.source
256    }
257
258    #[must_use]
259    pub fn config(&self) -> &Config {
260        &self.config
261    }
262
263    #[must_use]
264    pub fn encoding(&self) -> Encoding {
265        self.encoding
266    }
267
268    #[must_use]
269    pub fn string(&self) -> &[u8] {
270        self.config.pattern()
271    }
272}
273
#[cfg(test)]
mod tests {
    use bstr::{B, ByteSlice};

    use super::Utf8;
    use crate::{Config, Encoding, Error, Flags, Options, Source};

    /// Try to build a `Utf8` regexp from a pattern, falling back to the
    /// default options when none are given.
    fn try_make(pattern: impl AsRef<[u8]>, options: Option<Options>, encoding: Encoding) -> Result<Utf8, Error> {
        let source = Source::with_pattern_and_options(pattern.as_ref().to_vec(), options.unwrap_or_default());
        let config = Config::from(&source);
        Utf8::with_literal_derived_encoding(source, config, encoding)
    }

    /// Build a `Utf8` regexp from a pattern that is expected to compile.
    fn make(pattern: impl AsRef<[u8]>, options: Option<Options>, encoding: Encoding) -> Utf8 {
        try_make(pattern, options, encoding).unwrap()
    }

    #[test]
    fn can_compile_posix_character_classes() {
        let regexp = make("[[:digit:]][[:space:]][[:alpha:]][[:punct:]]", None, Encoding::None);
        assert!(regexp.is_match(b"1 a&", None).unwrap());
    }

    #[test]
    fn can_compile_perl_unicode_patterns() {
        let regexp = make(r"\d+ \d+", None, Encoding::None);
        // This haystack contains non-ASCII numerals in the Unicode Nd character
        // class. The sequence contains Devanagari 1, Devanagari 0, Kannada 3,
        // and Kannada 6.
        //
        // See:
        //
        // - <https://en.wikipedia.org/wiki/Devanagari_numerals#Table>
        // - <https://en.wikipedia.org/wiki/Kannada_script#Numerals>
        let haystack = "123 १०೩೬";
        assert!(regexp.is_match(haystack.as_bytes(), None).unwrap());
    }

    #[test]
    fn requires_utf8_encoding_for_pattern() {
        let err = try_make(b"abc \xFF\xFE 123", None, Encoding::None).unwrap_err();
        assert!(matches!(err, Error::Argument(err) if err.message() == "Unsupported pattern encoding"));
    }

    #[test]
    fn invalid_pattern_is_syntax_error_for_literal() {
        let options = Options::from(Flags::LITERAL);
        let err = try_make(b"[", Some(options), Encoding::None).unwrap_err();
        assert!(matches!(err, Error::Syntax(..)));
    }

    // Non-literal (compiled) regexps report compile failures as `RegexpError`
    // rather than `SyntaxError`.
    #[test]
    fn invalid_pattern_is_regexp_error_for_compiled() {
        let options = Options::from(Flags::ALL_REGEXP_OPTS);
        let err = try_make(b"[", Some(options), Encoding::None).unwrap_err();
        assert!(matches!(err, Error::Regexp(..)));
    }

    #[test]
    fn literal_pattern_backrefs_are_not_supported() {
        let options = Options::from(Flags::LITERAL);
        let err = try_make(br"\0", Some(options), Encoding::None).unwrap_err();
        assert!(matches!(err, Error::Syntax(err) if err.message().contains("backreferences are not supported")));
    }

    #[test]
    fn compiled_pattern_backrefs_are_not_supported() {
        let options = Options::from(Flags::ALL_REGEXP_OPTS);
        let err = try_make(br"\0", Some(options), Encoding::None).unwrap_err();
        assert!(matches!(err, Error::Regexp(err) if err.message().contains("backreferences are not supported")));
    }

    #[test]
    fn is_literal() {
        let literal = make("abc", Some(Options::from(Flags::LITERAL)), Encoding::None);
        assert!(literal.is_literal());

        let no_flags = make("abc", Some(Options::from(Flags::empty())), Encoding::None);
        assert!(!no_flags.is_literal());

        let all_regexp_opts = make("abc", Some(Options::from(Flags::ALL_REGEXP_OPTS)), Encoding::None);
        assert!(!all_regexp_opts.is_literal());

        let default_options = make("abc", None, Encoding::None);
        assert!(!default_options.is_literal());
    }

    #[test]
    fn string() {
        let test_cases = [
            ("abc", B("abc")),
            ("xyz", B("xyz")),
            ("🦀", B("🦀")),
            ("铁锈", B("铁锈")),
        ];
        for (pattern, string) in test_cases {
            let regexp = make(pattern, None, Encoding::None);
            assert_eq!(
                regexp.string().as_bstr(),
                string.as_bstr(),
                "Mismatched string for pattern"
            );
        }
    }

    #[test]
    fn fmt_display() {
        let test_cases = [
            (B("abc"), "abc"),
            (B("xyz"), "xyz"),
            (B("🦀"), "🦀"),
            (B("铁锈"), "铁锈"),
            // Invalid UTF-8 patterns are not supported 👇
            // ```
            // (B(b"\xFF\xFE"), r"\xFF\xFE"),
            // (B(b"abc \xFF\xFE xyz"), r"abc \xFF\xFE xyz"),
            // ```
        ];
        for (pattern, display) in test_cases {
            let regexp = make(pattern, None, Encoding::None);
            assert_eq!(regexp.to_string(), display, "Mismatched display impl for pattern");
        }
    }

    #[test]
    fn debug() {
        let test_cases = [
            (B("\0"), r"/\x00/", Options::default()),
            (B("\0"), r"/\x00/mix", Options::from(Flags::ALL_REGEXP_OPTS)),
            (B("\0"), r"/\x00/ix", Options::from(Flags::IGNORECASE | Flags::EXTENDED)),
            (B("\0"), r"/\x00/m", Options::from(Flags::MULTILINE)),
            (B(b"\x0a"), "/\n/", Options::default()),
            (B("\x0B"), "/\x0B/", Options::default()),
            // NOTE: the control characters, not a raw string, are in the debug output.
            (B("\n\r\t"), "/\n\r\t/", Options::default()),
            (B("\n\r\t"), "/\n\r\t/mix", Options::from(Flags::ALL_REGEXP_OPTS)),
            (
                B("\n\r\t"),
                "/\n\r\t/ix",
                Options::from(Flags::IGNORECASE | Flags::EXTENDED),
            ),
            (B("\n\r\t"), "/\n\r\t/m", Options::from(Flags::MULTILINE)),
            (B("\x7F"), r"/\x7F/", Options::default()),
            (B("\x7F"), r"/\x7F/mix", Options::from(Flags::ALL_REGEXP_OPTS)),
            (
                B("\x7F"),
                r"/\x7F/ix",
                Options::from(Flags::IGNORECASE | Flags::EXTENDED),
            ),
            (B("\x7F"), r"/\x7F/m", Options::from(Flags::MULTILINE)),
            (B(r"\a"), r"/\a/", Options::default()),
            (B(r"\a"), r"/\a/mix", Options::from(Flags::ALL_REGEXP_OPTS)),
            (B(r"\a"), r"/\a/ix", Options::from(Flags::IGNORECASE | Flags::EXTENDED)),
            (B(r"\a"), r"/\a/m", Options::from(Flags::MULTILINE)),
            (B("abc"), "/abc/", Options::default()),
            (B("abc"), "/abc/mix", Options::from(Flags::ALL_REGEXP_OPTS)),
            (B("abc"), "/abc/ix", Options::from(Flags::IGNORECASE | Flags::EXTENDED)),
            (B("abc"), "/abc/m", Options::from(Flags::MULTILINE)),
            (B("a+b*c"), "/a+b*c/mix", Options::from(Flags::ALL_REGEXP_OPTS)),
            (B("xyz"), "/xyz/", Options::default()),
            (B("xyz"), "/xyz/mix", Options::from(Flags::ALL_REGEXP_OPTS)),
            (B("xyz"), "/xyz/ix", Options::from(Flags::IGNORECASE | Flags::EXTENDED)),
            (B("xyz"), "/xyz/m", Options::from(Flags::MULTILINE)),
            (B("x+y*z"), "/x+y*z/mix", Options::from(Flags::ALL_REGEXP_OPTS)),
            (B("🦀💎"), "/🦀💎/", Options::default()),
            (B("🦀💎"), "/🦀💎/mix", Options::from(Flags::ALL_REGEXP_OPTS)),
            (
                B("🦀💎"),
                "/🦀💎/ix",
                Options::from(Flags::IGNORECASE | Flags::EXTENDED),
            ),
            (B("🦀💎"), "/🦀💎/m", Options::from(Flags::MULTILINE)),
            (B("🦀+💎*"), "/🦀+💎*/mix", Options::from(Flags::ALL_REGEXP_OPTS)),
            (B("铁锈"), "/铁锈/", Options::default()),
            (B("铁锈"), "/铁锈/mix", Options::from(Flags::ALL_REGEXP_OPTS)),
            (
                B("铁锈"),
                "/铁锈/ix",
                Options::from(Flags::IGNORECASE | Flags::EXTENDED),
            ),
            (B("铁锈"), "/铁锈/m", Options::from(Flags::MULTILINE)),
            (B("铁+锈*"), "/铁+锈*/mix", Options::from(Flags::ALL_REGEXP_OPTS)),
            // Invalid UTF-8 patterns are not supported 👇
            // ```
            // (B(b"\xFF\xFE"), r"\xFF\xFE", Options::default()),
            // (B(b"abc \xFF\xFE xyz"), r"abc \xFF\xFE xyz", Options::default()),
            // ```
        ];
        for (pattern, debug, options) in test_cases {
            let regexp = make(pattern, Some(options), Encoding::None);
            let rendered = regexp.debug().collect::<String>();
            assert_eq!(rendered, debug, "Mismatched debug iterator for pattern");
        }
    }
}