artichoke_backend/extn/core/regexp/
pattern.rs

1//! Regexp pattern parsers.
2
3use core::iter;
4
5use bstr::ByteSlice;
6
7use super::{Flags, Options, RegexpOption};
8
9/// A Regexp pattern including its derived `Options`.
10#[derive(Default, Debug, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)]
11pub struct Pattern {
12    pattern: Vec<u8>,
13    options: Options,
14}
15
16impl Pattern {
17    /// Consume self and return the inner pattern byte vector.
18    #[must_use]
19    pub fn into_pattern(self) -> Vec<u8> {
20        self.pattern
21    }
22
23    /// Return the `Options` parsed when constructing this `Pattern`.
24    #[must_use]
25    pub const fn options(&self) -> Options {
26        self.options
27    }
28}
29
30#[inline]
31#[must_use]
32fn build_pattern<T>(pattern: T, options: Options) -> Pattern
33where
34    T: IntoIterator<Item = u8>,
35{
36    let iter = pattern.into_iter();
37    let hint = iter.size_hint();
38    let modifiers = options.as_inline_modifier();
39    let mut parsed = Vec::with_capacity(2 + modifiers.len() + 2 + hint.1.unwrap_or(hint.0));
40    parsed.extend_from_slice(b"(?");
41    parsed.extend_from_slice(modifiers.as_bytes());
42    parsed.push(b':');
43    parsed.extend(iter);
44    parsed.push(b')');
45    Pattern {
46        pattern: parsed,
47        options,
48    }
49}
50
51#[must_use]
52pub fn parse<T: AsRef<[u8]>>(pattern: T, options: Options) -> Pattern {
53    let pattern = pattern.as_ref();
54    let mut chars = pattern.bytes().enumerate();
55
56    match chars.next() {
57        Some((_, b'(')) => {}
58        Some(_) => return build_pattern(pattern.bytes(), options),
59        None => return build_pattern(iter::empty(), options),
60    }
61    match chars.next() {
62        Some((_, b'?')) => {}
63        Some(_) => return build_pattern(pattern.bytes(), options),
64        None => return build_pattern(iter::once(b'('), options),
65    }
66
67    let orignal_options = options;
68    let mut options = options;
69    let mut enable_literal_option = RegexpOption::Enabled;
70
71    for (_, token) in &mut chars {
72        match token {
73            b'-' => enable_literal_option = RegexpOption::Disabled,
74            b'i' => {
75                options.set(Flags::IGNORECASE, enable_literal_option.into());
76            }
77            b'm' => {
78                options.set(Flags::MULTILINE, enable_literal_option.into());
79            }
80            b'x' => {
81                options.set(Flags::EXTENDED, enable_literal_option.into());
82            }
83            b':' => break,
84            _ => return build_pattern(pattern.bytes(), options),
85        }
86    }
87
88    let mut chars = chars.peekable();
89    let cursor = if let Some((idx, _)) = chars.peek() {
90        *idx
91    } else {
92        pattern.len()
93    };
94
95    let mut nest = 1;
96    while let Some((_, token)) = chars.next() {
97        if token == b'(' {
98            nest += 1;
99        } else if token == b')' {
100            nest -= 1;
101            if nest == 0 && chars.next().is_some() {
102                return build_pattern(pattern.bytes(), orignal_options);
103            }
104            break;
105        }
106    }
107
108    let slice = pattern.get(cursor..).unwrap_or_default();
109    let modifiers = options.as_inline_modifier();
110    let mut parsed = Vec::with_capacity(2 + modifiers.len() + 1 + slice.len());
111    parsed.extend(b"(?");
112    parsed.extend(modifiers.as_bytes());
113    parsed.push(b':');
114    parsed.extend_from_slice(slice);
115    Pattern {
116        pattern: parsed,
117        options,
118    }
119}
120
#[cfg(test)]
mod tests {
    use bstr::BString;

    use crate::extn::core::regexp::{Flags, Options};

    /// Run the parser on `source` with `options` and return the expanded
    /// pattern bytes as a `BString` so assertion failures print readable text.
    fn expanded(source: &str, options: Options) -> BString {
        BString::from(super::parse(source, options).into_pattern())
    }

    #[test]
    fn parse_literal_string_pattern() {
        assert_eq!(BString::from("(?-mix:foo)"), expanded("foo", Options::new()));
    }

    // The below tests are extracted from `Regexp#to_s` ruby/specs.

    #[test]
    fn parse_options_if_included_and_expand() {
        let all = Options::from(Flags::ALL_REGEXP_OPTS);
        assert_eq!(BString::from("(?mix:abc)"), expanded("abc", all));
    }

    #[test]
    fn parse_non_included_options_and_embed_expanded_modifiers_prefixed_by_a_minus_sign() {
        let ignore_case = Options::from(Flags::IGNORECASE);
        assert_eq!(BString::from("(?i-mx:abc)"), expanded("abc", ignore_case));
    }

    #[test]
    fn parse_patterns_with_no_enabled_options_and_expand_with_all_modifiers_excluded() {
        assert_eq!(BString::from("(?-mix:abc)"), expanded("abc", Options::new()));
    }

    #[test]
    fn embeds_the_pattern_after_the_options_after_parsing() {
        let all = Options::from(Flags::ALL_REGEXP_OPTS);
        assert_eq!(BString::from("(?mix:ab+c)"), expanded("ab+c", all));
        assert_eq!(BString::from("(?-mix:xyz)"), expanded("xyz", Options::new()));
    }

    #[test]
    fn parse_groups_with_options() {
        assert_eq!(
            BString::from("(?-mix:(?ix:foo)(?m:bar))"),
            expanded("(?ix:foo)(?m:bar)", Options::new()),
        );
        let multiline = Options::from(Flags::MULTILINE);
        assert_eq!(
            BString::from("(?m-ix:(?ix:foo)bar)"),
            expanded("(?ix:foo)bar", multiline),
        );
    }

    #[test]
    fn parse_a_single_group_with_options_as_the_main_regexp() {
        assert_eq!(
            BString::from("(?i-mx:nothing outside this group)"),
            expanded("(?i:nothing outside this group)", Options::new()),
        );
    }

    #[test]
    fn parse_uncaptured_groups() {
        let ignore_case_extended = Options::from(Flags::IGNORECASE | Flags::EXTENDED);
        assert_eq!(
            BString::from("(?ix-m:whatever(?:0d))"),
            expanded("whatever(?:0d)", ignore_case_extended),
        );
    }

    #[test]
    fn parse_lookahead_groups() {
        assert_eq!(BString::from("(?-mix:(?=5))"), expanded("(?=5)", Options::new()));
        assert_eq!(BString::from("(?-mix:(?!5))"), expanded("(?!5)", Options::new()));
    }

    #[test]
    fn parse_to_fully_expanded_options_inline() {
        let ignore_case_extended = Options::from(Flags::IGNORECASE | Flags::EXTENDED);
        assert_eq!(BString::from("(?ix-m:ab+c)"), expanded("ab+c", ignore_case_extended));
        assert_eq!(BString::from("(?i-mx:.)"), expanded("(?i:.)", Options::new()));
        assert_eq!(BString::from("(?-mix:.)"), expanded("(?:.)", Options::new()));
    }

    #[test]
    fn parse_abusive_options_literals() {
        assert_eq!(
            BString::from("(?-mix:)"),
            expanded("(?mmmmix-miiiix:)", Options::new()),
        );
    }
}
230        assert_eq!(BString::from("(?-mix:)"), BString::from(parsed.into_pattern()),);
231    }
232}