artichoke_backend/extn/core/regexp/backend/regex/
utf8.rs

1use std::collections::HashMap;
2use std::fmt;
3use std::num::NonZeroUsize;
4use std::str;
5
6use regex::{Match, Regex, RegexBuilder};
7use scolapasta_string_escape::format_debug_escape_into;
8
9use super::super::{NameToCaptureLocations, NilableString};
10use crate::extn::core::matchdata::MatchData;
11use crate::extn::core::regexp::{self, Config, Encoding, Regexp, RegexpType, Scan, Source};
12use crate::extn::prelude::*;
13
14#[derive(Debug, Clone)]
15pub struct Utf8 {
16    source: Source,
17    config: Config,
18    encoding: Encoding,
19    regex: Regex,
20}
21
22impl Utf8 {
23    pub fn new(source: Source, config: Config, encoding: Encoding) -> Result<Self, Error> {
24        let pattern = str::from_utf8(config.pattern()).map_err(|_| {
25            ArgumentError::with_message("regex crate utf8 backend for Regexp only supports UTF-8 patterns")
26        })?;
27
28        let mut builder = RegexBuilder::new(pattern);
29        builder.case_insensitive(config.options().ignore_case().into());
30        builder.multi_line(config.options().multiline().into());
31        builder.ignore_whitespace(config.options().extended().into());
32
33        let regex = match builder.build() {
34            Ok(regex) => regex,
35            Err(err) if source.is_literal() => {
36                return Err(SyntaxError::from(err.to_string()).into());
37            }
38            Err(err) => return Err(RegexpError::from(err.to_string()).into()),
39        };
40        let regexp = Self {
41            source,
42            config,
43            encoding,
44            regex,
45        };
46        Ok(regexp)
47    }
48}
49
50impl fmt::Display for Utf8 {
51    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
52        let pattern = self.config.pattern();
53        format_debug_escape_into(f, pattern)
54    }
55}
56
57impl RegexpType for Utf8 {
58    fn box_clone(&self) -> Box<dyn RegexpType> {
59        Box::new(self.clone())
60    }
61
62    fn captures(&self, haystack: &[u8]) -> Result<Option<Vec<NilableString>>, Error> {
63        let haystack = str::from_utf8(haystack).map_err(|_| {
64            ArgumentError::with_message("regex crate utf8 backend for Regexp only supports UTF-8 haystacks")
65        })?;
66        if let Some(captures) = self.regex.captures(haystack) {
67            let mut result = Vec::with_capacity(captures.len());
68            for capture in captures.iter() {
69                if let Some(capture) = capture {
70                    result.push(Some(capture.as_str().into()));
71                } else {
72                    result.push(None);
73                }
74            }
75            Ok(Some(result))
76        } else {
77            Ok(None)
78        }
79    }
80
81    fn capture_indexes_for_name(&self, name: &[u8]) -> Result<Option<Vec<usize>>, Error> {
82        let mut result = vec![];
83        for (index, group) in self.regex.capture_names().enumerate() {
84            if Some(name) == group.map(str::as_bytes) {
85                result.push(index);
86            }
87        }
88        if result.is_empty() { Ok(None) } else { Ok(Some(result)) }
89    }
90
91    fn captures_len(&self, haystack: Option<&[u8]>) -> Result<usize, Error> {
92        let result = if let Some(haystack) = haystack {
93            let haystack = str::from_utf8(haystack).map_err(|_| {
94                ArgumentError::with_message("regex crate utf8 backend for Regexp only supports UTF-8 haystacks")
95            })?;
96            self.regex
97                .captures(haystack)
98                .map(|captures| captures.len())
99                .unwrap_or_default()
100        } else {
101            self.regex.captures_len()
102        };
103        Ok(result)
104    }
105
106    fn capture0<'a>(&self, haystack: &'a [u8]) -> Result<Option<&'a [u8]>, Error> {
107        let haystack = str::from_utf8(haystack).map_err(|_| {
108            ArgumentError::with_message("regex crate utf8 backend for Regexp only supports UTF-8 haystacks")
109        })?;
110        let result = self
111            .regex
112            .captures(haystack)
113            .and_then(|captures| captures.get(0))
114            .as_ref()
115            .map(Match::as_str)
116            .map(str::as_bytes);
117        Ok(result)
118    }
119
120    fn debug(&self) -> String {
121        let mut debug = String::from("/");
122        let mut pattern = String::new();
123        // Explicitly suppress this error because `debug` is infallible and
124        // cannot panic.
125        //
126        // In practice this error will never be triggered since the only
127        // fallible call in `format_debug_escape_into` is to `write!` which
128        // never `panic!`s for a `String` formatter, which we are using here.
129        let _ = format_debug_escape_into(&mut pattern, self.source.pattern());
130        debug.push_str(pattern.replace('/', r"\/").as_str());
131        debug.push('/');
132        debug.push_str(self.source.options().as_display_modifier());
133        debug.push_str(self.encoding.as_modifier_str());
134        debug
135    }
136
137    fn source(&self) -> &Source {
138        &self.source
139    }
140
141    fn config(&self) -> &Config {
142        &self.config
143    }
144
145    fn encoding(&self) -> &Encoding {
146        &self.encoding
147    }
148
149    fn inspect(&self) -> Vec<u8> {
150        // pattern length + 2x '/' + mix + encoding
151        let mut inspect = Vec::with_capacity(self.source.pattern().len() + 2 + 4);
152        inspect.push(b'/');
153        if let Ok(pat) = str::from_utf8(self.source.pattern()) {
154            inspect.extend_from_slice(pat.replace('/', r"\/").as_bytes());
155        } else {
156            inspect.extend_from_slice(self.source.pattern());
157        }
158        inspect.push(b'/');
159        inspect.extend_from_slice(self.source.options().as_display_modifier().as_bytes());
160        inspect.extend_from_slice(self.encoding.as_modifier_str().as_bytes());
161        inspect
162    }
163
164    fn string(&self) -> &[u8] {
165        self.config.pattern()
166    }
167
168    fn case_match(&self, interp: &mut Artichoke, haystack: &[u8]) -> Result<bool, Error> {
169        let haystack = str::from_utf8(haystack).map_err(|_| {
170            ArgumentError::with_message("regex crate utf8 backend for Regexp only supports UTF-8 haystack")
171        })?;
172        regexp::clear_capture_globals(interp)?;
173        if let Some(captures) = self.regex.captures(haystack) {
174            // per the [docs] for `captures.len()`:
175            //
176            // > This is always at least 1, since every regex has at least one
177            // > capture group that corresponds to the full match.
178            //
179            // [docs]: https://docs.rs/regex/1.3.4/regex/struct.Captures.html#method.len
180            interp.set_capture_group_globals(captures.len().checked_sub(1).unwrap_or_default())?;
181
182            let fullmatch = captures.get(0).as_ref().map(Match::as_str).map(str::as_bytes);
183            let value = interp.try_convert_mut(fullmatch)?;
184            interp.set_global_variable(regexp::LAST_MATCHED_STRING, &value)?;
185            for group in 1..captures.len() {
186                let capture = captures.get(group).as_ref().map(Match::as_str).map(str::as_bytes);
187                let value = interp.try_convert_mut(capture)?;
188                let group = unsafe { NonZeroUsize::new_unchecked(group) };
189                interp.set_global_variable(regexp::nth_match_group(group), &value)?;
190            }
191
192            if let Some(match_pos) = captures.get(0) {
193                let pre_match = interp.try_convert_mut(&haystack[..match_pos.start()])?;
194                let post_match = interp.try_convert_mut(&haystack[match_pos.end()..])?;
195                interp.set_global_variable(regexp::STRING_LEFT_OF_MATCH, &pre_match)?;
196                interp.set_global_variable(regexp::STRING_RIGHT_OF_MATCH, &post_match)?;
197            }
198            let matchdata = MatchData::new(haystack.into(), Regexp::from(self.box_clone()), ..);
199            let matchdata = MatchData::alloc_value(matchdata, interp)?;
200            interp.set_global_variable(regexp::LAST_MATCH, &matchdata)?;
201            Ok(true)
202        } else {
203            interp.unset_global_variable(regexp::STRING_LEFT_OF_MATCH)?;
204            interp.unset_global_variable(regexp::STRING_RIGHT_OF_MATCH)?;
205            Ok(false)
206        }
207    }
208
209    fn is_match(&self, haystack: &[u8], pos: Option<i64>) -> Result<bool, Error> {
210        let haystack = str::from_utf8(haystack).map_err(|_| {
211            ArgumentError::with_message("regex crate utf8 backend for Regexp only supports UTF-8 haystack")
212        })?;
213        let haystack_char_len = haystack.chars().count();
214        let pos = pos.unwrap_or_default();
215        let pos = if let Some(pos) = aref::offset_to_index(pos, haystack_char_len) {
216            pos
217        } else {
218            return Ok(false);
219        };
220        let offset = haystack.chars().take(pos).map(char::len_utf8).sum();
221        if let Some(haystack) = haystack.get(offset..) {
222            Ok(self.regex.find(haystack).is_some())
223        } else {
224            Ok(false)
225        }
226    }
227
228    fn match_(
229        &self,
230        interp: &mut Artichoke,
231        haystack: &[u8],
232        pos: Option<i64>,
233        block: Option<Block>,
234    ) -> Result<Value, Error> {
235        let haystack = str::from_utf8(haystack).map_err(|_| {
236            ArgumentError::with_message("regex crate utf8 backend for Regexp only supports UTF-8 haystacks")
237        })?;
238        regexp::clear_capture_globals(interp)?;
239        let haystack_char_len = haystack.chars().count();
240        let pos = pos.unwrap_or_default();
241        let pos = if let Some(pos) = aref::offset_to_index(pos, haystack_char_len) {
242            pos
243        } else {
244            return Ok(Value::nil());
245        };
246        let offset = haystack.chars().take(pos).map(char::len_utf8).sum();
247        let target = if let Some(haystack) = haystack.get(offset..) {
248            haystack
249        } else {
250            interp.unset_global_variable(regexp::LAST_MATCH)?;
251            interp.unset_global_variable(regexp::STRING_LEFT_OF_MATCH)?;
252            interp.unset_global_variable(regexp::STRING_RIGHT_OF_MATCH)?;
253            return Ok(Value::nil());
254        };
255        if let Some(captures) = self.regex.captures(target) {
256            // per the [docs] for `captures.len()`:
257            //
258            // > This is always at least 1, since every regex has at least one
259            // > capture group that corresponds to the full match.
260            //
261            // [docs]: https://docs.rs/regex/1.3.4/regex/struct.Captures.html#method.len
262            interp.set_capture_group_globals(captures.len().checked_sub(1).unwrap_or_default())?;
263
264            let fullmatch = captures.get(0).as_ref().map(Match::as_str).map(str::as_bytes);
265            let value = interp.try_convert_mut(fullmatch)?;
266            interp.set_global_variable(regexp::LAST_MATCHED_STRING, &value)?;
267            for group in 1..captures.len() {
268                let capture = captures.get(group).as_ref().map(Match::as_str).map(str::as_bytes);
269                let value = interp.try_convert_mut(capture)?;
270                let group = unsafe { NonZeroUsize::new_unchecked(group) };
271                interp.set_global_variable(regexp::nth_match_group(group), &value)?;
272            }
273
274            let mut matchdata = MatchData::new(haystack.into(), Regexp::from(self.box_clone()), ..);
275            if let Some(match_pos) = captures.get(0) {
276                let pre_match = interp.try_convert_mut(&target[..match_pos.start()])?;
277                let post_match = interp.try_convert_mut(&target[match_pos.end()..])?;
278                interp.set_global_variable(regexp::STRING_LEFT_OF_MATCH, &pre_match)?;
279                interp.set_global_variable(regexp::STRING_RIGHT_OF_MATCH, &post_match)?;
280                matchdata.set_region(offset + match_pos.start()..offset + match_pos.end());
281            }
282            let data = MatchData::alloc_value(matchdata, interp)?;
283            interp.set_global_variable(regexp::LAST_MATCH, &data)?;
284            if let Some(block) = block {
285                let result = block.yield_arg(interp, &data)?;
286                Ok(result)
287            } else {
288                Ok(data)
289            }
290        } else {
291            interp.unset_global_variable(regexp::LAST_MATCH)?;
292            interp.unset_global_variable(regexp::STRING_LEFT_OF_MATCH)?;
293            interp.unset_global_variable(regexp::STRING_RIGHT_OF_MATCH)?;
294            Ok(Value::nil())
295        }
296    }
297
298    fn match_operator(&self, interp: &mut Artichoke, haystack: &[u8]) -> Result<Option<usize>, Error> {
299        let haystack = str::from_utf8(haystack).map_err(|_| {
300            ArgumentError::with_message("regex crate utf8 backend for Regexp only supports UTF-8 haystacks")
301        })?;
302        regexp::clear_capture_globals(interp)?;
303        if let Some(captures) = self.regex.captures(haystack) {
304            // per the [docs] for `captures.len()`:
305            //
306            // > This is always at least 1, since every regex has at least one
307            // > capture group that corresponds to the full match.
308            //
309            // [docs]: https://docs.rs/regex/1.3.4/regex/struct.Captures.html#method.len
310            interp.set_capture_group_globals(captures.len().checked_sub(1).unwrap_or_default())?;
311
312            let fullmatch = captures.get(0).as_ref().map(Match::as_str).map(str::as_bytes);
313            let value = interp.try_convert_mut(fullmatch)?;
314            interp.set_global_variable(regexp::LAST_MATCHED_STRING, &value)?;
315            for group in 1..captures.len() {
316                let capture = captures.get(group).as_ref().map(Match::as_str).map(str::as_bytes);
317                let value = interp.try_convert_mut(capture)?;
318                let group = unsafe { NonZeroUsize::new_unchecked(group) };
319                interp.set_global_variable(regexp::nth_match_group(group), &value)?;
320            }
321
322            let matchdata = MatchData::new(haystack.into(), Regexp::from(self.box_clone()), ..);
323            let data = MatchData::alloc_value(matchdata, interp)?;
324            interp.set_global_variable(regexp::LAST_MATCH, &data)?;
325            if let Some(match_pos) = captures.get(0) {
326                let pre_match = interp.try_convert_mut(&haystack[..match_pos.start()])?;
327                let post_match = interp.try_convert_mut(&haystack[match_pos.end()..])?;
328                interp.set_global_variable(regexp::STRING_LEFT_OF_MATCH, &pre_match)?;
329                interp.set_global_variable(regexp::STRING_RIGHT_OF_MATCH, &post_match)?;
330                let pos = match_pos.start();
331                Ok(Some(pos))
332            } else {
333                Ok(Some(0))
334            }
335        } else {
336            interp.unset_global_variable(regexp::LAST_MATCH)?;
337            interp.unset_global_variable(regexp::STRING_LEFT_OF_MATCH)?;
338            interp.unset_global_variable(regexp::STRING_RIGHT_OF_MATCH)?;
339            Ok(None)
340        }
341    }
342
343    fn named_captures(&self) -> Result<NameToCaptureLocations, Error> {
344        // Use a Vec of key-value pairs because insertion order matters for spec
345        // compliance.
346        let mut map = vec![];
347        for group in self.regex.capture_names().flatten() {
348            if let Some(indexes) = self.capture_indexes_for_name(group.as_bytes())? {
349                map.push((group.into(), indexes));
350            }
351        }
352        Ok(map)
353    }
354
355    fn named_captures_for_haystack(&self, haystack: &[u8]) -> Result<Option<HashMap<Vec<u8>, NilableString>>, Error> {
356        let haystack = str::from_utf8(haystack).map_err(|_| {
357            ArgumentError::with_message("regex crate utf8 backend for Regexp only supports UTF-8 haystacks")
358        })?;
359        if let Some(captures) = self.regex.captures(haystack) {
360            let mut map = HashMap::with_capacity(captures.len());
361            for (group, group_indexes) in self.named_captures()? {
362                let capture = group_indexes
363                    .iter()
364                    .rev()
365                    .copied()
366                    .find_map(|index| captures.get(index));
367                if let Some(capture) = capture {
368                    map.insert(group, Some(capture.as_str().into()));
369                } else {
370                    map.insert(group, None);
371                }
372            }
373            Ok(Some(map))
374        } else {
375            Ok(None)
376        }
377    }
378
379    fn names(&self) -> Vec<Vec<u8>> {
380        let mut names = vec![];
381        let mut capture_names = self.named_captures().unwrap_or_default();
382        capture_names.sort_by(|left, right| {
383            let left = left.1.iter().min().copied().unwrap_or(usize::MAX);
384            let right = right.1.iter().min().copied().unwrap_or(usize::MAX);
385            left.cmp(&right)
386        });
387        for (name, _) in capture_names {
388            if !names.contains(&name) {
389                names.push(name);
390            }
391        }
392        names
393    }
394
395    fn pos(&self, haystack: &[u8], at: usize) -> Result<Option<(usize, usize)>, Error> {
396        let haystack = str::from_utf8(haystack).map_err(|_| {
397            ArgumentError::with_message("regex crate utf8 backend for Regexp only supports UTF-8 haystacks")
398        })?;
399        let pos = self
400            .regex
401            .captures(haystack)
402            .and_then(|captures| captures.get(at))
403            .map(|match_pos| (match_pos.start(), match_pos.end()));
404        Ok(pos)
405    }
406
407    fn scan(&self, interp: &mut Artichoke, haystack: &[u8], block: Option<Block>) -> Result<Scan, Error> {
408        let haystack = str::from_utf8(haystack).map_err(|_| {
409            ArgumentError::with_message("regex crate utf8 backend for Regexp only supports UTF-8 haystacks")
410        })?;
411        regexp::clear_capture_globals(interp)?;
412        let mut matchdata = MatchData::new(haystack.into(), Regexp::from(self.box_clone()), ..);
413
414        // regex crate always includes the zero group in the captures length.
415        let len = self.regex.captures_len().checked_sub(1);
416        interp.set_capture_group_globals(len.unwrap_or_default())?;
417        let len = len.and_then(NonZeroUsize::new);
418        if let Some(block) = block {
419            if let Some(len) = len {
420                let mut iter = self.regex.captures_iter(haystack).peekable();
421                if iter.peek().is_none() {
422                    interp.unset_global_variable(regexp::LAST_MATCH)?;
423                    return Ok(Scan::Haystack);
424                }
425                for captures in iter {
426                    let matched = captures.get(0).as_ref().map(Match::as_str).map(str::as_bytes);
427                    let capture = interp.try_convert_mut(matched)?;
428                    interp.set_global_variable(regexp::LAST_MATCHED_STRING, &capture)?;
429
430                    let mut groups = Vec::with_capacity(len.get() - 1);
431                    for group in 1..=len.get() {
432                        let matched = captures.get(group).as_ref().map(Match::as_str).map(str::as_bytes);
433                        let capture = interp.try_convert_mut(matched)?;
434                        let group = unsafe { NonZeroUsize::new_unchecked(group) };
435                        interp.set_global_variable(regexp::nth_match_group(group), &capture)?;
436                        groups.push(matched);
437                    }
438
439                    let matched = interp.try_convert_mut(groups)?;
440                    if let Some(pos) = captures.get(0) {
441                        matchdata.set_region(pos.start()..pos.end());
442                    }
443                    let data = MatchData::alloc_value(matchdata.clone(), interp)?;
444                    interp.set_global_variable(regexp::LAST_MATCH, &data)?;
445                    block.yield_arg(interp, &matched)?;
446                    interp.set_global_variable(regexp::LAST_MATCH, &data)?;
447                }
448            } else {
449                let mut iter = self.regex.find_iter(haystack).peekable();
450                if iter.peek().is_none() {
451                    interp.unset_global_variable(regexp::LAST_MATCH)?;
452                    return Ok(Scan::Haystack);
453                }
454                for pos in iter {
455                    let scanned = &haystack[pos.start()..pos.end()];
456                    let matched = interp.try_convert_mut(scanned)?;
457                    matchdata.set_region(pos.start()..pos.end());
458                    let data = MatchData::alloc_value(matchdata.clone(), interp)?;
459                    interp.set_global_variable(regexp::LAST_MATCH, &data)?;
460                    block.yield_arg(interp, &matched)?;
461                    interp.set_global_variable(regexp::LAST_MATCH, &data)?;
462                }
463            }
464            Ok(Scan::Haystack)
465        } else {
466            let mut last_pos = (0, 0);
467            if let Some(len) = len {
468                let mut collected = vec![];
469                let mut iter = self.regex.captures_iter(haystack).peekable();
470                if iter.peek().is_none() {
471                    interp.unset_global_variable(regexp::LAST_MATCH)?;
472                    return Ok(Scan::Collected(Vec::new()));
473                }
474                for captures in iter {
475                    let mut groups = Vec::with_capacity(len.get() - 1);
476                    for group in 1..=len.get() {
477                        let matched = captures
478                            .get(group)
479                            .as_ref()
480                            .map(Match::as_str)
481                            .map(str::as_bytes)
482                            .map(Vec::from);
483                        groups.push(matched);
484                    }
485
486                    if let Some(pos) = captures.get(0) {
487                        last_pos = (pos.start(), pos.end());
488                    }
489                    collected.push(groups);
490                }
491                matchdata.set_region(last_pos.0..last_pos.1);
492                let data = MatchData::alloc_value(matchdata, interp)?;
493                interp.set_global_variable(regexp::LAST_MATCH, &data)?;
494                let mut iter = collected.iter().enumerate();
495                if let Some((_, fullcapture)) = iter.next() {
496                    let fullcapture = interp.try_convert_mut(fullcapture.as_slice())?;
497                    interp.set_global_variable(regexp::LAST_MATCHED_STRING, &fullcapture)?;
498                }
499                for (group, capture) in iter {
500                    let capture = interp.try_convert_mut(capture.as_slice())?;
501                    let group = unsafe { NonZeroUsize::new_unchecked(group) };
502                    interp.set_global_variable(regexp::nth_match_group(group), &capture)?;
503                }
504                Ok(Scan::Collected(collected))
505            } else {
506                let mut collected = vec![];
507                let mut iter = self.regex.find_iter(haystack).peekable();
508                if iter.peek().is_none() {
509                    interp.unset_global_variable(regexp::LAST_MATCH)?;
510                    return Ok(Scan::Patterns(Vec::new()));
511                }
512                for pos in iter {
513                    let scanned = &haystack[pos.start()..pos.end()];
514                    last_pos = (pos.start(), pos.end());
515                    collected.push(Vec::from(scanned.as_bytes()));
516                }
517                matchdata.set_region(last_pos.0..last_pos.1);
518                let data = MatchData::alloc_value(matchdata, interp)?;
519                interp.set_global_variable(regexp::LAST_MATCH, &data)?;
520                let last_matched = collected.last().map(Vec::as_slice);
521                let last_matched = interp.try_convert_mut(last_matched)?;
522                interp.set_global_variable(regexp::LAST_MATCHED_STRING, &last_matched)?;
523                Ok(Scan::Patterns(collected))
524            }
525        }
526    }
527}