artichoke_backend/extn/core/regexp/backend/
onig.rs

1use std::collections::HashMap;
2use std::fmt;
3use std::num::NonZeroUsize;
4use std::rc::Rc;
5use std::str;
6
7use onig::{Regex, Syntax};
8use scolapasta_string_escape::format_debug_escape_into;
9
10use super::{NameToCaptureLocations, NilableString};
11use crate::extn::core::matchdata::MatchData;
12use crate::extn::core::regexp::{self, Config, Encoding, Regexp, RegexpType, Scan, Source};
13use crate::extn::prelude::*;
14
15#[derive(Debug, Clone)]
16pub struct Onig {
17    source: Source,
18    config: Config,
19    encoding: Encoding,
20    regex: Rc<Regex>,
21}
22
23impl Onig {
24    pub fn new(source: Source, config: Config, encoding: Encoding) -> Result<Self, Error> {
25        let pattern = str::from_utf8(config.pattern())
26            .map_err(|_| ArgumentError::with_message("Oniguruma backend for Regexp only supports UTF-8 patterns"))?;
27        let regex = match Regex::with_options(pattern, config.options().into(), Syntax::ruby()) {
28            Ok(regex) => regex,
29            Err(err) if source.is_literal() => return Err(SyntaxError::from(err.description().to_owned()).into()),
30            Err(err) => return Err(RegexpError::from(err.description().to_owned()).into()),
31        };
32        let regexp = Self {
33            source,
34            config,
35            encoding,
36            regex: Rc::new(regex),
37        };
38        Ok(regexp)
39    }
40}
41
42impl fmt::Display for Onig {
43    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
44        let pattern = self.config.pattern();
45        format_debug_escape_into(f, pattern)
46    }
47}
48
49impl RegexpType for Onig {
50    fn box_clone(&self) -> Box<dyn RegexpType> {
51        Box::new(self.clone())
52    }
53
54    fn captures(&self, haystack: &[u8]) -> Result<Option<Vec<NilableString>>, Error> {
55        let haystack = str::from_utf8(haystack)
56            .map_err(|_| ArgumentError::with_message("Oniguruma backend for Regexp only supports UTF-8 haystacks"))?;
57        if let Some(captures) = self.regex.captures(haystack) {
58            let mut result = Vec::with_capacity(captures.len());
59            for capture in captures.iter() {
60                if let Some(capture) = capture {
61                    result.push(Some(capture.into()));
62                } else {
63                    result.push(None);
64                }
65            }
66            Ok(Some(result))
67        } else {
68            Ok(None)
69        }
70    }
71
72    fn capture_indexes_for_name(&self, name: &[u8]) -> Result<Option<Vec<usize>>, Error> {
73        let mut result = None;
74        self.regex.foreach_name(|group, group_indexes| {
75            if name != group.as_bytes() {
76                // Continue searching through named captures.
77                return true;
78            }
79            let mut indexes = Vec::with_capacity(group_indexes.len());
80            for &index in group_indexes {
81                indexes.push(qed::lossless_cast_u32_to_usize!(index));
82            }
83            result = Some(indexes);
84            false
85        });
86        Ok(result)
87    }
88
89    fn captures_len(&self, haystack: Option<&[u8]>) -> Result<usize, Error> {
90        let result = if let Some(haystack) = haystack {
91            let haystack = str::from_utf8(haystack).map_err(|_| {
92                ArgumentError::with_message("Oniguruma backend for Regexp only supports UTF-8 haystacks")
93            })?;
94            self.regex
95                .captures(haystack)
96                .map(|captures| captures.len())
97                .unwrap_or_default()
98        } else {
99            self.regex.captures_len()
100        };
101        Ok(result)
102    }
103
104    fn capture0<'a>(&self, haystack: &'a [u8]) -> Result<Option<&'a [u8]>, Error> {
105        let haystack = str::from_utf8(haystack)
106            .map_err(|_| ArgumentError::with_message("Oniguruma backend for Regexp only supports UTF-8 haystacks"))?;
107        let result = self
108            .regex
109            .captures(haystack)
110            .and_then(|captures| captures.at(0))
111            .map(str::as_bytes);
112        Ok(result)
113    }
114
115    fn debug(&self) -> String {
116        let mut debug = String::from("/");
117        let mut pattern = String::new();
118        // Explicitly suppress this error because `debug` is infallible and
119        // cannot panic.
120        //
121        // In practice this error will never be triggered since the only
122        // fallible call in `format_debug_escape_into` is to `write!` which
123        // never `panic!`s for a `String` formatter, which we are using here.
124        let _ = format_debug_escape_into(&mut pattern, self.source.pattern());
125        debug.push_str(pattern.replace('/', r"\/").as_str());
126        debug.push('/');
127        debug.push_str(self.source.options().as_display_modifier());
128        debug.push_str(self.encoding.as_modifier_str());
129        debug
130    }
131
132    fn source(&self) -> &Source {
133        &self.source
134    }
135
136    fn config(&self) -> &Config {
137        &self.config
138    }
139
140    fn encoding(&self) -> &Encoding {
141        &self.encoding
142    }
143
144    fn inspect(&self) -> Vec<u8> {
145        // pattern length + 2x '/' + mix + encoding
146        let mut inspect = Vec::with_capacity(self.source.pattern().len() + 2 + 4);
147        inspect.push(b'/');
148        if let Ok(pat) = str::from_utf8(self.source.pattern()) {
149            inspect.extend_from_slice(pat.replace('/', r"\/").as_bytes());
150        } else {
151            inspect.extend_from_slice(self.source.pattern());
152        }
153        inspect.push(b'/');
154        inspect.extend_from_slice(self.source.options().as_display_modifier().as_bytes());
155        inspect.extend_from_slice(self.encoding.as_modifier_str().as_bytes());
156        inspect
157    }
158
159    fn string(&self) -> &[u8] {
160        self.config.pattern()
161    }
162
163    fn case_match(&self, interp: &mut Artichoke, haystack: &[u8]) -> Result<bool, Error> {
164        let haystack = str::from_utf8(haystack)
165            .map_err(|_| ArgumentError::with_message("Oniguruma backend for Regexp only supports UTF-8 haystacks"))?;
166        regexp::clear_capture_globals(interp)?;
167        if let Some(captures) = self.regex.captures(haystack) {
168            interp.set_capture_group_globals(captures.len())?;
169            let value = interp.try_convert_mut(captures.at(0))?;
170            interp.set_global_variable(regexp::LAST_MATCHED_STRING, &value)?;
171
172            for group in 0..captures.len() {
173                let value = interp.try_convert_mut(captures.at(group))?;
174                let group = NonZeroUsize::MIN.saturating_add(group);
175                interp.set_global_variable(regexp::nth_match_group(group), &value)?;
176            }
177
178            if let Some(match_pos) = captures.pos(0) {
179                let pre_match = interp.try_convert_mut(&haystack[..match_pos.0])?;
180                let post_match = interp.try_convert_mut(&haystack[match_pos.1..])?;
181                interp.set_global_variable(regexp::STRING_LEFT_OF_MATCH, &pre_match)?;
182                interp.set_global_variable(regexp::STRING_RIGHT_OF_MATCH, &post_match)?;
183            }
184            let matchdata = MatchData::new(haystack.into(), Regexp::from(self.box_clone()), ..);
185            let matchdata = MatchData::alloc_value(matchdata, interp)?;
186            interp.set_global_variable(regexp::LAST_MATCH, &matchdata)?;
187            Ok(true)
188        } else {
189            interp.unset_global_variable(regexp::STRING_LEFT_OF_MATCH)?;
190            interp.unset_global_variable(regexp::STRING_RIGHT_OF_MATCH)?;
191            Ok(false)
192        }
193    }
194
195    fn is_match(&self, haystack: &[u8], pos: Option<i64>) -> Result<bool, Error> {
196        let haystack = str::from_utf8(haystack)
197            .map_err(|_| ArgumentError::with_message("Oniguruma backend for Regexp only supports UTF-8 haystacks"))?;
198        let haystack_char_len = haystack.chars().count();
199        let pos = pos.unwrap_or_default();
200        let pos = if let Some(pos) = aref::offset_to_index(pos, haystack_char_len) {
201            pos
202        } else {
203            return Ok(false);
204        };
205        let offset = haystack.chars().take(pos).map(char::len_utf8).sum();
206        if let Some(haystack) = haystack.get(offset..) {
207            Ok(self.regex.find(haystack).is_some())
208        } else {
209            Ok(false)
210        }
211    }
212
213    fn match_(
214        &self,
215        interp: &mut Artichoke,
216        haystack: &[u8],
217        pos: Option<i64>,
218        block: Option<Block>,
219    ) -> Result<Value, Error> {
220        let haystack = str::from_utf8(haystack)
221            .map_err(|_| ArgumentError::with_message("Oniguruma backend for Regexp only supports UTF-8 haystacks"))?;
222        regexp::clear_capture_globals(interp)?;
223        let haystack_char_len = haystack.chars().count();
224        let pos = pos.unwrap_or_default();
225        let pos = if let Some(pos) = aref::offset_to_index(pos, haystack_char_len) {
226            pos
227        } else {
228            return Ok(Value::nil());
229        };
230        let offset = haystack.chars().take(pos).map(char::len_utf8).sum();
231        let target = if let Some(haystack) = haystack.get(offset..) {
232            haystack
233        } else {
234            interp.unset_global_variable(regexp::LAST_MATCH)?;
235            interp.unset_global_variable(regexp::STRING_LEFT_OF_MATCH)?;
236            interp.unset_global_variable(regexp::STRING_RIGHT_OF_MATCH)?;
237            return Ok(Value::nil());
238        };
239
240        if let Some(captures) = self.regex.captures(target) {
241            interp.set_capture_group_globals(captures.len())?;
242
243            let value = interp.try_convert_mut(captures.at(0))?;
244            interp.set_global_variable(regexp::LAST_MATCHED_STRING, &value)?;
245            for group in 0..captures.len() {
246                let value = interp.try_convert_mut(captures.at(group))?;
247                let group = NonZeroUsize::MIN.saturating_add(group);
248                interp.set_global_variable(regexp::nth_match_group(group), &value)?;
249            }
250
251            let mut matchdata = MatchData::new(haystack.into(), Regexp::from(self.box_clone()), ..);
252            if let Some(match_pos) = captures.pos(0) {
253                let pre_match = interp.try_convert_mut(&target[..match_pos.0])?;
254                let post_match = interp.try_convert_mut(&target[match_pos.1..])?;
255                interp.set_global_variable(regexp::STRING_LEFT_OF_MATCH, &pre_match)?;
256                interp.set_global_variable(regexp::STRING_RIGHT_OF_MATCH, &post_match)?;
257                matchdata.set_region(offset + match_pos.0..offset + match_pos.1);
258            }
259            let data = MatchData::alloc_value(matchdata, interp)?;
260            interp.set_global_variable(regexp::LAST_MATCH, &data)?;
261            if let Some(block) = block {
262                let result = block.yield_arg(interp, &data)?;
263                Ok(result)
264            } else {
265                Ok(data)
266            }
267        } else {
268            interp.unset_global_variable(regexp::LAST_MATCH)?;
269            interp.unset_global_variable(regexp::STRING_LEFT_OF_MATCH)?;
270            interp.unset_global_variable(regexp::STRING_RIGHT_OF_MATCH)?;
271            Ok(Value::nil())
272        }
273    }
274
275    fn match_operator(&self, interp: &mut Artichoke, haystack: &[u8]) -> Result<Option<usize>, Error> {
276        let haystack = str::from_utf8(haystack)
277            .map_err(|_| ArgumentError::with_message("Oniguruma backend for Regexp only supports UTF-8 haystacks"))?;
278        regexp::clear_capture_globals(interp)?;
279        if let Some(captures) = self.regex.captures(haystack) {
280            interp.set_capture_group_globals(captures.len())?;
281
282            let value = interp.try_convert_mut(captures.at(0))?;
283            interp.set_global_variable(regexp::LAST_MATCHED_STRING, &value)?;
284            for group in 0..captures.len() {
285                let value = interp.try_convert_mut(captures.at(group))?;
286                let group = NonZeroUsize::MIN.saturating_add(group);
287                interp.set_global_variable(regexp::nth_match_group(group), &value)?;
288            }
289
290            let matchdata = MatchData::new(haystack.into(), Regexp::from(self.box_clone()), ..);
291            let data = MatchData::alloc_value(matchdata, interp)?;
292            interp.set_global_variable(regexp::LAST_MATCH, &data)?;
293            if let Some(match_pos) = captures.pos(0) {
294                let pre_match = interp.try_convert_mut(&haystack[..match_pos.0])?;
295                let post_match = interp.try_convert_mut(&haystack[match_pos.1..])?;
296                interp.set_global_variable(regexp::STRING_LEFT_OF_MATCH, &pre_match)?;
297                interp.set_global_variable(regexp::STRING_RIGHT_OF_MATCH, &post_match)?;
298                let pos = match_pos.0;
299                Ok(Some(pos))
300            } else {
301                Ok(Some(0))
302            }
303        } else {
304            interp.unset_global_variable(regexp::LAST_MATCH)?;
305            interp.unset_global_variable(regexp::STRING_LEFT_OF_MATCH)?;
306            interp.unset_global_variable(regexp::STRING_RIGHT_OF_MATCH)?;
307            Ok(None)
308        }
309    }
310
311    fn named_captures(&self) -> Result<NameToCaptureLocations, Error> {
312        // Use a Vec of key-value pairs because insertion order matters for spec
313        // compliance.
314        let mut map = vec![];
315        self.regex.foreach_name(|group, group_indexes| {
316            let mut converted = Vec::with_capacity(group_indexes.len());
317            for &index in group_indexes {
318                converted.push(qed::lossless_cast_u32_to_usize!(index));
319            }
320            map.push((group.into(), converted));
321            true
322        });
323        Ok(map)
324    }
325
326    fn named_captures_for_haystack(&self, haystack: &[u8]) -> Result<Option<HashMap<Vec<u8>, NilableString>>, Error> {
327        let haystack = str::from_utf8(haystack)
328            .map_err(|_| ArgumentError::with_message("Oniguruma backend for Regexp only supports UTF-8 haystacks"))?;
329        if let Some(captures) = self.regex.captures(haystack) {
330            let mut map = HashMap::with_capacity(captures.len());
331            self.regex.foreach_name(|group, group_indexes| {
332                for &index in group_indexes.iter().rev() {
333                    if let Some(capture) = captures.at(qed::lossless_cast_u32_to_usize!(index)) {
334                        map.insert(group.into(), Some(capture.into()));
335                        return true;
336                    }
337                }
338                map.insert(group.into(), None);
339                true
340            });
341            Ok(Some(map))
342        } else {
343            Ok(None)
344        }
345    }
346
347    fn names(&self) -> Vec<Vec<u8>> {
348        let mut names = vec![];
349        let mut capture_names = vec![];
350        self.regex.foreach_name(|group, group_indexes| {
351            capture_names.push((group.as_bytes().to_vec(), group_indexes.to_vec()));
352            true
353        });
354        capture_names.sort_by(|left, right| {
355            let left = left.1.iter().min().copied().unwrap_or(u32::MAX);
356            let right = right.1.iter().min().copied().unwrap_or(u32::MAX);
357            left.cmp(&right)
358        });
359        for (name, _) in capture_names {
360            if !names.contains(&name) {
361                names.push(name);
362            }
363        }
364        names
365    }
366
367    fn pos(&self, haystack: &[u8], at: usize) -> Result<Option<(usize, usize)>, Error> {
368        let haystack = str::from_utf8(haystack)
369            .map_err(|_| ArgumentError::with_message("Oniguruma backend for Regexp only supports UTF-8 haystacks"))?;
370        let pos = self.regex.captures(haystack).and_then(|captures| captures.pos(at));
371        Ok(pos)
372    }
373
374    fn scan(&self, interp: &mut Artichoke, haystack: &[u8], block: Option<Block>) -> Result<Scan, Error> {
375        let haystack = str::from_utf8(haystack)
376            .map_err(|_| ArgumentError::with_message("Oniguruma backend for Regexp only supports UTF-8 haystacks"))?;
377        regexp::clear_capture_globals(interp)?;
378        let mut matchdata = MatchData::new(haystack.into(), Regexp::from(self.box_clone()), ..);
379
380        let len = NonZeroUsize::new(self.regex.captures_len());
381        if let Some(block) = block {
382            if let Some(len) = len {
383                interp.set_capture_group_globals(len.get())?;
384
385                let mut iter = self.regex.captures_iter(haystack).peekable();
386                if iter.peek().is_none() {
387                    interp.unset_global_variable(regexp::LAST_MATCH)?;
388                    return Ok(Scan::Haystack);
389                }
390                for captures in iter {
391                    let fullcapture = interp.try_convert_mut(captures.at(0))?;
392                    interp.set_global_variable(regexp::LAST_MATCHED_STRING, &fullcapture)?;
393
394                    let mut groups = Vec::with_capacity(len.get());
395                    for group in 1..=len.get() {
396                        let capture = captures.at(group);
397                        groups.push(capture);
398                        let capture = interp.try_convert_mut(capture)?;
399                        let group = unsafe { NonZeroUsize::new_unchecked(group) };
400                        interp.set_global_variable(regexp::nth_match_group(group), &capture)?;
401                    }
402
403                    let matched = interp.try_convert_mut(groups)?;
404                    if let Some(pos) = captures.pos(0) {
405                        matchdata.set_region(pos.0..pos.1);
406                    }
407                    let data = MatchData::alloc_value(matchdata.clone(), interp)?;
408                    interp.set_global_variable(regexp::LAST_MATCH, &data)?;
409                    block.yield_arg(interp, &matched)?;
410                    interp.set_global_variable(regexp::LAST_MATCH, &data)?;
411                }
412            } else {
413                let mut iter = self.regex.find_iter(haystack).peekable();
414                if iter.peek().is_none() {
415                    interp.unset_global_variable(regexp::LAST_MATCH)?;
416                    return Ok(Scan::Haystack);
417                }
418                for pos in iter {
419                    let scanned = &haystack[pos.0..pos.1];
420                    let matched = interp.try_convert_mut(scanned)?;
421                    matchdata.set_region(pos.0..pos.1);
422                    let data = MatchData::alloc_value(matchdata.clone(), interp)?;
423                    interp.set_global_variable(regexp::LAST_MATCH, &data)?;
424                    block.yield_arg(interp, &matched)?;
425                    interp.set_global_variable(regexp::LAST_MATCH, &data)?;
426                }
427            }
428            Ok(Scan::Haystack)
429        } else {
430            let mut last_pos = (0, 0);
431            if let Some(len) = len {
432                interp.set_capture_group_globals(len.get())?;
433
434                let mut collected = vec![];
435                let mut iter = self.regex.captures_iter(haystack).peekable();
436                if iter.peek().is_none() {
437                    interp.unset_global_variable(regexp::LAST_MATCH)?;
438                    return Ok(Scan::Collected(Vec::new()));
439                }
440                for captures in iter {
441                    let mut groups = Vec::with_capacity(len.get());
442                    for group in 1..=len.get() {
443                        groups.push(captures.at(group).map(str::as_bytes).map(Vec::from));
444                    }
445
446                    if let Some(pos) = captures.pos(0) {
447                        last_pos = pos;
448                    }
449                    collected.push(groups);
450                }
451                matchdata.set_region(last_pos.0..last_pos.1);
452                let data = MatchData::alloc_value(matchdata, interp)?;
453                interp.set_global_variable(regexp::LAST_MATCH, &data)?;
454
455                let mut iter = collected.iter().enumerate();
456                if let Some((_, fullcapture)) = iter.next() {
457                    let fullcapture = interp.try_convert_mut(fullcapture.as_slice())?;
458                    interp.set_global_variable(regexp::LAST_MATCHED_STRING, &fullcapture)?;
459                }
460                for (group, capture) in iter {
461                    let capture = interp.try_convert_mut(capture.as_slice())?;
462                    let group = unsafe { NonZeroUsize::new_unchecked(group) };
463                    interp.set_global_variable(regexp::nth_match_group(group), &capture)?;
464                }
465                Ok(Scan::Collected(collected))
466            } else {
467                let mut collected = vec![];
468                let mut iter = self.regex.find_iter(haystack).peekable();
469                if iter.peek().is_none() {
470                    interp.unset_global_variable(regexp::LAST_MATCH)?;
471                    return Ok(Scan::Patterns(Vec::new()));
472                }
473                for pos in iter {
474                    let scanned = &haystack[pos.0..pos.1];
475                    last_pos = pos;
476                    collected.push(Vec::from(scanned.as_bytes()));
477                }
478                matchdata.set_region(last_pos.0..last_pos.1);
479                let data = MatchData::alloc_value(matchdata, interp)?;
480                interp.set_global_variable(regexp::LAST_MATCH, &data)?;
481
482                let last_matched = collected.last().map(Vec::as_slice);
483                let last_matched = interp.try_convert_mut(last_matched)?;
484                interp.set_global_variable(regexp::LAST_MATCHED_STRING, &last_matched)?;
485                Ok(Scan::Patterns(collected))
486            }
487        }
488    }
489}