artichoke_backend/extn/core/matchdata/
mod.rs

1//! An implementation of [`MatchData`][matchdata] for all [`Regexp`] backends.
2//!
3//! `MatchData` is mostly implemented in Rust with some methods implemented in
4//! Ruby. `MatchData` lazily computes matches by delegating to its underlying
5//! [`Regexp`] instance on access.
6//!
7//! `MatchData` passes all non-skipped [ruby/spec][rubyspec]s.
8//!
9//! [matchdata]: https://ruby-doc.org/core-3.1.2/MatchData.html
10//! [rubyspec]: https://github.com/ruby/spec
11
12use std::collections::HashMap;
13use std::fmt::Write as _;
14use std::ops::{Bound, RangeBounds};
15use std::str;
16
17use bstr::BString;
18use scolapasta_string_escape::format_debug_escape_into;
19
20use crate::convert::{implicitly_convert_to_int, implicitly_convert_to_string};
21use crate::extn::core::regexp::Regexp;
22use crate::extn::core::regexp::backend::NilableString;
23use crate::extn::core::symbol::Symbol;
24use crate::extn::prelude::*;
25use crate::fmt::WriteError;
26
27mod boxing;
28pub(in crate::extn) mod mruby;
29pub(super) mod trampoline;
30
/// The portion of a haystack that a `MatchData` is restricted to, stored as a
/// pair of start and end [`Bound`]s.
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
pub struct Region {
    start: Bound<usize>,
    end: Bound<usize>,
}

impl Region {
    /// Build a `Region` by copying the start and end bounds out of `bounds`.
    fn from_range<R>(bounds: R) -> Self
    where
        R: RangeBounds<usize>,
    {
        let start = bounds.start_bound().cloned();
        let end = bounds.end_bound().cloned();
        Region { start, end }
    }

    /// Index of the first position covered by this region.
    ///
    /// - `Included(n)` starts at `n`.
    /// - `Excluded(n)` starts at `n + 1`, matching how `matched_region` and
    ///   `pre` interpret an excluded start bound.
    /// - `Unbounded` starts at `0`.
    fn offset(&self) -> usize {
        match self.start {
            Bound::Included(bound) => bound,
            // An excluded start means the region begins one past `bound`.
            // Saturate so that `usize::MAX` cannot overflow; such a region is
            // empty for any real haystack anyway.
            Bound::Excluded(bound) => bound.saturating_add(1),
            Bound::Unbounded => 0,
        }
    }
}

impl RangeBounds<usize> for Region {
    /// Borrow the stored start bound.
    fn start_bound(&self) -> Bound<&usize> {
        self.start.as_ref()
    }

    /// Borrow the stored end bound.
    fn end_bound(&self) -> Bound<&usize> {
        self.end.as_ref()
    }
}
65
/// Reference to a single capture group, either by position or by name.
///
/// This is the selector accepted by `MatchData::begin`, `MatchData::end` and
/// `MatchData::offset`.
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
pub enum Capture<'a> {
    /// A capture group referenced by numeric index.
    GroupIndex(i64),
    /// A capture group referenced by name, given as borrowed bytes.
    GroupName(&'a [u8]),
}
71
/// A capture group reference extracted from a `Value`.
///
/// Produced by the `TryConvertMut<&mut Value, CaptureExtract>` implementation
/// for `Artichoke` in this module.
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
pub enum CaptureExtract<'a> {
    /// The value implicitly converted to an integer index.
    GroupIndex(i64),
    /// The value implicitly converted to a string; holds the borrowed bytes.
    GroupName(&'a [u8]),
    /// The value was a `Symbol`.
    Symbol(Symbol),
}
78
79impl<'a> TryConvertMut<&'a mut Value, CaptureExtract<'a>> for Artichoke {
80    type Error = Error;
81
82    fn try_convert_mut(&mut self, value: &'a mut Value) -> Result<CaptureExtract<'a>, Self::Error> {
83        if let Ok(idx) = implicitly_convert_to_int(self, *value) {
84            Ok(CaptureExtract::GroupIndex(idx))
85        } else if let Ok(symbol) = unsafe { Symbol::unbox_from_value(value, self) } {
86            let sym = symbol.id();
87            Ok(CaptureExtract::Symbol(sym.into()))
88        } else {
89            let name = unsafe { implicitly_convert_to_string(self, value)? };
90            Ok(CaptureExtract::GroupName(name))
91        }
92    }
93}
94
/// Selector accepted by `MatchData::capture_at`.
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
pub enum CaptureAt<'a> {
    /// A single capture group referenced by numeric index.
    GroupIndex(i64),
    /// A single capture group referenced by name, given as borrowed bytes.
    GroupName(&'a [u8]),
    /// A run of capture groups: `(start, len)`, where `start` is the index of
    /// the first group and `len` is the maximum number of groups to return.
    StartLen(i64, i64),
}
101
/// Result of looking up one or more capture groups with
/// `MatchData::capture_at`.
#[derive(Debug, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)]
pub enum CaptureMatch {
    /// Nothing was selected: there was no match, or the requested index or
    /// range was out of bounds.
    None,
    /// A single capture group; the inner `Option` is `None` when that group
    /// has no captured bytes.
    Single(Option<Vec<u8>>),
    /// A consecutive run of capture groups, each of which may be `None`.
    Range(Vec<Option<Vec<u8>>>),
}
108
109impl TryConvertMut<CaptureMatch, Value> for Artichoke {
110    type Error = Error;
111
112    fn try_convert_mut(&mut self, value: CaptureMatch) -> Result<Value, Self::Error> {
113        match value {
114            CaptureMatch::None => Ok(Value::nil()),
115            CaptureMatch::Single(capture) => self.try_convert_mut(capture),
116            CaptureMatch::Range(captures) => self.try_convert_mut(captures),
117        }
118    }
119}
120
/// The state backing a `MatchData` object.
///
/// Matches are not stored. Accessors re-run the stored `Regexp` against the
/// matched region of the haystack each time they are called.
#[derive(Debug, Clone, Hash, PartialEq, Eq)]
pub struct MatchData {
    // The full subject string the match was performed against.
    haystack: BString,
    // The pattern that accessors delegate to.
    regexp: Regexp,
    // The part of `haystack` that accessors run the pattern against.
    region: Region,
}
127
128impl MatchData {
129    #[must_use]
130    pub fn new<R>(haystack: Vec<u8>, regexp: Regexp, bounds: R) -> Self
131    where
132        R: RangeBounds<usize>,
133    {
134        let region = Region::from_range(bounds);
135        Self {
136            haystack: haystack.into(),
137            regexp,
138            region,
139        }
140    }
141
142    pub fn set_region<R>(&mut self, bounds: R)
143    where
144        R: RangeBounds<usize>,
145    {
146        self.region = Region::from_range(bounds);
147    }
148
149    #[must_use]
150    pub fn matched_region(&self) -> &[u8] {
151        let matched = match (self.region.start, self.region.end) {
152            (Bound::Included(start), Bound::Included(end)) => self.haystack.get(start..=end),
153            (Bound::Included(start), Bound::Excluded(end)) => self.haystack.get(start..end),
154            (Bound::Included(start), Bound::Unbounded) => self.haystack.get(start..),
155            (Bound::Excluded(start), Bound::Included(end)) => self.haystack.get((start + 1)..=end),
156            (Bound::Excluded(start), Bound::Excluded(end)) => self.haystack.get((start + 1)..end),
157            (Bound::Excluded(start), Bound::Unbounded) => self.haystack.get(start + 1..),
158            (Bound::Unbounded, Bound::Included(end)) => self.haystack.get(..=end),
159            (Bound::Unbounded, Bound::Excluded(end)) => self.haystack.get(..end),
160            (Bound::Unbounded, Bound::Unbounded) => self.haystack.get(..),
161        };
162        matched.unwrap_or_default()
163    }
164
165    #[inline]
166    pub fn begin(&self, capture: Capture<'_>) -> Result<Option<usize>, Error> {
167        if let Some([begin, _]) = self.offset(capture)? {
168            Ok(Some(begin))
169        } else {
170            Ok(None)
171        }
172    }
173
174    pub fn capture_at(&self, at: CaptureAt<'_>) -> Result<CaptureMatch, Error> {
175        let haystack = self.matched_region();
176        let captures = if let Some(captures) = self.regexp.inner().captures(haystack)? {
177            captures
178        } else {
179            return Ok(CaptureMatch::None);
180        };
181        match at {
182            CaptureAt::GroupIndex(index) => match aref::offset_to_index(index, captures.len()) {
183                None => Ok(CaptureMatch::None),
184                Some(idx) => {
185                    if let Some(capture) = captures.into_iter().nth(idx) {
186                        Ok(CaptureMatch::Single(capture))
187                    } else {
188                        Ok(CaptureMatch::None)
189                    }
190                }
191            },
192            CaptureAt::GroupName(name) => {
193                let indexes = self.regexp.inner().capture_indexes_for_name(name)?;
194                if let Some(indexes) = indexes {
195                    let capture = indexes
196                        .iter()
197                        .copied()
198                        .filter_map(|index| captures.get(index).and_then(Option::as_deref))
199                        .last();
200                    Ok(CaptureMatch::Single(capture.map(<[_]>::to_vec)))
201                } else {
202                    let mut message = String::from("undefined group name reference: \"");
203                    format_debug_escape_into(&mut message, name).map_err(WriteError::from)?;
204                    message.push('"');
205                    Err(IndexError::from(message).into())
206                }
207            }
208            CaptureAt::StartLen(start, len) => {
209                if let Ok(len) = usize::try_from(len) {
210                    let start = if let Some(start) = aref::offset_to_index(start, captures.len()) {
211                        start
212                    } else {
213                        return Ok(CaptureMatch::None);
214                    };
215                    let matches = captures.into_iter().skip(start).take(len).collect::<Vec<_>>();
216                    Ok(CaptureMatch::Range(matches))
217                } else {
218                    Ok(CaptureMatch::None)
219                }
220            }
221        }
222    }
223
224    pub fn captures(&self) -> Result<Option<Vec<Option<Vec<u8>>>>, Error> {
225        let haystack = self.matched_region();
226        let captures = self.regexp.inner().captures(haystack)?;
227        if let Some(mut captures) = captures {
228            // Panic safety:
229            //
230            // All Regexp matches are guaranteed to have a zero capture group.
231            captures.remove(0);
232            Ok(Some(captures))
233        } else {
234            Ok(None)
235        }
236    }
237
238    #[inline]
239    pub fn end(&self, capture: Capture<'_>) -> Result<Option<usize>, Error> {
240        if let Some([_, end]) = self.offset(capture)? {
241            Ok(Some(end))
242        } else {
243            Ok(None)
244        }
245    }
246
247    #[inline]
248    pub fn len(&self) -> Result<usize, Error> {
249        let haystack = self.matched_region();
250        self.regexp.inner().captures_len(Some(haystack))
251    }
252
253    #[inline]
254    pub fn named_captures(&self) -> Result<Option<HashMap<Vec<u8>, NilableString>>, Error> {
255        let haystack = self.matched_region();
256        self.regexp.inner().named_captures_for_haystack(haystack)
257    }
258
    /// Capture group names, as reported by the underlying `Regexp`.
    #[inline]
    #[must_use]
    pub fn names(&self) -> Vec<Vec<u8>> {
        self.regexp.names()
    }
264
265    pub fn offset(&self, capture: Capture<'_>) -> Result<Option<[usize; 2]>, Error> {
266        let haystack = self.matched_region();
267        let index = match capture {
268            Capture::GroupIndex(index) => {
269                let captures_len = self.regexp.inner().captures_len(Some(haystack))?;
270                match usize::try_from(index) {
271                    Ok(idx) if idx < captures_len => idx,
272                    _ => {
273                        let mut message = String::new();
274                        write!(&mut message, "index {index} out of matches").map_err(WriteError::from)?;
275                        return Err(IndexError::from(message).into());
276                    }
277                }
278            }
279            Capture::GroupName(name) => {
280                let indexes = self.regexp.inner().capture_indexes_for_name(name)?;
281                if let Some(index) = indexes.and_then(|indexes| indexes.last().copied()) {
282                    index
283                } else {
284                    return Ok(None);
285                }
286            }
287        };
288        if let Some((begin, end)) = self.regexp.inner().pos(haystack, index)? {
289            let begin = if let Some(Ok(haystack)) = haystack.get(..begin).map(str::from_utf8) {
290                haystack.chars().count()
291            } else {
292                haystack.len()
293            };
294            let end = if let Some(Ok(haystack)) = haystack.get(..end).map(str::from_utf8) {
295                haystack.chars().count()
296            } else {
297                haystack.len()
298            };
299            let offset = self.region.offset();
300            Ok(Some([offset + begin, offset + end]))
301        } else {
302            Ok(None)
303        }
304    }
305
306    #[must_use]
307    pub fn pre(&self) -> &[u8] {
308        let pre = match self.region.start {
309            Bound::Included(start) => self.haystack.get(..start),
310            Bound::Excluded(start) => self.haystack.get(..=start),
311            Bound::Unbounded => return &[],
312        };
313        pre.unwrap_or_else(|| {
314            // if start is out of range, the whole haystack is the pre match
315            self.haystack.as_slice()
316        })
317    }
318
319    #[must_use]
320    pub fn post(&self) -> &[u8] {
321        let post = match self.region.end {
322            Bound::Included(end) => self.haystack.get(end + 1..),
323            Bound::Excluded(end) => self.haystack.get(end..),
324            Bound::Unbounded => return &[],
325        };
326        // if end is out of range, there is no post match
327        post.unwrap_or_default()
328    }
329
    /// Borrow the `Regexp` this match data was created with.
    #[inline]
    #[must_use]
    pub fn regexp(&self) -> &Regexp {
        &self.regexp
    }
335
    /// Borrow the full haystack bytes, regardless of the current region.
    #[inline]
    #[must_use]
    pub fn string(&self) -> &[u8] {
        self.haystack.as_slice()
    }
341
342    #[inline]
343    pub fn to_a(&self) -> Result<Option<Vec<NilableString>>, Error> {
344        let haystack = self.matched_region();
345        self.regexp.inner().captures(haystack)
346    }
347
348    #[inline]
349    pub fn to_s(&self) -> Result<Option<&[u8]>, Error> {
350        let haystack = self.matched_region();
351        self.regexp.inner().capture0(haystack)
352    }
353}
354
#[cfg(test)]
mod tests {
    use crate::test::prelude::*;

    // Label handed to `unwrap_or_panic_with_backtrace` alongside each result.
    const SUBJECT: &str = "MatchData";
    // Ruby source of the functional test, embedded at compile time.
    const FUNCTIONAL_TEST: &[u8] = include_bytes!("matchdata_functional_test.rb");

    /// Evaluates the embedded Ruby functional test source and then the
    /// expression `spec`, passing each result to
    /// `unwrap_or_panic_with_backtrace`.
    #[test]
    fn functional() {
        let mut interp = interpreter();
        // Load the functional test source into the interpreter.
        let result = interp.eval(FUNCTIONAL_TEST);
        unwrap_or_panic_with_backtrace(&mut interp, SUBJECT, result);
        // Presumably `spec` is the entry point defined by the Ruby file
        // loaded above - verify against `matchdata_functional_test.rb`.
        let result = interp.eval(b"spec");
        unwrap_or_panic_with_backtrace(&mut interp, SUBJECT, result);
    }
}