artichoke_backend/extn/core/regexp/
mod.rs

1//! [ruby/spec](https://github.com/ruby/spec) compliant implementation of
2//! [`Regexp`](https://ruby-doc.org/core-3.1.2/Regexp.html).
3//!
4//! Each function on `Regexp` is implemented as its own module which contains
5//! the `Args` struct for invoking the function.
6
7use std::collections::hash_map::DefaultHasher;
8use std::hash::{Hash, Hasher};
9use std::num::NonZeroUsize;
10use std::str;
11
12#[doc(inline)]
13#[allow(unused_imports)]
14pub use spinoso_regexp::{
15    Config, Encoding, Flags, HIGHEST_MATCH_GROUP, InvalidEncodingError, LAST_MATCH, LAST_MATCHED_STRING, Options,
16    RegexpError, RegexpOption, STRING_LEFT_OF_MATCH, STRING_RIGHT_OF_MATCH, Source,
17    nth_match_group_bytes as nth_match_group,
18};
19
20use crate::convert::implicitly_convert_to_string;
21use crate::extn::core::array::Array;
22use crate::extn::core::symbol::Symbol;
23use crate::extn::prelude::*;
24
25pub mod backend;
26mod boxing;
27pub mod enc;
28pub(in crate::extn) mod mruby;
29pub mod opts;
30pub mod pattern;
31pub mod syntax;
32pub(super) mod trampoline;
33
34#[cfg(feature = "core-regexp-oniguruma")]
35use backend::onig::Onig;
36use backend::regex::utf8::Utf8;
37#[allow(unused_imports)]
38pub use backend::{NilableString, RegexpType, Scan};
39
/// Pairs of a capture group name (as bytes) and the `i64` indexes reported
/// for that name; this is the shape returned by `Regexp::named_captures`.
pub type NameToCaptureLocations = Vec<(Vec<u8>, Vec<i64>)>;
41
42pub fn clear_capture_globals(interp: &mut Artichoke) -> Result<(), Error> {
43    let mut idx = interp.capture_group_globals()?;
44    while let Some(group) = NonZeroUsize::new(idx) {
45        interp.unset_global_variable(nth_match_group(group))?;
46        idx -= 1;
47    }
48    interp.clear_regexp()?;
49    Ok(())
50}
51
52#[derive(Debug, Clone)]
53pub struct Regexp(Box<dyn RegexpType>);
54
55impl Hash for Regexp {
56    fn hash<H: Hasher>(&self, state: &mut H) {
57        self.0.hash(state);
58    }
59}
60
61impl PartialEq for Regexp {
62    fn eq(&self, other: &Self) -> bool {
63        self.inner() == other.inner()
64    }
65}
66
67impl Eq for Regexp {}
68
69impl TryFrom<Vec<u8>> for Regexp {
70    type Error = Error;
71
72    fn try_from(pattern: Vec<u8>) -> Result<Self, Self::Error> {
73        let config = Config::with_pattern_and_options(pattern, Options::new());
74        let source = Source::from(config.clone());
75        Self::new(source, config, Encoding::new())
76    }
77}
78
79impl Regexp {
80    pub fn new(source: Source, config: Config, encoding: Encoding) -> Result<Self, Error> {
81        #[cfg(feature = "core-regexp-oniguruma")]
82        {
83            // Patterns must be parsable by Oniguruma.
84            let onig = Onig::new(source.clone(), config.clone(), encoding)?;
85            if let Ok(regex) = Utf8::new(source, config, encoding) {
86                Ok(Self(Box::new(regex)))
87            } else {
88                Ok(Self(Box::new(onig)))
89            }
90        }
91        #[cfg(not(feature = "core-regexp-oniguruma"))]
92        {
93            let regex = Utf8::new(source, config, encoding)?;
94            Ok(Self(Box::new(regex)))
95        }
96    }
97
98    pub fn initialize(
99        interp: &mut Artichoke,
100        mut pattern: Value,
101        options: Option<Options>,
102        encoding: Option<Encoding>,
103    ) -> Result<Self, Error> {
104        let source = if let Ok(regexp) = unsafe { Self::unbox_from_value(&mut pattern, interp) } {
105            if options.is_some() || encoding.is_some() {
106                interp.warn(&b"flags ignored when initializing from Regexp"[..])?;
107            }
108            regexp.inner().source().clone()
109        } else {
110            // SAFETY: `bytes` is converted to an owned byte vec before any
111            // additional operations are run on the interpreter which might
112            // trigger a garbage collection of `pattern` and its backing
113            // `RString*`.
114            let bytes = unsafe { implicitly_convert_to_string(interp, &mut pattern)? };
115            Source::with_pattern_and_options(bytes.to_vec(), options.unwrap_or_default())
116        };
117        let pattern = pattern::parse(source.pattern(), source.options());
118        let options = pattern.options();
119        let config = Config::with_pattern_and_options(pattern.into_pattern(), options);
120        Self::new(source, config, encoding.unwrap_or_default())
121    }
122
123    pub fn escape(pattern: &[u8]) -> Result<String, Error> {
124        if let Ok(pattern) = str::from_utf8(pattern) {
125            Ok(syntax::escape(pattern))
126        } else {
127            Err(ArgumentError::with_message("invalid encoding (non UTF-8)").into())
128        }
129    }
130
131    pub fn union<T>(interp: &mut Artichoke, patterns: T) -> Result<Self, Error>
132    where
133        T: IntoIterator<Item = Value>,
134    {
135        fn extract_pattern(interp: &mut Artichoke, value: &mut Value) -> Result<Vec<u8>, Error> {
136            if let Ok(regexp) = unsafe { Regexp::unbox_from_value(value, interp) } {
137                let source = regexp.inner().config();
138                Ok(source.pattern().to_vec())
139            } else {
140                // SAFETY: `bytes` is converted to an owned `String` before any
141                // additional operations are run on the interpreter which might
142                // trigger a garbage collection of `pattern` and its backing
143                // `RString*`.
144                let bytes = unsafe { implicitly_convert_to_string(interp, value)? };
145                if let Ok(pattern) = str::from_utf8(bytes) {
146                    Ok(syntax::escape(pattern).into_bytes())
147                } else {
148                    Err(ArgumentError::with_message("invalid encoding (non UTF-8)").into())
149                }
150            }
151        }
152        let mut iter = patterns.into_iter();
153        let pattern = if let Some(mut first) = iter.next() {
154            if let Some(mut second) = iter.next() {
155                let mut patterns = vec![
156                    extract_pattern(interp, &mut first)?,
157                    extract_pattern(interp, &mut second)?,
158                ];
159                for mut value in iter {
160                    patterns.push(extract_pattern(interp, &mut value)?);
161                }
162                bstr::join(b"|", patterns)
163            } else if let Ok(ary) = unsafe { Array::unbox_from_value(&mut first, interp) } {
164                let mut patterns = Vec::with_capacity(ary.len());
165                for mut value in &*ary {
166                    patterns.push(extract_pattern(interp, &mut value)?);
167                }
168                bstr::join(b"|", patterns)
169            } else {
170                extract_pattern(interp, &mut first)?
171            }
172        } else {
173            b"(?!)".to_vec()
174        };
175
176        let config = {
177            let pattern = pattern::parse(&pattern, Options::new());
178            let options = pattern.options();
179            Config::with_pattern_and_options(pattern.into_pattern(), options)
180        };
181        let source = Source::with_pattern_and_options(pattern, Options::new());
182        Self::new(source, config, Encoding::new())
183    }
184
185    #[inline]
186    #[must_use]
187    pub fn inner(&self) -> &dyn RegexpType {
188        self.0.as_ref()
189    }
190
191    pub fn case_compare(&self, interp: &mut Artichoke, mut other: Value) -> Result<bool, Error> {
192        let pattern_vec;
193        let pattern = if let Ruby::Symbol = other.ruby_type() {
194            let symbol = unsafe { Symbol::unbox_from_value(&mut other, interp)? };
195            pattern_vec = symbol.bytes(interp).to_vec();
196            pattern_vec.as_slice()
197        } else if let Ok(pattern) = unsafe { implicitly_convert_to_string(interp, &mut other) } {
198            // SAFETY: `pattern` is converted to an owned byte vec before any
199            // intervening operations on the VM which may trigger a garbage
200            // collection of the `RString*` that backs `other`.
201            pattern_vec = pattern.to_vec();
202            pattern_vec.as_slice()
203        } else {
204            interp.unset_global_variable(LAST_MATCH)?;
205            return Ok(false);
206        };
207        self.0.case_match(interp, pattern)
208    }
209
210    #[must_use]
211    pub fn eql(&self, interp: &mut Artichoke, mut other: Value) -> bool {
212        if let Ok(other) = unsafe { Self::unbox_from_value(&mut other, interp) } {
213            self.inner() == other.inner()
214        } else {
215            false
216        }
217    }
218
219    #[inline]
220    #[must_use]
221    pub fn hash(&self) -> u64 {
222        let mut s = DefaultHasher::new();
223        self.0.hash(&mut s);
224        s.finish()
225    }
226
227    #[inline]
228    #[must_use]
229    pub fn inspect(&self) -> Vec<u8> {
230        self.0.inspect()
231    }
232
233    #[inline]
234    #[must_use]
235    pub fn is_casefold(&self) -> bool {
236        self.0.source().is_casefold()
237    }
238
239    #[must_use]
240    pub fn is_fixed_encoding(&self) -> bool {
241        match self.0.encoding() {
242            Encoding::No | Encoding::None => false,
243            Encoding::Fixed => true,
244        }
245    }
246
247    pub fn is_match(&self, pattern: Option<&[u8]>, pos: Option<i64>) -> Result<bool, Error> {
248        if let Some(pattern) = pattern {
249            self.0.is_match(pattern, pos)
250        } else {
251            Ok(false)
252        }
253    }
254
255    pub fn match_(
256        &self,
257        interp: &mut Artichoke,
258        pattern: Option<&[u8]>,
259        pos: Option<i64>,
260        block: Option<Block>,
261    ) -> Result<Value, Error> {
262        if let Some(pattern) = pattern {
263            self.0.match_(interp, pattern, pos, block)
264        } else {
265            interp.unset_global_variable(LAST_MATCH)?;
266            Ok(Value::nil())
267        }
268    }
269
270    #[inline]
271    pub fn match_operator(&self, interp: &mut Artichoke, pattern: Option<&[u8]>) -> Result<Option<usize>, Error> {
272        if let Some(pattern) = pattern {
273            self.0.match_operator(interp, pattern)
274        } else {
275            Ok(None)
276        }
277    }
278
279    pub fn named_captures(&self) -> Result<NameToCaptureLocations, Error> {
280        let captures = self.0.named_captures()?;
281        let mut converted = Vec::with_capacity(captures.len());
282        for (name, indexes) in captures {
283            let mut fixnums = Vec::with_capacity(indexes.len());
284            for idx in indexes {
285                if let Ok(idx) = i64::try_from(idx) {
286                    fixnums.push(idx);
287                } else {
288                    return Err(ArgumentError::with_message("string too long").into());
289                }
290            }
291            converted.push((name, fixnums));
292        }
293        Ok(converted)
294    }
295
296    #[inline]
297    #[must_use]
298    pub fn names(&self) -> Vec<Vec<u8>> {
299        self.0.names()
300    }
301
302    #[inline]
303    #[must_use]
304    pub fn is_literal(&self) -> bool {
305        self.0.source().options().is_literal()
306    }
307
308    #[inline]
309    #[must_use]
310    pub fn options(&self) -> i64 {
311        let options = self.0.source().options().flags();
312        let encoding = self.0.encoding().flags();
313        i64::from((options | encoding).bits())
314    }
315
316    #[inline]
317    #[must_use]
318    pub fn source(&self) -> &[u8] {
319        self.0.source().pattern()
320    }
321
322    #[inline]
323    #[must_use]
324    pub fn string(&self) -> &[u8] {
325        self.0.string()
326    }
327}
328
329impl From<Box<dyn RegexpType>> for Regexp {
330    fn from(regexp: Box<dyn RegexpType>) -> Self {
331        Self(regexp)
332    }
333}
334
335impl TryConvertMut<(Option<Value>, Option<Value>), (Option<Options>, Option<Encoding>)> for Artichoke {
336    type Error = Error;
337
338    fn try_convert_mut(
339        &mut self,
340        value: (Option<Value>, Option<Value>),
341    ) -> Result<(Option<Options>, Option<Encoding>), Self::Error> {
342        let (options, encoding) = value;
343        if let Some(encoding) = encoding {
344            let encoding = if let Ok(encoding) = self.try_convert_mut(encoding) {
345                Some(encoding)
346            } else {
347                let mut warning = Vec::from(&b"encoding option is ignored -- "[..]);
348                warning.extend(encoding.to_s(self));
349                self.warn(warning.as_slice())?;
350                None
351            };
352            let options = options.map(|options| self.convert_mut(options));
353            Ok((options, encoding))
354        } else if let Some(options) = options {
355            let encoding = if let Ok(encoding) = self.try_convert_mut(options) {
356                Some(encoding)
357            } else {
358                let mut warning = Vec::from(&b"encoding option is ignored -- "[..]);
359                warning.extend(options.to_s(self));
360                self.warn(warning.as_slice())?;
361                None
362            };
363            let options = self.convert_mut(options);
364            Ok((Some(options), encoding))
365        } else {
366            Ok((None, None))
367        }
368    }
369}
370
#[cfg(test)]
mod tests {
    use crate::test::prelude::*;

    const SUBJECT: &str = "Regexp";
    const FUNCTIONAL_TEST: &[u8] = include_bytes!("regexp_test.rb");

    /// Evaluate the bundled Ruby functional test file and then `spec`,
    /// panicking with a backtrace if either evaluation fails.
    #[test]
    fn functional() {
        let mut interp = interpreter();
        for code in [FUNCTIONAL_TEST, &b"spec"[..]] {
            let outcome = interp.eval(code);
            unwrap_or_panic_with_backtrace(&mut interp, SUBJECT, outcome);
        }
    }
}