artichoke_backend/extn/core/regexp/
mod.rs1use std::collections::hash_map::DefaultHasher;
8use std::hash::{Hash, Hasher};
9use std::num::NonZeroUsize;
10use std::str;
11
12#[doc(inline)]
13#[allow(unused_imports)]
14pub use spinoso_regexp::{
15 Config, Encoding, Flags, HIGHEST_MATCH_GROUP, InvalidEncodingError, LAST_MATCH, LAST_MATCHED_STRING, Options,
16 RegexpError, RegexpOption, STRING_LEFT_OF_MATCH, STRING_RIGHT_OF_MATCH, Source,
17 nth_match_group_bytes as nth_match_group,
18};
19
20use crate::convert::implicitly_convert_to_string;
21use crate::extn::core::array::Array;
22use crate::extn::core::symbol::Symbol;
23use crate::extn::prelude::*;
24
25pub mod backend;
26mod boxing;
27pub mod enc;
28pub(in crate::extn) mod mruby;
29pub mod opts;
30pub mod pattern;
31pub mod syntax;
32pub(super) mod trampoline;
33
34#[cfg(feature = "core-regexp-oniguruma")]
35use backend::onig::Onig;
36use backend::regex::utf8::Utf8;
37#[allow(unused_imports)]
38pub use backend::{NilableString, RegexpType, Scan};
39
/// List of `(capture group name, capture indexes)` pairs, as returned by
/// [`Regexp::named_captures`].
///
/// Names are raw bytes; indexes are stored as `i64`.
pub type NameToCaptureLocations = Vec<(Vec<u8>, Vec<i64>)>;
41
42pub fn clear_capture_globals(interp: &mut Artichoke) -> Result<(), Error> {
43 let mut idx = interp.capture_group_globals()?;
44 while let Some(group) = NonZeroUsize::new(idx) {
45 interp.unset_global_variable(nth_match_group(group))?;
46 idx -= 1;
47 }
48 interp.clear_regexp()?;
49 Ok(())
50}
51
/// A regular expression value backed by a boxed [`RegexpType`] trait object.
///
/// The methods on this type forward to the boxed backend.
#[derive(Debug, Clone)]
pub struct Regexp(Box<dyn RegexpType>);
54
55impl Hash for Regexp {
56 fn hash<H: Hasher>(&self, state: &mut H) {
57 self.0.hash(state);
58 }
59}
60
impl PartialEq for Regexp {
    /// Two `Regexp`s are equal when their backends compare equal.
    ///
    /// The comparison is delegated to the equality defined for
    /// `&dyn RegexpType`, which lives outside this block.
    fn eq(&self, other: &Self) -> bool {
        self.inner() == other.inner()
    }
}
66
// Marker impl: declares the `PartialEq` comparison on `Regexp` to be a full
// equivalence relation.
impl Eq for Regexp {}
68
69impl TryFrom<Vec<u8>> for Regexp {
70 type Error = Error;
71
72 fn try_from(pattern: Vec<u8>) -> Result<Self, Self::Error> {
73 let config = Config::with_pattern_and_options(pattern, Options::new());
74 let source = Source::from(config.clone());
75 Self::new(source, config, Encoding::new())
76 }
77}
78
79impl Regexp {
80 pub fn new(source: Source, config: Config, encoding: Encoding) -> Result<Self, Error> {
81 #[cfg(feature = "core-regexp-oniguruma")]
82 {
83 let onig = Onig::new(source.clone(), config.clone(), encoding)?;
85 if let Ok(regex) = Utf8::new(source, config, encoding) {
86 Ok(Self(Box::new(regex)))
87 } else {
88 Ok(Self(Box::new(onig)))
89 }
90 }
91 #[cfg(not(feature = "core-regexp-oniguruma"))]
92 {
93 let regex = Utf8::new(source, config, encoding)?;
94 Ok(Self(Box::new(regex)))
95 }
96 }
97
98 pub fn initialize(
99 interp: &mut Artichoke,
100 mut pattern: Value,
101 options: Option<Options>,
102 encoding: Option<Encoding>,
103 ) -> Result<Self, Error> {
104 let source = if let Ok(regexp) = unsafe { Self::unbox_from_value(&mut pattern, interp) } {
105 if options.is_some() || encoding.is_some() {
106 interp.warn(&b"flags ignored when initializing from Regexp"[..])?;
107 }
108 regexp.inner().source().clone()
109 } else {
110 let bytes = unsafe { implicitly_convert_to_string(interp, &mut pattern)? };
115 Source::with_pattern_and_options(bytes.to_vec(), options.unwrap_or_default())
116 };
117 let pattern = pattern::parse(source.pattern(), source.options());
118 let options = pattern.options();
119 let config = Config::with_pattern_and_options(pattern.into_pattern(), options);
120 Self::new(source, config, encoding.unwrap_or_default())
121 }
122
123 pub fn escape(pattern: &[u8]) -> Result<String, Error> {
124 if let Ok(pattern) = str::from_utf8(pattern) {
125 Ok(syntax::escape(pattern))
126 } else {
127 Err(ArgumentError::with_message("invalid encoding (non UTF-8)").into())
128 }
129 }
130
131 pub fn union<T>(interp: &mut Artichoke, patterns: T) -> Result<Self, Error>
132 where
133 T: IntoIterator<Item = Value>,
134 {
135 fn extract_pattern(interp: &mut Artichoke, value: &mut Value) -> Result<Vec<u8>, Error> {
136 if let Ok(regexp) = unsafe { Regexp::unbox_from_value(value, interp) } {
137 let source = regexp.inner().config();
138 Ok(source.pattern().to_vec())
139 } else {
140 let bytes = unsafe { implicitly_convert_to_string(interp, value)? };
145 if let Ok(pattern) = str::from_utf8(bytes) {
146 Ok(syntax::escape(pattern).into_bytes())
147 } else {
148 Err(ArgumentError::with_message("invalid encoding (non UTF-8)").into())
149 }
150 }
151 }
152 let mut iter = patterns.into_iter();
153 let pattern = if let Some(mut first) = iter.next() {
154 if let Some(mut second) = iter.next() {
155 let mut patterns = vec![
156 extract_pattern(interp, &mut first)?,
157 extract_pattern(interp, &mut second)?,
158 ];
159 for mut value in iter {
160 patterns.push(extract_pattern(interp, &mut value)?);
161 }
162 bstr::join(b"|", patterns)
163 } else if let Ok(ary) = unsafe { Array::unbox_from_value(&mut first, interp) } {
164 let mut patterns = Vec::with_capacity(ary.len());
165 for mut value in &*ary {
166 patterns.push(extract_pattern(interp, &mut value)?);
167 }
168 bstr::join(b"|", patterns)
169 } else {
170 extract_pattern(interp, &mut first)?
171 }
172 } else {
173 b"(?!)".to_vec()
174 };
175
176 let config = {
177 let pattern = pattern::parse(&pattern, Options::new());
178 let options = pattern.options();
179 Config::with_pattern_and_options(pattern.into_pattern(), options)
180 };
181 let source = Source::with_pattern_and_options(pattern, Options::new());
182 Self::new(source, config, Encoding::new())
183 }
184
185 #[inline]
186 #[must_use]
187 pub fn inner(&self) -> &dyn RegexpType {
188 self.0.as_ref()
189 }
190
191 pub fn case_compare(&self, interp: &mut Artichoke, mut other: Value) -> Result<bool, Error> {
192 let pattern_vec;
193 let pattern = if let Ruby::Symbol = other.ruby_type() {
194 let symbol = unsafe { Symbol::unbox_from_value(&mut other, interp)? };
195 pattern_vec = symbol.bytes(interp).to_vec();
196 pattern_vec.as_slice()
197 } else if let Ok(pattern) = unsafe { implicitly_convert_to_string(interp, &mut other) } {
198 pattern_vec = pattern.to_vec();
202 pattern_vec.as_slice()
203 } else {
204 interp.unset_global_variable(LAST_MATCH)?;
205 return Ok(false);
206 };
207 self.0.case_match(interp, pattern)
208 }
209
210 #[must_use]
211 pub fn eql(&self, interp: &mut Artichoke, mut other: Value) -> bool {
212 if let Ok(other) = unsafe { Self::unbox_from_value(&mut other, interp) } {
213 self.inner() == other.inner()
214 } else {
215 false
216 }
217 }
218
219 #[inline]
220 #[must_use]
221 pub fn hash(&self) -> u64 {
222 let mut s = DefaultHasher::new();
223 self.0.hash(&mut s);
224 s.finish()
225 }
226
    /// Return the backend's `inspect` output as bytes.
    #[inline]
    #[must_use]
    pub fn inspect(&self) -> Vec<u8> {
        self.0.inspect()
    }
232
    /// Report whether the backend's source is casefold, as answered by the
    /// source's own `is_casefold` query.
    #[inline]
    #[must_use]
    pub fn is_casefold(&self) -> bool {
        self.0.source().is_casefold()
    }
238
239 #[must_use]
240 pub fn is_fixed_encoding(&self) -> bool {
241 match self.0.encoding() {
242 Encoding::No | Encoding::None => false,
243 Encoding::Fixed => true,
244 }
245 }
246
247 pub fn is_match(&self, pattern: Option<&[u8]>, pos: Option<i64>) -> Result<bool, Error> {
248 if let Some(pattern) = pattern {
249 self.0.is_match(pattern, pos)
250 } else {
251 Ok(false)
252 }
253 }
254
255 pub fn match_(
256 &self,
257 interp: &mut Artichoke,
258 pattern: Option<&[u8]>,
259 pos: Option<i64>,
260 block: Option<Block>,
261 ) -> Result<Value, Error> {
262 if let Some(pattern) = pattern {
263 self.0.match_(interp, pattern, pos, block)
264 } else {
265 interp.unset_global_variable(LAST_MATCH)?;
266 Ok(Value::nil())
267 }
268 }
269
270 #[inline]
271 pub fn match_operator(&self, interp: &mut Artichoke, pattern: Option<&[u8]>) -> Result<Option<usize>, Error> {
272 if let Some(pattern) = pattern {
273 self.0.match_operator(interp, pattern)
274 } else {
275 Ok(None)
276 }
277 }
278
279 pub fn named_captures(&self) -> Result<NameToCaptureLocations, Error> {
280 let captures = self.0.named_captures()?;
281 let mut converted = Vec::with_capacity(captures.len());
282 for (name, indexes) in captures {
283 let mut fixnums = Vec::with_capacity(indexes.len());
284 for idx in indexes {
285 if let Ok(idx) = i64::try_from(idx) {
286 fixnums.push(idx);
287 } else {
288 return Err(ArgumentError::with_message("string too long").into());
289 }
290 }
291 converted.push((name, fixnums));
292 }
293 Ok(converted)
294 }
295
    /// Return the names reported by the backend's `names`, each as raw bytes.
    #[inline]
    #[must_use]
    pub fn names(&self) -> Vec<Vec<u8>> {
        self.0.names()
    }
301
    /// Report whether the options on the backend's source are marked literal,
    /// as answered by the options' own `is_literal` query.
    #[inline]
    #[must_use]
    pub fn is_literal(&self) -> bool {
        self.0.source().options().is_literal()
    }
307
308 #[inline]
309 #[must_use]
310 pub fn options(&self) -> i64 {
311 let options = self.0.source().options().flags();
312 let encoding = self.0.encoding().flags();
313 i64::from((options | encoding).bits())
314 }
315
    /// Return the pattern bytes of the backend's source.
    #[inline]
    #[must_use]
    pub fn source(&self) -> &[u8] {
        self.0.source().pattern()
    }
321
    /// Return the bytes produced by the backend's `string` method.
    #[inline]
    #[must_use]
    pub fn string(&self) -> &[u8] {
        self.0.string()
    }
327}
328
impl From<Box<dyn RegexpType>> for Regexp {
    /// Wrap an already-constructed boxed backend without further processing.
    fn from(regexp: Box<dyn RegexpType>) -> Self {
        Self(regexp)
    }
}
334
335impl TryConvertMut<(Option<Value>, Option<Value>), (Option<Options>, Option<Encoding>)> for Artichoke {
336 type Error = Error;
337
338 fn try_convert_mut(
339 &mut self,
340 value: (Option<Value>, Option<Value>),
341 ) -> Result<(Option<Options>, Option<Encoding>), Self::Error> {
342 let (options, encoding) = value;
343 if let Some(encoding) = encoding {
344 let encoding = if let Ok(encoding) = self.try_convert_mut(encoding) {
345 Some(encoding)
346 } else {
347 let mut warning = Vec::from(&b"encoding option is ignored -- "[..]);
348 warning.extend(encoding.to_s(self));
349 self.warn(warning.as_slice())?;
350 None
351 };
352 let options = options.map(|options| self.convert_mut(options));
353 Ok((options, encoding))
354 } else if let Some(options) = options {
355 let encoding = if let Ok(encoding) = self.try_convert_mut(options) {
356 Some(encoding)
357 } else {
358 let mut warning = Vec::from(&b"encoding option is ignored -- "[..]);
359 warning.extend(options.to_s(self));
360 self.warn(warning.as_slice())?;
361 None
362 };
363 let options = self.convert_mut(options);
364 Ok((Some(options), encoding))
365 } else {
366 Ok((None, None))
367 }
368 }
369}
370
#[cfg(test)]
mod tests {
    use crate::test::prelude::*;

    const SUBJECT: &str = "Regexp";
    const FUNCTIONAL_TEST: &[u8] = include_bytes!("regexp_test.rb");

    /// Evaluate the bundled Ruby test source, then evaluate `spec`, panicking
    /// with a backtrace if either evaluation fails.
    #[test]
    fn functional() {
        let mut interp = interpreter();

        let loaded = interp.eval(FUNCTIONAL_TEST);
        unwrap_or_panic_with_backtrace(&mut interp, SUBJECT, loaded);

        let outcome = interp.eval(b"spec");
        unwrap_or_panic_with_backtrace(&mut interp, SUBJECT, outcome);
    }
}