artichoke_backend/extn/core/regexp/backend/
onig.rs1use std::collections::HashMap;
2use std::fmt;
3use std::num::NonZeroUsize;
4use std::rc::Rc;
5use std::str;
6
7use onig::{Regex, Syntax};
8use scolapasta_string_escape::format_debug_escape_into;
9
10use super::{NameToCaptureLocations, NilableString};
11use crate::extn::core::matchdata::MatchData;
12use crate::extn::core::regexp::{self, Config, Encoding, Regexp, RegexpType, Scan, Source};
13use crate::extn::prelude::*;
14
15#[derive(Debug, Clone)]
16pub struct Onig {
17 source: Source,
18 config: Config,
19 encoding: Encoding,
20 regex: Rc<Regex>,
21}
22
23impl Onig {
24 pub fn new(source: Source, config: Config, encoding: Encoding) -> Result<Self, Error> {
25 let pattern = str::from_utf8(config.pattern())
26 .map_err(|_| ArgumentError::with_message("Oniguruma backend for Regexp only supports UTF-8 patterns"))?;
27 let regex = match Regex::with_options(pattern, config.options().into(), Syntax::ruby()) {
28 Ok(regex) => regex,
29 Err(err) if source.is_literal() => return Err(SyntaxError::from(err.description().to_owned()).into()),
30 Err(err) => return Err(RegexpError::from(err.description().to_owned()).into()),
31 };
32 let regexp = Self {
33 source,
34 config,
35 encoding,
36 regex: Rc::new(regex),
37 };
38 Ok(regexp)
39 }
40}
41
42impl fmt::Display for Onig {
43 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
44 let pattern = self.config.pattern();
45 format_debug_escape_into(f, pattern)
46 }
47}
48
49impl RegexpType for Onig {
50 fn box_clone(&self) -> Box<dyn RegexpType> {
51 Box::new(self.clone())
52 }
53
54 fn captures(&self, haystack: &[u8]) -> Result<Option<Vec<NilableString>>, Error> {
55 let haystack = str::from_utf8(haystack)
56 .map_err(|_| ArgumentError::with_message("Oniguruma backend for Regexp only supports UTF-8 haystacks"))?;
57 if let Some(captures) = self.regex.captures(haystack) {
58 let mut result = Vec::with_capacity(captures.len());
59 for capture in captures.iter() {
60 if let Some(capture) = capture {
61 result.push(Some(capture.into()));
62 } else {
63 result.push(None);
64 }
65 }
66 Ok(Some(result))
67 } else {
68 Ok(None)
69 }
70 }
71
72 fn capture_indexes_for_name(&self, name: &[u8]) -> Result<Option<Vec<usize>>, Error> {
73 let mut result = None;
74 self.regex.foreach_name(|group, group_indexes| {
75 if name != group.as_bytes() {
76 return true;
78 }
79 let mut indexes = Vec::with_capacity(group_indexes.len());
80 for &index in group_indexes {
81 indexes.push(qed::lossless_cast_u32_to_usize!(index));
82 }
83 result = Some(indexes);
84 false
85 });
86 Ok(result)
87 }
88
89 fn captures_len(&self, haystack: Option<&[u8]>) -> Result<usize, Error> {
90 let result = if let Some(haystack) = haystack {
91 let haystack = str::from_utf8(haystack).map_err(|_| {
92 ArgumentError::with_message("Oniguruma backend for Regexp only supports UTF-8 haystacks")
93 })?;
94 self.regex
95 .captures(haystack)
96 .map(|captures| captures.len())
97 .unwrap_or_default()
98 } else {
99 self.regex.captures_len()
100 };
101 Ok(result)
102 }
103
104 fn capture0<'a>(&self, haystack: &'a [u8]) -> Result<Option<&'a [u8]>, Error> {
105 let haystack = str::from_utf8(haystack)
106 .map_err(|_| ArgumentError::with_message("Oniguruma backend for Regexp only supports UTF-8 haystacks"))?;
107 let result = self
108 .regex
109 .captures(haystack)
110 .and_then(|captures| captures.at(0))
111 .map(str::as_bytes);
112 Ok(result)
113 }
114
115 fn debug(&self) -> String {
116 let mut debug = String::from("/");
117 let mut pattern = String::new();
118 let _ = format_debug_escape_into(&mut pattern, self.source.pattern());
125 debug.push_str(pattern.replace('/', r"\/").as_str());
126 debug.push('/');
127 debug.push_str(self.source.options().as_display_modifier());
128 debug.push_str(self.encoding.as_modifier_str());
129 debug
130 }
131
132 fn source(&self) -> &Source {
133 &self.source
134 }
135
136 fn config(&self) -> &Config {
137 &self.config
138 }
139
140 fn encoding(&self) -> &Encoding {
141 &self.encoding
142 }
143
144 fn inspect(&self) -> Vec<u8> {
145 let mut inspect = Vec::with_capacity(self.source.pattern().len() + 2 + 4);
147 inspect.push(b'/');
148 if let Ok(pat) = str::from_utf8(self.source.pattern()) {
149 inspect.extend_from_slice(pat.replace('/', r"\/").as_bytes());
150 } else {
151 inspect.extend_from_slice(self.source.pattern());
152 }
153 inspect.push(b'/');
154 inspect.extend_from_slice(self.source.options().as_display_modifier().as_bytes());
155 inspect.extend_from_slice(self.encoding.as_modifier_str().as_bytes());
156 inspect
157 }
158
159 fn string(&self) -> &[u8] {
160 self.config.pattern()
161 }
162
163 fn case_match(&self, interp: &mut Artichoke, haystack: &[u8]) -> Result<bool, Error> {
164 let haystack = str::from_utf8(haystack)
165 .map_err(|_| ArgumentError::with_message("Oniguruma backend for Regexp only supports UTF-8 haystacks"))?;
166 regexp::clear_capture_globals(interp)?;
167 if let Some(captures) = self.regex.captures(haystack) {
168 interp.set_capture_group_globals(captures.len())?;
169 let value = interp.try_convert_mut(captures.at(0))?;
170 interp.set_global_variable(regexp::LAST_MATCHED_STRING, &value)?;
171
172 for group in 0..captures.len() {
173 let value = interp.try_convert_mut(captures.at(group))?;
174 let group = NonZeroUsize::MIN.saturating_add(group);
175 interp.set_global_variable(regexp::nth_match_group(group), &value)?;
176 }
177
178 if let Some(match_pos) = captures.pos(0) {
179 let pre_match = interp.try_convert_mut(&haystack[..match_pos.0])?;
180 let post_match = interp.try_convert_mut(&haystack[match_pos.1..])?;
181 interp.set_global_variable(regexp::STRING_LEFT_OF_MATCH, &pre_match)?;
182 interp.set_global_variable(regexp::STRING_RIGHT_OF_MATCH, &post_match)?;
183 }
184 let matchdata = MatchData::new(haystack.into(), Regexp::from(self.box_clone()), ..);
185 let matchdata = MatchData::alloc_value(matchdata, interp)?;
186 interp.set_global_variable(regexp::LAST_MATCH, &matchdata)?;
187 Ok(true)
188 } else {
189 interp.unset_global_variable(regexp::STRING_LEFT_OF_MATCH)?;
190 interp.unset_global_variable(regexp::STRING_RIGHT_OF_MATCH)?;
191 Ok(false)
192 }
193 }
194
195 fn is_match(&self, haystack: &[u8], pos: Option<i64>) -> Result<bool, Error> {
196 let haystack = str::from_utf8(haystack)
197 .map_err(|_| ArgumentError::with_message("Oniguruma backend for Regexp only supports UTF-8 haystacks"))?;
198 let haystack_char_len = haystack.chars().count();
199 let pos = pos.unwrap_or_default();
200 let pos = if let Some(pos) = aref::offset_to_index(pos, haystack_char_len) {
201 pos
202 } else {
203 return Ok(false);
204 };
205 let offset = haystack.chars().take(pos).map(char::len_utf8).sum();
206 if let Some(haystack) = haystack.get(offset..) {
207 Ok(self.regex.find(haystack).is_some())
208 } else {
209 Ok(false)
210 }
211 }
212
213 fn match_(
214 &self,
215 interp: &mut Artichoke,
216 haystack: &[u8],
217 pos: Option<i64>,
218 block: Option<Block>,
219 ) -> Result<Value, Error> {
220 let haystack = str::from_utf8(haystack)
221 .map_err(|_| ArgumentError::with_message("Oniguruma backend for Regexp only supports UTF-8 haystacks"))?;
222 regexp::clear_capture_globals(interp)?;
223 let haystack_char_len = haystack.chars().count();
224 let pos = pos.unwrap_or_default();
225 let pos = if let Some(pos) = aref::offset_to_index(pos, haystack_char_len) {
226 pos
227 } else {
228 return Ok(Value::nil());
229 };
230 let offset = haystack.chars().take(pos).map(char::len_utf8).sum();
231 let target = if let Some(haystack) = haystack.get(offset..) {
232 haystack
233 } else {
234 interp.unset_global_variable(regexp::LAST_MATCH)?;
235 interp.unset_global_variable(regexp::STRING_LEFT_OF_MATCH)?;
236 interp.unset_global_variable(regexp::STRING_RIGHT_OF_MATCH)?;
237 return Ok(Value::nil());
238 };
239
240 if let Some(captures) = self.regex.captures(target) {
241 interp.set_capture_group_globals(captures.len())?;
242
243 let value = interp.try_convert_mut(captures.at(0))?;
244 interp.set_global_variable(regexp::LAST_MATCHED_STRING, &value)?;
245 for group in 0..captures.len() {
246 let value = interp.try_convert_mut(captures.at(group))?;
247 let group = NonZeroUsize::MIN.saturating_add(group);
248 interp.set_global_variable(regexp::nth_match_group(group), &value)?;
249 }
250
251 let mut matchdata = MatchData::new(haystack.into(), Regexp::from(self.box_clone()), ..);
252 if let Some(match_pos) = captures.pos(0) {
253 let pre_match = interp.try_convert_mut(&target[..match_pos.0])?;
254 let post_match = interp.try_convert_mut(&target[match_pos.1..])?;
255 interp.set_global_variable(regexp::STRING_LEFT_OF_MATCH, &pre_match)?;
256 interp.set_global_variable(regexp::STRING_RIGHT_OF_MATCH, &post_match)?;
257 matchdata.set_region(offset + match_pos.0..offset + match_pos.1);
258 }
259 let data = MatchData::alloc_value(matchdata, interp)?;
260 interp.set_global_variable(regexp::LAST_MATCH, &data)?;
261 if let Some(block) = block {
262 let result = block.yield_arg(interp, &data)?;
263 Ok(result)
264 } else {
265 Ok(data)
266 }
267 } else {
268 interp.unset_global_variable(regexp::LAST_MATCH)?;
269 interp.unset_global_variable(regexp::STRING_LEFT_OF_MATCH)?;
270 interp.unset_global_variable(regexp::STRING_RIGHT_OF_MATCH)?;
271 Ok(Value::nil())
272 }
273 }
274
275 fn match_operator(&self, interp: &mut Artichoke, haystack: &[u8]) -> Result<Option<usize>, Error> {
276 let haystack = str::from_utf8(haystack)
277 .map_err(|_| ArgumentError::with_message("Oniguruma backend for Regexp only supports UTF-8 haystacks"))?;
278 regexp::clear_capture_globals(interp)?;
279 if let Some(captures) = self.regex.captures(haystack) {
280 interp.set_capture_group_globals(captures.len())?;
281
282 let value = interp.try_convert_mut(captures.at(0))?;
283 interp.set_global_variable(regexp::LAST_MATCHED_STRING, &value)?;
284 for group in 0..captures.len() {
285 let value = interp.try_convert_mut(captures.at(group))?;
286 let group = NonZeroUsize::MIN.saturating_add(group);
287 interp.set_global_variable(regexp::nth_match_group(group), &value)?;
288 }
289
290 let matchdata = MatchData::new(haystack.into(), Regexp::from(self.box_clone()), ..);
291 let data = MatchData::alloc_value(matchdata, interp)?;
292 interp.set_global_variable(regexp::LAST_MATCH, &data)?;
293 if let Some(match_pos) = captures.pos(0) {
294 let pre_match = interp.try_convert_mut(&haystack[..match_pos.0])?;
295 let post_match = interp.try_convert_mut(&haystack[match_pos.1..])?;
296 interp.set_global_variable(regexp::STRING_LEFT_OF_MATCH, &pre_match)?;
297 interp.set_global_variable(regexp::STRING_RIGHT_OF_MATCH, &post_match)?;
298 let pos = match_pos.0;
299 Ok(Some(pos))
300 } else {
301 Ok(Some(0))
302 }
303 } else {
304 interp.unset_global_variable(regexp::LAST_MATCH)?;
305 interp.unset_global_variable(regexp::STRING_LEFT_OF_MATCH)?;
306 interp.unset_global_variable(regexp::STRING_RIGHT_OF_MATCH)?;
307 Ok(None)
308 }
309 }
310
311 fn named_captures(&self) -> Result<NameToCaptureLocations, Error> {
312 let mut map = vec![];
315 self.regex.foreach_name(|group, group_indexes| {
316 let mut converted = Vec::with_capacity(group_indexes.len());
317 for &index in group_indexes {
318 converted.push(qed::lossless_cast_u32_to_usize!(index));
319 }
320 map.push((group.into(), converted));
321 true
322 });
323 Ok(map)
324 }
325
326 fn named_captures_for_haystack(&self, haystack: &[u8]) -> Result<Option<HashMap<Vec<u8>, NilableString>>, Error> {
327 let haystack = str::from_utf8(haystack)
328 .map_err(|_| ArgumentError::with_message("Oniguruma backend for Regexp only supports UTF-8 haystacks"))?;
329 if let Some(captures) = self.regex.captures(haystack) {
330 let mut map = HashMap::with_capacity(captures.len());
331 self.regex.foreach_name(|group, group_indexes| {
332 for &index in group_indexes.iter().rev() {
333 if let Some(capture) = captures.at(qed::lossless_cast_u32_to_usize!(index)) {
334 map.insert(group.into(), Some(capture.into()));
335 return true;
336 }
337 }
338 map.insert(group.into(), None);
339 true
340 });
341 Ok(Some(map))
342 } else {
343 Ok(None)
344 }
345 }
346
347 fn names(&self) -> Vec<Vec<u8>> {
348 let mut names = vec![];
349 let mut capture_names = vec![];
350 self.regex.foreach_name(|group, group_indexes| {
351 capture_names.push((group.as_bytes().to_vec(), group_indexes.to_vec()));
352 true
353 });
354 capture_names.sort_by(|left, right| {
355 let left = left.1.iter().min().copied().unwrap_or(u32::MAX);
356 let right = right.1.iter().min().copied().unwrap_or(u32::MAX);
357 left.cmp(&right)
358 });
359 for (name, _) in capture_names {
360 if !names.contains(&name) {
361 names.push(name);
362 }
363 }
364 names
365 }
366
367 fn pos(&self, haystack: &[u8], at: usize) -> Result<Option<(usize, usize)>, Error> {
368 let haystack = str::from_utf8(haystack)
369 .map_err(|_| ArgumentError::with_message("Oniguruma backend for Regexp only supports UTF-8 haystacks"))?;
370 let pos = self.regex.captures(haystack).and_then(|captures| captures.pos(at));
371 Ok(pos)
372 }
373
374 fn scan(&self, interp: &mut Artichoke, haystack: &[u8], block: Option<Block>) -> Result<Scan, Error> {
375 let haystack = str::from_utf8(haystack)
376 .map_err(|_| ArgumentError::with_message("Oniguruma backend for Regexp only supports UTF-8 haystacks"))?;
377 regexp::clear_capture_globals(interp)?;
378 let mut matchdata = MatchData::new(haystack.into(), Regexp::from(self.box_clone()), ..);
379
380 let len = NonZeroUsize::new(self.regex.captures_len());
381 if let Some(block) = block {
382 if let Some(len) = len {
383 interp.set_capture_group_globals(len.get())?;
384
385 let mut iter = self.regex.captures_iter(haystack).peekable();
386 if iter.peek().is_none() {
387 interp.unset_global_variable(regexp::LAST_MATCH)?;
388 return Ok(Scan::Haystack);
389 }
390 for captures in iter {
391 let fullcapture = interp.try_convert_mut(captures.at(0))?;
392 interp.set_global_variable(regexp::LAST_MATCHED_STRING, &fullcapture)?;
393
394 let mut groups = Vec::with_capacity(len.get());
395 for group in 1..=len.get() {
396 let capture = captures.at(group);
397 groups.push(capture);
398 let capture = interp.try_convert_mut(capture)?;
399 let group = unsafe { NonZeroUsize::new_unchecked(group) };
400 interp.set_global_variable(regexp::nth_match_group(group), &capture)?;
401 }
402
403 let matched = interp.try_convert_mut(groups)?;
404 if let Some(pos) = captures.pos(0) {
405 matchdata.set_region(pos.0..pos.1);
406 }
407 let data = MatchData::alloc_value(matchdata.clone(), interp)?;
408 interp.set_global_variable(regexp::LAST_MATCH, &data)?;
409 block.yield_arg(interp, &matched)?;
410 interp.set_global_variable(regexp::LAST_MATCH, &data)?;
411 }
412 } else {
413 let mut iter = self.regex.find_iter(haystack).peekable();
414 if iter.peek().is_none() {
415 interp.unset_global_variable(regexp::LAST_MATCH)?;
416 return Ok(Scan::Haystack);
417 }
418 for pos in iter {
419 let scanned = &haystack[pos.0..pos.1];
420 let matched = interp.try_convert_mut(scanned)?;
421 matchdata.set_region(pos.0..pos.1);
422 let data = MatchData::alloc_value(matchdata.clone(), interp)?;
423 interp.set_global_variable(regexp::LAST_MATCH, &data)?;
424 block.yield_arg(interp, &matched)?;
425 interp.set_global_variable(regexp::LAST_MATCH, &data)?;
426 }
427 }
428 Ok(Scan::Haystack)
429 } else {
430 let mut last_pos = (0, 0);
431 if let Some(len) = len {
432 interp.set_capture_group_globals(len.get())?;
433
434 let mut collected = vec![];
435 let mut iter = self.regex.captures_iter(haystack).peekable();
436 if iter.peek().is_none() {
437 interp.unset_global_variable(regexp::LAST_MATCH)?;
438 return Ok(Scan::Collected(Vec::new()));
439 }
440 for captures in iter {
441 let mut groups = Vec::with_capacity(len.get());
442 for group in 1..=len.get() {
443 groups.push(captures.at(group).map(str::as_bytes).map(Vec::from));
444 }
445
446 if let Some(pos) = captures.pos(0) {
447 last_pos = pos;
448 }
449 collected.push(groups);
450 }
451 matchdata.set_region(last_pos.0..last_pos.1);
452 let data = MatchData::alloc_value(matchdata, interp)?;
453 interp.set_global_variable(regexp::LAST_MATCH, &data)?;
454
455 let mut iter = collected.iter().enumerate();
456 if let Some((_, fullcapture)) = iter.next() {
457 let fullcapture = interp.try_convert_mut(fullcapture.as_slice())?;
458 interp.set_global_variable(regexp::LAST_MATCHED_STRING, &fullcapture)?;
459 }
460 for (group, capture) in iter {
461 let capture = interp.try_convert_mut(capture.as_slice())?;
462 let group = unsafe { NonZeroUsize::new_unchecked(group) };
463 interp.set_global_variable(regexp::nth_match_group(group), &capture)?;
464 }
465 Ok(Scan::Collected(collected))
466 } else {
467 let mut collected = vec![];
468 let mut iter = self.regex.find_iter(haystack).peekable();
469 if iter.peek().is_none() {
470 interp.unset_global_variable(regexp::LAST_MATCH)?;
471 return Ok(Scan::Patterns(Vec::new()));
472 }
473 for pos in iter {
474 let scanned = &haystack[pos.0..pos.1];
475 last_pos = pos;
476 collected.push(Vec::from(scanned.as_bytes()));
477 }
478 matchdata.set_region(last_pos.0..last_pos.1);
479 let data = MatchData::alloc_value(matchdata, interp)?;
480 interp.set_global_variable(regexp::LAST_MATCH, &data)?;
481
482 let last_matched = collected.last().map(Vec::as_slice);
483 let last_matched = interp.try_convert_mut(last_matched)?;
484 interp.set_global_variable(regexp::LAST_MATCHED_STRING, &last_matched)?;
485 Ok(Scan::Patterns(collected))
486 }
487 }
488 }
489}