artichoke_backend/extn/core/regexp/backend/regex/
utf8.rs1use std::collections::HashMap;
2use std::fmt;
3use std::num::NonZeroUsize;
4use std::str;
5
6use regex::{Match, Regex, RegexBuilder};
7use scolapasta_string_escape::format_debug_escape_into;
8
9use super::super::{NameToCaptureLocations, NilableString};
10use crate::extn::core::matchdata::MatchData;
11use crate::extn::core::regexp::{self, Config, Encoding, Regexp, RegexpType, Scan, Source};
12use crate::extn::prelude::*;
13
14#[derive(Debug, Clone)]
15pub struct Utf8 {
16 source: Source,
17 config: Config,
18 encoding: Encoding,
19 regex: Regex,
20}
21
22impl Utf8 {
23 pub fn new(source: Source, config: Config, encoding: Encoding) -> Result<Self, Error> {
24 let pattern = str::from_utf8(config.pattern()).map_err(|_| {
25 ArgumentError::with_message("regex crate utf8 backend for Regexp only supports UTF-8 patterns")
26 })?;
27
28 let mut builder = RegexBuilder::new(pattern);
29 builder.case_insensitive(config.options().ignore_case().into());
30 builder.multi_line(config.options().multiline().into());
31 builder.ignore_whitespace(config.options().extended().into());
32
33 let regex = match builder.build() {
34 Ok(regex) => regex,
35 Err(err) if source.is_literal() => {
36 return Err(SyntaxError::from(err.to_string()).into());
37 }
38 Err(err) => return Err(RegexpError::from(err.to_string()).into()),
39 };
40 let regexp = Self {
41 source,
42 config,
43 encoding,
44 regex,
45 };
46 Ok(regexp)
47 }
48}
49
50impl fmt::Display for Utf8 {
51 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
52 let pattern = self.config.pattern();
53 format_debug_escape_into(f, pattern)
54 }
55}
56
57impl RegexpType for Utf8 {
58 fn box_clone(&self) -> Box<dyn RegexpType> {
59 Box::new(self.clone())
60 }
61
62 fn captures(&self, haystack: &[u8]) -> Result<Option<Vec<NilableString>>, Error> {
63 let haystack = str::from_utf8(haystack).map_err(|_| {
64 ArgumentError::with_message("regex crate utf8 backend for Regexp only supports UTF-8 haystacks")
65 })?;
66 if let Some(captures) = self.regex.captures(haystack) {
67 let mut result = Vec::with_capacity(captures.len());
68 for capture in captures.iter() {
69 if let Some(capture) = capture {
70 result.push(Some(capture.as_str().into()));
71 } else {
72 result.push(None);
73 }
74 }
75 Ok(Some(result))
76 } else {
77 Ok(None)
78 }
79 }
80
81 fn capture_indexes_for_name(&self, name: &[u8]) -> Result<Option<Vec<usize>>, Error> {
82 let mut result = vec![];
83 for (index, group) in self.regex.capture_names().enumerate() {
84 if Some(name) == group.map(str::as_bytes) {
85 result.push(index);
86 }
87 }
88 if result.is_empty() { Ok(None) } else { Ok(Some(result)) }
89 }
90
91 fn captures_len(&self, haystack: Option<&[u8]>) -> Result<usize, Error> {
92 let result = if let Some(haystack) = haystack {
93 let haystack = str::from_utf8(haystack).map_err(|_| {
94 ArgumentError::with_message("regex crate utf8 backend for Regexp only supports UTF-8 haystacks")
95 })?;
96 self.regex
97 .captures(haystack)
98 .map(|captures| captures.len())
99 .unwrap_or_default()
100 } else {
101 self.regex.captures_len()
102 };
103 Ok(result)
104 }
105
106 fn capture0<'a>(&self, haystack: &'a [u8]) -> Result<Option<&'a [u8]>, Error> {
107 let haystack = str::from_utf8(haystack).map_err(|_| {
108 ArgumentError::with_message("regex crate utf8 backend for Regexp only supports UTF-8 haystacks")
109 })?;
110 let result = self
111 .regex
112 .captures(haystack)
113 .and_then(|captures| captures.get(0))
114 .as_ref()
115 .map(Match::as_str)
116 .map(str::as_bytes);
117 Ok(result)
118 }
119
120 fn debug(&self) -> String {
121 let mut debug = String::from("/");
122 let mut pattern = String::new();
123 let _ = format_debug_escape_into(&mut pattern, self.source.pattern());
130 debug.push_str(pattern.replace('/', r"\/").as_str());
131 debug.push('/');
132 debug.push_str(self.source.options().as_display_modifier());
133 debug.push_str(self.encoding.as_modifier_str());
134 debug
135 }
136
137 fn source(&self) -> &Source {
138 &self.source
139 }
140
141 fn config(&self) -> &Config {
142 &self.config
143 }
144
145 fn encoding(&self) -> &Encoding {
146 &self.encoding
147 }
148
149 fn inspect(&self) -> Vec<u8> {
150 let mut inspect = Vec::with_capacity(self.source.pattern().len() + 2 + 4);
152 inspect.push(b'/');
153 if let Ok(pat) = str::from_utf8(self.source.pattern()) {
154 inspect.extend_from_slice(pat.replace('/', r"\/").as_bytes());
155 } else {
156 inspect.extend_from_slice(self.source.pattern());
157 }
158 inspect.push(b'/');
159 inspect.extend_from_slice(self.source.options().as_display_modifier().as_bytes());
160 inspect.extend_from_slice(self.encoding.as_modifier_str().as_bytes());
161 inspect
162 }
163
164 fn string(&self) -> &[u8] {
165 self.config.pattern()
166 }
167
168 fn case_match(&self, interp: &mut Artichoke, haystack: &[u8]) -> Result<bool, Error> {
169 let haystack = str::from_utf8(haystack).map_err(|_| {
170 ArgumentError::with_message("regex crate utf8 backend for Regexp only supports UTF-8 haystack")
171 })?;
172 regexp::clear_capture_globals(interp)?;
173 if let Some(captures) = self.regex.captures(haystack) {
174 interp.set_capture_group_globals(captures.len().checked_sub(1).unwrap_or_default())?;
181
182 let fullmatch = captures.get(0).as_ref().map(Match::as_str).map(str::as_bytes);
183 let value = interp.try_convert_mut(fullmatch)?;
184 interp.set_global_variable(regexp::LAST_MATCHED_STRING, &value)?;
185 for group in 1..captures.len() {
186 let capture = captures.get(group).as_ref().map(Match::as_str).map(str::as_bytes);
187 let value = interp.try_convert_mut(capture)?;
188 let group = unsafe { NonZeroUsize::new_unchecked(group) };
189 interp.set_global_variable(regexp::nth_match_group(group), &value)?;
190 }
191
192 if let Some(match_pos) = captures.get(0) {
193 let pre_match = interp.try_convert_mut(&haystack[..match_pos.start()])?;
194 let post_match = interp.try_convert_mut(&haystack[match_pos.end()..])?;
195 interp.set_global_variable(regexp::STRING_LEFT_OF_MATCH, &pre_match)?;
196 interp.set_global_variable(regexp::STRING_RIGHT_OF_MATCH, &post_match)?;
197 }
198 let matchdata = MatchData::new(haystack.into(), Regexp::from(self.box_clone()), ..);
199 let matchdata = MatchData::alloc_value(matchdata, interp)?;
200 interp.set_global_variable(regexp::LAST_MATCH, &matchdata)?;
201 Ok(true)
202 } else {
203 interp.unset_global_variable(regexp::STRING_LEFT_OF_MATCH)?;
204 interp.unset_global_variable(regexp::STRING_RIGHT_OF_MATCH)?;
205 Ok(false)
206 }
207 }
208
209 fn is_match(&self, haystack: &[u8], pos: Option<i64>) -> Result<bool, Error> {
210 let haystack = str::from_utf8(haystack).map_err(|_| {
211 ArgumentError::with_message("regex crate utf8 backend for Regexp only supports UTF-8 haystack")
212 })?;
213 let haystack_char_len = haystack.chars().count();
214 let pos = pos.unwrap_or_default();
215 let pos = if let Some(pos) = aref::offset_to_index(pos, haystack_char_len) {
216 pos
217 } else {
218 return Ok(false);
219 };
220 let offset = haystack.chars().take(pos).map(char::len_utf8).sum();
221 if let Some(haystack) = haystack.get(offset..) {
222 Ok(self.regex.find(haystack).is_some())
223 } else {
224 Ok(false)
225 }
226 }
227
228 fn match_(
229 &self,
230 interp: &mut Artichoke,
231 haystack: &[u8],
232 pos: Option<i64>,
233 block: Option<Block>,
234 ) -> Result<Value, Error> {
235 let haystack = str::from_utf8(haystack).map_err(|_| {
236 ArgumentError::with_message("regex crate utf8 backend for Regexp only supports UTF-8 haystacks")
237 })?;
238 regexp::clear_capture_globals(interp)?;
239 let haystack_char_len = haystack.chars().count();
240 let pos = pos.unwrap_or_default();
241 let pos = if let Some(pos) = aref::offset_to_index(pos, haystack_char_len) {
242 pos
243 } else {
244 return Ok(Value::nil());
245 };
246 let offset = haystack.chars().take(pos).map(char::len_utf8).sum();
247 let target = if let Some(haystack) = haystack.get(offset..) {
248 haystack
249 } else {
250 interp.unset_global_variable(regexp::LAST_MATCH)?;
251 interp.unset_global_variable(regexp::STRING_LEFT_OF_MATCH)?;
252 interp.unset_global_variable(regexp::STRING_RIGHT_OF_MATCH)?;
253 return Ok(Value::nil());
254 };
255 if let Some(captures) = self.regex.captures(target) {
256 interp.set_capture_group_globals(captures.len().checked_sub(1).unwrap_or_default())?;
263
264 let fullmatch = captures.get(0).as_ref().map(Match::as_str).map(str::as_bytes);
265 let value = interp.try_convert_mut(fullmatch)?;
266 interp.set_global_variable(regexp::LAST_MATCHED_STRING, &value)?;
267 for group in 1..captures.len() {
268 let capture = captures.get(group).as_ref().map(Match::as_str).map(str::as_bytes);
269 let value = interp.try_convert_mut(capture)?;
270 let group = unsafe { NonZeroUsize::new_unchecked(group) };
271 interp.set_global_variable(regexp::nth_match_group(group), &value)?;
272 }
273
274 let mut matchdata = MatchData::new(haystack.into(), Regexp::from(self.box_clone()), ..);
275 if let Some(match_pos) = captures.get(0) {
276 let pre_match = interp.try_convert_mut(&target[..match_pos.start()])?;
277 let post_match = interp.try_convert_mut(&target[match_pos.end()..])?;
278 interp.set_global_variable(regexp::STRING_LEFT_OF_MATCH, &pre_match)?;
279 interp.set_global_variable(regexp::STRING_RIGHT_OF_MATCH, &post_match)?;
280 matchdata.set_region(offset + match_pos.start()..offset + match_pos.end());
281 }
282 let data = MatchData::alloc_value(matchdata, interp)?;
283 interp.set_global_variable(regexp::LAST_MATCH, &data)?;
284 if let Some(block) = block {
285 let result = block.yield_arg(interp, &data)?;
286 Ok(result)
287 } else {
288 Ok(data)
289 }
290 } else {
291 interp.unset_global_variable(regexp::LAST_MATCH)?;
292 interp.unset_global_variable(regexp::STRING_LEFT_OF_MATCH)?;
293 interp.unset_global_variable(regexp::STRING_RIGHT_OF_MATCH)?;
294 Ok(Value::nil())
295 }
296 }
297
298 fn match_operator(&self, interp: &mut Artichoke, haystack: &[u8]) -> Result<Option<usize>, Error> {
299 let haystack = str::from_utf8(haystack).map_err(|_| {
300 ArgumentError::with_message("regex crate utf8 backend for Regexp only supports UTF-8 haystacks")
301 })?;
302 regexp::clear_capture_globals(interp)?;
303 if let Some(captures) = self.regex.captures(haystack) {
304 interp.set_capture_group_globals(captures.len().checked_sub(1).unwrap_or_default())?;
311
312 let fullmatch = captures.get(0).as_ref().map(Match::as_str).map(str::as_bytes);
313 let value = interp.try_convert_mut(fullmatch)?;
314 interp.set_global_variable(regexp::LAST_MATCHED_STRING, &value)?;
315 for group in 1..captures.len() {
316 let capture = captures.get(group).as_ref().map(Match::as_str).map(str::as_bytes);
317 let value = interp.try_convert_mut(capture)?;
318 let group = unsafe { NonZeroUsize::new_unchecked(group) };
319 interp.set_global_variable(regexp::nth_match_group(group), &value)?;
320 }
321
322 let matchdata = MatchData::new(haystack.into(), Regexp::from(self.box_clone()), ..);
323 let data = MatchData::alloc_value(matchdata, interp)?;
324 interp.set_global_variable(regexp::LAST_MATCH, &data)?;
325 if let Some(match_pos) = captures.get(0) {
326 let pre_match = interp.try_convert_mut(&haystack[..match_pos.start()])?;
327 let post_match = interp.try_convert_mut(&haystack[match_pos.end()..])?;
328 interp.set_global_variable(regexp::STRING_LEFT_OF_MATCH, &pre_match)?;
329 interp.set_global_variable(regexp::STRING_RIGHT_OF_MATCH, &post_match)?;
330 let pos = match_pos.start();
331 Ok(Some(pos))
332 } else {
333 Ok(Some(0))
334 }
335 } else {
336 interp.unset_global_variable(regexp::LAST_MATCH)?;
337 interp.unset_global_variable(regexp::STRING_LEFT_OF_MATCH)?;
338 interp.unset_global_variable(regexp::STRING_RIGHT_OF_MATCH)?;
339 Ok(None)
340 }
341 }
342
343 fn named_captures(&self) -> Result<NameToCaptureLocations, Error> {
344 let mut map = vec![];
347 for group in self.regex.capture_names().flatten() {
348 if let Some(indexes) = self.capture_indexes_for_name(group.as_bytes())? {
349 map.push((group.into(), indexes));
350 }
351 }
352 Ok(map)
353 }
354
355 fn named_captures_for_haystack(&self, haystack: &[u8]) -> Result<Option<HashMap<Vec<u8>, NilableString>>, Error> {
356 let haystack = str::from_utf8(haystack).map_err(|_| {
357 ArgumentError::with_message("regex crate utf8 backend for Regexp only supports UTF-8 haystacks")
358 })?;
359 if let Some(captures) = self.regex.captures(haystack) {
360 let mut map = HashMap::with_capacity(captures.len());
361 for (group, group_indexes) in self.named_captures()? {
362 let capture = group_indexes
363 .iter()
364 .rev()
365 .copied()
366 .find_map(|index| captures.get(index));
367 if let Some(capture) = capture {
368 map.insert(group, Some(capture.as_str().into()));
369 } else {
370 map.insert(group, None);
371 }
372 }
373 Ok(Some(map))
374 } else {
375 Ok(None)
376 }
377 }
378
379 fn names(&self) -> Vec<Vec<u8>> {
380 let mut names = vec![];
381 let mut capture_names = self.named_captures().unwrap_or_default();
382 capture_names.sort_by(|left, right| {
383 let left = left.1.iter().min().copied().unwrap_or(usize::MAX);
384 let right = right.1.iter().min().copied().unwrap_or(usize::MAX);
385 left.cmp(&right)
386 });
387 for (name, _) in capture_names {
388 if !names.contains(&name) {
389 names.push(name);
390 }
391 }
392 names
393 }
394
395 fn pos(&self, haystack: &[u8], at: usize) -> Result<Option<(usize, usize)>, Error> {
396 let haystack = str::from_utf8(haystack).map_err(|_| {
397 ArgumentError::with_message("regex crate utf8 backend for Regexp only supports UTF-8 haystacks")
398 })?;
399 let pos = self
400 .regex
401 .captures(haystack)
402 .and_then(|captures| captures.get(at))
403 .map(|match_pos| (match_pos.start(), match_pos.end()));
404 Ok(pos)
405 }
406
407 fn scan(&self, interp: &mut Artichoke, haystack: &[u8], block: Option<Block>) -> Result<Scan, Error> {
408 let haystack = str::from_utf8(haystack).map_err(|_| {
409 ArgumentError::with_message("regex crate utf8 backend for Regexp only supports UTF-8 haystacks")
410 })?;
411 regexp::clear_capture_globals(interp)?;
412 let mut matchdata = MatchData::new(haystack.into(), Regexp::from(self.box_clone()), ..);
413
414 let len = self.regex.captures_len().checked_sub(1);
416 interp.set_capture_group_globals(len.unwrap_or_default())?;
417 let len = len.and_then(NonZeroUsize::new);
418 if let Some(block) = block {
419 if let Some(len) = len {
420 let mut iter = self.regex.captures_iter(haystack).peekable();
421 if iter.peek().is_none() {
422 interp.unset_global_variable(regexp::LAST_MATCH)?;
423 return Ok(Scan::Haystack);
424 }
425 for captures in iter {
426 let matched = captures.get(0).as_ref().map(Match::as_str).map(str::as_bytes);
427 let capture = interp.try_convert_mut(matched)?;
428 interp.set_global_variable(regexp::LAST_MATCHED_STRING, &capture)?;
429
430 let mut groups = Vec::with_capacity(len.get() - 1);
431 for group in 1..=len.get() {
432 let matched = captures.get(group).as_ref().map(Match::as_str).map(str::as_bytes);
433 let capture = interp.try_convert_mut(matched)?;
434 let group = unsafe { NonZeroUsize::new_unchecked(group) };
435 interp.set_global_variable(regexp::nth_match_group(group), &capture)?;
436 groups.push(matched);
437 }
438
439 let matched = interp.try_convert_mut(groups)?;
440 if let Some(pos) = captures.get(0) {
441 matchdata.set_region(pos.start()..pos.end());
442 }
443 let data = MatchData::alloc_value(matchdata.clone(), interp)?;
444 interp.set_global_variable(regexp::LAST_MATCH, &data)?;
445 block.yield_arg(interp, &matched)?;
446 interp.set_global_variable(regexp::LAST_MATCH, &data)?;
447 }
448 } else {
449 let mut iter = self.regex.find_iter(haystack).peekable();
450 if iter.peek().is_none() {
451 interp.unset_global_variable(regexp::LAST_MATCH)?;
452 return Ok(Scan::Haystack);
453 }
454 for pos in iter {
455 let scanned = &haystack[pos.start()..pos.end()];
456 let matched = interp.try_convert_mut(scanned)?;
457 matchdata.set_region(pos.start()..pos.end());
458 let data = MatchData::alloc_value(matchdata.clone(), interp)?;
459 interp.set_global_variable(regexp::LAST_MATCH, &data)?;
460 block.yield_arg(interp, &matched)?;
461 interp.set_global_variable(regexp::LAST_MATCH, &data)?;
462 }
463 }
464 Ok(Scan::Haystack)
465 } else {
466 let mut last_pos = (0, 0);
467 if let Some(len) = len {
468 let mut collected = vec![];
469 let mut iter = self.regex.captures_iter(haystack).peekable();
470 if iter.peek().is_none() {
471 interp.unset_global_variable(regexp::LAST_MATCH)?;
472 return Ok(Scan::Collected(Vec::new()));
473 }
474 for captures in iter {
475 let mut groups = Vec::with_capacity(len.get() - 1);
476 for group in 1..=len.get() {
477 let matched = captures
478 .get(group)
479 .as_ref()
480 .map(Match::as_str)
481 .map(str::as_bytes)
482 .map(Vec::from);
483 groups.push(matched);
484 }
485
486 if let Some(pos) = captures.get(0) {
487 last_pos = (pos.start(), pos.end());
488 }
489 collected.push(groups);
490 }
491 matchdata.set_region(last_pos.0..last_pos.1);
492 let data = MatchData::alloc_value(matchdata, interp)?;
493 interp.set_global_variable(regexp::LAST_MATCH, &data)?;
494 let mut iter = collected.iter().enumerate();
495 if let Some((_, fullcapture)) = iter.next() {
496 let fullcapture = interp.try_convert_mut(fullcapture.as_slice())?;
497 interp.set_global_variable(regexp::LAST_MATCHED_STRING, &fullcapture)?;
498 }
499 for (group, capture) in iter {
500 let capture = interp.try_convert_mut(capture.as_slice())?;
501 let group = unsafe { NonZeroUsize::new_unchecked(group) };
502 interp.set_global_variable(regexp::nth_match_group(group), &capture)?;
503 }
504 Ok(Scan::Collected(collected))
505 } else {
506 let mut collected = vec![];
507 let mut iter = self.regex.find_iter(haystack).peekable();
508 if iter.peek().is_none() {
509 interp.unset_global_variable(regexp::LAST_MATCH)?;
510 return Ok(Scan::Patterns(Vec::new()));
511 }
512 for pos in iter {
513 let scanned = &haystack[pos.start()..pos.end()];
514 last_pos = (pos.start(), pos.end());
515 collected.push(Vec::from(scanned.as_bytes()));
516 }
517 matchdata.set_region(last_pos.0..last_pos.1);
518 let data = MatchData::alloc_value(matchdata, interp)?;
519 interp.set_global_variable(regexp::LAST_MATCH, &data)?;
520 let last_matched = collected.last().map(Vec::as_slice);
521 let last_matched = interp.try_convert_mut(last_matched)?;
522 interp.set_global_variable(regexp::LAST_MATCHED_STRING, &last_matched)?;
523 Ok(Scan::Patterns(collected))
524 }
525 }
526 }
527}