1use core::fmt;
2use core::str;
3use std::collections::HashSet;
4
5use regex::{Match, Regex, RegexBuilder};
6use scolapasta_string_escape::format_debug_escape_into;
7
8use crate::debug::Debug;
9use crate::encoding::Encoding;
10use crate::error::{ArgumentError, Error, RegexpError, SyntaxError};
11use crate::named_captures::{NamedCapture, NamedCaptures, NamedCapturesForHaystack};
12use crate::{Config, Source};
13
14mod iter;
15
16pub use iter::{CaptureIndices, Captures};
17
/// A compiled regular expression whose pattern and haystacks are handled as
/// UTF-8 text and which is executed by a [`Regex`].
#[derive(Debug, Clone)]
pub struct Utf8 {
    /// Pattern and options as supplied by the caller; backs the debug
    /// representation and the `is_literal` query.
    source: Source,
    /// Pattern and options handed to the regex builder; also backs `Display`
    /// and `string`.
    config: Config,
    /// Encoding tag returned by `encoding` and appended to the debug output.
    encoding: Encoding,
    /// The compiled matcher every search method delegates to.
    regex: Regex,
}
25
26impl fmt::Display for Utf8 {
27 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
28 let pattern = self.config.pattern();
29 format_debug_escape_into(f, pattern)?;
30 Ok(())
31 }
32}
33
34impl Utf8 {
35 pub fn with_literal_derived_encoding(source: Source, config: Config, encoding: Encoding) -> Result<Self, Error> {
67 let pattern = str::from_utf8(config.pattern()).map_err(|_| ArgumentError::unsupported_pattern_encoding())?;
68 let mut builder = RegexBuilder::new(pattern);
69 builder.case_insensitive(config.options.ignore_case().is_enabled());
70 builder.multi_line(config.options.multiline().is_enabled());
71 builder.ignore_whitespace(config.options.extended().is_enabled());
72
73 let regex = match builder.build() {
74 Ok(regex) => regex,
75 Err(err) if source.options.is_literal() => {
76 return Err(SyntaxError::from(err.to_string()).into());
77 }
78 Err(err) => return Err(RegexpError::from(err.to_string()).into()),
79 };
80 let regexp = Self {
81 source,
82 config,
83 encoding,
84 regex,
85 };
86 Ok(regexp)
87 }
88
89 pub fn captures<'a>(&self, haystack: &'a [u8]) -> Result<Option<Captures<'a>>, Error> {
93 let haystack = str::from_utf8(haystack).map_err(|_| ArgumentError::unsupported_haystack_encoding())?;
94 Ok(self.regex.captures(haystack).map(Captures::from))
95 }
96
97 pub fn capture_indices_for_name<'a, 'b>(&'a self, name: &'b [u8]) -> CaptureIndices<'a, 'b> {
98 CaptureIndices::with_name_and_iter(name, self.regex.capture_names())
99 }
100
101 #[must_use]
103 pub fn captures_len(&self) -> usize {
104 self.regex.captures_len()
105 }
106
107 pub fn capture_count_for_haystack(&self, haystack: &[u8]) -> Result<usize, ArgumentError> {
118 let haystack = str::from_utf8(haystack).map_err(|_| ArgumentError::unsupported_haystack_encoding())?;
119 if let Some(captures) = self.regex.captures(haystack) {
120 Ok(captures.len())
121 } else {
122 Ok(0)
123 }
124 }
125
126 pub fn entire_match<'a>(&self, haystack: &'a [u8]) -> Result<Option<&'a [u8]>, Error> {
134 let haystack = str::from_utf8(haystack).map_err(|_| ArgumentError::unsupported_haystack_encoding())?;
135 if let Some(captures) = self.regex.captures(haystack) {
136 let entire_match = captures.get(0);
137 Ok(entire_match.as_ref().map(Match::as_str).map(str::as_bytes))
138 } else {
139 Ok(None)
140 }
141 }
142
143 pub fn named_captures(&self) -> NamedCaptures {
149 let mut map = vec![];
152 for group in self.regex.capture_names().flatten() {
153 let indices = self.capture_indices_for_name(group.as_bytes()).collect::<Vec<_>>();
154 if !indices.is_empty() {
155 map.push(NamedCapture::new(group.into(), indices));
156 }
157 }
158 map.into()
159 }
160
161 pub fn named_captures_for_haystack(&self, haystack: &[u8]) -> Result<Option<NamedCapturesForHaystack>, Error> {
165 let haystack = str::from_utf8(haystack).map_err(|_| ArgumentError::unsupported_haystack_encoding())?;
166 let captures = if let Some(captures) = self.regex.captures(haystack) {
167 captures
168 } else {
169 return Ok(None);
170 };
171 let mut map = NamedCapturesForHaystack::with_capacity(captures.len());
172 for named_capture in self.named_captures() {
173 let (group, indices) = named_capture.into_group_and_indices();
174 let capture = indices.iter().rev().copied().find_map(|index| captures.get(index));
175 if let Some(capture) = capture {
176 map.insert(group, Some(capture.as_str().into()));
177 } else {
178 map.insert(group, None);
179 }
180 }
181 Ok(Some(map))
182 }
183
184 #[must_use]
185 pub fn names(&self) -> Vec<Vec<u8>> {
186 let mut names = vec![];
187 let mut capture_names = self.named_captures().collect::<Vec<_>>();
188 capture_names.sort_by(|left, right| {
189 let left = left.indices().iter().min().copied().unwrap_or(usize::MAX);
190 let right = right.indices().iter().min().copied().unwrap_or(usize::MAX);
191 left.cmp(&right)
192 });
193 let mut set = HashSet::with_capacity(capture_names.len());
194 for cn in capture_names {
195 let name = cn.into_group();
196 if set.contains(&name) {
197 continue;
198 }
199 names.push(name.clone());
200 set.insert(name);
201 }
202 names
203 }
204
205 pub fn pos(&self, haystack: &[u8], at: usize) -> Result<Option<(usize, usize)>, Error> {
209 let haystack = str::from_utf8(haystack).map_err(|_| ArgumentError::unsupported_haystack_encoding())?;
210 let pos = self
211 .regex
212 .captures(haystack)
213 .and_then(|captures| captures.get(at))
214 .map(|match_pos| (match_pos.start(), match_pos.end()));
215 Ok(pos)
216 }
217
218 pub fn is_match(&self, haystack: &[u8], pos: Option<i64>) -> Result<bool, Error> {
227 let haystack = str::from_utf8(haystack).map_err(|_| ArgumentError::unsupported_haystack_encoding())?;
228 let haystack_char_len = haystack.chars().count();
229 let pos = pos.unwrap_or_default();
230 let pos = if let Some(pos) = scolapasta_aref::offset_to_index(pos, haystack_char_len) {
231 pos
232 } else {
233 return Ok(false);
234 };
235 let offset = haystack.chars().take(pos).map(char::len_utf8).sum();
236 let haystack = &haystack[offset..];
237 Ok(self.regex.find(haystack).is_some())
238 }
239
240 pub fn debug(&self) -> Debug<'_> {
241 Debug::new(
242 self.source.pattern(),
243 self.source.options.as_display_modifier(),
244 self.encoding.as_modifier_str(),
245 )
246 }
247
248 #[must_use]
249 pub fn is_literal(&self) -> bool {
250 self.source.options().is_literal()
251 }
252
253 #[must_use]
254 pub fn source(&self) -> &Source {
255 &self.source
256 }
257
258 #[must_use]
259 pub fn config(&self) -> &Config {
260 &self.config
261 }
262
263 #[must_use]
264 pub fn encoding(&self) -> Encoding {
265 self.encoding
266 }
267
268 #[must_use]
269 pub fn string(&self) -> &[u8] {
270 self.config.pattern()
271 }
272}
273
274#[cfg(test)]
275mod tests {
276 use bstr::{B, ByteSlice};
277
278 use super::Utf8;
279 use crate::{Config, Encoding, Error, Flags, Options, Source};
280
281 fn make(pattern: impl AsRef<[u8]>, options: Option<Options>, encoding: Encoding) -> Utf8 {
282 let source = Source::with_pattern_and_options(pattern.as_ref().to_vec(), options.unwrap_or_default());
283 let config = Config::from(&source);
284 Utf8::with_literal_derived_encoding(source, config, encoding).unwrap()
285 }
286
287 #[test]
288 fn can_compile_posix_character_classes() {
289 let regexp = make("[[:digit:]][[:space:]][[:alpha:]][[:punct:]]", None, Encoding::None);
290 assert!(regexp.is_match(b"1 a&", None).unwrap());
291 }
292
293 #[test]
294 fn can_compile_perl_unicode_patterns() {
295 let regexp = make(r"\d+ \d+", None, Encoding::None);
296 let haystack = "123 १०೩೬";
305 assert!(regexp.is_match(haystack.as_bytes(), None).unwrap());
306 }
307
308 #[test]
309 fn requires_utf8_encoding_for_pattern() {
310 let source = Source::with_pattern_and_options(b"abc \xFF\xFE 123".to_vec(), Options::default());
311 let config = Config::from(&source);
312 let err = Utf8::with_literal_derived_encoding(source, config, Encoding::None).unwrap_err();
313 assert!(matches!(err, Error::Argument(err) if err.message() == "Unsupported pattern encoding"));
314 }
315
316 #[test]
317 fn invalid_pattern_is_syntax_error_for_literal() {
318 let options = Options::from(Flags::LITERAL);
319 let source = Source::with_pattern_and_options(b"[".to_vec(), options);
320 let config = Config::from(&source);
321 let err = Utf8::with_literal_derived_encoding(source, config, Encoding::None).unwrap_err();
322 assert!(matches!(err, Error::Syntax(..)));
323 }
324
325 #[test]
326 fn invalid_pattern_is_syntax_error_for_compiled() {
327 let options = Options::from(Flags::ALL_REGEXP_OPTS);
328 let source = Source::with_pattern_and_options(b"[".to_vec(), options);
329 let config = Config::from(&source);
330 let err = Utf8::with_literal_derived_encoding(source, config, Encoding::None).unwrap_err();
331 assert!(matches!(err, Error::Regexp(..)));
332 }
333
334 #[test]
335 fn literal_pattern_backrefs_are_not_supported() {
336 let options = Options::from(Flags::LITERAL);
337 let source = Source::with_pattern_and_options(br"\0".to_vec(), options);
338 let config = Config::from(&source);
339 let err = Utf8::with_literal_derived_encoding(source, config, Encoding::None).unwrap_err();
340 assert!(matches!(err, Error::Syntax(err) if err.message().contains("backreferences are not supported")));
341 }
342
343 #[test]
344 fn compiled_pattern_backrefs_are_not_supported() {
345 let options = Options::from(Flags::ALL_REGEXP_OPTS);
346 let source = Source::with_pattern_and_options(br"\0".to_vec(), options);
347 let config = Config::from(&source);
348 let err = Utf8::with_literal_derived_encoding(source, config, Encoding::None).unwrap_err();
349 assert!(matches!(err, Error::Regexp(err) if err.message().contains("backreferences are not supported")));
350 }
351
352 #[test]
353 fn is_literal() {
354 let options = Options::from(Flags::LITERAL);
355 let regexp = make("abc", Some(options), Encoding::None);
356 assert!(regexp.is_literal());
357
358 let options = Options::from(Flags::empty());
359 let regexp = make("abc", Some(options), Encoding::None);
360 assert!(!regexp.is_literal());
361
362 let options = Options::from(Flags::ALL_REGEXP_OPTS);
363 let regexp = make("abc", Some(options), Encoding::None);
364 assert!(!regexp.is_literal());
365
366 let regexp = make("abc", None, Encoding::None);
367 assert!(!regexp.is_literal());
368 }
369
370 #[test]
371 fn string() {
372 let test_cases = [
373 ("abc", B("abc")),
374 ("xyz", B("xyz")),
375 ("🦀", B("🦀")),
376 ("铁锈", B("铁锈")),
377 ];
378 for (pattern, string) in test_cases {
379 let regexp = make(pattern, None, Encoding::None);
380 assert_eq!(
381 regexp.string().as_bstr(),
382 string.as_bstr(),
383 "Mismatched string for pattern"
384 );
385 }
386 }
387
388 #[test]
389 fn fmt_display() {
390 let test_cases = [
391 (B("abc"), "abc"),
392 (B("xyz"), "xyz"),
393 (B("🦀"), "🦀"),
394 (B("铁锈"), "铁锈"),
395 ];
401 for (pattern, display) in test_cases {
402 let regexp = make(pattern, None, Encoding::None);
403 assert_eq!(regexp.to_string(), display, "Mismatched display impl for pattern");
404 }
405 }
406
407 #[test]
408 fn debug() {
409 let test_cases = [
410 (B("\0"), r"/\x00/", Options::default()),
411 (B("\0"), r"/\x00/mix", Options::from(Flags::ALL_REGEXP_OPTS)),
412 (B("\0"), r"/\x00/ix", Options::from(Flags::IGNORECASE | Flags::EXTENDED)),
413 (B("\0"), r"/\x00/m", Options::from(Flags::MULTILINE)),
414 (B(b"\x0a"), "/\n/", Options::default()),
415 (B("\x0B"), "/\x0B/", Options::default()),
416 (B("\n\r\t"), "/\n\r\t/", Options::default()),
418 (B("\n\r\t"), "/\n\r\t/mix", Options::from(Flags::ALL_REGEXP_OPTS)),
419 (
420 B("\n\r\t"),
421 "/\n\r\t/ix",
422 Options::from(Flags::IGNORECASE | Flags::EXTENDED),
423 ),
424 (B("\n\r\t"), "/\n\r\t/m", Options::from(Flags::MULTILINE)),
425 (B("\x7F"), r"/\x7F/", Options::default()),
426 (B("\x7F"), r"/\x7F/mix", Options::from(Flags::ALL_REGEXP_OPTS)),
427 (
428 B("\x7F"),
429 r"/\x7F/ix",
430 Options::from(Flags::IGNORECASE | Flags::EXTENDED),
431 ),
432 (B("\x7F"), r"/\x7F/m", Options::from(Flags::MULTILINE)),
433 (B(r"\a"), r"/\a/", Options::default()),
434 (B(r"\a"), r"/\a/mix", Options::from(Flags::ALL_REGEXP_OPTS)),
435 (B(r"\a"), r"/\a/ix", Options::from(Flags::IGNORECASE | Flags::EXTENDED)),
436 (B(r"\a"), r"/\a/m", Options::from(Flags::MULTILINE)),
437 (B("abc"), "/abc/", Options::default()),
438 (B("abc"), "/abc/mix", Options::from(Flags::ALL_REGEXP_OPTS)),
439 (B("abc"), "/abc/ix", Options::from(Flags::IGNORECASE | Flags::EXTENDED)),
440 (B("abc"), "/abc/m", Options::from(Flags::MULTILINE)),
441 (B("a+b*c"), "/a+b*c/mix", Options::from(Flags::ALL_REGEXP_OPTS)),
442 (B("xyz"), "/xyz/", Options::default()),
443 (B("xyz"), "/xyz/mix", Options::from(Flags::ALL_REGEXP_OPTS)),
444 (B("xyz"), "/xyz/ix", Options::from(Flags::IGNORECASE | Flags::EXTENDED)),
445 (B("xyz"), "/xyz/m", Options::from(Flags::MULTILINE)),
446 (B("x+y*z"), "/x+y*z/mix", Options::from(Flags::ALL_REGEXP_OPTS)),
447 (B("🦀💎"), "/🦀💎/", Options::default()),
448 (B("🦀💎"), "/🦀💎/mix", Options::from(Flags::ALL_REGEXP_OPTS)),
449 (
450 B("🦀💎"),
451 "/🦀💎/ix",
452 Options::from(Flags::IGNORECASE | Flags::EXTENDED),
453 ),
454 (B("🦀💎"), "/🦀💎/m", Options::from(Flags::MULTILINE)),
455 (B("🦀+💎*"), "/🦀+💎*/mix", Options::from(Flags::ALL_REGEXP_OPTS)),
456 (B("铁锈"), "/铁锈/", Options::default()),
457 (B("铁锈"), "/铁锈/mix", Options::from(Flags::ALL_REGEXP_OPTS)),
458 (
459 B("铁锈"),
460 "/铁锈/ix",
461 Options::from(Flags::IGNORECASE | Flags::EXTENDED),
462 ),
463 (B("铁锈"), "/铁锈/m", Options::from(Flags::MULTILINE)),
464 (B("铁+锈*"), "/铁+锈*/mix", Options::from(Flags::ALL_REGEXP_OPTS)),
465 ];
471 for (pattern, debug, options) in test_cases {
472 let regexp = make(pattern, Some(options), Encoding::None);
473 assert_eq!(
474 regexp.debug().collect::<String>(),
475 debug,
476 "Mismatched debug iterator for pattern"
477 );
478 }
479 }
480}