spinoso_regexp/lib.rs
1#![warn(clippy::all, clippy::pedantic, clippy::undocumented_unsafe_blocks)]
2#![allow(
3 clippy::let_underscore_untyped,
4 reason = "https://github.com/rust-lang/rust-clippy/pull/10442#issuecomment-1516570154"
5)]
6#![allow(
7 clippy::question_mark,
8 reason = "https://github.com/rust-lang/rust-clippy/issues/8281"
9)]
10#![allow(clippy::manual_let_else, reason = "manual_let_else was very buggy on release")]
11#![allow(clippy::missing_errors_doc, reason = "A lot of existing code fails this lint")]
12#![allow(
13 clippy::unnecessary_lazy_evaluations,
14 reason = "https://github.com/rust-lang/rust-clippy/issues/8109"
15)]
16#![allow(
17 clippy::module_name_repetitions,
18 reason = "incompatible with how code is organized in private modules"
19)]
20#![cfg_attr(
21 test,
22 allow(clippy::non_ascii_literal, reason = "tests sometimes require UTF-8 string content")
23)]
24#![allow(unknown_lints)]
25#![warn(
26 missing_copy_implementations,
27 missing_debug_implementations,
28 missing_docs,
29 rust_2024_compatibility,
30 trivial_casts,
31 trivial_numeric_casts,
32 unused_qualifications,
33 variant_size_differences
34)]
35#![expect(missing_docs, reason = "TODO: fully document crate")]
36// Enable feature callouts in generated documentation:
37// https://doc.rust-lang.org/beta/unstable-book/language-features/doc-cfg.html
38//
39// This approach is borrowed from tokio.
40#![cfg_attr(docsrs, feature(doc_cfg))]
41#![cfg_attr(docsrs, feature(doc_alias))]
42
// Ensure code blocks in `README.md` compile.
//
// The README is attached as the documentation of an otherwise empty module
// that only exists under `cfg(doctest)`, so its fenced code blocks are
// collected as documentation tests and the module is absent from normal builds.
#[cfg(doctest)]
#[doc = include_str!("../README.md")]
mod readme {}
47
48use core::fmt::{self, Write as _};
49use core::num::NonZeroUsize;
50use std::borrow::Cow;
51
52use bstr::ByteSlice;
53
54mod debug;
55mod encoding;
56mod error;
57mod named_captures;
58mod options;
59mod regexp;
60mod state;
61
62pub use debug::Debug;
63pub use encoding::{Encoding, InvalidEncodingError};
64pub use error::{ArgumentError, Error, RegexpError, SyntaxError};
65pub use named_captures::NamedCaptures;
66pub use options::{Options, RegexpOption};
67pub use regexp::regex::utf8::Utf8;
68pub use state::State;
69
70bitflags::bitflags! {
71 #[derive(Default, Debug, Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
72 pub struct Flags: u8 {
73 const IGNORECASE = 0b0000_0001;
74 const EXTENDED = 0b0000_0010;
75 const MULTILINE = 0b0000_0100;
76 const ALL_REGEXP_OPTS = Self::IGNORECASE.bits() | Self::EXTENDED.bits() | Self::MULTILINE.bits();
77
78 const FIXEDENCODING = 0b0001_0000;
79 const NOENCODING = 0b0010_0000;
80
81 const LITERAL = 0b1000_0000;
82 }
83}
84
/// Name of the global variable holding the string matched by the last
/// successful match (`$&`).
pub const LAST_MATCHED_STRING: &[u8] = b"$&";

/// Name of the global variable holding the text to the left of the last
/// successful match (`` $` ``).
pub const STRING_LEFT_OF_MATCH: &[u8] = b"$`";

/// Name of the global variable holding the text to the right of the last
/// successful match (`$'`).
pub const STRING_RIGHT_OF_MATCH: &[u8] = b"$'";

/// Name of the global variable holding the highest group matched by the last
/// successful match (`$+`).
// TODO: implement this.
pub const HIGHEST_MATCH_GROUP: &[u8] = b"$+";

/// Name of the global variable holding the information about the last match
/// in the current scope (`$~`).
pub const LAST_MATCH: &[u8] = b"$~";
100
101/// A `Source` represents the literal contents used to construct a given
102/// `Regexp`.
103///
104/// When [`Regexp`]s are constructed with a `/.../` literal, [`Regexp#source`]
105/// refers to the literal characters contained within the `/` delimiters.
106/// For example, `/\t/.source.bytes` has byte sequence `[92, 116]`.
107///
108/// When `Regexp`s are constructed with [`Regexp::compile`], [`Regexp#source`]
109/// refers to the argument passed to `compile`. For example,
110/// `Regexp.compile("\t").source.bytes` has byte sequence `[9]`.
111///
112/// [`Regexp#inspect`] prints `"/#{source}/"`.
113///
114/// [`Regexp`]: https://ruby-doc.org/core-3.1.2/Regexp.html
115/// [`Regexp#source`]: https://ruby-doc.org/core-3.1.2/Regexp.html#method-i-source
116/// [`Regexp::compile`]: https://ruby-doc.org/core-3.1.2/Regexp.html#method-c-compile
117/// [`Regexp#inspect`]: https://ruby-doc.org/core-3.1.2/Regexp.html#method-i-inspect
118#[derive(Default, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)]
119pub struct Source {
120 pattern: Vec<u8>,
121 options: Options,
122}
123
124impl fmt::Debug for Source {
125 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
126 f.debug_struct("Source")
127 .field("pattern", &self.pattern.as_bstr())
128 .field("options", &self.options)
129 .finish()
130 }
131}
132
133impl From<Config> for Source {
134 fn from(config: Config) -> Self {
135 Self::with_pattern_and_options(config.pattern, config.options)
136 }
137}
138
139impl From<&Config> for Source {
140 fn from(config: &Config) -> Self {
141 Self::with_pattern_and_options(config.pattern.clone(), config.options)
142 }
143}
144
145impl Source {
146 /// Construct a new, empty `Source`.
147 ///
148 /// # Examples
149 ///
150 /// ```
151 /// use spinoso_regexp::Source;
152 ///
153 /// const SOURCE: Source = Source::new();
154 /// assert!(SOURCE.pattern().is_empty());
155 /// assert!(SOURCE.options().as_display_modifier().is_empty());
156 /// ```
157 #[must_use]
158 pub const fn new() -> Self {
159 Self {
160 pattern: Vec::new(),
161 options: Options::new(),
162 }
163 }
164
165 /// Construct a new `Source` with the given pattern and [`Options`].
166 ///
167 /// # Examples
168 ///
169 /// ```
170 /// use spinoso_regexp::{Options, Source};
171 ///
172 /// let source = Source::with_pattern_and_options(
173 /// b"Artichoke( Ruby)?".to_vec(),
174 /// Options::with_ignore_case(),
175 /// );
176 /// assert_eq!(source.pattern(), b"Artichoke( Ruby)?");
177 /// assert_eq!(source.options().as_display_modifier(), "i");
178 /// ```
179 #[must_use]
180 pub const fn with_pattern_and_options(pattern: Vec<u8>, options: Options) -> Self {
181 Self { pattern, options }
182 }
183
184 /// Whether this source was parsed with ignore case enabled.
185 ///
186 /// # Examples
187 ///
188 /// ```
189 /// use spinoso_regexp::{Options, Source};
190 ///
191 /// let source = Source::new();
192 /// assert!(!source.is_casefold());
193 ///
194 /// let source = Source::with_pattern_and_options(
195 /// b"Artichoke( Ruby)?".to_vec(),
196 /// Options::with_ignore_case(),
197 /// );
198 /// assert!(source.is_casefold());
199 /// ```
200 #[must_use]
201 pub const fn is_casefold(&self) -> bool {
202 self.options.ignore_case().is_enabled()
203 }
204
205 /// Whether the Regexp was parsed as a literal, e.g. `'/artichoke/i`.
206 ///
207 /// This enables Ruby parsers to inject whether a Regexp is a literal to the
208 /// core library. Literal Regexps have some special behavior regarding
209 /// capturing groups and report parse failures differently.
210 ///
211 /// A source's literal flag can only be set using [`Options::try_from_int`].
212 #[must_use]
213 pub const fn is_literal(&self) -> bool {
214 self.options.is_literal()
215 }
216
217 /// Extracts a slice containing the entire pattern.
218 ///
219 /// # Examples
220 ///
221 /// ```
222 /// use spinoso_regexp::{Options, Source};
223 ///
224 /// let source = Source::with_pattern_and_options(
225 /// b"Artichoke( Ruby)?".to_vec(),
226 /// Options::with_ignore_case(),
227 /// );
228 /// assert_eq!(source.pattern(), b"Artichoke( Ruby)?");
229 /// ```
230 #[must_use]
231 pub fn pattern(&self) -> &[u8] {
232 self.pattern.as_slice()
233 }
234
235 /// Return a copy of the underlying [`Options`].
236 ///
237 /// # Examples
238 ///
239 /// ```
240 /// use spinoso_regexp::{Options, Source};
241 ///
242 /// let source = Source::with_pattern_and_options(
243 /// b"Artichoke( Ruby)?".to_vec(),
244 /// Options::with_ignore_case(),
245 /// );
246 /// assert_eq!(source.options().as_display_modifier(), "i");
247 /// ```
248 #[must_use]
249 pub const fn options(&self) -> Options {
250 self.options
251 }
252}
253
254/// A `Config` represents the parsed, expanded, and normalized pattern and
255/// options used to initialize a `Regexp`.
256///
257/// A `Config` is derived from a [`Source`].
258///
259/// When a `Regexp` is cloned, it is cloned from its compiled `Config`.
260#[derive(Default, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)]
261pub struct Config {
262 pattern: Vec<u8>,
263 options: Options,
264}
265
266impl fmt::Debug for Config {
267 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
268 f.debug_struct("Config")
269 .field("pattern", &self.pattern.as_bstr())
270 .field("options", &self.options)
271 .finish()
272 }
273}
274
275impl From<Source> for Config {
276 fn from(source: Source) -> Self {
277 Self::with_pattern_and_options(source.pattern, source.options)
278 }
279}
280
281impl From<&Source> for Config {
282 fn from(source: &Source) -> Self {
283 Self::with_pattern_and_options(source.pattern.clone(), source.options)
284 }
285}
286
287impl Config {
288 /// Construct a new, empty `Config`.
289 ///
290 /// # Examples
291 ///
292 /// ```
293 /// use spinoso_regexp::Config;
294 ///
295 /// const CONFIG: Config = Config::new();
296 /// assert!(CONFIG.pattern().is_empty());
297 /// assert!(CONFIG.options().as_display_modifier().is_empty());
298 /// ```
299 #[must_use]
300 pub const fn new() -> Self {
301 Self {
302 pattern: Vec::new(),
303 options: Options::new(),
304 }
305 }
306
307 /// Construct a new `Config` with the given pattern and [`Options`].
308 ///
309 /// # Examples
310 ///
311 /// ```
312 /// use spinoso_regexp::{Config, Options};
313 ///
314 /// let config = Config::with_pattern_and_options(
315 /// b"Artichoke( Ruby)?".to_vec(),
316 /// Options::with_ignore_case(),
317 /// );
318 /// assert_eq!(config.pattern(), b"Artichoke( Ruby)?");
319 /// assert_eq!(config.options().as_display_modifier(), "i");
320 /// ```
321 #[must_use]
322 pub const fn with_pattern_and_options(pattern: Vec<u8>, options: Options) -> Self {
323 Self { pattern, options }
324 }
325
326 /// Extracts a slice containing the entire pattern.
327 ///
328 /// # Examples
329 ///
330 /// ```
331 /// use spinoso_regexp::{Config, Options};
332 ///
333 /// let config = Config::with_pattern_and_options(
334 /// b"Artichoke( Ruby)?".to_vec(),
335 /// Options::with_ignore_case(),
336 /// );
337 /// assert_eq!(config.pattern(), b"Artichoke( Ruby)?");
338 /// ```
339 #[must_use]
340 pub fn pattern(&self) -> &[u8] {
341 self.pattern.as_slice()
342 }
343
344 /// Return a copy of the underlying [`Options`].
345 ///
346 /// # Examples
347 ///
348 /// ```
349 /// use spinoso_regexp::{Config, Options};
350 ///
351 /// let config = Config::with_pattern_and_options(
352 /// b"Artichoke( Ruby)?".to_vec(),
353 /// Options::with_ignore_case(),
354 /// );
355 /// assert_eq!(config.options().as_display_modifier(), "i");
356 /// ```
357 #[must_use]
358 pub const fn options(&self) -> Options {
359 self.options
360 }
361}
362
/// Global variable name for the nth capture group from a `Regexp` match.
///
/// Ruby tags captures from the last `Regexp` match with global variables of the
/// form `$1`, `$2`, `$3`, etc. This function accepts [`NonZeroUsize`] because
/// `$0` is not a valid `Regexp` capture group name in Ruby (`$0` refers to the
/// program name).
///
/// Groups `1` through `20` resolve to a borrowed `&'static str`; every larger
/// group is formatted into an owned [`String`]. This function differs from
/// [`nth_match_group_bytes`] by returning `Cow<'static, str>`.
///
/// # Examples
///
/// ```
/// use core::num::NonZeroUsize;
///
/// use spinoso_regexp::nth_match_group;
///
/// # fn example() -> Option<()> {
/// let group = NonZeroUsize::new(1)?;
/// let global_name = nth_match_group(group);
/// assert_eq!(&*global_name, "$1");
///
/// let group = NonZeroUsize::new(27)?;
/// let global_name = nth_match_group(group);
/// assert_eq!(&*global_name, "$27");
/// # Some(())
/// # }
/// # example().unwrap();
/// ```
#[must_use]
pub fn nth_match_group(group: NonZeroUsize) -> Cow<'static, str> {
    match group.get() {
        1 => Cow::Borrowed("$1"),
        2 => Cow::Borrowed("$2"),
        3 => Cow::Borrowed("$3"),
        4 => Cow::Borrowed("$4"),
        5 => Cow::Borrowed("$5"),
        6 => Cow::Borrowed("$6"),
        7 => Cow::Borrowed("$7"),
        8 => Cow::Borrowed("$8"),
        9 => Cow::Borrowed("$9"),
        10 => Cow::Borrowed("$10"),
        11 => Cow::Borrowed("$11"),
        12 => Cow::Borrowed("$12"),
        13 => Cow::Borrowed("$13"),
        14 => Cow::Borrowed("$14"),
        15 => Cow::Borrowed("$15"),
        16 => Cow::Borrowed("$16"),
        17 => Cow::Borrowed("$17"),
        18 => Cow::Borrowed("$18"),
        19 => Cow::Borrowed("$19"),
        20 => Cow::Borrowed("$20"),
        num => {
            let mut buf = String::new();
            // Suppress formatting errors because this function is infallible.
            //
            // In practice `write!` will never error here because the
            // `fmt::Write` impl for `String` never returns an error.
            let _ignored = write!(&mut buf, "${num}");
            Cow::Owned(buf)
        }
    }
}
427
428/// Global variable name for the nth capture group from a `Regexp` match.
429///
430/// Ruby tags captures from the last `Regexp` match with global variables of the
431/// form `$1`, `$2`, `$3`, etc. This function accepts [`NonZeroUsize`] because
432/// `$0` is not a valid `Regexp` capture group name in Ruby (`$0` refers to the
433/// program name).
434///
435/// This function may return either a `&'static [u8]` or an owned [`Vec<u8>`]
436/// for a given capture group name. This function differs from
437/// [`nth_match_group`] by returning `Cow<'static, [u8]>`.
438///
439/// # Examples
440///
441/// ```
442/// use core::num::NonZeroUsize;
443///
444/// use spinoso_regexp::nth_match_group_bytes;
445///
446/// # fn example() -> Option<()> {
447/// let group = NonZeroUsize::new(1)?;
448/// let global_name = nth_match_group_bytes(group);
449/// assert_eq!(&*global_name, b"$1");
450///
451/// let group = NonZeroUsize::new(27)?;
452/// let global_name = nth_match_group_bytes(group);
453/// assert_eq!(&*global_name, b"$27");
454/// # None
455/// # }
456/// ```
457///
458/// [`Vec<u8>`]: std::vec::Vec
459#[must_use]
460pub fn nth_match_group_bytes(group: NonZeroUsize) -> Cow<'static, [u8]> {
461 match nth_match_group(group) {
462 Cow::Borrowed(s) => Cow::Borrowed(s.as_bytes()),
463 Cow::Owned(s) => Cow::Owned(s.into_bytes()),
464 }
465}
466
#[cfg(test)]
mod tests {
    use core::num::NonZeroUsize;
    use core::ops::RangeInclusive;
    use std::borrow::Cow;

    use super::{nth_match_group, nth_match_group_bytes};

    /// Yield every index in `range` as a capture group number.
    fn groups(range: RangeInclusive<usize>) -> impl Iterator<Item = NonZeroUsize> {
        range.map(|index| NonZeroUsize::new(index).unwrap())
    }

    #[test]
    fn match_group_symbol() {
        for group in groups(1..=1024) {
            let name = nth_match_group(group);
            // The name is a `$` sigil followed by the decimal group number.
            assert!(name.len() > 1);
            let (sigil, digits) = name.split_at(1);
            assert_eq!(sigil, "$");
            assert_eq!(digits, group.to_string());
        }
    }

    #[test]
    fn some_globals_are_static_slices() {
        // The first twenty names are static and must not allocate.
        assert!(groups(1..=20).all(|group| matches!(nth_match_group(group), Cow::Borrowed(_))));
        // Everything past that is formatted on demand.
        assert!(groups(21..=1024).all(|group| matches!(nth_match_group(group), Cow::Owned(_))));
    }

    #[test]
    fn nth_group_matches_nth_group_bytes() {
        for group in groups(1..=1024) {
            let as_str = nth_match_group(group);
            let as_bytes = nth_match_group_bytes(group);
            assert_eq!(as_str.as_bytes(), &as_bytes[..]);
        }
    }
}