spinoso_regexp/
lib.rs

1#![warn(clippy::all, clippy::pedantic, clippy::undocumented_unsafe_blocks)]
2#![allow(
3    clippy::let_underscore_untyped,
4    reason = "https://github.com/rust-lang/rust-clippy/pull/10442#issuecomment-1516570154"
5)]
6#![allow(
7    clippy::question_mark,
8    reason = "https://github.com/rust-lang/rust-clippy/issues/8281"
9)]
10#![allow(clippy::manual_let_else, reason = "manual_let_else was very buggy on release")]
11#![allow(clippy::missing_errors_doc, reason = "A lot of existing code fails this lint")]
12#![allow(
13    clippy::unnecessary_lazy_evaluations,
14    reason = "https://github.com/rust-lang/rust-clippy/issues/8109"
15)]
16#![allow(
17    clippy::module_name_repetitions,
18    reason = "incompatible with how code is organized in private modules"
19)]
20#![cfg_attr(
21    test,
22    allow(clippy::non_ascii_literal, reason = "tests sometimes require UTF-8 string content")
23)]
24#![allow(unknown_lints)]
25#![warn(
26    missing_copy_implementations,
27    missing_debug_implementations,
28    missing_docs,
29    rust_2024_compatibility,
30    trivial_casts,
31    trivial_numeric_casts,
32    unused_qualifications,
33    variant_size_differences
34)]
35#![expect(missing_docs, reason = "TODO: fully document crate")]
36// Enable feature callouts in generated documentation:
37// https://doc.rust-lang.org/beta/unstable-book/language-features/doc-cfg.html
38//
39// This approach is borrowed from tokio.
40#![cfg_attr(docsrs, feature(doc_cfg))]
41#![cfg_attr(docsrs, feature(doc_alias))]
42
// Ensure code blocks in `README.md` compile.
//
// The README is attached as the documentation of an otherwise empty module, and
// the module only exists under `cfg(doctest)`, so it has no effect on regular
// builds.
#[cfg(doctest)]
#[doc = include_str!("../README.md")]
mod readme {}
47
48use core::fmt::{self, Write as _};
49use core::num::NonZeroUsize;
50use std::borrow::Cow;
51
52use bstr::ByteSlice;
53
54mod debug;
55mod encoding;
56mod error;
57mod named_captures;
58mod options;
59mod regexp;
60mod state;
61
62pub use debug::Debug;
63pub use encoding::{Encoding, InvalidEncodingError};
64pub use error::{ArgumentError, Error, RegexpError, SyntaxError};
65pub use named_captures::NamedCaptures;
66pub use options::{Options, RegexpOption};
67pub use regexp::regex::utf8::Utf8;
68pub use state::State;
69
bitflags::bitflags! {
    /// Bit flags describing the options a `Regexp` was constructed with,
    /// packed into a single `u8`.
    ///
    /// NOTE(review): the numeric values of `IGNORECASE`, `EXTENDED`,
    /// `MULTILINE`, `FIXEDENCODING` and `NOENCODING` appear to mirror the
    /// same-named constants on Ruby's `Regexp` class -- confirm before
    /// changing any bit assignment.
    #[derive(Default, Debug, Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
    pub struct Flags: u8 {
        /// Ignore case option (bit 0).
        const IGNORECASE      = 0b0000_0001;
        /// Extended option (bit 1).
        const EXTENDED        = 0b0000_0010;
        /// Multiline option (bit 2).
        const MULTILINE       = 0b0000_0100;
        /// Union of `IGNORECASE`, `EXTENDED` and `MULTILINE`.
        const ALL_REGEXP_OPTS = Self::IGNORECASE.bits() | Self::EXTENDED.bits() | Self::MULTILINE.bits();

        /// Fixed encoding modifier (bit 4).
        const FIXEDENCODING   = 0b0001_0000;
        /// No encoding modifier (bit 5).
        const NOENCODING      = 0b0010_0000;

        /// Literal marker (bit 7).
        ///
        /// NOTE(review): presumably set when the `Regexp` was parsed from a
        /// literal -- confirm against the `Options` parsing code.
        const LITERAL         = 0b1000_0000;
    }
}
84
/// Name of the global variable, `$&`, that holds the string matched by the
/// last successful match.
pub const LAST_MATCHED_STRING: &[u8] = b"$&";

/// Name of the global variable, `` $` ``, that holds the string to the left of
/// the last successful match.
pub const STRING_LEFT_OF_MATCH: &[u8] = b"$`";

/// Name of the global variable, `$'`, that holds the string to the right of
/// the last successful match.
pub const STRING_RIGHT_OF_MATCH: &[u8] = b"$'";

/// Name of the global variable, `$+`, that holds the highest group matched by
/// the last successful match.
// TODO: implement this.
pub const HIGHEST_MATCH_GROUP: &[u8] = b"$+";

/// Name of the global variable, `$~`, that holds the information about the
/// last match in the current scope.
pub const LAST_MATCH: &[u8] = b"$~";
100
/// A `Source` represents the literal contents used to construct a given
/// `Regexp`.
///
/// When [`Regexp`]s are constructed with a `/.../` literal, [`Regexp#source`]
/// refers to the literal characters contained within the `/` delimiters.
/// For example, `/\t/.source.bytes` has byte sequence `[92, 116]`.
///
/// When `Regexp`s are constructed with [`Regexp::compile`], [`Regexp#source`]
/// refers to the argument passed to `compile`. For example,
/// `Regexp.compile("\t").source.bytes` has byte sequence `[9]`.
///
/// [`Regexp#inspect`] prints `"/#{source}/"`.
///
/// A `Source` stores its pattern as bytes and carries the [`Options`] it was
/// constructed with. It converts to and from [`Config`] via `From`.
///
/// [`Regexp`]: https://ruby-doc.org/core-3.1.2/Regexp.html
/// [`Regexp#source`]: https://ruby-doc.org/core-3.1.2/Regexp.html#method-i-source
/// [`Regexp::compile`]: https://ruby-doc.org/core-3.1.2/Regexp.html#method-c-compile
/// [`Regexp#inspect`]: https://ruby-doc.org/core-3.1.2/Regexp.html#method-i-inspect
#[derive(Default, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)]
pub struct Source {
    /// Pattern bytes as supplied to the constructor.
    pattern: Vec<u8>,
    /// Options supplied alongside the pattern.
    options: Options,
}
123
124impl fmt::Debug for Source {
125    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
126        f.debug_struct("Source")
127            .field("pattern", &self.pattern.as_bstr())
128            .field("options", &self.options)
129            .finish()
130    }
131}
132
133impl From<Config> for Source {
134    fn from(config: Config) -> Self {
135        Self::with_pattern_and_options(config.pattern, config.options)
136    }
137}
138
139impl From<&Config> for Source {
140    fn from(config: &Config) -> Self {
141        Self::with_pattern_and_options(config.pattern.clone(), config.options)
142    }
143}
144
145impl Source {
146    /// Construct a new, empty `Source`.
147    ///
148    /// # Examples
149    ///
150    /// ```
151    /// use spinoso_regexp::Source;
152    ///
153    /// const SOURCE: Source = Source::new();
154    /// assert!(SOURCE.pattern().is_empty());
155    /// assert!(SOURCE.options().as_display_modifier().is_empty());
156    /// ```
157    #[must_use]
158    pub const fn new() -> Self {
159        Self {
160            pattern: Vec::new(),
161            options: Options::new(),
162        }
163    }
164
165    /// Construct a new `Source` with the given pattern and [`Options`].
166    ///
167    /// # Examples
168    ///
169    /// ```
170    /// use spinoso_regexp::{Options, Source};
171    ///
172    /// let source = Source::with_pattern_and_options(
173    ///     b"Artichoke( Ruby)?".to_vec(),
174    ///     Options::with_ignore_case(),
175    /// );
176    /// assert_eq!(source.pattern(), b"Artichoke( Ruby)?");
177    /// assert_eq!(source.options().as_display_modifier(), "i");
178    /// ```
179    #[must_use]
180    pub const fn with_pattern_and_options(pattern: Vec<u8>, options: Options) -> Self {
181        Self { pattern, options }
182    }
183
184    /// Whether this source was parsed with ignore case enabled.
185    ///
186    /// # Examples
187    ///
188    /// ```
189    /// use spinoso_regexp::{Options, Source};
190    ///
191    /// let source = Source::new();
192    /// assert!(!source.is_casefold());
193    ///
194    /// let source = Source::with_pattern_and_options(
195    ///     b"Artichoke( Ruby)?".to_vec(),
196    ///     Options::with_ignore_case(),
197    /// );
198    /// assert!(source.is_casefold());
199    /// ```
200    #[must_use]
201    pub const fn is_casefold(&self) -> bool {
202        self.options.ignore_case().is_enabled()
203    }
204
205    /// Whether the Regexp was parsed as a literal, e.g. `'/artichoke/i`.
206    ///
207    /// This enables Ruby parsers to inject whether a Regexp is a literal to the
208    /// core library. Literal Regexps have some special behavior regarding
209    /// capturing groups and report parse failures differently.
210    ///
211    /// A source's literal flag can only be set using [`Options::try_from_int`].
212    #[must_use]
213    pub const fn is_literal(&self) -> bool {
214        self.options.is_literal()
215    }
216
217    /// Extracts a slice containing the entire pattern.
218    ///
219    /// # Examples
220    ///
221    /// ```
222    /// use spinoso_regexp::{Options, Source};
223    ///
224    /// let source = Source::with_pattern_and_options(
225    ///     b"Artichoke( Ruby)?".to_vec(),
226    ///     Options::with_ignore_case(),
227    /// );
228    /// assert_eq!(source.pattern(), b"Artichoke( Ruby)?");
229    /// ```
230    #[must_use]
231    pub fn pattern(&self) -> &[u8] {
232        self.pattern.as_slice()
233    }
234
235    /// Return a copy of the underlying [`Options`].
236    ///
237    /// # Examples
238    ///
239    /// ```
240    /// use spinoso_regexp::{Options, Source};
241    ///
242    /// let source = Source::with_pattern_and_options(
243    ///     b"Artichoke( Ruby)?".to_vec(),
244    ///     Options::with_ignore_case(),
245    /// );
246    /// assert_eq!(source.options().as_display_modifier(), "i");
247    /// ```
248    #[must_use]
249    pub const fn options(&self) -> Options {
250        self.options
251    }
252}
253
/// A `Config` represents the parsed, expanded, and normalized pattern and
/// options used to initialize a `Regexp`.
///
/// A `Config` is derived from a [`Source`].
///
/// When a `Regexp` is cloned, it is cloned from its compiled `Config`.
#[derive(Default, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)]
pub struct Config {
    /// Pattern bytes as supplied to the constructor.
    pattern: Vec<u8>,
    /// Options supplied alongside the pattern.
    options: Options,
}
265
266impl fmt::Debug for Config {
267    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
268        f.debug_struct("Config")
269            .field("pattern", &self.pattern.as_bstr())
270            .field("options", &self.options)
271            .finish()
272    }
273}
274
275impl From<Source> for Config {
276    fn from(source: Source) -> Self {
277        Self::with_pattern_and_options(source.pattern, source.options)
278    }
279}
280
281impl From<&Source> for Config {
282    fn from(source: &Source) -> Self {
283        Self::with_pattern_and_options(source.pattern.clone(), source.options)
284    }
285}
286
287impl Config {
288    /// Construct a new, empty `Config`.
289    ///
290    /// # Examples
291    ///
292    /// ```
293    /// use spinoso_regexp::Config;
294    ///
295    /// const CONFIG: Config = Config::new();
296    /// assert!(CONFIG.pattern().is_empty());
297    /// assert!(CONFIG.options().as_display_modifier().is_empty());
298    /// ```
299    #[must_use]
300    pub const fn new() -> Self {
301        Self {
302            pattern: Vec::new(),
303            options: Options::new(),
304        }
305    }
306
307    /// Construct a new `Config` with the given pattern and [`Options`].
308    ///
309    /// # Examples
310    ///
311    /// ```
312    /// use spinoso_regexp::{Config, Options};
313    ///
314    /// let config = Config::with_pattern_and_options(
315    ///     b"Artichoke( Ruby)?".to_vec(),
316    ///     Options::with_ignore_case(),
317    /// );
318    /// assert_eq!(config.pattern(), b"Artichoke( Ruby)?");
319    /// assert_eq!(config.options().as_display_modifier(), "i");
320    /// ```
321    #[must_use]
322    pub const fn with_pattern_and_options(pattern: Vec<u8>, options: Options) -> Self {
323        Self { pattern, options }
324    }
325
326    /// Extracts a slice containing the entire pattern.
327    ///
328    /// # Examples
329    ///
330    /// ```
331    /// use spinoso_regexp::{Config, Options};
332    ///
333    /// let config = Config::with_pattern_and_options(
334    ///     b"Artichoke( Ruby)?".to_vec(),
335    ///     Options::with_ignore_case(),
336    /// );
337    /// assert_eq!(config.pattern(), b"Artichoke( Ruby)?");
338    /// ```
339    #[must_use]
340    pub fn pattern(&self) -> &[u8] {
341        self.pattern.as_slice()
342    }
343
344    /// Return a copy of the underlying [`Options`].
345    ///
346    /// # Examples
347    ///
348    /// ```
349    /// use spinoso_regexp::{Config, Options};
350    ///
351    /// let config = Config::with_pattern_and_options(
352    ///     b"Artichoke( Ruby)?".to_vec(),
353    ///     Options::with_ignore_case(),
354    /// );
355    /// assert_eq!(config.options().as_display_modifier(), "i");
356    /// ```
357    #[must_use]
358    pub const fn options(&self) -> Options {
359        self.options
360    }
361}
362
/// Global variable name for the nth capture group from a `Regexp` match.
///
/// Ruby tags captures from the last `Regexp` match with global variables of the
/// form `$1`, `$2`, `$3`, etc. This function accepts [`NonZeroUsize`] because
/// `$0` is not a valid `Regexp` capture group name in Ruby (`$0` refers to the
/// program name).
///
/// This function may return either a `&'static str` or an owned [`String`] for
/// a given capture group name: groups `1` through `20` are served from static
/// strings without allocating, and every larger group is formatted into a new
/// `String`. This function differs from [`nth_match_group_bytes`] by returning
/// `Cow<'static, str>`.
///
/// # Examples
///
/// ```
/// use core::num::NonZeroUsize;
///
/// use spinoso_regexp::nth_match_group;
///
/// # fn example() -> Option<()> {
/// let group = NonZeroUsize::new(1)?;
/// let global_name = nth_match_group(group);
/// assert_eq!(&*global_name, "$1");
///
/// let group = NonZeroUsize::new(27)?;
/// let global_name = nth_match_group(group);
/// assert_eq!(&*global_name, "$27");
/// # Some(())
/// # }
/// # example().unwrap();
/// ```
#[must_use]
pub fn nth_match_group(group: NonZeroUsize) -> Cow<'static, str> {
    // Names of the first twenty capture groups; index `n - 1` holds `$n`.
    const STATIC_NAMES: [&str; 20] = [
        "$1", "$2", "$3", "$4", "$5", "$6", "$7", "$8", "$9", "$10", "$11", "$12", "$13", "$14",
        "$15", "$16", "$17", "$18", "$19", "$20",
    ];

    // `group` is non-zero, so converting it to a zero-based index cannot
    // underflow.
    if let Some(&name) = STATIC_NAMES.get(group.get() - 1) {
        return Cow::Borrowed(name);
    }

    let mut buf = String::new();
    // Suppress formatting errors because this function is infallible.
    //
    // In practice `write!` will never error because the `fmt::Write` impl for
    // `String` never returns an error.
    let _ignored = write!(&mut buf, "${group}");
    Cow::Owned(buf)
}
427
428/// Global variable name for the nth capture group from a `Regexp` match.
429///
430/// Ruby tags captures from the last `Regexp` match with global variables of the
431/// form `$1`, `$2`, `$3`, etc. This function accepts [`NonZeroUsize`] because
432/// `$0` is not a valid `Regexp` capture group name in Ruby (`$0` refers to the
433/// program name).
434///
435/// This function may return either a `&'static [u8]` or an owned [`Vec<u8>`]
436/// for a given capture group name.  This function differs from
437/// [`nth_match_group`] by returning `Cow<'static, [u8]>`.
438///
439/// # Examples
440///
441/// ```
442/// use core::num::NonZeroUsize;
443///
444/// use spinoso_regexp::nth_match_group_bytes;
445///
446/// # fn example() -> Option<()> {
447/// let group = NonZeroUsize::new(1)?;
448/// let global_name = nth_match_group_bytes(group);
449/// assert_eq!(&*global_name, b"$1");
450///
451/// let group = NonZeroUsize::new(27)?;
452/// let global_name = nth_match_group_bytes(group);
453/// assert_eq!(&*global_name, b"$27");
454/// # None
455/// # }
456/// ```
457///
458/// [`Vec<u8>`]: std::vec::Vec
459#[must_use]
460pub fn nth_match_group_bytes(group: NonZeroUsize) -> Cow<'static, [u8]> {
461    match nth_match_group(group) {
462        Cow::Borrowed(s) => Cow::Borrowed(s.as_bytes()),
463        Cow::Owned(s) => Cow::Owned(s.into_bytes()),
464    }
465}
466
#[cfg(test)]
mod tests {
    use core::num::NonZeroUsize;
    use core::ops::RangeInclusive;
    use std::borrow::Cow;

    use super::{nth_match_group, nth_match_group_bytes};

    /// Yield every value in `range` as a `NonZeroUsize`, panicking on zero.
    fn groups(range: RangeInclusive<usize>) -> impl Iterator<Item = NonZeroUsize> {
        range.map(|n| NonZeroUsize::new(n).unwrap())
    }

    // Every name is a `$` sigil followed by the decimal group number.
    #[test]
    fn match_group_symbol() {
        for group in groups(1..=1024) {
            let name = nth_match_group(group);
            let expected_digits = group.to_string();
            assert!(name.len() > 1);
            let (sigil, digits) = name.split_at(1);
            assert_eq!(sigil, "$");
            assert_eq!(digits, expected_digits);
        }
    }

    // Groups 1 through 20 borrow static strings; larger groups allocate.
    #[test]
    fn some_globals_are_static_slices() {
        for group in groups(1..=20) {
            let name = nth_match_group(group);
            assert!(matches!(name, Cow::Borrowed(_)));
        }
        for group in groups(21..=1024) {
            let name = nth_match_group(group);
            assert!(matches!(name, Cow::Owned(_)));
        }
    }

    // The `str` and byte variants agree byte-for-byte.
    #[test]
    fn nth_group_matches_nth_group_bytes() {
        for group in groups(1..=1024) {
            let as_str = nth_match_group(group);
            let as_bytes = nth_match_group_bytes(group);
            assert_eq!(as_str.as_bytes(), &*as_bytes);
        }
    }
}