spinoso_regexp/
debug.rs

1use core::iter::FusedIterator;
2
3use scolapasta_string_escape::InvalidUtf8ByteSequence;
4
/// Bookkeeping for the two `/` delimiters that surround a regexp literal.
///
/// Each delimiter owns one bit in `bits`. A set bit means the delimiter has
/// not been handed out yet; handing it out clears the bit so it is produced
/// at most once.
#[derive(Clone, Debug)]
struct Delimiters {
    bits: u8,
}

impl Default for Delimiters {
    /// Start with both the left and the right delimiter still pending.
    fn default() -> Self {
        Self::DEFAULT
    }
}

impl Delimiters {
    /// Flag for the opening `/`.
    const EMIT_LEFT_DELIMITER: Self = Self { bits: 0b0000_0001 };
    /// Flag for the closing `/`.
    const EMIT_RIGHT_DELIMITER: Self = Self { bits: 0b0000_0010 };

    /// Both delimiters pending.
    const DEFAULT: Self = Self {
        bits: Self::EMIT_LEFT_DELIMITER.bits | Self::EMIT_RIGHT_DELIMITER.bits,
    };

    /// Hand out a `/` if every bit of `flag` is still set, clearing those
    /// bits in the process. Returns `None` and leaves the state untouched
    /// otherwise.
    #[inline]
    fn take_pending(&mut self, flag: u8) -> Option<char> {
        if (self.bits & flag) != flag {
            return None;
        }
        self.bits &= !flag;
        Some('/')
    }

    /// Return the opening `/` the first time this is called and `None` on
    /// every later call.
    #[inline]
    fn emit_left_delimiter(&mut self) -> Option<char> {
        self.take_pending(Self::EMIT_LEFT_DELIMITER.bits)
    }

    /// Return the closing `/` the first time this is called and `None` on
    /// every later call.
    #[inline]
    fn emit_right_delimiter(&mut self) -> Option<char> {
        self.take_pending(Self::EMIT_RIGHT_DELIMITER.bits)
    }
}
44
45/// An iterator that yields a debug representation of a `Regexp` as a sequence
46/// of `char`s.
47///
48/// This struct is created by the `debug` method on the regexp implementations
49/// in this crate. See these functions' documentation for more.
50///
51/// This iterator can be used to implement Ruby's [`Regexp#inspect`].
52///
53/// # Examples
54///
55/// UTF-8 regexp patterns and options are formatted in a debug
56/// representation:
57///
58/// ```
59/// use spinoso_regexp::Debug;
60///
61/// let debug = Debug::new("crab 🦀 for Rust".as_bytes(), "mix", "");
62/// let s = debug.collect::<String>();
63/// assert_eq!(s, "/crab 🦀 for Rust/mix");
64/// ```
65///
66/// Binary content is hex escaped:
67///
68/// ```
69/// use spinoso_regexp::Debug;
70///
71/// let debug = Debug::new(b"\xFF\xFE", "", "");
72/// let s = debug.collect::<String>();
73/// assert_eq!(s, r"/\xFF\xFE/");
74/// ```
75///
76/// [`Regexp#inspect`]: https://ruby-doc.org/core-2.4.1/Regexp.html#method-i-inspect
77#[derive(Default, Debug, Clone)]
78#[must_use = "this `Debug` is an `Iterator`, which should be consumed if constructed"]
79pub struct Debug<'a> {
80    delimiters: Delimiters,
81    // When `Regexp`s are constructed with a `/.../` literal, `Regexp#source`
82    // refers to the literal characters contained within the `/` delimiters.
83    // For example, `/\t/.source.bytes` has byte sequence `[92, 116]`.
84    //
85    // When `Regexp`s are constructed with `Regexp::compile`, `Regexp#source`
86    // refers to the argument passed to `compile`. For example,
87    // `Regexp.compile("\t").source.bytes` has byte sequence `[9]`.
88    //
89    // `Regexp#inspect` prints `"/#{source}/"`.
90    source: &'a [u8],
91    non_standard_control_escapes: &'static [u8],
92    literal: InvalidUtf8ByteSequence,
93    options: &'static str,
94    encoding: &'static str,
95}
96
97impl<'a> Debug<'a> {
98    /// Construct a new `Debug` iterator with a regexp source, [options
99    /// modifiers], and [encoding modifiers].
100    ///
101    /// # Examples
102    ///
103    /// UTF-8 regexp patterns and options are formatted in a debug
104    /// representation:
105    ///
106    /// ```
107    /// use spinoso_regexp::Debug;
108    ///
109    /// let debug = Debug::new("crab 🦀 for Rust".as_bytes(), "mix", "");
110    /// let s = debug.collect::<String>();
111    /// assert_eq!(s, "/crab 🦀 for Rust/mix");
112    /// ```
113    ///
114    /// Binary content is hex escaped:
115    ///
116    /// ```
117    /// use spinoso_regexp::Debug;
118    ///
119    /// let debug = Debug::new(b"\xFF\xFE", "", "");
120    /// let s = debug.collect::<String>();
121    /// assert_eq!(s, r"/\xFF\xFE/");
122    /// ```
123    ///
124    /// [options modifiers]: crate::Options::as_display_modifier
125    /// [encoding modifiers]: crate::Encoding::as_modifier_str
126    pub fn new(source: &'a [u8], options: &'static str, encoding: &'static str) -> Self {
127        Self {
128            delimiters: Delimiters::DEFAULT,
129            source,
130            non_standard_control_escapes: &[],
131            literal: InvalidUtf8ByteSequence::new(),
132            options,
133            encoding,
134        }
135    }
136}
137
138impl Iterator for Debug<'_> {
139    type Item = char;
140
141    fn next(&mut self) -> Option<Self::Item> {
142        if let Some(prefix) = self.delimiters.emit_left_delimiter() {
143            return Some(prefix);
144        }
145        if let Some((&next, tail)) = self.non_standard_control_escapes.split_first() {
146            self.non_standard_control_escapes = tail;
147            return Some(next.into());
148        }
149        if let Some(literal) = self.literal.next() {
150            return Some(literal);
151        }
152        if !self.source.is_empty() {
153            let (ch, size) = bstr::decode_utf8(self.source);
154            // SAFETY: bstr guarantees that the size is within the bounds of the slice.
155            let (chunk, remainder) = unsafe { self.source.split_at_unchecked(size) };
156            self.source = remainder;
157
158            return match ch {
159                // '/' is the `Regexp` literal delimiter, so escape it.
160                Some('/') => {
161                    // While not an invalid byte, we rely on the documented
162                    // behavior of `InvalidUtf8ByteSequence` to always escape
163                    // any bytes given to it.
164                    self.literal = InvalidUtf8ByteSequence::with_byte(b'/');
165                    Some('\\')
166                }
167                Some('\x07') => {
168                    let (&next, tail) = br"\x07".split_first().unwrap();
169                    self.non_standard_control_escapes = tail;
170                    Some(next.into())
171                }
172                Some('\x08') => {
173                    let (&next, tail) = br"\x08".split_first().unwrap();
174                    self.non_standard_control_escapes = tail;
175                    Some(next.into())
176                }
177                Some('\x1B') => {
178                    let (&next, tail) = br"\x1B".split_first().unwrap();
179                    self.non_standard_control_escapes = tail;
180                    Some(next.into())
181                }
182                Some(ch @ ('"' | '\'' | '\\')) => Some(ch),
183                Some(ch) if ch.is_ascii() && posix_space::is_space(ch as u8) => Some(ch),
184                Some(ch) if ch.is_ascii() => {
185                    // While not an invalid byte, we rely on the documented
186                    // behavior of `InvalidUtf8ByteSequence` to always escape
187                    // any bytes given to it.
188                    self.literal = InvalidUtf8ByteSequence::with_byte(ch as u8);
189                    self.literal.next()
190                }
191                Some(ch) => Some(ch),
192                // Otherwise, we've gotten invalid UTF-8, which means this is not a
193                // printable char.
194                None => {
195                    // This conversion is safe to unwrap due to the documented
196                    // behavior of `bstr::decode_utf8` and `InvalidUtf8ByteSequence`
197                    // which indicate that `size` is always in the range of 0..=3.
198                    self.literal = InvalidUtf8ByteSequence::try_from(chunk).unwrap();
199                    // `size` is non-zero because `pattern` is non-empty.
200                    // `Literal`s created from > one byte are always non-empty.
201                    self.literal.next()
202                }
203            };
204        }
205        if let Some(suffix) = self.delimiters.emit_right_delimiter() {
206            return Some(suffix);
207        }
208        if let (Some(ch), size) = bstr::decode_utf8(self.options) {
209            self.options = &self.options[size..];
210            return Some(ch);
211        }
212        if let (Some(ch), size) = bstr::decode_utf8(self.encoding) {
213            self.encoding = &self.encoding[size..];
214            return Some(ch);
215        }
216        None
217    }
218}
219
220impl FusedIterator for Debug<'_> {}
221
#[cfg(test)]
mod tests {
    use bstr::ByteSlice;

    use super::Debug;

    /// Run the `Debug` iterator for the given inputs to completion and
    /// gather everything it yields into a `String`.
    fn inspect(source: &[u8], options: &'static str, encoding: &'static str) -> String {
        Debug::new(source, options, encoding).collect()
    }

    // Iterator + Collect

    #[test]
    fn iter_utf8_pattern_no_opt_no_enc() {
        // ```ruby
        // [2.6.6] > /Artichoke Ruby/
        // => /Artichoke Ruby/
        // ```
        assert_eq!(inspect(b"Artichoke Ruby", "", ""), "/Artichoke Ruby/");
    }

    #[test]
    fn iter_utf8_pattern_with_opts_no_enc() {
        // ```ruby
        // [2.6.6] > /Artichoke Ruby/i
        // => /Artichoke Ruby/i
        // ```
        assert_eq!(inspect(b"Artichoke Ruby", "i", ""), "/Artichoke Ruby/i");

        // ```ruby
        // [2.6.6] > /Artichoke Ruby/mix
        // => /Artichoke Ruby/mix
        // ```
        assert_eq!(inspect(b"Artichoke Ruby", "mix", ""), "/Artichoke Ruby/mix");
    }

    #[test]
    fn iter_utf8_pattern_no_opts_with_enc() {
        // ```ruby
        // [2.6.6] > /Artichoke Ruby/n
        // => /Artichoke Ruby/n
        // ```
        assert_eq!(inspect(b"Artichoke Ruby", "", "n"), "/Artichoke Ruby/n");
    }

    #[test]
    fn iter_utf8_pattern_with_opts_with_enc() {
        // ```ruby
        // [2.6.6] > /Artichoke Ruby/nix
        // => /Artichoke Ruby/ixn
        // ```
        assert_eq!(inspect(b"Artichoke Ruby", "ix", "n"), "/Artichoke Ruby/ixn");
    }

    #[test]
    fn iter_utf8_emoji_pattern_no_opt_no_enc() {
        // ```ruby
        // [2.6.6] > /crab 🦀 for Rust/
        // => /crab 🦀 for Rust/
        // ```
        assert_eq!(inspect("crab 🦀 for Rust".as_bytes(), "", ""), "/crab 🦀 for Rust/");
    }

    #[test]
    fn iter_utf8_emoji_pattern_with_opts_no_enc() {
        let source = "crab 🦀 for Rust".as_bytes();

        // ```ruby
        // [2.6.6] > /crab 🦀 for Rust/i
        // => /crab 🦀 for Rust/i
        // ```
        assert_eq!(inspect(source, "i", ""), "/crab 🦀 for Rust/i");

        // ```ruby
        // [2.6.6] > /crab 🦀 for Rust/mix
        // => /crab 🦀 for Rust/mix
        // ```
        assert_eq!(inspect(source, "mix", ""), "/crab 🦀 for Rust/mix");
    }

    #[test]
    fn iter_ascii_escaped_byte_pattern_literal_ascii_control() {
        // ```ruby
        // [3.1.2] > Regexp.compile((0..0x1F).to_a.map(&:chr).join).inspect.bytes
        // ```
        let pattern = (0x00..=0x1F).collect::<Vec<u8>>();
        let expected = [
            47, 92, 120, 48, 48, 92, 120, 48, 49, 92, 120, 48, 50, 92, 120, 48, 51, 92, 120, 48, 52, 92, 120, 48,
            53, 92, 120, 48, 54, 92, 120, 48, 55, 92, 120, 48, 56, 9, 10, 11, 12, 13, 92, 120, 48, 69, 92, 120,
            48, 70, 92, 120, 49, 48, 92, 120, 49, 49, 92, 120, 49, 50, 92, 120, 49, 51, 92, 120, 49, 52, 92, 120,
            49, 53, 92, 120, 49, 54, 92, 120, 49, 55, 92, 120, 49, 56, 92, 120, 49, 57, 92, 120, 49, 65, 92, 120,
            49, 66, 92, 120, 49, 67, 92, 120, 49, 68, 92, 120, 49, 69, 92, 120, 49, 70, 47_u8,
        ];
        let s = inspect(&pattern, "", "");
        assert_eq!(s.as_bytes().as_bstr(), expected.as_bstr());
    }

    #[test]
    fn iter_ascii_pattern_exhaustive() {
        // ```ruby
        // Regexp.compile((0..0x7F).to_a.reject {|b| "[](){}".include?(b.chr) }.map(&:chr).join).inspect.bytes
        // ```
        let pattern = (0x00..=0x7F).filter(|b| !b"[](){}".contains(b)).collect::<Vec<u8>>();
        let expected = [
            47, 92, 120, 48, 48, 92, 120, 48, 49, 92, 120, 48, 50, 92, 120, 48, 51, 92, 120, 48, 52, 92, 120, 48,
            53, 92, 120, 48, 54, 92, 120, 48, 55, 92, 120, 48, 56, 9, 10, 11, 12, 13, 92, 120, 48, 69, 92, 120,
            48, 70, 92, 120, 49, 48, 92, 120, 49, 49, 92, 120, 49, 50, 92, 120, 49, 51, 92, 120, 49, 52, 92, 120,
            49, 53, 92, 120, 49, 54, 92, 120, 49, 55, 92, 120, 49, 56, 92, 120, 49, 57, 92, 120, 49, 65, 92, 120,
            49, 66, 92, 120, 49, 67, 92, 120, 49, 68, 92, 120, 49, 69, 92, 120, 49, 70, 32, 33, 34, 35, 36, 37,
            38, 39, 42, 43, 44, 45, 46, 92, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
            64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88,
            89, 90, 92, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112,
            113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 124, 126, 92, 120, 55, 70, 47_u8,
        ];
        let s = inspect(&pattern, "", "");
        assert_eq!(s.as_bytes().as_bstr(), expected.as_bstr());
    }

    #[test]
    fn iter_ascii_pattern_escaped_exhaustive() {
        // ```ruby
        // Regexp.escape((0..0x7F).to_a.map(&:chr).join).bytes
        // Regexp.compile(Regexp.escape((0..0x7F).to_a.map(&:chr).join)).inspect.bytes
        // ```
        let pattern = &[
            0, 1, 2, 3, 4, 5, 6, 7, 8, 92, 116, 92, 110, 92, 118, 92, 102, 92, 114, 14, 15, 16, 17, 18, 19, 20, 21,
            22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 92, 32, 33, 34, 92, 35, 92, 36, 37, 38, 39, 92, 40, 92, 41, 92,
            42, 92, 43, 44, 92, 45, 92, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 92, 63,
            64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
            90, 92, 91, 92, 92, 92, 93, 92, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
            110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 92, 123, 92, 124, 92, 125, 126, 127_u8,
        ];
        let expected = [
            47, 92, 120, 48, 48, 92, 120, 48, 49, 92, 120, 48, 50, 92, 120, 48, 51, 92, 120, 48, 52, 92, 120, 48,
            53, 92, 120, 48, 54, 92, 120, 48, 55, 92, 120, 48, 56, 92, 116, 92, 110, 92, 118, 92, 102, 92, 114,
            92, 120, 48, 69, 92, 120, 48, 70, 92, 120, 49, 48, 92, 120, 49, 49, 92, 120, 49, 50, 92, 120, 49, 51,
            92, 120, 49, 52, 92, 120, 49, 53, 92, 120, 49, 54, 92, 120, 49, 55, 92, 120, 49, 56, 92, 120, 49, 57,
            92, 120, 49, 65, 92, 120, 49, 66, 92, 120, 49, 67, 92, 120, 49, 68, 92, 120, 49, 69, 92, 120, 49, 70,
            92, 32, 33, 34, 92, 35, 92, 36, 37, 38, 39, 92, 40, 92, 41, 92, 42, 92, 43, 44, 92, 45, 92, 46, 92,
            47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 92, 63, 64, 65, 66, 67, 68, 69, 70,
            71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 92, 91, 92, 92, 92,
            93, 92, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113,
            114, 115, 116, 117, 118, 119, 120, 121, 122, 92, 123, 92, 124, 92, 125, 126, 92, 120, 55, 70, 47_u8,
        ];
        let s = inspect(pattern, "", "");
        assert_eq!(s.as_bytes().as_bstr(), expected.as_bstr());
    }

    #[test]
    fn iter_ascii_escaped_byte_pattern_literal() {
        // ```ruby
        // [2.6.6] > /\t\v\f\n/
        // => /\t\v\f\n/
        // [2.6.6] > /\t\v\f\n/.source.bytes
        // => [92, 116, 92, 118, 92, 102, 92, 110]
        // ```
        let pattern = [92, 116, 92, 118, 92, 102, 92, 110];
        assert_eq!(inspect(&pattern, "", ""), r"/\t\v\f\n/");

        // ```ruby
        // [2.6.6] > /\t\v\f\n/i
        // => /\t\v\f\n/i
        // ```
        assert_eq!(inspect(br"\t\v\f\n", "i", ""), r"/\t\v\f\n/i");

        // ```ruby
        // [2.6.6] > /\t\v\f\n/mix
        // => /\t\v\f\n/mix
        // ```
        assert_eq!(inspect(br"\t\v\f\n", "mix", ""), r"/\t\v\f\n/mix");

        // ```ruby
        // [2.6.6] > /\t\v\f\n/n
        // => /\t\v\f\n/n
        // ```
        assert_eq!(inspect(br"\t\v\f\n", "", "n"), r"/\t\v\f\n/n");

        // ```ruby
        // [2.6.6] > /\t\v\f\n/nix
        // => /\t\v\f\n/ixn
        // ```
        assert_eq!(inspect(br"\t\v\f\n", "ix", "n"), r"/\t\v\f\n/ixn");
    }

    #[test]
    fn iter_ascii_escaped_byte_pattern_compiled() {
        // The pattern is a literal tab followed by a double quote; the tab
        // is printed as-is by `inspect`.
        //
        // ```ruby
        // [2.6.6] > Regexp.compile("\t\"").source.bytes
        // => [9, 34]
        // ```
        let pattern = [9, 34];
        assert_eq!(inspect(&pattern, "", ""), "/\t\"/");
    }

    #[test]
    fn iter_invalid_utf8_pattern() {
        // ```ruby
        // [2.6.6] > Regexp.compile("\xFF\xFE".force_encoding(Encoding::BINARY))
        // => /\xFF\xFE/
        // ```
        assert_eq!(inspect(b"\xFF\xFE", "", ""), r"/\xFF\xFE/");
    }
}