unicode_segmentation/
grapheme.rs

1// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2// file at the top-level directory of this distribution and at
3// http://rust-lang.org/COPYRIGHT.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
10
11use core::cmp;
12
13use crate::tables::grapheme::GraphemeCat;
14
/// External iterator for grapheme clusters and byte offsets.
///
/// Each item is a `(byte_offset, cluster)` pair, where `byte_offset` is the
/// position of the cluster's first byte within the original string.
///
/// This struct is created by the [`grapheme_indices`] method on the [`UnicodeSegmentation`]
/// trait. See its documentation for more.
///
/// [`grapheme_indices`]: trait.UnicodeSegmentation.html#tymethod.grapheme_indices
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Debug, Clone)]
pub struct GraphemeIndices<'a> {
    // Address of the first byte of the original string, stored as an integer.
    // Subtracting it from a yielded cluster's address gives the byte offset.
    start_offset: usize,
    // The underlying cluster iterator that does the actual segmentation.
    iter: Graphemes<'a>,
}
27
28impl<'a> GraphemeIndices<'a> {
29    #[inline]
30    /// View the underlying data (the part yet to be iterated) as a slice of the original string.
31    ///
32    /// ```rust
33    /// # use unicode_segmentation::UnicodeSegmentation;
34    /// let mut iter = "abc".grapheme_indices(true);
35    /// assert_eq!(iter.as_str(), "abc");
36    /// iter.next();
37    /// assert_eq!(iter.as_str(), "bc");
38    /// iter.next();
39    /// iter.next();
40    /// assert_eq!(iter.as_str(), "");
41    /// ```
42    pub fn as_str(&self) -> &'a str {
43        self.iter.as_str()
44    }
45}
46
47impl<'a> Iterator for GraphemeIndices<'a> {
48    type Item = (usize, &'a str);
49
50    #[inline]
51    fn next(&mut self) -> Option<(usize, &'a str)> {
52        self.iter
53            .next()
54            .map(|s| (s.as_ptr() as usize - self.start_offset, s))
55    }
56
57    #[inline]
58    fn size_hint(&self) -> (usize, Option<usize>) {
59        self.iter.size_hint()
60    }
61}
62
63impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
64    #[inline]
65    fn next_back(&mut self) -> Option<(usize, &'a str)> {
66        self.iter
67            .next_back()
68            .map(|s| (s.as_ptr() as usize - self.start_offset, s))
69    }
70}
71
/// External iterator for a string's
/// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries).
///
/// The iterator is double-ended: it keeps one cursor for the front and one for
/// the back, and is exhausted when the two cursors meet.
///
/// This struct is created by the [`graphemes`] method on the [`UnicodeSegmentation`] trait. See its
/// documentation for more.
///
/// [`graphemes`]: trait.UnicodeSegmentation.html#tymethod.graphemes
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone, Debug)]
pub struct Graphemes<'a> {
    // The complete string being segmented; yielded clusters are subslices of it.
    string: &'a str,
    // Front cursor: its offset is the start of the not-yet-yielded region.
    cursor: GraphemeCursor,
    // Back cursor: its offset is the end of the not-yet-yielded region.
    cursor_back: GraphemeCursor,
}
86
87impl<'a> Graphemes<'a> {
88    #[inline]
89    /// View the underlying data (the part yet to be iterated) as a slice of the original string.
90    ///
91    /// ```rust
92    /// # use unicode_segmentation::UnicodeSegmentation;
93    /// let mut iter = "abc".graphemes(true);
94    /// assert_eq!(iter.as_str(), "abc");
95    /// iter.next();
96    /// assert_eq!(iter.as_str(), "bc");
97    /// iter.next();
98    /// iter.next();
99    /// assert_eq!(iter.as_str(), "");
100    /// ```
101    pub fn as_str(&self) -> &'a str {
102        &self.string[self.cursor.cur_cursor()..self.cursor_back.cur_cursor()]
103    }
104}
105
106impl<'a> Iterator for Graphemes<'a> {
107    type Item = &'a str;
108
109    #[inline]
110    fn size_hint(&self) -> (usize, Option<usize>) {
111        let slen = self.cursor_back.cur_cursor() - self.cursor.cur_cursor();
112        (cmp::min(slen, 1), Some(slen))
113    }
114
115    #[inline]
116    fn next(&mut self) -> Option<&'a str> {
117        let start = self.cursor.cur_cursor();
118        if start == self.cursor_back.cur_cursor() {
119            return None;
120        }
121        let next = self.cursor.next_boundary(self.string, 0).unwrap().unwrap();
122        Some(&self.string[start..next])
123    }
124}
125
126impl<'a> DoubleEndedIterator for Graphemes<'a> {
127    #[inline]
128    fn next_back(&mut self) -> Option<&'a str> {
129        let end = self.cursor_back.cur_cursor();
130        if end == self.cursor.cur_cursor() {
131            return None;
132        }
133        let prev = self
134            .cursor_back
135            .prev_boundary(self.string, 0)
136            .unwrap()
137            .unwrap();
138        Some(&self.string[prev..end])
139    }
140}
141
142#[inline]
143pub fn new_graphemes(s: &str, is_extended: bool) -> Graphemes<'_> {
144    let len = s.len();
145    Graphemes {
146        string: s,
147        cursor: GraphemeCursor::new(0, len, is_extended),
148        cursor_back: GraphemeCursor::new(len, len, is_extended),
149    }
150}
151
152#[inline]
153pub fn new_grapheme_indices(s: &str, is_extended: bool) -> GraphemeIndices<'_> {
154    GraphemeIndices {
155        start_offset: s.as_ptr() as usize,
156        iter: new_graphemes(s, is_extended),
157    }
158}
159
// TODO: maybe unify with PairResult?
/// An enum describing information about a potential boundary.
#[derive(PartialEq, Eq, Clone, Debug)]
enum GraphemeState {
    /// No information is known.
    Unknown,
    /// It is known to not be a boundary.
    NotBreak,
    /// It is known to be a boundary.
    Break,
    /// The codepoint after it has Indic_Conjunct_Break=Consonant, so in
    /// extended mode it is *not* a boundary iff it is preceded by another
    /// InCB=Consonant followed by a sequence consisting of one or more
    /// InCB=Linker and zero or more InCB=Extend (in any order); otherwise
    /// (and always in legacy mode) it is a boundary. (GB9c)
    InCbConsonant,
    /// The codepoint after is a Regional Indicator Symbol, so a boundary iff
    /// it is preceded by an even number of RIS codepoints. (GB12, GB13)
    Regional,
    /// The codepoint after is Extended_Pictographic,
    /// so whether it's a boundary depends on pre-context according to GB11.
    Emoji,
}
182
/// Cursor-based segmenter for grapheme clusters.
///
/// This allows working with ropes and other datastructures where the string is not contiguous or
/// fully known at initialization time.
///
/// The cursor never stores the text itself: every query is handed a chunk of
/// the string plus that chunk's starting offset, and the cursor caches what it
/// has learned about the text around its current position.
#[derive(Clone, Debug)]
pub struct GraphemeCursor {
    /// Current cursor position.
    offset: usize,
    /// Total length of the string.
    len: usize,
    /// A config flag indicating whether this cursor computes legacy or extended
    /// grapheme cluster boundaries (enables GB9a, GB9b and GB9c if set).
    is_extended: bool,
    /// Information about the potential boundary at `offset`
    state: GraphemeState,
    /// Category of codepoint immediately preceding cursor, if known.
    cat_before: Option<GraphemeCat>,
    /// Category of codepoint immediately after cursor, if known.
    cat_after: Option<GraphemeCat>,
    /// If set, at least one more codepoint immediately preceding this offset
    /// is needed to resolve whether there's a boundary at `offset`.
    pre_context_offset: Option<usize>,
    /// The number of `InCB=Linker` codepoints preceding `offset`
    /// (potentially intermingled with `InCB=Extend`).
    incb_linker_count: Option<usize>,
    /// The number of RIS codepoints preceding `offset`. If `pre_context_offset`
    /// is set, then counts the number of RIS between that and `offset`, otherwise
    /// is an accurate count relative to the string.
    ris_count: Option<usize>,
    /// Set if a call to `prev_boundary` or `next_boundary` was suspended due
    /// to needing more input.
    resuming: bool,
    /// Cached grapheme category and associated scalar value range, stored as
    /// `(range_start, range_end, category)` with both ends inclusive.
    grapheme_cat_cache: (u32, u32, GraphemeCat),
}
218
/// An error return indicating that not enough content was available in the
/// provided chunk to satisfy the query, and that more content must be provided.
///
/// Except for `InvalidOffset`, these values are requests for more input
/// rather than failures: supply what is asked for and retry the same call.
#[derive(PartialEq, Eq, Debug)]
pub enum GraphemeIncomplete {
    /// More pre-context is needed. The caller should call `provide_context`
    /// with a chunk ending at the offset given, then retry the query. This
    /// will only be returned if the `chunk_start` parameter is nonzero.
    PreContext(usize),

    /// When requesting `prev_boundary`, the cursor is moving past the beginning
    /// of the current chunk, so the chunk before that is requested. This will
    /// only be returned if the `chunk_start` parameter is nonzero.
    PrevChunk,

    /// When requesting `next_boundary`, the cursor is moving past the end of the
    /// current chunk, so the chunk after that is requested. This will only be
    /// returned if the chunk ends before the `len` parameter provided on
    /// creation of the cursor.
    NextChunk, // requests the chunk following the one given

    /// An error returned when the chunk given does not contain the cursor position.
    InvalidOffset,
}
242
/// An enum describing the result from lookup of a pair of categories
/// (the category before and the category after a potential boundary).
#[derive(PartialEq, Eq)]
enum PairResult {
    /// definitely not a break
    NotBreak,
    /// definitely a break
    Break,
    /// a break iff not in extended mode
    Extended,
    /// a break unless in extended mode and preceded by
    /// a sequence of 0 or more InCB=Extend and one or more
    /// InCB = Linker (in any order),
    /// preceded by another InCB=Consonant
    InCbConsonant,
    /// a break iff preceded by an even number of RIS
    Regional,
    /// a break unless the ZWJ before it is itself preceded by an
    /// Extended_Pictographic codepoint followed by zero or more Extend
    Emoji,
}
262
263#[inline]
264fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult {
265    use self::PairResult::*;
266    use crate::tables::grapheme::GraphemeCat::*;
267    match (before, after) {
268        (GC_CR, GC_LF) => NotBreak,                                 // GB3
269        (GC_Control | GC_CR | GC_LF, _) => Break,                   // GB4
270        (_, GC_Control | GC_CR | GC_LF) => Break,                   // GB5
271        (GC_L, GC_L | GC_V | GC_LV | GC_LVT) => NotBreak,           // GB6
272        (GC_LV | GC_V, GC_V | GC_T) => NotBreak,                    // GB7
273        (GC_LVT | GC_T, GC_T) => NotBreak,                          // GB8
274        (_, GC_Extend | GC_ZWJ) => NotBreak,                        // GB9
275        (_, GC_SpacingMark) => Extended,                            // GB9a
276        (GC_Prepend, _) => Extended,                                // GB9b
277        (_, GC_InCB_Consonant) => InCbConsonant,                    // GB9c
278        (GC_ZWJ, GC_Extended_Pictographic) => Emoji,                // GB11
279        (GC_Regional_Indicator, GC_Regional_Indicator) => Regional, // GB12, GB13
280        (_, _) => Break,                                            // GB999
281    }
282}
283
284impl GraphemeCursor {
285    /// Create a new cursor. The string and initial offset are given at creation
286    /// time, but the contents of the string are not. The `is_extended` parameter
287    /// controls whether extended grapheme clusters are selected.
288    ///
289    /// The `offset` parameter must be on a codepoint boundary.
290    ///
291    /// ```rust
292    /// # use unicode_segmentation::GraphemeCursor;
293    /// let s = "हिन्दी";
294    /// let mut legacy = GraphemeCursor::new(0, s.len(), false);
295    /// assert_eq!(legacy.next_boundary(s, 0), Ok(Some("ह".len())));
296    /// let mut extended = GraphemeCursor::new(0, s.len(), true);
297    /// assert_eq!(extended.next_boundary(s, 0), Ok(Some("हि".len())));
298    /// ```
299    pub fn new(offset: usize, len: usize, is_extended: bool) -> GraphemeCursor {
300        let state = if offset == 0 || offset == len {
301            GraphemeState::Break
302        } else {
303            GraphemeState::Unknown
304        };
305        GraphemeCursor {
306            offset,
307            len,
308            state,
309            is_extended,
310            cat_before: None,
311            cat_after: None,
312            pre_context_offset: None,
313            incb_linker_count: None,
314            ris_count: None,
315            resuming: false,
316            grapheme_cat_cache: (0, 0, GraphemeCat::GC_Control),
317        }
318    }
319
320    fn grapheme_category(&mut self, ch: char) -> GraphemeCat {
321        use crate::tables::grapheme as gr;
322        use crate::tables::grapheme::GraphemeCat::*;
323
324        if ch <= '\u{7e}' {
325            // Special-case optimization for ascii, except U+007F.  This
326            // improves performance even for many primarily non-ascii texts,
327            // due to use of punctuation and white space characters from the
328            // ascii range.
329            if ch >= '\u{20}' {
330                GC_Any
331            } else if ch == '\n' {
332                GC_LF
333            } else if ch == '\r' {
334                GC_CR
335            } else {
336                GC_Control
337            }
338        } else {
339            // If this char isn't within the cached range, update the cache to the
340            // range that includes it.
341            if (ch as u32) < self.grapheme_cat_cache.0 || (ch as u32) > self.grapheme_cat_cache.1 {
342                self.grapheme_cat_cache = gr::grapheme_category(ch);
343            }
344            self.grapheme_cat_cache.2
345        }
346    }
347
348    // Not sure I'm gonna keep this, the advantage over new() seems thin.
349
350    /// Set the cursor to a new location in the same string.
351    ///
352    /// ```rust
353    /// # use unicode_segmentation::GraphemeCursor;
354    /// let s = "abcd";
355    /// let mut cursor = GraphemeCursor::new(0, s.len(), false);
356    /// assert_eq!(cursor.cur_cursor(), 0);
357    /// cursor.set_cursor(2);
358    /// assert_eq!(cursor.cur_cursor(), 2);
359    /// ```
360    pub fn set_cursor(&mut self, offset: usize) {
361        if offset != self.offset {
362            self.offset = offset;
363            self.state = if offset == 0 || offset == self.len {
364                GraphemeState::Break
365            } else {
366                GraphemeState::Unknown
367            };
368            // reset state derived from text around cursor
369            self.cat_before = None;
370            self.cat_after = None;
371            self.incb_linker_count = None;
372            self.ris_count = None;
373        }
374    }
375
376    #[inline]
377    /// The current offset of the cursor. Equal to the last value provided to
378    /// `new()` or `set_cursor()`, or returned from `next_boundary()` or
379    /// `prev_boundary()`.
380    ///
381    /// ```rust
382    /// # use unicode_segmentation::GraphemeCursor;
383    /// // Two flags (🇷🇸🇮🇴), each flag is two RIS codepoints, each RIS is 4 bytes.
384    /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
385    /// let mut cursor = GraphemeCursor::new(4, flags.len(), false);
386    /// assert_eq!(cursor.cur_cursor(), 4);
387    /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
388    /// assert_eq!(cursor.cur_cursor(), 8);
389    /// ```
390    pub fn cur_cursor(&self) -> usize {
391        self.offset
392    }
393
394    /// Provide additional pre-context when it is needed to decide a boundary.
395    /// The end of the chunk must coincide with the value given in the
396    /// `GraphemeIncomplete::PreContext` request.
397    ///
398    /// ```rust
399    /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
400    /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
401    /// let mut cursor = GraphemeCursor::new(8, flags.len(), false);
402    /// // Not enough pre-context to decide if there's a boundary between the two flags.
403    /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(8)));
404    /// // Provide one more Regional Indicator Symbol of pre-context
405    /// cursor.provide_context(&flags[4..8], 4);
406    /// // Still not enough context to decide.
407    /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(4)));
408    /// // Provide additional requested context.
409    /// cursor.provide_context(&flags[0..4], 0);
410    /// // That's enough to decide (it always is when context goes to the start of the string)
411    /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Ok(true));
412    /// ```
413    pub fn provide_context(&mut self, chunk: &str, chunk_start: usize) {
414        use crate::tables::grapheme as gr;
415        assert!(chunk_start.saturating_add(chunk.len()) == self.pre_context_offset.unwrap());
416        self.pre_context_offset = None;
417        if self.is_extended && chunk_start + chunk.len() == self.offset {
418            let ch = chunk.chars().next_back().unwrap();
419            if self.grapheme_category(ch) == gr::GC_Prepend {
420                self.decide(false); // GB9b
421                return;
422            }
423        }
424        match self.state {
425            GraphemeState::InCbConsonant => self.handle_incb_consonant(chunk, chunk_start),
426            GraphemeState::Regional => self.handle_regional(chunk, chunk_start),
427            GraphemeState::Emoji => self.handle_emoji(chunk, chunk_start),
428            _ => {
429                if self.cat_before.is_none() && self.offset == chunk.len() + chunk_start {
430                    let ch = chunk.chars().next_back().unwrap();
431                    self.cat_before = Some(self.grapheme_category(ch));
432                }
433            }
434        }
435    }
436
437    #[inline]
438    fn decide(&mut self, is_break: bool) {
439        self.state = if is_break {
440            GraphemeState::Break
441        } else {
442            GraphemeState::NotBreak
443        };
444    }
445
446    #[inline]
447    fn decision(&mut self, is_break: bool) -> Result<bool, GraphemeIncomplete> {
448        self.decide(is_break);
449        Ok(is_break)
450    }
451
452    #[inline]
453    fn is_boundary_result(&self) -> Result<bool, GraphemeIncomplete> {
454        if self.state == GraphemeState::Break {
455            Ok(true)
456        } else if self.state == GraphemeState::NotBreak {
457            Ok(false)
458        } else if let Some(pre_context_offset) = self.pre_context_offset {
459            Err(GraphemeIncomplete::PreContext(pre_context_offset))
460        } else {
461            unreachable!("inconsistent state");
462        }
463    }
464
465    /// For handling rule GB9c:
466    ///
467    /// There's an `InCB=Consonant` after this, and we need to look back
468    /// to verify whether there should be a break.
469    ///
470    /// Seek backward to find an `InCB=Linker` preceded by an `InCB=Consonsnt`
471    /// (potentially separated by some number of `InCB=Linker` or `InCB=Extend`).
472    /// If we find the consonant in question, then there's no break; if we find a consonant
473    /// with no linker, or a non-linker non-extend non-consonant, or the start of text, there's a break;
474    /// otherwise we need more context
475    #[inline]
476    fn handle_incb_consonant(&mut self, chunk: &str, chunk_start: usize) {
477        use crate::tables::{self, grapheme as gr};
478
479        // GB9c only applies to extended grapheme clusters
480        if !self.is_extended {
481            self.decide(true);
482            return;
483        }
484
485        let mut incb_linker_count = self.incb_linker_count.unwrap_or(0);
486
487        for ch in chunk.chars().rev() {
488            if tables::is_incb_linker(ch) {
489                // We found an InCB linker
490                incb_linker_count += 1;
491                self.incb_linker_count = Some(incb_linker_count);
492            } else if tables::derived_property::InCB_Extend(ch) {
493                // We ignore InCB extends, continue
494            } else {
495                // Prev character is neither linker nor extend, break suppressed iff it's InCB=Consonant
496                let result = !(self.incb_linker_count.unwrap_or(0) > 0
497                    && self.grapheme_category(ch) == gr::GC_InCB_Consonant);
498                self.decide(result);
499                return;
500            }
501        }
502
503        if chunk_start == 0 {
504            // Start of text and we still haven't found a consonant, so break
505            self.decide(true);
506        } else {
507            // We need more context
508            self.pre_context_offset = Some(chunk_start);
509            self.state = GraphemeState::InCbConsonant;
510        }
511    }
512
513    #[inline]
514    fn handle_regional(&mut self, chunk: &str, chunk_start: usize) {
515        use crate::tables::grapheme as gr;
516        let mut ris_count = self.ris_count.unwrap_or(0);
517        for ch in chunk.chars().rev() {
518            if self.grapheme_category(ch) != gr::GC_Regional_Indicator {
519                self.ris_count = Some(ris_count);
520                self.decide((ris_count % 2) == 0);
521                return;
522            }
523            ris_count += 1;
524        }
525        self.ris_count = Some(ris_count);
526        if chunk_start == 0 {
527            self.decide((ris_count % 2) == 0);
528        } else {
529            self.pre_context_offset = Some(chunk_start);
530            self.state = GraphemeState::Regional;
531        }
532    }
533
534    #[inline]
535    fn handle_emoji(&mut self, chunk: &str, chunk_start: usize) {
536        use crate::tables::grapheme as gr;
537        let mut iter = chunk.chars().rev();
538        if let Some(ch) = iter.next() {
539            if self.grapheme_category(ch) != gr::GC_ZWJ {
540                self.decide(true);
541                return;
542            }
543        }
544        for ch in iter {
545            match self.grapheme_category(ch) {
546                gr::GC_Extend => (),
547                gr::GC_Extended_Pictographic => {
548                    self.decide(false);
549                    return;
550                }
551                _ => {
552                    self.decide(true);
553                    return;
554                }
555            }
556        }
557        if chunk_start == 0 {
558            self.decide(true);
559        } else {
560            self.pre_context_offset = Some(chunk_start);
561            self.state = GraphemeState::Emoji;
562        }
563    }
564
    #[inline]
    /// Determine whether the current cursor location is a grapheme cluster boundary.
    /// Only a part of the string need be supplied. If `chunk_start` is nonzero or
    /// the length of `chunk` is not equal to `len` on creation, then this method
    /// may return `GraphemeIncomplete::PreContext`. The caller should then
    /// call `provide_context` with the requested chunk, then retry calling this
    /// method.
    ///
    /// For partial chunks, if the cursor is not at the beginning or end of the
    /// string, the chunk should contain at least the codepoint following the cursor.
    /// If the string is nonempty, the chunk must be nonempty.
    ///
    /// All calls should have consistent chunk contents (ie, if a chunk provides
    /// content for a given slice, all further chunks covering that slice must have
    /// the same content for it).
    ///
    /// # Errors
    ///
    /// * `GraphemeIncomplete::InvalidOffset` if the cursor lies outside the
    ///   chunk. A cursor sitting exactly at the end of the chunk is accepted
    ///   when the category of the following codepoint is already known.
    /// * `GraphemeIncomplete::PreContext(offset)` if text ending at `offset`
    ///   must be supplied through `provide_context` before the answer is known.
    ///
    /// ```rust
    /// # use unicode_segmentation::GraphemeCursor;
    /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
    /// let mut cursor = GraphemeCursor::new(8, flags.len(), false);
    /// assert_eq!(cursor.is_boundary(flags, 0), Ok(true));
    /// cursor.set_cursor(12);
    /// assert_eq!(cursor.is_boundary(flags, 0), Ok(false));
    /// ```
    pub fn is_boundary(
        &mut self,
        chunk: &str,
        chunk_start: usize,
    ) -> Result<bool, GraphemeIncomplete> {
        use crate::tables::grapheme as gr;
        // An answer already recorded for this offset is returned as-is.
        if self.state == GraphemeState::Break {
            return Ok(true);
        }
        if self.state == GraphemeState::NotBreak {
            return Ok(false);
        }
        // Reject a cursor outside the chunk. The one tolerated case is a cursor
        // exactly at the chunk's end whose following category is already cached.
        if (self.offset < chunk_start || self.offset >= chunk_start.saturating_add(chunk.len()))
            && (self.offset > chunk_start.saturating_add(chunk.len()) || self.cat_after.is_none())
        {
            return Err(GraphemeIncomplete::InvalidOffset);
        }
        // A previous request for pre-context has not been answered yet; repeat it.
        if let Some(pre_context_offset) = self.pre_context_offset {
            return Err(GraphemeIncomplete::PreContext(pre_context_offset));
        }
        let offset_in_chunk = self.offset.saturating_sub(chunk_start);
        if self.cat_after.is_none() {
            let ch = chunk[offset_in_chunk..].chars().next().unwrap();
            self.cat_after = Some(self.grapheme_category(ch));
        }
        if self.offset == chunk_start {
            // The chunk holds nothing before the cursor. For the three
            // context-dependent categories the matching state is recorded so
            // that `provide_context` knows which backward scan to run; for any
            // other category, context is only needed if `cat_before` is unknown.
            let mut need_pre_context = true;
            match self.cat_after.unwrap() {
                gr::GC_InCB_Consonant => self.state = GraphemeState::InCbConsonant,
                gr::GC_Regional_Indicator => self.state = GraphemeState::Regional,
                gr::GC_Extended_Pictographic => self.state = GraphemeState::Emoji,
                _ => need_pre_context = self.cat_before.is_none(),
            }
            if need_pre_context {
                self.pre_context_offset = Some(chunk_start);
                return Err(GraphemeIncomplete::PreContext(chunk_start));
            }
        }
        if self.cat_before.is_none() {
            let ch = chunk[..offset_in_chunk].chars().next_back().unwrap();
            self.cat_before = Some(self.grapheme_category(ch));
        }
        // Both neighbouring categories are known: apply the pair rules, and run
        // a backward scan over the text before the cursor where they require it.
        match check_pair(self.cat_before.unwrap(), self.cat_after.unwrap()) {
            PairResult::NotBreak => self.decision(false),
            PairResult::Break => self.decision(true),
            PairResult::Extended => {
                let is_extended = self.is_extended;
                self.decision(!is_extended)
            }
            PairResult::InCbConsonant => {
                self.handle_incb_consonant(&chunk[..offset_in_chunk], chunk_start);
                self.is_boundary_result()
            }
            PairResult::Regional => {
                // Reuse a count that is already known instead of rescanning.
                if let Some(ris_count) = self.ris_count {
                    return self.decision((ris_count % 2) == 0);
                }
                self.handle_regional(&chunk[..offset_in_chunk], chunk_start);
                self.is_boundary_result()
            }
            PairResult::Emoji => {
                self.handle_emoji(&chunk[..offset_in_chunk], chunk_start);
                self.is_boundary_result()
            }
        }
    }
655
    #[inline]
    /// Find the next boundary after the current cursor position. Only a part of
    /// the string need be supplied. If the chunk is incomplete, then this
    /// method might return `GraphemeIncomplete::PreContext` or
    /// `GraphemeIncomplete::NextChunk`. In the former case, the caller should
    /// call `provide_context` with the requested chunk, then retry. In the
    /// latter case, the caller should provide the chunk following the one
    /// given, then retry.
    ///
    /// Returns `Ok(None)` when the cursor is already at the end of the string.
    /// On success the cursor is left at the returned offset.
    ///
    /// See `is_boundary` for expectations on the provided chunk.
    ///
    /// ```rust
    /// # use unicode_segmentation::GraphemeCursor;
    /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
    /// let mut cursor = GraphemeCursor::new(4, flags.len(), false);
    /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
    /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(16)));
    /// assert_eq!(cursor.next_boundary(flags, 0), Ok(None));
    /// ```
    ///
    /// And an example that uses partial strings:
    ///
    /// ```rust
    /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
    /// let s = "abcd";
    /// let mut cursor = GraphemeCursor::new(0, s.len(), false);
    /// assert_eq!(cursor.next_boundary(&s[..2], 0), Ok(Some(1)));
    /// assert_eq!(cursor.next_boundary(&s[..2], 0), Err(GraphemeIncomplete::NextChunk));
    /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(2)));
    /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(3)));
    /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(4)));
    /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(None));
    /// ```
    pub fn next_boundary(
        &mut self,
        chunk: &str,
        chunk_start: usize,
    ) -> Result<Option<usize>, GraphemeIncomplete> {
        if self.offset == self.len {
            return Ok(None);
        }
        let mut iter = chunk[self.offset.saturating_sub(chunk_start)..].chars();
        // `ch` is the codepoint right after the cursor; without one in this
        // chunk, nothing can be done until the following chunk is supplied.
        let mut ch = match iter.next() {
            Some(ch) => ch,
            None => return Err(GraphemeIncomplete::NextChunk),
        };
        loop {
            if self.resuming {
                // Retrying after a suspension: the cursor already moved past the
                // previous codepoint, so only the following category may be missing.
                if self.cat_after.is_none() {
                    self.cat_after = Some(self.grapheme_category(ch));
                }
            } else {
                // Step over `ch`; the old "after" category becomes the new "before".
                self.offset = self.offset.saturating_add(ch.len_utf8());
                self.state = GraphemeState::Unknown;
                self.cat_before = self.cat_after.take();
                if self.cat_before.is_none() {
                    self.cat_before = Some(self.grapheme_category(ch));
                }
                // Keep the running linker count (GB9c) in step with the cursor:
                // linkers add to it, extends leave it alone, anything else resets it.
                if crate::tables::is_incb_linker(ch) {
                    self.incb_linker_count = Some(self.incb_linker_count.map_or(1, |c| c + 1));
                } else if !crate::tables::derived_property::InCB_Extend(ch) {
                    self.incb_linker_count = Some(0);
                }
                // Same for the RIS count (GB12/GB13). An unknown count stays
                // unknown; a non-RIS codepoint makes it a known zero.
                if self.cat_before.unwrap() == GraphemeCat::GC_Regional_Indicator {
                    self.ris_count = self.ris_count.map(|c| c + 1);
                } else {
                    self.ris_count = Some(0);
                }
                if let Some(next_ch) = iter.next() {
                    ch = next_ch;
                    self.cat_after = Some(self.grapheme_category(ch));
                } else if self.offset == self.len {
                    // End of the string is always a boundary.
                    self.decide(true);
                } else {
                    // The chunk ended mid-string: suspend until the next chunk arrives.
                    self.resuming = true;
                    return Err(GraphemeIncomplete::NextChunk);
                }
            }
            // `resuming` stays set if `is_boundary` bails out through `?`, so a
            // retry re-enters through the resume branch instead of stepping again.
            self.resuming = true;
            if self.is_boundary(chunk, chunk_start)? {
                self.resuming = false;
                return Ok(Some(self.offset));
            }
            self.resuming = false;
        }
    }
742
    /// Find the previous boundary before the current cursor position. Only a part
744    /// of the string need be supplied. If the chunk is incomplete, then this
745    /// method might return `GraphemeIncomplete::PreContext` or
746    /// `GraphemeIncomplete::PrevChunk`. In the former case, the caller should
747    /// call `provide_context` with the requested chunk, then retry. In the
748    /// latter case, the caller should provide the chunk preceding the one
749    /// given, then retry.
750    ///
751    /// See `is_boundary` for expectations on the provided chunk.
752    ///
753    /// ```rust
754    /// # use unicode_segmentation::GraphemeCursor;
755    /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
756    /// let mut cursor = GraphemeCursor::new(12, flags.len(), false);
757    /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(8)));
758    /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(0)));
759    /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(None));
760    /// ```
761    ///
762    /// And an example that uses partial strings (note the exact return is not
763    /// guaranteed, and may be `PrevChunk` or `PreContext` arbitrarily):
764    ///
765    /// ```rust
766    /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
767    /// let s = "abcd";
768    /// let mut cursor = GraphemeCursor::new(4, s.len(), false);
769    /// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Ok(Some(3)));
770    /// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Err(GraphemeIncomplete::PrevChunk));
771    /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(2)));
772    /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(1)));
773    /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(0)));
774    /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(None));
775    /// ```
776    pub fn prev_boundary(
777        &mut self,
778        chunk: &str,
779        chunk_start: usize,
780    ) -> Result<Option<usize>, GraphemeIncomplete> {
781        if self.offset == 0 {
782            return Ok(None);
783        }
784        if self.offset == chunk_start {
785            return Err(GraphemeIncomplete::PrevChunk);
786        }
787        let mut iter = chunk[..self.offset.saturating_sub(chunk_start)]
788            .chars()
789            .rev();
790        let mut ch = iter.next().unwrap();
791        loop {
792            if self.offset == chunk_start {
793                self.resuming = true;
794                return Err(GraphemeIncomplete::PrevChunk);
795            }
796            if self.resuming {
797                self.cat_before = Some(self.grapheme_category(ch));
798            } else {
799                self.offset -= ch.len_utf8();
800                self.cat_after = self.cat_before.take();
801                self.state = GraphemeState::Unknown;
802                if let Some(incb_linker_count) = self.incb_linker_count {
803                    self.ris_count = if incb_linker_count > 0 && crate::tables::is_incb_linker(ch) {
804                        Some(incb_linker_count - 1)
805                    } else if crate::tables::derived_property::InCB_Extend(ch) {
806                        Some(incb_linker_count)
807                    } else {
808                        None
809                    };
810                }
811                if let Some(ris_count) = self.ris_count {
812                    self.ris_count = if ris_count > 0 {
813                        Some(ris_count - 1)
814                    } else {
815                        None
816                    };
817                }
818                if let Some(prev_ch) = iter.next() {
819                    ch = prev_ch;
820                    self.cat_before = Some(self.grapheme_category(ch));
821                } else if self.offset == 0 {
822                    self.decide(true);
823                } else {
824                    self.resuming = true;
825                    self.cat_after = Some(self.grapheme_category(ch));
826                    return Err(GraphemeIncomplete::PrevChunk);
827                }
828            }
829            self.resuming = true;
830            if self.is_boundary(chunk, chunk_start)? {
831                self.resuming = false;
832                return Ok(Some(self.offset));
833            }
834            self.resuming = false;
835        }
836    }
837}
838
/// A cursor placed between regional indicators must ask for the text before
/// the chunk, and report a boundary once that context has been supplied.
#[test]
fn test_grapheme_cursor_ris_precontext() {
    // Six regional-indicator code points, four UTF-8 bytes each.
    let flags = "\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}";
    let (context, chunk) = flags.split_at(4);
    let mut cursor = GraphemeCursor::new(8, flags.len(), true);

    // The chunk starts at byte 4, so the first indicator is not visible yet.
    let without_context = cursor.is_boundary(chunk, 4);
    assert_eq!(without_context, Err(GraphemeIncomplete::PreContext(4)));

    cursor.provide_context(context, 0);
    let with_context = cursor.is_boundary(chunk, 4);
    assert_eq!(with_context, Ok(true));
}
850
/// When the cursor sits exactly at the start of the supplied chunk, the
/// preceding text is requested before a decision is made.
#[test]
fn test_grapheme_cursor_chunk_start_require_precontext() {
    let crlf = "\r\n";
    let (context, chunk) = crlf.split_at(1);
    let mut cursor = GraphemeCursor::new(1, crlf.len(), true);

    // Only "\n" is visible; the "\r" before it has to be provided first.
    let without_context = cursor.is_boundary(chunk, 1);
    assert_eq!(without_context, Err(GraphemeIncomplete::PreContext(1)));

    cursor.provide_context(context, 0);
    let with_context = cursor.is_boundary(chunk, 1);
    assert_eq!(with_context, Ok(false));
}
862
/// Stepping backwards from inside the second chunk first reports `PrevChunk`,
/// then yields offset 2 once the first chunk is supplied.
#[test]
fn test_grapheme_cursor_prev_boundary() {
    let text = "abcd";
    let (head, tail) = text.split_at(2);
    let mut cursor = GraphemeCursor::new(3, text.len(), true);

    let from_tail = cursor.prev_boundary(tail, 2);
    assert_eq!(from_tail, Err(GraphemeIncomplete::PrevChunk));

    let from_head = cursor.prev_boundary(head, 0);
    assert_eq!(from_head, Ok(Some(2)));
}
873
/// Stepping backwards from a cursor placed exactly at the chunk start reports
/// `PrevChunk`, then yields offset 1 once the first chunk is supplied.
#[test]
fn test_grapheme_cursor_prev_boundary_chunk_start() {
    let text = "abcd";
    let (head, tail) = text.split_at(2);
    let mut cursor = GraphemeCursor::new(2, text.len(), true);

    let from_tail = cursor.prev_boundary(tail, 2);
    assert_eq!(from_tail, Err(GraphemeIncomplete::PrevChunk));

    let from_head = cursor.prev_boundary(head, 0);
    assert_eq!(from_head, Ok(Some(1)));
}