unicode_segmentation/grapheme.rs
1// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2// file at the top-level directory of this distribution and at
3// http://rust-lang.org/COPYRIGHT.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
10
11use core::cmp;
12
13use crate::tables::grapheme::GraphemeCat;
14
15/// External iterator for grapheme clusters and byte offsets.
16///
17/// This struct is created by the [`grapheme_indices`] method on the [`UnicodeSegmentation`]
18/// trait. See its documentation for more.
19///
20/// [`grapheme_indices`]: trait.UnicodeSegmentation.html#tymethod.grapheme_indices
21/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
22#[derive(Debug, Clone)]
23pub struct GraphemeIndices<'a> {
24 start_offset: usize,
25 iter: Graphemes<'a>,
26}
27
28impl<'a> GraphemeIndices<'a> {
29 #[inline]
30 /// View the underlying data (the part yet to be iterated) as a slice of the original string.
31 ///
32 /// ```rust
33 /// # use unicode_segmentation::UnicodeSegmentation;
34 /// let mut iter = "abc".grapheme_indices(true);
35 /// assert_eq!(iter.as_str(), "abc");
36 /// iter.next();
37 /// assert_eq!(iter.as_str(), "bc");
38 /// iter.next();
39 /// iter.next();
40 /// assert_eq!(iter.as_str(), "");
41 /// ```
42 pub fn as_str(&self) -> &'a str {
43 self.iter.as_str()
44 }
45}
46
47impl<'a> Iterator for GraphemeIndices<'a> {
48 type Item = (usize, &'a str);
49
50 #[inline]
51 fn next(&mut self) -> Option<(usize, &'a str)> {
52 self.iter
53 .next()
54 .map(|s| (s.as_ptr() as usize - self.start_offset, s))
55 }
56
57 #[inline]
58 fn size_hint(&self) -> (usize, Option<usize>) {
59 self.iter.size_hint()
60 }
61}
62
63impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
64 #[inline]
65 fn next_back(&mut self) -> Option<(usize, &'a str)> {
66 self.iter
67 .next_back()
68 .map(|s| (s.as_ptr() as usize - self.start_offset, s))
69 }
70}
71
72/// External iterator for a string's
73/// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries).
74///
75/// This struct is created by the [`graphemes`] method on the [`UnicodeSegmentation`] trait. See its
76/// documentation for more.
77///
78/// [`graphemes`]: trait.UnicodeSegmentation.html#tymethod.graphemes
79/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
80#[derive(Clone, Debug)]
81pub struct Graphemes<'a> {
82 string: &'a str,
83 cursor: GraphemeCursor,
84 cursor_back: GraphemeCursor,
85}
86
87impl<'a> Graphemes<'a> {
88 #[inline]
89 /// View the underlying data (the part yet to be iterated) as a slice of the original string.
90 ///
91 /// ```rust
92 /// # use unicode_segmentation::UnicodeSegmentation;
93 /// let mut iter = "abc".graphemes(true);
94 /// assert_eq!(iter.as_str(), "abc");
95 /// iter.next();
96 /// assert_eq!(iter.as_str(), "bc");
97 /// iter.next();
98 /// iter.next();
99 /// assert_eq!(iter.as_str(), "");
100 /// ```
101 pub fn as_str(&self) -> &'a str {
102 &self.string[self.cursor.cur_cursor()..self.cursor_back.cur_cursor()]
103 }
104}
105
106impl<'a> Iterator for Graphemes<'a> {
107 type Item = &'a str;
108
109 #[inline]
110 fn size_hint(&self) -> (usize, Option<usize>) {
111 let slen = self.cursor_back.cur_cursor() - self.cursor.cur_cursor();
112 (cmp::min(slen, 1), Some(slen))
113 }
114
115 #[inline]
116 fn next(&mut self) -> Option<&'a str> {
117 let start = self.cursor.cur_cursor();
118 if start == self.cursor_back.cur_cursor() {
119 return None;
120 }
121 let next = self.cursor.next_boundary(self.string, 0).unwrap().unwrap();
122 Some(&self.string[start..next])
123 }
124}
125
126impl<'a> DoubleEndedIterator for Graphemes<'a> {
127 #[inline]
128 fn next_back(&mut self) -> Option<&'a str> {
129 let end = self.cursor_back.cur_cursor();
130 if end == self.cursor.cur_cursor() {
131 return None;
132 }
133 let prev = self
134 .cursor_back
135 .prev_boundary(self.string, 0)
136 .unwrap()
137 .unwrap();
138 Some(&self.string[prev..end])
139 }
140}
141
142#[inline]
143pub fn new_graphemes(s: &str, is_extended: bool) -> Graphemes<'_> {
144 let len = s.len();
145 Graphemes {
146 string: s,
147 cursor: GraphemeCursor::new(0, len, is_extended),
148 cursor_back: GraphemeCursor::new(len, len, is_extended),
149 }
150}
151
152#[inline]
153pub fn new_grapheme_indices(s: &str, is_extended: bool) -> GraphemeIndices<'_> {
154 GraphemeIndices {
155 start_offset: s.as_ptr() as usize,
156 iter: new_graphemes(s, is_extended),
157 }
158}
159
// TODO: maybe unify with `PairResult`?
/// What is currently known about the potential boundary at the cursor.
#[derive(Clone, Debug, PartialEq, Eq)]
enum GraphemeState {
    /// Nothing is known yet.
    Unknown,
    /// Known not to be a boundary.
    NotBreak,
    /// Known to be a boundary.
    Break,
    /// The codepoint after the cursor has `Indic_Conjunct_Break=Consonant`, so
    /// this is a boundary unless it is preceded by another `InCB=Consonant`
    /// followed by a sequence of one or more `InCB=Linker` and zero or more
    /// `InCB=Extend` (in any order).
    InCbConsonant,
    /// The codepoint after the cursor is a Regional Indicator Symbol, so this
    /// is a boundary iff it is preceded by an even number of RIS codepoints.
    /// (GB12, GB13)
    Regional,
    /// The codepoint after the cursor is `Extended_Pictographic`, so whether
    /// this is a boundary depends on the pre-context according to GB11.
    Emoji,
}
182
183/// Cursor-based segmenter for grapheme clusters.
184///
185/// This allows working with ropes and other datastructures where the string is not contiguous or
186/// fully known at initialization time.
187#[derive(Clone, Debug)]
188pub struct GraphemeCursor {
189 /// Current cursor position.
190 offset: usize,
191 /// Total length of the string.
192 len: usize,
193 /// A config flag indicating whether this cursor computes legacy or extended
194 /// grapheme cluster boundaries (enables GB9a and GB9b if set).
195 is_extended: bool,
196 /// Information about the potential boundary at `offset`
197 state: GraphemeState,
198 /// Category of codepoint immediately preceding cursor, if known.
199 cat_before: Option<GraphemeCat>,
200 /// Category of codepoint immediately after cursor, if known.
201 cat_after: Option<GraphemeCat>,
202 /// If set, at least one more codepoint immediately preceding this offset
203 /// is needed to resolve whether there's a boundary at `offset`.
204 pre_context_offset: Option<usize>,
205 /// The number of `InCB=Linker` codepoints preceding `offset`
206 /// (potentially intermingled with `InCB=Extend`).
207 incb_linker_count: Option<usize>,
208 /// The number of RIS codepoints preceding `offset`. If `pre_context_offset`
209 /// is set, then counts the number of RIS between that and `offset`, otherwise
210 /// is an accurate count relative to the string.
211 ris_count: Option<usize>,
212 /// Set if a call to `prev_boundary` or `next_boundary` was suspended due
213 /// to needing more input.
214 resuming: bool,
215 /// Cached grapheme category and associated scalar value range.
216 grapheme_cat_cache: (u32, u32, GraphemeCat),
217}
218
/// Error value signalling that the supplied chunk did not hold enough content
/// to answer the query, so the caller has to provide more.
#[derive(Debug, PartialEq, Eq)]
pub enum GraphemeIncomplete {
    /// Additional pre-context is required. The caller should invoke
    /// `provide_context` with a chunk that ends at the given offset and then
    /// retry the query. Only returned when the `chunk_start` parameter is
    /// nonzero.
    PreContext(usize),

    /// `prev_boundary` needs to move past the beginning of the current chunk,
    /// so the chunk preceding it is requested. Only returned when the
    /// `chunk_start` parameter is nonzero.
    PrevChunk,

    /// `next_boundary` needs to move past the end of the current chunk, so
    /// the chunk following it is requested. Only returned when the chunk ends
    /// before the `len` given when the cursor was created.
    NextChunk,

    /// The supplied chunk does not contain the cursor position.
    InvalidOffset,
}
242
// Outcome of looking up a (before, after) pair of grapheme categories.
#[derive(Eq, PartialEq)]
enum PairResult {
    /// Never a break.
    NotBreak,
    /// Always a break.
    Break,
    /// A break iff the cursor is not in extended mode.
    Extended,
    /// A break unless the cursor is in extended mode and the position is
    /// preceded by a sequence of zero or more `InCB=Extend` and one or more
    /// `InCB=Linker` (in any order), itself preceded by another
    /// `InCB=Consonant`.
    InCbConsonant,
    /// A break if preceded by an even number of RIS.
    Regional,
    /// A break if preceded by emoji base and (Extend)*.
    Emoji,
}
262
263#[inline]
264fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult {
265 use self::PairResult::*;
266 use crate::tables::grapheme::GraphemeCat::*;
267 match (before, after) {
268 (GC_CR, GC_LF) => NotBreak, // GB3
269 (GC_Control | GC_CR | GC_LF, _) => Break, // GB4
270 (_, GC_Control | GC_CR | GC_LF) => Break, // GB5
271 (GC_L, GC_L | GC_V | GC_LV | GC_LVT) => NotBreak, // GB6
272 (GC_LV | GC_V, GC_V | GC_T) => NotBreak, // GB7
273 (GC_LVT | GC_T, GC_T) => NotBreak, // GB8
274 (_, GC_Extend | GC_ZWJ) => NotBreak, // GB9
275 (_, GC_SpacingMark) => Extended, // GB9a
276 (GC_Prepend, _) => Extended, // GB9b
277 (_, GC_InCB_Consonant) => InCbConsonant, // GB9c
278 (GC_ZWJ, GC_Extended_Pictographic) => Emoji, // GB11
279 (GC_Regional_Indicator, GC_Regional_Indicator) => Regional, // GB12, GB13
280 (_, _) => Break, // GB999
281 }
282}
283
284impl GraphemeCursor {
285 /// Create a new cursor. The string and initial offset are given at creation
286 /// time, but the contents of the string are not. The `is_extended` parameter
287 /// controls whether extended grapheme clusters are selected.
288 ///
289 /// The `offset` parameter must be on a codepoint boundary.
290 ///
291 /// ```rust
292 /// # use unicode_segmentation::GraphemeCursor;
293 /// let s = "हिन्दी";
294 /// let mut legacy = GraphemeCursor::new(0, s.len(), false);
295 /// assert_eq!(legacy.next_boundary(s, 0), Ok(Some("ह".len())));
296 /// let mut extended = GraphemeCursor::new(0, s.len(), true);
297 /// assert_eq!(extended.next_boundary(s, 0), Ok(Some("हि".len())));
298 /// ```
299 pub fn new(offset: usize, len: usize, is_extended: bool) -> GraphemeCursor {
300 let state = if offset == 0 || offset == len {
301 GraphemeState::Break
302 } else {
303 GraphemeState::Unknown
304 };
305 GraphemeCursor {
306 offset,
307 len,
308 state,
309 is_extended,
310 cat_before: None,
311 cat_after: None,
312 pre_context_offset: None,
313 incb_linker_count: None,
314 ris_count: None,
315 resuming: false,
316 grapheme_cat_cache: (0, 0, GraphemeCat::GC_Control),
317 }
318 }
319
320 fn grapheme_category(&mut self, ch: char) -> GraphemeCat {
321 use crate::tables::grapheme as gr;
322 use crate::tables::grapheme::GraphemeCat::*;
323
324 if ch <= '\u{7e}' {
325 // Special-case optimization for ascii, except U+007F. This
326 // improves performance even for many primarily non-ascii texts,
327 // due to use of punctuation and white space characters from the
328 // ascii range.
329 if ch >= '\u{20}' {
330 GC_Any
331 } else if ch == '\n' {
332 GC_LF
333 } else if ch == '\r' {
334 GC_CR
335 } else {
336 GC_Control
337 }
338 } else {
339 // If this char isn't within the cached range, update the cache to the
340 // range that includes it.
341 if (ch as u32) < self.grapheme_cat_cache.0 || (ch as u32) > self.grapheme_cat_cache.1 {
342 self.grapheme_cat_cache = gr::grapheme_category(ch);
343 }
344 self.grapheme_cat_cache.2
345 }
346 }
347
348 // Not sure I'm gonna keep this, the advantage over new() seems thin.
349
350 /// Set the cursor to a new location in the same string.
351 ///
352 /// ```rust
353 /// # use unicode_segmentation::GraphemeCursor;
354 /// let s = "abcd";
355 /// let mut cursor = GraphemeCursor::new(0, s.len(), false);
356 /// assert_eq!(cursor.cur_cursor(), 0);
357 /// cursor.set_cursor(2);
358 /// assert_eq!(cursor.cur_cursor(), 2);
359 /// ```
360 pub fn set_cursor(&mut self, offset: usize) {
361 if offset != self.offset {
362 self.offset = offset;
363 self.state = if offset == 0 || offset == self.len {
364 GraphemeState::Break
365 } else {
366 GraphemeState::Unknown
367 };
368 // reset state derived from text around cursor
369 self.cat_before = None;
370 self.cat_after = None;
371 self.incb_linker_count = None;
372 self.ris_count = None;
373 }
374 }
375
376 #[inline]
377 /// The current offset of the cursor. Equal to the last value provided to
378 /// `new()` or `set_cursor()`, or returned from `next_boundary()` or
379 /// `prev_boundary()`.
380 ///
381 /// ```rust
382 /// # use unicode_segmentation::GraphemeCursor;
383 /// // Two flags (🇷🇸🇮🇴), each flag is two RIS codepoints, each RIS is 4 bytes.
384 /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
385 /// let mut cursor = GraphemeCursor::new(4, flags.len(), false);
386 /// assert_eq!(cursor.cur_cursor(), 4);
387 /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
388 /// assert_eq!(cursor.cur_cursor(), 8);
389 /// ```
390 pub fn cur_cursor(&self) -> usize {
391 self.offset
392 }
393
394 /// Provide additional pre-context when it is needed to decide a boundary.
395 /// The end of the chunk must coincide with the value given in the
396 /// `GraphemeIncomplete::PreContext` request.
397 ///
398 /// ```rust
399 /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
400 /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
401 /// let mut cursor = GraphemeCursor::new(8, flags.len(), false);
402 /// // Not enough pre-context to decide if there's a boundary between the two flags.
403 /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(8)));
404 /// // Provide one more Regional Indicator Symbol of pre-context
405 /// cursor.provide_context(&flags[4..8], 4);
406 /// // Still not enough context to decide.
407 /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(4)));
408 /// // Provide additional requested context.
409 /// cursor.provide_context(&flags[0..4], 0);
410 /// // That's enough to decide (it always is when context goes to the start of the string)
411 /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Ok(true));
412 /// ```
413 pub fn provide_context(&mut self, chunk: &str, chunk_start: usize) {
414 use crate::tables::grapheme as gr;
415 assert!(chunk_start.saturating_add(chunk.len()) == self.pre_context_offset.unwrap());
416 self.pre_context_offset = None;
417 if self.is_extended && chunk_start + chunk.len() == self.offset {
418 let ch = chunk.chars().next_back().unwrap();
419 if self.grapheme_category(ch) == gr::GC_Prepend {
420 self.decide(false); // GB9b
421 return;
422 }
423 }
424 match self.state {
425 GraphemeState::InCbConsonant => self.handle_incb_consonant(chunk, chunk_start),
426 GraphemeState::Regional => self.handle_regional(chunk, chunk_start),
427 GraphemeState::Emoji => self.handle_emoji(chunk, chunk_start),
428 _ => {
429 if self.cat_before.is_none() && self.offset == chunk.len() + chunk_start {
430 let ch = chunk.chars().next_back().unwrap();
431 self.cat_before = Some(self.grapheme_category(ch));
432 }
433 }
434 }
435 }
436
437 #[inline]
438 fn decide(&mut self, is_break: bool) {
439 self.state = if is_break {
440 GraphemeState::Break
441 } else {
442 GraphemeState::NotBreak
443 };
444 }
445
446 #[inline]
447 fn decision(&mut self, is_break: bool) -> Result<bool, GraphemeIncomplete> {
448 self.decide(is_break);
449 Ok(is_break)
450 }
451
452 #[inline]
453 fn is_boundary_result(&self) -> Result<bool, GraphemeIncomplete> {
454 if self.state == GraphemeState::Break {
455 Ok(true)
456 } else if self.state == GraphemeState::NotBreak {
457 Ok(false)
458 } else if let Some(pre_context_offset) = self.pre_context_offset {
459 Err(GraphemeIncomplete::PreContext(pre_context_offset))
460 } else {
461 unreachable!("inconsistent state");
462 }
463 }
464
465 /// For handling rule GB9c:
466 ///
467 /// There's an `InCB=Consonant` after this, and we need to look back
468 /// to verify whether there should be a break.
469 ///
470 /// Seek backward to find an `InCB=Linker` preceded by an `InCB=Consonsnt`
471 /// (potentially separated by some number of `InCB=Linker` or `InCB=Extend`).
472 /// If we find the consonant in question, then there's no break; if we find a consonant
473 /// with no linker, or a non-linker non-extend non-consonant, or the start of text, there's a break;
474 /// otherwise we need more context
475 #[inline]
476 fn handle_incb_consonant(&mut self, chunk: &str, chunk_start: usize) {
477 use crate::tables::{self, grapheme as gr};
478
479 // GB9c only applies to extended grapheme clusters
480 if !self.is_extended {
481 self.decide(true);
482 return;
483 }
484
485 let mut incb_linker_count = self.incb_linker_count.unwrap_or(0);
486
487 for ch in chunk.chars().rev() {
488 if tables::is_incb_linker(ch) {
489 // We found an InCB linker
490 incb_linker_count += 1;
491 self.incb_linker_count = Some(incb_linker_count);
492 } else if tables::derived_property::InCB_Extend(ch) {
493 // We ignore InCB extends, continue
494 } else {
495 // Prev character is neither linker nor extend, break suppressed iff it's InCB=Consonant
496 let result = !(self.incb_linker_count.unwrap_or(0) > 0
497 && self.grapheme_category(ch) == gr::GC_InCB_Consonant);
498 self.decide(result);
499 return;
500 }
501 }
502
503 if chunk_start == 0 {
504 // Start of text and we still haven't found a consonant, so break
505 self.decide(true);
506 } else {
507 // We need more context
508 self.pre_context_offset = Some(chunk_start);
509 self.state = GraphemeState::InCbConsonant;
510 }
511 }
512
513 #[inline]
514 fn handle_regional(&mut self, chunk: &str, chunk_start: usize) {
515 use crate::tables::grapheme as gr;
516 let mut ris_count = self.ris_count.unwrap_or(0);
517 for ch in chunk.chars().rev() {
518 if self.grapheme_category(ch) != gr::GC_Regional_Indicator {
519 self.ris_count = Some(ris_count);
520 self.decide((ris_count % 2) == 0);
521 return;
522 }
523 ris_count += 1;
524 }
525 self.ris_count = Some(ris_count);
526 if chunk_start == 0 {
527 self.decide((ris_count % 2) == 0);
528 } else {
529 self.pre_context_offset = Some(chunk_start);
530 self.state = GraphemeState::Regional;
531 }
532 }
533
534 #[inline]
535 fn handle_emoji(&mut self, chunk: &str, chunk_start: usize) {
536 use crate::tables::grapheme as gr;
537 let mut iter = chunk.chars().rev();
538 if let Some(ch) = iter.next() {
539 if self.grapheme_category(ch) != gr::GC_ZWJ {
540 self.decide(true);
541 return;
542 }
543 }
544 for ch in iter {
545 match self.grapheme_category(ch) {
546 gr::GC_Extend => (),
547 gr::GC_Extended_Pictographic => {
548 self.decide(false);
549 return;
550 }
551 _ => {
552 self.decide(true);
553 return;
554 }
555 }
556 }
557 if chunk_start == 0 {
558 self.decide(true);
559 } else {
560 self.pre_context_offset = Some(chunk_start);
561 self.state = GraphemeState::Emoji;
562 }
563 }
564
565 #[inline]
566 /// Determine whether the current cursor location is a grapheme cluster boundary.
567 /// Only a part of the string need be supplied. If `chunk_start` is nonzero or
568 /// the length of `chunk` is not equal to `len` on creation, then this method
569 /// may return `GraphemeIncomplete::PreContext`. The caller should then
570 /// call `provide_context` with the requested chunk, then retry calling this
571 /// method.
572 ///
573 /// For partial chunks, if the cursor is not at the beginning or end of the
574 /// string, the chunk should contain at least the codepoint following the cursor.
575 /// If the string is nonempty, the chunk must be nonempty.
576 ///
577 /// All calls should have consistent chunk contents (ie, if a chunk provides
578 /// content for a given slice, all further chunks covering that slice must have
579 /// the same content for it).
580 ///
581 /// ```rust
582 /// # use unicode_segmentation::GraphemeCursor;
583 /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
584 /// let mut cursor = GraphemeCursor::new(8, flags.len(), false);
585 /// assert_eq!(cursor.is_boundary(flags, 0), Ok(true));
586 /// cursor.set_cursor(12);
587 /// assert_eq!(cursor.is_boundary(flags, 0), Ok(false));
588 /// ```
589 pub fn is_boundary(
590 &mut self,
591 chunk: &str,
592 chunk_start: usize,
593 ) -> Result<bool, GraphemeIncomplete> {
594 use crate::tables::grapheme as gr;
595 if self.state == GraphemeState::Break {
596 return Ok(true);
597 }
598 if self.state == GraphemeState::NotBreak {
599 return Ok(false);
600 }
601 if (self.offset < chunk_start || self.offset >= chunk_start.saturating_add(chunk.len()))
602 && (self.offset > chunk_start.saturating_add(chunk.len()) || self.cat_after.is_none())
603 {
604 return Err(GraphemeIncomplete::InvalidOffset);
605 }
606 if let Some(pre_context_offset) = self.pre_context_offset {
607 return Err(GraphemeIncomplete::PreContext(pre_context_offset));
608 }
609 let offset_in_chunk = self.offset.saturating_sub(chunk_start);
610 if self.cat_after.is_none() {
611 let ch = chunk[offset_in_chunk..].chars().next().unwrap();
612 self.cat_after = Some(self.grapheme_category(ch));
613 }
614 if self.offset == chunk_start {
615 let mut need_pre_context = true;
616 match self.cat_after.unwrap() {
617 gr::GC_InCB_Consonant => self.state = GraphemeState::InCbConsonant,
618 gr::GC_Regional_Indicator => self.state = GraphemeState::Regional,
619 gr::GC_Extended_Pictographic => self.state = GraphemeState::Emoji,
620 _ => need_pre_context = self.cat_before.is_none(),
621 }
622 if need_pre_context {
623 self.pre_context_offset = Some(chunk_start);
624 return Err(GraphemeIncomplete::PreContext(chunk_start));
625 }
626 }
627 if self.cat_before.is_none() {
628 let ch = chunk[..offset_in_chunk].chars().next_back().unwrap();
629 self.cat_before = Some(self.grapheme_category(ch));
630 }
631 match check_pair(self.cat_before.unwrap(), self.cat_after.unwrap()) {
632 PairResult::NotBreak => self.decision(false),
633 PairResult::Break => self.decision(true),
634 PairResult::Extended => {
635 let is_extended = self.is_extended;
636 self.decision(!is_extended)
637 }
638 PairResult::InCbConsonant => {
639 self.handle_incb_consonant(&chunk[..offset_in_chunk], chunk_start);
640 self.is_boundary_result()
641 }
642 PairResult::Regional => {
643 if let Some(ris_count) = self.ris_count {
644 return self.decision((ris_count % 2) == 0);
645 }
646 self.handle_regional(&chunk[..offset_in_chunk], chunk_start);
647 self.is_boundary_result()
648 }
649 PairResult::Emoji => {
650 self.handle_emoji(&chunk[..offset_in_chunk], chunk_start);
651 self.is_boundary_result()
652 }
653 }
654 }
655
656 #[inline]
657 /// Find the next boundary after the current cursor position. Only a part of
658 /// the string need be supplied. If the chunk is incomplete, then this
659 /// method might return `GraphemeIncomplete::PreContext` or
660 /// `GraphemeIncomplete::NextChunk`. In the former case, the caller should
661 /// call `provide_context` with the requested chunk, then retry. In the
662 /// latter case, the caller should provide the chunk following the one
663 /// given, then retry.
664 ///
665 /// See `is_boundary` for expectations on the provided chunk.
666 ///
667 /// ```rust
668 /// # use unicode_segmentation::GraphemeCursor;
669 /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
670 /// let mut cursor = GraphemeCursor::new(4, flags.len(), false);
671 /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
672 /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(16)));
673 /// assert_eq!(cursor.next_boundary(flags, 0), Ok(None));
674 /// ```
675 ///
676 /// And an example that uses partial strings:
677 ///
678 /// ```rust
679 /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
680 /// let s = "abcd";
681 /// let mut cursor = GraphemeCursor::new(0, s.len(), false);
682 /// assert_eq!(cursor.next_boundary(&s[..2], 0), Ok(Some(1)));
683 /// assert_eq!(cursor.next_boundary(&s[..2], 0), Err(GraphemeIncomplete::NextChunk));
684 /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(2)));
685 /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(3)));
686 /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(4)));
687 /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(None));
688 /// ```
689 pub fn next_boundary(
690 &mut self,
691 chunk: &str,
692 chunk_start: usize,
693 ) -> Result<Option<usize>, GraphemeIncomplete> {
694 if self.offset == self.len {
695 return Ok(None);
696 }
697 let mut iter = chunk[self.offset.saturating_sub(chunk_start)..].chars();
698 let mut ch = match iter.next() {
699 Some(ch) => ch,
700 None => return Err(GraphemeIncomplete::NextChunk),
701 };
702 loop {
703 if self.resuming {
704 if self.cat_after.is_none() {
705 self.cat_after = Some(self.grapheme_category(ch));
706 }
707 } else {
708 self.offset = self.offset.saturating_add(ch.len_utf8());
709 self.state = GraphemeState::Unknown;
710 self.cat_before = self.cat_after.take();
711 if self.cat_before.is_none() {
712 self.cat_before = Some(self.grapheme_category(ch));
713 }
714 if crate::tables::is_incb_linker(ch) {
715 self.incb_linker_count = Some(self.incb_linker_count.map_or(1, |c| c + 1));
716 } else if !crate::tables::derived_property::InCB_Extend(ch) {
717 self.incb_linker_count = Some(0);
718 }
719 if self.cat_before.unwrap() == GraphemeCat::GC_Regional_Indicator {
720 self.ris_count = self.ris_count.map(|c| c + 1);
721 } else {
722 self.ris_count = Some(0);
723 }
724 if let Some(next_ch) = iter.next() {
725 ch = next_ch;
726 self.cat_after = Some(self.grapheme_category(ch));
727 } else if self.offset == self.len {
728 self.decide(true);
729 } else {
730 self.resuming = true;
731 return Err(GraphemeIncomplete::NextChunk);
732 }
733 }
734 self.resuming = true;
735 if self.is_boundary(chunk, chunk_start)? {
736 self.resuming = false;
737 return Ok(Some(self.offset));
738 }
739 self.resuming = false;
740 }
741 }
742
743 /// Find the previous boundary after the current cursor position. Only a part
744 /// of the string need be supplied. If the chunk is incomplete, then this
745 /// method might return `GraphemeIncomplete::PreContext` or
746 /// `GraphemeIncomplete::PrevChunk`. In the former case, the caller should
747 /// call `provide_context` with the requested chunk, then retry. In the
748 /// latter case, the caller should provide the chunk preceding the one
749 /// given, then retry.
750 ///
751 /// See `is_boundary` for expectations on the provided chunk.
752 ///
753 /// ```rust
754 /// # use unicode_segmentation::GraphemeCursor;
755 /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
756 /// let mut cursor = GraphemeCursor::new(12, flags.len(), false);
757 /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(8)));
758 /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(0)));
759 /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(None));
760 /// ```
761 ///
762 /// And an example that uses partial strings (note the exact return is not
763 /// guaranteed, and may be `PrevChunk` or `PreContext` arbitrarily):
764 ///
765 /// ```rust
766 /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
767 /// let s = "abcd";
768 /// let mut cursor = GraphemeCursor::new(4, s.len(), false);
769 /// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Ok(Some(3)));
770 /// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Err(GraphemeIncomplete::PrevChunk));
771 /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(2)));
772 /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(1)));
773 /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(0)));
774 /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(None));
775 /// ```
776 pub fn prev_boundary(
777 &mut self,
778 chunk: &str,
779 chunk_start: usize,
780 ) -> Result<Option<usize>, GraphemeIncomplete> {
781 if self.offset == 0 {
782 return Ok(None);
783 }
784 if self.offset == chunk_start {
785 return Err(GraphemeIncomplete::PrevChunk);
786 }
787 let mut iter = chunk[..self.offset.saturating_sub(chunk_start)]
788 .chars()
789 .rev();
790 let mut ch = iter.next().unwrap();
791 loop {
792 if self.offset == chunk_start {
793 self.resuming = true;
794 return Err(GraphemeIncomplete::PrevChunk);
795 }
796 if self.resuming {
797 self.cat_before = Some(self.grapheme_category(ch));
798 } else {
799 self.offset -= ch.len_utf8();
800 self.cat_after = self.cat_before.take();
801 self.state = GraphemeState::Unknown;
802 if let Some(incb_linker_count) = self.incb_linker_count {
803 self.ris_count = if incb_linker_count > 0 && crate::tables::is_incb_linker(ch) {
804 Some(incb_linker_count - 1)
805 } else if crate::tables::derived_property::InCB_Extend(ch) {
806 Some(incb_linker_count)
807 } else {
808 None
809 };
810 }
811 if let Some(ris_count) = self.ris_count {
812 self.ris_count = if ris_count > 0 {
813 Some(ris_count - 1)
814 } else {
815 None
816 };
817 }
818 if let Some(prev_ch) = iter.next() {
819 ch = prev_ch;
820 self.cat_before = Some(self.grapheme_category(ch));
821 } else if self.offset == 0 {
822 self.decide(true);
823 } else {
824 self.resuming = true;
825 self.cat_after = Some(self.grapheme_category(ch));
826 return Err(GraphemeIncomplete::PrevChunk);
827 }
828 }
829 self.resuming = true;
830 if self.is_boundary(chunk, chunk_start)? {
831 self.resuming = false;
832 return Ok(Some(self.offset));
833 }
834 self.resuming = false;
835 }
836 }
837}
838
/// A cursor between the second and third flag, given only the text from
/// offset 4 on, must ask for pre-context and then report a boundary.
#[test]
fn test_grapheme_cursor_ris_precontext() {
    let text = "\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}";
    let tail = &text[4..];
    let mut cursor = GraphemeCursor::new(8, text.len(), true);

    let first_try = cursor.is_boundary(tail, 4);
    assert_eq!(first_try, Err(GraphemeIncomplete::PreContext(4)));

    cursor.provide_context(&text[..4], 0);
    let second_try = cursor.is_boundary(tail, 4);
    assert_eq!(second_try, Ok(true));
}
850
/// A cursor sitting at the start of its chunk must request pre-context even
/// for a plain pair lookup; CR LF is then reported as not a boundary.
#[test]
fn test_grapheme_cursor_chunk_start_require_precontext() {
    let text = "\r\n";
    let tail = &text[1..];
    let mut cursor = GraphemeCursor::new(1, text.len(), true);

    let first_try = cursor.is_boundary(tail, 1);
    assert_eq!(first_try, Err(GraphemeIncomplete::PreContext(1)));

    cursor.provide_context(&text[..1], 0);
    let second_try = cursor.is_boundary(tail, 1);
    assert_eq!(second_try, Ok(false));
}
862
/// Stepping back from the middle of the second chunk runs out of text, asks
/// for the previous chunk, and then finishes at the chunk seam.
#[test]
fn test_grapheme_cursor_prev_boundary() {
    let text = "abcd";
    let (head, tail) = (&text[..2], &text[2..]);
    let mut cursor = GraphemeCursor::new(3, text.len(), true);

    let suspended = cursor.prev_boundary(tail, 2);
    assert_eq!(suspended, Err(GraphemeIncomplete::PrevChunk));

    let resumed = cursor.prev_boundary(head, 0);
    assert_eq!(resumed, Ok(Some(2)));
}
873
874#[test]
875fn test_grapheme_cursor_prev_boundary_chunk_start() {
876 let s = "abcd";
877 let mut c = GraphemeCursor::new(2, s.len(), true);
878 assert_eq!(
879 c.prev_boundary(&s[2..], 2),
880 Err(GraphemeIncomplete::PrevChunk)
881 );
882 assert_eq!(c.prev_boundary(&s[..2], 0), Ok(Some(1)));
883}