unicode_segmentation/word.rs
1// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2// file at the top-level directory of this distribution and at
3// http://rust-lang.org/COPYRIGHT.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
10
11use core::cmp;
12use core::iter::Filter;
13
14use crate::tables::word::WordCat;
15
16/// An iterator over the substrings of a string which, after splitting the string on
17/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
18/// contain any characters with the
19/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
20/// property, or with
21/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
22///
23/// This struct is created by the [`unicode_words`] method on the [`UnicodeSegmentation`] trait. See
24/// its documentation for more.
25///
26/// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words
27/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
28#[derive(Debug)]
29pub struct UnicodeWords<'a> {
30 inner: Filter<UWordBounds<'a>, fn(&&str) -> bool>,
31}
32
33impl<'a> Iterator for UnicodeWords<'a> {
34 type Item = &'a str;
35
36 #[inline]
37 fn next(&mut self) -> Option<&'a str> {
38 self.inner.next()
39 }
40
41 #[inline]
42 fn size_hint(&self) -> (usize, Option<usize>) {
43 self.inner.size_hint()
44 }
45}
46impl<'a> DoubleEndedIterator for UnicodeWords<'a> {
47 #[inline]
48 fn next_back(&mut self) -> Option<&'a str> {
49 self.inner.next_back()
50 }
51}
52
53/// An iterator over the substrings of a string which, after splitting the string on
54/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
55/// contain any characters with the
56/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
57/// property, or with
58/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
59/// This iterator also provides the byte offsets for each substring.
60///
61/// This struct is created by the [`unicode_word_indices`] method on the [`UnicodeSegmentation`] trait. See
62/// its documentation for more.
63///
64/// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices
65/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
66#[derive(Debug)]
67pub struct UnicodeWordIndices<'a> {
68 #[allow(clippy::type_complexity)]
69 inner: Filter<UWordBoundIndices<'a>, fn(&(usize, &str)) -> bool>,
70}
71
72impl<'a> Iterator for UnicodeWordIndices<'a> {
73 type Item = (usize, &'a str);
74
75 #[inline]
76 fn next(&mut self) -> Option<(usize, &'a str)> {
77 self.inner.next()
78 }
79
80 #[inline]
81 fn size_hint(&self) -> (usize, Option<usize>) {
82 self.inner.size_hint()
83 }
84}
85impl<'a> DoubleEndedIterator for UnicodeWordIndices<'a> {
86 #[inline]
87 fn next_back(&mut self) -> Option<(usize, &'a str)> {
88 self.inner.next_back()
89 }
90}
91
/// External iterator for a string's
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
///
/// The iterator can be driven from both ends: segments are split off the front by
/// `next` and off the back by `next_back`.
///
/// This struct is created by the [`split_word_bounds`] method on the [`UnicodeSegmentation`]
/// trait. See its documentation for more.
///
/// [`split_word_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_word_bounds
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Debug, Clone)]
pub struct UWordBounds<'a> {
    // The part of the input that has not been yielded yet (shrinks from both ends).
    string: &'a str,
    // Word category remembered by forward iteration for the character it stopped on,
    // so that the next call to `next` does not have to look it up again.
    cat: Option<WordCat>,
    // Same as `cat`, but remembered by (and for) backward iteration.
    catb: Option<WordCat>,
}
106
/// External iterator for word boundaries and byte offsets.
///
/// Yields `(byte offset, segment)` pairs, where the offset is relative to the start of
/// the string the iterator was created from.
///
/// This struct is created by the [`split_word_bound_indices`] method on the
/// [`UnicodeSegmentation`] trait. See its documentation for more.
///
/// [`split_word_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_word_bound_indices
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Debug, Clone)]
pub struct UWordBoundIndices<'a> {
    // Address of the first byte of the original string (its `as_ptr()` as an integer);
    // subtracted from a segment's address to obtain the segment's byte offset.
    start_offset: usize,
    // The underlying segment iterator.
    iter: UWordBounds<'a>,
}
119
120impl<'a> UWordBoundIndices<'a> {
121 #[inline]
122 /// View the underlying data (the part yet to be iterated) as a slice of the original string.
123 ///
124 /// ```rust
125 /// # use unicode_segmentation::UnicodeSegmentation;
126 /// let mut iter = "Hello world".split_word_bound_indices();
127 /// assert_eq!(iter.as_str(), "Hello world");
128 /// iter.next();
129 /// assert_eq!(iter.as_str(), " world");
130 /// iter.next();
131 /// assert_eq!(iter.as_str(), "world");
132 /// ```
133 pub fn as_str(&self) -> &'a str {
134 self.iter.as_str()
135 }
136}
137
138impl<'a> Iterator for UWordBoundIndices<'a> {
139 type Item = (usize, &'a str);
140
141 #[inline]
142 fn next(&mut self) -> Option<(usize, &'a str)> {
143 self.iter
144 .next()
145 .map(|s| (s.as_ptr() as usize - self.start_offset, s))
146 }
147
148 #[inline]
149 fn size_hint(&self) -> (usize, Option<usize>) {
150 self.iter.size_hint()
151 }
152}
153
154impl<'a> DoubleEndedIterator for UWordBoundIndices<'a> {
155 #[inline]
156 fn next_back(&mut self) -> Option<(usize, &'a str)> {
157 self.iter
158 .next_back()
159 .map(|s| (s.as_ptr() as usize - self.start_offset, s))
160 }
161}
162
// State machine for the word boundary rules.
//
// While a segment is being built, the state records which kind of character the
// scan saw most recently (ignoring skipped Extend/Format/ZWJ characters), which in
// turn decides what may extend the segment next. The same states are used by the
// forward (`next`) and backward (`next_back`) scans.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum UWordBoundsState {
    // Nothing has been consumed for the current segment yet.
    Start,
    // Most recent character was an ALetter.
    Letter,
    // Most recent character was a Hebrew_Letter.
    HLetter,
    // Most recent character was Numeric.
    Numeric,
    // Most recent character was Katakana.
    Katakana,
    // Most recent character was ExtendNumLet.
    ExtendNumLet,
    // Inside a run of Regional_Indicator characters; the payload tracks pairing.
    Regional(RegionalState),
    // Handling Format/Extend/ZWJ characters, or looking past a MidLetter / MidNum /
    // quote character; the payload says what is allowed or required next.
    FormatExtend(FormatExtendType),
    // Forward: the segment started with a ZWJ. Backward: an Extended_Pictographic
    // character was seen and a ZWJ in front of it would join it (rule WB3c).
    Zwj,
    // Forward: a ZWJ followed by an Extended_Pictographic character was joined (WB3c).
    Emoji,
    // Most recent character was WSegSpace (rule WB3d).
    WSegSpace,
}
178
// Subtypes for the FormatExtend state in UWordBoundsState: what the scan accepts or
// requires as the next significant character.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum FormatExtendType {
    // Backward scan only: Format/Extend/ZWJ characters were seen first, so the next
    // character is treated as if the scan were still in the Start state.
    AcceptAny,
    // A run of Format/Extend/ZWJ characters is being absorbed; forward, nothing else
    // may join the segment afterwards.
    AcceptNone,
    // A MidLetter / MidNumLet / single quote was passed tentatively; a letter must
    // follow, otherwise the scan backs up to the saved position.
    RequireLetter,
    // A double quote next to a Hebrew letter was passed tentatively; another Hebrew
    // letter must follow, otherwise the scan backs up.
    RequireHLetter,
    // A single quote next to a Hebrew letter (rule WB7a). Forward, the quote stays in
    // the segment whether or not a letter follows; backward, a Hebrew letter must
    // precede it or the scan backs up.
    AcceptQLetter,
    // A MidNum / MidNumLet / single quote was passed tentatively; a Numeric must
    // follow, otherwise the scan backs up to the saved position.
    RequireNumeric,
}
189
// Pairing status of a run of Regional_Indicator characters (rule WB13c).
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum RegionalState {
    // One indicator of a pair has been seen; one more may join the segment.
    Half,
    // A complete pair has been seen; no further indicator may join the segment.
    Full,
    // Backward scan only: the parity of the run has not been determined yet (it is
    // found by counting the indicators that precede the current position).
    Unknown,
}
196
197fn is_emoji(ch: char) -> bool {
198 use crate::tables::emoji;
199 emoji::emoji_category(ch).2 == emoji::EmojiCat::EC_Extended_Pictographic
200}
201
202impl<'a> Iterator for UWordBounds<'a> {
203 type Item = &'a str;
204
205 #[inline]
206 fn size_hint(&self) -> (usize, Option<usize>) {
207 let slen = self.string.len();
208 (cmp::min(slen, 1), Some(slen))
209 }
210
211 #[inline]
212 fn next(&mut self) -> Option<&'a str> {
213 use self::FormatExtendType::*;
214 use self::UWordBoundsState::*;
215 use crate::tables::word as wd;
216 if self.string.is_empty() {
217 return None;
218 }
219
220 let mut take_curr = true;
221 let mut take_cat = true;
222 let mut idx = 0;
223 let mut saveidx = 0;
224 let mut state = Start;
225 let mut cat = wd::WC_Any;
226 let mut savecat = wd::WC_Any;
227
228 // If extend/format/zwj were skipped. Handles precedence of WB3d over WB4
229 let mut skipped_format_extend = false;
230 for (curr, ch) in self.string.char_indices() {
231 idx = curr;
232 // Whether or not the previous category was ZWJ
233 // ZWJs get collapsed, so this handles precedence of WB3c over WB4
234 let prev_zwj = cat == wd::WC_ZWJ;
235 // if there's a category cached, grab it
236 cat = match self.cat {
237 None => wd::word_category(ch).2,
238 _ => self.cat.take().unwrap(),
239 };
240 take_cat = true;
241
242 // handle rule WB4
243 // just skip all format, extend, and zwj chars
244 // note that Start is a special case: if there's a bunch of Format | Extend
245 // characters at the beginning of a block of text, dump them out as one unit.
246 //
247 // (This is not obvious from the wording of UAX#29, but if you look at the
248 // test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt
249 // then the "correct" interpretation of WB4 becomes apparent.)
250 if state != Start {
251 match cat {
252 wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => {
253 skipped_format_extend = true;
254 continue;
255 }
256 _ => {}
257 }
258 }
259
260 // rule WB3c
261 // WB4 makes all ZWJs collapse into the previous state
262 // but you can still be in a Zwj state if you started with Zwj
263 //
264 // This means that an EP + Zwj will collapse into EP, which is wrong,
265 // since EP+EP is not a boundary but EP+ZWJ+EP is
266 //
267 // Thus, we separately keep track of whether or not the last character
268 // was a ZWJ. This is an additional bit of state tracked outside of the
269 // state enum; the state enum represents the last non-zwj state encountered.
270 // When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state,
271 // however we are in the previous state for the purposes of all other rules.
272 if prev_zwj && is_emoji(ch) {
273 state = Emoji;
274 continue;
275 }
276 // Don't use `continue` in this match without updating `cat`
277 state = match state {
278 Start if cat == wd::WC_CR => {
279 idx += match self.get_next_cat(idx) {
280 Some(wd::WC_LF) => 1, // rule WB3
281 _ => 0,
282 };
283 break; // rule WB3a
284 }
285 Start => match cat {
286 wd::WC_ALetter => Letter, // rule WB5, WB6, WB9, WB13a
287 wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB6, WB7a, WB7b, WB9, WB13a
288 wd::WC_Numeric => Numeric, // rule WB8, WB10, WB12, WB13a
289 wd::WC_Katakana => Katakana, // rule WB13, WB13a
290 wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a, WB13b
291 wd::WC_Regional_Indicator => Regional(RegionalState::Half), // rule WB13c
292 wd::WC_LF | wd::WC_Newline => break, // rule WB3a
293 wd::WC_ZWJ => Zwj, // rule WB3c
294 wd::WC_WSegSpace => WSegSpace, // rule WB3d
295 _ => {
296 if let Some(ncat) = self.get_next_cat(idx) {
297 // rule WB4
298 if ncat == wd::WC_Format || ncat == wd::WC_Extend || ncat == wd::WC_ZWJ
299 {
300 state = FormatExtend(AcceptNone);
301 self.cat = Some(ncat);
302 continue;
303 }
304 }
305 break; // rule WB999
306 }
307 },
308 WSegSpace => match cat {
309 wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
310 _ => {
311 take_curr = false;
312 break;
313 }
314 },
315 Zwj => {
316 // We already handle WB3c above.
317 take_curr = false;
318 break;
319 }
320 Letter | HLetter => match cat {
321 wd::WC_ALetter => Letter, // rule WB5
322 wd::WC_Hebrew_Letter => HLetter, // rule WB5
323 wd::WC_Numeric => Numeric, // rule WB9
324 wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
325 wd::WC_Double_Quote if state == HLetter => {
326 savecat = cat;
327 saveidx = idx;
328 FormatExtend(RequireHLetter) // rule WB7b
329 }
330 wd::WC_Single_Quote if state == HLetter => {
331 FormatExtend(AcceptQLetter) // rule WB7a
332 }
333 wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
334 savecat = cat;
335 saveidx = idx;
336 FormatExtend(RequireLetter) // rule WB6
337 }
338 _ => {
339 take_curr = false;
340 break;
341 }
342 },
343 Numeric => match cat {
344 wd::WC_Numeric => Numeric, // rule WB8
345 wd::WC_ALetter => Letter, // rule WB10
346 wd::WC_Hebrew_Letter => HLetter, // rule WB10
347 wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
348 wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
349 savecat = cat;
350 saveidx = idx;
351 FormatExtend(RequireNumeric) // rule WB12
352 }
353 _ => {
354 take_curr = false;
355 break;
356 }
357 },
358 Katakana => match cat {
359 wd::WC_Katakana => Katakana, // rule WB13
360 wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
361 _ => {
362 take_curr = false;
363 break;
364 }
365 },
366 ExtendNumLet => match cat {
367 wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
368 wd::WC_ALetter => Letter, // rule WB13b
369 wd::WC_Hebrew_Letter => HLetter, // rule WB13b
370 wd::WC_Numeric => Numeric, // rule WB13b
371 wd::WC_Katakana => Katakana, // rule WB13b
372 _ => {
373 take_curr = false;
374 break;
375 }
376 },
377 Regional(RegionalState::Full) => {
378 // if it reaches here we've gone too far,
379 // a full flag can only compose with ZWJ/Extend/Format
380 // proceeding it.
381 take_curr = false;
382 break;
383 }
384 Regional(RegionalState::Half) => match cat {
385 wd::WC_Regional_Indicator => Regional(RegionalState::Full), // rule WB13c
386 _ => {
387 take_curr = false;
388 break;
389 }
390 },
391 Regional(_) => {
392 unreachable!("RegionalState::Unknown should not occur on forward iteration")
393 }
394 Emoji => {
395 // We already handle WB3c above. If you've reached this point, the emoji sequence is over.
396 take_curr = false;
397 break;
398 }
399 FormatExtend(t) => match t {
400 // handle FormatExtends depending on what type
401 RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB11
402 RequireLetter | AcceptQLetter if cat == wd::WC_ALetter => Letter, // rule WB7
403 RequireLetter | AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // WB7a
404 RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
405 AcceptNone | AcceptQLetter => {
406 take_curr = false; // emit all the Format|Extend characters
407 take_cat = false;
408 break;
409 }
410 _ => break, // rewind (in if statement below)
411 },
412 }
413 }
414
415 if let FormatExtend(t) = state {
416 // we were looking for something and didn't find it; we have to back up
417 if t == RequireLetter || t == RequireHLetter || t == RequireNumeric {
418 idx = saveidx;
419 cat = savecat;
420 take_curr = false;
421 }
422 }
423
424 self.cat = if take_curr {
425 idx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
426 None
427 } else if take_cat {
428 Some(cat)
429 } else {
430 None
431 };
432
433 let retstr = &self.string[..idx];
434 self.string = &self.string[idx..];
435 Some(retstr)
436 }
437}
438
439impl<'a> DoubleEndedIterator for UWordBounds<'a> {
440 #[inline]
441 fn next_back(&mut self) -> Option<&'a str> {
442 use self::FormatExtendType::*;
443 use self::UWordBoundsState::*;
444 use crate::tables::word as wd;
445 if self.string.is_empty() {
446 return None;
447 }
448
449 let mut take_curr = true;
450 let mut take_cat = true;
451 let mut idx = self.string.len();
452 idx -= self.string.chars().next_back().unwrap().len_utf8();
453 let mut previdx = idx;
454 let mut saveidx = idx;
455 let mut state = Start;
456 let mut savestate = Start;
457 let mut cat = wd::WC_Any;
458
459 let mut skipped_format_extend = false;
460
461 for (curr, ch) in self.string.char_indices().rev() {
462 previdx = idx;
463 idx = curr;
464
465 // if there's a category cached, grab it
466 cat = match self.catb {
467 None => wd::word_category(ch).2,
468 _ => self.catb.take().unwrap(),
469 };
470 take_cat = true;
471
472 // backward iterator over word boundaries. Mostly the same as the forward
473 // iterator, with two weirdnesses:
474 // (1) If we encounter a single quote in the Start state, we have to check for a
475 // Hebrew Letter immediately before it.
476 // (2) Format and Extend char handling takes some gymnastics.
477
478 if cat == wd::WC_Extend || cat == wd::WC_Format || (cat == wd::WC_ZWJ && state != Zwj) {
479 // WB3c has more priority so we should not
480 // fold in that case
481 if !matches!(state, FormatExtend(_) | Start) {
482 saveidx = previdx;
483 savestate = state;
484 state = FormatExtend(AcceptNone);
485 }
486
487 if state != Start {
488 continue;
489 }
490 } else if state == FormatExtend(AcceptNone) {
491 // finished a scan of some Format|Extend chars, restore previous state
492 state = savestate;
493 previdx = saveidx;
494 take_cat = false;
495 skipped_format_extend = true;
496 }
497
498 // Don't use `continue` in this match without updating `catb`
499 state = match state {
500 Start | FormatExtend(AcceptAny) => match cat {
501 _ if is_emoji(ch) => Zwj,
502 wd::WC_ALetter => Letter, // rule WB5, WB7, WB10, WB13b
503 wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB7, WB7c, WB10, WB13b
504 wd::WC_Numeric => Numeric, // rule WB8, WB9, WB11, WB13b
505 wd::WC_Katakana => Katakana, // rule WB13, WB13b
506 wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
507 wd::WC_Regional_Indicator => Regional(RegionalState::Unknown), // rule WB13c
508 // rule WB4:
509 wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => FormatExtend(AcceptAny),
510 wd::WC_Single_Quote => {
511 saveidx = idx;
512 FormatExtend(AcceptQLetter) // rule WB7a
513 }
514 wd::WC_WSegSpace => WSegSpace,
515 wd::WC_CR | wd::WC_LF | wd::WC_Newline => {
516 if state == Start {
517 if cat == wd::WC_LF {
518 idx -= match self.get_prev_cat(idx) {
519 Some(wd::WC_CR) => 1, // rule WB3
520 _ => 0,
521 };
522 }
523 } else {
524 take_curr = false;
525 }
526 break; // rule WB3a
527 }
528 _ => break, // rule WB999
529 },
530 Zwj => match cat {
531 // rule WB3c
532 wd::WC_ZWJ => FormatExtend(AcceptAny),
533 _ => {
534 take_curr = false;
535 break;
536 }
537 },
538 WSegSpace => match cat {
539 // rule WB3d
540 wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
541 _ => {
542 take_curr = false;
543 break;
544 }
545 },
546 Letter | HLetter => match cat {
547 wd::WC_ALetter => Letter, // rule WB5
548 wd::WC_Hebrew_Letter => HLetter, // rule WB5
549 wd::WC_Numeric => Numeric, // rule WB10
550 wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
551 wd::WC_Double_Quote if state == HLetter => {
552 saveidx = previdx;
553 FormatExtend(RequireHLetter) // rule WB7c
554 }
555 wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
556 saveidx = previdx;
557 FormatExtend(RequireLetter) // rule WB7
558 }
559 _ => {
560 take_curr = false;
561 break;
562 }
563 },
564 Numeric => match cat {
565 wd::WC_Numeric => Numeric, // rule WB8
566 wd::WC_ALetter => Letter, // rule WB9
567 wd::WC_Hebrew_Letter => HLetter, // rule WB9
568 wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
569 wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
570 saveidx = previdx;
571 FormatExtend(RequireNumeric) // rule WB11
572 }
573 _ => {
574 take_curr = false;
575 break;
576 }
577 },
578 Katakana => match cat {
579 wd::WC_Katakana => Katakana, // rule WB13
580 wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
581 _ => {
582 take_curr = false;
583 break;
584 }
585 },
586 ExtendNumLet => match cat {
587 wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
588 wd::WC_ALetter => Letter, // rule WB13a
589 wd::WC_Hebrew_Letter => HLetter, // rule WB13a
590 wd::WC_Numeric => Numeric, // rule WB13a
591 wd::WC_Katakana => Katakana, // rule WB13a
592 _ => {
593 take_curr = false;
594 break;
595 }
596 },
597 Regional(mut regional_state) => match cat {
598 // rule WB13c
599 wd::WC_Regional_Indicator => {
600 if regional_state == RegionalState::Unknown {
601 let count = self.string[..previdx]
602 .chars()
603 .rev()
604 .map(|c| wd::word_category(c).2)
605 .filter(|&c| {
606 !(c == wd::WC_ZWJ || c == wd::WC_Extend || c == wd::WC_Format)
607 })
608 .take_while(|&c| c == wd::WC_Regional_Indicator)
609 .count();
610 regional_state = if count % 2 == 0 {
611 RegionalState::Full
612 } else {
613 RegionalState::Half
614 };
615 }
616 if regional_state == RegionalState::Full {
617 take_curr = false;
618 break;
619 } else {
620 Regional(RegionalState::Full)
621 }
622 }
623 _ => {
624 take_curr = false;
625 break;
626 }
627 },
628 Emoji => {
629 if is_emoji(ch) {
630 // rule WB3c
631 Zwj
632 } else {
633 take_curr = false;
634 break;
635 }
636 }
637 FormatExtend(t) => match t {
638 RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB12
639 RequireLetter if cat == wd::WC_ALetter => Letter, // rule WB6
640 RequireLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB6
641 AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7a
642 RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
643 _ => break, // backtrack will happens
644 },
645 }
646 }
647
648 if let FormatExtend(t) = state {
649 // if we required something but didn't find it, backtrack
650 if t == RequireLetter
651 || t == RequireHLetter
652 || t == RequireNumeric
653 || t == AcceptNone
654 || t == AcceptQLetter
655 {
656 previdx = saveidx;
657 take_cat = false;
658 take_curr = false;
659 }
660 }
661
662 self.catb = if take_curr {
663 None
664 } else {
665 idx = previdx;
666 if take_cat {
667 Some(cat)
668 } else {
669 None
670 }
671 };
672
673 let retstr = &self.string[idx..];
674 self.string = &self.string[..idx];
675 Some(retstr)
676 }
677}
678
679impl<'a> UWordBounds<'a> {
680 #[inline]
681 /// View the underlying data (the part yet to be iterated) as a slice of the original string.
682 ///
683 /// ```rust
684 /// # use unicode_segmentation::UnicodeSegmentation;
685 /// let mut iter = "Hello world".split_word_bounds();
686 /// assert_eq!(iter.as_str(), "Hello world");
687 /// iter.next();
688 /// assert_eq!(iter.as_str(), " world");
689 /// iter.next();
690 /// assert_eq!(iter.as_str(), "world");
691 /// ```
692 pub fn as_str(&self) -> &'a str {
693 self.string
694 }
695
696 #[inline]
697 fn get_next_cat(&self, idx: usize) -> Option<WordCat> {
698 use crate::tables::word as wd;
699 let nidx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
700 if nidx < self.string.len() {
701 let nch = self.string[nidx..].chars().next().unwrap();
702 Some(wd::word_category(nch).2)
703 } else {
704 None
705 }
706 }
707
708 #[inline]
709 fn get_prev_cat(&self, idx: usize) -> Option<WordCat> {
710 use crate::tables::word as wd;
711 if idx > 0 {
712 let nch = self.string[..idx].chars().next_back().unwrap();
713 Some(wd::word_category(nch).2)
714 } else {
715 None
716 }
717 }
718}
719
720#[inline]
721pub fn new_word_bounds(s: &str) -> UWordBounds<'_> {
722 UWordBounds {
723 string: s,
724 cat: None,
725 catb: None,
726 }
727}
728
729#[inline]
730pub fn new_word_bound_indices(s: &str) -> UWordBoundIndices<'_> {
731 UWordBoundIndices {
732 start_offset: s.as_ptr() as usize,
733 iter: new_word_bounds(s),
734 }
735}
736
737#[inline]
738fn has_alphanumeric(s: &&str) -> bool {
739 use crate::tables::util::is_alphanumeric;
740
741 s.chars().any(is_alphanumeric)
742}
743
744#[inline]
745pub fn new_unicode_words(s: &str) -> UnicodeWords<'_> {
746 use super::UnicodeSegmentation;
747
748 UnicodeWords {
749 inner: s.split_word_bounds().filter(has_alphanumeric),
750 }
751}
752
753#[inline]
754pub fn new_unicode_word_indices(s: &str) -> UnicodeWordIndices<'_> {
755 use super::UnicodeSegmentation;
756
757 UnicodeWordIndices {
758 inner: s
759 .split_word_bound_indices()
760 .filter(|(_, c)| has_alphanumeric(c)),
761 }
762}
763
#[cfg(test)]
mod tests {
    use crate::tables::word as wd;

    // U+070F (SYRIAC ABBREVIATION MARK) must be classified as ALetter.
    #[test]
    fn test_syriac_abbr_mark() {
        let category = wd::word_category('\u{70f}').2;
        assert_eq!(category, wd::WC_ALetter);
    }

    // U+06DD (ARABIC END OF AYAH) must be classified as Numeric.
    #[test]
    fn test_end_of_ayah_cat() {
        let category = wd::word_category('\u{6dd}').2;
        assert_eq!(category, wd::WC_Numeric);
    }
}
779}