unicode_segmentation/lib.rs
1// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
2// file at the top-level directory of this distribution and at
3// http://rust-lang.org/COPYRIGHT.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
10
11//! Iterators which split strings on Grapheme Cluster, Word or Sentence boundaries, according
12//! to the [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/) rules.
13//!
14//! ```rust
15//! extern crate unicode_segmentation;
16//!
17//! use unicode_segmentation::UnicodeSegmentation;
18//!
19//! fn main() {
20//! let s = "a̐éö̲\r\n";
21//! let g = UnicodeSegmentation::graphemes(s, true).collect::<Vec<&str>>();
22//! let b: &[_] = &["a̐", "é", "ö̲", "\r\n"];
23//! assert_eq!(g, b);
24//!
25//! let s = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
26//! let w = s.unicode_words().collect::<Vec<&str>>();
27//! let b: &[_] = &["The", "quick", "brown", "fox", "can't", "jump", "32.3", "feet", "right"];
28//! assert_eq!(w, b);
29//!
30//! let s = "The quick (\"brown\") fox";
31//! let w = s.split_word_bounds().collect::<Vec<&str>>();
32//! let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox"];
33//! assert_eq!(w, b);
34//! }
35//! ```
36//!
37//! # no_std
38//!
39//! unicode-segmentation does not depend on libstd, so it can be used in crates
40//! with the `#![no_std]` attribute.
41//!
42//! # crates.io
43//!
44//! You can use this package in your project by adding the following
45//! to your `Cargo.toml`:
46//!
47//! ```toml
48//! [dependencies]
49//! unicode-segmentation = "1.9.0"
50//! ```
51
52#![deny(missing_docs, unsafe_code)]
53#![doc(
54 html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png",
55 html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png"
56)]
57#![no_std]
58
59pub use grapheme::{GraphemeCursor, GraphemeIncomplete};
60pub use grapheme::{GraphemeIndices, Graphemes};
61pub use sentence::{USentenceBoundIndices, USentenceBounds, UnicodeSentences};
62pub use tables::UNICODE_VERSION;
63pub use word::{UWordBoundIndices, UWordBounds, UnicodeWordIndices, UnicodeWords};
64
65mod grapheme;
66mod sentence;
67#[rustfmt::skip]
68mod tables;
69mod word;
70
71/// Methods for segmenting strings according to
72/// [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/).
73pub trait UnicodeSegmentation {
74 /// Returns an iterator over the [grapheme clusters][graphemes] of `self`.
75 ///
76 /// [graphemes]: http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
77 ///
78 /// If `is_extended` is true, the iterator is over the
79 /// *extended grapheme clusters*;
80 /// otherwise, the iterator is over the *legacy grapheme clusters*.
81 /// [UAX#29](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries)
82 /// recommends extended grapheme cluster boundaries for general processing.
83 ///
84 /// # Examples
85 ///
86 /// ```
87 /// # use self::unicode_segmentation::UnicodeSegmentation;
88 /// let gr1 = UnicodeSegmentation::graphemes("a\u{310}e\u{301}o\u{308}\u{332}", true)
89 /// .collect::<Vec<&str>>();
90 /// let b: &[_] = &["a\u{310}", "e\u{301}", "o\u{308}\u{332}"];
91 ///
92 /// assert_eq!(&gr1[..], b);
93 ///
94 /// let gr2 = UnicodeSegmentation::graphemes("a\r\nb🇷🇺🇸🇹", true).collect::<Vec<&str>>();
95 /// let b: &[_] = &["a", "\r\n", "b", "🇷🇺", "🇸🇹"];
96 ///
97 /// assert_eq!(&gr2[..], b);
98 /// ```
99 fn graphemes(&self, is_extended: bool) -> Graphemes<'_>;
100
101 /// Returns an iterator over the grapheme clusters of `self` and their
102 /// byte offsets. See `graphemes()` for more information.
103 ///
104 /// # Examples
105 ///
106 /// ```
107 /// # use self::unicode_segmentation::UnicodeSegmentation;
108 /// let gr_inds = UnicodeSegmentation::grapheme_indices("a̐éö̲\r\n", true)
109 /// .collect::<Vec<(usize, &str)>>();
110 /// let b: &[_] = &[(0, "a̐"), (3, "é"), (6, "ö̲"), (11, "\r\n")];
111 ///
112 /// assert_eq!(&gr_inds[..], b);
113 /// ```
114 fn grapheme_indices(&self, is_extended: bool) -> GraphemeIndices<'_>;
115
116 /// Returns an iterator over the words of `self`, separated on
117 /// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
118 ///
119 /// Here, "words" are just those substrings which, after splitting on
120 /// UAX#29 word boundaries, contain any alphanumeric characters. That is, the
121 /// substring must contain at least one character with the
122 /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
123 /// property, or with
124 /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
125 ///
126 /// # Example
127 ///
128 /// ```
129 /// # use self::unicode_segmentation::UnicodeSegmentation;
130 /// let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
131 /// let uw1 = uws.unicode_words().collect::<Vec<&str>>();
132 /// let b: &[_] = &["The", "quick", "brown", "fox", "can't", "jump", "32.3", "feet", "right"];
133 ///
134 /// assert_eq!(&uw1[..], b);
135 /// ```
136 fn unicode_words(&self) -> UnicodeWords<'_>;
137
138 /// Returns an iterator over the words of `self`, separated on
139 /// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), and their
140 /// offsets.
141 ///
142 /// Here, "words" are just those substrings which, after splitting on
143 /// UAX#29 word boundaries, contain any alphanumeric characters. That is, the
144 /// substring must contain at least one character with the
145 /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
146 /// property, or with
147 /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
148 ///
149 /// # Example
150 ///
151 /// ```
152 /// # use self::unicode_segmentation::UnicodeSegmentation;
153 /// let uwis = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
154 /// let uwi1 = uwis.unicode_word_indices().collect::<Vec<(usize, &str)>>();
155 /// let b: &[_] = &[(0, "The"), (4, "quick"), (12, "brown"), (20, "fox"), (24, "can't"),
156 /// (30, "jump"), (35, "32.3"), (40, "feet"), (46, "right")];
157 ///
158 /// assert_eq!(&uwi1[..], b);
159 /// ```
160 fn unicode_word_indices(&self) -> UnicodeWordIndices<'_>;
161
162 /// Returns an iterator over substrings of `self` separated on
163 /// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
164 ///
165 /// The concatenation of the substrings returned by this function is just the original string.
166 ///
167 /// # Example
168 ///
169 /// ```
170 /// # use self::unicode_segmentation::UnicodeSegmentation;
171 /// let swu1 = "The quick (\"brown\") fox".split_word_bounds().collect::<Vec<&str>>();
172 /// let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox"];
173 ///
174 /// assert_eq!(&swu1[..], b);
175 /// ```
176 fn split_word_bounds(&self) -> UWordBounds<'_>;
177
178 /// Returns an iterator over substrings of `self`, split on UAX#29 word boundaries,
179 /// and their offsets. See `split_word_bounds()` for more information.
180 ///
181 /// # Example
182 ///
183 /// ```
184 /// # use self::unicode_segmentation::UnicodeSegmentation;
185 /// let swi1 = "Brr, it's 29.3°F!".split_word_bound_indices().collect::<Vec<(usize, &str)>>();
186 /// let b: &[_] = &[(0, "Brr"), (3, ","), (4, " "), (5, "it's"), (9, " "), (10, "29.3"),
187 /// (14, "°"), (16, "F"), (17, "!")];
188 ///
189 /// assert_eq!(&swi1[..], b);
190 /// ```
191 fn split_word_bound_indices(&self) -> UWordBoundIndices<'_>;
192
193 /// Returns an iterator over substrings of `self` separated on
194 /// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
195 ///
196 /// Here, "sentences" are just those substrings which, after splitting on
197 /// UAX#29 sentence boundaries, contain any alphanumeric characters. That is, the
198 /// substring must contain at least one character with the
199 /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
200 /// property, or with
201 /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
202 ///
203 /// # Example
204 ///
205 /// ```
206 /// # use self::unicode_segmentation::UnicodeSegmentation;
207 /// let uss = "Mr. Fox jumped. [...] The dog was too lazy.";
208 /// let us1 = uss.unicode_sentences().collect::<Vec<&str>>();
209 /// let b: &[_] = &["Mr. ", "Fox jumped. ", "The dog was too lazy."];
210 ///
211 /// assert_eq!(&us1[..], b);
212 /// ```
213 fn unicode_sentences(&self) -> UnicodeSentences<'_>;
214
215 /// Returns an iterator over substrings of `self` separated on
216 /// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
217 ///
218 /// The concatenation of the substrings returned by this function is just the original string.
219 ///
220 /// # Example
221 ///
222 /// ```
223 /// # use self::unicode_segmentation::UnicodeSegmentation;
224 /// let ssbs = "Mr. Fox jumped. [...] The dog was too lazy.";
225 /// let ssb1 = ssbs.split_sentence_bounds().collect::<Vec<&str>>();
226 /// let b: &[_] = &["Mr. ", "Fox jumped. ", "[...] ", "The dog was too lazy."];
227 ///
228 /// assert_eq!(&ssb1[..], b);
229 /// ```
230 fn split_sentence_bounds(&self) -> USentenceBounds<'_>;
231
232 /// Returns an iterator over substrings of `self`, split on UAX#29 sentence boundaries,
233 /// and their offsets. See `split_sentence_bounds()` for more information.
234 ///
235 /// # Example
236 ///
237 /// ```
238 /// # use self::unicode_segmentation::UnicodeSegmentation;
239 /// let ssis = "Mr. Fox jumped. [...] The dog was too lazy.";
240 /// let ssi1 = ssis.split_sentence_bound_indices().collect::<Vec<(usize, &str)>>();
241 /// let b: &[_] = &[(0, "Mr. "), (4, "Fox jumped. "), (16, "[...] "),
242 /// (22, "The dog was too lazy.")];
243 ///
244 /// assert_eq!(&ssi1[..], b);
245 /// ```
246 fn split_sentence_bound_indices(&self) -> USentenceBoundIndices<'_>;
247}
248
249impl UnicodeSegmentation for str {
250 #[inline]
251 fn graphemes(&self, is_extended: bool) -> Graphemes {
252 grapheme::new_graphemes(self, is_extended)
253 }
254
255 #[inline]
256 fn grapheme_indices(&self, is_extended: bool) -> GraphemeIndices {
257 grapheme::new_grapheme_indices(self, is_extended)
258 }
259
260 #[inline]
261 fn unicode_words(&self) -> UnicodeWords {
262 word::new_unicode_words(self)
263 }
264
265 #[inline]
266 fn unicode_word_indices(&self) -> UnicodeWordIndices {
267 word::new_unicode_word_indices(self)
268 }
269
270 #[inline]
271 fn split_word_bounds(&self) -> UWordBounds {
272 word::new_word_bounds(self)
273 }
274
275 #[inline]
276 fn split_word_bound_indices(&self) -> UWordBoundIndices {
277 word::new_word_bound_indices(self)
278 }
279
280 #[inline]
281 fn unicode_sentences(&self) -> UnicodeSentences {
282 sentence::new_unicode_sentences(self)
283 }
284
285 #[inline]
286 fn split_sentence_bounds(&self) -> USentenceBounds {
287 sentence::new_sentence_bounds(self)
288 }
289
290 #[inline]
291 fn split_sentence_bound_indices(&self) -> USentenceBoundIndices {
292 sentence::new_sentence_bound_indices(self)
293 }
294}