intaglio/bytes.rs
1//! Intern arbitrary bytes.
2//!
3//! This module provides a nearly identical API to the one found in the
4//! top-level of this crate. There is one important difference:
5//!
6//! 1. Interned contents are `&[u8]` instead of `&str`. Additionally, `Vec<u8>`
7//! is used where `String` would have been used.
8//!
9//! # Example: intern byte string
10//!
11//! ```
12//! # use intaglio::bytes::SymbolTable;
13//! # fn example() -> Result<(), Box<dyn std::error::Error>> {
14//! let mut table = SymbolTable::new();
15//! let sym = table.intern(&b"abc"[..])?;
16//! assert_eq!(sym, table.intern(b"abc".to_vec())?);
17//! assert_eq!(Some(&b"abc"[..]), table.get(sym));
18//! # Ok(())
19//! # }
20//! # example().unwrap();
21//! ```
22//!
23//! # Example: symbol iterators
24//!
25//! ```
26//! # use std::collections::HashMap;
27//! # use intaglio::bytes::SymbolTable;
28//! # use intaglio::Symbol;
29//! # fn example() -> Result<(), Box<dyn std::error::Error>> {
30//! let mut table = SymbolTable::new();
31//! let sym = table.intern(&b"abc"[..])?;
32//! // Retrieve set of `Symbol`s.
33//! let all_symbols = table.all_symbols();
34//! assert_eq!(vec![sym], all_symbols.collect::<Vec<_>>());
35//!
36//! table.intern(&b"xyz"[..])?;
37//! let mut map = HashMap::new();
38//! map.insert(Symbol::new(0), &b"abc"[..]);
39//! map.insert(Symbol::new(1), &b"xyz"[..]);
40//! // Retrieve symbol to byte content mappings.
41//! let iter = table.iter();
42//! assert_eq!(map, iter.collect::<HashMap<_, _>>());
43//! # Ok(())
44//! # }
45//! # example().unwrap();
46//! ```
47//!
48//! # Performance
49//!
50//! In general, one should expect this crate's performance on `&[u8]` to be
51//! roughly similar to performance on `&str`.
52
53use core::hash::BuildHasher;
54use core::iter::{FromIterator, FusedIterator, Zip};
55use core::marker::PhantomData;
56use core::mem::ManuallyDrop;
57use core::ops::Range;
58use core::slice;
59use std::borrow::Cow;
60use std::collections::hash_map::{HashMap, RandomState};
61
62use crate::internal::Interned;
63use crate::{Symbol, SymbolOverflowError, DEFAULT_SYMBOL_TABLE_CAPACITY};
64
65/// An iterator over all [`Symbol`]s in a [`SymbolTable`].
66///
67/// See the [`all_symbols`](SymbolTable::all_symbols) method in [`SymbolTable`].
68///
69/// # Usage
70///
71/// ```
72/// # use intaglio::bytes::SymbolTable;
73/// # fn example() -> Result<(), Box<dyn std::error::Error>> {
74/// let mut table = SymbolTable::new();
75/// let sym = table.intern(&b"abc"[..])?;
76/// let all_symbols = table.all_symbols();
77/// assert_eq!(vec![sym], all_symbols.collect::<Vec<_>>());
78/// # Ok(())
79/// # }
80/// # example().unwrap();
81/// ```
82#[derive(Debug, Clone, Hash, PartialEq, Eq)]
83#[cfg_attr(docsrs, doc(cfg(feature = "bytes")))]
84pub struct AllSymbols<'a> {
85 range: Range<usize>,
86 // Hold a shared reference to the underlying `SymbolTable` to ensure the
87 // table is not modified while we are iterating which would make the results
88 // not match the real state.
89 phantom: PhantomData<&'a SymbolTable>,
90}
91
92impl Iterator for AllSymbols<'_> {
93 type Item = Symbol;
94
95 fn next(&mut self) -> Option<Self::Item> {
96 let next = self.range.next()?;
97 debug_assert!(u32::try_from(next).is_ok());
98 Some((next as u32).into())
99 }
100
101 fn size_hint(&self) -> (usize, Option<usize>) {
102 self.range.size_hint()
103 }
104
105 fn count(self) -> usize {
106 self.range.count()
107 }
108
109 fn last(self) -> Option<Self::Item> {
110 let last = self.range.last()?;
111 debug_assert!(u32::try_from(last).is_ok());
112 Some((last as u32).into())
113 }
114
115 fn nth(&mut self, n: usize) -> Option<Self::Item> {
116 let nth = self.range.nth(n)?;
117 debug_assert!(u32::try_from(nth).is_ok());
118 Some((nth as u32).into())
119 }
120
121 fn collect<B: FromIterator<Self::Item>>(self) -> B {
122 self.range.map(|sym| Symbol::from(sym as u32)).collect()
123 }
124}
125
126impl DoubleEndedIterator for AllSymbols<'_> {
127 fn next_back(&mut self) -> Option<Self::Item> {
128 let next = self.range.next_back()?;
129 debug_assert!(u32::try_from(next).is_ok());
130 Some((next as u32).into())
131 }
132
133 fn nth_back(&mut self, n: usize) -> Option<Self::Item> {
134 let nth = self.range.nth_back(n)?;
135 debug_assert!(u32::try_from(nth).is_ok());
136 Some((nth as u32).into())
137 }
138}
139
140impl FusedIterator for AllSymbols<'_> {}
141
142/// An iterator over all interned byte strings in a [`SymbolTable`].
143///
144/// See the [`bytestrings`](SymbolTable::bytestrings) method in [`SymbolTable`].
145///
146/// # Usage
147///
148/// ```
149/// # use intaglio::bytes::SymbolTable;
150/// # fn example() -> Result<(), Box<dyn std::error::Error>> {
151/// let mut table = SymbolTable::new();
152/// let sym = table.intern(b"abc".to_vec())?;
153/// let bytestrings = table.bytestrings();
154/// assert_eq!(vec![&b"abc"[..]], bytestrings.collect::<Vec<_>>());
155/// # Ok(())
156/// # }
157/// # example().unwrap();
158/// ```
159#[derive(Debug, Clone)]
160#[cfg_attr(docsrs, doc(cfg(feature = "bytes")))]
161pub struct Bytestrings<'a>(slice::Iter<'a, Interned<[u8]>>);
162
163impl<'a> Iterator for Bytestrings<'a> {
164 type Item = &'a [u8];
165
166 fn next(&mut self) -> Option<Self::Item> {
167 self.0.next().map(Interned::as_slice)
168 }
169
170 fn size_hint(&self) -> (usize, Option<usize>) {
171 self.0.size_hint()
172 }
173
174 fn count(self) -> usize {
175 self.0.count()
176 }
177
178 fn last(self) -> Option<Self::Item> {
179 self.0.last().map(Interned::as_slice)
180 }
181
182 fn nth(&mut self, n: usize) -> Option<Self::Item> {
183 self.0.nth(n).map(Interned::as_slice)
184 }
185
186 fn collect<B: FromIterator<Self::Item>>(self) -> B {
187 self.0.map(Interned::as_slice).collect()
188 }
189}
190
191impl DoubleEndedIterator for Bytestrings<'_> {
192 fn next_back(&mut self) -> Option<Self::Item> {
193 self.0.next_back().map(Interned::as_slice)
194 }
195
196 fn nth_back(&mut self, n: usize) -> Option<Self::Item> {
197 self.0.nth_back(n).map(Interned::as_slice)
198 }
199
200 fn rfold<B, F>(self, accum: B, f: F) -> B
201 where
202 F: FnMut(B, Self::Item) -> B,
203 {
204 self.0.map(Interned::as_slice).rfold(accum, f)
205 }
206}
207
208impl ExactSizeIterator for Bytestrings<'_> {
209 fn len(&self) -> usize {
210 self.0.len()
211 }
212}
213
214impl FusedIterator for Bytestrings<'_> {}
215
216/// An iterator over all symbols and interned byte strings in a [`SymbolTable`].
217///
218/// See the [`iter`](SymbolTable::iter) method in [`SymbolTable`].
219///
220/// # Usage
221///
222/// ```
223/// # use std::collections::HashMap;
224/// # use intaglio::bytes::SymbolTable;
225/// # use intaglio::Symbol;
226/// # fn example() -> Result<(), Box<dyn std::error::Error>> {
227/// let mut table = SymbolTable::new();
228/// let sym = table.intern(b"abc".to_vec())?;
229/// let iter = table.iter();
230/// let mut map = HashMap::new();
231/// map.insert(Symbol::new(0), &b"abc"[..]);
232/// assert_eq!(map, iter.collect::<HashMap<_, _>>());
233/// # Ok(())
234/// # }
235/// # example().unwrap();
236/// ```
237#[derive(Debug, Clone)]
238#[cfg_attr(docsrs, doc(cfg(feature = "bytes")))]
239pub struct Iter<'a>(Zip<AllSymbols<'a>, Bytestrings<'a>>);
240
241impl<'a> Iterator for Iter<'a> {
242 type Item = (Symbol, &'a [u8]);
243
244 fn next(&mut self) -> Option<Self::Item> {
245 self.0.next()
246 }
247
248 fn size_hint(&self) -> (usize, Option<usize>) {
249 self.0.size_hint()
250 }
251
252 fn count(self) -> usize {
253 self.0.count()
254 }
255
256 fn last(self) -> Option<Self::Item> {
257 self.0.last()
258 }
259
260 fn nth(&mut self, n: usize) -> Option<Self::Item> {
261 self.0.nth(n)
262 }
263
264 fn collect<B: FromIterator<Self::Item>>(self) -> B {
265 self.0.collect()
266 }
267}
268
269impl FusedIterator for Iter<'_> {}
270
271impl<'a, S> IntoIterator for &'a SymbolTable<S> {
272 type Item = (Symbol, &'a [u8]);
273 type IntoIter = Iter<'a>;
274
275 fn into_iter(self) -> Self::IntoIter {
276 self.iter()
277 }
278}
279
280/// Byte string interner.
281///
282/// This symbol table is implemented by storing byte strings with a fast path for
283/// `&[u8]` that are already `'static`.
284///
285/// See module documentation for more.
286///
287/// # Usage
288///
289/// ```
290/// # use intaglio::bytes::SymbolTable;
291/// # fn example() -> Result<(), Box<dyn std::error::Error>> {
292/// let mut table = SymbolTable::new();
293/// let sym = table.intern(&b"abc"[..])?;
294/// assert_eq!(sym, table.intern(b"abc".to_vec())?);
295/// assert!(table.contains(sym));
296/// assert!(table.is_interned(b"abc"));
297/// # Ok(())
298/// # }
299/// # example().unwrap();
300/// ```
301#[derive(Default, Debug)]
302#[cfg_attr(docsrs, doc(cfg(feature = "bytes")))]
303pub struct SymbolTable<S = RandomState> {
304 map: ManuallyDrop<HashMap<&'static [u8], Symbol, S>>,
305 vec: ManuallyDrop<Vec<Interned<[u8]>>>,
306}
307
308impl<S> Drop for SymbolTable<S> {
309 fn drop(&mut self) {
310 // SAFETY: No mutable references to `SymbolTable` internal fields are
311 // given out, which means `ManuallyDrop::drop` can only be invoked in
312 // this `Drop::drop` impl. Interal fields are guaranteed to be
313 // initialized by `SymbolTable` constructors.
314 unsafe {
315 // `Interned` requires that the `'static` references it gives out
316 // are dropped before the owning buffer stored in the `Interned`.
317 ManuallyDrop::drop(&mut self.map);
318 ManuallyDrop::drop(&mut self.vec);
319 }
320 }
321}
322
323impl SymbolTable<RandomState> {
324 /// Constructs a new, empty `SymbolTable` with [default capacity].
325 ///
326 /// This function will always allocate. To construct a symbol table without
327 /// allocating, call [`SymbolTable::with_capacity(0)`].
328 ///
329 /// # Examples
330 ///
331 /// ```
332 /// # use intaglio::bytes::SymbolTable;
333 /// let table = SymbolTable::new();
334 /// assert_eq!(0, table.len());
335 /// assert!(table.capacity() >= 4096);
336 /// ```
337 ///
338 /// [default capacity]: DEFAULT_SYMBOL_TABLE_CAPACITY
339 /// [`SymbolTable::with_capacity(0)`]: Self::with_capacity
340 #[must_use]
341 pub fn new() -> Self {
342 Self::with_capacity(DEFAULT_SYMBOL_TABLE_CAPACITY)
343 }
344
345 /// Constructs a new, empty `SymbolTable` with the specified capacity.
346 ///
347 /// The symbol table will be able to hold at least `capacity` byte strings
348 /// without reallocating. If `capacity` is 0, the symbol table will not
349 /// allocate.
350 ///
351 /// # Examples
352 ///
353 /// ```
354 /// # use intaglio::bytes::SymbolTable;
355 /// let table = SymbolTable::with_capacity(10);
356 /// assert_eq!(0, table.len());
357 /// assert!(table.capacity() >= 10);
358 /// ```
359 #[must_use]
360 pub fn with_capacity(capacity: usize) -> Self {
361 let capacity = capacity.next_power_of_two();
362 Self {
363 map: ManuallyDrop::new(HashMap::with_capacity(capacity)),
364 vec: ManuallyDrop::new(Vec::with_capacity(capacity)),
365 }
366 }
367}
368
369impl<S> SymbolTable<S> {
370 /// Constructs a new, empty `SymbolTable` with
371 /// [default capacity](DEFAULT_SYMBOL_TABLE_CAPACITY) and the given hash
372 /// builder.
373 ///
374 /// # Examples
375 ///
376 /// ```
377 /// # use std::collections::hash_map::RandomState;
378 /// # use intaglio::bytes::SymbolTable;
379 /// let hash_builder = RandomState::new();
380 /// let table = SymbolTable::with_hasher(hash_builder);
381 /// assert_eq!(0, table.len());
382 /// assert!(table.capacity() >= 4096);
383 /// ```
384 pub fn with_hasher(hash_builder: S) -> Self {
385 Self::with_capacity_and_hasher(DEFAULT_SYMBOL_TABLE_CAPACITY, hash_builder)
386 }
387
388 /// Constructs a new, empty `SymbolTable` with the specified capacity and
389 /// the given hash builder.
390 ///
391 /// # Examples
392 ///
393 /// ```
394 /// # use std::collections::hash_map::RandomState;
395 /// # use intaglio::bytes::SymbolTable;
396 /// let hash_builder = RandomState::new();
397 /// let table = SymbolTable::with_capacity_and_hasher(10, hash_builder);
398 /// assert_eq!(0, table.len());
399 /// assert!(table.capacity() >= 10);
400 /// ```
401 pub fn with_capacity_and_hasher(capacity: usize, hash_builder: S) -> Self {
402 Self {
403 map: ManuallyDrop::new(HashMap::with_capacity_and_hasher(capacity, hash_builder)),
404 vec: ManuallyDrop::new(Vec::with_capacity(capacity)),
405 }
406 }
407
408 /// Returns the number of byte strings the table can hold without
409 /// reallocating.
410 ///
411 /// # Examples
412 ///
413 /// ```
414 /// # use intaglio::bytes::SymbolTable;
415 /// let table = SymbolTable::with_capacity(10);
416 /// assert!(table.capacity() >= 10);
417 /// ```
418 pub fn capacity(&self) -> usize {
419 usize::min(self.vec.capacity(), self.map.capacity())
420 }
421
422 /// Returns the number of interned byte strings in the table.
423 ///
424 /// # Examples
425 ///
426 /// ```
427 /// # use intaglio::bytes::SymbolTable;
428 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
429 /// let mut table = SymbolTable::new();
430 /// assert_eq!(0, table.len());
431 ///
432 /// table.intern(b"abc".to_vec())?;
433 /// // only uniquely interned byte strings grow the symbol table.
434 /// table.intern(b"abc".to_vec())?;
435 /// table.intern(b"xyz".to_vec())?;
436 /// assert_eq!(2, table.len());
437 /// # Ok(())
438 /// # }
439 /// # example().unwrap();
440 /// ```
441 pub fn len(&self) -> usize {
442 self.vec.len()
443 }
444
445 /// Returns `true` if the symbol table contains no interned byte strings.
446 ///
447 /// # Examples
448 ///
449 /// ```
450 /// # use intaglio::bytes::SymbolTable;
451 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
452 /// let mut table = SymbolTable::new();
453 /// assert!(table.is_empty());
454 ///
455 /// table.intern(b"abc".to_vec())?;
456 /// assert!(!table.is_empty());
457 /// # Ok(())
458 /// # }
459 /// # example().unwrap();
460 /// ```
461 pub fn is_empty(&self) -> bool {
462 self.vec.is_empty()
463 }
464
465 /// Returns `true` if the symbol table contains the given symbol.
466 ///
467 /// # Examples
468 ///
469 /// ```
470 /// # use intaglio::bytes::SymbolTable;
471 /// # use intaglio::Symbol;
472 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
473 /// let mut table = SymbolTable::new();
474 /// assert!(!table.contains(Symbol::new(0)));
475 ///
476 /// let sym = table.intern(b"abc".to_vec())?;
477 /// assert!(table.contains(Symbol::new(0)));
478 /// assert!(table.contains(sym));
479 /// # Ok(())
480 /// # }
481 /// # example().unwrap();
482 /// ```
483 #[must_use]
484 pub fn contains(&self, id: Symbol) -> bool {
485 self.get(id).is_some()
486 }
487
488 /// Returns a reference to the byte string associated with the given symbol.
489 ///
490 /// If the given symbol does not exist in the underlying symbol table,
491 /// `None` is returned.
492 ///
493 /// The lifetime of the returned reference is bound to the symbol table.
494 ///
495 /// # Examples
496 ///
497 /// ```
498 /// # use intaglio::bytes::SymbolTable;
499 /// # use intaglio::Symbol;
500 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
501 /// let mut table = SymbolTable::new();
502 /// assert!(table.get(Symbol::new(0)).is_none());
503 ///
504 /// let sym = table.intern(b"abc".to_vec())?;
505 /// assert_eq!(Some(&b"abc"[..]), table.get(Symbol::new(0)));
506 /// assert_eq!(Some(&b"abc"[..]), table.get(sym));
507 /// # Ok(())
508 /// # }
509 /// # example().unwrap();
510 /// ```
511 #[must_use]
512 pub fn get(&self, id: Symbol) -> Option<&[u8]> {
513 let bytes = self.vec.get(usize::from(id))?;
514 Some(bytes.as_slice())
515 }
516
517 /// Returns an iterator over all [`Symbol`]s and byte strings in the
518 /// [`SymbolTable`].
519 ///
520 /// # Examples
521 ///
522 /// ```
523 /// # use std::collections::HashMap;
524 /// # use intaglio::bytes::SymbolTable;
525 /// # use intaglio::Symbol;
526 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
527 /// let mut table = SymbolTable::new();
528 /// table.intern(b"abc".to_vec())?;
529 /// table.intern(b"xyz".to_vec())?;
530 /// table.intern(b"123".to_vec())?;
531 /// table.intern(b"789".to_vec())?;
532 ///
533 /// let iter = table.iter();
534 /// let mut map = HashMap::new();
535 /// map.insert(Symbol::new(0), &b"abc"[..]);
536 /// map.insert(Symbol::new(1), &b"xyz"[..]);
537 /// map.insert(Symbol::new(2), &b"123"[..]);
538 /// map.insert(Symbol::new(3), &b"789"[..]);
539 /// assert_eq!(map, iter.collect::<HashMap<_, _>>());
540 /// # Ok(())
541 /// # }
542 /// # example().unwrap();
543 /// ```
544 ///
545 /// ```
546 /// # use intaglio::bytes::SymbolTable;
547 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
548 /// let mut table = SymbolTable::new();
549 /// table.intern(b"abc".to_vec())?;
550 /// table.intern(b"xyz".to_vec())?;
551 /// table.intern(b"123".to_vec())?;
552 /// table.intern(b"789".to_vec())?;
553 ///
554 /// let iter = table.iter();
555 /// assert_eq!(table.len(), iter.count());
556 /// # Ok(())
557 /// # }
558 /// # example().unwrap();
559 /// ```
560 pub fn iter(&self) -> Iter<'_> {
561 Iter(self.all_symbols().zip(self.bytestrings()))
562 }
563
564 /// Returns an iterator over all [`Symbol`]s in the [`SymbolTable`].
565 ///
566 /// # Examples
567 ///
568 /// ```
569 /// # use intaglio::bytes::SymbolTable;
570 /// # use intaglio::Symbol;
571 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
572 /// let mut table = SymbolTable::new();
573 /// table.intern(b"abc".to_vec())?;
574 /// table.intern(b"xyz".to_vec())?;
575 /// table.intern(b"123".to_vec())?;
576 /// table.intern(b"789".to_vec())?;
577 ///
578 /// let mut all_symbols = table.all_symbols();
579 /// assert_eq!(Some(Symbol::new(0)), all_symbols.next());
580 /// assert_eq!(Some(Symbol::new(1)), all_symbols.nth_back(2));
581 /// assert_eq!(None, all_symbols.next());
582 /// # Ok(())
583 /// # }
584 /// # example().unwrap();
585 /// ```
586 ///
587 /// ```
588 /// # use intaglio::bytes::SymbolTable;
589 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
590 /// let mut table = SymbolTable::new();
591 /// table.intern(b"abc".to_vec())?;
592 /// table.intern(b"xyz".to_vec())?;
593 /// table.intern(b"123".to_vec())?;
594 /// table.intern(b"789".to_vec())?;
595 ///
596 /// let all_symbols = table.all_symbols();
597 /// assert_eq!(table.len(), all_symbols.count());
598 /// # Ok(())
599 /// # }
600 /// # example().unwrap();
601 /// ```
602 pub fn all_symbols(&self) -> AllSymbols<'_> {
603 AllSymbols {
604 range: 0..self.len(),
605 phantom: PhantomData,
606 }
607 }
608
609 /// Returns an iterator over all byte strings in the [`SymbolTable`].
610 ///
611 /// # Examples
612 ///
613 /// ```
614 /// # use intaglio::bytes::SymbolTable;
615 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
616 /// let mut table = SymbolTable::new();
617 /// table.intern(b"abc".to_vec())?;
618 /// table.intern(b"xyz".to_vec())?;
619 /// table.intern(b"123".to_vec())?;
620 /// table.intern(b"789".to_vec())?;
621 ///
622 /// let mut bytestrings = table.bytestrings();
623 /// assert_eq!(Some(&b"abc"[..]), bytestrings.next());
624 /// assert_eq!(Some(&b"xyz"[..]), bytestrings.nth_back(2));
625 /// assert_eq!(None, bytestrings.next());
626 /// # Ok(())
627 /// # }
628 /// # example().unwrap();
629 /// ```
630 ///
631 /// ```
632 /// # use intaglio::bytes::SymbolTable;
633 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
634 /// let mut table = SymbolTable::new();
635 /// table.intern(b"abc".to_vec())?;
636 /// table.intern(b"xyz".to_vec())?;
637 /// table.intern(b"123".to_vec())?;
638 /// table.intern(b"789".to_vec())?;
639 ///
640 /// let bytestrings = table.bytestrings();
641 /// assert_eq!(table.len(), bytestrings.count());
642 /// # Ok(())
643 /// # }
644 /// # example().unwrap();
645 /// ```
646 pub fn bytestrings(&self) -> Bytestrings<'_> {
647 Bytestrings(self.vec.iter())
648 }
649}
650
651impl<S> SymbolTable<S>
652where
653 S: BuildHasher,
654{
655 /// Intern a byte string for the lifetime of the symbol table.
656 ///
657 /// The returned `Symbol` allows retrieving of the underlying bytes.
658 /// Equal byte strings will be inserted into the symbol table exactly once.
659 ///
660 /// This function only allocates if the underlying symbol table has no
661 /// remaining capacity.
662 ///
663 /// # Errors
664 ///
665 /// If the symbol table would grow larger than `u32::MAX` interned
666 /// byte strings, the [`Symbol`] counter would overflow and a
667 /// [`SymbolOverflowError`] is returned.
668 ///
669 /// # Examples
670 ///
671 /// ```
672 /// # use intaglio::bytes::SymbolTable;
673 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
674 /// let mut table = SymbolTable::new();
675 /// let sym = table.intern(b"abc".to_vec())?;
676 /// table.intern(b"xyz".to_vec())?;
677 /// table.intern(&b"123"[..])?;
678 /// table.intern(&b"789"[..])?;
679 ///
680 /// assert_eq!(4, table.len());
681 /// assert_eq!(Some(&b"abc"[..]), table.get(sym));
682 /// # Ok(())
683 /// # }
684 /// # example().unwrap();
685 /// ```
686 pub fn intern<T>(&mut self, contents: T) -> Result<Symbol, SymbolOverflowError>
687 where
688 T: Into<Cow<'static, [u8]>>,
689 {
690 let contents = contents.into();
691 if let Some(&id) = self.map.get(&*contents) {
692 return Ok(id);
693 }
694
695 // The `Interned::Owned` variant is derived from a `Box<T>`. When such
696 // a structure is moved or assigned, as it is below in the call to
697 // `self.vec.push`, the allocation is "retagged" in Miri/stacked borrows.
698 //
699 // Retagging an allocation pops all of the borrows derived from it off
700 // of the stack. This means we need to move the `Interned` into the
701 // `Vec` before calling `Interned::as_static_slice` to ensure the
702 // reference does not get invalidated by retagging.
703 //
704 // However, that alone may be insufficient as the `Interened` may be
705 // moved when the symbol table grows.
706 //
707 // The `SymbolTable` API prevents shared references to the `Interned`
708 // being invalidated by a retag by tying resolved symbol contents,
709 // `&'a T`, to `&'a SymbolTable`, which means the `SymbolTable` cannot
710 // grow, shrink, or otherwise reallocate/move contents while a reference
711 // to the `Interned`'s inner `T` is alive.
712 //
713 // To protect against future updates to stacked borrows or the unsafe
714 // code operational semantics, we can address this additional invariant
715 // with updated `Interned` internals which store the `Box<T>` in a raw
716 // pointer form, which allows moves to be treated as untyped copies.
717 //
718 // See:
719 //
720 // - <https://github.com/artichoke/intaglio/issues/235>
721 // - <https://github.com/artichoke/intaglio/pull/236>
722 let name = Interned::from(contents);
723 let id = Symbol::try_from(self.map.len())?;
724
725 // Move the `Interned` into the `Vec`, causing it to be retagged under
726 // stacked borrows, before taking any references to its inner `T`.
727 self.vec.push(name);
728 // Ensure we grow the map before we take any shared references to the
729 // inner `T`.
730 self.map.reserve(1);
731
732 // SAFETY: `self.vec` is non-empty because the preceding line of code
733 // pushed an entry into it.
734 let name = unsafe { self.vec.last().unwrap_unchecked() };
735
736 // SAFETY: This expression creates a reference with a `'static` lifetime
737 // from an owned and interned buffer, which is permissible because:
738 //
739 // - `Interned` is an internal implementation detail of `SymbolTable`.
740 // - `SymbolTable` never gives out `'static` references to underlying
741 // contents.
742 // - All slice references given out by the `SymbolTable` have the same
743 // lifetime as the `SymbolTable`.
744 // - The `map` field of `SymbolTable`, which contains the `'static`
745 // references, is dropped before the owned buffers stored in this
746 // `Interned`.
747 // - The shared reference may be derived from a `PinBox` which prevents
748 // moves from retagging the underlying boxed `T` under stacked borrows.
749 // - The symbol table cannot grow, shrink, or otherwise move its contents
750 // while this reference is alive.
751 let slice = unsafe { name.as_static_slice() };
752
753 self.map.insert(slice, id);
754
755 debug_assert_eq!(self.get(id), Some(slice));
756 debug_assert_eq!(self.intern(slice), Ok(id));
757
758 Ok(id)
759 }
760
761 /// Returns the `Symbol` identifier for `contents` if it has been interned
762 /// before, `None` otherwise.
763 ///
764 /// This method does not modify the symbol table.
765 ///
766 /// # Examples
767 ///
768 /// ```
769 /// # use intaglio::bytes::SymbolTable;
770 /// # use intaglio::Symbol;
771 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
772 /// let mut table = SymbolTable::new();
773 /// assert!(!table.is_interned(b"abc"));
774 /// assert_eq!(None, table.check_interned(b"abc"));
775 ///
776 /// table.intern(b"abc".to_vec())?;
777 /// assert!(table.is_interned(b"abc"));
778 /// assert_eq!(Some(Symbol::new(0)), table.check_interned(b"abc"));
779 /// # Ok(())
780 /// # }
781 /// # example().unwrap();
782 /// ```
783 #[must_use]
784 pub fn check_interned(&self, contents: &[u8]) -> Option<Symbol> {
785 self.map.get(contents).copied()
786 }
787
788 /// Returns `true` if the given byte string has been interned before.
789 ///
790 /// This method does not modify the symbol table.
791 ///
792 /// # Examples
793 ///
794 /// ```
795 /// # use intaglio::bytes::SymbolTable;
796 /// # use intaglio::Symbol;
797 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
798 /// let mut table = SymbolTable::new();
799 /// assert!(!table.is_interned(b"abc"));
800 /// assert_eq!(None, table.check_interned(b"abc"));
801 ///
802 /// table.intern(b"abc".to_vec())?;
803 /// assert!(table.is_interned(b"abc"));
804 /// assert_eq!(Some(Symbol::new(0)), table.check_interned(b"abc"));
805 /// # Ok(())
806 /// # }
807 /// # example().unwrap();
808 /// ```
809 #[must_use]
810 pub fn is_interned(&self, contents: &[u8]) -> bool {
811 self.map.contains_key(contents)
812 }
813
814 /// Reserves capacity for at least additional more elements to be inserted
815 /// in the given `SymbolTable`. The collection may reserve more space to
816 /// avoid frequent reallocations. After calling reserve, capacity will be
817 /// greater than or equal to `self.len() + additional`. Does nothing if
818 /// capacity is already sufficient.
819 ///
820 /// # Panics
821 ///
822 /// Panics if the new capacity overflows `usize`.
823 ///
824 /// # Examples
825 ///
826 /// ```
827 /// # use intaglio::bytes::SymbolTable;
828 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
829 /// let mut table = SymbolTable::with_capacity(1);
830 /// table.intern(b"abc".to_vec())?;
831 /// table.reserve(10);
832 /// assert!(table.capacity() >= 11);
833 /// # Ok(())
834 /// # }
835 /// # example().unwrap();
836 /// ```
837 pub fn reserve(&mut self, additional: usize) {
838 self.map.reserve(additional);
839 self.vec.reserve(additional);
840 }
841
842 /// Shrinks the capacity of the symbol table as much as possible.
843 ///
844 /// It will drop down as close as possible to the length but the allocator
845 /// may still inform the symbol table that there is space for a few more
846 /// elements.
847 ///
848 /// # Examples
849 ///
850 /// ```
851 /// # use intaglio::bytes::SymbolTable;
852 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
853 /// let mut table = SymbolTable::with_capacity(10);
854 /// table.intern(b"abc".to_vec());
855 /// table.intern(b"xyz".to_vec());
856 /// table.intern(b"123".to_vec());
857 /// table.shrink_to_fit();
858 /// assert!(table.capacity() >= 3);
859 /// # Ok(())
860 /// # }
861 /// # example().unwrap();
862 /// ```
863 pub fn shrink_to_fit(&mut self) {
864 self.map.shrink_to_fit();
865 self.vec.shrink_to_fit();
866 }
867
868 /// Shrinks the capacity of the symbol table with a lower bound.
869 ///
870 /// The capacity will remain at least as large as both the length and the
871 /// supplied value.
872 ///
873 /// If the current capacity is less than the lower limit, this is a no-op.
874 ///
875 /// # Examples
876 ///
877 /// ```
878 /// # use intaglio::bytes::SymbolTable;
879 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
880 /// let mut table = SymbolTable::with_capacity(10);
881 /// table.intern(b"abc".to_vec())?;
882 /// table.intern(b"xyz".to_vec())?;
883 /// table.intern(b"123".to_vec())?;
884 /// table.shrink_to(5);
885 /// assert!(table.capacity() >= 5);
886 /// # Ok(())
887 /// # }
888 /// # example().unwrap();
889 /// ```
890 pub fn shrink_to(&mut self, min_capacity: usize) {
891 self.map.shrink_to(min_capacity);
892 self.vec.shrink_to(min_capacity);
893 }
894}
895
896#[cfg(test)]
897#[allow(clippy::needless_pass_by_value)]
898mod tests {
899 use quickcheck::quickcheck;
900
901 use super::SymbolTable;
902
903 #[test]
904 fn alloc_drop_new() {
905 let table = SymbolTable::new();
906 drop(table);
907 }
908
909 #[test]
910 fn alloc_drop_with_capacity() {
911 let table = SymbolTable::with_capacity(1 << 14);
912 drop(table);
913 }
914
915 #[test]
916 fn drop_with_true_static_data() {
917 let mut table = SymbolTable::new();
918 table.intern(&b"1"[..]).unwrap();
919 table.intern(&b"2"[..]).unwrap();
920 table.intern(&b"3"[..]).unwrap();
921 table.intern(&b"4"[..]).unwrap();
922 table.intern(&b"5"[..]).unwrap();
923 drop(table);
924 }
925
926 #[test]
927 fn drop_with_owned_data() {
928 let mut table = SymbolTable::new();
929 table.intern(b"1".to_vec()).unwrap();
930 table.intern(b"2".to_vec()).unwrap();
931 table.intern(b"3".to_vec()).unwrap();
932 table.intern(b"4".to_vec()).unwrap();
933 table.intern(b"5".to_vec()).unwrap();
934 drop(table);
935 }
936
937 #[test]
938 fn set_owned_value_and_get_with_owned_and_borrowed() {
939 let mut table = SymbolTable::new();
940 // intern an owned value
941 let sym = table.intern(b"abc".to_vec()).unwrap();
942 // retrieve bytes
943 assert_eq!(&b"abc"[..], table.get(sym).unwrap());
944 // intern owned value again
945 assert_eq!(sym, table.intern(b"abc".to_vec()).unwrap());
946 // intern borrowed value
947 assert_eq!(sym, table.intern(&b"abc"[..]).unwrap());
948 }
949
950 #[test]
951 fn set_borrowed_value_and_get_with_owned_and_borrowed() {
952 let mut table = SymbolTable::new();
953 // intern a borrowed value
954 let sym = table.intern(&b"abc"[..]).unwrap();
955 // retrieve bytes
956 assert_eq!(&b"abc"[..], table.get(sym).unwrap());
957 // intern owned value
958 assert_eq!(sym, table.intern(b"abc".to_vec()).unwrap());
959 // intern borrowed value again
960 assert_eq!(sym, table.intern(&b"abc"[..]).unwrap());
961 }
962
963 quickcheck! {
964 fn intern_twice_symbol_equality(bytes: Vec<u8>) -> bool {
965 let mut table = SymbolTable::new();
966 let sym = table.intern(bytes.clone()).unwrap();
967 let sym_again = table.intern(bytes).unwrap();
968 sym == sym_again
969 }
970
971 fn intern_get_roundtrip(bytes: Vec<u8>) -> bool {
972 let mut table = SymbolTable::new();
973 let sym = table.intern(bytes.clone()).unwrap();
974 let retrieved_bytes = table.get(sym).unwrap();
975 bytes == retrieved_bytes
976 }
977
978 fn table_contains_sym(bytes: Vec<u8>) -> bool {
979 let mut table = SymbolTable::new();
980 let sym = table.intern(bytes).unwrap();
981 table.contains(sym)
982 }
983
984 fn table_does_not_contain_missing_symbol_ids(sym: u32) -> bool {
985 let table = SymbolTable::new();
986 !table.contains(sym.into())
987 }
988
989 fn empty_table_does_not_report_any_interned_byte_strings(bytes: Vec<u8>) -> bool {
990 let table = SymbolTable::new();
991 !table.is_interned(bytes.as_slice())
992 }
993
994 fn table_reports_interned_byte_strings_as_interned(bytes: Vec<u8>) -> bool {
995 let mut table = SymbolTable::new();
996 table.intern(bytes.clone()).unwrap();
997 table.is_interned(bytes.as_slice())
998 }
999 }
1000}