#[cfg(target_arch = "x86")]
use std::arch::x86::{
__m128i,
_mm_and_si128,
_mm_cmpeq_epi8,
_mm_cvtsi128_si32,
_mm_loadu_si128,
_mm_sad_epu8,
_mm_set1_epi8,
_mm_setzero_si128,
_mm_shuffle_epi32,
_mm_sub_epi8,
_mm_xor_si128,
};
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::{
__m128i,
_mm_and_si128,
_mm_cmpeq_epi8,
_mm_cvtsi128_si32,
_mm_loadu_si128,
_mm_sad_epu8,
_mm_set1_epi8,
_mm_setzero_si128,
_mm_shuffle_epi32,
_mm_sub_epi8,
_mm_xor_si128,
};
/// Broadcasts the unsigned byte `a` into all 16 lanes of an SSE register.
///
/// This is the unsigned counterpart of `_mm_set1_epi8`: the byte is
/// reinterpreted bit-for-bit as an `i8` before being splatted, so the
/// resulting lanes carry exactly the bit pattern of `a`.
///
/// # Safety
///
/// The executing CPU must support SSE2.
#[target_feature(enable = "sse2")]
pub unsafe fn _mm_set1_epu8(a: u8) -> __m128i {
    // Same bits, signed view: the intrinsic only accepts `i8`.
    let signed_view = a as i8;
    _mm_set1_epi8(signed_view)
}
/// Lane-wise "not equal" comparison of two vectors of 16 bytes.
///
/// Each output byte is `0xFF` where the corresponding bytes of `a` and `b`
/// differ and `0x00` where they are equal. SSE2 only offers an equality
/// compare, so the result of that compare is inverted bitwise.
///
/// # Safety
///
/// The executing CPU must support SSE2.
#[target_feature(enable = "sse2")]
pub unsafe fn mm_cmpneq_epi8(a: __m128i, b: __m128i) -> __m128i {
    let equal_lanes = _mm_cmpeq_epi8(a, b);
    // XOR with all-ones flips every bit, turning the "equal" mask into "not equal".
    let all_ones = _mm_set1_epi8(-1);
    _mm_xor_si128(equal_lanes, all_ones)
}
/// Sliding byte mask: 16 zero bytes followed by 16 `0xff` bytes.
///
/// Reading 16 bytes starting at index `k` (for `0 <= k <= 16`) yields
/// `16 - k` zero bytes followed by `k` all-ones bytes, i.e. a mask that
/// keeps only the last `k` lanes of a vector.
const MASK: [u8; 32] = [
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
];
/// Loads 16 consecutive bytes of `slice`, starting at byte index `offset`,
/// into an SSE register.
///
/// The load is unaligned (`_mm_loadu_si128`), so `offset` does not have to
/// be a multiple of 16.
///
/// # Safety
///
/// * The executing CPU must support SSE2.
/// * `offset + 16 <= slice.len()` must hold. The read itself is not
///   bounds-checked; the condition is only verified by a `debug_assert!`
///   in builds with debug assertions enabled.
#[target_feature(enable = "sse2")]
unsafe fn mm_from_offset(slice: &[u8], offset: usize) -> __m128i {
    // Written as a subtraction so the check itself cannot overflow.
    debug_assert!(
        slice.len() >= 16 && offset <= slice.len() - 16,
        "mm_from_offset: 16-byte load at offset {} exceeds slice of length {}",
        offset,
        slice.len()
    );
    // `add` takes the `usize` index directly, avoiding the `as isize` cast
    // that `offset` would require.
    _mm_loadu_si128(slice.as_ptr().add(offset) as *const __m128i)
}
/// Adds up the 16 unsigned bytes held in `u8s` and returns the total.
///
/// `_mm_sad_epu8` against an all-zero vector produces one partial sum per
/// 64-bit half of the register (each the sum of that half's eight bytes);
/// the two partial sums are then extracted and added.
///
/// # Safety
///
/// The executing CPU must support SSE2.
#[target_feature(enable = "sse2")]
unsafe fn sum(u8s: &__m128i) -> usize {
    let partials = _mm_sad_epu8(*u8s, _mm_setzero_si128());
    // Partial sum of bytes 0..8 sits in the lowest 32-bit element.
    let lower = _mm_cvtsi128_si32(partials);
    // 0xaa selects 32-bit element 2 for every output position, which moves the
    // partial sum of bytes 8..16 down to where `_mm_cvtsi128_si32` reads it.
    let upper = _mm_cvtsi128_si32(_mm_shuffle_epi32(partials, 0xaa));
    (lower + upper) as usize
}
/// Counts how many bytes of `haystack` are equal to `needle`, 16 bytes at a time.
///
/// Matches are tallied in 16 per-lane `u8` counters: an equality compare
/// yields `0xFF` (i.e. `-1`) in matching lanes, and subtracting that from the
/// counters increments them. Because a lane can grow by at most one per
/// vector, the counters are folded into the scalar total before any lane
/// could exceed 255:
///
/// 1. as many batches of 255 vectors (4080 bytes) as fit,
/// 2. then at most one batch of 128 vectors (2048 bytes),
/// 3. then the remaining whole vectors (fewer than 128) plus, if the length
///    is not a multiple of 16, one masked load of the final 16 bytes.
///
/// # Panics
///
/// Panics if `haystack` is shorter than 16 bytes.
///
/// # Safety
///
/// The executing CPU must support SSE2.
#[target_feature(enable = "sse2")]
pub unsafe fn chunk_count(haystack: &[u8], needle: u8) -> usize {
    assert!(haystack.len() >= 16);

    let len = haystack.len();
    let splat = _mm_set1_epu8(needle);
    let mut pos = 0;
    let mut total = 0;

    // Batches of 255 vectors: the most a `u8` lane can absorb.
    while pos + 255 * 16 <= len {
        let mut lanes = _mm_setzero_si128();
        for step in 0..255 {
            let hits = _mm_cmpeq_epi8(mm_from_offset(haystack, pos + step * 16), splat);
            lanes = _mm_sub_epi8(lanes, hits);
        }
        pos += 255 * 16;
        total += sum(&lanes);
    }

    // One optional batch of 128 vectors, so that fewer than 128 remain below.
    if pos + 128 * 16 <= len {
        let mut lanes = _mm_setzero_si128();
        for step in 0..128 {
            let hits = _mm_cmpeq_epi8(mm_from_offset(haystack, pos + step * 16), splat);
            lanes = _mm_sub_epi8(lanes, hits);
        }
        pos += 128 * 16;
        total += sum(&lanes);
    }

    // Remaining whole vectors.
    let mut lanes = _mm_setzero_si128();
    let mut cursor = pos;
    while cursor + 16 <= len {
        let hits = _mm_cmpeq_epi8(mm_from_offset(haystack, cursor), splat);
        lanes = _mm_sub_epi8(lanes, hits);
        cursor += 16;
    }

    // Trailing partial vector: reload the last 16 bytes and keep only the
    // final `len % 16` lanes, since the earlier ones were already counted.
    let leftover = len % 16;
    if leftover != 0 {
        let hits = _mm_cmpeq_epi8(mm_from_offset(haystack, len - 16), splat);
        let unseen = mm_from_offset(&MASK, leftover);
        lanes = _mm_sub_epi8(lanes, _mm_and_si128(hits, unseen));
    }

    total + sum(&lanes)
}
/// Builds a lane mask marking every byte that is not a UTF-8 continuation byte.
///
/// A lane of the result is `0xFF` when the top two bits of the corresponding
/// input byte differ from `10`, and `0x00` when they equal `10` (the tag that
/// UTF-8 uses for continuation bytes).
///
/// # Safety
///
/// The executing CPU must support SSE2.
#[target_feature(enable = "sse2")]
unsafe fn is_leading_utf8_byte(u8s: __m128i) -> __m128i {
    // Isolate the two most significant bits of every byte.
    let top_two_bits = _mm_and_si128(u8s, _mm_set1_epu8(0b1100_0000));
    let continuation_tag = _mm_set1_epu8(0b1000_0000);
    mm_cmpneq_epi8(top_two_bits, continuation_tag)
}
/// Counts the bytes of `utf8_chars` that are not UTF-8 continuation bytes,
/// 16 bytes at a time.
///
/// For well-formed UTF-8 this equals the number of encoded code points, as
/// every code point contributes exactly one non-continuation byte.
///
/// Hits are tallied in 16 per-lane `u8` counters: `is_leading_utf8_byte`
/// yields `0xFF` (i.e. `-1`) in matching lanes, and subtracting that from the
/// counters increments them. Because a lane can grow by at most one per
/// vector, the counters are folded into the scalar total before any lane
/// could exceed 255:
///
/// 1. as many batches of 255 vectors (4080 bytes) as fit,
/// 2. then at most one batch of 128 vectors (2048 bytes),
/// 3. then the remaining whole vectors (fewer than 128) plus, if the length
///    is not a multiple of 16, one masked load of the final 16 bytes.
///
/// # Panics
///
/// Panics if `utf8_chars` is shorter than 16 bytes.
///
/// # Safety
///
/// The executing CPU must support SSE2.
#[target_feature(enable = "sse2")]
pub unsafe fn chunk_num_chars(utf8_chars: &[u8]) -> usize {
    assert!(utf8_chars.len() >= 16);

    let len = utf8_chars.len();
    let mut pos = 0;
    let mut total = 0;

    // Batches of 255 vectors: the most a `u8` lane can absorb.
    while pos + 255 * 16 <= len {
        let mut lanes = _mm_setzero_si128();
        for step in 0..255 {
            let leads = is_leading_utf8_byte(mm_from_offset(utf8_chars, pos + step * 16));
            lanes = _mm_sub_epi8(lanes, leads);
        }
        pos += 255 * 16;
        total += sum(&lanes);
    }

    // One optional batch of 128 vectors, so that fewer than 128 remain below.
    if pos + 128 * 16 <= len {
        let mut lanes = _mm_setzero_si128();
        for step in 0..128 {
            let leads = is_leading_utf8_byte(mm_from_offset(utf8_chars, pos + step * 16));
            lanes = _mm_sub_epi8(lanes, leads);
        }
        pos += 128 * 16;
        total += sum(&lanes);
    }

    // Remaining whole vectors.
    let mut lanes = _mm_setzero_si128();
    let mut cursor = pos;
    while cursor + 16 <= len {
        let leads = is_leading_utf8_byte(mm_from_offset(utf8_chars, cursor));
        lanes = _mm_sub_epi8(lanes, leads);
        cursor += 16;
    }

    // Trailing partial vector: reload the last 16 bytes and keep only the
    // final `len % 16` lanes, since the earlier ones were already counted.
    let leftover = len % 16;
    if leftover != 0 {
        let leads = is_leading_utf8_byte(mm_from_offset(utf8_chars, len - 16));
        let unseen = mm_from_offset(&MASK, leftover);
        lanes = _mm_sub_epi8(lanes, _mm_and_si128(leads, unseen));
    }

    total + sum(&lanes)
}