use crate::mem;
use super::Utf8Error;
#[inline]
fn utf8_first_byte(byte: u8, width: u32) -> u32 {
(byte & (0x7F >> width)) as u32
}
#[inline]
fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
(ch << 6) | (byte & CONT_MASK) as u32
}
#[inline]
pub(super) fn utf8_is_cont_byte(byte: u8) -> bool {
(byte & !CONT_MASK) == TAG_CONT_U8
}
#[inline]
fn unwrap_or_0(opt: Option<&u8>) -> u8 {
match opt {
Some(&byte) => byte,
None => 0,
}
}
#[unstable(feature = "str_internals", issue = "none")]
#[inline]
pub fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> {
let x = *bytes.next()?;
if x < 128 {
return Some(x as u32);
}
let init = utf8_first_byte(x, 2);
let y = unwrap_or_0(bytes.next());
let mut ch = utf8_acc_cont_byte(init, y);
if x >= 0xE0 {
let z = unwrap_or_0(bytes.next());
let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
ch = init << 12 | y_z;
if x >= 0xF0 {
let w = unwrap_or_0(bytes.next());
ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
}
}
Some(ch)
}
#[inline]
pub(super) fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option<u32>
where
I: DoubleEndedIterator<Item = &'a u8>,
{
let w = match *bytes.next_back()? {
next_byte if next_byte < 128 => return Some(next_byte as u32),
back_byte => back_byte,
};
let mut ch;
let z = unwrap_or_0(bytes.next_back());
ch = utf8_first_byte(z, 2);
if utf8_is_cont_byte(z) {
let y = unwrap_or_0(bytes.next_back());
ch = utf8_first_byte(y, 3);
if utf8_is_cont_byte(y) {
let x = unwrap_or_0(bytes.next_back());
ch = utf8_first_byte(x, 4);
ch = utf8_acc_cont_byte(ch, y);
}
ch = utf8_acc_cont_byte(ch, z);
}
ch = utf8_acc_cont_byte(ch, w);
Some(ch)
}
const NONASCII_MASK: usize = 0x80808080_80808080u64 as usize;
#[inline]
fn contains_nonascii(x: usize) -> bool {
(x & NONASCII_MASK) != 0
}
#[inline(always)]
pub(super) fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
let mut index = 0;
let len = v.len();
let usize_bytes = mem::size_of::<usize>();
let ascii_block_size = 2 * usize_bytes;
let blocks_end = if len >= ascii_block_size { len - ascii_block_size + 1 } else { 0 };
let align = v.as_ptr().align_offset(usize_bytes);
while index < len {
let old_offset = index;
macro_rules! err {
($error_len: expr) => {
return Err(Utf8Error { valid_up_to: old_offset, error_len: $error_len })
};
}
macro_rules! next {
() => {{
index += 1;
if index >= len {
err!(None)
}
v[index]
}};
}
let first = v[index];
if first >= 128 {
let w = UTF8_CHAR_WIDTH[first as usize];
match w {
2 => {
if next!() & !CONT_MASK != TAG_CONT_U8 {
err!(Some(1))
}
}
3 => {
match (first, next!()) {
(0xE0, 0xA0..=0xBF)
| (0xE1..=0xEC, 0x80..=0xBF)
| (0xED, 0x80..=0x9F)
| (0xEE..=0xEF, 0x80..=0xBF) => {}
_ => err!(Some(1)),
}
if next!() & !CONT_MASK != TAG_CONT_U8 {
err!(Some(2))
}
}
4 => {
match (first, next!()) {
(0xF0, 0x90..=0xBF) | (0xF1..=0xF3, 0x80..=0xBF) | (0xF4, 0x80..=0x8F) => {}
_ => err!(Some(1)),
}
if next!() & !CONT_MASK != TAG_CONT_U8 {
err!(Some(2))
}
if next!() & !CONT_MASK != TAG_CONT_U8 {
err!(Some(3))
}
}
_ => err!(Some(1)),
}
index += 1;
} else {
if align != usize::MAX && align.wrapping_sub(index) % usize_bytes == 0 {
let ptr = v.as_ptr();
while index < blocks_end {
unsafe {
let block = ptr.add(index) as *const usize;
let zu = contains_nonascii(*block);
let zv = contains_nonascii(*block.offset(1));
if zu | zv {
break;
}
}
index += ascii_block_size;
}
while index < len && v[index] < 128 {
index += 1;
}
} else {
index += 1;
}
}
}
Ok(())
}
static UTF8_CHAR_WIDTH: [u8; 256] = [
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0,
0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
];
#[unstable(feature = "str_internals", issue = "none")]
#[inline]
pub fn utf8_char_width(b: u8) -> usize {
UTF8_CHAR_WIDTH[b as usize] as usize
}
const CONT_MASK: u8 = 0b0011_1111;
const TAG_CONT_U8: u8 = 0b1000_0000;
pub(super) fn truncate_to_char_boundary(s: &str, mut max: usize) -> (bool, &str) {
if max >= s.len() {
(false, s)
} else {
while !s.is_char_boundary(max) {
max -= 1;
}
(true, &s[..max])
}
}