Skip to content

Commit

Permalink
Faster escape routines
Browse files Browse the repository at this point in the history
  • Loading branch information
dralley committed Jun 29, 2022
1 parent 0febc2b commit af30446
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 20 deletions.
2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ document-features = { version = "0.2", optional = true }
encoding_rs = { version = "0.8", optional = true }
serde = { version = "1.0", optional = true }
memchr = "2.5"
jetscii = "0.5.2"
once_cell = "1.12.0"

[dev-dependencies]
criterion = "0.3"
Expand Down
72 changes: 52 additions & 20 deletions src/escapei.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,20 @@
//! Manage xml character escapes
use memchr;
use std::borrow::Cow;
use std::collections::HashMap;
use std::ops::Range;

use jetscii::bytes;
use memchr;
use once_cell::sync::Lazy;

#[cfg(test)]
use pretty_assertions::assert_eq;

/// Byte matcher for the full escape set (`<`, `>`, `&`, `'`, `"`); `escape` passes it to
/// `simd_escape`. Wrapped in `Lazy` so the matcher is constructed on first access.
static XML_ESCAPE_BYTES: Lazy<jetscii::BytesConst> =
Lazy::new(|| bytes!(b'<', b'>', b'&', b'\'', b'"'));
/// Byte matcher for the reduced escape set (`<`, `>`, `&`) that leaves quote characters
/// alone; `partial_escape` passes it to `simd_escape`.
static XML_PARTIAL_ESCAPE_BYTES: Lazy<jetscii::BytesConst> = Lazy::new(|| bytes!(b'<', b'>', b'&'));

/// Error for XML escape/unescape.
#[derive(Debug)]
pub enum EscapeError {
Expand Down Expand Up @@ -66,31 +73,17 @@ impl std::error::Error for EscapeError {}
/// Escapes a `&[u8]` and replaces all xml special characters (<, >, &, ', ") with their
/// corresponding xml escaped value.
pub fn escape(raw: &[u8]) -> Cow<[u8]> {
#[inline]
fn to_escape(b: u8) -> bool {
match b {
b'<' | b'>' | b'\'' | b'&' | b'"' => true,
_ => false,
}
}

_escape(raw, to_escape)
// _escape(raw, |ch| matches!(ch, b'<' | b'>' | b'\'' | b'&' | b'"'))
simd_escape(raw, &XML_ESCAPE_BYTES)
}

/// Should only be used for escaping text content. In xml text content, it is allowed
/// (though not recommended) to leave the quote special characters " and ' unescaped.
/// This function escapes a `&[u8]` and replaces xml special characters (<, >, &) with
/// their corresponding xml escaped value, but does not escape quote characters.
pub fn partial_escape(raw: &[u8]) -> Cow<[u8]> {
#[inline]
fn to_escape(b: u8) -> bool {
match b {
b'<' | b'>' | b'&' => true,
_ => false,
}
}

_escape(raw, to_escape)
// _escape(raw, |ch| matches!(ch, b'<' | b'>' | b'&'))
simd_escape(raw, &XML_PARTIAL_ESCAPE_BYTES)
}

/// Escapes a `&[u8]` and replaces a subset of xml special characters (<, >, &, ', ") with their
Expand All @@ -112,7 +105,46 @@ fn _escape<F: Fn(u8) -> bool>(raw: &[u8], escape_chars: F) -> Cow<[u8]> {
b'\'' => escaped.extend_from_slice(b"&apos;"),
b'&' => escaped.extend_from_slice(b"&amp;"),
b'"' => escaped.extend_from_slice(b"&quot;"),
_ => unreachable!("Only '<', '>','\', '&' and '\"' are escaped"),
c @ _ => unreachable!(
"Found {} but only '<', '>', ', '&' and '\"' are escaped",
c as char
),
}
pos = new_pos + 1;
}

if let Some(mut escaped) = escaped {
if let Some(raw) = raw.get(pos..) {
escaped.extend_from_slice(raw);
}
Cow::Owned(escaped)
} else {
Cow::Borrowed(raw)
}
}

/// Escapes a `&[u8]` and replaces every byte found by `escape_matcher` (a subset of the xml
/// special characters <, >, &, ', ") with its corresponding xml escaped value.
pub fn simd_escape<'a>(raw: &'a [u8], escape_matcher: &jetscii::BytesConst) -> Cow<'a, [u8]> {
let mut escaped = None;
let mut pos = 0;
while let Some(i) = escape_matcher.find(&raw[pos..]) {
if escaped.is_none() {
escaped = Some(Vec::with_capacity(raw.len()));
}
let escaped = escaped.as_mut().expect("initialized");
let new_pos = pos + i;
escaped.extend_from_slice(&raw[pos..new_pos]);
match raw[new_pos] {
b'<' => escaped.extend_from_slice(b"&lt;"),
b'>' => escaped.extend_from_slice(b"&gt;"),
b'\'' => escaped.extend_from_slice(b"&apos;"),
b'&' => escaped.extend_from_slice(b"&amp;"),
b'"' => escaped.extend_from_slice(b"&quot;"),
c @ _ => unreachable!(
"Found {} but only '<', '>', ', '&' and '\"' are escaped",
c as char
),
}
pos = new_pos + 1;
}
Expand Down

0 comments on commit af30446

Please sign in to comment.