Skip to content

Commit

Permalink
Use memchr to search for characters to escape
Browse files Browse the repository at this point in the history
  • Loading branch information
Dr-Emann committed Oct 12, 2023
1 parent ca1c09a commit 041113c
Show file tree
Hide file tree
Showing 3 changed files with 218 additions and 115 deletions.
39 changes: 23 additions & 16 deletions src/escapei.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
//! Manage xml character escapes
use memchr::memchr2_iter;
use memchr::{memchr2_iter, memchr3_iter};
use std::borrow::Cow;
use std::ops::Range;

use crate::utils::MergeIter;
#[cfg(test)]
use pretty_assertions::assert_eq;

Expand Down Expand Up @@ -72,7 +73,14 @@ impl std::error::Error for EscapeError {}
/// | `'` | `&apos;`
/// | `"` | `&quot;`
pub fn escape(raw: &str) -> Cow<str> {
_escape(raw, |ch| matches!(ch, b'<' | b'>' | b'&' | b'\'' | b'\"'))
let bytes = raw.as_bytes();
_escape(
raw,
MergeIter::new(
memchr3_iter(b'<', b'>', b'&', bytes),
memchr2_iter(b'\'', b'"', bytes),
),
)
}

/// Escapes an `&str` and replaces xml special characters (`<`, `>`, `&`)
Expand All @@ -89,24 +97,23 @@ pub fn escape(raw: &str) -> Cow<str> {
/// | `>` | `&gt;`
/// | `&` | `&amp;`
pub fn partial_escape(raw: &str) -> Cow<str> {
_escape(raw, |ch| matches!(ch, b'<' | b'>' | b'&'))
_escape(raw, memchr3_iter(b'<', b'>', b'&', raw.as_bytes()))
}

/// Escapes an `&str` and replaces a subset of xml special characters (`<`, `>`,
/// `&`, `'`, `"`) with their corresponding xml escaped value.
pub(crate) fn _escape<F: Fn(u8) -> bool>(raw: &str, escape_chars: F) -> Cow<str> {
pub(crate) fn _escape<It>(raw: &str, escapes: It) -> Cow<str>
where
It: Iterator<Item = usize>,
{
let bytes = raw.as_bytes();
let mut escaped = None;
let mut iter = bytes.iter();
let mut pos = 0;
while let Some(i) = iter.position(|&b| escape_chars(b)) {
if escaped.is_none() {
escaped = Some(Vec::with_capacity(raw.len()));
}
let escaped = escaped.as_mut().expect("initialized");
let new_pos = pos + i;
escaped.extend_from_slice(&bytes[pos..new_pos]);
match bytes[new_pos] {
let mut last_pos = 0;
for i in escapes {
let escaped = escaped.get_or_insert_with(|| Vec::with_capacity(raw.len()));
let byte = bytes[i];
escaped.extend_from_slice(&bytes[last_pos..i]);
match byte {
b'<' => escaped.extend_from_slice(b"&lt;"),
b'>' => escaped.extend_from_slice(b"&gt;"),
b'\'' => escaped.extend_from_slice(b"&apos;"),
Expand All @@ -124,11 +131,11 @@ pub(crate) fn _escape<F: Fn(u8) -> bool>(raw: &str, escape_chars: F) -> Cow<str>
"Only '<', '>','\', '&', '\"', '\\t', '\\r', '\\n', and ' ' are escaped"
),
}
pos = new_pos + 1;
last_pos = i + 1;
}

if let Some(mut escaped) = escaped {
if let Some(raw) = bytes.get(pos..) {
if let Some(raw) = bytes.get(last_pos..) {
escaped.extend_from_slice(raw);
}
// SAFETY: we operate on UTF-8 input and search for one-byte chars only,
Expand Down
237 changes: 138 additions & 99 deletions src/se/simple_type.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
use crate::errors::serialize::DeError;
use crate::escapei::_escape;
use crate::se::{Indent, QuoteLevel};
use crate::utils::MergeIter;
use memchr::{memchr2_iter, memchr3_iter, memchr_iter};
use serde::ser::{
Impossible, Serialize, SerializeSeq, SerializeTuple, SerializeTupleStruct, Serializer,
};
Expand All @@ -29,67 +31,96 @@ fn escape_item(value: &str, target: QuoteTarget, level: QuoteLevel) -> Cow<str>
use QuoteLevel::*;
use QuoteTarget::*;

let bytes = value.as_bytes();

match (target, level) {
(_, Full) => _escape(value, |ch| match ch {
// Spaces used as delimiters of list items, cannot be used in the item
b' ' | b'\r' | b'\n' | b'\t' => true,
// Required characters to escape
b'&' | b'<' | b'>' | b'\'' | b'\"' => true,
_ => false,
}),
(_, Full) => _escape(
value,
// ' ', '\r', '\n', '\t': Spaces used as delimiters of list items, cannot be used in the item
// '&', '<', '>', '\'', '"': Required characters to escape
MergeIter::new(
MergeIter::new(
memchr3_iter(b' ', b'\r', b'\n', bytes),
memchr3_iter(b'\t', b'&', b'<', bytes),
),
memchr3_iter(b'>', b'\'', b'"', bytes),
),
),
//----------------------------------------------------------------------
(Text, Partial) => _escape(value, |ch| match ch {
// Spaces used as delimiters of list items, cannot be used in the item
b' ' | b'\r' | b'\n' | b'\t' => true,
// Required characters to escape
b'&' | b'<' | b'>' => true,
_ => false,
}),
(Text, Minimal) => _escape(value, |ch| match ch {
// Spaces used as delimiters of list items, cannot be used in the item
b' ' | b'\r' | b'\n' | b'\t' => true,
// Required characters to escape
b'&' | b'<' => true,
_ => false,
}),
(Text, Partial) => _escape(
value,
// ' ', '\r', '\n', '\t': Spaces used as delimiters of list items, cannot be used in the item
// '&', '<', '>': Required characters to escape
MergeIter::new(
MergeIter::new(
memchr3_iter(b' ', b'\r', b'\n', bytes),
memchr3_iter(b'\t', b'&', b'<', bytes),
),
memchr_iter(b'>', bytes),
),
),
(Text, Minimal) => _escape(
value,
// ' ', '\r', '\n', '\t': Spaces used as delimiters of list items, cannot be used in the item
// '&', '<': Required characters to escape
MergeIter::new(
memchr3_iter(b' ', b'\r', b'\n', bytes),
memchr3_iter(b'\t', b'&', b'<', bytes),
),
),
//----------------------------------------------------------------------
(DoubleQAttr, Partial) => _escape(value, |ch| match ch {
// Spaces used as delimiters of list items, cannot be used in the item
b' ' | b'\r' | b'\n' | b'\t' => true,
// Required characters to escape
b'&' | b'<' | b'>' => true,
// Double quoted attribute should escape quote
b'"' => true,
_ => false,
}),
(DoubleQAttr, Minimal) => _escape(value, |ch| match ch {
// Spaces used as delimiters of list items, cannot be used in the item
b' ' | b'\r' | b'\n' | b'\t' => true,
// Required characters to escape
b'&' | b'<' => true,
// Double quoted attribute should escape quote
b'"' => true,
_ => false,
}),
(DoubleQAttr, Partial) => _escape(
value,
// ' ', '\r', '\n', '\t': Spaces used as delimiters of list items, cannot be used in the item
// '&', '<', '>': Required characters to escape
// '"': Double quoted attribute should escape quote
MergeIter::new(
MergeIter::new(
memchr3_iter(b' ', b'\r', b'\n', bytes),
memchr3_iter(b'\t', b'&', b'<', bytes),
),
memchr2_iter(b'>', b'"', bytes),
),
),
(DoubleQAttr, Minimal) => _escape(
value,
// ' ', '\r', '\n', '\t': Spaces used as delimiters of list items, cannot be used in the item
// '&', '<': Required characters to escape
// '"': Double quoted attribute should escape quote
MergeIter::new(
MergeIter::new(
memchr3_iter(b' ', b'\r', b'\n', bytes),
memchr3_iter(b'\t', b'&', b'<', bytes),
),
memchr_iter(b'"', bytes),
),
),
//----------------------------------------------------------------------
(SingleQAttr, Partial) => _escape(value, |ch| match ch {
// Spaces used as delimiters of list items
b' ' | b'\r' | b'\n' | b'\t' => true,
// Required characters to escape
b'&' | b'<' | b'>' => true,
// Single quoted attribute should escape quote
b'\'' => true,
_ => false,
}),
(SingleQAttr, Minimal) => _escape(value, |ch| match ch {
// Spaces used as delimiters of list items
b' ' | b'\r' | b'\n' | b'\t' => true,
// Required characters to escape
b'&' | b'<' => true,
// Single quoted attribute should escape quote
b'\'' => true,
_ => false,
}),
(SingleQAttr, Partial) => _escape(
value,
// ' ', '\r', '\n', '\t': Spaces used as delimiters of list items, cannot be used in the item
// '&', '<', '>': Required characters to escape
// '\'': Single quoted attribute should escape quote
MergeIter::new(
MergeIter::new(
memchr3_iter(b' ', b'\r', b'\n', bytes),
memchr3_iter(b'\t', b'&', b'<', bytes),
),
memchr2_iter(b'>', b'\'', bytes),
),
),
(SingleQAttr, Minimal) => _escape(
value,
// ' ', '\r', '\n', '\t': Spaces used as delimiters of list items, cannot be used in the item
// '&', '<': Required characters to escape
// '\'': Single quoted attribute should escape quote
MergeIter::new(
MergeIter::new(
memchr3_iter(b' ', b'\r', b'\n', bytes),
memchr3_iter(b'\t', b'&', b'<', bytes),
),
memchr_iter(b'\'', bytes),
),
),
}
}

Expand All @@ -98,53 +129,61 @@ fn escape_list(value: &str, target: QuoteTarget, level: QuoteLevel) -> Cow<str>
use QuoteLevel::*;
use QuoteTarget::*;

let bytes = value.as_bytes();

match (target, level) {
(_, Full) => _escape(value, |ch| match ch {
// Required characters to escape
b'&' | b'<' | b'>' | b'\'' | b'\"' => true,
_ => false,
}),
(_, Full) => _escape(
value,
// '&', '<', '>', '\'', '"': Required characters to escape
MergeIter::new(
memchr3_iter(b'&', b'<', b'>', bytes),
memchr2_iter(b'\'', b'"', bytes),
),
),
//----------------------------------------------------------------------
(Text, Partial) => _escape(value, |ch| match ch {
// Required characters to escape
b'&' | b'<' | b'>' => true,
_ => false,
}),
(Text, Minimal) => _escape(value, |ch| match ch {
// Required characters to escape
b'&' | b'<' => true,
_ => false,
}),
(Text, Partial) => _escape(
value,
// '&', '<', '>': Required characters to escape
memchr3_iter(b'&', b'<', b'>', bytes),
),
(Text, Minimal) => _escape(
value,
// '&', '<': Required characters to escape
memchr2_iter(b'&', b'<', bytes),
),
//----------------------------------------------------------------------
(DoubleQAttr, Partial) => _escape(value, |ch| match ch {
(DoubleQAttr, Partial) => _escape(
value,
// '&', '<', '>': Required characters to escape
// '"': Double quoted attribute should escape quote
MergeIter::new(
memchr3_iter(b'&', b'<', b'>', bytes),
memchr_iter(b'"', bytes),
),
),
(DoubleQAttr, Minimal) => _escape(
value,
// '&', '<': Required characters to escape
// '"': Double quoted attribute should escape quote
// Required characters to escape
b'&' | b'<' | b'>' => true,
// Double quoted attribute should escape quote
b'"' => true,
_ => false,
}),
(DoubleQAttr, Minimal) => _escape(value, |ch| match ch {
// Required characters to escape
b'&' | b'<' => true,
// Double quoted attribute should escape quote
b'"' => true,
_ => false,
}),
memchr3_iter(b'&', b'<', b'"', bytes),
),
//----------------------------------------------------------------------
(SingleQAttr, Partial) => _escape(value, |ch| match ch {
// Required characters to escape
b'&' | b'<' | b'>' => true,
// Single quoted attribute should escape quote
b'\'' => true,
_ => false,
}),
(SingleQAttr, Minimal) => _escape(value, |ch| match ch {
// Required characters to escape
b'&' | b'<' => true,
// Single quoted attribute should escape quote
b'\'' => true,
_ => false,
}),
(SingleQAttr, Partial) => _escape(
value,
// '&', '<', '>': Required characters to escape
// '\'': Single quoted attribute should escape quote
MergeIter::new(
memchr3_iter(b'&', b'<', b'>', bytes),
memchr_iter(b'\'', bytes),
),
),
(SingleQAttr, Minimal) => _escape(
value,
// '&', '<': Required characters to escape
// '\'': Single quoted attribute should escape quote
memchr3_iter(b'&', b'<', b'\'', bytes),
),
}
}

Expand Down
Loading

0 comments on commit 041113c

Please sign in to comment.