Skip to content

Commit

Permalink
Properly normalize attribute values
Browse files Browse the repository at this point in the history
closes tafia#371
  • Loading branch information
dralley committed Jun 19, 2022
1 parent e701c4d commit da140b9
Show file tree
Hide file tree
Showing 3 changed files with 220 additions and 1 deletion.
1 change: 1 addition & 0 deletions src/errors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ impl From<EscapeError> for Error {
}

impl From<AttrError> for Error {
/// Creates a new `Error::InvalidAttr` from the given error
#[inline]
fn from(error: AttrError) -> Self {
Error::InvalidAttr(error)
Expand Down
2 changes: 1 addition & 1 deletion src/escapei.rs
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ pub fn unescape(raw: &[u8]) -> Result<Cow<[u8]>, EscapeError> {
}

/// Unescapes a `&[u8]`, replacing all XML escaped characters ('&...;') with their corresponding
/// value, using a dictionary of custom entities.
///
/// # Pre-condition
///
Expand Down
218 changes: 218 additions & 0 deletions src/events/attributes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,197 @@ impl<'a> From<Attr<&'a [u8]>> for Attribute<'a> {
}
}

/// Normalize the attribute value according to xml specification section 3.3.3
///
/// <https://www.w3.org/TR/xml/#AVNormalize>
///
/// * Whitespace-like characters (\r, \n, \t, ' ') are trimmed from the ends of the value
/// * Sequences of whitespace-like characters are replaced with a single space character
/// * Character and entity references are NOT substituted yet (see the TODO below)
///
/// The result borrows from `attr` whenever the trimmed value is already in normalized
/// form (for example `foo bar`); a new buffer is allocated only when at least one byte
/// has to be rewritten or dropped.
fn normalize_attribute_value(attr: &[u8]) -> Cow<[u8]> {
    // TODO: character references, entity references, error handling associated with those

    fn is_whitespace_like(byte: u8) -> bool {
        matches!(byte, b'\n' | b'\r' | b'\t' | b' ')
    }

    // Trim all whitespace-like characters away from the beginning and end of the value.
    let begin = match attr.iter().position(|&b| !is_whitespace_like(b)) {
        Some(idx) => idx,
        // The entire value was whitespace-like characters (or it was empty)
        None => return Cow::Borrowed(&b""[..]),
    };
    // A non-whitespace byte exists at `begin`, so `rposition` always finds one; falling
    // back to `begin` keeps the slice below in bounds regardless.
    let end = attr
        .iter()
        .rposition(|&b| !is_whitespace_like(b))
        .unwrap_or(begin);
    let trimmed = &attr[begin..=end];

    // Created lazily, the first time the output has to differ from `trimmed`.
    let mut normalized: Option<Vec<u8>> = None;
    // `true` while the previous byte was whitespace-like. Starts `false` because the
    // leading whitespace has already been trimmed away.
    let mut in_whitespace_run = false;

    // Single pass over the trimmed value. Until a buffer exists, everything seen so far
    // is known to be identical to its normalized form.
    for (idx, &byte) in trimmed.iter().enumerate() {
        let is_space = is_whitespace_like(byte);
        match normalized.as_mut() {
            Some(buf) => {
                if !is_space {
                    buf.push(byte);
                } else if !in_whitespace_run {
                    // First byte of a run becomes a single space, the rest are dropped.
                    buf.push(b' ');
                }
            }
            None => {
                // A lone b' ' is already normalized. Anything else whitespace-like
                // (a second byte of a run, or \r / \n / \t) forces a copy.
                if is_space && (in_whitespace_run || byte != b' ') {
                    let mut buf = Vec::with_capacity(trimmed.len());
                    buf.extend_from_slice(&trimmed[..idx]);
                    if !in_whitespace_run {
                        buf.push(b' ');
                    }
                    normalized = Some(buf);
                }
            }
        }
        in_whitespace_run = is_space;
    }

    match normalized {
        Some(buf) => Cow::Owned(buf),
        None => Cow::Borrowed(trimmed),
    }
}

impl<'a> Iterator for Attributes<'a> {
type Item = Result<Attribute<'a>>;
fn next(&mut self) -> Option<Self::Item> {
let len = self.bytes.len();

macro_rules! err {
($err:expr) => {{
self.position = len;
return Some(Err($err.into()));
}};
}

macro_rules! attr {
($key:expr) => {{
self.position = len;
if self.html {
attr!($key, 0..0)
} else {
None
}
}};
($key:expr, $val:expr) => {
Some(Ok(Attribute {
key: &self.bytes[$key],
value: normalize_attribute_value(&self.bytes[$val]),
}))
};
}

if len <= self.position {
return None;
}

let mut bytes = self.bytes.iter().enumerate().skip(self.position);

// key starts after the whitespace
let start_key = match bytes
.by_ref()
.skip_while(|&(_, &b)| !is_whitespace(b))
.find(|&(_, &b)| !is_whitespace(b))
{
Some((i, _)) => i,
None => return attr!(self.position..len),
};

// key ends with either whitespace or =
let end_key = match bytes
.by_ref()
.find(|&(_, &b)| b == b'=' || is_whitespace(b))
{
Some((i, &b'=')) => i,
Some((i, &b'\'')) | Some((i, &b'"')) if self.with_checks => {
err!(Error::NameWithQuote(i));
}
Some((i, _)) => {
// consume until `=` or return if html
match bytes.by_ref().find(|&(_, &b)| !is_whitespace(b)) {
Some((_, &b'=')) => i,
Some((j, _)) if self.html => {
self.position = j - 1;
return attr!(start_key..i, 0..0);
}
Some((j, _)) => err!(Error::NoEqAfterName(j)),
None if self.html => {
self.position = len;
return attr!(start_key..len, 0..0);
}
None => err!(Error::NoEqAfterName(len)),
}
}
None => return attr!(start_key..len),
};

if self.with_checks {
if let Some(start) = self
.consumed
.iter()
.filter(|r| r.len() == end_key - start_key)
.find(|r| self.bytes[(*r).clone()] == self.bytes[start_key..end_key])
.map(|ref r| r.start)
{
err!(Error::DuplicatedAttribute(start_key, start));
}
self.consumed.push(start_key..end_key);
}

// value has quote if not html
match bytes.by_ref().find(|&(_, &b)| !is_whitespace(b)) {
Some((i, quote @ &b'\'')) | Some((i, quote @ &b'"')) => {
match bytes.by_ref().find(|&(_, &b)| b == *quote) {
Some((j, _)) => {
self.position = j + 1;
return attr!(start_key..end_key, i + 1..j);
}
None => err!(Error::UnquotedValue(i)),
}
}
Some((i, _)) if self.html => {
let j = bytes
.by_ref()
.find(|&(_, &b)| is_whitespace(b))
.map_or(len, |(j, _)| j);
self.position = j;
return attr!(start_key..end_key, i..j);
Some((i, _)) => err!(Error::UnquotedValue(i)),
None => return attr!(start_key..end_key),
}
}
}

////////////////////////////////////////////////////////////////////////////////////////////////////

/// Iterator over XML attributes.
Expand Down Expand Up @@ -2353,4 +2544,31 @@ mod html {
assert_eq!(iter.next(), None);
assert_eq!(iter.next(), None);
}

/// Checks trimming, whitespace substitution and whitespace-run collapsing performed by
/// `normalize_attribute_value`.
#[test]
fn attribute_value_normalization() {
    // empty value
    assert_eq!(normalize_attribute_value(b"").as_ref(), b"");
    // a value consisting only of whitespace-like characters normalizes to empty
    assert_eq!(normalize_attribute_value(b" \t\r\n ").as_ref(), b"");
    // return, tab, and newline characters (0xD, 0x9, 0xA) must be replaced with a space character
    assert_eq!(
        normalize_attribute_value(b"\rfoo\rbar\tbaz\ndelta\n").as_ref(),
        b"foo bar baz delta"
    );
    // leading and trailing spaces must be stripped
    assert_eq!(normalize_attribute_value(b"   foo   ").as_ref(), b"foo");
    // leading space
    assert_eq!(normalize_attribute_value(b" bar").as_ref(), b"bar");
    // trailing space
    assert_eq!(normalize_attribute_value(b"baz ").as_ref(), b"baz");
    // a value that is already normalized must come back unchanged
    assert_eq!(normalize_attribute_value(b"foo bar").as_ref(), b"foo bar");
    // sequences of spaces must be replaced with a single space
    assert_eq!(
        normalize_attribute_value(b"  foo   bar    baz  ").as_ref(),
        b"foo bar baz"
    );
    // sequence replacement mixed with characters treated as whitespace (\t \r \n)
    assert_eq!(
        normalize_attribute_value(b" \tfoo\tbar \rbaz \n\ndelta\n\t\r echo foxtrot\r").as_ref(),
        b"foo bar baz delta echo foxtrot"
    );
}
}

0 comments on commit da140b9

Please sign in to comment.