From 04faab38787fb5c262fca8d77ae998ed5ff16e7b Mon Sep 17 00:00:00 2001 From: Daniel Alley Date: Sat, 2 Apr 2022 21:14:52 -0400 Subject: [PATCH] Properly normalize attribute values closes #371 --- src/errors.rs | 1 + src/escapei.rs | 2 +- src/events/attributes.rs | 119 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 121 insertions(+), 1 deletion(-) diff --git a/src/errors.rs b/src/errors.rs index 79eb7899..369978d0 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -62,6 +62,7 @@ impl From for Error { } impl From for Error { + /// Creates a new `Error::InvalidAttr` from the given error #[inline] fn from(error: AttrError) -> Self { Error::InvalidAttr(error) diff --git a/src/escapei.rs b/src/escapei.rs index 64749c27..d331f442 100644 --- a/src/escapei.rs +++ b/src/escapei.rs @@ -134,7 +134,7 @@ pub fn unescape(raw: &[u8]) -> Result, EscapeError> { } /// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding -/// value, using a dictionnary of custom entities. +/// value, using a dictionary of custom entities. /// /// # Pre-condition /// diff --git a/src/events/attributes.rs b/src/events/attributes.rs index 1d8e2583..5ad48465 100644 --- a/src/events/attributes.rs +++ b/src/events/attributes.rs @@ -32,6 +32,12 @@ pub struct Attribute<'a> { } impl<'a> Attribute<'a> { + pub fn normalized_value(&'a self) -> Result> { + let normalized = normalize_attribute_value(&*self.value); + let escaped = do_unescape(&*normalized, None).map_err(Error::EscapeError)?; + Ok(Cow::Owned(escaped.into_owned())) + } + /// Returns the unescaped value. /// /// This is normally the value you are interested in. 
/// Normalizes the whitespace of a raw attribute value, modelled on section 3.3.3 of the XML
/// specification.
///
/// <https://www.w3.org/TR/xml/#AVNormalize>
///
/// * Whitespace-like bytes (`\r`, `\n`, `\t`, `' '`) are trimmed from both ends of the value.
/// * Every remaining run of whitespace-like bytes is replaced with a single space (`0x20`).
///
/// Character and entity references are **not** expanded by this function yet.
///
/// The result borrows from `attr` whenever trimming alone is enough; a new buffer is allocated
/// only when an interior whitespace run actually has to be rewritten.
fn normalize_attribute_value(attr: &[u8]) -> Cow<[u8]> {
    // TODO: character references, entity references, error handling associated with those

    /// Returns `true` for the bytes that normalization treats as whitespace.
    fn is_whitespace_like(byte: u8) -> bool {
        matches!(byte, b'\n' | b'\r' | b'\t' | b' ')
    }

    /// Strips whitespace-like bytes from both ends of `attr`. An empty or all-whitespace
    /// value yields an empty slice. This cannot fail.
    fn trim_value(attr: &[u8]) -> &[u8] {
        let begin = match attr.iter().position(|&b| !is_whitespace_like(b)) {
            Some(begin) => begin,
            // The entire value was whitespace-like characters (or it was empty).
            None => return &attr[..0],
        };
        // `begin` proves that a non-whitespace byte exists, so the reverse search always
        // succeeds; falling back to `begin` keeps the inclusive range in bounds regardless.
        let end = attr
            .iter()
            .rposition(|&b| !is_whitespace_like(b))
            .unwrap_or(begin);
        &attr[begin..=end]
    }

    let trimmed = trim_value(attr);

    // Allocated lazily: stays `None` for as long as `trimmed` is already in normalized form.
    let mut normalized: Option<Vec<u8>> = None;
    // Whether the previous byte was whitespace-like. Starts as `false` because trimming
    // guarantees that the first byte (if any) is not whitespace.
    let mut in_space_run = false;

    // Single pass over the trimmed value. Once a rewrite becomes necessary, everything seen
    // so far is copied into a fresh buffer and all further output goes there.
    for (idx, &byte) in trimmed.iter().enumerate() {
        if !is_whitespace_like(byte) {
            in_space_run = false;
            if let Some(buf) = normalized.as_mut() {
                buf.push(byte);
            }
            continue;
        }

        // A lone `' '` is already normalized. Any other whitespace byte, or a second
        // consecutive whitespace byte, forces the switch to an owned buffer.
        if normalized.is_none() && (in_space_run || byte != b' ') {
            let mut buf = Vec::with_capacity(trimmed.len());
            buf.extend_from_slice(&trimmed[..idx]);
            normalized = Some(buf);
        }
        // Only the first byte of a run emits the replacement space.
        if !in_space_run {
            if let Some(buf) = normalized.as_mut() {
                buf.push(b' ');
            }
        }
        in_space_run = true;
    }

    match normalized {
        Some(buf) => Cow::Owned(buf),
        None => Cow::Borrowed(trimmed),
    }
}
/// Checks the whitespace handling of `normalize_attribute_value`: trimming at both ends and
/// collapsing of interior whitespace runs into a single space.
#[test]
fn attribute_value_normalization() {
    // empty value
    assert_eq!(normalize_attribute_value(b"").as_ref(), b"");
    // a value consisting only of whitespace-like characters normalizes to the empty value
    assert_eq!(normalize_attribute_value(b" \t\r\n").as_ref(), b"");
    // a single non-whitespace byte is returned untouched
    assert_eq!(normalize_attribute_value(b"x").as_ref(), b"x");
    // return, tab, and newline characters (0xD, 0x9, 0xA) must be replaced with a space character
    assert_eq!(
        normalize_attribute_value(b"\rfoo\rbar\tbaz\ndelta\n").as_ref(),
        b"foo bar baz delta"
    );
    // leading and trailing spaces must be stripped
    assert_eq!(normalize_attribute_value(b" foo ").as_ref(), b"foo");
    // leading space
    assert_eq!(normalize_attribute_value(b" bar").as_ref(), b"bar");
    // trailing space
    assert_eq!(normalize_attribute_value(b"baz ").as_ref(), b"baz");
    // single interior spaces are preserved as they are
    assert_eq!(
        normalize_attribute_value(b" foo bar baz ").as_ref(),
        b"foo bar baz"
    );
    // sequences of spaces must be replaced with a single space
    assert_eq!(
        normalize_attribute_value(b"   foo   bar   baz   ").as_ref(),
        b"foo bar baz"
    );
    // sequence replacement mixed with characters treated as whitespace (\t \r \n)
    assert_eq!(
        normalize_attribute_value(b" \tfoo\tbar \rbaz \n\ndelta\n\t\r echo foxtrot\r").as_ref(),
        b"foo bar baz delta echo foxtrot"
    );
}