Properly normalize attribute values

closes tafia#371
dralley · Jun 19, 2022 · da140b9 · da140b9
1 parent e701c4d
commit da140b9
Show file tree

Hide file tree

Showing 3 changed files with 220 additions and 1 deletion.
diff --git a/src/errors.rs b/src/errors.rs
@@ -62,6 +62,7 @@ impl From<EscapeError> for Error {
 }
 
 impl From<AttrError> for Error {
+    /// Creates a new `Error::InvalidAttr` from the given error
     #[inline]
     fn from(error: AttrError) -> Self {
         Error::InvalidAttr(error)

diff --git a/src/escapei.rs b/src/escapei.rs
@@ -134,7 +134,7 @@ pub fn unescape(raw: &[u8]) -> Result<Cow<[u8]>, EscapeError> {
 }
 
 /// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding
-/// value, using a dictionnary of custom entities.
+/// value, using a dictionary of custom entities.
 ///
 /// # Pre-condition
 ///

diff --git a/src/events/attributes.rs b/src/events/attributes.rs
@@ -289,6 +289,197 @@ impl<'a> From<Attr<&'a [u8]>> for Attribute<'a> {
     }
 }
 
+/// Normalizes an attribute value, loosely following section 3.3.3 of the XML specification.
+///
+/// <https://www.w3.org/TR/xml/#AVNormalize>
+///
+/// * Whitespace-like bytes (`\r`, `\n`, `\t`, `' '`) are trimmed from both ends of the value.
+/// * Every remaining run of whitespace-like bytes is replaced with a single space (`0x20`).
+/// * Character and entity references are **not** substituted yet (see the `TODO` below).
+///
+/// Note that the specification only trims and collapses whitespace for attributes whose
+/// declared type is not `CDATA`; this function does so unconditionally.
+///
+/// An empty or whitespace-only value yields an empty slice. Otherwise the input is borrowed
+/// whenever the normalized value is a contiguous sub-slice of it (for example `foo bar`);
+/// a new buffer is allocated only once a byte actually has to be replaced or dropped.
+fn normalize_attribute_value(attr: &[u8]) -> Cow<[u8]> {
+    // TODO: character references, entity references, error handling associated with those
+
+    fn is_whitespace_like(byte: u8) -> bool {
+        matches!(byte, b'\n' | b'\r' | b'\t' | b' ')
+    }
+
+    // Trim all whitespace-like bytes away from the beginning and end of the attribute value.
+    let begin = match attr.iter().position(|&b| !is_whitespace_like(b)) {
+        Some(begin) => begin,
+        // The value is empty or consists only of whitespace-like bytes.
+        None => return Cow::Borrowed(b""),
+    };
+    // A first non-whitespace byte exists, so a last one exists as well; falling back to
+    // `begin` (instead of `attr.len()`) keeps the inclusive range below in bounds regardless.
+    let end = attr
+        .iter()
+        .rposition(|&b| !is_whitespace_like(b))
+        .unwrap_or(begin);
+    let trimmed = &attr[begin..=end];
+
+    // The output buffer is created lazily, the first time the output diverges from the input.
+    let mut normalized: Option<Vec<u8>> = None;
+    // `trimmed` starts with a non-whitespace byte, so we never start inside a run.
+    let mut in_whitespace_run = false;
+
+    for (idx, &byte) in trimmed.iter().enumerate() {
+        let is_space = is_whitespace_like(byte);
+        // What this byte contributes to the output: nothing if it continues a run of
+        // whitespace, a single space if it starts one, and the byte itself otherwise.
+        let emitted = match (is_space, in_whitespace_run) {
+            (true, true) => None,
+            (true, false) => Some(b' '),
+            (false, _) => Some(byte),
+        };
+        in_whitespace_run = is_space;
+
+        match normalized.as_mut() {
+            Some(buf) => buf.extend(emitted),
+            // First divergence: everything before `idx` was emitted verbatim, so it can be
+            // copied as is. The output is never longer than the trimmed input.
+            None if emitted != Some(byte) => {
+                let mut buf = Vec::with_capacity(trimmed.len());
+                buf.extend_from_slice(&trimmed[..idx]);
+                buf.extend(emitted);
+                normalized = Some(buf);
+            }
+            // Still identical to the input: keep borrowing.
+            None => {}
+        }
+    }
+
+    match normalized {
+        Some(buf) => Cow::Owned(buf),
+        None => Cow::Borrowed(trimmed),
+    }
+}
+
+impl<'a> Iterator for Attributes<'a> {
+    type Item = Result<Attribute<'a>>;
+
+    /// Scans `self.bytes` from `self.position` and yields the next attribute.
+    ///
+    /// On success the attribute's value has been passed through
+    /// `normalize_attribute_value`. After any error `self.position` is moved to the end of
+    /// the input, so the following call returns `None`.
+    ///
+    /// When `self.html` is set, keys without a value and unquoted values are accepted;
+    /// otherwise they end the iteration or are reported as errors, as the arms below show.
+    fn next(&mut self) -> Option<Self::Item> {
+        let len = self.bytes.len();
+
+        // Stops the iteration (position = end of input) and returns the given error.
+        macro_rules! err {
+            ($err:expr) => {{
+                self.position = len;
+                return Some(Err($err.into()));
+            }};
+        }
+
+        // One-argument form: a key without a value. Ends the iteration and yields an
+        // attribute with an empty value in HTML mode, `None` otherwise.
+        // Two-argument form: builds the attribute from the key and value byte ranges.
+        //
+        // NOTE(review): in the one-argument form `$key` is expanded after
+        // `self.position = len`, so a key expression that reads `self.position` (as in the
+        // `start_key` fallback below) sees the updated value - confirm this is intended.
+        macro_rules! attr {
+            ($key:expr) => {{
+                self.position = len;
+                if self.html {
+                    attr!($key, 0..0)
+                } else {
+                    None
+                }
+            }};
+            ($key:expr, $val:expr) => {
+                Some(Ok(Attribute {
+                    key: &self.bytes[$key],
+                    value: normalize_attribute_value(&self.bytes[$val]),
+                }))
+            };
+        }
+
+        // Everything has been consumed already.
+        if len <= self.position {
+            return None;
+        }
+
+        let mut bytes = self.bytes.iter().enumerate().skip(self.position);
+
+        // key starts after the whitespace
+        let start_key = match bytes
+            .by_ref()
+            .skip_while(|&(_, &b)| !is_whitespace(b))
+            .find(|&(_, &b)| !is_whitespace(b))
+        {
+            Some((i, _)) => i,
+            None => return attr!(self.position..len),
+        };
+
+        // key ends with either whitespace or =
+        let end_key = match bytes
+            .by_ref()
+            .find(|&(_, &b)| b == b'=' || is_whitespace(b))
+        {
+            Some((i, &b'=')) => i,
+            // NOTE(review): the `find` above only stops on `=` or whitespace, so this arm
+            // looks unreachable unless `is_whitespace` accepts quote bytes - confirm.
+            Some((i, &b'\'')) | Some((i, &b'"')) if self.with_checks => {
+                err!(Error::NameWithQuote(i));
+            }
+            Some((i, _)) => {
+                // consume until `=` or return if html
+                match bytes.by_ref().find(|&(_, &b)| !is_whitespace(b)) {
+                    Some((_, &b'=')) => i,
+                    Some((j, _)) if self.html => {
+                        // Step back so the byte at `j` is seen again by the next call.
+                        self.position = j - 1;
+                        return attr!(start_key..i, 0..0);
+                    }
+                    Some((j, _)) => err!(Error::NoEqAfterName(j)),
+                    None if self.html => {
+                        self.position = len;
+                        return attr!(start_key..len, 0..0);
+                    }
+                    None => err!(Error::NoEqAfterName(len)),
+                }
+            }
+            None => return attr!(start_key..len),
+        };
+
+        // With checks enabled, reject a key whose bytes equal those of a previously
+        // consumed key, then remember this key's range.
+        if self.with_checks {
+            if let Some(start) = self
+                .consumed
+                .iter()
+                .filter(|r| r.len() == end_key - start_key)
+                .find(|r| self.bytes[(*r).clone()] == self.bytes[start_key..end_key])
+                .map(|ref r| r.start)
+            {
+                err!(Error::DuplicatedAttribute(start_key, start));
+            }
+            self.consumed.push(start_key..end_key);
+        }
+
+        // value has quote if not html
+        match bytes.by_ref().find(|&(_, &b)| !is_whitespace(b)) {
+            Some((i, quote @ &b'\'')) | Some((i, quote @ &b'"')) => {
+                // The value runs up to the next occurrence of the same quote byte.
+                match bytes.by_ref().find(|&(_, &b)| b == *quote) {
+                    Some((j, _)) => {
+                        self.position = j + 1;
+                        return attr!(start_key..end_key, i + 1..j);
+                    }
+                    None => err!(Error::UnquotedValue(i)),
+                }
+            }
+            Some((i, _)) if self.html => {
+                // Unquoted HTML value: runs up to the next whitespace or the end of input.
+                let j = bytes
+                    .by_ref()
+                    .find(|&(_, &b)| is_whitespace(b))
+                    .map_or(len, |(j, _)| j);
+                self.position = j;
+                return attr!(start_key..end_key, i..j);
+            }
+            Some((i, _)) => err!(Error::UnquotedValue(i)),
+            None => return attr!(start_key..end_key),
+        }
+    }
+}
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
 /// Iterator over XML attributes.
@@ -2353,4 +2544,31 @@ mod html {
         assert_eq!(iter.next(), None);
         assert_eq!(iter.next(), None);
     }
+
+    /// Checks trimming, whitespace replacement and whitespace collapsing performed by
+    /// `normalize_attribute_value`, including the degenerate inputs.
+    #[test]
+    fn attribute_value_normalization() {
+        // empty value
+        assert_eq!(normalize_attribute_value(b"").as_ref(), b"");
+        // a value made only of whitespace-like characters normalizes to the empty value
+        assert_eq!(normalize_attribute_value(b" \t\r\n ").as_ref(), b"");
+        // a single character survives, with or without surrounding whitespace
+        assert_eq!(normalize_attribute_value(b"a").as_ref(), b"a");
+        assert_eq!(normalize_attribute_value(b" a ").as_ref(), b"a");
+        // an already normalized value is returned unchanged
+        assert_eq!(normalize_attribute_value(b"foo bar").as_ref(), b"foo bar");
+        // return, tab, and newline characters (0xD, 0x9, 0xA) must be replaced with a space character
+        assert_eq!(
+            normalize_attribute_value(b"\rfoo\rbar\tbaz\ndelta\n").as_ref(),
+            b"foo bar baz delta"
+        );
+        // leading and trailing spaces must be stripped
+        assert_eq!(normalize_attribute_value(b"  foo ").as_ref(), b"foo");
+        // leading space
+        assert_eq!(normalize_attribute_value(b" bar").as_ref(), b"bar");
+        // trailing space
+        assert_eq!(normalize_attribute_value(b"baz ").as_ref(), b"baz");
+        // sequences of spaces must be replaced with a single space
+        assert_eq!(
+            normalize_attribute_value(b"   foo bar   baz ").as_ref(),
+            b"foo bar baz"
+        );
+        // sequence replacement mixed with characters treated as whitespace (\t \r \n)
+        assert_eq!(
+            normalize_attribute_value(b" \tfoo\tbar \rbaz \n\ndelta\n\t\r echo foxtrot\r").as_ref(),
+            b"foo bar baz delta echo foxtrot"
+        );
+    }
 }