diff --git a/Changelog.md b/Changelog.md index 8b30ab08..f891dd3d 100644 --- a/Changelog.md +++ b/Changelog.md @@ -12,6 +12,8 @@ ### New Features +- [#379]: Support for the full XML attribute value normalization process + ### Bug Fixes - [#537]: Restore ability to deserialize attributes that represents XML namespace @@ -115,6 +117,11 @@ - [#489]: Reduced the size of the package uploaded into the crates.io by excluding tests, examples, and benchmarks. +### New Tests + +- [#379]: Added tests for attribute value normalization + +[#379]: https://github.com/tafia/quick-xml/pull/379 [#481]: https://github.com/tafia/quick-xml/pull/481 [#489]: https://github.com/tafia/quick-xml/pull/489 @@ -155,6 +162,9 @@ - [#395]: Add support for XML Schema `xs:list` - [#324]: `Reader::from_str` / `Deserializer::from_str` / `from_str` now ignore the XML declared encoding and always use UTF-8 +- [#379]: Add full support for XML attribute value normalization via + `Attribute::normalized_value()` and + `Attribute::normalized_value_with_custom_entities()` - [#416]: Add `borrow()` methods in all event structs which allows to get a borrowed version of any event - [#437]: Split out namespace reading functionality to a dedicated `NsReader`, namely: diff --git a/benches/macrobenches.rs b/benches/macrobenches.rs index a8cbbd53..0d717db5 100644 --- a/benches/macrobenches.rs +++ b/benches/macrobenches.rs @@ -43,14 +43,13 @@ static INPUTS: &[(&str, &str)] = &[ ("players.xml", PLAYERS), ]; -// TODO: use fully normalized attribute values fn parse_document_from_str(doc: &str) -> XmlResult<()> { let mut r = Reader::from_str(doc); loop { match criterion::black_box(r.read_event()?) { Event::Start(e) | Event::Empty(e) => { for attr in e.attributes() { - criterion::black_box(attr?.decode_and_unescape_value(&r)?); + criterion::black_box(attr?.normalized_value()?); } } Event::Text(e) => { diff --git a/benches/microbenches.rs b/benches/microbenches.rs index aa5c8b70..4599ddfa 100644 --- a/benches/microbenches.rs +++ b/benches/microbenches.rs @@ -242,6 +242,130 @@ fn attributes(c: &mut Criterion) { assert_eq!(count, 150); }) }); + + group.finish(); +} + +/// Benchmarks normalizing attribute values +fn attribute_value_normalization(c: &mut Criterion) { + let mut group = c.benchmark_group("attribute_value_normalization"); + + group.bench_function("noop_short", |b| { + b.iter(|| { + criterion::black_box(unescape(b"foobar")).unwrap(); + }) + }); + + group.bench_function("noop_long", |b| { + b.iter(|| { + criterion::black_box(unescape(b"just a bit of text without any entities")).unwrap(); + }) + }); + + group.bench_function("replacement_chars", |b| { + b.iter(|| { + criterion::black_box(unescape(b"just a bit\n of text without\tany entities")).unwrap(); + }) + }); + + group.bench_function("char_reference", |b| { + b.iter(|| { + let text = b"prefix "some stuff","more stuff""; + criterion::black_box(unescape(text)).unwrap(); + let text = b"&<"; + criterion::black_box(unescape(text)).unwrap(); + }) + }); + + group.bench_function("entity_reference", |b| { + b.iter(|| { + let text = b"age > 72 && age < 21"; + criterion::black_box(unescape(text)).unwrap(); + let text = b""what's that?""; + criterion::black_box(unescape(text)).unwrap(); + }) + }); + + group.finish(); +} + +/// Benchmarks the `BytesText::unescaped()` method (includes time of `read_event` +/// benchmark) +fn bytes_text_unescaped(c: &mut Criterion) { + let mut group = c.benchmark_group("BytesText::unescaped"); + group.bench_function("trim_text = false", |b| { + b.iter(|| { + let mut buf = Vec::new(); + let mut r = Reader::from_reader(SAMPLE); + r.check_end_names(false).check_comments(false); + let mut count = criterion::black_box(0); + let mut nbtxt = criterion::black_box(0); + loop { + match r.read_event(&mut buf) { + Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1, + Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(), + Ok(Event::Eof) => break, + _ => (), + } + buf.clear(); + } + assert_eq!( + count, 1550, + "Overall tag count in ./tests/documents/sample_rss.xml" + ); + + // Windows has \r\n instead of \n + #[cfg(windows)] + assert_eq!( + nbtxt, 67661, + "Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml" + ); + + #[cfg(not(windows))] + assert_eq!( + nbtxt, 66277, + "Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml" + ); + }); + }); + + group.bench_function("trim_text = true", |b| { + b.iter(|| { + let mut buf = Vec::new(); + let mut r = Reader::from_reader(SAMPLE); + r.check_end_names(false) + .check_comments(false) + .trim_text(true); + let mut count = criterion::black_box(0); + let mut nbtxt = criterion::black_box(0); + loop { + match r.read_event(&mut buf) { + Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1, + Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(), + Ok(Event::Eof) => break, + _ => (), + } + buf.clear(); + } + assert_eq!( + count, 1550, + "Overall tag count in ./tests/documents/sample_rss.xml" + ); + + // Windows has \r\n instead of \n + #[cfg(windows)] + assert_eq!( + nbtxt, 50334, + "Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml" + ); + + #[cfg(not(windows))] + assert_eq!( + nbtxt, 50261, + "Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml" + ); + }); + }); group.finish(); } @@ -354,6 +478,7 @@ criterion_group!( read_resolved_event_into, one_event, attributes, + attribute_value_normalization, escaping, unescaping, ); diff --git a/src/errors.rs b/src/errors.rs index 0afd5a02..f0be93ef 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -77,6 +77,7 @@ impl From for Error { } impl From for Error { + /// Creates a new `Error::InvalidAttr` from the given error #[inline] fn from(error: AttrError) -> Self { Error::InvalidAttr(error) diff --git a/src/escapei.rs b/src/escapei.rs index f537233a..53def45e 100644 --- a/src/escapei.rs +++ b/src/escapei.rs @@ -8,7 +8,7 @@ use std::ops::Range; use pretty_assertions::assert_eq; /// Error for XML escape / unescape. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, PartialEq)] pub enum EscapeError { /// Entity with Null character EntityWithNull(Range), @@ -213,7 +213,7 @@ where } #[cfg(not(feature = "escape-html"))] -fn named_entity(name: &str) -> Option<&str> { +pub(crate) fn named_entity(name: &str) -> Option<&str> { // match over strings are not allowed in const functions let s = match name.as_bytes() { b"lt" => "<", @@ -226,11 +226,10 @@ fn named_entity(name: &str) -> Option<&str> { Some(s) } #[cfg(feature = "escape-html")] -fn named_entity(name: &str) -> Option<&str> { +pub(crate) fn named_entity(name: &str) -> Option<&str> { // imported from https://dev.w3.org/html5/html-author/charref - // match over strings are not allowed in const functions //TODO: automate up-to-dating using https://html.spec.whatwg.org/entities.json - let s = match name.as_bytes() { + match name.as_bytes() { b"Tab" => "\u{09}", b"NewLine" => "\u{0A}", b"excl" => "\u{21}", @@ -1690,7 +1689,7 @@ fn named_entity(name: &str) -> Option<&str> { Some(s) } -fn parse_number(bytes: &str, range: Range) -> Result { +pub(crate) fn parse_number(bytes: &str, range: Range) -> Result { let code = if bytes.starts_with('x') { parse_hexadecimal(&bytes[1..]) } else { @@ -1705,7 +1704,7 @@ fn parse_number(bytes: &str, range: Range) -> Result { } } -fn parse_hexadecimal(bytes: &str) -> Result { +pub(crate) fn parse_hexadecimal(bytes: &str) -> Result { // maximum code is 0x10FFFF => 6 characters if bytes.len() > 6 { return Err(EscapeError::TooLongHexadecimal); @@ -1723,7 +1722,7 @@ fn parse_hexadecimal(bytes: &str) -> Result { Ok(code) } -fn parse_decimal(bytes: &str) -> Result { +pub(crate) fn parse_decimal(bytes: &str) -> Result { // maximum code is 0x10FFFF = 1114111 => 7 characters if bytes.len() > 7 { return Err(EscapeError::TooLongDecimal); diff --git a/src/events/attributes.rs b/src/events/attributes.rs index e2cf8c62..5579ff29 100644 --- a/src/events/attributes.rs +++ b/src/events/attributes.rs @@ -4,6 +4,7 @@ use crate::errors::Result as XmlResult; use crate::escape::{escape, unescape_with}; +use crate::escapei::{self, EscapeError}; use crate::name::QName; use crate::reader::{is_whitespace, Reader}; use crate::utils::{write_byte_string, write_cow_string, Bytes}; @@ -30,7 +31,85 @@ pub struct Attribute<'a> { } impl<'a> Attribute<'a> { - /// Decodes using UTF-8 then unescapes the value. + /// Returns the attribute value normalized as per the XML specification. + /// + /// https://www.w3.org/TR/xml/#AVNormalize + /// + /// Do not use this method with HTML attributes. + /// + /// Escape sequences such as `>` are replaced with their unescaped equivalents such as `>` + /// and the characters \t, \r, \n are replaced with whitespace characters. + /// + /// This will allocate unless the raw attribute value does not require normalization. + /// + /// See also [`normalized_value_with_custom_entities()`](#method.normalized_value_with_custom_entities) + pub fn normalized_value(&'a self) -> Result, EscapeError> { + self.normalized_value_with(|_| None) + } + + /// Returns the attribute value normalized as per the XML specification, using custom entities. + /// + /// https://www.w3.org/TR/xml/#AVNormalize + /// + /// Do not use this method with HTML attributes. + /// + /// Escape sequences such as `>` are replaced with their unescaped equivalents such as `>` + /// and the characters \t, \r, \n are replaced with whitespace characters. + /// Additional entities can be provided in `custom_entities`. + /// + /// This will allocate unless the raw attribute value does not require normalization. + /// + /// See also [`normalized_value()`](#method.normalized_value) + /// + /// # Pre-condition + /// + /// The keys and values of `custom_entities`, if any, must be valid UTF-8. + pub fn normalized_value_with<'entity>( + &'a self, + resolve_entity: impl Fn(&str) -> Option<&'entity str>, + ) -> Result, EscapeError> { + // TODO: avoid allocation when not needed + let mut normalized = String::with_capacity(self.value.len()); + + let attr = self.value.as_ref(); + let mut attr_iter = attr.iter().enumerate(); + + while let Some((idx, ch)) = attr_iter.next() { + match ch { + b' ' | b'\n' | b'\r' | b'\t' => normalized.push(' '), + b'&' => { + let end = idx + + 1 + + attr_iter + .position(|(_, c)| *c == b';') + .ok_or_else(|| EscapeError::UnterminatedEntity(idx..attr.len()))?; + let entity = &attr[idx + 1..end]; // starts after the & + let entity_str = std::str::from_utf8(entity).expect("failed UTF-8 check"); + + if let Some(s) = escapei::named_entity(entity_str) { + normalized.push_str(s); + } else if entity.starts_with(b"#") { + let entity = &entity_str[1..]; // starts after the # + let codepoint = escapei::parse_number(entity_str, idx..end)?; + normalized.push_str(codepoint.encode_utf8(&mut [0u8; 4])); + } else if let Some(value) = resolve_entity(entity_str) { + // TODO: recursively apply entity substitution + normalized.push_str(&value); + } else { + return Err(EscapeError::UnrecognizedSymbol( + idx + 1..end, + String::from_utf8(entity.to_vec()).expect("failed UTF-8 check"), + )); + } + } + _ => normalized.push(*ch as char), + } + } + + Ok(Cow::Owned(normalized)) + } + + /// Returns the unescaped value. /// /// This is normally the value you are interested in. Escape sequences such as `>` are /// replaced with their unescaped equivalents such as `>`. @@ -791,6 +870,57 @@ mod xml { use super::*; use pretty_assertions::assert_eq; + #[test] + fn attribute_value_normalization() { + // empty value + let raw_value = "".as_bytes(); + let output = "".to_owned(); + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + assert_eq!(attr.normalized_value(), Ok(Cow::Owned(output))); + + // return, tab, and newline characters (0xD, 0x9, 0xA) must be substituted with a space character + let raw_value = "\r\nfoo\rbar\tbaz\n\ndelta\n".as_bytes(); + let output = " foo bar baz delta ".to_owned(); + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + assert_eq!(attr.normalized_value(), Ok(Cow::Owned(output))); + + // entities must be terminated + let raw_value = "abc"def".as_bytes(); + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + assert_eq!( + attr.normalized_value(), + Err(EscapeError::UnterminatedEntity(3..11)) + ); + + // unknown entities raise error + let raw_value = "abc&unkn;def".as_bytes(); + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + assert_eq!( + attr.normalized_value(), + Err(EscapeError::UnrecognizedSymbol(4..8, "unkn".to_owned())) // TODO: is this divergence between range behavior of UnterminatedEntity and UnrecognizedSymbol appropriate. shared with unescape code + ); + + // // custom entity replacement works, entity replacement text processed recursively + // let raw_value = "&d;&d;A&a; &a;B&da;".as_bytes(); + // let output = b" A B ".to_vec(); + // let attr = Attribute::from(("foo".as_bytes(), raw_value)); + // let mut custom_entities = HashMap::new(); + // custom_entities.insert(b"d".to_vec(), b" ".to_vec()); + // custom_entities.insert(b"a".to_vec(), b" ".to_vec()); + // custom_entities.insert(b"da".to_vec(), b" ".to_vec()); + // dbg!(std::str::from_utf8(attr.normalized_value_with_custom_entities(&custom_entities).unwrap().as_ref()).unwrap()); + // assert_eq!( + // attr.normalized_value_with_custom_entities(&custom_entities), + // Ok(Cow::Owned::<[u8]>(output)) + // ); + + // character literal references are substituted without being replaced by spaces + let raw_value = " A B ".as_bytes(); + let output = "\r\rA\n\nB\r\n".to_owned(); + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + assert_eq!(attr.normalized_value(), Ok(Cow::Owned(output))); + } + /// Checked attribute is the single attribute mod single { use super::*; diff --git a/src/lib.rs b/src/lib.rs index 5e8a20c2..992928fa 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -51,8 +51,7 @@ mod errors; mod escapei; pub mod escape { //! Manage xml character escapes - pub(crate) use crate::escapei::EscapeError; - pub use crate::escapei::{escape, partial_escape, unescape, unescape_with}; + pub use crate::escapei::{escape, partial_escape, unescape, unescape_with, EscapeError}; } pub mod events; pub mod name;