From 968d927fe24f1c62daa4d37d83bc5cd47a7aab67 Mon Sep 17 00:00:00 2001 From: Mingun Date: Thu, 6 Jun 2024 23:19:48 +0500 Subject: [PATCH 1/9] Stop at the `>` in PiParser which is consistent with other search functions The parser search the end of processing instruction and this is the last byte of it --- src/reader/buffered_reader.rs | 6 +++--- src/reader/pi.rs | 24 +++++++++++------------- src/reader/slice_reader.rs | 8 ++++---- 3 files changed, 18 insertions(+), 20 deletions(-) diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs index 6436de3a..cbc6bb07 100644 --- a/src/reader/buffered_reader.rs +++ b/src/reader/buffered_reader.rs @@ -115,10 +115,10 @@ macro_rules! impl_buffered_source { match parser.feed(available) { Some(i) => { - // We does not include `>` in data - buf.extend_from_slice(&available[..i - 1]); + buf.extend_from_slice(&available[..i]); done = true; - i + // +1 for `>` which we do not include + i + 1 } None => { buf.extend_from_slice(available); diff --git a/src/reader/pi.rs b/src/reader/pi.rs index 7729b3ed..1b44596c 100644 --- a/src/reader/pi.rs +++ b/src/reader/pi.rs @@ -6,7 +6,7 @@ /// After successful search the parser will return [`Some`] with position where /// processing instruction is ended (the position after `?>`). If search was /// unsuccessful, a [`None`] will be returned. You typically would expect positive -/// result of search, so that you should feed new data until yo'll get it. +/// result of search, so that you should feed new data until you get it. /// /// NOTE: after successful match the parser does not returned to the initial /// state and should not be used anymore. 
Create a new parser if you want to perform @@ -25,9 +25,9 @@ /// // ...get new chunk of data /// assert_eq!(parser.feed(b" with = 'some > and ?"), None); /// // ...get another chunk of data -/// assert_eq!(parser.feed(b"' inside?>and the text follow..."), Some(10)); -/// // ^ ^ -/// // 0 10 +/// assert_eq!(parser.feed(b"' inside?>and the text follow..."), Some(9)); +/// // ^ ^ +/// // 0 9 /// ``` /// /// [`feed`]: Self::feed() @@ -56,11 +56,9 @@ impl PiParser { pub fn feed(&mut self, bytes: &[u8]) -> Option { for i in memchr::memchr_iter(b'>', bytes) { match i { - // +1 for `>` which should be included in event - 0 if self.0 => return Some(1), + 0 if self.0 => return Some(0), // If the previous byte is `?`, then we found `?>` - // +1 for `>` which should be included in event - i if i > 0 && bytes[i - 1] == b'?' => return Some(i + 1), + i if i > 0 && bytes[i - 1] == b'?' => return Some(i), _ => {} } } @@ -95,11 +93,11 @@ fn pi() { assert_eq!(parse_pi(b"?", true), Err(true)); // ?|? assert_eq!(parse_pi(b">", false), Err(false)); // x|> - assert_eq!(parse_pi(b">", true), Ok(1)); // ?|> + assert_eq!(parse_pi(b">", true), Ok(0)); // ?|> - assert_eq!(parse_pi(b"?>", false), Ok(2)); // x|?> - assert_eq!(parse_pi(b"?>", true), Ok(2)); // ?|?> + assert_eq!(parse_pi(b"?>", false), Ok(1)); // x|?> + assert_eq!(parse_pi(b"?>", true), Ok(1)); // ?|?> - assert_eq!(parse_pi(b">?>", false), Ok(3)); // x|>?> - assert_eq!(parse_pi(b">?>", true), Ok(1)); // ?|>?> + assert_eq!(parse_pi(b">?>", false), Ok(2)); // x|>?> + assert_eq!(parse_pi(b">?>", true), Ok(0)); // ?|>?> } diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs index b618ae65..78b5cb08 100644 --- a/src/reader/slice_reader.rs +++ b/src/reader/slice_reader.rs @@ -279,10 +279,10 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] { let mut parser = PiParser::default(); if let Some(i) = parser.feed(self) { - *position += i; - // We does not include `>` in data - let bytes = &self[..i - 1]; - *self = &self[i..]; + // 
+1 for `>` which we do not include + *position += i + 1; + let bytes = &self[..i]; + *self = &self[i + 1..]; Ok((bytes, true)) } else { *position += self.len(); From 8d424764e1bed46a44d001339031cb221f11e4c6 Mon Sep 17 00:00:00 2001 From: Mingun Date: Thu, 6 Jun 2024 22:22:22 +0500 Subject: [PATCH 2/9] Implement XmlSource::read_pi like XmlSource::read_element Return error from read_pi function instead of returning flag and later converting it to the error --- src/reader/buffered_reader.rs | 17 ++++++++++------- src/reader/mod.rs | 9 ++++----- src/reader/slice_reader.rs | 12 +++++------- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs index cbc6bb07..8aaa5baa 100644 --- a/src/reader/buffered_reader.rs +++ b/src/reader/buffered_reader.rs @@ -95,13 +95,12 @@ macro_rules! impl_buffered_source { &mut self, buf: &'b mut Vec, position: &mut usize, - ) -> Result<(&'b [u8], bool)> { + ) -> Result<&'b [u8]> { let mut parser = super::PiParser::default(); let mut read = 0; - let mut done = false; let start = buf.len(); - while !done { + loop { let used = { let available = match self $(.$reader)? .fill_buf() $(.$await)? { Ok(n) if n.is_empty() => break, @@ -116,9 +115,13 @@ macro_rules! impl_buffered_source { match parser.feed(available) { Some(i) => { buf.extend_from_slice(&available[..i]); - done = true; + // +1 for `>` which we do not include - i + 1 + self $(.$reader)? .consume(i + 1); + read += i + 1; + + *position += read; + return Ok(&buf[start..]); } None => { buf.extend_from_slice(available); @@ -129,9 +132,9 @@ macro_rules! impl_buffered_source { self $(.$reader)? .consume(used); read += used; } - *position += read; - Ok((&buf[start..], done)) + *position += read; + Err(Error::Syntax(SyntaxError::UnclosedPIOrXmlDecl)) } $($async)? fn read_bang_element $(<$lf>)? 
( diff --git a/src/reader/mod.rs b/src/reader/mod.rs index 6de103af..d5cfa237 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -364,14 +364,13 @@ macro_rules! read_until_close { .read_pi($buf, &mut $self.state.offset) $(.$await)? { - Ok((bytes, true)) => $self.state.emit_question_mark(bytes), - Ok((_, false)) => { + Ok(bytes) => $self.state.emit_question_mark(bytes), + Err(e) => { // We want to report error at `<`, but offset was increased, // so return it back (-1 for `<`) $self.state.last_error_offset = start - 1; - Err(Error::Syntax(SyntaxError::UnclosedPIOrXmlDecl)) + Err(e) } - Err(e) => Err(e), }, // `<...` - opening or self-closed tag Ok(Some(_)) => match $reader @@ -833,7 +832,7 @@ trait XmlSource<'r, B> { /// - `position`: Will be increased by amount of bytes consumed /// /// [events]: crate::events::Event - fn read_pi(&mut self, buf: B, position: &mut usize) -> Result<(&'r [u8], bool)>; + fn read_pi(&mut self, buf: B, position: &mut usize) -> Result<&'r [u8]>; /// Read input until comment or CDATA is finished. 
/// diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs index 78b5cb08..ad2925b3 100644 --- a/src/reader/slice_reader.rs +++ b/src/reader/slice_reader.rs @@ -275,7 +275,7 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] { } } - fn read_pi(&mut self, _buf: (), position: &mut usize) -> Result<(&'a [u8], bool)> { + fn read_pi(&mut self, _buf: (), position: &mut usize) -> Result<&'a [u8]> { let mut parser = PiParser::default(); if let Some(i) = parser.feed(self) { @@ -283,13 +283,11 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] { *position += i + 1; let bytes = &self[..i]; *self = &self[i + 1..]; - Ok((bytes, true)) - } else { - *position += self.len(); - let bytes = &self[..]; - *self = &[]; - Ok((bytes, false)) + return Ok(bytes); } + + *position += self.len(); + Err(Error::Syntax(SyntaxError::UnclosedPIOrXmlDecl)) } fn read_bang_element( From 91755fe7125ae2e24fa24f40b1ac6d9c16edb6a1 Mon Sep 17 00:00:00 2001 From: Mingun Date: Thu, 6 Jun 2024 22:56:54 +0500 Subject: [PATCH 3/9] Use `if let` instead of `match` (Review in whitespace changes ignored mode) --- src/reader/buffered_reader.rs | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs index 8aaa5baa..8fbc0ba9 100644 --- a/src/reader/buffered_reader.rs +++ b/src/reader/buffered_reader.rs @@ -112,22 +112,19 @@ macro_rules! impl_buffered_source { } }; - match parser.feed(available) { - Some(i) => { - buf.extend_from_slice(&available[..i]); + if let Some(i) = parser.feed(available) { + buf.extend_from_slice(&available[..i]); - // +1 for `>` which we do not include - self $(.$reader)? .consume(i + 1); - read += i + 1; + // +1 for `>` which we do not include + self $(.$reader)? 
.consume(i + 1); + read += i + 1; - *position += read; - return Ok(&buf[start..]); - } - None => { - buf.extend_from_slice(available); - available.len() - } + *position += read; + return Ok(&buf[start..]); } + + buf.extend_from_slice(available); + available.len() }; self $(.$reader)? .consume(used); read += used; From 2ae8843cdb78a9442e3681e2149b5c55e8a10b5f Mon Sep 17 00:00:00 2001 From: Mingun Date: Thu, 6 Jun 2024 23:01:01 +0500 Subject: [PATCH 4/9] Remove unnecessary block (Review in whitespace changes ignored mode) --- src/reader/buffered_reader.rs | 42 +++++++++++++++++------------------ 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs index 8fbc0ba9..58bb24a2 100644 --- a/src/reader/buffered_reader.rs +++ b/src/reader/buffered_reader.rs @@ -101,31 +101,31 @@ macro_rules! impl_buffered_source { let mut read = 0; let start = buf.len(); loop { - let used = { - let available = match self $(.$reader)? .fill_buf() $(.$await)? { - Ok(n) if n.is_empty() => break, - Ok(n) => n, - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, - Err(e) => { - *position += read; - return Err(Error::Io(e.into())); - } - }; + let available = match self $(.$reader)? .fill_buf() $(.$await)? { + Ok(n) if n.is_empty() => break, + Ok(n) => n, + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => { + *position += read; + return Err(Error::Io(e.into())); + } + }; - if let Some(i) = parser.feed(available) { - buf.extend_from_slice(&available[..i]); + if let Some(i) = parser.feed(available) { + buf.extend_from_slice(&available[..i]); - // +1 for `>` which we do not include - self $(.$reader)? .consume(i + 1); - read += i + 1; + // +1 for `>` which we do not include + self $(.$reader)? 
.consume(i + 1); + read += i + 1; - *position += read; - return Ok(&buf[start..]); - } + *position += read; + return Ok(&buf[start..]); + } - buf.extend_from_slice(available); - available.len() - }; + // The `>` symbol not yet found, continue reading + buf.extend_from_slice(available); + + let used = available.len(); self $(.$reader)? .consume(used); read += used; } From 886e7e97f433b9a47643f4a375b7c31e2bb8321b Mon Sep 17 00:00:00 2001 From: Mingun Date: Thu, 6 Jun 2024 21:49:18 +0500 Subject: [PATCH 5/9] Add reusable parser for XML element and use it internally --- Changelog.md | 2 + src/reader/async_tokio.rs | 4 +- src/reader/buffered_reader.rs | 13 ++-- src/reader/element.rs | 113 ++++++++++++++++++++++++++++++++++ src/reader/mod.rs | 36 +---------- src/reader/slice_reader.rs | 13 ++-- 6 files changed, 132 insertions(+), 49 deletions(-) create mode 100644 src/reader/element.rs diff --git a/Changelog.md b/Changelog.md index 08be8fdd..71cd2491 100644 --- a/Changelog.md +++ b/Changelog.md @@ -43,6 +43,7 @@ resolve predefined entities. - `quick_xml::escape::resolve_xml_entity` - `quick_xml::escape::resolve_html5_entity` - [#753]: Added parser for processing instructions: `quick_xml::reader::PiParser`. +- [#754]: Added parser for elements: `quick_xml::reader::ElementParser`. ### Bug Fixes @@ -101,6 +102,7 @@ resolve predefined entities. 
[#743]: https://github.com/tafia/quick-xml/pull/743 [#748]: https://github.com/tafia/quick-xml/pull/748 [#753]: https://github.com/tafia/quick-xml/pull/753 +[#754]: https://github.com/tafia/quick-xml/pull/754 [`DeEvent`]: https://docs.rs/quick-xml/latest/quick_xml/de/enum.DeEvent.html [`PayloadEvent`]: https://docs.rs/quick-xml/latest/quick_xml/de/enum.PayloadEvent.html [`Text`]: https://docs.rs/quick-xml/latest/quick_xml/de/struct.Text.html diff --git a/src/reader/async_tokio.rs b/src/reader/async_tokio.rs index 1cdab220..7337540f 100644 --- a/src/reader/async_tokio.rs +++ b/src/reader/async_tokio.rs @@ -8,9 +8,7 @@ use crate::errors::{Error, Result, SyntaxError}; use crate::events::Event; use crate::name::{QName, ResolveResult}; use crate::reader::buffered_reader::impl_buffered_source; -use crate::reader::{ - is_whitespace, BangType, NsReader, ParseState, ReadElementState, Reader, Span, -}; +use crate::reader::{is_whitespace, BangType, ElementParser, NsReader, ParseState, Reader, Span}; /// A struct for read XML asynchronously from an [`AsyncBufRead`]. /// diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs index 58bb24a2..182e83b7 100644 --- a/src/reader/buffered_reader.rs +++ b/src/reader/buffered_reader.rs @@ -8,7 +8,7 @@ use std::path::Path; use crate::errors::{Error, Result, SyntaxError}; use crate::events::Event; use crate::name::QName; -use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, Span, XmlSource}; +use crate::reader::{is_whitespace, BangType, ElementParser, Reader, Span, XmlSource}; macro_rules! impl_buffered_source { ($($lf:lifetime, $reader:tt, $async:ident, $await:ident)?) => { @@ -190,7 +190,7 @@ macro_rules! impl_buffered_source { buf: &'b mut Vec, position: &mut usize, ) -> Result<&'b [u8]> { - let mut state = ReadElementState::Elem; + let mut parser = ElementParser::default(); let mut read = 0; let start = buf.len(); @@ -198,11 +198,12 @@ macro_rules! impl_buffered_source { match self $(.$reader)? 
.fill_buf() $(.$await)? { Ok(n) if n.is_empty() => break, Ok(available) => { - if let Some((consumed, used)) = state.change(available) { - buf.extend_from_slice(consumed); + if let Some(used) = parser.feed(available) { + buf.extend_from_slice(&available[..used]); - self $(.$reader)? .consume(used); - read += used; + // +1 for `>` which we do not include + self $(.$reader)? .consume(used + 1); + read += used + 1; // Position now just after the `>` symbol *position += read; diff --git a/src/reader/element.rs b/src/reader/element.rs new file mode 100644 index 00000000..e5d14e7c --- /dev/null +++ b/src/reader/element.rs @@ -0,0 +1,113 @@ +//! Contains a parser for an XML element. + +/// A parser that search a `>` symbol in the slice outside of quoted regions. +/// +/// The parser considers two quoted regions: a double-quoted (`"..."`) and +/// a single-quoted (`'...'`) region. Matches found inside those regions are not +/// considered as results. Each region starts and ends by its quote symbol, +/// which cannot be escaped (but can be encoded as XML character entity or named +/// entity. Anyway, that encoding does not contain literal quotes). +/// +/// To use a parser create an instance of parser and [`feed`] data into it. +/// After successful search the parser will return [`Some`] with position of +/// found symbol. If search is unsuccessful, a [`None`] will be returned. You +/// typically would expect positive result of search, so that you should feed +/// new data until you get it. +/// +/// NOTE: after successful match the parser does not returned to the initial +/// state and should not be used anymore. Create a new parser if you want to perform +/// new search. 
+/// +/// # Example +/// +/// ``` +/// # use quick_xml::reader::ElementParser; +/// # use pretty_assertions::assert_eq; +/// let mut parser = ElementParser::default(); +/// +/// // Parse `and the text follow...` +/// // splitted into three chunks +/// assert_eq!(parser.feed(b"and the text follow..."), Some(8)); +/// // ^ ^ +/// // 0 8 +/// ``` +/// +/// [`feed`]: Self::feed() +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum ElementParser { + /// The initial state (inside element, but outside of attribute value). + Outside, + /// Inside a single-quoted region (`'...'`). + SingleQ, + /// Inside a double-quoted region (`"..."`). + DoubleQ, +} + +impl ElementParser { + /// Returns number of consumed bytes or `None` if `>` was not found in `bytes`. + #[inline] + pub fn feed(&mut self, bytes: &[u8]) -> Option { + for i in memchr::memchr3_iter(b'>', b'\'', b'"', bytes) { + *self = match (*self, bytes[i]) { + // only allowed to match `>` while we are in state `Outside` + (Self::Outside, b'>') => return Some(i), + (Self::Outside, b'\'') => Self::SingleQ, + (Self::Outside, b'\"') => Self::DoubleQ, + + // the only end_byte that gets us out if the same character + (Self::SingleQ, b'\'') | (Self::DoubleQ, b'"') => Self::Outside, + + // all other bytes: no state change + _ => continue, + }; + } + None + } +} + +impl Default for ElementParser { + #[inline] + fn default() -> Self { + Self::Outside + } +} + +#[test] +fn parse() { + use pretty_assertions::assert_eq; + use ElementParser::*; + + /// Returns `Ok(pos)` with the position in the buffer where element is ended. + /// + /// Returns `Err(internal_state)` if parsing does not done yet. 
+ fn parse_element(bytes: &[u8], mut parser: ElementParser) -> Result { + match parser.feed(bytes) { + Some(i) => Ok(i), + None => Err(parser), + } + } + + assert_eq!(parse_element(b"", Outside), Err(Outside)); + assert_eq!(parse_element(b"", SingleQ), Err(SingleQ)); + assert_eq!(parse_element(b"", DoubleQ), Err(DoubleQ)); + + assert_eq!(parse_element(b"'", Outside), Err(SingleQ)); + assert_eq!(parse_element(b"'", SingleQ), Err(Outside)); + assert_eq!(parse_element(b"'", DoubleQ), Err(DoubleQ)); + + assert_eq!(parse_element(b"\"", Outside), Err(DoubleQ)); + assert_eq!(parse_element(b"\"", SingleQ), Err(SingleQ)); + assert_eq!(parse_element(b"\"", DoubleQ), Err(Outside)); + + assert_eq!(parse_element(b">", Outside), Ok(0)); + assert_eq!(parse_element(b">", SingleQ), Err(SingleQ)); + assert_eq!(parse_element(b">", DoubleQ), Err(DoubleQ)); + + assert_eq!(parse_element(b"''>", Outside), Ok(2)); + assert_eq!(parse_element(b"''>", SingleQ), Err(SingleQ)); + assert_eq!(parse_element(b"''>", DoubleQ), Err(DoubleQ)); +} diff --git a/src/reader/mod.rs b/src/reader/mod.rs index d5cfa237..97b21e06 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -426,11 +426,13 @@ macro_rules! read_to_end { #[cfg(feature = "async-tokio")] mod async_tokio; mod buffered_reader; +mod element; mod ns_reader; mod pi; mod slice_reader; mod state; +pub use element::ElementParser; pub use ns_reader::NsReader; pub use pi::PiParser; @@ -986,40 +988,6 @@ impl BangType { } } -/// State machine for the [`XmlSource::read_element`] -#[derive(Clone, Copy)] -enum ReadElementState { - /// The initial state (inside element, but outside of attribute value) - Elem, - /// Inside a single-quoted attribute value - SingleQ, - /// Inside a double-quoted attribute value - DoubleQ, -} -impl ReadElementState { - /// Changes state by analyzing part of input. 
- /// Returns a tuple with part of chunk up to element closing symbol `>` - /// and a position after that symbol or `None` if such symbol was not found - #[inline(always)] - fn change<'b>(&mut self, chunk: &'b [u8]) -> Option<(&'b [u8], usize)> { - for i in memchr::memchr3_iter(b'>', b'\'', b'"', chunk) { - *self = match (*self, chunk[i]) { - // only allowed to match `>` while we are in state `Elem` - (Self::Elem, b'>') => return Some((&chunk[..i], i + 1)), - (Self::Elem, b'\'') => Self::SingleQ, - (Self::Elem, b'\"') => Self::DoubleQ, - - // the only end_byte that gets us out if the same character - (Self::SingleQ, b'\'') | (Self::DoubleQ, b'"') => Self::Elem, - - // all other bytes: no state change - _ => *self, - }; - } - None - } -} - /// A function to check whether the byte is a whitespace (blank, new line, carriage return or tab) #[inline] pub(crate) const fn is_whitespace(b: u8) -> bool { diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs index ad2925b3..e6e89175 100644 --- a/src/reader/slice_reader.rs +++ b/src/reader/slice_reader.rs @@ -12,7 +12,7 @@ use encoding_rs::{Encoding, UTF_8}; use crate::errors::{Error, Result, SyntaxError}; use crate::events::Event; use crate::name::QName; -use crate::reader::{is_whitespace, BangType, PiParser, ReadElementState, Reader, Span, XmlSource}; +use crate::reader::{is_whitespace, BangType, ElementParser, PiParser, Reader, Span, XmlSource}; /// This is an implementation for reading from a `&[u8]` as underlying byte stream. 
/// This implementation supports not using an intermediate buffer as the byte slice @@ -312,12 +312,13 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] { } fn read_element(&mut self, _buf: (), position: &mut usize) -> Result<&'a [u8]> { - let mut state = ReadElementState::Elem; + let mut parser = ElementParser::default(); - if let Some((bytes, i)) = state.change(self) { - // Position now just after the `>` symbol - *position += i; - *self = &self[i..]; + if let Some(i) = parser.feed(self) { + // +1 for `>` which we do not include + *position += i + 1; + let bytes = &self[..i]; + *self = &self[i + 1..]; return Ok(bytes); } From 23dc390172476e30b42f4be65c41cb431db34411 Mon Sep 17 00:00:00 2001 From: Mingun Date: Thu, 6 Jun 2024 23:04:23 +0500 Subject: [PATCH 6/9] Rewrite read_element like read_pi --- src/reader/buffered_reader.rs | 39 +++++++++++++++++------------------ 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs index 182e83b7..29aebf7a 100644 --- a/src/reader/buffered_reader.rs +++ b/src/reader/buffered_reader.rs @@ -195,34 +195,33 @@ macro_rules! impl_buffered_source { let start = buf.len(); loop { - match self $(.$reader)? .fill_buf() $(.$await)? { + let available = match self $(.$reader)? .fill_buf() $(.$await)? { Ok(n) if n.is_empty() => break, - Ok(available) => { - if let Some(used) = parser.feed(available) { + Ok(n) => n, + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => { + *position += read; + return Err(Error::Io(e.into())); + } + }; + + if let Some(used) = parser.feed(available) { buf.extend_from_slice(&available[..used]); // +1 for `>` which we do not include self $(.$reader)? 
.consume(used + 1); read += used + 1; - // Position now just after the `>` symbol - *position += read; - return Ok(&buf[start..]); - } else { - // The `>` symbol not yet found, continue reading - buf.extend_from_slice(available); + // Position now just after the `>` symbol + *position += read; + return Ok(&buf[start..]); + } - let used = available.len(); - self $(.$reader)? .consume(used); - read += used; - } - } - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, - Err(e) => { - *position += read; - return Err(Error::Io(e.into())); - } - }; + // The `>` symbol not yet found, continue reading + buf.extend_from_slice(available); + let used = available.len(); + self $(.$reader)? .consume(used); + read += used; } *position += read; From 80e3b0a8eff6c68f158ee196c27b6559c9f887e8 Mon Sep 17 00:00:00 2001 From: Mingun Date: Fri, 7 Jun 2024 02:15:26 +0500 Subject: [PATCH 7/9] Generalize reading methods of PI and element They are identical except different type of parser used. --- src/reader/async_tokio.rs | 4 +- src/reader/buffered_reader.rs | 55 ++-------------------- src/reader/element.rs | 15 ++++-- src/reader/mod.rs | 86 ++++++++++++++++++----------------- src/reader/pi.rs | 16 +++++-- src/reader/slice_reader.rs | 28 +++--------- 6 files changed, 84 insertions(+), 120 deletions(-) diff --git a/src/reader/async_tokio.rs b/src/reader/async_tokio.rs index 7337540f..8e56346b 100644 --- a/src/reader/async_tokio.rs +++ b/src/reader/async_tokio.rs @@ -8,7 +8,9 @@ use crate::errors::{Error, Result, SyntaxError}; use crate::events::Event; use crate::name::{QName, ResolveResult}; use crate::reader::buffered_reader::impl_buffered_source; -use crate::reader::{is_whitespace, BangType, ElementParser, NsReader, ParseState, Reader, Span}; +use crate::reader::{ + is_whitespace, BangType, ElementParser, NsReader, ParseState, Parser, PiParser, Reader, Span, +}; /// A struct for read XML asynchronously from an [`AsyncBufRead`]. 
/// diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs index 29aebf7a..8585c7c4 100644 --- a/src/reader/buffered_reader.rs +++ b/src/reader/buffered_reader.rs @@ -5,10 +5,10 @@ use std::fs::File; use std::io::{self, BufRead, BufReader}; use std::path::Path; -use crate::errors::{Error, Result, SyntaxError}; +use crate::errors::{Error, Result}; use crate::events::Event; use crate::name::QName; -use crate::reader::{is_whitespace, BangType, ElementParser, Reader, Span, XmlSource}; +use crate::reader::{is_whitespace, BangType, Parser, Reader, Span, XmlSource}; macro_rules! impl_buffered_source { ($($lf:lifetime, $reader:tt, $async:ident, $await:ident)?) => { @@ -91,13 +91,12 @@ macro_rules! impl_buffered_source { Ok((&buf[start..], done)) } - $($async)? fn read_pi $(<$lf>)? ( + $($async)? fn read_with<$($lf,)? P: Parser>( &mut self, + mut parser: P, buf: &'b mut Vec, position: &mut usize, ) -> Result<&'b [u8]> { - let mut parser = super::PiParser::default(); - let mut read = 0; let start = buf.len(); loop { @@ -131,7 +130,7 @@ macro_rules! impl_buffered_source { } *position += read; - Err(Error::Syntax(SyntaxError::UnclosedPIOrXmlDecl)) + Err(Error::Syntax(P::eof_error())) } $($async)? fn read_bang_element $(<$lf>)? ( @@ -184,50 +183,6 @@ macro_rules! impl_buffered_source { Err(bang_type.to_err()) } - #[inline] - $($async)? fn read_element $(<$lf>)? ( - &mut self, - buf: &'b mut Vec, - position: &mut usize, - ) -> Result<&'b [u8]> { - let mut parser = ElementParser::default(); - let mut read = 0; - - let start = buf.len(); - loop { - let available = match self $(.$reader)? .fill_buf() $(.$await)? { - Ok(n) if n.is_empty() => break, - Ok(n) => n, - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, - Err(e) => { - *position += read; - return Err(Error::Io(e.into())); - } - }; - - if let Some(used) = parser.feed(available) { - buf.extend_from_slice(&available[..used]); - - // +1 for `>` which we do not include - self $(.$reader)? 
.consume(used + 1); - read += used + 1; - - // Position now just after the `>` symbol - *position += read; - return Ok(&buf[start..]); - } - - // The `>` symbol not yet found, continue reading - buf.extend_from_slice(available); - let used = available.len(); - self $(.$reader)? .consume(used); - read += used; - } - - *position += read; - Err(Error::Syntax(SyntaxError::UnclosedTag)) - } - $($async)? fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> { loop { break match self $(.$reader)? .fill_buf() $(.$await)? { diff --git a/src/reader/element.rs b/src/reader/element.rs index e5d14e7c..323a8311 100644 --- a/src/reader/element.rs +++ b/src/reader/element.rs @@ -1,5 +1,8 @@ //! Contains a parser for an XML element. +use crate::errors::SyntaxError; +use crate::reader::Parser; + /// A parser that search a `>` symbol in the slice outside of quoted regions. /// /// The parser considers two quoted regions: a double-quoted (`"..."`) and @@ -21,8 +24,9 @@ /// # Example /// /// ``` -/// # use quick_xml::reader::ElementParser; /// # use pretty_assertions::assert_eq; +/// use quick_xml::reader::{ElementParser, Parser}; +/// /// let mut parser = ElementParser::default(); /// /// // Parse `and the text follow...` @@ -47,10 +51,10 @@ pub enum ElementParser { DoubleQ, } -impl ElementParser { +impl Parser for ElementParser { /// Returns number of consumed bytes or `None` if `>` was not found in `bytes`. 
#[inline] - pub fn feed(&mut self, bytes: &[u8]) -> Option { + fn feed(&mut self, bytes: &[u8]) -> Option { for i in memchr::memchr3_iter(b'>', b'\'', b'"', bytes) { *self = match (*self, bytes[i]) { // only allowed to match `>` while we are in state `Outside` @@ -67,6 +71,11 @@ impl ElementParser { } None } + + #[inline] + fn eof_error() -> SyntaxError { + SyntaxError::UnclosedTag + } } impl Default for ElementParser { diff --git a/src/reader/mod.rs b/src/reader/mod.rs index 97b21e06..0301c8c2 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -361,7 +361,7 @@ macro_rules! read_until_close { }, // ` match $reader - .read_pi($buf, &mut $self.state.offset) + .read_with(PiParser::default(), $buf, &mut $self.state.offset) $(.$await)? { Ok(bytes) => $self.state.emit_question_mark(bytes), @@ -374,7 +374,7 @@ macro_rules! read_until_close { }, // `<...` - opening or self-closed tag Ok(Some(_)) => match $reader - .read_element($buf, &mut $self.state.offset) + .read_with(ElementParser::default(), $buf, &mut $self.state.offset) $(.$await)? { Ok(bytes) => $self.state.emit_start(bytes), @@ -763,6 +763,26 @@ impl Reader { //////////////////////////////////////////////////////////////////////////////////////////////////// +/// Used to decouple reading of data from data source and parsing XML structure from it. +/// This is a state preserved between getting chunks of bytes from the reader. +/// +/// This trait is implemented for every parser that processes piece of XML grammar. +pub trait Parser { + /// Process new data and try to determine end of the parsed thing. + /// + /// Returns position of the end of thing in `bytes` in case of successful search + /// and `None` otherwise. + /// + /// # Parameters + /// - `bytes`: a slice to find the end of a thing. 
+ /// Should contain text in ASCII-compatible encoding + fn feed(&mut self, bytes: &[u8]) -> Option; + + /// Returns parse error produced by this parser in case of reaching end of + /// input without finding the end of a parsed thing. + fn eof_error() -> SyntaxError; +} + /// Represents an input for a reader that can return borrowed data. /// /// There are two implementors of this trait: generic one that read data from @@ -821,20 +841,25 @@ trait XmlSource<'r, B> { /// Read input until processing instruction is finished. /// - /// This method expect that ``), - /// which does not include into result (`?` at the end included). + /// Returns a slice of data read up to the end of the thing being parsed. + /// The end of thing and the returned content is determined by the used parser. /// - /// If input (`Self`) is exhausted and nothing was read, returns `None`. + /// If input (`Self`) is exhausted and no bytes was read, or if the specified + /// parser could not find the ending sequence of the thing, returns `SyntaxError`. /// /// # Parameters /// - `buf`: Buffer that could be filled from an input (`Self`) and /// from which [events] could borrow their data /// - `position`: Will be increased by amount of bytes consumed /// + /// A `P` type parameter is used to preserve state between calls to the underlying + /// reader which provides bytes fed into the parser. /// [events]: crate::events::Event - fn read_pi(&mut self, buf: B, position: &mut usize) -> Result<&'r [u8]>; + fn read_with

(&mut self, parser: P, buf: B, position: &mut usize) -> Result<&'r [u8]> + where + P: Parser; /// Read input until comment or CDATA is finished. /// @@ -853,30 +878,6 @@ trait XmlSource<'r, B> { /// [events]: crate::events::Event fn read_bang_element(&mut self, buf: B, position: &mut usize) -> Result<(BangType, &'r [u8])>; - /// Read input until XML element is closed by approaching a `>` symbol. - /// Returns a buffer that contains a data between `<` and `>` or - /// [`SyntaxError::UnclosedTag`] if end-of-input was reached before reading `>`. - /// - /// Derived from `read_until`, but modified to handle XML attributes - /// using a minimal state machine. - /// - /// Attribute values are [defined] as follows: - /// ```plain - /// AttValue := '"' (([^<&"]) | Reference)* '"' - /// | "'" (([^<&']) | Reference)* "'" - /// ``` - /// (`Reference` is something like `"`, but we don't care about - /// escaped characters at this level) - /// - /// # Parameters - /// - `buf`: Buffer that could be filled from an input (`Self`) and - /// from which [events] could borrow their data - /// - `position`: Will be increased by amount of bytes consumed - /// - /// [defined]: https://www.w3.org/TR/xml11/#NT-AttValue - /// [events]: crate::events::Event - fn read_element(&mut self, buf: B, position: &mut usize) -> Result<&'r [u8]>; - /// Consume and discard all the whitespace until the next non-whitespace /// character or EOF. /// @@ -1510,6 +1511,7 @@ mod test { mod read_element { use super::*; use crate::errors::{Error, SyntaxError}; + use crate::reader::ElementParser; use crate::utils::Bytes; use pretty_assertions::assert_eq; @@ -1521,7 +1523,7 @@ mod test { let mut input = b"".as_ref(); // ^= 1 - match $source(&mut input).read_element(buf, &mut position) $(.$await)? { + match $source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? 
{ Err(Error::Syntax(SyntaxError::UnclosedTag)) => {} x => panic!( "Expected `Err(Syntax(UnclosedTag))`, but got `{:?}`", @@ -1543,7 +1545,7 @@ mod test { // ^= 2 assert_eq!( - Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()), + Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()), Bytes(b"") ); assert_eq!(position, 2); @@ -1557,7 +1559,7 @@ mod test { // ^= 5 assert_eq!( - Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()), + Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()), Bytes(b"tag") ); assert_eq!(position, 5); @@ -1571,7 +1573,7 @@ mod test { // ^= 3 assert_eq!( - Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()), + Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()), Bytes(b":") ); assert_eq!(position, 3); @@ -1585,7 +1587,7 @@ mod test { // ^= 6 assert_eq!( - Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()), + Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()), Bytes(b":tag") ); assert_eq!(position, 6); @@ -1599,7 +1601,7 @@ mod test { // ^= 39 assert_eq!( - Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()), + Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()), Bytes(br#"tag attr-1=">" attr2 = '>' 3attr"#) ); assert_eq!(position, 39); @@ -1618,7 +1620,7 @@ mod test { // ^= 3 assert_eq!( - Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()), + Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? 
.unwrap()), Bytes(b"/") ); assert_eq!(position, 3); @@ -1632,7 +1634,7 @@ mod test { // ^= 6 assert_eq!( - Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()), + Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()), Bytes(b"tag/") ); assert_eq!(position, 6); @@ -1646,7 +1648,7 @@ mod test { // ^= 4 assert_eq!( - Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()), + Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()), Bytes(b":/") ); assert_eq!(position, 4); @@ -1660,7 +1662,7 @@ mod test { // ^= 7 assert_eq!( - Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()), + Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()), Bytes(b":tag/") ); assert_eq!(position, 7); @@ -1674,7 +1676,7 @@ mod test { // ^= 42 assert_eq!( - Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()), + Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()), Bytes(br#"tag attr-1="/>" attr2 = '/>' 3attr/"#) ); assert_eq!(position, 42); diff --git a/src/reader/pi.rs b/src/reader/pi.rs index 1b44596c..24e33d79 100644 --- a/src/reader/pi.rs +++ b/src/reader/pi.rs @@ -1,5 +1,8 @@ //! Contains a parser for an XML processing instruction. +use crate::errors::SyntaxError; +use crate::reader::Parser; + /// A parser that search a `?>` sequence in the slice. /// /// To use a parser create an instance of parser and [`feed`] data into it. @@ -15,8 +18,9 @@ /// # Example /// /// ``` -/// # use quick_xml::reader::PiParser; /// # use pretty_assertions::assert_eq; +/// use quick_xml::reader::{Parser, PiParser}; +/// /// let mut parser = PiParser::default(); /// /// // Parse ` and ?' 
inside?>and the text follow...` @@ -38,7 +42,7 @@ pub struct PiParser( pub bool, ); -impl PiParser { +impl Parser for PiParser { /// Determines the end position of a processing instruction in the provided slice. /// Processing instruction ends on the first occurrence of `?>` which cannot be /// escaped. @@ -53,7 +57,8 @@ impl PiParser { /// Should contain text in ASCII-compatible encoding /// /// [Section 2.6]: https://www.w3.org/TR/xml11/#sec-pi - pub fn feed(&mut self, bytes: &[u8]) -> Option { + #[inline] + fn feed(&mut self, bytes: &[u8]) -> Option { for i in memchr::memchr_iter(b'>', bytes) { match i { 0 if self.0 => return Some(0), @@ -65,6 +70,11 @@ impl PiParser { self.0 = bytes.last().copied() == Some(b'?'); None } + + #[inline] + fn eof_error() -> SyntaxError { + SyntaxError::UnclosedPIOrXmlDecl + } } #[test] diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs index e6e89175..8ad7e702 100644 --- a/src/reader/slice_reader.rs +++ b/src/reader/slice_reader.rs @@ -9,10 +9,10 @@ use crate::reader::EncodingRef; #[cfg(feature = "encoding")] use encoding_rs::{Encoding, UTF_8}; -use crate::errors::{Error, Result, SyntaxError}; +use crate::errors::{Error, Result}; use crate::events::Event; use crate::name::QName; -use crate::reader::{is_whitespace, BangType, ElementParser, PiParser, Reader, Span, XmlSource}; +use crate::reader::{is_whitespace, BangType, Parser, Reader, Span, XmlSource}; /// This is an implementation for reading from a `&[u8]` as underlying byte stream. /// This implementation supports not using an intermediate buffer as the byte slice @@ -275,9 +275,10 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] { } } - fn read_pi(&mut self, _buf: (), position: &mut usize) -> Result<&'a [u8]> { - let mut parser = PiParser::default(); - + fn read_with

(&mut self, mut parser: P, _buf: (), position: &mut usize) -> Result<&'a [u8]> + where + P: Parser, + { if let Some(i) = parser.feed(self) { // +1 for `>` which we do not include *position += i + 1; @@ -287,7 +288,7 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] { } *position += self.len(); - Err(Error::Syntax(SyntaxError::UnclosedPIOrXmlDecl)) + Err(Error::Syntax(P::eof_error())) } fn read_bang_element( @@ -311,21 +312,6 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] { Err(bang_type.to_err()) } - fn read_element(&mut self, _buf: (), position: &mut usize) -> Result<&'a [u8]> { - let mut parser = ElementParser::default(); - - if let Some(i) = parser.feed(self) { - // +1 for `>` which we do not include - *position += i + 1; - let bytes = &self[..i]; - *self = &self[i + 1..]; - return Ok(bytes); - } - - *position += self.len(); - Err(Error::Syntax(SyntaxError::UnclosedTag)) - } - fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> { let whitespaces = self .iter() From 31c9359cb89443cb7c3b41f19b446882af5ef78e Mon Sep 17 00:00:00 2001 From: Mingun Date: Thu, 6 Jun 2024 21:03:17 +0500 Subject: [PATCH 8/9] Increase position outside of XmlSource::skip_one --- src/reader/buffered_reader.rs | 6 ++---- src/reader/mod.rs | 7 ++++--- src/reader/slice_reader.rs | 3 +-- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs index 8585c7c4..e40a1e98 100644 --- a/src/reader/buffered_reader.rs +++ b/src/reader/buffered_reader.rs @@ -202,13 +202,12 @@ macro_rules! impl_buffered_source { } } - $($async)? fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result { + $($async)? fn skip_one(&mut self, byte: u8) -> Result { // search byte must be within the ascii range debug_assert!(byte.is_ascii()); match self.peek_one() $(.$await)? ? { Some(b) if b == byte => { - *position += 1; self $(.$reader)? .consume(1); Ok(true) } @@ -219,8 +218,7 @@ macro_rules! impl_buffered_source { $($async)? 
fn peek_one(&mut self) -> Result> { loop { break match self $(.$reader)? .fill_buf() $(.$await)? { - Ok(n) if n.is_empty() => Ok(None), - Ok(n) => Ok(Some(n[0])), + Ok(n) => Ok(n.first().cloned()), Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, Err(e) => Err(Error::Io(e.into())), }; diff --git a/src/reader/mod.rs b/src/reader/mod.rs index 0301c8c2..ff885d50 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -279,7 +279,8 @@ macro_rules! read_until_open { } // If we already at the `<` symbol, do not try to return an empty Text event - if $reader.skip_one(b'<', &mut $self.state.offset) $(.$await)? ? { + if $reader.skip_one(b'<') $(.$await)? ? { + $self.state.offset += 1; $self.state.state = ParseState::OpenedTag; // Pass $buf to the next next iteration of parsing loop return Ok(Err($buf)); @@ -889,8 +890,8 @@ trait XmlSource<'r, B> { /// `true` if it matched. /// /// # Parameters - /// - `position`: Will be increased by 1 if byte is matched - fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result; + /// - `byte`: Character to skip + fn skip_one(&mut self, byte: u8) -> Result; /// Return one character without consuming it, so that future `read_*` calls /// will still include it. On EOF, return `None`. 
diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs index 8ad7e702..6dd5546c 100644 --- a/src/reader/slice_reader.rs +++ b/src/reader/slice_reader.rs @@ -322,12 +322,11 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] { Ok(()) } - fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result { + fn skip_one(&mut self, byte: u8) -> Result { // search byte must be within the ascii range debug_assert!(byte.is_ascii()); if self.first() == Some(&byte) { *self = &self[1..]; - *position += 1; Ok(true) } else { Ok(false) From 86bc3cd300a6d7ba7ec71334cc300d7f88d617a6 Mon Sep 17 00:00:00 2001 From: Mingun Date: Sat, 8 Jun 2024 15:12:32 +0500 Subject: [PATCH 9/9] Add #[inline] to methods implementing XmlSource MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Related: #678 All methods called only once or two and inlining them in most cases increases performance of our benchmarks: > critcmp master element-parser -t 5 group element-parser master ----- -------------- ------ NsReader::read_resolved_event_into/trim_text = false 1.00 398.9±6.30µs ? ?/sec 1.05 419.6±7.94µs ? ?/sec NsReader::read_resolved_event_into/trim_text = true 1.00 382.1±7.06µs ? ?/sec 1.06 404.0±7.44µs ? ?/sec One event/CData 1.00 56.3±0.97ns ? ?/sec 1.21 68.1±1.35ns ? ?/sec One event/Comment 1.00 141.2±2.52ns ? ?/sec 1.14 161.4±2.79ns ? ?/sec decode_and_parse_document_with_namespaces/rpm_filelists.xml 1.00 95.1±1.45µs 115.5 MB/sec 1.07 102.2±1.65µs 107.5 MB/sec escape_text/escaped_chars_long 1.42 1806.4±34.20ns ? ?/sec 1.00 1275.0±23.98ns ? ?/sec escape_text/escaped_chars_short 1.00 491.5±8.35ns ? ?/sec 1.07 526.6±10.80ns ? ?/sec escape_text/no_chars_to_escape_long 2.06 1831.1±36.31ns ? ?/sec 1.00 887.1±17.00ns ? 
?/sec parse_document_nocopy_with_namespaces/libreoffice_document.fodt 1.00 507.2±8.56µs 107.6 MB/sec 1.08 546.2±10.20µs 100.0 MB/sec parse_document_nocopy_with_namespaces/rpm_filelists.xml 1.00 87.2±1.64µs 126.0 MB/sec 1.14 99.2±1.74µs 110.7 MB/sec parse_document_nocopy_with_namespaces/rpm_other.xml 1.00 139.6±2.83µs 158.5 MB/sec 1.07 148.7±2.71µs 148.9 MB/sec parse_document_nocopy_with_namespaces/rpm_primary.xml 1.00 190.5±3.43µs 106.4 MB/sec 1.09 207.9±3.79µs 97.5 MB/sec parse_document_nocopy_with_namespaces/rpm_primary2.xml 1.00 61.7±1.10µs 116.2 MB/sec 1.09 67.5±1.28µs 106.2 MB/sec parse_document_nocopy_with_namespaces/sample_1.xml 1.00 10.5±0.20µs 105.0 MB/sec 1.06 11.1±0.21µs 99.3 MB/sec parse_document_nocopy_with_namespaces/sample_ns.xml 1.00 8.4±0.16µs 86.5 MB/sec 1.08 9.0±0.18µs 80.0 MB/sec parse_document_nocopy_with_namespaces/sample_rss.xml 1.00 786.4±13.46µs 239.8 MB/sec 1.09 859.9±12.82µs 219.3 MB/sec parse_document_nocopy_with_namespaces/test_writer_ident.xml 1.00 29.0±0.55µs 146.4 MB/sec 1.06 30.8±0.55µs 138.0 MB/sec read_event/trim_text = false 1.00 199.3±3.59µs ? ?/sec 1.10 218.5±3.98µs ? ?/sec read_event/trim_text = true 1.00 190.4±3.76µs ? ?/sec 1.11 211.7±4.11µs ? ?/sec unescape_text/no_chars_to_unescape_short 1.00 11.8±0.21ns ? ?/sec 1.06 12.4±0.23ns ? ?/sec --- src/reader/buffered_reader.rs | 7 +++++++ src/reader/slice_reader.rs | 8 ++++++++ 2 files changed, 15 insertions(+) diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs index e40a1e98..1cbe3681 100644 --- a/src/reader/buffered_reader.rs +++ b/src/reader/buffered_reader.rs @@ -13,6 +13,7 @@ use crate::reader::{is_whitespace, BangType, Parser, Reader, Span, XmlSource}; macro_rules! impl_buffered_source { ($($lf:lifetime, $reader:tt, $async:ident, $await:ident)?) => { #[cfg(not(feature = "encoding"))] + #[inline] $($async)? fn remove_utf8_bom(&mut self) -> Result<()> { use crate::encoding::UTF8_BOM; @@ -31,6 +32,7 @@ macro_rules! 
impl_buffered_source { } #[cfg(feature = "encoding")] + #[inline] $($async)? fn detect_encoding(&mut self) -> Result> { loop { break match self $(.$reader)? .fill_buf() $(.$await)? { @@ -91,6 +93,7 @@ macro_rules! impl_buffered_source { Ok((&buf[start..], done)) } + #[inline] $($async)? fn read_with<$($lf,)? P: Parser>( &mut self, mut parser: P, @@ -133,6 +136,7 @@ macro_rules! impl_buffered_source { Err(Error::Syntax(P::eof_error())) } + #[inline] $($async)? fn read_bang_element $(<$lf>)? ( &mut self, buf: &'b mut Vec, @@ -183,6 +187,7 @@ macro_rules! impl_buffered_source { Err(bang_type.to_err()) } + #[inline] $($async)? fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> { loop { break match self $(.$reader)? .fill_buf() $(.$await)? { @@ -202,6 +207,7 @@ macro_rules! impl_buffered_source { } } + #[inline] $($async)? fn skip_one(&mut self, byte: u8) -> Result { // search byte must be within the ascii range debug_assert!(byte.is_ascii()); @@ -215,6 +221,7 @@ macro_rules! impl_buffered_source { } } + #[inline] $($async)? fn peek_one(&mut self) -> Result> { loop { break match self $(.$reader)? .fill_buf() $(.$await)? { diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs index 6dd5546c..d8da376d 100644 --- a/src/reader/slice_reader.rs +++ b/src/reader/slice_reader.rs @@ -237,6 +237,7 @@ impl<'a> Reader<&'a [u8]> { /// that will be borrowed by events. 
This implementation provides a zero-copy deserialization impl<'a> XmlSource<'a, ()> for &'a [u8] { #[cfg(not(feature = "encoding"))] + #[inline] fn remove_utf8_bom(&mut self) -> Result<()> { if self.starts_with(crate::encoding::UTF8_BOM) { *self = &self[crate::encoding::UTF8_BOM.len()..]; @@ -245,6 +246,7 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] { } #[cfg(feature = "encoding")] + #[inline] fn detect_encoding(&mut self) -> Result<Option<&'static Encoding>> { if let Some((enc, bom_len)) = crate::encoding::detect_encoding(self) { *self = &self[bom_len..]; @@ -253,6 +255,7 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] { Ok(None) } + #[inline] fn read_bytes_until( &mut self, byte: u8, @@ -275,6 +278,7 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] { } } + #[inline] fn read_with<P>

(&mut self, mut parser: P, _buf: (), position: &mut usize) -> Result<&'a [u8]> where P: Parser, @@ -291,6 +295,7 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] { Err(Error::Syntax(P::eof_error())) } + #[inline] fn read_bang_element( &mut self, _buf: (), @@ -312,6 +317,7 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] { Err(bang_type.to_err()) } + #[inline] fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> { let whitespaces = self .iter() @@ -322,6 +328,7 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] { Ok(()) } + #[inline] fn skip_one(&mut self, byte: u8) -> Result<bool> { // search byte must be within the ascii range debug_assert!(byte.is_ascii()); @@ -333,6 +340,7 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] { } } + #[inline] fn peek_one(&mut self) -> Result<Option<u8>> { Ok(self.first().copied()) }