diff --git a/Changelog.md b/Changelog.md index 08be8fdd..71cd2491 100644 --- a/Changelog.md +++ b/Changelog.md @@ -43,6 +43,7 @@ resolve predefined entities. - `quick_xml::escape::resolve_xml_entity` - `quick_xml::escape::resolve_html5_entity` - [#753]: Added parser for processing instructions: `quick_xml::reader::PiParser`. +- [#754]: Added parser for elements: `quick_xml::reader::ElementParser`. ### Bug Fixes @@ -101,6 +102,7 @@ resolve predefined entities. [#743]: https://github.com/tafia/quick-xml/pull/743 [#748]: https://github.com/tafia/quick-xml/pull/748 [#753]: https://github.com/tafia/quick-xml/pull/753 +[#754]: https://github.com/tafia/quick-xml/pull/754 [`DeEvent`]: https://docs.rs/quick-xml/latest/quick_xml/de/enum.DeEvent.html [`PayloadEvent`]: https://docs.rs/quick-xml/latest/quick_xml/de/enum.PayloadEvent.html [`Text`]: https://docs.rs/quick-xml/latest/quick_xml/de/struct.Text.html diff --git a/src/reader/async_tokio.rs b/src/reader/async_tokio.rs index 1cdab220..8e56346b 100644 --- a/src/reader/async_tokio.rs +++ b/src/reader/async_tokio.rs @@ -9,7 +9,7 @@ use crate::events::Event; use crate::name::{QName, ResolveResult}; use crate::reader::buffered_reader::impl_buffered_source; use crate::reader::{ - is_whitespace, BangType, NsReader, ParseState, ReadElementState, Reader, Span, + is_whitespace, BangType, ElementParser, NsReader, ParseState, Parser, PiParser, Reader, Span, }; /// A struct for read XML asynchronously from an [`AsyncBufRead`]. 
diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs index 6436de3a..1cbe3681 100644 --- a/src/reader/buffered_reader.rs +++ b/src/reader/buffered_reader.rs @@ -5,14 +5,15 @@ use std::fs::File; use std::io::{self, BufRead, BufReader}; use std::path::Path; -use crate::errors::{Error, Result, SyntaxError}; +use crate::errors::{Error, Result}; use crate::events::Event; use crate::name::QName; -use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, Span, XmlSource}; +use crate::reader::{is_whitespace, BangType, Parser, Reader, Span, XmlSource}; macro_rules! impl_buffered_source { ($($lf:lifetime, $reader:tt, $async:ident, $await:ident)?) => { #[cfg(not(feature = "encoding"))] + #[inline] $($async)? fn remove_utf8_bom(&mut self) -> Result<()> { use crate::encoding::UTF8_BOM; @@ -31,6 +32,7 @@ macro_rules! impl_buffered_source { } #[cfg(feature = "encoding")] + #[inline] $($async)? fn detect_encoding(&mut self) -> Result> { loop { break match self $(.$reader)? .fill_buf() $(.$await)? { @@ -91,49 +93,50 @@ macro_rules! impl_buffered_source { Ok((&buf[start..], done)) } - $($async)? fn read_pi $(<$lf>)? ( + #[inline] + $($async)? fn read_with<$($lf,)? P: Parser>( &mut self, + mut parser: P, buf: &'b mut Vec, position: &mut usize, - ) -> Result<(&'b [u8], bool)> { - let mut parser = super::PiParser::default(); - + ) -> Result<&'b [u8]> { let mut read = 0; - let mut done = false; let start = buf.len(); - while !done { - let used = { - let available = match self $(.$reader)? .fill_buf() $(.$await)? 
{ - Ok(n) if n.is_empty() => break, - Ok(n) => n, - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, - Err(e) => { - *position += read; - return Err(Error::Io(e.into())); - } - }; - - match parser.feed(available) { - Some(i) => { - // We does not include `>` in data - buf.extend_from_slice(&available[..i - 1]); - done = true; - i - } - None => { - buf.extend_from_slice(available); - available.len() - } + loop { + let available = match self $(.$reader)? .fill_buf() $(.$await)? { + Ok(n) if n.is_empty() => break, + Ok(n) => n, + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => { + *position += read; + return Err(Error::Io(e.into())); } }; + + if let Some(i) = parser.feed(available) { + buf.extend_from_slice(&available[..i]); + + // +1 for `>` which we do not include + self $(.$reader)? .consume(i + 1); + read += i + 1; + + *position += read; + return Ok(&buf[start..]); + } + + // The `>` symbol not yet found, continue reading + buf.extend_from_slice(available); + + let used = available.len(); self $(.$reader)? .consume(used); read += used; } - *position += read; - Ok((&buf[start..], done)) + *position += read; + Err(Error::Syntax(P::eof_error())) } + #[inline] $($async)? fn read_bang_element $(<$lf>)? ( &mut self, buf: &'b mut Vec, @@ -185,49 +188,6 @@ macro_rules! impl_buffered_source { } #[inline] - $($async)? fn read_element $(<$lf>)? ( - &mut self, - buf: &'b mut Vec, - position: &mut usize, - ) -> Result<&'b [u8]> { - let mut state = ReadElementState::Elem; - let mut read = 0; - - let start = buf.len(); - loop { - match self $(.$reader)? .fill_buf() $(.$await)? { - Ok(n) if n.is_empty() => break, - Ok(available) => { - if let Some((consumed, used)) = state.change(available) { - buf.extend_from_slice(consumed); - - self $(.$reader)? 
.consume(used); - read += used; - - // Position now just after the `>` symbol - *position += read; - return Ok(&buf[start..]); - } else { - // The `>` symbol not yet found, continue reading - buf.extend_from_slice(available); - - let used = available.len(); - self $(.$reader)? .consume(used); - read += used; - } - } - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, - Err(e) => { - *position += read; - return Err(Error::Io(e.into())); - } - }; - } - - *position += read; - Err(Error::Syntax(SyntaxError::UnclosedTag)) - } - $($async)? fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> { loop { break match self $(.$reader)? .fill_buf() $(.$await)? { @@ -247,13 +207,13 @@ macro_rules! impl_buffered_source { } } - $($async)? fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result { + #[inline] + $($async)? fn skip_one(&mut self, byte: u8) -> Result { // search byte must be within the ascii range debug_assert!(byte.is_ascii()); match self.peek_one() $(.$await)? ? { Some(b) if b == byte => { - *position += 1; self $(.$reader)? .consume(1); Ok(true) } @@ -261,11 +221,11 @@ macro_rules! impl_buffered_source { } } + #[inline] $($async)? fn peek_one(&mut self) -> Result> { loop { break match self $(.$reader)? .fill_buf() $(.$await)? { - Ok(n) if n.is_empty() => Ok(None), - Ok(n) => Ok(Some(n[0])), + Ok(n) => Ok(n.first().cloned()), Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, Err(e) => Err(Error::Io(e.into())), }; diff --git a/src/reader/element.rs b/src/reader/element.rs new file mode 100644 index 00000000..323a8311 --- /dev/null +++ b/src/reader/element.rs @@ -0,0 +1,122 @@ +//! Contains a parser for an XML element. + +use crate::errors::SyntaxError; +use crate::reader::Parser; + +/// A parser that search a `>` symbol in the slice outside of quoted regions. +/// +/// The parser considers two quoted regions: a double-quoted (`"..."`) and +/// a single-quoted (`'...'`) region. 
Matches found inside those regions are not +/// considered as results. Each region starts and ends by its quote symbol, +/// which cannot be escaped (but can be encoded as XML character entity or named +/// entity. Anyway, that encoding does not contain literal quotes). +/// +/// To use a parser create an instance of parser and [`feed`] data into it. +/// After successful search the parser will return [`Some`] with position of +/// found symbol. If search is unsuccessful, a [`None`] will be returned. You +/// typically would expect positive result of search, so that you should feed +/// new data until you get it. +/// +/// NOTE: after successful match the parser does not return to the initial +/// state and should not be used anymore. Create a new parser if you want to perform +/// new search. +/// +/// # Example +/// +/// ``` +/// # use pretty_assertions::assert_eq; +/// use quick_xml::reader::{ElementParser, Parser}; +/// +/// let mut parser = ElementParser::default(); +/// +/// // Parse `<my-element with = 'some > inside'>and the text follow...` +/// // split into three chunks +/// assert_eq!(parser.feed(b"<my-element"), None); +/// // ...get new chunk of data +/// assert_eq!(parser.feed(b" with = 'some >"), None); +/// // ...get another chunk of data +/// assert_eq!(parser.feed(b" inside'>and the text follow..."), Some(8)); +/// // ^ ^ +/// // 0 8 +/// ``` +/// +/// [`feed`]: Self::feed() +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum ElementParser { + /// The initial state (inside element, but outside of attribute value). + Outside, + /// Inside a single-quoted region (`'...'`). + SingleQ, + /// Inside a double-quoted region (`"..."`). + DoubleQ, +} + +impl Parser for ElementParser { + /// Returns the position of the closing `>` or `None` if it was not found in `bytes`.
+ #[inline] + fn feed(&mut self, bytes: &[u8]) -> Option { + for i in memchr::memchr3_iter(b'>', b'\'', b'"', bytes) { + *self = match (*self, bytes[i]) { + // only allowed to match `>` while we are in state `Outside` + (Self::Outside, b'>') => return Some(i), + (Self::Outside, b'\'') => Self::SingleQ, + (Self::Outside, b'\"') => Self::DoubleQ, + + // the only end_byte that gets us out if the same character + (Self::SingleQ, b'\'') | (Self::DoubleQ, b'"') => Self::Outside, + + // all other bytes: no state change + _ => continue, + }; + } + None + } + + #[inline] + fn eof_error() -> SyntaxError { + SyntaxError::UnclosedTag + } +} + +impl Default for ElementParser { + #[inline] + fn default() -> Self { + Self::Outside + } +} + +#[test] +fn parse() { + use pretty_assertions::assert_eq; + use ElementParser::*; + + /// Returns `Ok(pos)` with the position in the buffer where element is ended. + /// + /// Returns `Err(internal_state)` if parsing does not done yet. + fn parse_element(bytes: &[u8], mut parser: ElementParser) -> Result { + match parser.feed(bytes) { + Some(i) => Ok(i), + None => Err(parser), + } + } + + assert_eq!(parse_element(b"", Outside), Err(Outside)); + assert_eq!(parse_element(b"", SingleQ), Err(SingleQ)); + assert_eq!(parse_element(b"", DoubleQ), Err(DoubleQ)); + + assert_eq!(parse_element(b"'", Outside), Err(SingleQ)); + assert_eq!(parse_element(b"'", SingleQ), Err(Outside)); + assert_eq!(parse_element(b"'", DoubleQ), Err(DoubleQ)); + + assert_eq!(parse_element(b"\"", Outside), Err(DoubleQ)); + assert_eq!(parse_element(b"\"", SingleQ), Err(SingleQ)); + assert_eq!(parse_element(b"\"", DoubleQ), Err(Outside)); + + assert_eq!(parse_element(b">", Outside), Ok(0)); + assert_eq!(parse_element(b">", SingleQ), Err(SingleQ)); + assert_eq!(parse_element(b">", DoubleQ), Err(DoubleQ)); + + assert_eq!(parse_element(b"''>", Outside), Ok(2)); + assert_eq!(parse_element(b"''>", SingleQ), Err(SingleQ)); + assert_eq!(parse_element(b"''>", DoubleQ), Err(DoubleQ)); +} 
diff --git a/src/reader/mod.rs b/src/reader/mod.rs index 6de103af..ff885d50 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -279,7 +279,8 @@ macro_rules! read_until_open { } // If we already at the `<` symbol, do not try to return an empty Text event - if $reader.skip_one(b'<', &mut $self.state.offset) $(.$await)? ? { + if $reader.skip_one(b'<') $(.$await)? ? { + $self.state.offset += 1; $self.state.state = ParseState::OpenedTag; // Pass $buf to the next next iteration of parsing loop return Ok(Err($buf)); @@ -361,21 +362,20 @@ macro_rules! read_until_close { }, // ` match $reader - .read_pi($buf, &mut $self.state.offset) + .read_with(PiParser::default(), $buf, &mut $self.state.offset) $(.$await)? { - Ok((bytes, true)) => $self.state.emit_question_mark(bytes), - Ok((_, false)) => { + Ok(bytes) => $self.state.emit_question_mark(bytes), + Err(e) => { // We want to report error at `<`, but offset was increased, // so return it back (-1 for `<`) $self.state.last_error_offset = start - 1; - Err(Error::Syntax(SyntaxError::UnclosedPIOrXmlDecl)) + Err(e) } - Err(e) => Err(e), }, // `<...` - opening or self-closed tag Ok(Some(_)) => match $reader - .read_element($buf, &mut $self.state.offset) + .read_with(ElementParser::default(), $buf, &mut $self.state.offset) $(.$await)? { Ok(bytes) => $self.state.emit_start(bytes), @@ -427,11 +427,13 @@ macro_rules! read_to_end { #[cfg(feature = "async-tokio")] mod async_tokio; mod buffered_reader; +mod element; mod ns_reader; mod pi; mod slice_reader; mod state; +pub use element::ElementParser; pub use ns_reader::NsReader; pub use pi::PiParser; @@ -762,6 +764,26 @@ impl Reader { //////////////////////////////////////////////////////////////////////////////////////////////////// +/// Used to decouple reading of data from data source and parsing XML structure from it. +/// This is a state preserved between getting chunks of bytes from the reader. 
+/// +/// This trait is implemented for every parser that processes a piece of XML grammar. +pub trait Parser { + /// Process new data and try to determine end of the parsed thing. + /// + /// Returns position of the end of thing in `bytes` in case of successful search + /// and `None` otherwise. + /// + /// # Parameters + /// - `bytes`: a slice to find the end of a thing. + /// Should contain text in ASCII-compatible encoding + fn feed(&mut self, bytes: &[u8]) -> Option; + + /// Returns parse error produced by this parser in case of reaching end of + /// input without finding the end of a parsed thing. + fn eof_error() -> SyntaxError; +} + /// Represents an input for a reader that can return borrowed data. /// /// There are two implementors of this trait: generic one that read data from @@ -820,20 +842,25 @@ trait XmlSource<'r, B> { /// Read input until processing instruction is finished. /// - /// This method expect that ``), - /// which does not include into result (`?` at the end included). + /// Returns a slice of data read up to the end of the thing being parsed. + /// The end of the thing and the returned content are determined by the used parser. /// - /// If input (`Self`) is exhausted and nothing was read, returns `None`. + /// If input (`Self`) is exhausted and no bytes were read, or if the specified + /// parser could not find the ending sequence of the thing, returns `SyntaxError`. /// /// # Parameters /// - `buf`: Buffer that could be filled from an input (`Self`) and /// from which [events] could borrow their data /// - `position`: Will be increased by amount of bytes consumed /// + /// A `P` type parameter is used to preserve state between calls to the underlying + /// reader which provides bytes fed into the parser. /// [events]: crate::events::Event - fn read_pi(&mut self, buf: B, position: &mut usize) -> Result<(&'r [u8], bool)>; + fn read_with

(&mut self, parser: P, buf: B, position: &mut usize) -> Result<&'r [u8]> + where + P: Parser; /// Read input until comment or CDATA is finished. /// @@ -852,30 +879,6 @@ trait XmlSource<'r, B> { /// [events]: crate::events::Event fn read_bang_element(&mut self, buf: B, position: &mut usize) -> Result<(BangType, &'r [u8])>; - /// Read input until XML element is closed by approaching a `>` symbol. - /// Returns a buffer that contains a data between `<` and `>` or - /// [`SyntaxError::UnclosedTag`] if end-of-input was reached before reading `>`. - /// - /// Derived from `read_until`, but modified to handle XML attributes - /// using a minimal state machine. - /// - /// Attribute values are [defined] as follows: - /// ```plain - /// AttValue := '"' (([^<&"]) | Reference)* '"' - /// | "'" (([^<&']) | Reference)* "'" - /// ``` - /// (`Reference` is something like `"`, but we don't care about - /// escaped characters at this level) - /// - /// # Parameters - /// - `buf`: Buffer that could be filled from an input (`Self`) and - /// from which [events] could borrow their data - /// - `position`: Will be increased by amount of bytes consumed - /// - /// [defined]: https://www.w3.org/TR/xml11/#NT-AttValue - /// [events]: crate::events::Event - fn read_element(&mut self, buf: B, position: &mut usize) -> Result<&'r [u8]>; - /// Consume and discard all the whitespace until the next non-whitespace /// character or EOF. /// @@ -887,8 +890,8 @@ trait XmlSource<'r, B> { /// `true` if it matched. /// /// # Parameters - /// - `position`: Will be increased by 1 if byte is matched - fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result; + /// - `byte`: Character to skip + fn skip_one(&mut self, byte: u8) -> Result; /// Return one character without consuming it, so that future `read_*` calls /// will still include it. On EOF, return `None`. 
@@ -987,40 +990,6 @@ impl BangType { } } -/// State machine for the [`XmlSource::read_element`] -#[derive(Clone, Copy)] -enum ReadElementState { - /// The initial state (inside element, but outside of attribute value) - Elem, - /// Inside a single-quoted attribute value - SingleQ, - /// Inside a double-quoted attribute value - DoubleQ, -} -impl ReadElementState { - /// Changes state by analyzing part of input. - /// Returns a tuple with part of chunk up to element closing symbol `>` - /// and a position after that symbol or `None` if such symbol was not found - #[inline(always)] - fn change<'b>(&mut self, chunk: &'b [u8]) -> Option<(&'b [u8], usize)> { - for i in memchr::memchr3_iter(b'>', b'\'', b'"', chunk) { - *self = match (*self, chunk[i]) { - // only allowed to match `>` while we are in state `Elem` - (Self::Elem, b'>') => return Some((&chunk[..i], i + 1)), - (Self::Elem, b'\'') => Self::SingleQ, - (Self::Elem, b'\"') => Self::DoubleQ, - - // the only end_byte that gets us out if the same character - (Self::SingleQ, b'\'') | (Self::DoubleQ, b'"') => Self::Elem, - - // all other bytes: no state change - _ => *self, - }; - } - None - } -} - /// A function to check whether the byte is a whitespace (blank, new line, carriage return or tab) #[inline] pub(crate) const fn is_whitespace(b: u8) -> bool { @@ -1543,6 +1512,7 @@ mod test { mod read_element { use super::*; use crate::errors::{Error, SyntaxError}; + use crate::reader::ElementParser; use crate::utils::Bytes; use pretty_assertions::assert_eq; @@ -1554,7 +1524,7 @@ mod test { let mut input = b"".as_ref(); // ^= 1 - match $source(&mut input).read_element(buf, &mut position) $(.$await)? { + match $source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? 
{ Err(Error::Syntax(SyntaxError::UnclosedTag)) => {} x => panic!( "Expected `Err(Syntax(UnclosedTag))`, but got `{:?}`", @@ -1576,7 +1546,7 @@ mod test { // ^= 2 assert_eq!( - Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()), + Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()), Bytes(b"") ); assert_eq!(position, 2); @@ -1590,7 +1560,7 @@ mod test { // ^= 5 assert_eq!( - Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()), + Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()), Bytes(b"tag") ); assert_eq!(position, 5); @@ -1604,7 +1574,7 @@ mod test { // ^= 3 assert_eq!( - Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()), + Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()), Bytes(b":") ); assert_eq!(position, 3); @@ -1618,7 +1588,7 @@ mod test { // ^= 6 assert_eq!( - Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()), + Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()), Bytes(b":tag") ); assert_eq!(position, 6); @@ -1632,7 +1602,7 @@ mod test { // ^= 39 assert_eq!( - Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()), + Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()), Bytes(br#"tag attr-1=">" attr2 = '>' 3attr"#) ); assert_eq!(position, 39); @@ -1651,7 +1621,7 @@ mod test { // ^= 3 assert_eq!( - Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()), + Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? 
.unwrap()), Bytes(b"/") ); assert_eq!(position, 3); @@ -1665,7 +1635,7 @@ mod test { // ^= 6 assert_eq!( - Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()), + Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()), Bytes(b"tag/") ); assert_eq!(position, 6); @@ -1679,7 +1649,7 @@ mod test { // ^= 4 assert_eq!( - Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()), + Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()), Bytes(b":/") ); assert_eq!(position, 4); @@ -1693,7 +1663,7 @@ mod test { // ^= 7 assert_eq!( - Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()), + Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()), Bytes(b":tag/") ); assert_eq!(position, 7); @@ -1707,7 +1677,7 @@ mod test { // ^= 42 assert_eq!( - Bytes($source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap()), + Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()), Bytes(br#"tag attr-1="/>" attr2 = '/>' 3attr/"#) ); assert_eq!(position, 42); diff --git a/src/reader/pi.rs b/src/reader/pi.rs index 7729b3ed..24e33d79 100644 --- a/src/reader/pi.rs +++ b/src/reader/pi.rs @@ -1,12 +1,15 @@ //! Contains a parser for an XML processing instruction. +use crate::errors::SyntaxError; +use crate::reader::Parser; + /// A parser that search a `?>` sequence in the slice. /// /// To use a parser create an instance of parser and [`feed`] data into it. /// After successful search the parser will return [`Some`] with position where /// processing instruction is ended (the position after `?>`). If search was /// unsuccessful, a [`None`] will be returned. You typically would expect positive -/// result of search, so that you should feed new data until yo'll get it. 
+/// result of search, so that you should feed new data until you get it. /// /// NOTE: after successful match the parser does not returned to the initial /// state and should not be used anymore. Create a new parser if you want to perform @@ -15,8 +18,9 @@ /// # Example /// /// ``` -/// # use quick_xml::reader::PiParser; /// # use pretty_assertions::assert_eq; +/// use quick_xml::reader::{Parser, PiParser}; +/// /// let mut parser = PiParser::default(); /// /// // Parse ` and ?' inside?>and the text follow...` @@ -25,9 +29,9 @@ /// // ...get new chunk of data /// assert_eq!(parser.feed(b" with = 'some > and ?"), None); /// // ...get another chunk of data -/// assert_eq!(parser.feed(b"' inside?>and the text follow..."), Some(10)); -/// // ^ ^ -/// // 0 10 +/// assert_eq!(parser.feed(b"' inside?>and the text follow..."), Some(9)); +/// // ^ ^ +/// // 0 9 /// ``` /// /// [`feed`]: Self::feed() @@ -38,7 +42,7 @@ pub struct PiParser( pub bool, ); -impl PiParser { +impl Parser for PiParser { /// Determines the end position of a processing instruction in the provided slice. /// Processing instruction ends on the first occurrence of `?>` which cannot be /// escaped. @@ -53,20 +57,24 @@ impl PiParser { /// Should contain text in ASCII-compatible encoding /// /// [Section 2.6]: https://www.w3.org/TR/xml11/#sec-pi - pub fn feed(&mut self, bytes: &[u8]) -> Option { + #[inline] + fn feed(&mut self, bytes: &[u8]) -> Option { for i in memchr::memchr_iter(b'>', bytes) { match i { - // +1 for `>` which should be included in event - 0 if self.0 => return Some(1), + 0 if self.0 => return Some(0), // If the previous byte is `?`, then we found `?>` - // +1 for `>` which should be included in event - i if i > 0 && bytes[i - 1] == b'?' => return Some(i + 1), + i if i > 0 && bytes[i - 1] == b'?' 
=> return Some(i), _ => {} } } self.0 = bytes.last().copied() == Some(b'?'); None } + + #[inline] + fn eof_error() -> SyntaxError { + SyntaxError::UnclosedPIOrXmlDecl + } } #[test] @@ -95,11 +103,11 @@ fn pi() { assert_eq!(parse_pi(b"?", true), Err(true)); // ?|? assert_eq!(parse_pi(b">", false), Err(false)); // x|> - assert_eq!(parse_pi(b">", true), Ok(1)); // ?|> + assert_eq!(parse_pi(b">", true), Ok(0)); // ?|> - assert_eq!(parse_pi(b"?>", false), Ok(2)); // x|?> - assert_eq!(parse_pi(b"?>", true), Ok(2)); // ?|?> + assert_eq!(parse_pi(b"?>", false), Ok(1)); // x|?> + assert_eq!(parse_pi(b"?>", true), Ok(1)); // ?|?> - assert_eq!(parse_pi(b">?>", false), Ok(3)); // x|>?> - assert_eq!(parse_pi(b">?>", true), Ok(1)); // ?|>?> + assert_eq!(parse_pi(b">?>", false), Ok(2)); // x|>?> + assert_eq!(parse_pi(b">?>", true), Ok(0)); // ?|>?> } diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs index b618ae65..d8da376d 100644 --- a/src/reader/slice_reader.rs +++ b/src/reader/slice_reader.rs @@ -9,10 +9,10 @@ use crate::reader::EncodingRef; #[cfg(feature = "encoding")] use encoding_rs::{Encoding, UTF_8}; -use crate::errors::{Error, Result, SyntaxError}; +use crate::errors::{Error, Result}; use crate::events::Event; use crate::name::QName; -use crate::reader::{is_whitespace, BangType, PiParser, ReadElementState, Reader, Span, XmlSource}; +use crate::reader::{is_whitespace, BangType, Parser, Reader, Span, XmlSource}; /// This is an implementation for reading from a `&[u8]` as underlying byte stream. /// This implementation supports not using an intermediate buffer as the byte slice @@ -237,6 +237,7 @@ impl<'a> Reader<&'a [u8]> { /// that will be borrowed by events. 
This implementation provides a zero-copy deserialization impl<'a> XmlSource<'a, ()> for &'a [u8] { #[cfg(not(feature = "encoding"))] + #[inline] fn remove_utf8_bom(&mut self) -> Result<()> { if self.starts_with(crate::encoding::UTF8_BOM) { *self = &self[crate::encoding::UTF8_BOM.len()..]; @@ -245,6 +246,7 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] { } #[cfg(feature = "encoding")] + #[inline] fn detect_encoding(&mut self) -> Result> { if let Some((enc, bom_len)) = crate::encoding::detect_encoding(self) { *self = &self[bom_len..]; @@ -253,6 +255,7 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] { Ok(None) } + #[inline] fn read_bytes_until( &mut self, byte: u8, @@ -275,23 +278,24 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] { } } - fn read_pi(&mut self, _buf: (), position: &mut usize) -> Result<(&'a [u8], bool)> { - let mut parser = PiParser::default(); - + #[inline] + fn read_with

(&mut self, mut parser: P, _buf: (), position: &mut usize) -> Result<&'a [u8]> + where + P: Parser, + { if let Some(i) = parser.feed(self) { - *position += i; - // We does not include `>` in data - let bytes = &self[..i - 1]; - *self = &self[i..]; - Ok((bytes, true)) - } else { - *position += self.len(); - let bytes = &self[..]; - *self = &[]; - Ok((bytes, false)) + // +1 for `>` which we do not include + *position += i + 1; + let bytes = &self[..i]; + *self = &self[i + 1..]; + return Ok(bytes); } + + *position += self.len(); + Err(Error::Syntax(P::eof_error())) } + #[inline] fn read_bang_element( &mut self, _buf: (), @@ -313,20 +317,7 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] { Err(bang_type.to_err()) } - fn read_element(&mut self, _buf: (), position: &mut usize) -> Result<&'a [u8]> { - let mut state = ReadElementState::Elem; - - if let Some((bytes, i)) = state.change(self) { - // Position now just after the `>` symbol - *position += i; - *self = &self[i..]; - return Ok(bytes); - } - - *position += self.len(); - Err(Error::Syntax(SyntaxError::UnclosedTag)) - } - + #[inline] fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> { let whitespaces = self .iter() @@ -337,18 +328,19 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] { Ok(()) } - fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result { + #[inline] + fn skip_one(&mut self, byte: u8) -> Result { // search byte must be within the ascii range debug_assert!(byte.is_ascii()); if self.first() == Some(&byte) { *self = &self[1..]; - *position += 1; Ok(true) } else { Ok(false) } } + #[inline] fn peek_one(&mut self) -> Result> { Ok(self.first().copied()) }