From 60dc37f0ffaffb5680d9c120adec669e7351ea1a Mon Sep 17 00:00:00 2001 From: Mingun Date: Mon, 15 Aug 2022 21:15:51 +0500 Subject: [PATCH] Correctly detect UTF-16 encoding even without BOM Fixes 2 tests --- src/encoding.rs | 44 +++++++------------- src/reader/async_tokio.rs | 20 +++++----- src/reader/buffered_reader.rs | 38 ++++++++++++++++-- src/reader/mod.rs | 75 ++++++++++++++++++++++++++++++----- src/reader/parser.rs | 20 +--------- src/reader/slice_reader.rs | 19 ++++++++- tests/encodings.rs | 8 ++-- 7 files changed, 149 insertions(+), 75 deletions(-) diff --git a/src/encoding.rs b/src/encoding.rs index ce81f0d2..6568812f 100644 --- a/src/encoding.rs +++ b/src/encoding.rs @@ -99,29 +99,14 @@ pub fn decode<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> Result(bytes: &'b [u8], encoding: &'static Encoding) -> (&'b [u8], &'b [u8]) { - if encoding == UTF_8 && bytes.starts_with(UTF8_BOM) { - bytes.split_at(3) - } else if encoding == UTF_16LE && bytes.starts_with(UTF16_LE_BOM) { - bytes.split_at(2) - } else if encoding == UTF_16BE && bytes.starts_with(UTF16_BE_BOM) { - bytes.split_at(2) - } else { - (&[], bytes) - } -} - -#[cfg(feature = "encoding")] -pub(crate) fn remove_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> &'b [u8] { - let (_, bytes) = split_at_bom(bytes, encoding); - bytes -} - /// Automatic encoding detection of XML files based using the /// [recommended algorithm](https://www.w3.org/TR/xml11/#sec-guessing). /// -/// If encoding is detected, `Some` is returned, otherwise `None` is returned. +/// If encoding is detected, `Some` is returned with an encoding and size of BOM +/// in bytes, if detection was performed using BOM, or zero, if detection was +/// performed without BOM. +/// +/// IF encoding was not recognized, `None` is returned. /// /// Because the [`encoding_rs`] crate supports only subset of those encodings, only /// the supported subset are detected, which is UTF-8, UTF-16 BE and UTF-16 LE. @@ -131,25 +116,26 @@ pub(crate) fn remove_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> &' /// /// | Bytes |Detected encoding /// |-------------|------------------------------------------ -/// |`FE FF ## ##`|UTF-16, big-endian +/// | **BOM** +/// |`FE_FF_##_##`|UTF-16, big-endian /// |`FF FE ## ##`|UTF-16, little-endian /// |`EF BB BF` |UTF-8 -/// |-------------|------------------------------------------ +/// | **No BOM** /// |`00 3C 00 3F`|UTF-16 BE or ISO-10646-UCS-2 BE or similar 16-bit BE (use declared encoding to find the exact one) /// |`3C 00 3F 00`|UTF-16 LE or ISO-10646-UCS-2 LE or similar 16-bit LE (use declared encoding to find the exact one) /// |`3C 3F 78 6D`|UTF-8, ISO 646, ASCII, some part of ISO 8859, Shift-JIS, EUC, or any other 7-bit, 8-bit, or mixed-width encoding which ensures that the characters of ASCII have their normal positions, width, and values; the actual encoding declaration must be read to detect which of these applies, but since all of these encodings use the same bit patterns for the relevant ASCII characters, the encoding declaration itself may be read reliably #[cfg(feature = "encoding")] -pub fn detect_encoding(bytes: &[u8]) -> Option<&'static Encoding> { +pub fn detect_encoding(bytes: &[u8]) -> Option<(&'static Encoding, usize)> { match bytes { // with BOM - _ if bytes.starts_with(UTF16_BE_BOM) => Some(UTF_16BE), - _ if bytes.starts_with(UTF16_LE_BOM) => Some(UTF_16LE), - _ if bytes.starts_with(UTF8_BOM) => Some(UTF_8), + _ if bytes.starts_with(UTF16_BE_BOM) => Some((UTF_16BE, 2)), + _ if bytes.starts_with(UTF16_LE_BOM) => Some((UTF_16LE, 2)), + _ if bytes.starts_with(UTF8_BOM) => Some((UTF_8, 3)), // without BOM - _ if bytes.starts_with(&[0x00, b'<', 0x00, b'?']) => Some(UTF_16BE), // Some BE encoding, for example, UTF-16 or ISO-10646-UCS-2 - _ if bytes.starts_with(&[b'<', 0x00, b'?', 0x00]) => Some(UTF_16LE), // Some LE encoding, for example, UTF-16 or ISO-10646-UCS-2 - _ if bytes.starts_with(&[b'<', b'?', b'x', b'm']) => Some(UTF_8), // Some ASCII compatible + _ if bytes.starts_with(&[0x00, b'<', 0x00, b'?']) => Some((UTF_16BE, 0)), // Some BE encoding, for example, UTF-16 or ISO-10646-UCS-2 + _ if bytes.starts_with(&[b'<', 0x00, b'?', 0x00]) => Some((UTF_16LE, 0)), // Some LE encoding, for example, UTF-16 or ISO-10646-UCS-2 + _ if bytes.starts_with(&[b'<', b'?', b'x', b'm']) => Some((UTF_8, 0)), // Some ASCII compatible _ => None, } diff --git a/src/reader/async_tokio.rs b/src/reader/async_tokio.rs index 07719eda..570a8814 100644 --- a/src/reader/async_tokio.rs +++ b/src/reader/async_tokio.rs @@ -86,7 +86,13 @@ impl Reader { buf: &'b mut Vec, ) -> Pin>> + 'reader>> { Box::pin(async move { - read_event_impl!(self, buf, read_until_open_async, read_until_close_async, await) + read_event_impl!( + self, buf, + TokioAdapter(&mut self.reader), + read_until_open_async, + read_until_close_async, + await + ) }) } @@ -148,15 +154,9 @@ impl Reader { Ok(read_to_end!(self, end, buf, read_event_into_async, { buf.clear(); }, await)) } - /// Read until '<' is found and moves reader to an `Opened` state. - /// - /// Return a `StartText` event if `first` is `true` and a `Text` event otherwise - async fn read_until_open_async<'b>( - &mut self, - buf: &'b mut Vec, - first: bool, - ) -> Result> { - read_until_open!(self, buf, first, TokioAdapter(&mut self.reader), read_event_into_async, await) + /// Read until '<' is found, moves reader to an `OpenedTag` state and returns a `Text` event. + async fn read_until_open_async<'b>(&mut self, buf: &'b mut Vec) -> Result> { + read_until_open!(self, buf, TokioAdapter(&mut self.reader), read_event_into_async, await) } /// Private function to read until `>` is found. This function expects that diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs index dea9f638..4f83b6a2 100644 --- a/src/reader/buffered_reader.rs +++ b/src/reader/buffered_reader.rs @@ -14,6 +14,40 @@ use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, Span, Xml macro_rules! impl_buffered_source { ($($lf:lifetime, $reader:tt, $async:ident, $await:ident)?) => { + #[cfg(not(feature = "encoding"))] + $($async)? fn remove_utf8_bom(&mut self) -> Result<()> { + use crate::encoding::UTF8_BOM; + + loop { + break match self $(.$reader)? .fill_buf() $(.$await)? { + Ok(n) => { + if n.starts_with(UTF8_BOM) { + self $(.$reader)? .consume(UTF8_BOM.len()); + } + Ok(()) + }, + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => Err(Error::Io(e)), + }; + } + } + + #[cfg(feature = "encoding")] + $($async)? fn detect_encoding(&mut self) -> Result> { + loop { + break match self $(.$reader)? .fill_buf() $(.$await)? { + Ok(n) => if let Some((enc, bom_len)) = crate::encoding::detect_encoding(n) { + self $(.$reader)? .consume(bom_len); + Ok(Some(enc)) + } else { + Ok(None) + }, + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => Err(Error::Io(e)), + }; + } + } + #[inline] $($async)? fn read_bytes_until $(<$lf>)? ( &mut self, @@ -396,6 +430,7 @@ mod test { use pretty_assertions::assert_eq; /// Checks that encoding is detected by BOM and changed after XML declaration + /// BOM indicates UTF-16LE, but XML - windows-1251 #[test] fn bom_detected() { let mut reader = @@ -403,9 +438,6 @@ mod test { let mut buf = Vec::new(); assert_eq!(reader.decoder().encoding(), UTF_8); - reader.read_event_into(&mut buf).unwrap(); - assert_eq!(reader.decoder().encoding(), UTF_16LE); - reader.read_event_into(&mut buf).unwrap(); assert_eq!(reader.decoder().encoding(), WINDOWS_1251); diff --git a/src/reader/mod.rs b/src/reader/mod.rs index 837c0511..bc448300 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -139,13 +139,31 @@ macro_rules! configure_methods { macro_rules! read_event_impl { ( $self:ident, $buf:ident, + $reader:expr, $read_until_open:ident, $read_until_close:ident $(, $await:ident)? ) => {{ let event = match $self.parser.state { - ParseState::Init => $self.$read_until_open($buf, true) $(.$await)?, - ParseState::ClosedTag => $self.$read_until_open($buf, false) $(.$await)?, + ParseState::Init => { + // If encoding set explicitly, we not need to detect it. For example, + // explicit UTF-8 set automatically if Reader was created using `from_str`. + // But we still need to remove BOM for consistency with no encoding + // feature enabled path + #[cfg(feature = "encoding")] + if let Some(encoding) = $reader.detect_encoding() $(.$await)? ? { + if $self.parser.encoding.can_be_refined() { + $self.parser.encoding = crate::reader::EncodingRef::BomDetected(encoding); + } + } + + // Removes UTF-8 BOM if it is present + #[cfg(not(feature = "encoding"))] + $reader.remove_utf8_bom() $(.$await)? ?; + + $self.$read_until_open($buf) $(.$await)? + }, + ParseState::ClosedTag => $self.$read_until_open($buf) $(.$await)?, ParseState::OpenedTag => $self.$read_until_close($buf) $(.$await)?, ParseState::Empty => $self.parser.close_expanded_empty(), ParseState::Exit => return Ok(Event::Eof), @@ -160,7 +178,7 @@ macro_rules! read_event_impl { macro_rules! read_until_open { ( - $self:ident, $buf:ident, $first:ident, + $self:ident, $buf:ident, $reader:expr, $read_event:ident $(, $await:ident)? @@ -180,7 +198,7 @@ macro_rules! read_until_open { .read_bytes_until(b'<', $buf, &mut $self.parser.offset) $(.$await)? { - Ok(Some(bytes)) => $self.parser.read_text(bytes, $first), + Ok(Some(bytes)) => $self.parser.read_text(bytes), Ok(None) => Ok(Event::Eof), Err(e) => Err(e), } @@ -557,15 +575,15 @@ impl Reader { where R: XmlSource<'i, B>, { - read_event_impl!(self, buf, read_until_open, read_until_close) + read_event_impl!(self, buf, self.reader, read_until_open, read_until_close) } /// Read until '<' is found, moves reader to an `OpenedTag` state and returns a `Text` event. - fn read_until_open<'i, B>(&mut self, buf: B, first: bool) -> Result> + fn read_until_open<'i, B>(&mut self, buf: B) -> Result> where R: XmlSource<'i, B>, { - read_until_open!(self, buf, first, self.reader, read_event_impl) + read_until_open!(self, buf, self.reader, read_event_impl) } /// Private function to read until `>` is found. This function expects that @@ -595,6 +613,14 @@ impl Reader { /// - `B`: a type of a buffer that can be used to store data read from `Self` and /// from which events can borrow trait XmlSource<'r, B> { + /// Removes UTF-8 BOM if it is present + #[cfg(not(feature = "encoding"))] + fn remove_utf8_bom(&mut self) -> Result<()>; + + /// Determines encoding from the start of input and removes BOM if it is present + #[cfg(feature = "encoding")] + fn detect_encoding(&mut self) -> Result>; + /// Read input until `byte` is found or end of input is reached. /// /// Returns a slice of data read up to `byte`, which does not include into result. @@ -1579,10 +1605,39 @@ mod test { use crate::reader::Reader; use pretty_assertions::assert_eq; + /// When `encoding` feature is enabled, encoding should be detected + /// from BOM (UTF-8) and BOM should be stripped. + /// + /// When `encoding` feature is disabled, UTF-8 is assumed and BOM + /// character should be stripped for consistency + #[$test] + $($async)? fn bom_from_reader() { + let mut reader = Reader::from_reader("\u{feff}\u{feff}".as_bytes()); + + assert_eq!( + reader.$read_event($buf) $(.$await)? .unwrap(), + Event::Text(BytesText::from_escaped("\u{feff}")) + ); + + assert_eq!( + reader.$read_event($buf) $(.$await)? .unwrap(), + Event::Eof + ); + } + + /// When parsing from &str, encoding is fixed (UTF-8), so + /// - when `encoding` feature is disabled, the behavior the + /// same as in `bom_from_reader` text + /// - when `encoding` feature is enabled, the behavior should + /// stay consistent, so the first BOM character is stripped #[$test] - #[should_panic] // Failure is expected until read_until_open() is smart enough to skip over irrelevant text events. - $($async)? fn bom_at_start() { - let mut reader = Reader::from_str("\u{feff}"); + $($async)? fn bom_from_str() { + let mut reader = Reader::from_str("\u{feff}\u{feff}"); + + assert_eq!( + reader.$read_event($buf) $(.$await)? .unwrap(), + Event::Text(BytesText::from_escaped("\u{feff}")) + ); assert_eq!( reader.$read_event($buf) $(.$await)? .unwrap(), diff --git a/src/reader/parser.rs b/src/reader/parser.rs index 9c146f9c..7c3e9eba 100644 --- a/src/reader/parser.rs +++ b/src/reader/parser.rs @@ -1,7 +1,7 @@ #[cfg(feature = "encoding")] use encoding_rs::UTF_8; -use crate::encoding::{self, Decoder}; +use crate::encoding::Decoder; use crate::errors::{Error, Result}; use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event}; #[cfg(feature = "encoding")] @@ -63,11 +63,9 @@ impl Parser { /// /// # Parameters /// - `bytes`: data from the start of stream to the first `<` or from `>` to `<` - /// - `first`: if `true`, then this is the first call of that function, - /// i. e. data from the start of stream and the bytes will be checked for a BOM. /// /// [`Text`]: Event::Text - pub fn read_text<'b>(&mut self, bytes: &'b [u8], first: bool) -> Result> { + pub fn read_text<'b>(&mut self, bytes: &'b [u8]) -> Result> { let mut content = bytes; if self.trim_text_end { @@ -79,20 +77,6 @@ impl Parser { content = &bytes[..len]; } - if first { - #[cfg(feature = "encoding")] - if self.encoding.can_be_refined() { - if let Some(encoding) = encoding::detect_encoding(bytes) { - self.encoding = EncodingRef::BomDetected(encoding); - content = encoding::remove_bom(content, encoding); - } - } - #[cfg(not(feature = "encoding"))] - if bytes.starts_with(encoding::UTF8_BOM) { - content = &bytes[encoding::UTF8_BOM.len()..]; - } - } - Ok(Event::Text(BytesText::wrap(content, self.decoder()))) } diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs index 6416021d..b90fa524 100644 --- a/src/reader/slice_reader.rs +++ b/src/reader/slice_reader.rs @@ -7,7 +7,7 @@ use std::borrow::Cow; #[cfg(feature = "encoding")] use crate::reader::EncodingRef; #[cfg(feature = "encoding")] -use encoding_rs::UTF_8; +use encoding_rs::{Encoding, UTF_8}; use crate::errors::{Error, Result}; use crate::events::Event; @@ -236,6 +236,23 @@ impl<'a> Reader<&'a [u8]> { /// Implementation of `XmlSource` for `&[u8]` reader using a `Self` as buffer /// that will be borrowed by events. This implementation provides a zero-copy deserialization impl<'a> XmlSource<'a, ()> for &'a [u8] { + #[cfg(not(feature = "encoding"))] + fn remove_utf8_bom(&mut self) -> Result<()> { + if self.starts_with(crate::encoding::UTF8_BOM) { + *self = &self[crate::encoding::UTF8_BOM.len()..]; + } + Ok(()) + } + + #[cfg(feature = "encoding")] + fn detect_encoding(&mut self) -> Result> { + if let Some((enc, bom_len)) = crate::encoding::detect_encoding(self) { + *self = &self[bom_len..]; + return Ok(Some(enc)); + } + Ok(None) + } + fn read_bytes_until( &mut self, byte: u8, diff --git a/tests/encodings.rs b/tests/encodings.rs index 1fdf3b21..fa721e93 100644 --- a/tests/encodings.rs +++ b/tests/encodings.rs @@ -18,11 +18,11 @@ mod decode { #[test] fn test_detect_encoding() { // No BOM - assert_eq!(detect_encoding(UTF8_TEXT.as_bytes()), Some(UTF_8)); + assert_eq!(detect_encoding(UTF8_TEXT.as_bytes()), Some((UTF_8, 0))); // BOM - assert_eq!(detect_encoding(UTF8_TEXT_WITH_BOM), Some(UTF_8)); - assert_eq!(detect_encoding(UTF16BE_TEXT_WITH_BOM), Some(UTF_16BE)); - assert_eq!(detect_encoding(UTF16LE_TEXT_WITH_BOM), Some(UTF_16LE)); + assert_eq!(detect_encoding(UTF8_TEXT_WITH_BOM), Some((UTF_8, 3))); + assert_eq!(detect_encoding(UTF16BE_TEXT_WITH_BOM), Some((UTF_16BE, 2))); + assert_eq!(detect_encoding(UTF16LE_TEXT_WITH_BOM), Some((UTF_16LE, 2))); } }