Correctly detect UTF-16 encoding even without BOM

Fixes 2 tests
tafia · Aug 27, 2022 · 60dc37f · 60dc37f
1 parent 7f34520
commit 60dc37f
Show file tree

Hide file tree

Showing 7 changed files with 149 additions and 75 deletions.
diff --git a/src/encoding.rs b/src/encoding.rs
@@ -99,29 +99,14 @@ pub fn decode<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> Result<Cow<'b
         .ok_or(Error::NonDecodable(None))
 }
 
-#[cfg(feature = "encoding")]
-fn split_at_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> (&'b [u8], &'b [u8]) {
-    if encoding == UTF_8 && bytes.starts_with(UTF8_BOM) {
-        bytes.split_at(3)
-    } else if encoding == UTF_16LE && bytes.starts_with(UTF16_LE_BOM) {
-        bytes.split_at(2)
-    } else if encoding == UTF_16BE && bytes.starts_with(UTF16_BE_BOM) {
-        bytes.split_at(2)
-    } else {
-        (&[], bytes)
-    }
-}
-
-#[cfg(feature = "encoding")]
-pub(crate) fn remove_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> &'b [u8] {
-    let (_, bytes) = split_at_bom(bytes, encoding);
-    bytes
-}
-
 /// Automatic encoding detection of XML files based using the
 /// [recommended algorithm](https://www.w3.org/TR/xml11/#sec-guessing).
 ///
-/// If encoding is detected, `Some` is returned, otherwise `None` is returned.
+/// If encoding is detected, `Some` is returned with an encoding and size of BOM
+/// in bytes, if detection was performed using BOM, or zero, if detection was
+/// performed without BOM.
+///
+/// IF encoding was not recognized, `None` is returned.
 ///
 /// Because the [`encoding_rs`] crate supports only subset of those encodings, only
 /// the supported subset are detected, which is UTF-8, UTF-16 BE and UTF-16 LE.
@@ -131,25 +116,26 @@ pub(crate) fn remove_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> &'
 ///
 /// | Bytes       |Detected encoding
 /// |-------------|------------------------------------------
-/// |`FE FF ## ##`|UTF-16, big-endian
+/// | **BOM**
+/// |`FE_FF_##_##`|UTF-16, big-endian
 /// |`FF FE ## ##`|UTF-16, little-endian
 /// |`EF BB BF`   |UTF-8
-/// |-------------|------------------------------------------
+/// | **No BOM**
 /// |`00 3C 00 3F`|UTF-16 BE or ISO-10646-UCS-2 BE or similar 16-bit BE (use declared encoding to find the exact one)
 /// |`3C 00 3F 00`|UTF-16 LE or ISO-10646-UCS-2 LE or similar 16-bit LE (use declared encoding to find the exact one)
 /// |`3C 3F 78 6D`|UTF-8, ISO 646, ASCII, some part of ISO 8859, Shift-JIS, EUC, or any other 7-bit, 8-bit, or mixed-width encoding which ensures that the characters of ASCII have their normal positions, width, and values; the actual encoding declaration must be read to detect which of these applies, but since all of these encodings use the same bit patterns for the relevant ASCII characters, the encoding declaration itself may be read reliably
 #[cfg(feature = "encoding")]
-pub fn detect_encoding(bytes: &[u8]) -> Option<&'static Encoding> {
+pub fn detect_encoding(bytes: &[u8]) -> Option<(&'static Encoding, usize)> {
     match bytes {
         // with BOM
-        _ if bytes.starts_with(UTF16_BE_BOM) => Some(UTF_16BE),
-        _ if bytes.starts_with(UTF16_LE_BOM) => Some(UTF_16LE),
-        _ if bytes.starts_with(UTF8_BOM) => Some(UTF_8),
+        _ if bytes.starts_with(UTF16_BE_BOM) => Some((UTF_16BE, 2)),
+        _ if bytes.starts_with(UTF16_LE_BOM) => Some((UTF_16LE, 2)),
+        _ if bytes.starts_with(UTF8_BOM) => Some((UTF_8, 3)),
 
         // without BOM
-        _ if bytes.starts_with(&[0x00, b'<', 0x00, b'?']) => Some(UTF_16BE), // Some BE encoding, for example, UTF-16 or ISO-10646-UCS-2
-        _ if bytes.starts_with(&[b'<', 0x00, b'?', 0x00]) => Some(UTF_16LE), // Some LE encoding, for example, UTF-16 or ISO-10646-UCS-2
-        _ if bytes.starts_with(&[b'<', b'?', b'x', b'm']) => Some(UTF_8), // Some ASCII compatible
+        _ if bytes.starts_with(&[0x00, b'<', 0x00, b'?']) => Some((UTF_16BE, 0)), // Some BE encoding, for example, UTF-16 or ISO-10646-UCS-2
+        _ if bytes.starts_with(&[b'<', 0x00, b'?', 0x00]) => Some((UTF_16LE, 0)), // Some LE encoding, for example, UTF-16 or ISO-10646-UCS-2
+        _ if bytes.starts_with(&[b'<', b'?', b'x', b'm']) => Some((UTF_8, 0)), // Some ASCII compatible
 
         _ => None,
     }

diff --git a/src/reader/async_tokio.rs b/src/reader/async_tokio.rs
@@ -86,7 +86,13 @@ impl<R: AsyncBufRead + Unpin> Reader<R> {
         buf: &'b mut Vec<u8>,
     ) -> Pin<Box<dyn Future<Output = Result<Event<'b>>> + 'reader>> {
         Box::pin(async move {
-            read_event_impl!(self, buf, read_until_open_async, read_until_close_async, await)
+            read_event_impl!(
+                self, buf,
+                TokioAdapter(&mut self.reader),
+                read_until_open_async,
+                read_until_close_async,
+                await
+            )
         })
     }
 
@@ -148,15 +154,9 @@ impl<R: AsyncBufRead + Unpin> Reader<R> {
         Ok(read_to_end!(self, end, buf, read_event_into_async, { buf.clear(); }, await))
     }
 
-    /// Read until '<' is found and moves reader to an `Opened` state.
-    ///
-    /// Return a `StartText` event if `first` is `true` and a `Text` event otherwise
-    async fn read_until_open_async<'b>(
-        &mut self,
-        buf: &'b mut Vec<u8>,
-        first: bool,
-    ) -> Result<Event<'b>> {
-        read_until_open!(self, buf, first, TokioAdapter(&mut self.reader), read_event_into_async, await)
+    /// Read until '<' is found, moves reader to an `OpenedTag` state and returns a `Text` event.
+    async fn read_until_open_async<'b>(&mut self, buf: &'b mut Vec<u8>) -> Result<Event<'b>> {
+        read_until_open!(self, buf, TokioAdapter(&mut self.reader), read_event_into_async, await)
     }
 
     /// Private function to read until `>` is found. This function expects that

diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs
@@ -14,6 +14,40 @@ use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, Span, Xml
 
 macro_rules! impl_buffered_source {
     ($($lf:lifetime, $reader:tt, $async:ident, $await:ident)?) => {
+        #[cfg(not(feature = "encoding"))]
+        $($async)? fn remove_utf8_bom(&mut self) -> Result<()> {
+            use crate::encoding::UTF8_BOM;
+
+            loop {
+                break match self $(.$reader)? .fill_buf() $(.$await)? {
+                    Ok(n) => {
+                        if n.starts_with(UTF8_BOM) {
+                            self $(.$reader)? .consume(UTF8_BOM.len());
+                        }
+                        Ok(())
+                    },
+                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
+                    Err(e) => Err(Error::Io(e)),
+                };
+            }
+        }
+
+        #[cfg(feature = "encoding")]
+        $($async)? fn detect_encoding(&mut self) -> Result<Option<&'static encoding_rs::Encoding>> {
+            loop {
+                break match self $(.$reader)? .fill_buf() $(.$await)? {
+                    Ok(n) => if let Some((enc, bom_len)) = crate::encoding::detect_encoding(n) {
+                        self $(.$reader)? .consume(bom_len);
+                        Ok(Some(enc))
+                    } else {
+                        Ok(None)
+                    },
+                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
+                    Err(e) => Err(Error::Io(e)),
+                };
+            }
+        }
+
         #[inline]
         $($async)? fn read_bytes_until $(<$lf>)? (
             &mut self,
@@ -396,16 +430,14 @@ mod test {
         use pretty_assertions::assert_eq;
 
         /// Checks that encoding is detected by BOM and changed after XML declaration
+        /// BOM indicates UTF-16LE, but XML - windows-1251
         #[test]
         fn bom_detected() {
             let mut reader =
                 Reader::from_reader(b"\xFF\xFE<?xml encoding='windows-1251'?>".as_ref());
             let mut buf = Vec::new();
 
             assert_eq!(reader.decoder().encoding(), UTF_8);
-            reader.read_event_into(&mut buf).unwrap();
-            assert_eq!(reader.decoder().encoding(), UTF_16LE);
-
             reader.read_event_into(&mut buf).unwrap();
             assert_eq!(reader.decoder().encoding(), WINDOWS_1251);
 

diff --git a/src/reader/mod.rs b/src/reader/mod.rs
@@ -139,13 +139,31 @@ macro_rules! configure_methods {
 macro_rules! read_event_impl {
     (
         $self:ident, $buf:ident,
+        $reader:expr,
         $read_until_open:ident,
         $read_until_close:ident
         $(, $await:ident)?
     ) => {{
         let event = match $self.parser.state {
-            ParseState::Init => $self.$read_until_open($buf, true) $(.$await)?,
-            ParseState::ClosedTag => $self.$read_until_open($buf, false) $(.$await)?,
+            ParseState::Init => {
+                // If encoding set explicitly, we not need to detect it. For example,
+                // explicit UTF-8 set automatically if Reader was created using `from_str`.
+                // But we still need to remove BOM for consistency with no encoding
+                // feature enabled path
+                #[cfg(feature = "encoding")]
+                if let Some(encoding) = $reader.detect_encoding() $(.$await)? ? {
+                    if $self.parser.encoding.can_be_refined() {
+                        $self.parser.encoding = crate::reader::EncodingRef::BomDetected(encoding);
+                    }
+                }
+
+                // Removes UTF-8 BOM if it is present
+                #[cfg(not(feature = "encoding"))]
+                $reader.remove_utf8_bom() $(.$await)? ?;
+
+                $self.$read_until_open($buf) $(.$await)?
+            },
+            ParseState::ClosedTag => $self.$read_until_open($buf) $(.$await)?,
             ParseState::OpenedTag => $self.$read_until_close($buf) $(.$await)?,
             ParseState::Empty => $self.parser.close_expanded_empty(),
             ParseState::Exit => return Ok(Event::Eof),
@@ -160,7 +178,7 @@ macro_rules! read_event_impl {
 
 macro_rules! read_until_open {
     (
-        $self:ident, $buf:ident, $first:ident,
+        $self:ident, $buf:ident,
         $reader:expr,
         $read_event:ident
         $(, $await:ident)?
@@ -180,7 +198,7 @@ macro_rules! read_until_open {
             .read_bytes_until(b'<', $buf, &mut $self.parser.offset)
             $(.$await)?
         {
-            Ok(Some(bytes)) => $self.parser.read_text(bytes, $first),
+            Ok(Some(bytes)) => $self.parser.read_text(bytes),
             Ok(None) => Ok(Event::Eof),
             Err(e) => Err(e),
         }
@@ -557,15 +575,15 @@ impl<R> Reader<R> {
     where
         R: XmlSource<'i, B>,
     {
-        read_event_impl!(self, buf, read_until_open, read_until_close)
+        read_event_impl!(self, buf, self.reader, read_until_open, read_until_close)
     }
 
     /// Read until '<' is found, moves reader to an `OpenedTag` state and returns a `Text` event.
-    fn read_until_open<'i, B>(&mut self, buf: B, first: bool) -> Result<Event<'i>>
+    fn read_until_open<'i, B>(&mut self, buf: B) -> Result<Event<'i>>
     where
         R: XmlSource<'i, B>,
     {
-        read_until_open!(self, buf, first, self.reader, read_event_impl)
+        read_until_open!(self, buf, self.reader, read_event_impl)
     }
 
     /// Private function to read until `>` is found. This function expects that
@@ -595,6 +613,14 @@ impl<R> Reader<R> {
 /// - `B`: a type of a buffer that can be used to store data read from `Self` and
 ///   from which events can borrow
 trait XmlSource<'r, B> {
+    /// Removes UTF-8 BOM if it is present
+    #[cfg(not(feature = "encoding"))]
+    fn remove_utf8_bom(&mut self) -> Result<()>;
+
+    /// Determines encoding from the start of input and removes BOM if it is present
+    #[cfg(feature = "encoding")]
+    fn detect_encoding(&mut self) -> Result<Option<&'static Encoding>>;
+
     /// Read input until `byte` is found or end of input is reached.
     ///
     /// Returns a slice of data read up to `byte`, which does not include into result.
@@ -1579,10 +1605,39 @@ mod test {
                 use crate::reader::Reader;
                 use pretty_assertions::assert_eq;
 
+                /// When `encoding` feature is enabled, encoding should be detected
+                /// from BOM (UTF-8) and BOM should be stripped.
+                ///
+                /// When `encoding` feature is disabled, UTF-8 is assumed and BOM
+                /// character should be stripped for consistency
+                #[$test]
+                $($async)? fn bom_from_reader() {
+                    let mut reader = Reader::from_reader("\u{feff}\u{feff}".as_bytes());
+
+                    assert_eq!(
+                        reader.$read_event($buf) $(.$await)? .unwrap(),
+                        Event::Text(BytesText::from_escaped("\u{feff}"))
+                    );
+
+                    assert_eq!(
+                        reader.$read_event($buf) $(.$await)? .unwrap(),
+                        Event::Eof
+                    );
+                }
+
+                /// When parsing from &str, encoding is fixed (UTF-8), so
+                /// - when `encoding` feature is disabled, the behavior the
+                ///   same as in `bom_from_reader` text
+                /// - when `encoding` feature is enabled, the behavior should
+                ///   stay consistent, so the first BOM character is stripped
                 #[$test]
-                #[should_panic] // Failure is expected until read_until_open() is smart enough to skip over irrelevant text events.
-                $($async)? fn bom_at_start() {
-                    let mut reader = Reader::from_str("\u{feff}");
+                $($async)? fn bom_from_str() {
+                    let mut reader = Reader::from_str("\u{feff}\u{feff}");
+
+                    assert_eq!(
+                        reader.$read_event($buf) $(.$await)? .unwrap(),
+                        Event::Text(BytesText::from_escaped("\u{feff}"))
+                    );
 
                     assert_eq!(
                         reader.$read_event($buf) $(.$await)? .unwrap(),

diff --git a/src/reader/parser.rs b/src/reader/parser.rs
@@ -1,7 +1,7 @@
 #[cfg(feature = "encoding")]
 use encoding_rs::UTF_8;
 
-use crate::encoding::{self, Decoder};
+use crate::encoding::Decoder;
 use crate::errors::{Error, Result};
 use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event};
 #[cfg(feature = "encoding")]
@@ -63,11 +63,9 @@ impl Parser {
     ///
     /// # Parameters
     /// - `bytes`: data from the start of stream to the first `<` or from `>` to `<`
-    /// - `first`: if `true`, then this is the first call of that function,
-    ///   i. e. data from the start of stream and the bytes will be checked for a BOM.
     ///
     /// [`Text`]: Event::Text
-    pub fn read_text<'b>(&mut self, bytes: &'b [u8], first: bool) -> Result<Event<'b>> {
+    pub fn read_text<'b>(&mut self, bytes: &'b [u8]) -> Result<Event<'b>> {
         let mut content = bytes;
 
         if self.trim_text_end {
@@ -79,20 +77,6 @@ impl Parser {
             content = &bytes[..len];
         }
 
-        if first {
-            #[cfg(feature = "encoding")]
-            if self.encoding.can_be_refined() {
-                if let Some(encoding) = encoding::detect_encoding(bytes) {
-                    self.encoding = EncodingRef::BomDetected(encoding);
-                    content = encoding::remove_bom(content, encoding);
-                }
-            }
-            #[cfg(not(feature = "encoding"))]
-            if bytes.starts_with(encoding::UTF8_BOM) {
-                content = &bytes[encoding::UTF8_BOM.len()..];
-            }
-        }
-
         Ok(Event::Text(BytesText::wrap(content, self.decoder())))
     }
 

diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs
@@ -7,7 +7,7 @@ use std::borrow::Cow;
 #[cfg(feature = "encoding")]
 use crate::reader::EncodingRef;
 #[cfg(feature = "encoding")]
-use encoding_rs::UTF_8;
+use encoding_rs::{Encoding, UTF_8};
 
 use crate::errors::{Error, Result};
 use crate::events::Event;
@@ -236,6 +236,23 @@ impl<'a> Reader<&'a [u8]> {
 /// Implementation of `XmlSource` for `&[u8]` reader using a `Self` as buffer
 /// that will be borrowed by events. This implementation provides a zero-copy deserialization
 impl<'a> XmlSource<'a, ()> for &'a [u8] {
+    #[cfg(not(feature = "encoding"))]
+    fn remove_utf8_bom(&mut self) -> Result<()> {
+        if self.starts_with(crate::encoding::UTF8_BOM) {
+            *self = &self[crate::encoding::UTF8_BOM.len()..];
+        }
+        Ok(())
+    }
+
+    #[cfg(feature = "encoding")]
+    fn detect_encoding(&mut self) -> Result<Option<&'static Encoding>> {
+        if let Some((enc, bom_len)) = crate::encoding::detect_encoding(self) {
+            *self = &self[bom_len..];
+            return Ok(Some(enc));
+        }
+        Ok(None)
+    }
+
     fn read_bytes_until(
         &mut self,
         byte: u8,

diff --git a/tests/encodings.rs b/tests/encodings.rs
@@ -18,11 +18,11 @@ mod decode {
     #[test]
     fn test_detect_encoding() {
         // No BOM
-        assert_eq!(detect_encoding(UTF8_TEXT.as_bytes()), Some(UTF_8));
+        assert_eq!(detect_encoding(UTF8_TEXT.as_bytes()), Some((UTF_8, 0)));
         // BOM
-        assert_eq!(detect_encoding(UTF8_TEXT_WITH_BOM), Some(UTF_8));
-        assert_eq!(detect_encoding(UTF16BE_TEXT_WITH_BOM), Some(UTF_16BE));
-        assert_eq!(detect_encoding(UTF16LE_TEXT_WITH_BOM), Some(UTF_16LE));
+        assert_eq!(detect_encoding(UTF8_TEXT_WITH_BOM), Some((UTF_8, 3)));
+        assert_eq!(detect_encoding(UTF16BE_TEXT_WITH_BOM), Some((UTF_16BE, 2)));
+        assert_eq!(detect_encoding(UTF16LE_TEXT_WITH_BOM), Some((UTF_16LE, 2)));
     }
 }