
Correctly detect encoding even without BOM #465

Merged — 11 commits, Aug 27, 2022
7 changes: 5 additions & 2 deletions .gitattributes
@@ -1,6 +1,9 @@
# Unit tests assume that all xml files have unix style line endings
/tests/documents/* text eol=lf
/tests/documents/encoding/* text eol=lf

/tests/documents/utf16be.xml binary
/tests/documents/utf16le.xml binary
/tests/documents/encoding/utf16be.xml binary
/tests/documents/encoding/utf16le.xml binary
/tests/documents/encoding/utf16be-bom.xml binary
/tests/documents/encoding/utf16le-bom.xml binary
/tests/documents/sample_5_utf16bom.xml binary
4 changes: 4 additions & 0 deletions .gitmodules
@@ -0,0 +1,4 @@
[submodule "encoding"]
path = test-gen/encoding
url = https://github.com/whatwg/encoding.git
shallow = true
53 changes: 51 additions & 2 deletions Cargo.toml
@@ -53,10 +53,55 @@ async-tokio = ["tokio"]
## Currently, only ASCII-compatible encodings are supported, so, for example,
## UTF-16 will not work (therefore, `quick-xml` is not [standard compliant]).
##
## The list of supported encodings includes all encodings supported by the
## [`encoding_rs`] crate that satisfy the restriction above.
## Thus, quick-xml supports all encodings of [`encoding_rs`] except these:
## - [UTF-16BE]
## - [UTF-16LE]
## - [ISO-2022-JP]
##
## You should stop processing the document when one of those encodings is detected,
Collaborator:

The suggestion makes sense in isolation, but I don't know that we want to contribute to even more churn than necessary? It's likely to either break or be unnecessary in the very next release.

Collaborator Author:

Yes, I just want to explicitly warn users that UTF-16 and ISO-2022-JP are not supported for now. For example, parsing documents with some Chinese characters that are represented as [ASCII byte, some byte] in UTF-16BE or [some byte, ASCII byte] in UTF-16LE can confuse the parser. That can be avoided if you stop processing such documents at the very beginning.
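The byte collision described in this comment is easy to demonstrate with the standard library alone; the code points below are picked purely for illustration:

```rust
fn main() {
    // U+3C2F is a CJK character; in UTF-16BE it encodes to the two
    // bytes 0x3C 0x2F -- exactly the ASCII bytes of "</".
    // A byte-oriented, ASCII-assuming parser would see a closing tag
    // where the document only contains text.
    let mut units = [0u16; 2];
    let be_bytes: Vec<u8> = '\u{3C2F}'
        .encode_utf16(&mut units)
        .iter()
        .flat_map(|u| u.to_be_bytes())
        .collect();
    assert_eq!(be_bytes, b"</");

    // The same trap exists for UTF-16LE with a byte-swapped code point:
    // U+2F3C in little-endian order is also 0x3C 0x2F.
    let le_bytes: Vec<u8> = '\u{2F3C}'
        .encode_utf16(&mut units)
        .iter()
        .flat_map(|u| u.to_le_bytes())
        .collect();
    assert_eq!(le_bytes, b"</");
}
```

This is why bailing out on a detected UTF-16 encoding, as the doc comment recommends, is safer than letting the byte-oriented parser run.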

I want to make this change because I want to cut a release this weekend, and it is still unclear when the correct solution will be ready, so it is best to avoid using the problematic encodings for now.

I hope that this note can be removed in the next release. It should not break anything -- once correct support is implemented, users will update and remove their guard code.

Collaborator Author:

I added a remark that the restriction is temporary and will be eliminated once #158 is fixed.

Collaborator:

Thanks. I do think finishing #158 this weekend is plausible but I won't promise it.

Collaborator (@dralley, Aug 27, 2022):

Well, an intermediate "good enough to release" stage of it, in any case.

Collaborator (@dralley, Aug 28, 2022):

Or it would be, except for async. Keep forgetting about that...

## because the generated events can be wrong and may not reflect the real document structure!
##
## Because the unsupported encodings are the only ones that are not ASCII-compatible,
## you can check the encoding for ASCII compatibility to detect them:
##
## ```
## use quick_xml::events::Event;
## use quick_xml::reader::Reader;
##
## # fn to_utf16le_with_bom(string: &str) -> Vec<u8> {
## # let mut bytes = Vec::new();
## # bytes.extend_from_slice(&[0xFF, 0xFE]); // UTF-16 LE BOM
## # for ch in string.encode_utf16() {
## # bytes.extend_from_slice(&ch.to_le_bytes());
## # }
## # bytes
## # }
## let xml = to_utf16le_with_bom(r#"<?xml encoding='UTF-16'><element/>"#);
## let mut reader = Reader::from_reader(xml.as_ref());
## reader.trim_text(true);
##
## let mut buf = Vec::new();
## let mut unsupported = false;
## loop {
## if !reader.decoder().encoding().is_ascii_compatible() {
## unsupported = true;
## break;
## }
## buf.clear();
## match reader.read_event_into(&mut buf).unwrap() {
## Event::Eof => break,
## _ => {}
## }
## }
## assert_eq!(unsupported, true);
## ```
## That restriction will be eliminated once issue [#158] is resolved.
##
## [standard compliant]: https://www.w3.org/TR/xml11/#charencoding
## [UTF-16BE]: encoding_rs::UTF_16BE
## [UTF-16LE]: encoding_rs::UTF_16LE
## [ISO-2022-JP]: encoding_rs::ISO_2022_JP
## [#158]: https://github.com/tafia/quick-xml/issues/158
encoding = ["encoding_rs"]

## Enables support for recognizing all [HTML 5 entities] in [`unescape`] and
@@ -125,6 +170,10 @@ all-features = true
# See https://stackoverflow.com/questions/61417452
rustdoc-args = ["--cfg", "docs_rs"]

[[test]]
name = "encodings"
required-features = ["encoding"]
Mingun marked this conversation as resolved.

[[test]]
name = "serde_attrs"
required-features = ["serialize"]
5 changes: 2 additions & 3 deletions Changelog.md
@@ -35,8 +35,7 @@
| |`resolve`
|`event_namespace` |`resolve_element`
|`attribute_namespace` |`resolve_attribute`
- [#439]: Added utilities `detect_encoding()`, `decode()`, and `decode_with_bom_removal()`
under the `quick-xml::encoding` namespace.
- [#439]: Added utilities `detect_encoding()` and `decode()` under the `quick-xml::encoding` namespace.
- [#450]: Added support of asynchronous [tokio](https://tokio.rs/) readers
- [#455]: Change return type of all `read_to_end*` methods to return a span between tags
- [#455]: Added `Reader::read_text` method to return a raw content (including markup) between tags
@@ -112,7 +111,7 @@
- [#191]: Remove `*_without_bom` methods from the `Attributes` struct because they are useless.
Use the same-named methods without that suffix instead. Attribute values cannot contain BOM
- [#191]: Remove `Reader::decode()` and `Reader::decode_without_bom()`, they are replaced by
`Decoder::decode()` and `Decoder::decode_with_bom_removal()`.
`Decoder::decode()` and nothing.
Use `reader.decoder().decode_*(...)` instead of `reader.decode_*(...)` for now.
`Reader::encoding()` is replaced by `Decoder::encoding()` as well
- [#180]: Eliminated the differences in the decoding API when feature `encoding` enabled and when it is
118 changes: 31 additions & 87 deletions src/encoding.rs
@@ -28,7 +28,7 @@ pub(crate) const UTF16_BE_BOM: &[u8] = &[0xFE, 0xFF];
/// key is not defined or contains unknown encoding.
///
/// The library supports any UTF-8 compatible encoding that the `encoding_rs`
/// crate supports. [*UTF-16 is not supported at present*][utf16].
/// crate supports. [*UTF-16 and ISO-2022-JP are not supported at present*][utf16].
///
/// If feature `encoding` is disabled, the decoder is always UTF-8 decoder:
/// any XML declarations are ignored.
@@ -54,66 +54,38 @@ impl Decoder {
}
}

#[cfg(not(feature = "encoding"))]
impl Decoder {
/// Decodes a UTF8 slice regardless of XML declaration and ignoring BOM if
/// it is present in the `bytes`.
///
/// Returns an error in case of malformed sequences in the `bytes`.
///
/// If you instead want to use XML declared encoding, use the `encoding` feature
#[inline]
pub fn decode<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
Ok(Cow::Borrowed(std::str::from_utf8(bytes)?))
}

/// Decodes a slice regardless of XML declaration with BOM removal if
/// it is present in the `bytes`.
///
/// Returns an error in case of malformed sequences in the `bytes`.
///
/// If you instead want to use XML declared encoding, use the `encoding` feature
pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
let bytes = if bytes.starts_with(UTF8_BOM) {
&bytes[3..]
} else {
bytes
};
self.decode(bytes)
}
}

#[cfg(feature = "encoding")]
impl Decoder {
/// Returns the `Reader`s encoding.
///
/// This encoding will be used by [`decode`].
///
/// [`decode`]: Self::decode
#[cfg(feature = "encoding")]
pub const fn encoding(&self) -> &'static Encoding {
self.encoding
}

/// ## Without `encoding` feature
///
/// Decodes an UTF-8 slice regardless of XML declaration and ignoring BOM
/// if it is present in the `bytes`.
///
/// ## With `encoding` feature
///
/// Decodes specified bytes using encoding, declared in the XML, if it was
/// declared there, or UTF-8 otherwise, and ignoring BOM if it is present
/// in the `bytes`.
///
/// ----
/// Returns an error in case of malformed sequences in the `bytes`.
pub fn decode<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
decode(bytes, self.encoding)
}
#[cfg(not(feature = "encoding"))]
let decoded = Ok(Cow::Borrowed(std::str::from_utf8(bytes)?));

/// Decodes a slice with BOM removal if it is present in the `bytes` using
/// the reader encoding.
///
/// If this method called after reading XML declaration with the `"encoding"`
/// key, then this encoding is used, otherwise UTF-8 is used.
///
/// If XML declaration is absent in the XML, UTF-8 is used.
///
/// Returns an error in case of malformed sequences in the `bytes`.
pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
self.decode(remove_bom(bytes, self.encoding))
#[cfg(feature = "encoding")]
let decoded = decode(bytes, self.encoding);

decoded
}
}

@@ -127,43 +99,14 @@ pub fn decode<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> Result<Cow<'b
.ok_or(Error::NonDecodable(None))
}

/// Decodes a slice with an unknown encoding, removing the BOM if it is present
/// in the bytes.
///
/// Returns an error in case of malformed or non-representable sequences in the `bytes`.
#[cfg(feature = "encoding")]
pub fn decode_with_bom_removal<'b>(bytes: &'b [u8]) -> Result<Cow<'b, str>> {
if let Some(encoding) = detect_encoding(bytes) {
let bytes = remove_bom(bytes, encoding);
decode(bytes, encoding)
} else {
decode(bytes, UTF_8)
}
}

#[cfg(feature = "encoding")]
fn split_at_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> (&'b [u8], &'b [u8]) {
if encoding == UTF_8 && bytes.starts_with(UTF8_BOM) {
bytes.split_at(3)
} else if encoding == UTF_16LE && bytes.starts_with(UTF16_LE_BOM) {
bytes.split_at(2)
} else if encoding == UTF_16BE && bytes.starts_with(UTF16_BE_BOM) {
bytes.split_at(2)
} else {
(&[], bytes)
}
}

#[cfg(feature = "encoding")]
pub(crate) fn remove_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> &'b [u8] {
let (_, bytes) = split_at_bom(bytes, encoding);
bytes
}

/// Automatic encoding detection of XML files based on the
/// [recommended algorithm](https://www.w3.org/TR/xml11/#sec-guessing).
///
/// If encoding is detected, `Some` is returned, otherwise `None` is returned.
/// If an encoding is detected, `Some` is returned with the encoding and the size
/// of the BOM in bytes, if detection was performed using a BOM, or zero, if
/// detection was performed without a BOM.
///
/// If the encoding was not recognized, `None` is returned.
///
/// Because the [`encoding_rs`] crate supports only a subset of those encodings,
/// only the supported subset is detected, which is UTF-8, UTF-16 BE and UTF-16 LE.
@@ -173,25 +116,26 @@ pub(crate) fn remove_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> &'
///
/// | Bytes |Detected encoding
/// |-------------|------------------------------------------
/// |`FE FF ## ##`|UTF-16, big-endian
/// | **BOM**
/// |`FE_FF_##_##`|UTF-16, big-endian
Collaborator Author:

Underscores were added because otherwise the long text in the lower cells shrinks the first column and leads to text wrapping, which does not look good.

/// |`FF FE ## ##`|UTF-16, little-endian
/// |`EF BB BF` |UTF-8
/// |-------------|------------------------------------------
/// | **No BOM**
/// |`00 3C 00 3F`|UTF-16 BE or ISO-10646-UCS-2 BE or similar 16-bit BE (use declared encoding to find the exact one)
/// |`3C 00 3F 00`|UTF-16 LE or ISO-10646-UCS-2 LE or similar 16-bit LE (use declared encoding to find the exact one)
/// |`3C 3F 78 6D`|UTF-8, ISO 646, ASCII, some part of ISO 8859, Shift-JIS, EUC, or any other 7-bit, 8-bit, or mixed-width encoding which ensures that the characters of ASCII have their normal positions, width, and values; the actual encoding declaration must be read to detect which of these applies, but since all of these encodings use the same bit patterns for the relevant ASCII characters, the encoding declaration itself may be read reliably
#[cfg(feature = "encoding")]
pub fn detect_encoding(bytes: &[u8]) -> Option<&'static Encoding> {
pub fn detect_encoding(bytes: &[u8]) -> Option<(&'static Encoding, usize)> {
match bytes {
// with BOM
_ if bytes.starts_with(UTF16_BE_BOM) => Some(UTF_16BE),
_ if bytes.starts_with(UTF16_LE_BOM) => Some(UTF_16LE),
_ if bytes.starts_with(UTF8_BOM) => Some(UTF_8),
_ if bytes.starts_with(UTF16_BE_BOM) => Some((UTF_16BE, 2)),
_ if bytes.starts_with(UTF16_LE_BOM) => Some((UTF_16LE, 2)),
_ if bytes.starts_with(UTF8_BOM) => Some((UTF_8, 3)),

// without BOM
_ if bytes.starts_with(&[0x00, b'<', 0x00, b'?']) => Some(UTF_16BE), // Some BE encoding, for example, UTF-16 or ISO-10646-UCS-2
_ if bytes.starts_with(&[b'<', 0x00, b'?', 0x00]) => Some(UTF_16LE), // Some LE encoding, for example, UTF-16 or ISO-10646-UCS-2
_ if bytes.starts_with(&[b'<', b'?', b'x', b'm']) => Some(UTF_8), // Some ASCII compatible
_ if bytes.starts_with(&[0x00, b'<', 0x00, b'?']) => Some((UTF_16BE, 0)), // Some BE encoding, for example, UTF-16 or ISO-10646-UCS-2
_ if bytes.starts_with(&[b'<', 0x00, b'?', 0x00]) => Some((UTF_16LE, 0)), // Some LE encoding, for example, UTF-16 or ISO-10646-UCS-2
_ if bytes.starts_with(&[b'<', b'?', b'x', b'm']) => Some((UTF_8, 0)), // Some ASCII compatible

_ => None,
}
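The detection table above can be sketched with the standard library alone; this hypothetical re-implementation returns encoding names instead of `encoding_rs` handles, together with the BOM length in bytes (zero when detection used the `<?xml` bytes rather than a BOM):

```rust
// Minimal sketch of the W3C "guessing" table; names stand in for
// `encoding_rs` handles, which the real function returns.
fn detect(bytes: &[u8]) -> Option<(&'static str, usize)> {
    match bytes {
        // With BOM
        _ if bytes.starts_with(&[0xFE, 0xFF]) => Some(("UTF-16BE", 2)),
        _ if bytes.starts_with(&[0xFF, 0xFE]) => Some(("UTF-16LE", 2)),
        _ if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) => Some(("UTF-8", 3)),
        // Without BOM: look at the first bytes of `<?xml`
        _ if bytes.starts_with(&[0x00, b'<', 0x00, b'?']) => Some(("UTF-16BE", 0)),
        _ if bytes.starts_with(&[b'<', 0x00, b'?', 0x00]) => Some(("UTF-16LE", 0)),
        _ if bytes.starts_with(b"<?xm") => Some(("UTF-8", 0)),
        _ => None,
    }
}

fn main() {
    assert_eq!(detect(b"\xFF\xFE<\x00?\x00"), Some(("UTF-16LE", 2)));
    assert_eq!(detect(b"<?xml version='1.0'?>"), Some(("UTF-8", 0)));
    assert_eq!(detect(b"\x00<\x00?"), Some(("UTF-16BE", 0)));
    assert_eq!(detect(b"hello"), None);
}
```

The BOM arms must come first: a BOM byte sequence can also be followed by `<?xml` bytes, and the returned length tells the caller how many bytes to consume before parsing.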
20 changes: 10 additions & 10 deletions src/reader/async_tokio.rs
@@ -86,7 +86,13 @@ impl<R: AsyncBufRead + Unpin> Reader<R> {
buf: &'b mut Vec<u8>,
) -> Pin<Box<dyn Future<Output = Result<Event<'b>>> + 'reader>> {
Box::pin(async move {
read_event_impl!(self, buf, read_until_open_async, read_until_close_async, await)
read_event_impl!(
self, buf,
TokioAdapter(&mut self.reader),
read_until_open_async,
read_until_close_async,
await
)
})
}

@@ -148,15 +154,9 @@ impl<R: AsyncBufRead + Unpin> Reader<R> {
Ok(read_to_end!(self, end, buf, read_event_into_async, { buf.clear(); }, await))
}

/// Read until '<' is found and moves reader to an `Opened` state.
///
/// Return a `StartText` event if `first` is `true` and a `Text` event otherwise
async fn read_until_open_async<'b>(
&mut self,
buf: &'b mut Vec<u8>,
first: bool,
) -> Result<Event<'b>> {
read_until_open!(self, buf, first, TokioAdapter(&mut self.reader), read_event_into_async, await)
/// Read until '<' is found, moves reader to an `OpenedTag` state and returns a `Text` event.
async fn read_until_open_async<'b>(&mut self, buf: &'b mut Vec<u8>) -> Result<Event<'b>> {
read_until_open!(self, buf, TokioAdapter(&mut self.reader), read_event_into_async, await)
}

/// Private function to read until `>` is found. This function expects that
38 changes: 35 additions & 3 deletions src/reader/buffered_reader.rs
@@ -14,6 +14,40 @@ use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, Span, Xml

macro_rules! impl_buffered_source {
($($lf:lifetime, $reader:tt, $async:ident, $await:ident)?) => {
#[cfg(not(feature = "encoding"))]
$($async)? fn remove_utf8_bom(&mut self) -> Result<()> {
use crate::encoding::UTF8_BOM;

loop {
break match self $(.$reader)? .fill_buf() $(.$await)? {
Ok(n) => {
if n.starts_with(UTF8_BOM) {
self $(.$reader)? .consume(UTF8_BOM.len());
}
Ok(())
},
Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
Err(e) => Err(Error::Io(e)),
};
}
}

#[cfg(feature = "encoding")]
$($async)? fn detect_encoding(&mut self) -> Result<Option<&'static encoding_rs::Encoding>> {
loop {
break match self $(.$reader)? .fill_buf() $(.$await)? {
Ok(n) => if let Some((enc, bom_len)) = crate::encoding::detect_encoding(n) {
self $(.$reader)? .consume(bom_len);
Ok(Some(enc))
} else {
Ok(None)
},
Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
Err(e) => Err(Error::Io(e)),
};
}
}
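Both helpers above use the same `fill_buf`/`consume` retry pattern. Stripped of the macro machinery and of quick-xml's own error type (the function name and simplified error handling here are illustrative), it can be sketched against plain `std::io::BufRead`:

```rust
use std::io::{self, BufRead, BufReader, Read};

const UTF8_BOM: &[u8] = &[0xEF, 0xBB, 0xBF];

/// Skip a UTF-8 BOM at the start of the stream, if present.
/// Retries on `Interrupted`, just like the macro version above.
fn remove_utf8_bom<R: BufRead>(reader: &mut R) -> io::Result<()> {
    loop {
        match reader.fill_buf() {
            Ok(buf) => {
                if buf.starts_with(UTF8_BOM) {
                    // Mark the BOM bytes as read so parsing starts after them
                    reader.consume(UTF8_BOM.len());
                }
                return Ok(());
            }
            // A read may be interrupted by a signal; simply try again
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
}

fn main() -> io::Result<()> {
    let mut reader = BufReader::new(&b"\xEF\xBB\xBF<root/>"[..]);
    remove_utf8_bom(&mut reader)?;
    let mut rest = String::new();
    reader.read_to_string(&mut rest)?;
    assert_eq!(rest, "<root/>");
    Ok(())
}
```

`fill_buf` only peeks at buffered bytes; nothing is lost when the BOM is absent, because `consume` is called only after a successful match.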

#[inline]
$($async)? fn read_bytes_until $(<$lf>)? (
&mut self,
@@ -396,16 +430,14 @@ mod test {
use pretty_assertions::assert_eq;

/// Checks that the encoding is detected from the BOM and changed after the XML declaration.
/// The BOM indicates UTF-16LE, but the XML declaration says windows-1251.
#[test]
fn bom_detected() {
let mut reader =
Reader::from_reader(b"\xFF\xFE<?xml encoding='windows-1251'?>".as_ref());
let mut buf = Vec::new();

assert_eq!(reader.decoder().encoding(), UTF_8);
reader.read_event_into(&mut buf).unwrap();
assert_eq!(reader.decoder().encoding(), UTF_16LE);

reader.read_event_into(&mut buf).unwrap();
assert_eq!(reader.decoder().encoding(), WINDOWS_1251);
