Skip to content

Commit

Permalink
Correctly detect UTF-16 encoding even without BOM
Browse files Browse the repository at this point in the history
Fixes 2 tests
  • Loading branch information
Mingun authored and dralley committed Aug 27, 2022
1 parent 7f34520 commit 60dc37f
Show file tree
Hide file tree
Showing 7 changed files with 149 additions and 75 deletions.
44 changes: 15 additions & 29 deletions src/encoding.rs
Original file line number Diff line number Diff line change
Expand Up @@ -99,29 +99,14 @@ pub fn decode<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> Result<Cow<'b
.ok_or(Error::NonDecodable(None))
}

#[cfg(feature = "encoding")]
fn split_at_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> (&'b [u8], &'b [u8]) {
if encoding == UTF_8 && bytes.starts_with(UTF8_BOM) {
bytes.split_at(3)
} else if encoding == UTF_16LE && bytes.starts_with(UTF16_LE_BOM) {
bytes.split_at(2)
} else if encoding == UTF_16BE && bytes.starts_with(UTF16_BE_BOM) {
bytes.split_at(2)
} else {
(&[], bytes)
}
}

#[cfg(feature = "encoding")]
pub(crate) fn remove_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> &'b [u8] {
let (_, bytes) = split_at_bom(bytes, encoding);
bytes
}

/// Automatic encoding detection of XML files based using the
/// [recommended algorithm](https://www.w3.org/TR/xml11/#sec-guessing).
///
/// If encoding is detected, `Some` is returned, otherwise `None` is returned.
/// If encoding is detected, `Some` is returned with an encoding and size of BOM
/// in bytes, if detection was performed using BOM, or zero, if detection was
/// performed without BOM.
///
/// IF encoding was not recognized, `None` is returned.
///
/// Because the [`encoding_rs`] crate supports only subset of those encodings, only
/// the supported subset are detected, which is UTF-8, UTF-16 BE and UTF-16 LE.
Expand All @@ -131,25 +116,26 @@ pub(crate) fn remove_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> &'
///
/// | Bytes |Detected encoding
/// |-------------|------------------------------------------
/// |`FE FF ## ##`|UTF-16, big-endian
/// | **BOM**
/// |`FE_FF_##_##`|UTF-16, big-endian
/// |`FF FE ## ##`|UTF-16, little-endian
/// |`EF BB BF` |UTF-8
/// |-------------|------------------------------------------
/// | **No BOM**
/// |`00 3C 00 3F`|UTF-16 BE or ISO-10646-UCS-2 BE or similar 16-bit BE (use declared encoding to find the exact one)
/// |`3C 00 3F 00`|UTF-16 LE or ISO-10646-UCS-2 LE or similar 16-bit LE (use declared encoding to find the exact one)
/// |`3C 3F 78 6D`|UTF-8, ISO 646, ASCII, some part of ISO 8859, Shift-JIS, EUC, or any other 7-bit, 8-bit, or mixed-width encoding which ensures that the characters of ASCII have their normal positions, width, and values; the actual encoding declaration must be read to detect which of these applies, but since all of these encodings use the same bit patterns for the relevant ASCII characters, the encoding declaration itself may be read reliably
#[cfg(feature = "encoding")]
pub fn detect_encoding(bytes: &[u8]) -> Option<&'static Encoding> {
pub fn detect_encoding(bytes: &[u8]) -> Option<(&'static Encoding, usize)> {
match bytes {
// with BOM
_ if bytes.starts_with(UTF16_BE_BOM) => Some(UTF_16BE),
_ if bytes.starts_with(UTF16_LE_BOM) => Some(UTF_16LE),
_ if bytes.starts_with(UTF8_BOM) => Some(UTF_8),
_ if bytes.starts_with(UTF16_BE_BOM) => Some((UTF_16BE, 2)),
_ if bytes.starts_with(UTF16_LE_BOM) => Some((UTF_16LE, 2)),
_ if bytes.starts_with(UTF8_BOM) => Some((UTF_8, 3)),

// without BOM
_ if bytes.starts_with(&[0x00, b'<', 0x00, b'?']) => Some(UTF_16BE), // Some BE encoding, for example, UTF-16 or ISO-10646-UCS-2
_ if bytes.starts_with(&[b'<', 0x00, b'?', 0x00]) => Some(UTF_16LE), // Some LE encoding, for example, UTF-16 or ISO-10646-UCS-2
_ if bytes.starts_with(&[b'<', b'?', b'x', b'm']) => Some(UTF_8), // Some ASCII compatible
_ if bytes.starts_with(&[0x00, b'<', 0x00, b'?']) => Some((UTF_16BE, 0)), // Some BE encoding, for example, UTF-16 or ISO-10646-UCS-2
_ if bytes.starts_with(&[b'<', 0x00, b'?', 0x00]) => Some((UTF_16LE, 0)), // Some LE encoding, for example, UTF-16 or ISO-10646-UCS-2
_ if bytes.starts_with(&[b'<', b'?', b'x', b'm']) => Some((UTF_8, 0)), // Some ASCII compatible

_ => None,
}
Expand Down
20 changes: 10 additions & 10 deletions src/reader/async_tokio.rs
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,13 @@ impl<R: AsyncBufRead + Unpin> Reader<R> {
buf: &'b mut Vec<u8>,
) -> Pin<Box<dyn Future<Output = Result<Event<'b>>> + 'reader>> {
Box::pin(async move {
read_event_impl!(self, buf, read_until_open_async, read_until_close_async, await)
read_event_impl!(
self, buf,
TokioAdapter(&mut self.reader),
read_until_open_async,
read_until_close_async,
await
)
})
}

Expand Down Expand Up @@ -148,15 +154,9 @@ impl<R: AsyncBufRead + Unpin> Reader<R> {
Ok(read_to_end!(self, end, buf, read_event_into_async, { buf.clear(); }, await))
}

/// Read until '<' is found and moves reader to an `Opened` state.
///
/// Return a `StartText` event if `first` is `true` and a `Text` event otherwise
async fn read_until_open_async<'b>(
&mut self,
buf: &'b mut Vec<u8>,
first: bool,
) -> Result<Event<'b>> {
read_until_open!(self, buf, first, TokioAdapter(&mut self.reader), read_event_into_async, await)
/// Read until '<' is found, moves reader to an `OpenedTag` state and returns a `Text` event.
async fn read_until_open_async<'b>(&mut self, buf: &'b mut Vec<u8>) -> Result<Event<'b>> {
read_until_open!(self, buf, TokioAdapter(&mut self.reader), read_event_into_async, await)
}

/// Private function to read until `>` is found. This function expects that
Expand Down
38 changes: 35 additions & 3 deletions src/reader/buffered_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,40 @@ use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, Span, Xml

macro_rules! impl_buffered_source {
($($lf:lifetime, $reader:tt, $async:ident, $await:ident)?) => {
#[cfg(not(feature = "encoding"))]
$($async)? fn remove_utf8_bom(&mut self) -> Result<()> {
use crate::encoding::UTF8_BOM;

loop {
break match self $(.$reader)? .fill_buf() $(.$await)? {
Ok(n) => {
if n.starts_with(UTF8_BOM) {
self $(.$reader)? .consume(UTF8_BOM.len());
}
Ok(())
},
Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
Err(e) => Err(Error::Io(e)),
};
}
}

#[cfg(feature = "encoding")]
$($async)? fn detect_encoding(&mut self) -> Result<Option<&'static encoding_rs::Encoding>> {
loop {
break match self $(.$reader)? .fill_buf() $(.$await)? {
Ok(n) => if let Some((enc, bom_len)) = crate::encoding::detect_encoding(n) {
self $(.$reader)? .consume(bom_len);
Ok(Some(enc))
} else {
Ok(None)
},
Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
Err(e) => Err(Error::Io(e)),
};
}
}

#[inline]
$($async)? fn read_bytes_until $(<$lf>)? (
&mut self,
Expand Down Expand Up @@ -396,16 +430,14 @@ mod test {
use pretty_assertions::assert_eq;

/// Checks that encoding is detected by BOM and changed after XML declaration
/// BOM indicates UTF-16LE, but XML - windows-1251
#[test]
fn bom_detected() {
let mut reader =
Reader::from_reader(b"\xFF\xFE<?xml encoding='windows-1251'?>".as_ref());
let mut buf = Vec::new();

assert_eq!(reader.decoder().encoding(), UTF_8);
reader.read_event_into(&mut buf).unwrap();
assert_eq!(reader.decoder().encoding(), UTF_16LE);

reader.read_event_into(&mut buf).unwrap();
assert_eq!(reader.decoder().encoding(), WINDOWS_1251);

Expand Down
75 changes: 65 additions & 10 deletions src/reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -139,13 +139,31 @@ macro_rules! configure_methods {
macro_rules! read_event_impl {
(
$self:ident, $buf:ident,
$reader:expr,
$read_until_open:ident,
$read_until_close:ident
$(, $await:ident)?
) => {{
let event = match $self.parser.state {
ParseState::Init => $self.$read_until_open($buf, true) $(.$await)?,
ParseState::ClosedTag => $self.$read_until_open($buf, false) $(.$await)?,
ParseState::Init => {
// If encoding set explicitly, we not need to detect it. For example,
// explicit UTF-8 set automatically if Reader was created using `from_str`.
// But we still need to remove BOM for consistency with no encoding
// feature enabled path
#[cfg(feature = "encoding")]
if let Some(encoding) = $reader.detect_encoding() $(.$await)? ? {
if $self.parser.encoding.can_be_refined() {
$self.parser.encoding = crate::reader::EncodingRef::BomDetected(encoding);
}
}

// Removes UTF-8 BOM if it is present
#[cfg(not(feature = "encoding"))]
$reader.remove_utf8_bom() $(.$await)? ?;

$self.$read_until_open($buf) $(.$await)?
},
ParseState::ClosedTag => $self.$read_until_open($buf) $(.$await)?,
ParseState::OpenedTag => $self.$read_until_close($buf) $(.$await)?,
ParseState::Empty => $self.parser.close_expanded_empty(),
ParseState::Exit => return Ok(Event::Eof),
Expand All @@ -160,7 +178,7 @@ macro_rules! read_event_impl {

macro_rules! read_until_open {
(
$self:ident, $buf:ident, $first:ident,
$self:ident, $buf:ident,
$reader:expr,
$read_event:ident
$(, $await:ident)?
Expand All @@ -180,7 +198,7 @@ macro_rules! read_until_open {
.read_bytes_until(b'<', $buf, &mut $self.parser.offset)
$(.$await)?
{
Ok(Some(bytes)) => $self.parser.read_text(bytes, $first),
Ok(Some(bytes)) => $self.parser.read_text(bytes),
Ok(None) => Ok(Event::Eof),
Err(e) => Err(e),
}
Expand Down Expand Up @@ -557,15 +575,15 @@ impl<R> Reader<R> {
where
R: XmlSource<'i, B>,
{
read_event_impl!(self, buf, read_until_open, read_until_close)
read_event_impl!(self, buf, self.reader, read_until_open, read_until_close)
}

/// Read until '<' is found, moves reader to an `OpenedTag` state and returns a `Text` event.
fn read_until_open<'i, B>(&mut self, buf: B, first: bool) -> Result<Event<'i>>
fn read_until_open<'i, B>(&mut self, buf: B) -> Result<Event<'i>>
where
R: XmlSource<'i, B>,
{
read_until_open!(self, buf, first, self.reader, read_event_impl)
read_until_open!(self, buf, self.reader, read_event_impl)
}

/// Private function to read until `>` is found. This function expects that
Expand Down Expand Up @@ -595,6 +613,14 @@ impl<R> Reader<R> {
/// - `B`: a type of a buffer that can be used to store data read from `Self` and
/// from which events can borrow
trait XmlSource<'r, B> {
/// Removes UTF-8 BOM if it is present
#[cfg(not(feature = "encoding"))]
fn remove_utf8_bom(&mut self) -> Result<()>;

/// Determines encoding from the start of input and removes BOM if it is present
#[cfg(feature = "encoding")]
fn detect_encoding(&mut self) -> Result<Option<&'static Encoding>>;

/// Read input until `byte` is found or end of input is reached.
///
/// Returns a slice of data read up to `byte`, which does not include into result.
Expand Down Expand Up @@ -1579,10 +1605,39 @@ mod test {
use crate::reader::Reader;
use pretty_assertions::assert_eq;

/// When `encoding` feature is enabled, encoding should be detected
/// from BOM (UTF-8) and BOM should be stripped.
///
/// When `encoding` feature is disabled, UTF-8 is assumed and BOM
/// character should be stripped for consistency
#[$test]
$($async)? fn bom_from_reader() {
let mut reader = Reader::from_reader("\u{feff}\u{feff}".as_bytes());

assert_eq!(
reader.$read_event($buf) $(.$await)? .unwrap(),
Event::Text(BytesText::from_escaped("\u{feff}"))
);

assert_eq!(
reader.$read_event($buf) $(.$await)? .unwrap(),
Event::Eof
);
}

/// When parsing from &str, encoding is fixed (UTF-8), so
/// - when `encoding` feature is disabled, the behavior the
/// same as in `bom_from_reader` text
/// - when `encoding` feature is enabled, the behavior should
/// stay consistent, so the first BOM character is stripped
#[$test]
#[should_panic] // Failure is expected until read_until_open() is smart enough to skip over irrelevant text events.
$($async)? fn bom_at_start() {
let mut reader = Reader::from_str("\u{feff}");
$($async)? fn bom_from_str() {
let mut reader = Reader::from_str("\u{feff}\u{feff}");

assert_eq!(
reader.$read_event($buf) $(.$await)? .unwrap(),
Event::Text(BytesText::from_escaped("\u{feff}"))
);

assert_eq!(
reader.$read_event($buf) $(.$await)? .unwrap(),
Expand Down
20 changes: 2 additions & 18 deletions src/reader/parser.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#[cfg(feature = "encoding")]
use encoding_rs::UTF_8;

use crate::encoding::{self, Decoder};
use crate::encoding::Decoder;
use crate::errors::{Error, Result};
use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event};
#[cfg(feature = "encoding")]
Expand Down Expand Up @@ -63,11 +63,9 @@ impl Parser {
///
/// # Parameters
/// - `bytes`: data from the start of stream to the first `<` or from `>` to `<`
/// - `first`: if `true`, then this is the first call of that function,
/// i. e. data from the start of stream and the bytes will be checked for a BOM.
///
/// [`Text`]: Event::Text
pub fn read_text<'b>(&mut self, bytes: &'b [u8], first: bool) -> Result<Event<'b>> {
pub fn read_text<'b>(&mut self, bytes: &'b [u8]) -> Result<Event<'b>> {
let mut content = bytes;

if self.trim_text_end {
Expand All @@ -79,20 +77,6 @@ impl Parser {
content = &bytes[..len];
}

if first {
#[cfg(feature = "encoding")]
if self.encoding.can_be_refined() {
if let Some(encoding) = encoding::detect_encoding(bytes) {
self.encoding = EncodingRef::BomDetected(encoding);
content = encoding::remove_bom(content, encoding);
}
}
#[cfg(not(feature = "encoding"))]
if bytes.starts_with(encoding::UTF8_BOM) {
content = &bytes[encoding::UTF8_BOM.len()..];
}
}

Ok(Event::Text(BytesText::wrap(content, self.decoder())))
}

Expand Down
19 changes: 18 additions & 1 deletion src/reader/slice_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use std::borrow::Cow;
#[cfg(feature = "encoding")]
use crate::reader::EncodingRef;
#[cfg(feature = "encoding")]
use encoding_rs::UTF_8;
use encoding_rs::{Encoding, UTF_8};

use crate::errors::{Error, Result};
use crate::events::Event;
Expand Down Expand Up @@ -236,6 +236,23 @@ impl<'a> Reader<&'a [u8]> {
/// Implementation of `XmlSource` for `&[u8]` reader using a `Self` as buffer
/// that will be borrowed by events. This implementation provides a zero-copy deserialization
impl<'a> XmlSource<'a, ()> for &'a [u8] {
#[cfg(not(feature = "encoding"))]
fn remove_utf8_bom(&mut self) -> Result<()> {
if self.starts_with(crate::encoding::UTF8_BOM) {
*self = &self[crate::encoding::UTF8_BOM.len()..];
}
Ok(())
}

#[cfg(feature = "encoding")]
fn detect_encoding(&mut self) -> Result<Option<&'static Encoding>> {
if let Some((enc, bom_len)) = crate::encoding::detect_encoding(self) {
*self = &self[bom_len..];
return Ok(Some(enc));
}
Ok(None)
}

fn read_bytes_until(
&mut self,
byte: u8,
Expand Down
8 changes: 4 additions & 4 deletions tests/encodings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,11 @@ mod decode {
#[test]
fn test_detect_encoding() {
// No BOM
assert_eq!(detect_encoding(UTF8_TEXT.as_bytes()), Some(UTF_8));
assert_eq!(detect_encoding(UTF8_TEXT.as_bytes()), Some((UTF_8, 0)));
// BOM
assert_eq!(detect_encoding(UTF8_TEXT_WITH_BOM), Some(UTF_8));
assert_eq!(detect_encoding(UTF16BE_TEXT_WITH_BOM), Some(UTF_16BE));
assert_eq!(detect_encoding(UTF16LE_TEXT_WITH_BOM), Some(UTF_16LE));
assert_eq!(detect_encoding(UTF8_TEXT_WITH_BOM), Some((UTF_8, 3)));
assert_eq!(detect_encoding(UTF16BE_TEXT_WITH_BOM), Some((UTF_16BE, 2)));
assert_eq!(detect_encoding(UTF16LE_TEXT_WITH_BOM), Some((UTF_16LE, 2)));
}
}

Expand Down

0 comments on commit 60dc37f

Please sign in to comment.