Skip to content

Commit

Permalink
Add allow_dangling_amp configuration option and allow dangling &
Browse files Browse the repository at this point in the history
  • Loading branch information
Mingun committed Oct 18, 2024
1 parent 142f4a9 commit 286b259
Show file tree
Hide file tree
Showing 5 changed files with 127 additions and 20 deletions.
3 changes: 3 additions & 0 deletions Changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ XML specification. See the updated `custom_entities` example!

- [#766]: Allow to parse resolved entities as XML fragments and stream events from them.
- [#766]: Added new event `Event::GeneralRef` with content of [general entity].
- [#766]: Added new configuration option `allow_dangling_amp` which allows a `&`
that is not followed by `;` to appear in textual data. Some applications require
this for compatibility reasons.

### Bug Fixes

Expand Down
6 changes: 3 additions & 3 deletions src/reader/buffered_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ macro_rules! impl_buffered_source {

*position += read;

return ReadRefResult::UpToRef;
return ReadRefResult::UpToRef(&buf[start..]);
}
Some(i) => {
let is_end = available[i] == b';';
Expand All @@ -177,7 +177,7 @@ macro_rules! impl_buffered_source {
return if is_end {
ReadRefResult::Ref(&buf[start..])
} else {
ReadRefResult::UpToMarkup
ReadRefResult::UpToMarkup(&buf[start..])
};
}
None => {
Expand All @@ -191,7 +191,7 @@ macro_rules! impl_buffered_source {
}

*position += read;
ReadRefResult::UpToEof
ReadRefResult::UpToEof(&buf[start..])
}

#[inline]
Expand Down
62 changes: 50 additions & 12 deletions src/reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,32 @@ use crate::reader::state::ReaderState;
#[cfg_attr(feature = "serde-types", derive(serde::Deserialize, serde::Serialize))]
#[non_exhaustive]
pub struct Config {
/// Whether lone ampersand character (without a paired semicolon) should be
/// allowed in textual content. Unless enabled, in case of a dangling ampersand,
/// the [`Error::IllFormed(UnclosedReference)`] is returned from read methods.
///
/// Default: `false`
///
/// # Example
///
/// ```
/// # use quick_xml::events::{BytesRef, BytesText, Event};
/// # use quick_xml::reader::Reader;
/// # use pretty_assertions::assert_eq;
/// let mut reader = Reader::from_str("text with & &amp; & alone");
/// reader.config_mut().allow_dangling_amp = true;
///
/// assert_eq!(reader.read_event().unwrap(), Event::Text(BytesText::new("text with ")));
/// assert_eq!(reader.read_event().unwrap(), Event::Text(BytesText::from_escaped("& ")));
/// assert_eq!(reader.read_event().unwrap(), Event::GeneralRef(BytesRef::new("amp")));
/// assert_eq!(reader.read_event().unwrap(), Event::Text(BytesText::new(" ")));
/// assert_eq!(reader.read_event().unwrap(), Event::Text(BytesText::from_escaped("& alone")));
/// assert_eq!(reader.read_event().unwrap(), Event::Eof);
/// ```
///
/// [`Error::IllFormed(UnclosedReference)`]: crate::errors::IllFormedError::UnclosedReference
pub allow_dangling_amp: bool,

/// Whether unmatched closing tag names should be allowed. Unless enabled,
/// in case of a dangling end tag, the [`Error::IllFormed(UnmatchedEndTag)`]
/// is returned from read methods.
Expand Down Expand Up @@ -210,6 +236,7 @@ impl Config {
impl Default for Config {
fn default() -> Self {
Self {
allow_dangling_amp: false,
allow_unmatched_ends: false,
check_comments: false,
check_end_names: true,
Expand Down Expand Up @@ -261,18 +288,29 @@ macro_rules! read_event_impl {
Ok(Event::GeneralRef(BytesRef::wrap(&bytes[1..], $self.decoder())))
}
// Go to Done state
ReadRefResult::UpToEof => {
ReadRefResult::UpToEof(bytes) if $self.state.config.allow_dangling_amp => {
$self.state.state = ParseState::Done;
Ok(Event::Text($self.state.emit_text(bytes)))
}
ReadRefResult::UpToEof(_) => {
$self.state.state = ParseState::Done;
$self.state.last_error_offset = start;
Err(Error::IllFormed(IllFormedError::UnclosedReference))
}
// Do not change state, stay in InsideRef
ReadRefResult::UpToRef => {
ReadRefResult::UpToRef(bytes) if $self.state.config.allow_dangling_amp => {
Ok(Event::Text($self.state.emit_text(bytes)))
}
ReadRefResult::UpToRef(_) => {
$self.state.last_error_offset = start;
Err(Error::IllFormed(IllFormedError::UnclosedReference))
}
// Go to InsideMarkup state
ReadRefResult::UpToMarkup => {
ReadRefResult::UpToMarkup(bytes) if $self.state.config.allow_dangling_amp => {
$self.state.state = ParseState::InsideMarkup;
Ok(Event::Text($self.state.emit_text(bytes)))
}
ReadRefResult::UpToMarkup(_) => {
$self.state.state = ParseState::InsideMarkup;
$self.state.last_error_offset = start;
Err(Error::IllFormed(IllFormedError::UnclosedReference))
Expand Down Expand Up @@ -997,13 +1035,13 @@ enum ReadRefResult<'r> {
/// Contains text block up to EOF. Neither end of reference (`;`), start of
/// another reference (`&`), nor start of markup (`<`) was found.
/// Result includes start `&`.
UpToEof,
UpToEof(&'r [u8]),
/// Contains text block up to next possible reference (`&` character).
/// Result includes start `&`.
UpToRef,
UpToRef(&'r [u8]),
/// Contains text block up to start of markup (`<` character).
/// Result includes start `&`.
UpToMarkup,
UpToMarkup(&'r [u8]),
/// IO error occurred.
Err(io::Error),
}
Expand Down Expand Up @@ -1722,8 +1760,8 @@ mod test {
// ^= 2

match $source(&mut input).read_ref(buf, &mut position) $(.$await)? {
ReadRefResult::UpToEof => (),
x => panic!("Expected `UpToEof`, but got `{:?}`", x),
ReadRefResult::UpToEof(bytes) => assert_eq!(Bytes(bytes), Bytes(b"&")),
x => panic!("Expected `UpToEof(_)`, but got `{:?}`", x),
}
assert_eq!(position, 2);
}
Expand All @@ -1736,8 +1774,8 @@ mod test {
// ^= 2

match $source(&mut input).read_ref(buf, &mut position) $(.$await)? {
ReadRefResult::UpToRef => (),
x => panic!("Expected `UpToRef`, but got `{:?}`", x),
ReadRefResult::UpToRef(bytes) => assert_eq!(Bytes(bytes), Bytes(b"&")),
x => panic!("Expected `UpToRef(_)`, but got `{:?}`", x),
}
assert_eq!(position, 2);
}
Expand All @@ -1750,8 +1788,8 @@ mod test {
// ^= 3

match $source(&mut input).read_ref(buf, &mut position) $(.$await)? {
ReadRefResult::UpToMarkup => (),
x => panic!("Expected `UpToMarkup`, but got `{:?}`", x),
ReadRefResult::UpToMarkup(bytes) => assert_eq!(Bytes(bytes), Bytes(b"&")),
x => panic!("Expected `UpToMarkup(_)`, but got `{:?}`", x),
}
assert_eq!(position, 3);
}
Expand Down
8 changes: 4 additions & 4 deletions src/reader/slice_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -306,11 +306,11 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] {
// Do not consume `&` because it may be lone and we would need to
// return it as part of Text event
Some(i) if self[i + 1] == b'&' => {
let (_, rest) = self.split_at(i + 1);
let (bytes, rest) = self.split_at(i + 1);
*self = rest;
*position += i as u64 + 1;

ReadRefResult::UpToRef
ReadRefResult::UpToRef(bytes)
}
Some(i) => {
let end = i + 1;
Expand All @@ -323,15 +323,15 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] {
if is_end {
ReadRefResult::Ref(bytes)
} else {
ReadRefResult::UpToMarkup
ReadRefResult::UpToMarkup(bytes)
}
}
None => {
let bytes = &self[..];
*self = &[];
*position += bytes.len() as u64;

ReadRefResult::UpToEof
ReadRefResult::UpToEof(bytes)
}
}
}
Expand Down
68 changes: 67 additions & 1 deletion tests/reader-config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,75 @@
//! Please keep tests sorted (exceptions are allowed if options are tightly related).
use quick_xml::errors::{Error, IllFormedError};
use quick_xml::events::{BytesCData, BytesEnd, BytesPI, BytesStart, BytesText, Event};
use quick_xml::events::{BytesCData, BytesEnd, BytesPI, BytesRef, BytesStart, BytesText, Event};
use quick_xml::reader::Reader;

mod allow_dangling_amp {
use super::*;
use pretty_assertions::assert_eq;

#[test]
fn false_() {
let mut reader = Reader::from_str("&&&lt;&");
reader.config_mut().allow_dangling_amp = false;

match reader.read_event() {
Err(Error::IllFormed(cause)) => {
assert_eq!(cause, IllFormedError::UnclosedReference);
}
x => panic!("Expected `Err(Syntax(_))`, but got `{:?}`", x),
}
assert_eq!(reader.error_position()..reader.buffer_position(), 0..1);

match reader.read_event() {
Err(Error::IllFormed(cause)) => {
assert_eq!(cause, IllFormedError::UnclosedReference);
}
x => panic!("Expected `Err(Syntax(_))`, but got `{:?}`", x),
}
assert_eq!(reader.error_position()..reader.buffer_position(), 1..2);

assert_eq!(
reader.read_event().unwrap(),
Event::GeneralRef(BytesRef::new("lt"))
);
match reader.read_event() {
Err(Error::IllFormed(cause)) => {
assert_eq!(cause, IllFormedError::UnclosedReference);
}
x => panic!("Expected `Err(Syntax(_))`, but got `{:?}`", x),
}
assert_eq!(reader.error_position()..reader.buffer_position(), 6..7);

assert_eq!(reader.read_event().unwrap(), Event::Eof);
assert_eq!(reader.error_position()..reader.buffer_position(), 6..7);
}

#[test]
fn true_() {
let mut reader = Reader::from_str("&&&lt;&");
reader.config_mut().allow_dangling_amp = true;

assert_eq!(
reader.read_event().unwrap(),
Event::Text(BytesText::from_escaped("&"))
);
assert_eq!(
reader.read_event().unwrap(),
Event::Text(BytesText::from_escaped("&"))
);
assert_eq!(
reader.read_event().unwrap(),
Event::GeneralRef(BytesRef::new("lt"))
);
assert_eq!(
reader.read_event().unwrap(),
Event::Text(BytesText::from_escaped("&"))
);
assert_eq!(reader.read_event().unwrap(), Event::Eof);
}
}

mod allow_unmatched_ends {
use super::*;
use pretty_assertions::assert_eq;
Expand Down

0 comments on commit 286b259

Please sign in to comment.