Remove StartText

StartText would be out of place once all events are expected to contain UTF-8. Additionally the decoder implementation strips BOM bytes out of the bytestream so there's no good way to access them.
tafia · Aug 16, 2022 · 1168dd4 · 1168dd4
1 parent a36aa8e
commit 1168dd4
Show file tree

Hide file tree

Showing 10 changed files with 10 additions and 282 deletions.
diff --git a/Changelog.md b/Changelog.md
@@ -20,8 +20,6 @@
 - [#180]: Make `Decoder` struct public. You already had access to it via the
   `Reader::decoder()` method, but could not name it in the code. Now the preferred
   way to access decoding functionality is via this struct
-- [#191]: New event variant `StartText` emitted for bytes before the XML declaration
-  or a start comment or a tag. For streams with BOM this event will contain a BOM
 - [#395]: Add support for XML Schema `xs:list`
 - [#324]: `Reader::from_str` / `Deserializer::from_str` / `from_str` now ignore
   the XML declared encoding and always use UTF-8
@@ -102,15 +100,6 @@
   `Decoder::decode()` and `Decoder::decode_with_bom_removal()`.
   Use `reader.decoder().decode_*(...)` instead of `reader.decode_*(...)` for now.
   `Reader::encoding()` is replaced by `Decoder::encoding()` as well
-- [#191]: Remove poorly designed `BytesText::unescape_and_decode_without_bom()` and
-  `BytesText::unescape_and_decode_without_bom_with_custom_entities()`. Although these methods worked
-  as expected, this was only due to good luck. They was replaced by the
-  `BytesStartText::decode_with_bom_removal()`:
-  - conceptually, you should decode BOM only for the first `Text` event from the
-    reader (since now `StartText` event is emitted instead for this)
-  - text before the first tag is not an XML content at all, so it is meaningless
-    to try to unescape something in it
-
 - [#180]: Eliminated the differences in the decoding API when feature `encoding` enabled and when it is
   disabled. Signatures of functions are now the same regardless of whether or not the feature is
   enabled, and an error will be returned instead of performing replacements for invalid characters

diff --git a/benches/microbenches.rs b/benches/microbenches.rs
@@ -118,20 +118,6 @@ fn read_resolved_event_into(c: &mut Criterion) {
 /// Benchmarks, how fast individual event parsed
 fn one_event(c: &mut Criterion) {
     let mut group = c.benchmark_group("One event");
-    group.bench_function("StartText", |b| {
-        let src = "Hello world!".repeat(512 / 12);
-        b.iter(|| {
-            let mut r = Reader::from_str(&src);
-            let mut nbtxt = criterion::black_box(0);
-            r.check_end_names(false).check_comments(false);
-            match r.read_event() {
-                Ok(Event::StartText(e)) => nbtxt += e.len(),
-                something_else => panic!("Did not expect {:?}", something_else),
-            };
-
-            assert_eq!(nbtxt, 504);
-        })
-    });
 
     group.bench_function("Start", |b| {
         let src = format!(r#"<hello target="{}">"#, "world".repeat(512 / 5));

diff --git a/src/de/mod.rs b/src/de/mod.rs
@@ -929,10 +929,6 @@ impl<'i, R: BufRead> XmlRead<'i> for IoReader<R> {
         let event = loop {
             let e = self.reader.read_event_into(&mut self.buf)?;
             match e {
-                //TODO: Probably not the best idea treat StartText as usual text
-                // Usually this event will represent a BOM
-                // Changing this requires review of the serde-de::top_level::one_element test
-                Event::StartText(e) => break Ok(DeEvent::Text(e.into_owned().into())),
                 Event::Start(e) => break Ok(DeEvent::Start(e.into_owned())),
                 Event::End(e) => break Ok(DeEvent::End(e.into_owned())),
                 Event::Text(e) => break Ok(DeEvent::Text(e.into_owned())),
@@ -974,10 +970,6 @@ impl<'de> XmlRead<'de> for SliceReader<'de> {
         loop {
             let e = self.reader.read_event()?;
             match e {
-                //TODO: Probably not the best idea treat StartText as usual text
-                // Usually this event will represent a BOM
-                // Changing this requires review of the serde-de::top_level::one_element test
-                Event::StartText(e) => break Ok(DeEvent::Text(e.into())),
                 Event::Start(e) => break Ok(DeEvent::Start(e)),
                 Event::End(e) => break Ok(DeEvent::End(e)),
                 Event::Text(e) => break Ok(DeEvent::Text(e)),

diff --git a/src/events/mod.rs b/src/events/mod.rs
@@ -50,69 +50,6 @@ use crate::name::{LocalName, QName};
 use crate::utils::write_cow_string;
 use attributes::{Attribute, Attributes};
 
-/// Text that appeared before an XML declaration, a start element or a comment.
-///
-/// In well-formed XML it could contain a Byte-Order-Mark (BOM). If this event
-/// contains something else except BOM, the XML should be considered ill-formed.
-///
-/// This is a reader-only event. If you need to write a text before the first tag,
-/// use the [`BytesText`] event.
-#[derive(Debug, Clone, Eq, PartialEq)]
-pub struct BytesStartText<'a> {
-    content: BytesText<'a>,
-}
-
-impl<'a> BytesStartText<'a> {
-    /// Converts the event into an owned event.
-    pub fn into_owned(self) -> BytesStartText<'static> {
-        BytesStartText {
-            content: self.content.into_owned(),
-        }
-    }
-
-    /// Extracts the inner `Cow` from the `BytesStartText` event container.
-    #[inline]
-    pub fn into_inner(self) -> Cow<'a, [u8]> {
-        self.content.into_inner()
-    }
-
-    /// Converts the event into a borrowed event.
-    #[inline]
-    pub fn borrow(&self) -> BytesStartText {
-        BytesStartText {
-            content: self.content.borrow(),
-        }
-    }
-
-    /// Decodes bytes of event, stripping byte order mark (BOM) if it is presented
-    /// in the event.
-    ///
-    /// This method does not unescapes content, because no escape sequences can
-    /// appeared in the BOM or in the text before the first tag.
-    pub fn decode_with_bom_removal(&self) -> Result<String> {
-        //TODO: Fix lifetime issue - it should be possible to borrow string
-        let decoded = self.content.decoder.decode_with_bom_removal(&*self)?;
-
-        Ok(decoded.to_string())
-    }
-}
-
-impl<'a> Deref for BytesStartText<'a> {
-    type Target = BytesText<'a>;
-
-    fn deref(&self) -> &Self::Target {
-        &self.content
-    }
-}
-
-impl<'a> From<BytesText<'a>> for BytesStartText<'a> {
-    fn from(content: BytesText<'a>) -> Self {
-        Self { content }
-    }
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
 /// Opening tag data (`Event::Start`), with optional attributes.
 ///
 /// `<name attr="value">`.
@@ -797,12 +734,6 @@ impl<'a> Deref for BytesText<'a> {
     }
 }
 
-impl<'a> From<BytesStartText<'a>> for BytesText<'a> {
-    fn from(content: BytesStartText<'a>) -> Self {
-        content.content
-    }
-}
-
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
 /// CDATA content contains unescaped data from the reader. If you want to write them as a text,
@@ -941,56 +872,6 @@ impl<'a> Deref for BytesCData<'a> {
 /// [`Reader::read_event_into`]: crate::reader::Reader::read_event_into
 #[derive(Clone, Debug, Eq, PartialEq)]
 pub enum Event<'a> {
-    /// Text that appeared before the first opening tag or an [XML declaration].
-    /// [According to the XML standard][std], no text allowed before the XML
-    /// declaration. However, if there is a BOM in the stream, some data may be
-    /// present.
-    ///
-    /// When this event is generated, it is the very first event emitted by the
-    /// [`Reader`], and there can be the only one such event.
-    ///
-    /// The [`Writer`] writes content of this event "as is" without encoding or
-    /// escaping. If you write it, it should be written first and only one time
-    /// (but writer does not enforce that).
-    ///
-    /// # Examples
-    ///
-    /// ```
-    /// # use pretty_assertions::assert_eq;
-    /// use std::borrow::Cow;
-    /// use quick_xml::events::Event;
-    /// use quick_xml::reader::Reader;
-    ///
-    /// // XML in UTF-8 with BOM
-    /// let xml = b"\xEF\xBB\xBF<?xml version='1.0'?>".as_ref();
-    /// let mut reader = Reader::from_reader(xml);
-    /// let mut buf = Vec::new();
-    /// let mut events_processed = 0;
-    /// loop {
-    ///     match reader.read_event_into(&mut buf) {
-    ///         Ok(Event::StartText(e)) => {
-    ///             assert_eq!(events_processed, 0);
-    ///             // Content contains BOM
-    ///             assert_eq!(e.into_inner(), Cow::Borrowed(b"\xEF\xBB\xBF"));
-    ///         }
-    ///         Ok(Event::Decl(_)) => {
-    ///             assert_eq!(events_processed, 1);
-    ///         }
-    ///         Ok(Event::Eof) => {
-    ///             assert_eq!(events_processed, 2);
-    ///             break;
-    ///         }
-    ///         e => panic!("Unexpected event {:?}", e),
-    ///     }
-    ///     events_processed += 1;
-    /// }
-    /// ```
-    ///
-    /// [XML declaration]: Event::Decl
-    /// [std]: https://www.w3.org/TR/xml11/#NT-document
-    /// [`Reader`]: crate::reader::Reader
-    /// [`Writer`]: crate::writer::Writer
-    StartText(BytesStartText<'a>),
     /// Start tag (with attributes) `<tag attr="value">`.
     Start(BytesStart<'a>),
     /// End tag `</tag>`.
@@ -1018,7 +899,6 @@ impl<'a> Event<'a> {
     /// buffer used when reading but incurring a new, separate allocation.
     pub fn into_owned(self) -> Event<'static> {
         match self {
-            Event::StartText(e) => Event::StartText(e.into_owned()),
             Event::Start(e) => Event::Start(e.into_owned()),
             Event::End(e) => Event::End(e.into_owned()),
             Event::Empty(e) => Event::Empty(e.into_owned()),
@@ -1036,7 +916,6 @@ impl<'a> Event<'a> {
     #[inline]
     pub fn borrow(&self) -> Event {
         match self {
-            Event::StartText(e) => Event::StartText(e.borrow()),
             Event::Start(e) => Event::Start(e.borrow()),
             Event::End(e) => Event::End(e.borrow()),
             Event::Empty(e) => Event::Empty(e.borrow()),
@@ -1056,7 +935,6 @@ impl<'a> Deref for Event<'a> {
 
     fn deref(&self) -> &[u8] {
         match *self {
-            Event::StartText(ref e) => &*e,
             Event::Start(ref e) | Event::Empty(ref e) => &*e,
             Event::End(ref e) => &*e,
             Event::Text(ref e) => &*e,

diff --git a/src/reader/mod.rs b/src/reader/mod.rs
@@ -286,7 +286,7 @@ pub type Span = Range<usize>;
 ///   subgraph _
 ///     direction LR
 ///
-///     Init      -- "(no event)"\nStartText                              --> OpenedTag
+///     Init      -- "(no event)"\n                                       --> OpenedTag
 ///     OpenedTag -- Decl, DocType, PI\nComment, CData\nStart, Empty, End --> ClosedTag
 ///     ClosedTag -- "#lt;false#gt;\n(no event)"\nText                    --> OpenedTag
 ///   end
@@ -297,13 +297,13 @@ pub type Span = Range<usize>;
 #[derive(Clone)]
 enum ParseState {
     /// Initial state in which reader stay after creation. Transition from that
-    /// state could produce a `StartText`, `Decl`, `Comment` or `Start` event.
-    /// The next state is always `OpenedTag`. The reader will never return to this
-    /// state. The event emitted during transition to `OpenedTag` is a `StartEvent`
-    /// if the first symbol not `<`, otherwise no event are emitted.
+    /// state could produce a `Text`, `Decl`, `Comment` or `Start` event. The next
+    /// state is always `OpenedTag`. The reader will never return to this state. The
+    /// event emitted during transition to `OpenedTag` is a `StartEvent` if the
+    /// first symbol not `<`, otherwise no event are emitted.
     Init,
     /// State after seeing the `<` symbol. Depending on the next symbol all other
-    /// events (except `StartText`) could be generated.
+    /// events could be generated.
     ///
     /// After generating one event the reader moves to the `ClosedTag` state.
     OpenedTag,
@@ -553,8 +553,6 @@ impl<R> Reader<R> {
     }
 
     /// Read until '<' is found and moves reader to an `OpenedTag` state.
-    ///
-    /// Return a `StartText` event if `first` is `true` and a `Text` event otherwise
     fn read_until_open<'i, B>(&mut self, buf: B, first: bool) -> Result<Event<'i>>
     where
         R: XmlSource<'i, B>,
@@ -1573,16 +1571,6 @@ mod test {
                 use crate::reader::Reader;
                 use pretty_assertions::assert_eq;
 
-                #[$test]
-                $($async)? fn start_text() {
-                    let mut reader = Reader::from_str("bom");
-
-                    assert_eq!(
-                        reader.$read_event($buf) $(.$await)? .unwrap(),
-                        Event::StartText(BytesText::from_escaped("bom").into())
-                    );
-                }
-
                 #[$test]
                 $($async)? fn declaration() {
                     let mut reader = Reader::from_str("<?xml ?>");

diff --git a/src/reader/parser.rs b/src/reader/parser.rs
@@ -61,17 +61,11 @@ pub(super) struct Parser {
 }
 
 impl Parser {
-    /// Trims whitespaces from `bytes`, if required, and returns a [`StartText`]
-    /// or a [`Text`] event. When [`StartText`] is returned, the method can change
-    /// the encoding of the reader, detecting it from the beginning of the stream.
+    /// Trims whitespaces from `bytes`, if required, and returns a [`Text`] event.
     ///
     /// # Parameters
     /// - `bytes`: data from the start of stream to the first `<` or from `>` to `<`
-    /// - `first`: if `true`, then this is the first call of that function,
-    ///   i. e. data from the start of stream and [`StartText`] will be returned,
-    ///   otherwise [`Text`] will be returned
     ///
-    /// [`StartText`]: Event::StartText
     /// [`Text`]: Event::Text
     pub fn read_text<'b>(&mut self, bytes: &'b [u8], first: bool) -> Result<Event<'b>> {
         #[cfg(feature = "encoding")]
@@ -91,12 +85,7 @@ impl Parser {
         } else {
             bytes
         };
-
-        Ok(if first {
-            Event::StartText(BytesText::wrap(content, self.decoder()).into())
-        } else {
-            Event::Text(BytesText::wrap(content, self.decoder()))
-        })
+        Ok(Event::Text(BytesText::wrap(content, self.decoder())))
     }
 
     /// reads `BytesElement` starting with a `!`,

diff --git a/src/writer.rs b/src/writer.rs
@@ -133,7 +133,6 @@ impl<W: Write> Writer<W> {
     pub fn write_event<'a, E: AsRef<Event<'a>>>(&mut self, event: E) -> Result<()> {
         let mut next_should_line_break = true;
         let result = match *event.as_ref() {
-            Event::StartText(ref e) => self.write(&e),
             Event::Start(ref e) => {
                 let result = self.write_wrapped(b"<", e, b">");
                 if let Some(i) = self.indent.as_mut() {

diff --git a/tests/test.rs b/tests/test.rs
@@ -111,7 +111,7 @@ fn test_issue94() {
 fn test_no_trim() {
     let mut reader = Reader::from_str(" <tag> text </tag> ");
 
-    assert!(matches!(reader.read_event().unwrap(), StartText(_)));
+    assert!(matches!(reader.read_event().unwrap(), Text(_)));
     assert!(matches!(reader.read_event().unwrap(), Start(_)));
     assert!(matches!(reader.read_event().unwrap(), Text(_)));
     assert!(matches!(reader.read_event().unwrap(), End(_)));
@@ -123,7 +123,7 @@ fn test_trim_end() {
     let mut reader = Reader::from_str(" <tag> text </tag> ");
     reader.trim_text_end(true);
 
-    assert!(matches!(reader.read_event().unwrap(), StartText(_)));
+    assert!(matches!(reader.read_event().unwrap(), Text(_)));
     assert!(matches!(reader.read_event().unwrap(), Start(_)));
     assert!(matches!(reader.read_event().unwrap(), Text(_)));
     assert!(matches!(reader.read_event().unwrap(), End(_)));