From 9598c37dc047e09bbc08911489f170146d8dac05 Mon Sep 17 00:00:00 2001 From: Mingun Date: Mon, 22 Aug 2022 18:09:31 +0500 Subject: [PATCH] Add warning about unsupported encodings --- Cargo.toml | 49 +++++++++++++++++++++++++++++++++++++++++++++++-- src/encoding.rs | 2 +- 2 files changed, 48 insertions(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index ced1ea3c..91cfd2f6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -53,10 +53,55 @@ async-tokio = ["tokio"] ## Currently, only ASCII-compatible encodings are supported, so, for example, ## UTF-16 will not work (therefore, `quick-xml` is not [standard compliant]). ## -## List of supported encodings includes all encodings supported by [`encoding_rs`] -## crate, that satisfied the restriction above. +## Thus, quick-xml supports all encodings of [`encoding_rs`] except these: +## - [UTF-16BE] +## - [UTF-16LE] +## - [ISO-2022-JP] +## +## You should stop processing the document when one of these encodings is detected, +## because generated events can be wrong and may not reflect the real document structure! 
+## +## Because these are the only supported encodings that are not ASCII compatible, you can +## check for that to detect them: +## +## ``` +## use quick_xml::events::Event; +## use quick_xml::reader::Reader; +## +## # fn to_utf16le_with_bom(string: &str) -> Vec<u8> { +## # let mut bytes = Vec::new(); +## # bytes.extend_from_slice(&[0xFF, 0xFE]); // UTF-16 LE BOM +## # for ch in string.encode_utf16() { +## # bytes.extend_from_slice(&ch.to_le_bytes()); +## # } +## # bytes +## # } +## let xml = to_utf16le_with_bom(r#"<?xml encoding='UTF-16'><element/>"#); +## let mut reader = Reader::from_reader(xml.as_ref()); +## reader.trim_text(true); +## +## let mut buf = Vec::new(); +## let mut unsupported = false; +## loop { +## if !reader.decoder().encoding().is_ascii_compatible() { +## unsupported = true; +## break; +## } +## buf.clear(); +## match reader.read_event_into(&mut buf).unwrap() { +## Event::Eof => break, +## _ => {} +## } +## } +## assert_eq!(unsupported, true); +## ``` +## That restriction will be eliminated once issue [#158] is resolved. ## ## [standard compliant]: https://www.w3.org/TR/xml11/#charencoding +## [UTF-16BE]: encoding_rs::UTF_16BE +## [UTF-16LE]: encoding_rs::UTF_16LE +## [ISO-2022-JP]: encoding_rs::ISO_2022_JP +## [#158]: https://github.com/tafia/quick-xml/issues/158 encoding = ["encoding_rs"] ## Enables support for recognizing all [HTML 5 entities] in [`unescape`] and diff --git a/src/encoding.rs b/src/encoding.rs index 6568812f..2f549d88 100644 --- a/src/encoding.rs +++ b/src/encoding.rs @@ -28,7 +28,7 @@ pub(crate) const UTF16_BE_BOM: &[u8] = &[0xFE, 0xFF]; /// key is not defined or contains unknown encoding. /// /// The library supports any UTF-8 compatible encodings that crate `encoding_rs` -/// is supported. [*UTF-16 is not supported at the present*][utf16]. +/// is supported. [*UTF-16 and ISO-2022-JP are not supported at present*][utf16]. /// /// If feature `encoding` is disabled, the decoder is always UTF-8 decoder: /// any XML declarations are ignored.