quick_xml/
encoding.rs

1//! A module for wrappers that encode / decode data.
2
3use std::borrow::Cow;
4use std::str::Utf8Error;
5
6#[cfg(feature = "encoding")]
7use encoding_rs::{DecoderResult, Encoding, UTF_16BE, UTF_16LE, UTF_8};
8
/// The Unicode "byte order mark" (\u{FEFF}) as it appears in UTF-8 encoded data.
/// See <https://unicode.org/faq/utf_bom.html#bom1>
pub(crate) const UTF8_BOM: &[u8] = b"\xEF\xBB\xBF";
/// The Unicode "byte order mark" (\u{FEFF}) as it appears in UTF-16 data with
/// little-endian byte order.
/// See <https://unicode.org/faq/utf_bom.html#bom1>
#[cfg(feature = "encoding")]
pub(crate) const UTF16_LE_BOM: &[u8] = b"\xFF\xFE";
/// The Unicode "byte order mark" (\u{FEFF}) as it appears in UTF-16 data with
/// big-endian byte order.
/// See <https://unicode.org/faq/utf_bom.html#bom1>
#[cfg(feature = "encoding")]
pub(crate) const UTF16_BE_BOM: &[u8] = b"\xFE\xFF";
20
/// A failure that occurred while decoding or encoding text.
///
/// When the [`encoding`] feature is disabled, [`EncodingError::Utf8`] is the only
/// variant that exists, so every [`EncodingError`] is an [`EncodingError::Utf8`].
///
/// [`encoding`]: ../index.html#encoding
#[non_exhaustive]
#[derive(Debug, Clone, Eq, PartialEq)]
pub enum EncodingError {
    /// The input is not valid UTF-8
    Utf8(Utf8Error),
    /// The input does not adhere to the given encoding
    #[cfg(feature = "encoding")]
    Other(&'static Encoding),
}
35
36impl From<Utf8Error> for EncodingError {
37    #[inline]
38    fn from(e: Utf8Error) -> Self {
39        Self::Utf8(e)
40    }
41}
42
43impl std::error::Error for EncodingError {
44    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
45        match self {
46            Self::Utf8(e) => Some(e),
47            #[cfg(feature = "encoding")]
48            Self::Other(_) => None,
49        }
50    }
51}
52
53impl std::fmt::Display for EncodingError {
54    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
55        match self {
56            Self::Utf8(e) => write!(f, "cannot decode input using UTF-8: {}", e),
57            #[cfg(feature = "encoding")]
58            Self::Other(encoding) => write!(f, "cannot decode input using {}", encoding.name()),
59        }
60    }
61}
62
/// Converts byte slices into strings.
///
/// If feature [`encoding`] is enabled, the encoding is taken from the `encoding`
/// key of the XML declaration. UTF-8 is assumed if the XML has no `<?xml ?>`
/// declaration, if the `encoding` key is not defined, or if it names an unknown
/// encoding.
///
/// Any UTF-8 compatible encoding supported by the `encoding_rs` crate can be used.
/// [*UTF-16 and ISO-2022-JP are not supported at the present*][utf16].
///
/// If feature [`encoding`] is disabled, the decoder is always a UTF-8 decoder:
/// any XML declarations are ignored.
///
/// [utf16]: https://github.com/tafia/quick-xml/issues/158
/// [`encoding`]: ../index.html#encoding
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Decoder {
    /// Encoding used to interpret the bytes passed to the decoding methods
    #[cfg(feature = "encoding")]
    pub(crate) encoding: &'static Encoding,
}
82
83impl Decoder {
84    pub(crate) const fn utf8() -> Self {
85        Decoder {
86            #[cfg(feature = "encoding")]
87            encoding: UTF_8,
88        }
89    }
90
91    #[cfg(all(test, feature = "encoding", feature = "serialize"))]
92    pub(crate) const fn utf16() -> Self {
93        Decoder { encoding: UTF_16LE }
94    }
95}
96
97impl Decoder {
98    /// Returns the `Reader`s encoding.
99    ///
100    /// This encoding will be used by [`decode`].
101    ///
102    /// [`decode`]: Self::decode
103    #[cfg(feature = "encoding")]
104    pub const fn encoding(&self) -> &'static Encoding {
105        self.encoding
106    }
107
108    /// ## Without `encoding` feature
109    ///
110    /// Decodes an UTF-8 slice regardless of XML declaration and ignoring BOM
111    /// if it is present in the `bytes`.
112    ///
113    /// ## With `encoding` feature
114    ///
115    /// Decodes specified bytes using encoding, declared in the XML, if it was
116    /// declared there, or UTF-8 otherwise, and ignoring BOM if it is present
117    /// in the `bytes`.
118    ///
119    /// ----
120    /// Returns an error in case of malformed sequences in the `bytes`.
121    pub fn decode<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>, EncodingError> {
122        #[cfg(not(feature = "encoding"))]
123        let decoded = Ok(Cow::Borrowed(std::str::from_utf8(bytes)?));
124
125        #[cfg(feature = "encoding")]
126        let decoded = decode(bytes, self.encoding);
127
128        decoded
129    }
130
131    /// Like [`decode`][Self::decode] but using a pre-allocated buffer.
132    pub fn decode_into(&self, bytes: &[u8], buf: &mut String) -> Result<(), EncodingError> {
133        #[cfg(not(feature = "encoding"))]
134        buf.push_str(std::str::from_utf8(bytes)?);
135
136        #[cfg(feature = "encoding")]
137        decode_into(bytes, self.encoding, buf)?;
138
139        Ok(())
140    }
141
142    /// Decodes the `Cow` buffer, preserves the lifetime
143    pub(crate) fn decode_cow<'b>(
144        &self,
145        bytes: &Cow<'b, [u8]>,
146    ) -> Result<Cow<'b, str>, EncodingError> {
147        match bytes {
148            Cow::Borrowed(bytes) => self.decode(bytes),
149            // Convert to owned, because otherwise Cow will be bound with wrong lifetime
150            Cow::Owned(bytes) => Ok(self.decode(bytes)?.into_owned().into()),
151        }
152    }
153
154    /// Decodes the `Cow` buffer, normalizes XML EOLs, preserves the lifetime
155    pub(crate) fn content<'b>(
156        &self,
157        bytes: &Cow<'b, [u8]>,
158        normalize_eol: impl Fn(&str) -> Cow<str>,
159    ) -> Result<Cow<'b, str>, EncodingError> {
160        match bytes {
161            Cow::Borrowed(bytes) => {
162                let text = self.decode(bytes)?;
163                match normalize_eol(&text) {
164                    // If text borrowed after normalization that means that it's not changed
165                    Cow::Borrowed(_) => Ok(text),
166                    Cow::Owned(s) => Ok(Cow::Owned(s)),
167                }
168            }
169            Cow::Owned(bytes) => {
170                let text = self.decode(bytes)?;
171                let text = normalize_eol(&text);
172                // Convert to owned, because otherwise Cow will be bound with wrong lifetime
173                Ok(text.into_owned().into())
174            }
175        }
176    }
177}
178
/// Decodes `bytes` using the given `encoding`, borrowing from the input when
/// the decoder is able to do so.
///
/// Returns an error in case of malformed or non-representable sequences in the `bytes`.
#[cfg(feature = "encoding")]
pub fn decode<'b>(
    bytes: &'b [u8],
    encoding: &'static Encoding,
) -> Result<Cow<'b, str>, EncodingError> {
    match encoding.decode_without_bom_handling_and_without_replacement(bytes) {
        Some(text) => Ok(text),
        None => Err(EncodingError::Other(encoding)),
    }
}
191
/// Like [`decode`], but appends the decoded text to the pre-allocated `buf`.
///
/// Returns an error in case of malformed sequences in the `bytes`.
// NOTE(review): for non-UTF-8 encodings the decoder writes into `buf` while it runs,
// so `buf` presumably may keep partially decoded text on error -- confirm with encoding_rs
#[cfg(feature = "encoding")]
pub fn decode_into(
    bytes: &[u8],
    encoding: &'static Encoding,
    buf: &mut String,
) -> Result<(), EncodingError> {
    // UTF-8 input needs no transcoding: validate it and append as is
    if encoding == UTF_8 {
        let text = std::str::from_utf8(bytes)?;
        buf.push_str(text);
        return Ok(());
    }

    let mut decoder = encoding.new_decoder_without_bom_handling();
    let required = decoder
        .max_utf8_buffer_length_without_replacement(bytes.len())
        // SAFETY: None can be returned only if required size will overflow usize,
        // but in that case String::reserve also panics
        .unwrap();
    buf.reserve(required);

    // `true` tells the decoder that `bytes` is the last chunk of the input
    let (status, consumed) = decoder.decode_to_string_without_replacement(bytes, buf, true);
    match status {
        DecoderResult::InputEmpty => {
            debug_assert_eq!(consumed, bytes.len());
            Ok(())
        }
        DecoderResult::Malformed(_, _) => Err(EncodingError::Other(encoding)),
        // SAFETY: We allocate enough space above
        DecoderResult::OutputFull => unreachable!(),
    }
}
223
/// Automatic encoding detection of XML files using the
/// [recommended algorithm](https://www.w3.org/TR/xml11/#sec-guessing).
///
/// If the encoding is detected, `Some` is returned with the encoding and the size
/// of the BOM in bytes. The size is zero if detection was performed without a BOM.
///
/// If the encoding was not recognized, `None` is returned.
///
/// Because the [`encoding_rs`] crate supports only a subset of those encodings, only
/// the supported subset is detected, which is UTF-8, UTF-16 BE and UTF-16 LE.
///
/// The algorithm suggests examining up to the first 4 bytes to determine the
/// encoding according to the following table:
///
/// | Bytes       |Detected encoding
/// |-------------|------------------------------------------
/// | **BOM**
/// |`FE FF ## ##`|UTF-16, big-endian
/// |`FF FE ## ##`|UTF-16, little-endian
/// |`EF BB BF`   |UTF-8
/// | **No BOM**
/// |`00 3C 00 3F`|UTF-16 BE or ISO-10646-UCS-2 BE or similar 16-bit BE (use declared encoding to find the exact one)
/// |`3C 00 3F 00`|UTF-16 LE or ISO-10646-UCS-2 LE or similar 16-bit LE (use declared encoding to find the exact one)
/// |`3C 3F 78 6D`|UTF-8, ISO 646, ASCII, some part of ISO 8859, Shift-JIS, EUC, or any other 7-bit, 8-bit, or mixed-width encoding which ensures that the characters of ASCII have their normal positions, width, and values; the actual encoding declaration must be read to detect which of these applies, but since all of these encodings use the same bit patterns for the relevant ASCII characters, the encoding declaration itself may be read reliably
#[cfg(feature = "encoding")]
pub fn detect_encoding(bytes: &[u8]) -> Option<(&'static Encoding, usize)> {
    // With BOM: the second element is the length of the BOM
    if bytes.starts_with(UTF16_BE_BOM) {
        return Some((UTF_16BE, 2));
    }
    if bytes.starts_with(UTF16_LE_BOM) {
        return Some((UTF_16LE, 2));
    }
    if bytes.starts_with(UTF8_BOM) {
        return Some((UTF_8, 3));
    }

    // Without BOM: recognize the start of the `<?xm` sequence
    if bytes.starts_with(&[0x00, b'<', 0x00, b'?']) {
        // Some BE encoding, for example, UTF-16 or ISO-10646-UCS-2
        return Some((UTF_16BE, 0));
    }
    if bytes.starts_with(&[b'<', 0x00, b'?', 0x00]) {
        // Some LE encoding, for example, UTF-16 or ISO-10646-UCS-2
        return Some((UTF_16LE, 0));
    }
    if bytes.starts_with(&[b'<', b'?', b'x', b'm']) {
        // Some ASCII compatible encoding
        return Some((UTF_8, 0));
    }

    None
}
264}