quick_xml/reader/
buffered_reader.rs

1//! This is an implementation of [`Reader`] for reading from a [`BufRead`] as
2//! underlying byte stream.
3
4use std::fs::File;
5use std::io::{self, BufRead, BufReader};
6use std::path::Path;
7
8use crate::errors::{Error, Result};
9use crate::events::Event;
10use crate::name::QName;
11use crate::parser::Parser;
12use crate::reader::{BangType, ReadRefResult, ReadTextResult, Reader, Span, XmlSource};
13use crate::utils::is_whitespace;
14
/// Generates the bodies of the `XmlSource` methods for a buffered byte source.
///
/// Invoked without arguments, the macro emits synchronous methods that call
/// `fill_buf()` / `consume()` directly on `self`. When the four optional
/// arguments are given, the same code is emitted with the following changes:
///
/// - `$lf` is added as a lifetime parameter to the methods that borrow `buf`;
/// - `$reader` is the field of `self` on which `fill_buf()` / `consume()` are called;
/// - `$async` is placed in front of every `fn`;
/// - `$await` is appended to every `fill_buf()` call (and to the inner `peek_one()` call).
///
/// Every method retries `fill_buf()` when it fails with
/// [`io::ErrorKind::Interrupted`]; any other I/O error is returned to the caller.
macro_rules! impl_buffered_source {
    ($($lf:lifetime, $reader:tt, $async:ident, $await:ident)?) => {
        /// Consumes the UTF-8 BOM if the currently buffered data starts with it.
        #[cfg(not(feature = "encoding"))]
        #[inline]
        $($async)? fn remove_utf8_bom(&mut self) -> io::Result<()> {
            use crate::encoding::UTF8_BOM;

            loop {
                match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(chunk) => {
                        if chunk.starts_with(UTF8_BOM) {
                            self $(.$reader)? .consume(UTF8_BOM.len());
                        }
                        return Ok(());
                    }
                    // Interrupted read: just try again
                    Err(ref err) if err.kind() == io::ErrorKind::Interrupted => {}
                    Err(err) => return Err(err),
                }
            }
        }

        /// Runs `crate::encoding::detect_encoding` on the currently buffered
        /// data. When an encoding is reported, the BOM bytes that come with it
        /// are consumed and the encoding is returned; otherwise nothing is
        /// consumed and `None` is returned.
        #[cfg(feature = "encoding")]
        #[inline]
        $($async)? fn detect_encoding(&mut self) -> io::Result<Option<&'static encoding_rs::Encoding>> {
            loop {
                match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(chunk) => {
                        return match crate::encoding::detect_encoding(chunk) {
                            Some((encoding, bom_len)) => {
                                self $(.$reader)? .consume(bom_len);
                                Ok(Some(encoding))
                            }
                            None => Ok(None),
                        };
                    }
                    // Interrupted read: just try again
                    Err(ref err) if err.kind() == io::ErrorKind::Interrupted => {}
                    Err(err) => return Err(err),
                }
            }
        }

        /// Copies bytes from the source into `buf` until a `<`, a `&` or the
        /// end of input is met and advances `position` by the number of bytes
        /// consumed from the source.
        ///
        /// A terminating `<` is consumed but not stored in `buf`; a terminating
        /// `&` is left in the source.
        #[inline]
        $($async)? fn read_text $(<$lf>)? (
            &mut self,
            buf: &'b mut Vec<u8>,
            position: &mut u64,
        ) -> ReadTextResult<'b, &'b mut Vec<u8>> {
            let text_start = buf.len();
            // Number of bytes taken from the source by this call
            let mut consumed = 0u64;
            loop {
                let chunk = match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(bytes) if bytes.is_empty() => break,
                    Ok(bytes) => bytes,
                    Err(ref err) if err.kind() == io::ErrorKind::Interrupted => continue,
                    Err(err) => {
                        *position += consumed;
                        return ReadTextResult::Err(err);
                    }
                };

                // Search for start of markup or an entity or character reference
                let idx = match memchr::memchr2(b'<', b'&', chunk) {
                    Some(idx) => idx,
                    None => {
                        // The whole chunk is text, take it and ask for more
                        buf.extend_from_slice(chunk);

                        let len = chunk.len();
                        self $(.$reader)? .consume(len);
                        consumed += len as u64;
                        continue;
                    }
                };
                let at_markup = chunk[idx] == b'<';

                // Special handling is needed only while nothing has been read yet.
                // Once something was read, a Text event should be emitted instead
                if idx == 0 && consumed == 0 {
                    if at_markup {
                        self $(.$reader)? .consume(1);
                        *position += 1;
                        return ReadTextResult::Markup(buf);
                    }
                    // Do not consume `&` because it may be lone and we would need to
                    // return it as part of Text event
                    return ReadTextResult::Ref(buf);
                }

                buf.extend_from_slice(&chunk[..idx]);
                if at_markup {
                    // +1 to skip `<`
                    let used = idx + 1;
                    self $(.$reader)? .consume(used);
                    consumed += used as u64;

                    *position += consumed;
                    return ReadTextResult::UpToMarkup(&buf[text_start..]);
                }

                // Stopped at `&`: it stays in the source
                self $(.$reader)? .consume(idx);
                consumed += idx as u64;

                *position += consumed;
                return ReadTextResult::UpToRef(&buf[text_start..]);
            }

            *position += consumed;
            ReadTextResult::UpToEof(&buf[text_start..])
        }

        /// Copies a reference that starts with `&` from the source into `buf`
        /// and advances `position` by the number of bytes consumed.
        ///
        /// The leading `&` is always stored in `buf`. Reading stops at `;`
        /// or `<` (consumed, not stored), at another `&` (left in the
        /// source) or at the end of input.
        #[inline]
        $($async)? fn read_ref $(<$lf>)? (
            &mut self,
            buf: &'b mut Vec<u8>,
            position: &mut u64,
        ) -> ReadRefResult<'b> {
            let ref_start = buf.len();
            // Number of bytes taken from the source by this call
            let mut consumed = 0u64;
            loop {
                let chunk = match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(bytes) if bytes.is_empty() => break,
                    Ok(bytes) => bytes,
                    Err(ref err) if err.kind() == io::ErrorKind::Interrupted => continue,
                    Err(err) => {
                        *position += consumed;
                        return ReadRefResult::Err(err);
                    }
                };

                // `read_ref` is called when the first character is `&`, so it
                // has to be skipped explicitly before the search, otherwise it
                // would be mistaken for the end of the reference
                if consumed == 0 {
                    debug_assert_eq!(
                        chunk.first(),
                        Some(&b'&'),
                        "`read_ref` must be called at `&`"
                    );
                    // If that ampersand is lone, then it will be part of text
                    // and we should keep it
                    buf.push(b'&');
                    self $(.$reader)? .consume(1);
                    consumed += 1;
                    continue;
                }

                let idx = match memchr::memchr3(b';', b'&', b'<', chunk) {
                    Some(idx) => idx,
                    None => {
                        // No terminator in this chunk, take it and ask for more
                        buf.extend_from_slice(chunk);

                        let len = chunk.len();
                        self $(.$reader)? .consume(len);
                        consumed += len as u64;
                        continue;
                    }
                };
                let stop = chunk[idx];
                buf.extend_from_slice(&chunk[..idx]);

                // Do not consume `&` because it may be lone and we would need to
                // return it as part of Text event; `;` and `<` are skipped (+1)
                let used = if stop == b'&' { idx } else { idx + 1 };
                self $(.$reader)? .consume(used);
                consumed += used as u64;

                *position += consumed;

                return match stop {
                    b'&' => ReadRefResult::UpToRef(&buf[ref_start..]),
                    b';' => ReadRefResult::Ref(&buf[ref_start..]),
                    _ => ReadRefResult::UpToMarkup(&buf[ref_start..]),
                };
            }

            *position += consumed;
            ReadRefResult::UpToEof(&buf[ref_start..])
        }

        /// Feeds chunks of the source to `parser` until it reports an end
        /// index. Everything before that index is stored in `buf`, the byte at
        /// the index is consumed but not stored, and the stored bytes are
        /// returned.
        ///
        /// `position` is advanced by the number of bytes consumed. If the
        /// input ends first, `Error::Syntax(P::eof_error())` is returned.
        #[inline]
        $($async)? fn read_with<$($lf,)? P: Parser>(
            &mut self,
            mut parser: P,
            buf: &'b mut Vec<u8>,
            position: &mut u64,
        ) -> Result<&'b [u8]> {
            let content_start = buf.len();
            // Number of bytes taken from the source by this call
            let mut consumed = 0u64;
            loop {
                let chunk = match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(bytes) if bytes.is_empty() => break,
                    Ok(bytes) => bytes,
                    Err(ref err) if err.kind() == io::ErrorKind::Interrupted => continue,
                    Err(err) => {
                        *position += consumed;
                        return Err(Error::Io(err.into()));
                    }
                };

                match parser.feed(chunk) {
                    Some(end) => {
                        buf.extend_from_slice(&chunk[..end]);

                        // +1 for `>` which we do not include
                        self $(.$reader)? .consume(end + 1);
                        consumed += end as u64 + 1;

                        *position += consumed;
                        return Ok(&buf[content_start..]);
                    }
                    None => {
                        // The `>` symbol not yet found, continue reading
                        buf.extend_from_slice(chunk);

                        let len = chunk.len();
                        self $(.$reader)? .consume(len);
                        consumed += len as u64;
                    }
                }
            }

            *position += consumed;
            Err(Error::Syntax(P::eof_error()))
        }

        /// Reads a construct that starts with `!` into `buf` and returns its
        /// [`BangType`] together with the stored bytes.
        ///
        /// `position` is advanced by the number of bytes consumed, except when
        /// creating the [`BangType`] from the byte after `!` fails: that
        /// error is propagated before `position` is touched.
        #[inline]
        $($async)? fn read_bang_element $(<$lf>)? (
            &mut self,
            buf: &'b mut Vec<u8>,
            position: &mut u64,
        ) -> Result<(BangType, &'b [u8])> {
            // Peeked one bang ('!') before being called, so it's guaranteed to
            // start with it.
            let bang_start = buf.len();
            let mut consumed = 1;
            buf.push(b'!');
            self $(.$reader)? .consume(1);

            let mut bang_type = BangType::new(self.peek_one() $(.$await)? ?)?;

            loop {
                match self $(.$reader)? .fill_buf() $(.$await)? {
                    // Input ended before the construct was closed: leave the
                    // loop and report the error built from `bang_type`
                    Ok(chunk) if chunk.is_empty() => break,
                    // We only parse from `bang_start` because we don't want to
                    // consider whatever is in the buffer before the bang element
                    Ok(chunk) => match bang_type.parse(&buf[bang_start..], chunk) {
                        Some((tail, used)) => {
                            buf.extend_from_slice(tail);

                            self $(.$reader)? .consume(used);
                            consumed += used as u64;

                            *position += consumed;
                            return Ok((bang_type, &buf[bang_start..]));
                        }
                        None => {
                            // End not found yet, keep the chunk and continue
                            buf.extend_from_slice(chunk);

                            let len = chunk.len();
                            self $(.$reader)? .consume(len);
                            consumed += len as u64;
                        }
                    },
                    Err(ref err) if err.kind() == io::ErrorKind::Interrupted => continue,
                    Err(err) => {
                        *position += consumed;
                        return Err(Error::Io(err.into()));
                    }
                }
            }

            *position += consumed;
            Err(bang_type.to_err().into())
        }

        /// Consumes bytes for which `is_whitespace` returns `true` from the
        /// front of the source, advancing `position` by their count. Stops at
        /// the first other byte or at the end of input.
        #[inline]
        $($async)? fn skip_whitespace(&mut self, position: &mut u64) -> io::Result<()> {
            loop {
                match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(chunk) => {
                        let skipped = chunk.iter().take_while(|&&b| is_whitespace(b)).count();
                        if skipped == 0 {
                            return Ok(());
                        }
                        // The chunk may consist only of whitespace, so look
                        // at the next one as well
                        self $(.$reader)? .consume(skipped);
                        *position += skipped as u64;
                    }
                    // Interrupted read: just try again
                    Err(ref err) if err.kind() == io::ErrorKind::Interrupted => {}
                    Err(err) => return Err(err),
                }
            }
        }

        /// Returns the next byte of the source without consuming it, or `None`
        /// if `fill_buf()` yields no data.
        #[inline]
        $($async)? fn peek_one(&mut self) -> io::Result<Option<u8>> {
            loop {
                match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(chunk) => return Ok(chunk.first().copied()),
                    // Interrupted read: just try again
                    Err(ref err) if err.kind() == io::ErrorKind::Interrupted => {}
                    Err(err) => return Err(err),
                }
            }
        }
    };
}
323
324// Make it public for use in async implementations.
325// New rustc reports
326// > warning: the item `impl_buffered_source` is imported redundantly
327// so make it public only when async feature is enabled
328#[cfg(feature = "async-tokio")]
329pub(super) use impl_buffered_source;
330
331/// Implementation of `XmlSource` for any `BufRead` reader using a user-given
332/// `Vec<u8>` as buffer that will be borrowed by events.
333impl<'b, R: BufRead> XmlSource<'b, &'b mut Vec<u8>> for R {
334    impl_buffered_source!();
335}
336
337////////////////////////////////////////////////////////////////////////////////////////////////////
338
339/// This is an implementation for reading from a [`BufRead`] as underlying byte stream.
340impl<R: BufRead> Reader<R> {
341    /// Reads the next `Event`.
342    ///
343    /// This is the main entry point for reading XML `Event`s.
344    ///
345    /// `Event`s borrow `buf` and can be converted to own their data if needed (uses `Cow`
346    /// internally).
347    ///
348    /// Having the possibility to control the internal buffers gives you some additional benefits
349    /// such as:
350    ///
351    /// - Reduce the number of allocations by reusing the same buffer. For constrained systems,
352    ///   you can call `buf.clear()` once you are done with processing the event (typically at the
353    ///   end of your loop).
354    /// - Reserve the buffer length if you know the file size (using `Vec::with_capacity`).
355    ///
356    /// # Examples
357    ///
358    /// ```
359    /// # use pretty_assertions::assert_eq;
360    /// use quick_xml::events::Event;
361    /// use quick_xml::reader::Reader;
362    ///
363    /// let xml = r#"<tag1 att1 = "test">
364    ///                 <tag2><!--Test comment-->Test</tag2>
365    ///                 <tag2>Test 2</tag2>
366    ///              </tag1>"#;
367    /// let mut reader = Reader::from_str(xml);
368    /// reader.config_mut().trim_text(true);
369    /// let mut count = 0;
370    /// let mut buf = Vec::new();
371    /// let mut txt = Vec::new();
372    /// loop {
373    ///     match reader.read_event_into(&mut buf) {
374    ///         Ok(Event::Start(_)) => count += 1,
375    ///         Ok(Event::Text(e)) => txt.push(e.decode().unwrap().into_owned()),
376    ///         Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e),
377    ///         Ok(Event::Eof) => break,
378    ///         _ => (),
379    ///     }
380    ///     buf.clear();
381    /// }
382    /// assert_eq!(count, 3);
383    /// assert_eq!(txt, vec!["Test".to_string(), "Test 2".to_string()]);
384    /// ```
385    #[inline]
386    pub fn read_event_into<'b>(&mut self, buf: &'b mut Vec<u8>) -> Result<Event<'b>> {
387        self.read_event_impl(buf)
388    }
389
390    /// Reads until end element is found using provided buffer as intermediate
391    /// storage for events content. This function is supposed to be called after
392    /// you already read a [`Start`] event.
393    ///
394    /// Returns a span that cover content between `>` of an opening tag and `<` of
395    /// a closing tag or an empty slice, if [`expand_empty_elements`] is set and
396    /// this method was called after reading expanded [`Start`] event.
397    ///
398    /// Manages nested cases where parent and child elements have the _literally_
399    /// same name.
400    ///
401    /// If a corresponding [`End`] event is not found, an error of type [`Error::IllFormed`]
402    /// will be returned. In particularly, that error will be returned if you call
403    /// this method without consuming the corresponding [`Start`] event first.
404    ///
405    /// If your reader created from a string slice or byte array slice, it is
406    /// better to use [`read_to_end()`] method, because it will not copy bytes
407    /// into intermediate buffer.
408    ///
409    /// The provided `buf` buffer will be filled only by one event content at time.
410    /// Before reading of each event the buffer will be cleared. If you know an
411    /// appropriate size of each event, you can preallocate the buffer to reduce
412    /// number of reallocations.
413    ///
414    /// The `end` parameter should contain name of the end element _in the reader
415    /// encoding_. It is good practice to always get that parameter using
416    /// [`BytesStart::to_end()`] method.
417    ///
418    /// The correctness of the skipped events does not checked, if you disabled
419    /// the [`check_end_names`] option.
420    ///
421    /// # Namespaces
422    ///
423    /// While the `Reader` does not support namespace resolution, namespaces
424    /// does not change the algorithm for comparing names. Although the names
425    /// `a:name` and `b:name` where both prefixes `a` and `b` resolves to the
426    /// same namespace, are semantically equivalent, `</b:name>` cannot close
427    /// `<a:name>`, because according to [the specification]
428    ///
429    /// > The end of every element that begins with a **start-tag** MUST be marked
430    /// > by an **end-tag** containing a name that echoes the element's type as
431    /// > given in the **start-tag**
432    ///
433    /// # Examples
434    ///
435    /// This example shows, how you can skip XML content after you read the
436    /// start event.
437    ///
438    /// ```
439    /// # use pretty_assertions::assert_eq;
440    /// use quick_xml::events::{BytesStart, Event};
441    /// use quick_xml::reader::Reader;
442    ///
443    /// let mut reader = Reader::from_str(r#"
444    ///     <outer>
445    ///         <inner>
446    ///             <inner></inner>
447    ///             <inner/>
448    ///             <outer></outer>
449    ///             <outer/>
450    ///         </inner>
451    ///     </outer>
452    /// "#);
453    /// reader.config_mut().trim_text(true);
454    /// let mut buf = Vec::new();
455    ///
456    /// let start = BytesStart::new("outer");
457    /// let end   = start.to_end().into_owned();
458    ///
459    /// // First, we read a start event...
460    /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Start(start));
461    ///
462    /// // ...then, we could skip all events to the corresponding end event.
463    /// // This call will correctly handle nested <outer> elements.
464    /// // Note, however, that this method does not handle namespaces.
465    /// reader.read_to_end_into(end.name(), &mut buf).unwrap();
466    ///
467    /// // At the end we should get an Eof event, because we ate the whole XML
468    /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof);
469    /// ```
470    ///
471    /// [`Start`]: Event::Start
472    /// [`End`]: Event::End
473    /// [`BytesStart::to_end()`]: crate::events::BytesStart::to_end
474    /// [`read_to_end()`]: Self::read_to_end
475    /// [`expand_empty_elements`]: crate::reader::Config::expand_empty_elements
476    /// [`check_end_names`]: crate::reader::Config::check_end_names
477    /// [the specification]: https://www.w3.org/TR/xml11/#dt-etag
478    pub fn read_to_end_into(&mut self, end: QName, buf: &mut Vec<u8>) -> Result<Span> {
479        Ok(read_to_end!(self, end, buf, read_event_impl, {
480            buf.clear();
481        }))
482    }
483}
484
485impl Reader<BufReader<File>> {
486    /// Creates an XML reader from a file path.
487    pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
488        let file = File::open(path)?;
489        let reader = BufReader::new(file);
490        Ok(Self::from_reader(reader))
491    }
492}
493
#[cfg(test)]
mod test {
    use crate::reader::test::check;
    use crate::reader::XmlSource;

    /// Default buffer constructor: hands the byte array from the test back
    /// unchanged
    fn identity<T>(value: T) -> T {
        value
    }

    // Instantiate the shared reader test-suite for the buffered reader
    check!(
        #[test]
        read_event_impl,
        read_until_close,
        identity,
        &mut Vec::new()
    );
}