quick_xml/reader/buffered_reader.rs
1//! This is an implementation of [`Reader`] for reading from a [`BufRead`] as
2//! underlying byte stream.
3
4use std::fs::File;
5use std::io::{self, BufRead, BufReader};
6use std::path::Path;
7
8use crate::errors::{Error, Result};
9use crate::events::Event;
10use crate::name::QName;
11use crate::parser::Parser;
12use crate::reader::{BangType, ReadRefResult, ReadTextResult, Reader, Span, XmlSource};
13use crate::utils::is_whitespace;
14
/// Generates the bodies of the `XmlSource` methods for a buffered byte source.
///
/// Invoked without arguments the macro emits ordinary synchronous methods that
/// call `fill_buf()` / `consume()` directly on `self`. The optional argument
/// list customizes the generated code:
///
/// - `$lf`: a lifetime declared as a generic parameter of the methods that
///   borrow the output buffer;
/// - `$reader`: a field of `self` on which `fill_buf()` / `consume()` are called;
/// - `$async`: a keyword placed in front of every generated `fn`;
/// - `$await`: a postfix applied to every `fill_buf()` call and to the internal
///   `peek_one()` call.
///
/// Every generated method retries `fill_buf()` when it fails with
/// `io::ErrorKind::Interrupted`.
macro_rules! impl_buffered_source {
    ($($lf:lifetime, $reader:tt, $async:ident, $await:ident)?) => {
        /// Consumes the UTF-8 byte order mark if the currently buffered bytes
        /// start with it; otherwise consumes nothing.
        #[cfg(not(feature = "encoding"))]
        #[inline]
        $($async)? fn remove_utf8_bom(&mut self) -> io::Result<()> {
            use crate::encoding::UTF8_BOM;

            loop {
                match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(bytes) => {
                        if bytes.starts_with(UTF8_BOM) {
                            self $(.$reader)? .consume(UTF8_BOM.len());
                        }
                        return Ok(());
                    }
                    // Interrupted reads are simply retried
                    Err(ref err) if err.kind() == io::ErrorKind::Interrupted => {}
                    Err(err) => return Err(err),
                }
            }
        }

        /// Runs `crate::encoding::detect_encoding` over the currently buffered
        /// bytes. When an encoding is recognized, the reported BOM length is
        /// consumed and the encoding is returned; otherwise nothing is consumed
        /// and `None` is returned.
        #[cfg(feature = "encoding")]
        #[inline]
        $($async)? fn detect_encoding(&mut self) -> io::Result<Option<&'static encoding_rs::Encoding>> {
            loop {
                match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(bytes) => {
                        return Ok(match crate::encoding::detect_encoding(bytes) {
                            Some((enc, bom_len)) => {
                                self $(.$reader)? .consume(bom_len);
                                Some(enc)
                            }
                            None => None,
                        });
                    }
                    // Interrupted reads are simply retried
                    Err(ref err) if err.kind() == io::ErrorKind::Interrupted => {}
                    Err(err) => return Err(err),
                }
            }
        }

        /// Copies bytes into `buf` until a `<`, a `&` or the end of input is met.
        ///
        /// - `<` as the very first byte: it is consumed and `Markup` is returned;
        /// - `&` as the very first byte: nothing is consumed and `Ref` is returned;
        /// - `<` later on: the text before it is stored, `<` is consumed and
        ///   `UpToMarkup` is returned;
        /// - `&` later on: the text before it is stored, `&` is left in the
        ///   source and `UpToRef` is returned;
        /// - end of input: `UpToEof` with everything that was read.
        ///
        /// `position` is advanced by the number of bytes consumed from the source.
        #[inline]
        $($async)? fn read_text $(<$lf>)? (
            &mut self,
            buf: &'b mut Vec<u8>,
            position: &mut u64,
        ) -> ReadTextResult<'b, &'b mut Vec<u8>> {
            // Where the data of this call begins inside of `buf`
            let origin = buf.len();
            // Count of bytes consumed from the source during this call
            let mut taken = 0;
            loop {
                let chunk = match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(bytes) if bytes.is_empty() => break,
                    Ok(bytes) => bytes,
                    Err(ref err) if err.kind() == io::ErrorKind::Interrupted => continue,
                    Err(err) => {
                        *position += taken;
                        return ReadTextResult::Err(err);
                    }
                };

                // Look for the start of markup or of an entity / character reference
                let idx = match memchr::memchr2(b'<', b'&', chunk) {
                    Some(idx) => idx,
                    None => {
                        // Plain text only: store the whole chunk and ask for more
                        buf.extend_from_slice(chunk);

                        let len = chunk.len();
                        self $(.$reader)? .consume(len);
                        taken += len as u64;
                        continue;
                    }
                };
                let at_markup = chunk[idx] == b'<';

                // The delimiter is the very first byte seen by this call, so there
                // is no text to emit. On later iterations something was already
                // read and must be returned as text instead
                if idx == 0 && taken == 0 {
                    if at_markup {
                        self $(.$reader)? .consume(1);
                        *position += 1;
                        return ReadTextResult::Markup(buf);
                    }
                    // `&` stays in the source: it may turn out to be a lone
                    // ampersand which have to be returned as part of a text
                    return ReadTextResult::Ref(buf);
                }

                buf.extend_from_slice(&chunk[..idx]);

                // `<` is skipped (+1), `&` is kept in the source
                let len = if at_markup { idx + 1 } else { idx };
                self $(.$reader)? .consume(len);
                taken += len as u64;

                *position += taken;
                return if at_markup {
                    ReadTextResult::UpToMarkup(&buf[origin..])
                } else {
                    ReadTextResult::UpToRef(&buf[origin..])
                };
            }

            *position += taken;
            ReadTextResult::UpToEof(&buf[origin..])
        }

        /// Reads a reference that starts at the current `&` into `buf`.
        ///
        /// The leading `&` is always stored in `buf`. Reading then continues
        /// until one of the following is found:
        ///
        /// - `;`: consumed but not stored, `Ref` is returned;
        /// - `<`: consumed but not stored, `UpToMarkup` is returned;
        /// - another `&`: left in the source, `UpToRef` is returned;
        /// - end of input: `UpToEof` is returned.
        ///
        /// `position` is advanced by the number of bytes consumed from the source.
        #[inline]
        $($async)? fn read_ref $(<$lf>)? (
            &mut self,
            buf: &'b mut Vec<u8>,
            position: &mut u64,
        ) -> ReadRefResult<'b> {
            // Where the data of this call begins inside of `buf`
            let origin = buf.len();
            // Count of bytes consumed from the source during this call
            let mut taken = 0;
            loop {
                let chunk = match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(bytes) if bytes.is_empty() => break,
                    Ok(bytes) => bytes,
                    Err(ref err) if err.kind() == io::ErrorKind::Interrupted => continue,
                    Err(err) => {
                        *position += taken;
                        return ReadRefResult::Err(err);
                    }
                };

                // The method is called when the first byte is `&`. Step over it
                // explicitly at the first iteration, otherwise the search below
                // would treat it as the end of the reference
                if taken == 0 {
                    debug_assert_eq!(
                        chunk.first(),
                        Some(&b'&'),
                        "`read_ref` must be called at `&`"
                    );
                    // A lone ampersand becomes part of a text, so it is kept
                    buf.push(b'&');
                    self $(.$reader)? .consume(1);
                    taken += 1;
                    continue;
                }

                let idx = match memchr::memchr3(b';', b'&', b'<', chunk) {
                    Some(idx) => idx,
                    None => {
                        // No terminator yet: store the whole chunk and ask for more
                        buf.extend_from_slice(chunk);

                        let len = chunk.len();
                        self $(.$reader)? .consume(len);
                        taken += len as u64;
                        continue;
                    }
                };
                let stop = chunk[idx];

                buf.extend_from_slice(&chunk[..idx]);

                // The next `&` is not consumed because it may be lone and then
                // it have to be returned as part of a text;
                // the end `;` or `<` is skipped (+1)
                let len = if stop == b'&' { idx } else { idx + 1 };
                self $(.$reader)? .consume(len);
                taken += len as u64;

                *position += taken;

                return match stop {
                    b'&' => ReadRefResult::UpToRef(&buf[origin..]),
                    b';' => ReadRefResult::Ref(&buf[origin..]),
                    // The only remaining byte that is searched for is `<`
                    _ => ReadRefResult::UpToMarkup(&buf[origin..]),
                };
            }

            *position += taken;
            ReadRefResult::UpToEof(&buf[origin..])
        }

        /// Feeds the source to `parser` chunk by chunk until it reports the index
        /// of the closing byte.
        ///
        /// Everything before that index is stored in `buf`, the closing byte
        /// itself is consumed but not stored. If the input ends first, the error
        /// produced by `P::eof_error()` is returned. `position` is advanced by the
        /// number of bytes consumed from the source.
        #[inline]
        $($async)? fn read_with<$($lf,)? P: Parser>(
            &mut self,
            mut parser: P,
            buf: &'b mut Vec<u8>,
            position: &mut u64,
        ) -> Result<&'b [u8]> {
            // Where the data of this call begins inside of `buf`
            let origin = buf.len();
            // Count of bytes consumed from the source during this call
            let mut taken = 0;
            loop {
                let chunk = match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(bytes) if bytes.is_empty() => break,
                    Ok(bytes) => bytes,
                    Err(ref err) if err.kind() == io::ErrorKind::Interrupted => continue,
                    Err(err) => {
                        *position += taken;
                        return Err(Error::Io(err.into()));
                    }
                };

                match parser.feed(chunk) {
                    Some(end) => {
                        buf.extend_from_slice(&chunk[..end]);

                        // +1 for `>` which is consumed but not included
                        let len = end + 1;
                        self $(.$reader)? .consume(len);
                        taken += len as u64;

                        *position += taken;
                        return Ok(&buf[origin..]);
                    }
                    None => {
                        // The `>` symbol is not found yet, continue reading
                        buf.extend_from_slice(chunk);

                        let len = chunk.len();
                        self $(.$reader)? .consume(len);
                        taken += len as u64;
                    }
                }
            }

            *position += taken;
            Err(Error::Syntax(P::eof_error()))
        }

        /// Reads a construct that starts with `!` into `buf`.
        ///
        /// The caller has already peeked the `!`, so it is stored and consumed
        /// unconditionally. The kind of the construct is determined by
        /// `BangType::new` from the next peeked byte, and `BangType::parse`
        /// decides where the construct ends. If the input ends first, the error
        /// produced by `BangType::to_err` is returned. `position` is advanced by
        /// the number of bytes consumed from the source.
        #[inline]
        $($async)? fn read_bang_element $(<$lf>)? (
            &mut self,
            buf: &'b mut Vec<u8>,
            position: &mut u64,
        ) -> Result<(BangType, &'b [u8])> {
            // Where the data of this call begins inside of `buf`
            let origin = buf.len();
            // The bang is guaranteed to be the first byte and is counted right away
            let mut taken = 1;
            buf.push(b'!');
            self $(.$reader)? .consume(1);

            let mut bang_type = BangType::new(self.peek_one() $(.$await)? ?)?;

            loop {
                let chunk = match self $(.$reader)? .fill_buf() $(.$await)? {
                    // End of input: leave the loop and report the error below
                    Ok(bytes) if bytes.is_empty() => break,
                    Ok(bytes) => bytes,
                    Err(ref err) if err.kind() == io::ErrorKind::Interrupted => continue,
                    Err(err) => {
                        *position += taken;
                        return Err(Error::Io(err.into()));
                    }
                };

                // Only the part of `buf` written by this call is passed to the
                // parser, whatever was in the buffer before the bang element
                // must not be considered
                match bang_type.parse(&buf[origin..], chunk) {
                    Some((content, len)) => {
                        buf.extend_from_slice(content);

                        self $(.$reader)? .consume(len);
                        taken += len as u64;

                        *position += taken;
                        return Ok((bang_type, &buf[origin..]));
                    }
                    None => {
                        buf.extend_from_slice(chunk);

                        let len = chunk.len();
                        self $(.$reader)? .consume(len);
                        taken += len as u64;
                    }
                }
            }

            *position += taken;
            Err(bang_type.to_err().into())
        }

        /// Consumes leading bytes for which `is_whitespace` returns `true`,
        /// advancing `position` by their count. Stops at the first other byte
        /// or when the source returns an empty buffer.
        #[inline]
        $($async)? fn skip_whitespace(&mut self, position: &mut u64) -> io::Result<()> {
            loop {
                match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(bytes) => {
                        let skipped = bytes.iter().take_while(|&&b| is_whitespace(b)).count();
                        if skipped == 0 {
                            return Ok(());
                        }
                        // The whole chunk could be whitespace, so look at the next one too
                        self $(.$reader)? .consume(skipped);
                        *position += skipped as u64;
                    }
                    // Interrupted reads are simply retried
                    Err(ref err) if err.kind() == io::ErrorKind::Interrupted => {}
                    Err(err) => return Err(err),
                }
            }
        }

        /// Returns the next byte of the source without consuming it, or `None`
        /// if the source returns an empty buffer.
        #[inline]
        $($async)? fn peek_one(&mut self) -> io::Result<Option<u8>> {
            loop {
                match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(bytes) => return Ok(bytes.first().copied()),
                    // Interrupted reads are simply retried
                    Err(ref err) if err.kind() == io::ErrorKind::Interrupted => {}
                    Err(err) => return Err(err),
                }
            }
        }
    };
}
323
324// Make it public for use in async implementations.
325// New rustc reports
326// > warning: the item `impl_buffered_source` is imported redundantly
327// so make it public only when async feature is enabled
328#[cfg(feature = "async-tokio")]
329pub(super) use impl_buffered_source;
330
331/// Implementation of `XmlSource` for any `BufRead` reader using a user-given
332/// `Vec<u8>` as buffer that will be borrowed by events.
333impl<'b, R: BufRead> XmlSource<'b, &'b mut Vec<u8>> for R {
334 impl_buffered_source!();
335}
336
337////////////////////////////////////////////////////////////////////////////////////////////////////
338
339/// This is an implementation for reading from a [`BufRead`] as underlying byte stream.
340impl<R: BufRead> Reader<R> {
341 /// Reads the next `Event`.
342 ///
343 /// This is the main entry point for reading XML `Event`s.
344 ///
345 /// `Event`s borrow `buf` and can be converted to own their data if needed (uses `Cow`
346 /// internally).
347 ///
348 /// Having the possibility to control the internal buffers gives you some additional benefits
349 /// such as:
350 ///
351 /// - Reduce the number of allocations by reusing the same buffer. For constrained systems,
352 /// you can call `buf.clear()` once you are done with processing the event (typically at the
353 /// end of your loop).
354 /// - Reserve the buffer length if you know the file size (using `Vec::with_capacity`).
355 ///
356 /// # Examples
357 ///
358 /// ```
359 /// # use pretty_assertions::assert_eq;
360 /// use quick_xml::events::Event;
361 /// use quick_xml::reader::Reader;
362 ///
363 /// let xml = r#"<tag1 att1 = "test">
364 /// <tag2><!--Test comment-->Test</tag2>
365 /// <tag2>Test 2</tag2>
366 /// </tag1>"#;
367 /// let mut reader = Reader::from_str(xml);
368 /// reader.config_mut().trim_text(true);
369 /// let mut count = 0;
370 /// let mut buf = Vec::new();
371 /// let mut txt = Vec::new();
372 /// loop {
373 /// match reader.read_event_into(&mut buf) {
374 /// Ok(Event::Start(_)) => count += 1,
375 /// Ok(Event::Text(e)) => txt.push(e.decode().unwrap().into_owned()),
376 /// Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e),
377 /// Ok(Event::Eof) => break,
378 /// _ => (),
379 /// }
380 /// buf.clear();
381 /// }
382 /// assert_eq!(count, 3);
383 /// assert_eq!(txt, vec!["Test".to_string(), "Test 2".to_string()]);
384 /// ```
385 #[inline]
386 pub fn read_event_into<'b>(&mut self, buf: &'b mut Vec<u8>) -> Result<Event<'b>> {
387 self.read_event_impl(buf)
388 }
389
390 /// Reads until end element is found using provided buffer as intermediate
391 /// storage for events content. This function is supposed to be called after
392 /// you already read a [`Start`] event.
393 ///
394 /// Returns a span that cover content between `>` of an opening tag and `<` of
395 /// a closing tag or an empty slice, if [`expand_empty_elements`] is set and
396 /// this method was called after reading expanded [`Start`] event.
397 ///
398 /// Manages nested cases where parent and child elements have the _literally_
399 /// same name.
400 ///
401 /// If a corresponding [`End`] event is not found, an error of type [`Error::IllFormed`]
402 /// will be returned. In particularly, that error will be returned if you call
403 /// this method without consuming the corresponding [`Start`] event first.
404 ///
405 /// If your reader created from a string slice or byte array slice, it is
406 /// better to use [`read_to_end()`] method, because it will not copy bytes
407 /// into intermediate buffer.
408 ///
409 /// The provided `buf` buffer will be filled only by one event content at time.
410 /// Before reading of each event the buffer will be cleared. If you know an
411 /// appropriate size of each event, you can preallocate the buffer to reduce
412 /// number of reallocations.
413 ///
414 /// The `end` parameter should contain name of the end element _in the reader
415 /// encoding_. It is good practice to always get that parameter using
416 /// [`BytesStart::to_end()`] method.
417 ///
418 /// The correctness of the skipped events does not checked, if you disabled
419 /// the [`check_end_names`] option.
420 ///
421 /// # Namespaces
422 ///
423 /// While the `Reader` does not support namespace resolution, namespaces
424 /// does not change the algorithm for comparing names. Although the names
425 /// `a:name` and `b:name` where both prefixes `a` and `b` resolves to the
426 /// same namespace, are semantically equivalent, `</b:name>` cannot close
427 /// `<a:name>`, because according to [the specification]
428 ///
429 /// > The end of every element that begins with a **start-tag** MUST be marked
430 /// > by an **end-tag** containing a name that echoes the element's type as
431 /// > given in the **start-tag**
432 ///
433 /// # Examples
434 ///
435 /// This example shows, how you can skip XML content after you read the
436 /// start event.
437 ///
438 /// ```
439 /// # use pretty_assertions::assert_eq;
440 /// use quick_xml::events::{BytesStart, Event};
441 /// use quick_xml::reader::Reader;
442 ///
443 /// let mut reader = Reader::from_str(r#"
444 /// <outer>
445 /// <inner>
446 /// <inner></inner>
447 /// <inner/>
448 /// <outer></outer>
449 /// <outer/>
450 /// </inner>
451 /// </outer>
452 /// "#);
453 /// reader.config_mut().trim_text(true);
454 /// let mut buf = Vec::new();
455 ///
456 /// let start = BytesStart::new("outer");
457 /// let end = start.to_end().into_owned();
458 ///
459 /// // First, we read a start event...
460 /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Start(start));
461 ///
462 /// // ...then, we could skip all events to the corresponding end event.
463 /// // This call will correctly handle nested <outer> elements.
464 /// // Note, however, that this method does not handle namespaces.
465 /// reader.read_to_end_into(end.name(), &mut buf).unwrap();
466 ///
467 /// // At the end we should get an Eof event, because we ate the whole XML
468 /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof);
469 /// ```
470 ///
471 /// [`Start`]: Event::Start
472 /// [`End`]: Event::End
473 /// [`BytesStart::to_end()`]: crate::events::BytesStart::to_end
474 /// [`read_to_end()`]: Self::read_to_end
475 /// [`expand_empty_elements`]: crate::reader::Config::expand_empty_elements
476 /// [`check_end_names`]: crate::reader::Config::check_end_names
477 /// [the specification]: https://www.w3.org/TR/xml11/#dt-etag
478 pub fn read_to_end_into(&mut self, end: QName, buf: &mut Vec<u8>) -> Result<Span> {
479 Ok(read_to_end!(self, end, buf, read_event_impl, {
480 buf.clear();
481 }))
482 }
483}
484
485impl Reader<BufReader<File>> {
486 /// Creates an XML reader from a file path.
487 pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
488 let file = File::open(path)?;
489 let reader = BufReader::new(file);
490 Ok(Self::from_reader(reader))
491 }
492}
493
#[cfg(test)]
mod test {
    use crate::reader::{test::check, XmlSource};

    /// Buffer constructor for the tests: the byte array supplied by a test is
    /// handed back untouched
    fn identity<T>(value: T) -> T {
        value
    }

    // Run the reader tests generated by `check!` against the buffered
    // implementation, with a fresh `Vec` as the event buffer
    check!(
        #[test]
        read_event_impl,
        read_until_close,
        identity,
        &mut Vec::new()
    );
}
511}