Skip to content

Serde support for serializing and deserializing binary blobs in XML files #788

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions Changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,15 @@

### New Features

- [#623]: Added `Reader::stream()` that can be used to read arbitrary data
from the inner reader while tracking the position for the XML reader.

### Bug Fixes

### Misc Changes

[#623]: https://github.com/tafia/quick-xml/issues/623


## 0.36.0 -- 2024-07-08

Expand Down
7 changes: 5 additions & 2 deletions src/de/map.rs
Original file line number Diff line number Diff line change
Expand Up @@ -247,15 +247,15 @@ where
// We shouldn't have both `$value` and `$text` fields in the same
// struct, so if we have a `$value` field, then we should deserialize
// text content to `$value`
DeEvent::Text(_) if self.has_value_field => {
DeEvent::Text(_) | DeEvent::Binary(_) if self.has_value_field => {
self.source = ValueSource::Content;
// Deserialize `key` from special attribute name which means
// that value should be taken from the text content of the
// XML node
let de = BorrowedStrDeserializer::<DeError>::new(VALUE_KEY);
seed.deserialize(de).map(Some)
}
DeEvent::Text(_) => {
DeEvent::Text(_) | DeEvent::Binary(_) => {
self.source = ValueSource::Text;
// Deserialize `key` from special attribute name which means
// that value should be taken from the text content of the
Expand Down Expand Up @@ -943,6 +943,9 @@ where
// SAFETY: we just checked that the next event is Text
_ => unreachable!(),
},
DeEvent::Binary(_) => Err(Self::Error::Unsupported(
"undecodable binary data among a sequence of xml elements".into(),
)),
DeEvent::Start(_) => match self.map.de.next()? {
DeEvent::Start(start) => seed
.deserialize(ElementDeserializer {
Expand Down
74 changes: 71 additions & 3 deletions src/de/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2056,6 +2056,31 @@ impl<'a> From<&'a str> for Text<'a> {
}
}

/// Raw bytes of a text node, kept undecoded and unescaped.
///
/// This is the payload carried by [`DeEvent::Binary`].
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Binary<'a> {
    /// The bytes themselves, either borrowed or owned.
    pub text: Cow<'a, [u8]>,
}

impl<'a> Deref for Binary<'a> {
    type Target = [u8];

    /// Exposes the payload as a plain byte slice.
    #[inline]
    fn deref(&self) -> &Self::Target {
        &self.text
    }
}

impl<'a> From<&'a [u8]> for Binary<'a> {
    /// Wraps a byte slice without copying it.
    #[inline]
    fn from(text: &'a [u8]) -> Self {
        let text = Cow::Borrowed(text);
        Self { text }
    }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

/// Simplified event which contains only these variants that used by deserializer
Expand All @@ -2074,6 +2099,8 @@ pub enum DeEvent<'a> {
/// [`Comment`]: Event::Comment
/// [`PI`]: Event::PI
Text(Text<'a>),
/// Binary undecoded
Binary(Binary<'a>),
/// End of XML document.
Eof,
}
Expand Down Expand Up @@ -2217,7 +2244,16 @@ impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> {
// FIXME: Actually, we should trim after decoding text, but now we trim before
continue;
}
self.drain_text(e.unescape_with(|entity| self.entity_resolver.resolve(entity))?)
match e
.unescape_with(|entity| self.entity_resolver.resolve(entity))
.map(|res| self.drain_text(res))
{
Ok(x) => x,
// Failed to unescape: treat the content as a binary blob.
Err(_) => Ok(DeEvent::Binary(Binary {
text: e.into_inner(),
})),
}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We definitely shouldn't rely on luck here. Binary should be explicitly requested for the field via flag in field name

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the best way/mechanism to maintain context in the code to keep track of flags like that?

}
PayloadEvent::CData(e) => self.drain_text(e.decode()?),
PayloadEvent::DocType(e) => {
Expand Down Expand Up @@ -2687,6 +2723,8 @@ where
fn read_string_impl(&mut self, allow_start: bool) -> Result<Cow<'de, str>, DeError> {
match self.next()? {
DeEvent::Text(e) => Ok(e.text),
// SAFETY: Binary event should never be emitted for decoded strings.
DeEvent::Binary(e) => unreachable!("{:?}", e),
// allow one nested level
DeEvent::Start(e) if allow_start => self.read_text(e.name()),
DeEvent::Start(e) => Err(DeError::UnexpectedStart(e.name().as_ref().to_owned())),
Expand All @@ -2708,10 +2746,12 @@ where
// The matching tag name is guaranteed by the reader
DeEvent::End(_) => Ok(e.text),
// SAFETY: Cannot be two consequent Text events, they would be merged into one
DeEvent::Text(_) => unreachable!(),
DeEvent::Text(_) | DeEvent::Binary(_) => unreachable!(),
DeEvent::Start(e) => Err(DeError::UnexpectedStart(e.name().as_ref().to_owned())),
DeEvent::Eof => Err(Error::missed_end(name, self.reader.decoder()).into()),
},
// SAFETY: Binary event should never be emitted for decoded strings.
DeEvent::Binary(e) => unreachable!("{:?}", e),
// We can get End event in case of `<tag></tag>` or `<tag/>` input
// Return empty text in that case
// The matching tag name is guaranteed by the reader
Expand Down Expand Up @@ -2827,6 +2867,30 @@ where
}
}

impl<'de, R> Deserializer<'de, IoReader<R>>
where
    R: BufRead,
{
    /// Creates a deserializer that pulls its input from an already constructed
    /// [`Reader`], copying the data into an internal buffer.
    ///
    /// If you already have a string use [`Self::from_str`] instead, because it
    /// will borrow instead of copy. If you have `&[u8]` which is known to represent
    /// UTF-8, you can decode it first before using [`Self::from_str`].
    ///
    /// Deserializer created with this method will not resolve custom entities.
    pub fn from_custom_reader(reader: Reader<R>) -> Self {
        // The supplied reader is used as is; only the trimming state and the
        // scratch buffer start out fresh.
        let source = IoReader {
            reader,
            start_trimmer: StartTrimmer::default(),
            buf: Vec::new(),
        };
        Self::new(source, PredefinedEntityResolver)
    }
}

impl<'de, R, E> Deserializer<'de, IoReader<R>, E>
where
R: BufRead,
Expand Down Expand Up @@ -2884,6 +2948,10 @@ where
Cow::Borrowed(s) => visitor.visit_borrowed_str(s),
Cow::Owned(s) => visitor.visit_string(s),
},
DeEvent::Binary(e) => match e.text {
Cow::Borrowed(s) => visitor.visit_borrowed_bytes(s),
Cow::Owned(s) => visitor.visit_byte_buf(s),
},
DeEvent::Eof => Err(DeError::UnexpectedEof),
}
}
Expand Down Expand Up @@ -2914,7 +2982,7 @@ where
self.read_to_end(s.name())?;
visitor.visit_unit()
}
DeEvent::Text(_) => visitor.visit_unit(),
DeEvent::Text(_) | DeEvent::Binary(_) => visitor.visit_unit(),
// SAFETY: The reader is guaranteed that we don't have unmatched tags
// If we are here, then our deserializer has a bug
DeEvent::End(e) => unreachable!("{:?}", e),
Expand Down
2 changes: 1 addition & 1 deletion src/de/var.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ where
seed.deserialize(QNameDeserializer::from_elem(e.raw_name(), decoder)?)?,
false,
),
DeEvent::Text(_) => (
DeEvent::Text(_) | DeEvent::Binary(_) => (
seed.deserialize(BorrowedStrDeserializer::<DeError>::new(TEXT_KEY))?,
true,
),
Expand Down
6 changes: 6 additions & 0 deletions src/errors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -464,4 +464,10 @@ pub mod serialize {
Self::Custom(e.to_string())
}
}
impl From<std::io::Error> for DeError {
    // Only the display text of the I/O error is kept; it is stored in the
    // `Custom` variant.
    #[inline]
    fn from(err: std::io::Error) -> Self {
        let message = err.to_string();
        Self::Custom(message)
    }
}
}
48 changes: 46 additions & 2 deletions src/reader/async_tokio.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,17 @@
//! as underlying byte stream. This reader fully implements async/await so reading
//! can use non-blocking I/O.

use tokio::io::{self, AsyncBufRead, AsyncBufReadExt};
use std::pin::Pin;
use std::task::{Context, Poll};

use tokio::io::{self, AsyncBufRead, AsyncBufReadExt, AsyncRead, ReadBuf};

use crate::errors::{Error, Result, SyntaxError};
use crate::events::Event;
use crate::name::{QName, ResolveResult};
use crate::parser::{ElementParser, Parser, PiParser};
use crate::reader::buffered_reader::impl_buffered_source;
use crate::reader::{BangType, NsReader, ParseState, ReadTextResult, Reader, Span};
use crate::reader::{BangType, BinaryStream, NsReader, ParseState, ReadTextResult, Reader, Span};
use crate::utils::is_whitespace;

/// A struct for read XML asynchronously from an [`AsyncBufRead`].
Expand All @@ -24,6 +27,47 @@ impl<'a, R: AsyncBufRead + Unpin> TokioAdapter<'a, R> {

////////////////////////////////////////////////////////////////////////////////////////////////////

impl<'r, R> AsyncRead for BinaryStream<'r, R>
where
    R: AsyncRead + Unpin,
{
    /// Forwards the read to the wrapped reader and advances the tracked
    /// offset by the number of bytes that were added to `buf`.
    fn poll_read(
        self: Pin<&mut Self>,
        cx: &mut Context<'_>,
        buf: &mut ReadBuf<'_>,
    ) -> Poll<io::Result<()>> {
        let stream = self.get_mut();
        // Free space before the read; the difference afterwards is the
        // amount of data that was produced.
        let before = buf.remaining();
        let outcome = Pin::new(&mut *stream.inner).poll_read(cx, buf);

        // The offset is only moved for a successfully completed read;
        // `Pending` and `Err` outcomes leave it untouched.
        if matches!(outcome, Poll::Ready(Ok(()))) {
            let read = before - buf.remaining();
            *stream.offset += read as u64;
        }
        outcome
    }
}

impl<'r, R> AsyncBufRead for BinaryStream<'r, R>
where
    R: AsyncBufRead + Unpin,
{
    /// Delegates to the wrapped reader. Peeking at the buffer does not
    /// change the tracked offset.
    #[inline]
    fn poll_fill_buf(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<&[u8]>> {
        let stream = self.get_mut();
        Pin::new(&mut *stream.inner).poll_fill_buf(cx)
    }

    /// Marks `amt` bytes as consumed in the wrapped reader and advances the
    /// tracked offset by the same amount.
    #[inline]
    fn consume(self: Pin<&mut Self>, amt: usize) {
        let stream = self.get_mut();
        Pin::new(&mut *stream.inner).consume(amt);
        *stream.offset += amt as u64;
    }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

impl<R: AsyncBufRead + Unpin> Reader<R> {
/// An asynchronous version of [`read_event_into()`]. Reads the next event into
/// given buffer.
Expand Down
113 changes: 113 additions & 0 deletions src/reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -567,6 +567,65 @@ impl EncodingRef {

////////////////////////////////////////////////////////////////////////////////////////////////////

/// A direct stream to the reader wrapped by a [`Reader`], which keeps
/// [`Reader::buffer_position()`] up to date as data is read from it.
#[derive(Debug)]
#[must_use = "streams do nothing unless read or polled"]
pub struct BinaryStream<'r, R> {
    /// The wrapped source that bytes are pulled from.
    inner: &'r mut R,
    /// Position counter that is advanced whenever bytes are consumed.
    offset: &'r mut u64,
}

impl<'r, R> BinaryStream<'r, R> {
    /// Returns the current position, in bytes, in the original source.
    #[inline]
    pub const fn offset(&self) -> u64 {
        *self.offset
    }

    /// Gets a shared reference to the underlying reader.
    #[inline]
    pub const fn get_ref(&self) -> &R {
        self.inner
    }

    /// Gets a mutable reference to the underlying reader.
    ///
    /// Reading through the returned reference bypasses this wrapper, so the
    /// position is not updated for such reads.
    #[inline]
    pub fn get_mut(&mut self) -> &mut R {
        self.inner
    }
}

impl<'r, R> io::Read for BinaryStream<'r, R>
where
    R: io::Read,
{
    /// Reads from the wrapped reader and adds the number of bytes read to
    /// the position. A failed read leaves the position unchanged.
    #[inline]
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        self.inner.read(buf).map(|count| {
            *self.offset += count as u64;
            count
        })
    }
}

impl<'r, R> io::BufRead for BinaryStream<'r, R>
where
    R: io::BufRead,
{
    /// Delegates to the wrapped reader; peeking does not move the position.
    #[inline]
    fn fill_buf(&mut self) -> io::Result<&[u8]> {
        self.inner.fill_buf()
    }

    /// Consumes `amt` bytes in the wrapped reader and advances the position
    /// by the same amount.
    #[inline]
    fn consume(&mut self, amt: usize) {
        self.inner.consume(amt);
        *self.offset += amt as u64;
    }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

/// A low level encoding-agnostic XML event reader.
///
/// Consumes bytes and streams XML [`Event`]s.
Expand Down Expand Up @@ -759,6 +818,60 @@ impl<R> Reader<R> {
pub const fn decoder(&self) -> Decoder {
self.state.decoder()
}

/// Gets direct access to the underlying reader, while tracking the amount of
/// data read and updating [`Reader::buffer_position()`] accordingly.
///
/// # Example
///
/// This example demonstrates how it is possible to read embedded binary data.
/// Such XML documents exist in the wild.
///
/// ```
/// # use pretty_assertions::assert_eq;
/// use std::io::{BufRead, Read};
/// use quick_xml::events::{BytesEnd, BytesStart, Event};
/// use quick_xml::reader::Reader;
///
/// let mut reader = Reader::from_str("<tag>binary << data&></tag>");
/// //                                 ^    ^               ^     ^
/// //                                 0    5               21    27
///
/// assert_eq!(
/// (reader.read_event().unwrap(), reader.buffer_position()),
/// // 5 - end of the `<tag>`
/// (Event::Start(BytesStart::new("tag")), 5)
/// );
///
/// // Reading directly from the underlying reader will not update the position
/// // let mut inner = reader.get_mut();
///
/// // Reading from the stream() advances the position
/// let mut inner = reader.stream();
///
/// // Read binary data. We must somehow know its size in advance
/// let mut binary = [0u8; 16];
/// inner.read_exact(&mut binary).unwrap();
/// assert_eq!(&binary, b"binary << data&>");
/// // 21 - end of the `binary << data&>`
/// assert_eq!(inner.offset(), 21);
/// assert_eq!(reader.buffer_position(), 21);
///
/// assert_eq!(
/// (reader.read_event().unwrap(), reader.buffer_position()),
/// // 27 - end of the `</tag>`
/// (Event::End(BytesEnd::new("tag")), 27)
/// );
///
/// assert_eq!(reader.read_event().unwrap(), Event::Eof);
/// ```
#[inline]
pub fn stream(&mut self) -> BinaryStream<R> {
BinaryStream {
inner: &mut self.reader,
offset: &mut self.state.offset,
}
}
}

/// Private sync reading methods
Expand Down
Loading