Skip to content

RUST-870 Support deserializing directly from raw BSON #276

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
7d1e550
wip raw deserializer
patrickfreed Jun 3, 2021
799f18b
wip binary
patrickfreed Jun 23, 2021
382b742
wip raw deserialization
patrickfreed Jun 24, 2021
4b70b1f
finish datetime deserialization
patrickfreed Jun 28, 2021
761068c
regex deserialization
patrickfreed Jun 28, 2021
845e9f8
dbpointer code code w scope
patrickfreed Jun 28, 2021
ca69e9b
take ownership in to_extended_document
patrickfreed Jun 28, 2021
fdec423
timestamp
patrickfreed Jun 28, 2021
2ca22ea
minkey maxkey
patrickfreed Jun 28, 2021
9030336
decimal128
patrickfreed Jun 28, 2021
45a99ad
combine documentaccess and seqaccess
patrickfreed Jun 28, 2021
488ed41
deduplicate deserialization logic
patrickfreed Jun 28, 2021
3b44841
wip corpus
patrickfreed Jun 28, 2021
6728082
finish corpus
patrickfreed Jun 29, 2021
1f667bb
ensure serde tests are passing, decimal128, enums
patrickfreed Jun 30, 2021
93ca2fe
slice based deserialization
patrickfreed Jul 2, 2021
caa9f5b
borrow keys in Bson deserialization
patrickfreed Jul 2, 2021
3647aa8
Revert "borrow keys in Bson deserialization"
patrickfreed Jul 2, 2021
90942ba
add test for all types
patrickfreed Jul 2, 2021
cc8c8ff
various cleanup
patrickfreed Jul 2, 2021
e14a74f
bump MSRV to 1.48
patrickfreed Jul 2, 2021
916e782
handle overreads
patrickfreed Jul 2, 2021
e9b38d3
consolidate key deserialization
patrickfreed Jul 2, 2021
77668c6
enable decimal128 tests, use $decimal128 instead of $decimal128Bytes
patrickfreed Jul 2, 2021
1ab13af
fully support borrowed deserialization
patrickfreed Jul 2, 2021
a88d7dd
test borrowing some other types
patrickfreed Jul 2, 2021
0511f69
reintroduce support for deserializing decimal128 from extended json
patrickfreed Jul 6, 2021
de68ee9
fix typo
patrickfreed Jul 6, 2021
2db5468
move element type deserialization logic into deserializer
patrickfreed Jul 6, 2021
d10410b
fix typo
patrickfreed Jul 8, 2021
e3036ce
support lossy and non-lossy UTF-8 decoding
patrickfreed Jul 8, 2021
b062873
fix docstring
patrickfreed Jul 8, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 42 additions & 10 deletions src/de/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -405,12 +405,7 @@ where
from_bson(Bson::Document(doc))
}

/// Decode BSON bytes from the provided reader into a `T` Deserializable.
pub fn from_reader<R, T>(mut reader: R) -> Result<T>
where
T: DeserializeOwned,
R: Read,
{
fn reader_to_vec<R: Read>(mut reader: R) -> Result<Vec<u8>> {
let length = read_i32(&mut reader)?;

if length < MIN_BSON_DOCUMENT_SIZE {
Expand All @@ -421,16 +416,53 @@ where
write_i32(&mut bytes, length).map_err(Error::custom)?;

reader.take(length as u64 - 4).read_to_end(&mut bytes)?;
Ok(bytes)
}

let mut deserializer = raw::Deserializer::new(bytes.as_slice());
T::deserialize(&mut deserializer)
/// Deserialize an instance of type `T` from an I/O stream of BSON.
pub fn from_reader<R, T>(reader: R) -> Result<T>
where
T: DeserializeOwned,
R: Read,
{
let bytes = reader_to_vec(reader)?;
from_slice(bytes.as_slice())
}

/// Deserialize an instance of type `T` from an I/O stream of BSON, replacing any invalid UTF-8
/// sequences with the Unicode replacement character.
///
/// This is mainly useful when reading raw BSON returned from a MongoDB server, which
/// in rare cases can contain invalidly truncated strings (https://jira.mongodb.org/browse/SERVER-24007).
/// For most use cases, `bson::from_slice` can be used instead.
pub fn from_reader_utf8_lossy<R, T>(reader: R) -> Result<T>
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

per our discussion in slack, I added new functions for the lossy UTF-8 approach.

cc @abr-egn @isabelatkinson

where
T: DeserializeOwned,
R: Read,
{
let bytes = reader_to_vec(reader)?;
from_slice_utf8_lossy(bytes.as_slice())
}

/// Decode BSON bytes from the provided reader into a `T` Deserializable.
/// Deserialize an instance of type `T` from a slice of BSON bytes.
pub fn from_slice<'de, T>(bytes: &'de [u8]) -> Result<T>
where
T: Deserialize<'de>,
{
let mut deserializer = raw::Deserializer::new(bytes);
let mut deserializer = raw::Deserializer::new(bytes, false);
T::deserialize(&mut deserializer)
}

/// Deserialize an instance of type `T` from a slice of BSON bytes, replacing any invalid UTF-8
/// sequences with the Unicode replacement character.
///
/// This is mainly useful when reading raw BSON returned from a MongoDB server, which
/// in rare cases can contain invalidly truncated strings (https://jira.mongodb.org/browse/SERVER-24007).
/// For most use cases, `bson::from_slice` can be used instead.
pub fn from_slice_utf8_lossy<'de, T>(bytes: &'de [u8]) -> Result<T>
where
T: Deserialize<'de>,
{
let mut deserializer = raw::Deserializer::new(bytes, true);
T::deserialize(&mut deserializer)
}
72 changes: 47 additions & 25 deletions src/de/raw.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,9 @@ pub(crate) struct Deserializer<'de> {
}

impl<'de> Deserializer<'de> {
pub(crate) fn new(buf: &'de [u8]) -> Self {
pub(crate) fn new(buf: &'de [u8], utf8_lossy: bool) -> Self {
Self {
bytes: BsonBuf::new(buf),
bytes: BsonBuf::new(buf, utf8_lossy),
current_type: ElementType::EmbeddedDocument,
}
}
Expand Down Expand Up @@ -87,13 +87,14 @@ impl<'de> Deserializer<'de> {
}

/// Read a string from the BSON.
/// This will be an owned string if invalid UTF-8 is encountered in the string, otherwise it
/// will be borrowed.
///
/// If utf8_lossy, this will be an owned string if invalid UTF-8 is encountered in the string,
/// otherwise it will be borrowed.
fn deserialize_str(&mut self) -> Result<Cow<'de, str>> {
self.bytes.read_str()
}

fn deserialize_document_key(&mut self) -> Result<&'de str> {
fn deserialize_document_key(&mut self) -> Result<Cow<'de, str>> {
self.bytes.read_cstr()
}

Expand Down Expand Up @@ -441,7 +442,10 @@ impl<'d, 'de> serde::de::Deserializer<'de> for DocumentKeyDeserializer<'d, 'de>
V: serde::de::Visitor<'de>,
{
let s = self.root_deserializer.deserialize_document_key()?;
visitor.visit_borrowed_str(s)
match s {
Cow::Borrowed(b) => visitor.visit_borrowed_str(b),
Cow::Owned(string) => visitor.visit_string(string),
}
}

forward_to_deserialize_any! {
Expand Down Expand Up @@ -870,6 +874,10 @@ enum BinaryDeserializationStage {
struct BsonBuf<'a> {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We use this wrapper struct rather than just a generic Read type as it allows us to borrow from the underlying slice in certain situations.

Originally borrowed deserialization wasn't going to be a part of this work, but it ended up being the most natural way to implement this, so I went ahead with it.

bytes: &'a [u8],
index: usize,

/// Whether or not to insert replacement characters in place of invalid UTF-8 sequences when
/// deserializing strings.
utf8_lossy: bool,
}

impl<'a> Read for BsonBuf<'a> {
Expand All @@ -882,8 +890,12 @@ impl<'a> Read for BsonBuf<'a> {
}

impl<'a> BsonBuf<'a> {
fn new(bytes: &'a [u8]) -> Self {
Self { bytes, index: 0 }
fn new(bytes: &'a [u8], utf8_lossy: bool) -> Self {
Self {
bytes,
index: 0,
utf8_lossy,
}
}

fn bytes_read(&self) -> usize {
Expand All @@ -898,20 +910,39 @@ impl<'a> BsonBuf<'a> {
Ok(())
}

fn read_cstr(&mut self) -> Result<&'a str> {
/// Get the starting at the provided index and ending at the buffer's current index.
fn str(&mut self, start: usize) -> Result<Cow<'a, str>> {
let bytes = &self.bytes[start..self.index];
let s = if self.utf8_lossy {
String::from_utf8_lossy(bytes)
} else {
Cow::Borrowed(std::str::from_utf8(bytes).map_err(Error::custom)?)
};

// consume the null byte
if self.bytes[self.index] != 0 {
return Err(Error::custom("string was not null-terminated"));
}
self.index += 1;
self.index_check()?;

Ok(s)
}

/// Attempts to read a null-terminated UTF-8 cstring from the data.
///
/// If utf8_lossy and invalid UTF-8 is encountered, the unicode replacement character will be
/// inserted in place of the offending data, resulting in an owned `String`. Otherwise, the
/// data will be borrowed as-is.
fn read_cstr(&mut self) -> Result<Cow<'a, str>> {
let start = self.index;
while self.index < self.bytes.len() && self.bytes[self.index] != 0 {
self.index += 1
}

self.index_check()?;

let s = std::str::from_utf8(&self.bytes[start..self.index]).map_err(Error::custom);
// consume the null byte
self.index += 1;
self.index_check()?;

s
self.str(start)
}

/// Attempts to read a null-terminated UTF-8 string from the data.
Expand All @@ -934,16 +965,7 @@ impl<'a> BsonBuf<'a> {
self.index += (len - 1) as usize;
self.index_check()?;

let s = String::from_utf8_lossy(&self.bytes[start..self.index]);

// consume the null byte
if self.bytes[self.index] != 0 {
return Err(Error::custom("string was not null-terminated"));
}
self.index += 1;
self.index_check()?;

Ok(s)
self.str(start)
}

fn read_slice(&mut self, length: usize) -> Result<&'a [u8]> {
Expand Down
10 changes: 9 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,15 @@
pub use self::{
bson::{Array, Binary, Bson, DbPointer, Document, JavaScriptCodeWithScope, Regex, Timestamp},
datetime::DateTime,
de::{from_bson, from_document, from_reader, from_slice, Deserializer},
de::{
from_bson,
from_document,
from_reader,
from_reader_utf8_lossy,
from_slice,
from_slice_utf8_lossy,
Deserializer,
},
decimal128::Decimal128,
ser::{to_bson, to_document, Serializer},
};
Expand Down
44 changes: 26 additions & 18 deletions src/tests/spec/corpus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -163,24 +163,6 @@ fn run_test(test: TestFile) {
}
}

for decode_error in test.decode_errors.iter() {
// No meaningful definition of "byte count" for an arbitrary reader.
if decode_error.description
== "Stated length less than byte count, with garbage after envelope"
{
continue;
}

let bson = hex::decode(&decode_error.bson).expect("should decode from hex");
Document::from_reader(bson.as_slice()).expect_err(decode_error.description.as_str());

// the from_reader implementation supports deserializing from lossy UTF-8
if !decode_error.description.contains("invalid UTF-8") {
crate::from_reader::<_, Document>(bson.as_slice())
.expect_err(decode_error.description.as_str());
}
}

// TODO RUST-36: Enable decimal128 tests.
// extJSON not implemented for decimal128 without the feature flag, so we must stop here.
if test.bson_type == "0x13" && !cfg!(feature = "decimal128") {
Expand Down Expand Up @@ -333,6 +315,32 @@ fn run_test(test: TestFile) {
}
}

for decode_error in test.decode_errors.iter() {
// No meaningful definition of "byte count" for an arbitrary reader.
if decode_error.description
== "Stated length less than byte count, with garbage after envelope"
{
continue;
}

let description = format!(
"{} decode error: {}",
test.bson_type, decode_error.description
);
let bson = hex::decode(&decode_error.bson).expect("should decode from hex");
Document::from_reader(bson.as_slice()).expect_err(&description);
crate::from_reader::<_, Document>(bson.as_slice()).expect_err(description.as_str());

if decode_error.description.contains("invalid UTF-8") {
let d = crate::from_reader_utf8_lossy::<_, Document>(bson.as_slice())
.unwrap_or_else(|_| panic!("{}: utf8_lossy should not fail", description));
if let Some(ref key) = test.test_key {
d.get_str(key)
.unwrap_or_else(|_| panic!("{}: value should be a string", description));
}
}
}

for parse_error in test.parse_errors {
// TODO RUST-36: Enable decimal128 tests.
if test.bson_type == "0x13" {
Expand Down