mongodb · patrickfreed · Jul 8, 2021 · Jun 3, 2021 · Jun 23, 2021 · Jun 24, 2021
diff --git a/src/de/mod.rs b/src/de/mod.rs
@@ -405,12 +405,7 @@ where
     from_bson(Bson::Document(doc))
 }
 
-/// Decode BSON bytes from the provided reader into a `T` Deserializable.
-pub fn from_reader<R, T>(mut reader: R) -> Result<T>
-where
-    T: DeserializeOwned,
-    R: Read,
-{
+fn reader_to_vec<R: Read>(mut reader: R) -> Result<Vec<u8>> {
     let length = read_i32(&mut reader)?;
 
     if length < MIN_BSON_DOCUMENT_SIZE {
@@ -421,16 +416,53 @@ where
     write_i32(&mut bytes, length).map_err(Error::custom)?;
 
     reader.take(length as u64 - 4).read_to_end(&mut bytes)?;
+    Ok(bytes)
+}
 
-    let mut deserializer = raw::Deserializer::new(bytes.as_slice());
-    T::deserialize(&mut deserializer)
+/// Deserialize an instance of type `T` from an I/O stream of BSON.
+pub fn from_reader<R, T>(reader: R) -> Result<T>
+where
+    T: DeserializeOwned,
+    R: Read,
+{
+    let bytes = reader_to_vec(reader)?;
+    from_slice(bytes.as_slice())
+}
+
+/// Deserialize an instance of type `T` from an I/O stream of BSON, replacing any invalid UTF-8
+/// sequences with the Unicode replacement character.
+///
+/// This is mainly useful when reading raw BSON returned from a MongoDB server, which
+/// in rare cases can contain invalidly truncated strings (https://jira.mongodb.org/browse/SERVER-24007).
+/// For most use cases, `bson::from_reader` can be used instead.
+pub fn from_reader_utf8_lossy<R, T>(reader: R) -> Result<T>
+where
+    T: DeserializeOwned,
+    R: Read,
+{
+    let bytes = reader_to_vec(reader)?;
+    from_slice_utf8_lossy(bytes.as_slice())
 }
 
-/// Decode BSON bytes from the provided reader into a `T` Deserializable.
+/// Deserialize an instance of type `T` from a slice of BSON bytes.
 pub fn from_slice<'de, T>(bytes: &'de [u8]) -> Result<T>
 where
     T: Deserialize<'de>,
 {
-    let mut deserializer = raw::Deserializer::new(bytes);
+    let mut deserializer = raw::Deserializer::new(bytes, false);
+    T::deserialize(&mut deserializer)
+}
+
+/// Deserialize an instance of type `T` from a slice of BSON bytes, replacing any invalid UTF-8
+/// sequences with the Unicode replacement character.
+///
+/// This is mainly useful when reading raw BSON returned from a MongoDB server, which
+/// in rare cases can contain invalidly truncated strings (https://jira.mongodb.org/browse/SERVER-24007).
+/// For most use cases, `bson::from_slice` can be used instead.
+pub fn from_slice_utf8_lossy<'de, T>(bytes: &'de [u8]) -> Result<T>
+where
+    T: Deserialize<'de>,
+{
+    let mut deserializer = raw::Deserializer::new(bytes, true);
     T::deserialize(&mut deserializer)
 }
diff --git a/src/de/raw.rs b/src/de/raw.rs
@@ -50,9 +50,9 @@ pub(crate) struct Deserializer<'de> {
 }
 
 impl<'de> Deserializer<'de> {
-    pub(crate) fn new(buf: &'de [u8]) -> Self {
+    pub(crate) fn new(buf: &'de [u8], utf8_lossy: bool) -> Self {
         Self {
-            bytes: BsonBuf::new(buf),
+            bytes: BsonBuf::new(buf, utf8_lossy),
             current_type: ElementType::EmbeddedDocument,
         }
     }
@@ -87,13 +87,14 @@ impl<'de> Deserializer<'de> {
     }
 
     /// Read a string from the BSON.
-    /// This will be an owned string if invalid UTF-8 is encountered in the string, otherwise it
-    /// will be borrowed.
+    ///
+    /// If `utf8_lossy` is set and invalid UTF-8 is encountered, this will be an owned string;
+    /// otherwise it will be borrowed. Without `utf8_lossy`, invalid UTF-8 is an error.
     fn deserialize_str(&mut self) -> Result<Cow<'de, str>> {
         self.bytes.read_str()
     }
 
-    fn deserialize_document_key(&mut self) -> Result<&'de str> {
+    fn deserialize_document_key(&mut self) -> Result<Cow<'de, str>> {
         self.bytes.read_cstr()
     }
 
@@ -441,7 +442,10 @@ impl<'d, 'de> serde::de::Deserializer<'de> for DocumentKeyDeserializer<'d, 'de>
         V: serde::de::Visitor<'de>,
     {
         let s = self.root_deserializer.deserialize_document_key()?;
-        visitor.visit_borrowed_str(s)
+        match s {
+            Cow::Borrowed(b) => visitor.visit_borrowed_str(b),
+            Cow::Owned(string) => visitor.visit_string(string),
+        }
     }
 
     forward_to_deserialize_any! {
@@ -870,6 +874,10 @@ enum BinaryDeserializationStage {
 struct BsonBuf<'a> {
     bytes: &'a [u8],
     index: usize,
+
+    /// Whether or not to insert replacement characters in place of invalid UTF-8 sequences when
+    /// deserializing strings.
+    utf8_lossy: bool,
 }
 
 impl<'a> Read for BsonBuf<'a> {
@@ -882,8 +890,12 @@ impl<'a> Read for BsonBuf<'a> {
 }
 
 impl<'a> BsonBuf<'a> {
-    fn new(bytes: &'a [u8]) -> Self {
-        Self { bytes, index: 0 }
+    fn new(bytes: &'a [u8], utf8_lossy: bool) -> Self {
+        Self {
+            bytes,
+            index: 0,
+            utf8_lossy,
+        }
     }
 
     fn bytes_read(&self) -> usize {
@@ -898,20 +910,39 @@ impl<'a> BsonBuf<'a> {
         Ok(())
     }
 
-    fn read_cstr(&mut self) -> Result<&'a str> {
+    /// Get the string starting at the provided index and ending at the buffer's current index.
+    fn str(&mut self, start: usize) -> Result<Cow<'a, str>> {
+        let bytes = &self.bytes[start..self.index];
+        let s = if self.utf8_lossy {
+            String::from_utf8_lossy(bytes)
+        } else {
+            Cow::Borrowed(std::str::from_utf8(bytes).map_err(Error::custom)?)
+        };
+
+        // consume the null byte
+        if self.bytes[self.index] != 0 {
+            return Err(Error::custom("string was not null-terminated"));
+        }
+        self.index += 1;
+        self.index_check()?;
+
+        Ok(s)
+    }
+
+    /// Attempts to read a null-terminated UTF-8 cstring from the data.
+    ///
+    /// If `utf8_lossy` is set and invalid UTF-8 is encountered, the Unicode replacement character
+    /// will be inserted in place of the offending data, resulting in an owned `String`. Otherwise,
+    /// the data will be borrowed as-is.
+    fn read_cstr(&mut self) -> Result<Cow<'a, str>> {
         let start = self.index;
         while self.index < self.bytes.len() && self.bytes[self.index] != 0 {
             self.index += 1
         }
 
         self.index_check()?;
 
-        let s = std::str::from_utf8(&self.bytes[start..self.index]).map_err(Error::custom);
-        // consume the null byte
-        self.index += 1;
-        self.index_check()?;
-
-        s
+        self.str(start)
     }
 
     /// Attempts to read a null-terminated UTF-8 string from the data.
@@ -934,16 +965,7 @@ impl<'a> BsonBuf<'a> {
         self.index += (len - 1) as usize;
         self.index_check()?;
 
-        let s = String::from_utf8_lossy(&self.bytes[start..self.index]);
-
-        // consume the null byte
-        if self.bytes[self.index] != 0 {
-            return Err(Error::custom("string was not null-terminated"));
-        }
-        self.index += 1;
-        self.index_check()?;
-
-        Ok(s)
+        self.str(start)
     }
 
     fn read_slice(&mut self, length: usize) -> Result<&'a [u8]> {

diff --git a/src/lib.rs b/src/lib.rs
@@ -188,7 +188,15 @@
 pub use self::{
     bson::{Array, Binary, Bson, DbPointer, Document, JavaScriptCodeWithScope, Regex, Timestamp},
     datetime::DateTime,
-    de::{from_bson, from_document, from_reader, from_slice, Deserializer},
+    de::{
+        from_bson,
+        from_document,
+        from_reader,
+        from_reader_utf8_lossy,
+        from_slice,
+        from_slice_utf8_lossy,
+        Deserializer,
+    },
     decimal128::Decimal128,
     ser::{to_bson, to_document, Serializer},
 };

diff --git a/src/tests/spec/corpus.rs b/src/tests/spec/corpus.rs
@@ -163,24 +163,6 @@ fn run_test(test: TestFile) {
             }
         }
 
-        for decode_error in test.decode_errors.iter() {
-            // No meaningful definition of "byte count" for an arbitrary reader.
-            if decode_error.description
-                == "Stated length less than byte count, with garbage after envelope"
-            {
-                continue;
-            }
-
-            let bson = hex::decode(&decode_error.bson).expect("should decode from hex");
-            Document::from_reader(bson.as_slice()).expect_err(decode_error.description.as_str());
-
-            // the from_reader implementation supports deserializing from lossy UTF-8
-            if !decode_error.description.contains("invalid UTF-8") {
-                crate::from_reader::<_, Document>(bson.as_slice())
-                    .expect_err(decode_error.description.as_str());
-            }
-        }
-
         // TODO RUST-36: Enable decimal128 tests.
         // extJSON not implemented for decimal128 without the feature flag, so we must stop here.
         if test.bson_type == "0x13" && !cfg!(feature = "decimal128") {
@@ -333,6 +315,32 @@ fn run_test(test: TestFile) {
         }
     }
 
+    for decode_error in test.decode_errors.iter() {
+        // No meaningful definition of "byte count" for an arbitrary reader.
+        if decode_error.description
+            == "Stated length less than byte count, with garbage after envelope"
+        {
+            continue;
+        }
+
+        let description = format!(
+            "{} decode error: {}",
+            test.bson_type, decode_error.description
+        );
+        let bson = hex::decode(&decode_error.bson).expect("should decode from hex");
+        Document::from_reader(bson.as_slice()).expect_err(&description);
+        crate::from_reader::<_, Document>(bson.as_slice()).expect_err(description.as_str());
+
+        if decode_error.description.contains("invalid UTF-8") {
+            let d = crate::from_reader_utf8_lossy::<_, Document>(bson.as_slice())
+                .unwrap_or_else(|_| panic!("{}: utf8_lossy should not fail", description));
+            if let Some(ref key) = test.test_key {
+                d.get_str(key)
+                    .unwrap_or_else(|_| panic!("{}: value should be a string", description));
+            }
+        }
+    }
+
     for parse_error in test.parse_errors {
         // TODO RUST-36: Enable decimal128 tests.
         if test.bson_type == "0x13" {