Skip to content

RUST-2023 Add wrapper type for utf-8 lossy deserialization #497

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Sep 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion src/de/raw.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ use crate::{
RAW_BSON_NEWTYPE,
RAW_DOCUMENT_NEWTYPE,
},
serde_helpers::HUMAN_READABLE_NEWTYPE,
serde_helpers::{HUMAN_READABLE_NEWTYPE, UTF8_LOSSY_NEWTYPE},
spec::{BinarySubtype, ElementType},
uuid::UUID_NEWTYPE_NAME,
DateTime,
Expand Down Expand Up @@ -297,6 +297,11 @@ impl<'de> serde::de::Deserializer<'de> for Deserializer<'de> {
inner.options.human_readable = true;
visitor.visit_newtype_struct(inner)
}
UTF8_LOSSY_NEWTYPE => {
let mut inner = self;
inner.options.utf8_lossy = true;
visitor.visit_newtype_struct(inner)
}
_ => visitor.visit_newtype_struct(self),
}
}
Expand Down
1 change: 1 addition & 0 deletions src/document.rs
Original file line number Diff line number Diff line change
Expand Up @@ -587,6 +587,7 @@ impl Document {
/// This is mainly useful when reading raw BSON returned from a MongoDB server, which
/// in rare cases can contain invalidly truncated strings (<https://jira.mongodb.org/browse/SERVER-24007>).
/// For most use cases, `Document::from_reader` can be used instead.
#[deprecated = "use bson::serde_helpers::Utf8LossyDeserialization"]
pub fn from_reader_utf8_lossy<R: Read>(mut reader: R) -> crate::de::Result<Document> {
Self::decode(&mut reader, true)
}
Expand Down
5 changes: 3 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -295,9 +295,7 @@ pub use self::{
from_document,
from_document_with_options,
from_reader,
from_reader_utf8_lossy,
from_slice,
from_slice_utf8_lossy,
Deserializer,
DeserializerOptions,
},
Expand Down Expand Up @@ -328,6 +326,9 @@ pub use self::{
uuid::{Uuid, UuidRepresentation},
};

#[allow(deprecated)]
pub use self::de::{from_reader_utf8_lossy, from_slice_utf8_lossy,};

#[macro_use]
mod macros;
pub mod binary;
Expand Down
43 changes: 43 additions & 0 deletions src/serde_helpers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -886,3 +886,46 @@ impl<'de, T: Deserialize<'de>> Deserialize<'de> for HumanReadable<T> {
deserializer.deserialize_newtype_struct(HUMAN_READABLE_NEWTYPE, V(PhantomData))
}
}

/// Wrapper type for deserializing BSON bytes with invalid UTF-8 sequences.
///
/// Any invalid UTF-8 strings contained in the wrapped type will be replaced with the Unicode
/// replacement character. This wrapper type only has an effect when deserializing from BSON bytes.
///
/// The wrapper can be applied to a whole value or to an individual field of a struct; in the
/// latter case only that field is deserialized lossily.
///
/// This wrapper type has no impact on serialization. Serializing a `Utf8LossyDeserialization<T>`
/// will call the `serialize` method for the wrapped `T`.
///
/// The wrapped value is public and is accessed through the `.0` field.
#[derive(Clone, Copy, Default, Hash, PartialEq, Eq, PartialOrd, Ord, Debug)]
pub struct Utf8LossyDeserialization<T>(pub T);

/// Crate-private newtype-struct name that `Utf8LossyDeserialization` passes to
/// `deserialize_newtype_struct`. A deserializer that recognizes this name can switch to
/// lossy UTF-8 handling before visiting the wrapped value; others see an ordinary newtype.
pub(crate) const UTF8_LOSSY_NEWTYPE: &str = "$__bson_private_utf8_lossy";

impl<T: Serialize> Serialize for Utf8LossyDeserialization<T> {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
self.0.serialize(serializer)
}
}

impl<'de, T: Deserialize<'de>> Deserialize<'de> for Utf8LossyDeserialization<T> {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
struct V<T>(PhantomData<fn() -> T>);
impl<'de, T: Deserialize<'de>> Visitor<'de> for V<T> {
type Value = Utf8LossyDeserialization<T>;
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
formatter.write_str("Utf8Lossy wrapper")
}
fn visit_newtype_struct<D>(self, deserializer: D) -> Result<Self::Value, D::Error>
where
D: serde::Deserializer<'de>,
{
T::deserialize(deserializer).map(Utf8LossyDeserialization)
}
}
deserializer.deserialize_newtype_struct(UTF8_LOSSY_NEWTYPE, V(PhantomData))
}
}
1 change: 1 addition & 0 deletions src/tests/modules/serializer_deserializer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ fn test_encode_decode_utf8_string_invalid() {
doc.to_writer(&mut buf).unwrap();

let expected = doc! { "key": "��" };
#[allow(deprecated)]
let decoded = Document::from_reader_utf8_lossy(&mut Cursor::new(buf)).unwrap();
assert_eq!(decoded, expected);
}
Expand Down
54 changes: 53 additions & 1 deletion src/tests/serde_helpers.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
use core::str;

use serde::{de::Visitor, Deserialize, Serialize};

use crate::serde_helpers::HumanReadable;
use crate::{
from_slice,
serde_helpers::{HumanReadable, Utf8LossyDeserialization},
};

#[test]
fn human_readable_wrapper() {
Expand Down Expand Up @@ -135,3 +140,50 @@ fn human_readable_wrapper() {
let raw_tripped: Data = crate::from_slice(&bytes).unwrap();
assert_eq!(&raw_tripped, &expected);
}

#[test]
#[allow(dead_code)] // suppress warning for unread fields
fn utf8_lossy_wrapper() {
    /// Plain struct with no lossy handling on either field.
    #[derive(Debug, Deserialize)]
    struct NoUtf8Lossy {
        s1: String,
        s2: String,
    }

    /// Only the first field opts in to lossy UTF-8 deserialization.
    #[derive(Debug, Deserialize)]
    struct FirstStringUtf8Lossy {
        s1: Utf8LossyDeserialization<String>,
        s2: String,
    }

    // NOTE(review): this deliberately builds a `String` from bytes that are not valid UTF-8 so
    // the documents below carry invalid string payloads. `from_utf8_unchecked` requires valid
    // UTF-8 from its caller — confirm this is acceptable for test-only code.
    let bad_string = unsafe { String::from_utf8_unchecked(b"\x80\xae".to_vec()) };

    let both_bad = rawdoc! { "s1": bad_string.clone(), "s2": bad_string.clone() }.into_bytes();
    let first_bad = rawdoc! { "s1": bad_string.clone(), "s2": ":)" }.into_bytes();

    let replacement = "��".to_string();

    // Without the wrapper, invalid UTF-8 is rejected.
    from_slice::<NoUtf8Lossy>(&both_bad).unwrap_err();

    // Wrapping the whole struct makes every contained string lossy.
    let Utf8LossyDeserialization(whole) =
        from_slice::<Utf8LossyDeserialization<NoUtf8Lossy>>(&both_bad).unwrap();
    assert_eq!(whole.s1, replacement);
    assert_eq!(whole.s2, replacement);

    // Wrapping a single field covers that field only.
    let partial = from_slice::<FirstStringUtf8Lossy>(&first_bad).unwrap();
    assert_eq!(partial.s1.0, replacement);
    assert_eq!(&partial.s2, ":)");

    // The unwrapped second field still fails on invalid input.
    from_slice::<FirstStringUtf8Lossy>(&both_bad).unwrap_err();

    // An outer wrapper also applies to fields that are not individually wrapped.
    let Utf8LossyDeserialization(nested) =
        from_slice::<Utf8LossyDeserialization<FirstStringUtf8Lossy>>(&both_bad).unwrap();
    assert_eq!(nested.s1.0, replacement);
    assert_eq!(nested.s2, replacement);
}
4 changes: 4 additions & 0 deletions src/tests/spec/corpus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ use std::{

use crate::{
raw::{RawBsonRef, RawDocument},
serde_helpers::Utf8LossyDeserialization,
tests::LOCK,
Bson,
Document,
Expand Down Expand Up @@ -549,12 +550,15 @@ fn run_test(test: TestFile) {
crate::from_reader::<_, Document>(bson.as_slice()).expect_err(description.as_str());

if decode_error.description.contains("invalid UTF-8") {
#[allow(deprecated)]
crate::from_reader_utf8_lossy::<_, Document>(bson.as_slice()).unwrap_or_else(|err| {
panic!(
"{}: utf8_lossy should not fail (failed with {:?})",
description, err
)
});
crate::from_slice::<Utf8LossyDeserialization<Document>>(bson.as_slice())
.expect(&description);
}
}

Expand Down