From 9767ecf95c91aa59887e9407a5227dc61b988f79 Mon Sep 17 00:00:00 2001 From: Abraham Egnor Date: Mon, 25 Mar 2024 15:56:51 -0400 Subject: [PATCH 1/4] replace to_writer with to_vec --- src/document.rs | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/document.rs b/src/document.rs index 9b0e982b..aa4ee90c 100644 --- a/src/document.rs +++ b/src/document.rs @@ -543,17 +543,20 @@ impl Document { /// # } /// ``` pub fn to_writer(&self, mut writer: W) -> crate::ser::Result<()> { - let mut buf = Vec::new(); - for (key, val) in self.into_iter() { - serialize_bson(&mut buf, key.as_ref(), val)?; - } - - write_i32( - &mut writer, - (buf.len() + mem::size_of::() + mem::size_of::()) as i32, - )?; + // let mut buf = Vec::new(); + // for (key, val) in self.into_iter() { + // serialize_bson(&mut buf, key.as_ref(), val)?; + // } + // + // write_i32( + // &mut writer, + // (buf.len() + mem::size_of::() + mem::size_of::()) as i32, + // )?; + // writer.write_all(&buf)?; + // writer.write_all(&[0])?; + // Ok(()) + let buf = crate::to_vec(self)?; writer.write_all(&buf)?; - writer.write_all(&[0])?; Ok(()) } From 411f3a79903539dfc72329116aec23fe4aba52ad Mon Sep 17 00:00:00 2001 From: Abraham Egnor Date: Mon, 25 Mar 2024 16:31:57 -0400 Subject: [PATCH 2/4] replace decode with deserialize --- src/de/mod.rs | 227 +----------------------------------------------- src/document.rs | 55 +++--------- src/ser/mod.rs | 82 +---------------- 3 files changed, 19 insertions(+), 345 deletions(-) diff --git a/src/de/mod.rs b/src/de/mod.rs index 909b71cd..e177fa20 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -33,12 +33,11 @@ pub use self::{ use std::io::Read; use crate::{ - bson::{Array, Bson, DbPointer, Document, JavaScriptCodeWithScope, Regex, Timestamp}, - oid::{self, ObjectId}, + bson::{Bson, Document, Timestamp}, + oid::ObjectId, raw::RawBinaryRef, ser::write_i32, - spec::{self, BinarySubtype}, - Binary, + spec::BinarySubtype, Decimal128, }; @@ -49,7 +48,6 @@ use ::serde::{ pub(crate) use self::serde::{convert_unsigned_to_signed_raw, BsonVisitor}; -#[cfg(test)] pub(crate) use self::raw::Deserializer as RawDeserializer; pub(crate) const MAX_BSON_SIZE: i32 = 16 * 1024 * 1024; @@ -73,30 +71,6 @@ enum DeserializerHint { RawBson, } -/// Run the provided closure, ensuring that over the course of its execution, exactly `length` bytes -/// were read from the reader. -pub(crate) fn ensure_read_exactly( - reader: &mut R, - length: usize, - error_message: &str, - func: F, -) -> Result<()> -where - F: FnOnce(&mut std::io::Cursor>) -> Result<()>, - R: Read + ?Sized, -{ - let mut buf = vec![0u8; length]; - reader.read_exact(&mut buf)?; - let mut cursor = std::io::Cursor::new(buf); - - func(&mut cursor)?; - - if cursor.position() != length as u64 { - return Err(Error::invalid_length(length, &error_message)); - } - Ok(()) -} - pub(crate) fn read_string(reader: &mut R, utf8_lossy: bool) -> Result { let len = read_i32(reader)?; @@ -141,20 +115,6 @@ pub(crate) fn read_bool(mut reader: R) -> Result { Ok(val != 0) } -fn read_cstring(reader: &mut R) -> Result { - let mut v = Vec::new(); - - loop { - let c = read_u8(reader)?; - if c == 0 { - break; - } - v.push(c); - } - - Ok(String::from_utf8(v)?) -} - #[inline] pub(crate) fn read_u8(reader: &mut R) -> Result { let mut buf = [0; 1]; @@ -192,138 +152,6 @@ fn read_f128(reader: &mut R) -> Result { Ok(Decimal128 { bytes: buf }) } -fn deserialize_array(reader: &mut R, utf8_lossy: bool) -> Result { - let mut arr = Array::new(); - let length = read_i32(reader)?; - - if !(MIN_BSON_DOCUMENT_SIZE..=MAX_BSON_SIZE).contains(&length) { - return Err(Error::invalid_length( - length as usize, - &format!( - "array length must be between {} and {}", - MIN_BSON_DOCUMENT_SIZE, MAX_BSON_SIZE - ) - .as_str(), - )); - } - - ensure_read_exactly( - reader, - (length as usize) - 4, - "array length longer than contents", - |cursor| { - loop { - let tag = read_u8(cursor)?; - if tag == 0 { - break; - } - - let (_, val) = deserialize_bson_kvp(cursor, tag, utf8_lossy)?; - arr.push(val) - } - Ok(()) - }, - )?; - - Ok(arr) -} - -pub(crate) fn deserialize_bson_kvp( - reader: &mut R, - tag: u8, - utf8_lossy: bool, -) -> Result<(String, Bson)> { - use spec::ElementType; - let key = read_cstring(reader)?; - - let val = match ElementType::from(tag) { - Some(ElementType::Double) => Bson::Double(read_f64(reader)?), - Some(ElementType::String) => read_string(reader, utf8_lossy).map(Bson::String)?, - Some(ElementType::EmbeddedDocument) => Document::from_reader(reader).map(Bson::Document)?, - Some(ElementType::Array) => deserialize_array(reader, utf8_lossy).map(Bson::Array)?, - Some(ElementType::Binary) => Bson::Binary(Binary::from_reader(reader)?), - Some(ElementType::ObjectId) => { - let mut objid = [0; 12]; - for x in &mut objid { - *x = read_u8(reader)?; - } - Bson::ObjectId(oid::ObjectId::from_bytes(objid)) - } - Some(ElementType::Boolean) => Bson::Boolean(read_bool(reader)?), - Some(ElementType::Null) => Bson::Null, - Some(ElementType::RegularExpression) => { - Bson::RegularExpression(Regex::from_reader(reader)?) - } - Some(ElementType::JavaScriptCode) => { - read_string(reader, utf8_lossy).map(Bson::JavaScriptCode)? - } - Some(ElementType::JavaScriptCodeWithScope) => { - Bson::JavaScriptCodeWithScope(JavaScriptCodeWithScope::from_reader(reader, utf8_lossy)?) - } - Some(ElementType::Int32) => read_i32(reader).map(Bson::Int32)?, - Some(ElementType::Int64) => read_i64(reader).map(Bson::Int64)?, - Some(ElementType::Timestamp) => Bson::Timestamp(Timestamp::from_reader(reader)?), - Some(ElementType::DateTime) => { - // The int64 is UTC milliseconds since the Unix epoch. - let time = read_i64(reader)?; - Bson::DateTime(crate::DateTime::from_millis(time)) - } - Some(ElementType::Symbol) => read_string(reader, utf8_lossy).map(Bson::Symbol)?, - Some(ElementType::Decimal128) => read_f128(reader).map(Bson::Decimal128)?, - Some(ElementType::Undefined) => Bson::Undefined, - Some(ElementType::DbPointer) => Bson::DbPointer(DbPointer::from_reader(reader)?), - Some(ElementType::MaxKey) => Bson::MaxKey, - Some(ElementType::MinKey) => Bson::MinKey, - None => { - return Err(Error::UnrecognizedDocumentElementType { - key, - element_type: tag, - }) - } - }; - - Ok((key, val)) -} - -impl Binary { - pub(crate) fn from_reader(mut reader: R) -> Result { - let mut len = read_i32(&mut reader)?; - if !(0..=MAX_BSON_SIZE).contains(&len) { - return Err(Error::invalid_length( - len as usize, - &format!("binary length must be between 0 and {}", MAX_BSON_SIZE).as_str(), - )); - } - let subtype = BinarySubtype::from(read_u8(&mut reader)?); - - // Skip length data in old binary. - if let BinarySubtype::BinaryOld = subtype { - let data_len = read_i32(&mut reader)?; - - if !(0..=(MAX_BSON_SIZE - 4)).contains(&data_len) { - return Err(Error::invalid_length( - data_len as usize, - &format!("0x02 length must be between 0 and {}", MAX_BSON_SIZE - 4).as_str(), - )); - } - - if data_len + 4 != len { - return Err(Error::invalid_length( - data_len as usize, - &"0x02 length did not match top level binary length", - )); - } - - len -= 4; - } - - let mut bytes = Vec::with_capacity(len as usize); - - reader.take(len as u64).read_to_end(&mut bytes)?; - Ok(Binary { subtype, bytes }) - } -} - impl<'a> RawBinaryRef<'a> { pub(crate) fn from_slice_with_len_and_payload( mut bytes: &'a [u8], @@ -368,26 +196,6 @@ impl<'a> RawBinaryRef<'a> { } } -impl DbPointer { - pub(crate) fn from_reader(mut reader: R) -> Result { - let ns = read_string(&mut reader, false)?; - let oid = ObjectId::from_reader(&mut reader)?; - Ok(DbPointer { - namespace: ns, - id: oid, - }) - } -} - -impl Regex { - pub(crate) fn from_reader(mut reader: R) -> Result { - let pattern = read_cstring(&mut reader)?; - let options = read_cstring(&mut reader)?; - - Ok(Regex { pattern, options }) - } -} - impl Timestamp { pub(crate) fn from_reader(mut reader: R) -> Result { read_i64(&mut reader).map(Timestamp::from_le_i64) @@ -402,35 +210,6 @@ impl ObjectId { } } -impl JavaScriptCodeWithScope { - pub(crate) fn from_reader(mut reader: R, utf8_lossy: bool) -> Result { - let length = read_i32(&mut reader)?; - if length < MIN_CODE_WITH_SCOPE_SIZE { - return Err(Error::invalid_length( - length as usize, - &format!( - "code with scope length must be at least {}", - MIN_CODE_WITH_SCOPE_SIZE - ) - .as_str(), - )); - } else if length > MAX_BSON_SIZE { - return Err(Error::invalid_length( - length as usize, - &"code with scope length too large", - )); - } - - let mut buf = vec![0u8; (length - 4) as usize]; - reader.read_exact(&mut buf)?; - - let mut slice = buf.as_slice(); - let code = read_string(&mut slice, utf8_lossy)?; - let scope = Document::from_reader(&mut slice)?; - Ok(JavaScriptCodeWithScope { code, scope }) - } -} - /// Deserialize a `T` from the provided [`Bson`] value. /// /// The [`Deserializer`] used by this function presents itself as human readable, whereas the diff --git a/src/document.rs b/src/document.rs index aa4ee90c..f11643d9 100644 --- a/src/document.rs +++ b/src/document.rs @@ -1,22 +1,21 @@ //! A BSON document represented as an associative HashMap with insertion ordering. use std::{ + convert::TryInto, error, fmt::{self, Debug, Display, Formatter}, io::{Read, Write}, iter::{Extend, FromIterator, IntoIterator}, - mem, }; use ahash::RandomState; use indexmap::IndexMap; -use serde::de::Error; +use serde::{de::Error, Deserialize}; use crate::{ bson::{Array, Bson, Timestamp}, - de::{deserialize_bson_kvp, ensure_read_exactly, read_i32, MIN_BSON_DOCUMENT_SIZE}, + de::{read_i32, MIN_BSON_DOCUMENT_SIZE}, oid::ObjectId, - ser::{serialize_bson, write_i32}, spec::BinarySubtype, Binary, Decimal128, @@ -543,26 +542,12 @@ impl Document { /// # } /// ``` pub fn to_writer(&self, mut writer: W) -> crate::ser::Result<()> { - // let mut buf = Vec::new(); - // for (key, val) in self.into_iter() { - // serialize_bson(&mut buf, key.as_ref(), val)?; - // } - // - // write_i32( - // &mut writer, - // (buf.len() + mem::size_of::() + mem::size_of::()) as i32, - // )?; - // writer.write_all(&buf)?; - // writer.write_all(&[0])?; - // Ok(()) let buf = crate::to_vec(self)?; writer.write_all(&buf)?; Ok(()) } fn decode(reader: &mut R, utf_lossy: bool) -> crate::de::Result { - let mut doc = Document::new(); - let length = read_i32(reader)?; if length < MIN_BSON_DOCUMENT_SIZE { return Err(crate::de::Error::invalid_length( @@ -570,29 +555,17 @@ impl Document { &"document length must be at least 5", )); } - - ensure_read_exactly( - reader, - (length as usize) - 4, - "document length longer than contents", - |cursor| { - loop { - let mut tag_byte = [0]; - cursor.read_exact(&mut tag_byte)?; - let tag = tag_byte[0]; - - if tag == 0 { - break; - } - - let (key, val) = deserialize_bson_kvp(cursor, tag, utf_lossy)?; - doc.insert(key, val); - } - Ok(()) - }, - )?; - - Ok(doc) + let ulen: usize = + length + .try_into() + .map_err(|e| crate::de::Error::DeserializationError { + message: format!("invalid document length: {}", e), + })?; + let mut buf = vec![0u8; ulen]; + buf[0..4].copy_from_slice(&length.to_le_bytes()); + reader.read_exact(&mut buf[4..])?; + let mut deserializer = crate::de::RawDeserializer::new(&buf, utf_lossy); + Document::deserialize(&mut deserializer) } /// Attempts to deserialize a [`Document`] from a byte stream. diff --git a/src/ser/mod.rs b/src/ser/mod.rs index a15065e8..6adf87ec 100644 --- a/src/ser/mod.rs +++ b/src/ser/mod.rs @@ -30,13 +30,12 @@ pub use self::{ serde::{Serializer, SerializerOptions}, }; -use std::{io::Write, iter::FromIterator, mem}; +use std::io::Write; use crate::{ - bson::{Bson, DbPointer, Document, JavaScriptCodeWithScope, Regex}, + bson::{Bson, Document}, de::MAX_BSON_SIZE, spec::BinarySubtype, - Binary, RawDocumentBuf, }; use ::serde::{ser::Error as SerdeError, Serialize}; @@ -106,83 +105,6 @@ fn write_binary(mut writer: W, bytes: &[u8], subtype: BinarySubtype) - writer.write_all(bytes).map_err(From::from) } -fn serialize_array(writer: &mut W, arr: &[Bson]) -> Result<()> { - let mut buf = Vec::new(); - for (key, val) in arr.iter().enumerate() { - serialize_bson(&mut buf, &key.to_string(), val)?; - } - - write_i32( - writer, - (buf.len() + mem::size_of::() + mem::size_of::()) as i32, - )?; - writer.write_all(&buf)?; - writer.write_all(b"\0")?; - Ok(()) -} - -pub(crate) fn serialize_bson( - writer: &mut W, - key: &str, - val: &Bson, -) -> Result<()> { - writer.write_all(&[val.element_type() as u8])?; - write_cstring(writer, key)?; - - match *val { - Bson::Double(v) => write_f64(writer, v), - Bson::String(ref v) => write_string(writer, v), - Bson::Array(ref v) => serialize_array(writer, v), - Bson::Document(ref v) => v.to_writer(writer), - Bson::Boolean(v) => writer.write_all(&[v as u8]).map_err(From::from), - Bson::RegularExpression(Regex { - ref pattern, - ref options, - }) => { - write_cstring(writer, pattern)?; - - let mut chars: Vec = options.chars().collect(); - chars.sort_unstable(); - - write_cstring(writer, String::from_iter(chars).as_str()) - } - Bson::JavaScriptCode(ref code) => write_string(writer, code), - Bson::ObjectId(ref id) => writer.write_all(&id.bytes()).map_err(From::from), - Bson::JavaScriptCodeWithScope(JavaScriptCodeWithScope { - ref code, - ref scope, - }) => { - let mut buf = Vec::new(); - write_string(&mut buf, code)?; - scope.to_writer(&mut buf)?; - - write_i32(writer, buf.len() as i32 + 4)?; - writer.write_all(&buf).map_err(From::from) - } - Bson::Int32(v) => write_i32(writer, v), - Bson::Int64(v) => write_i64(writer, v), - Bson::Timestamp(ts) => write_i64(writer, ts.to_le_i64()), - Bson::Binary(Binary { subtype, ref bytes }) => write_binary(writer, bytes, subtype), - Bson::DateTime(ref v) => write_i64(writer, v.timestamp_millis()), - Bson::Null => Ok(()), - Bson::Symbol(ref v) => write_string(writer, v), - Bson::Decimal128(ref v) => { - writer.write_all(&v.bytes)?; - Ok(()) - } - Bson::Undefined => Ok(()), - Bson::MinKey => Ok(()), - Bson::MaxKey => Ok(()), - Bson::DbPointer(DbPointer { - ref namespace, - ref id, - }) => { - write_string(writer, namespace)?; - writer.write_all(&id.bytes()).map_err(From::from) - } - } -} - /// Encode a `T` Serializable into a [`Bson`] value. /// /// The [`Serializer`] used by this function presents itself as human readable, whereas the From ee4f236e47aeb263041b9e208b900461a269a09d Mon Sep 17 00:00:00 2001 From: Abraham Egnor Date: Mon, 25 Mar 2024 17:02:52 -0400 Subject: [PATCH 3/4] fix? timestamp byte parsing --- src/bson.rs | 24 +++++++++++++----------- src/de/mod.rs | 4 +++- src/de/raw.rs | 2 +- src/de/serde.rs | 2 +- src/extjson/models.rs | 2 +- src/raw/document_buf.rs | 2 +- 6 files changed, 20 insertions(+), 16 deletions(-) diff --git a/src/bson.rs b/src/bson.rs index 3a455f4e..6c8c905e 100644 --- a/src/bson.rs +++ b/src/bson.rs @@ -1029,19 +1029,21 @@ impl Display for Timestamp { } impl Timestamp { - pub(crate) fn to_le_i64(self) -> i64 { - let upper = (self.time.to_le() as u64) << 32; - let lower = self.increment.to_le() as u64; - - (upper | lower) as i64 + pub(crate) fn to_le_bytes(self) -> [u8; 8] { + let mut out = [0; 8]; + out[0..4].copy_from_slice(&self.increment.to_le_bytes()); + out[4..8].copy_from_slice(&self.time.to_le_bytes()); + out } - pub(crate) fn from_le_i64(val: i64) -> Self { - let ts = val.to_le(); - - Timestamp { - time: ((ts as u64) >> 32) as u32, - increment: (ts & 0xFFFF_FFFF) as u32, + pub(crate) fn from_le_bytes(bytes: [u8; 8]) -> Self { + let mut inc_bytes = [0; 4]; + inc_bytes.copy_from_slice(&bytes[0..4]); + let mut time_bytes = [0; 4]; + time_bytes.copy_from_slice(&bytes[4..8]); + Self { + increment: u32::from_le_bytes(inc_bytes), + time: u32::from_le_bytes(time_bytes), } } } diff --git a/src/de/mod.rs b/src/de/mod.rs index e177fa20..eeaf6beb 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -198,7 +198,9 @@ impl<'a> RawBinaryRef<'a> { impl Timestamp { pub(crate) fn from_reader(mut reader: R) -> Result { - read_i64(&mut reader).map(Timestamp::from_le_i64) + let mut bytes = [0; 8]; + reader.read_exact(&mut bytes)?; + Ok(Timestamp::from_le_bytes(bytes)) } } diff --git a/src/de/raw.rs b/src/de/raw.rs index e338bf7d..e7f42920 100644 --- a/src/de/raw.rs +++ b/src/de/raw.rs @@ -219,7 +219,7 @@ impl<'de> Deserializer<'de> { match self.current_type { ElementType::Int32 => visitor.visit_i32(read_i32(&mut self.bytes)?), - ElementType::Int64 => visitor.visit_i64(read_i64(&mut self.bytes)?), + ElementType::Int64 => visitor.visit_i64(dbg!(read_i64(&mut self.bytes)?)), ElementType::Double => visitor.visit_f64(read_f64(&mut self.bytes)?), ElementType::String => match self.deserialize_str()? { Cow::Borrowed(s) => visitor.visit_borrowed_str(s), diff --git a/src/de/serde.rs b/src/de/serde.rs index 0ab77187..85a29a43 100644 --- a/src/de/serde.rs +++ b/src/de/serde.rs @@ -415,7 +415,7 @@ impl<'de> Visitor<'de> for BsonVisitor { } "$timestamp" => { - let ts = visitor.next_value::()?; + let ts = dbg!(visitor.next_value::()?); return Ok(Bson::Timestamp(Timestamp { time: ts.t, increment: ts.i, diff --git a/src/extjson/models.rs b/src/extjson/models.rs index efb57fff..0bae6249 100644 --- a/src/extjson/models.rs +++ b/src/extjson/models.rs @@ -225,7 +225,7 @@ pub(crate) struct Timestamp { body: TimestampBody, } -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Debug)] #[serde(deny_unknown_fields)] pub(crate) struct TimestampBody { #[serde(serialize_with = "crate::serde_helpers::serialize_u32_as_i64")] diff --git a/src/raw/document_buf.rs b/src/raw/document_buf.rs index 63e4f4bf..6a508a35 100644 --- a/src/raw/document_buf.rs +++ b/src/raw/document_buf.rs @@ -300,7 +300,7 @@ impl RawDocumentBuf { self.data.extend(code_w_scope.scope.as_bytes()); } RawBsonRef::Timestamp(ts) => { - self.data.extend(ts.to_le_i64().to_le_bytes()); + self.data.extend(ts.to_le_bytes()); } RawBsonRef::ObjectId(oid) => { self.data.extend(oid.bytes()); From a54f496adee68e27d5b12174e6f803f47aa2c392 Mon Sep 17 00:00:00 2001 From: Abraham Egnor Date: Mon, 25 Mar 2024 17:07:57 -0400 Subject: [PATCH 4/4] remove debug --- src/de/raw.rs | 2 +- src/de/serde.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/de/raw.rs b/src/de/raw.rs index e7f42920..e338bf7d 100644 --- a/src/de/raw.rs +++ b/src/de/raw.rs @@ -219,7 +219,7 @@ impl<'de> Deserializer<'de> { match self.current_type { ElementType::Int32 => visitor.visit_i32(read_i32(&mut self.bytes)?), - ElementType::Int64 => visitor.visit_i64(dbg!(read_i64(&mut self.bytes)?)), + ElementType::Int64 => visitor.visit_i64(read_i64(&mut self.bytes)?), ElementType::Double => visitor.visit_f64(read_f64(&mut self.bytes)?), ElementType::String => match self.deserialize_str()? { Cow::Borrowed(s) => visitor.visit_borrowed_str(s), diff --git a/src/de/serde.rs b/src/de/serde.rs index 85a29a43..0ab77187 100644 --- a/src/de/serde.rs +++ b/src/de/serde.rs @@ -415,7 +415,7 @@ impl<'de> Visitor<'de> for BsonVisitor { } "$timestamp" => { - let ts = dbg!(visitor.next_value::()?); + let ts = visitor.next_value::()?; return Ok(Bson::Timestamp(Timestamp { time: ts.t, increment: ts.i,