Rollup merge of rust-lang#26950 - AlisdairO:memset, r=alexcrichton

Manishearth · Manishearth · commit 12899e73a08b · 2015-07-16T14:13:38.000+05:30
In general, it's undesirable to have read_to_end use a buffer with uninitialized memory, as that could lead to undefined behaviour in the event of a bad Read implementation. Since we control the implementations of Read for Stdin, File and TcpStream, however, it should be okay for us to specialise them to improve performance. This PR does that.

Adds some unsafe code to deal with creating the buffers. Since the new read_to_end function needs to be used from the fs, io and net modules, I put it into a newly-created sys::common::io module. Alternatively we could expose the new read_to_end function to allow people to create their own read_to_end implementations for code they trust.

Benchmarks:

Read a 2.5MB file:
sys_common::io::tests::bench_init_file      ... bench:  27,473,317 ns/iter (+/- 2,490,767)
sys_common::io::tests::bench_uninit_file    ... bench:  25,611,793 ns/iter (+/- 2,137,387)

Read a buffer full of constant values:
sys_common::io::tests::bench_uninitialized  ... bench:  12,877,645 ns/iter (+/- 931,025)
sys_common::io::tests::bench_zeroed         ... bench:  18,581,082 ns/iter (+/- 1,541,108)

So, approx a 7% speedup for file reading, which I think is worthwhile.
diff --git a/src/libstd/fs.rs b/src/libstd/fs.rs
@@ -25,6 +25,7 @@ use io::{self, SeekFrom, Seek, Read, Write};
 use path::{Path, PathBuf};
 use sys::fs as fs_imp;
 use sys_common::{AsInnerMut, FromInner, AsInner};
+use sys_common::io::read_to_end_uninitialized;
 use vec::Vec;
 
 /// A reference to an open file on the filesystem.
@@ -328,6 +329,9 @@ impl Read for File {
     fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
         self.inner.read(buf) // forwards to `inner` (presumably the sys::fs handle — verify)
     }
+    fn read_to_end(&mut self, buf: &mut Vec<u8>) -> io::Result<usize> {
+        unsafe { read_to_end_uninitialized(self, buf) } // requires read() to honour the uninitialized-buffer contract
+    }
 }
 #[stable(feature = "rust1", since = "1.0.0")]
 impl Write for File {
diff --git a/src/libstd/io/mod.rs b/src/libstd/io/mod.rs
@@ -906,6 +906,8 @@ mod tests {
     use io::prelude::*;
     use io;
     use super::Cursor;
+    use test;
+    use super::repeat;
 
     #[test]
     fn read_until() {
@@ -1024,4 +1026,13 @@ mod tests {
         let mut buf = [0; 1];
         assert_eq!(0, R.take(0).read(&mut buf).unwrap());
     }
+
+    #[bench]
+    fn bench_read_to_end(b: &mut test::Bencher) { // times super::read_to_end over repeated bytes
+        b.iter(|| {
+            let mut lr = repeat(1).take(10000000); // reader yielding 10,000,000 bytes of value 1
+            let mut vec = Vec::with_capacity(1024);
+            super::read_to_end(&mut lr, &mut vec); // the io::Result is discarded
+        });
+    }
 }
diff --git a/src/libstd/io/stdio.rs b/src/libstd/io/stdio.rs
@@ -18,6 +18,7 @@ use io::lazy::Lazy;
 use io::{self, BufReader, LineWriter};
 use sync::{Arc, Mutex, MutexGuard};
 use sys::stdio;
+use sys_common::io::{read_to_end_uninitialized};
 use sys_common::remutex::{ReentrantMutex, ReentrantMutexGuard};
 use libc;
 
@@ -277,6 +278,9 @@ impl<'a> Read for StdinLock<'a> {
     fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
         self.inner.read(buf) // forwards to `inner` (presumably the guarded stdin reader — verify)
     }
+    fn read_to_end(&mut self, buf: &mut Vec<u8>) -> io::Result<usize> {
+        unsafe { read_to_end_uninitialized(self, buf) } // requires read() to honour the uninitialized-buffer contract
+    }
 }
 
 #[stable(feature = "rust1", since = "1.0.0")]
diff --git a/src/libstd/net/tcp.rs b/src/libstd/net/tcp.rs
@@ -19,6 +19,7 @@ use io;
 use net::{ToSocketAddrs, SocketAddr, Shutdown};
 use sys_common::net as net_imp;
 use sys_common::{AsInner, FromInner};
+use sys_common::io::read_to_end_uninitialized;
 use time::Duration;
 
 /// A structure which represents a TCP stream between a local socket and a
@@ -189,6 +190,9 @@ impl TcpStream {
 #[stable(feature = "rust1", since = "1.0.0")]
 impl Read for TcpStream {
     fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> { self.0.read(buf) }
+    fn read_to_end(&mut self, buf: &mut Vec<u8>) -> io::Result<usize> {
+        unsafe { read_to_end_uninitialized(self, buf) } // requires read() to honour the uninitialized-buffer contract
+    }
 }
 #[stable(feature = "rust1", since = "1.0.0")]
 impl Write for TcpStream {
@@ -198,6 +202,9 @@ impl Write for TcpStream {
 #[stable(feature = "rust1", since = "1.0.0")]
 impl<'a> Read for &'a TcpStream {
     fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> { self.0.read(buf) }
+    fn read_to_end(&mut self, buf: &mut Vec<u8>) -> io::Result<usize> {
+        unsafe { read_to_end_uninitialized(self, buf) } // requires read() to honour the uninitialized-buffer contract
+    }
 }
 #[stable(feature = "rust1", since = "1.0.0")]
 impl<'a> Write for &'a TcpStream {
diff --git a/src/libstd/sys/common/io.rs b/src/libstd/sys/common/io.rs
@@ -0,0 +1,139 @@
+// Copyright 2015 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+use prelude::v1::*;
+use io;
+use io::ErrorKind;
+use io::Read;
+use slice::from_raw_parts_mut;
+
+// Reads from `r` until it returns Ok(0), appending to `buf` without zeroing the spare capacity.
+// Returns the number of bytes appended; `Interrupted` errors are retried and any other error
+// is returned immediately, leaving the bytes read so far in `buf`. The default
+// implementation of read_to_end for readers will zero out new memory in
+// the buf before passing it to read, but avoiding this zero can often
+// lead to a fairly significant performance win.
+//
+// This function is unsafe because `read` is handed uninitialized memory; callers must ensure:
+//  *  The implementation of read never reads the buffer provided.
+//  *  The implementation of read correctly reports how many bytes were written.
+pub unsafe fn read_to_end_uninitialized(r: &mut Read, buf: &mut Vec<u8>) -> io::Result<usize> {
+
+    let start_len = buf.len(); // remembered so only the newly appended bytes are counted
+    buf.reserve(16); // ensure there is some spare capacity before the first read
+
+    // Always read into the spare space of the vector (from the length up to the capacity).
+    // If the vector ever fills up then we reserve an extra byte which should trigger the normal
+    // reallocation routines for the vector, which will likely double the size.
+    //
+    // This function is similar to the read_to_end function in std::io, but the logic about
+    // reservations and slicing is different enough that this is duplicated here.
+    loop {
+        if buf.len() == buf.capacity() {
+            buf.reserve(1);
+        }
+
+        let buf_slice = from_raw_parts_mut(buf.as_mut_ptr().offset(buf.len() as isize),
+                                           buf.capacity() - buf.len());
+
+        match r.read(buf_slice) {
+            Ok(0) => { return Ok(buf.len() - start_len); } // end of input: report bytes appended
+            Ok(n) => { let len = buf.len() + n; buf.set_len(len); }, // commit the n bytes written
+            Err(ref e) if e.kind() == ErrorKind::Interrupted => { } // retry the read
+            Err(e) => { return Err(e); }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use prelude::v1::*;
+    use io::prelude::*;
+    use super::*;
+    use io;
+    use io::{ErrorKind, Take, Repeat, repeat};
+    use test;
+    use slice::from_raw_parts;
+
+    struct ErrorRepeat { // reader that turns the inner reader's Ok(0) into an error
+        lr: Take<Repeat>
+    }
+
+    fn error_repeat(byte: u8, limit: u64) -> ErrorRepeat { // yields `limit` copies of `byte`
+        ErrorRepeat { lr: repeat(byte).take(limit) }
+    }
+
+    impl Read for ErrorRepeat {
+        fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
+            let ret = self.lr.read(buf);
+            if let Ok(0) = ret { // inner reader is exhausted: fail instead of signalling EOF
+                return Err(io::Error::new(ErrorKind::Other, ""))
+            }
+            ret
+        }
+    }
+
+    fn init_vec_data() -> Vec<u8> { // empty Vec whose 200-byte allocation is pre-filled with 10
+        let mut vec = vec![10u8; 200];
+        unsafe { vec.set_len(0); }
+        vec
+    }
+
+    fn assert_all_eq(buf: &[u8], value: u8) { // every byte of `buf` must equal `value`
+        for n in buf {
+            assert_eq!(*n, value);
+        }
+    }
+
+    fn validate(buf: &Vec<u8>, good_read_len: usize) { // read bytes are 1, spare capacity still 10
+        assert_all_eq(buf, 1u8);
+        let cap = buf.capacity();
+        let end_slice = unsafe { from_raw_parts(buf.as_ptr().offset(good_read_len as isize),
+                                                    cap - good_read_len) };
+        assert_all_eq(end_slice, 10u8);
+    }
+
+    #[test]
+    fn read_to_end_uninit_error() {
+        let mut er = error_repeat(1,100);
+        let mut vec = init_vec_data();
+        if let Err(_) = unsafe { read_to_end_uninitialized(&mut er, &mut vec) } {
+            validate(&vec, 100); // the 100 bytes read before the error must be kept
+        } else {
+            assert!(false); // the reader must have reported an error
+        }
+    }
+
+    #[test]
+    fn read_to_end_uninit_zero_len_vec() {
+        let mut er = repeat(1).take(100);
+        let mut vec = Vec::new(); // starts with no allocation at all
+        let n = unsafe{ read_to_end_uninitialized(&mut er, &mut vec).unwrap() };
+        assert_all_eq(&vec, 1u8);
+        assert_eq!(vec.len(), n);
+    }
+
+    #[test]
+    fn read_to_end_uninit_good() {
+        let mut er = repeat(1).take(100);
+        let mut vec = init_vec_data();
+        let n = unsafe{ read_to_end_uninitialized(&mut er, &mut vec).unwrap() };
+        validate(&vec, 100);
+        assert_eq!(vec.len(), n);
+    }
+
+    #[bench]
+    fn bench_uninitialized(b: &mut test::Bencher) {
+        b.iter(|| {
+            let mut lr = repeat(1).take(10000000); // reader yielding 10,000,000 bytes of value 1
+            let mut vec = Vec::with_capacity(1024);
+            unsafe { read_to_end_uninitialized(&mut lr, &mut vec) }; // the io::Result is discarded
+        });
+    }
+}
diff --git a/src/libstd/sys/common/mod.rs b/src/libstd/sys/common/mod.rs
@@ -16,6 +16,7 @@ pub mod backtrace;
 pub mod condvar;
 pub mod mutex;
 pub mod net;
+pub mod io;
 pub mod poison;
 pub mod remutex;
 pub mod rwlock;