
Commit a602d15

SipHasher128: improve constant names and add more comments

1 parent 581cc4a

2 files changed: +68 -38 lines changed

Diff for: compiler/rustc_data_structures/src/sip128.rs (+66 -36)
@@ -7,12 +7,34 @@ use std::ptr;
 #[cfg(test)]
 mod tests;

+// The SipHash algorithm operates on 8-byte chunks.
 const ELEM_SIZE: usize = mem::size_of::<u64>();
-const BUFFER_SIZE_ELEMS: usize = 8;
-const BUFFER_SIZE_BYTES: usize = BUFFER_SIZE_ELEMS * ELEM_SIZE;
-const BUFFER_SIZE_ELEMS_SPILL: usize = BUFFER_SIZE_ELEMS + 1;
-const BUFFER_SIZE_BYTES_SPILL: usize = BUFFER_SIZE_ELEMS_SPILL * ELEM_SIZE;
-const BUFFER_SPILL_INDEX: usize = BUFFER_SIZE_ELEMS_SPILL - 1;
+
+// Size of the buffer in number of elements, not including the spill.
+//
+// The selection of this size was guided by rustc-perf benchmark comparisons of
+// different buffer sizes. It should be periodically reevaluated as the compiler
+// implementation and input characteristics change.
+//
+// Using the same-sized buffer for everything we hash is a performance versus
+// complexity tradeoff. The ideal buffer size, and whether buffering should even
+// be used, depends on what is being hashed. It may be worth it to size the
+// buffer appropriately (perhaps by making SipHasher128 generic over the buffer
+// size) or disable buffering depending on what is being hashed. But at this
+// time, we use the same buffer size for everything.
+const BUFFER_CAPACITY: usize = 8;
+
+// Size of the buffer in bytes, not including the spill.
+const BUFFER_SIZE: usize = BUFFER_CAPACITY * ELEM_SIZE;
+
+// Size of the buffer in number of elements, including the spill.
+const BUFFER_WITH_SPILL_CAPACITY: usize = BUFFER_CAPACITY + 1;
+
+// Size of the buffer in bytes, including the spill.
+const BUFFER_WITH_SPILL_SIZE: usize = BUFFER_WITH_SPILL_CAPACITY * ELEM_SIZE;
+
+// Index of the spill element in the buffer.
+const BUFFER_SPILL_INDEX: usize = BUFFER_WITH_SPILL_CAPACITY - 1;

 #[derive(Debug, Clone)]
 #[repr(C)]
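For concreteness, with ELEM_SIZE = 8 these constants work out to: a 64-byte buffer (BUFFER_SIZE) holding 8 elements (BUFFER_CAPACITY), plus a ninth spill element at index 8 (BUFFER_SPILL_INDEX), for 9 elements and 72 bytes in total (BUFFER_WITH_SPILL_CAPACITY and BUFFER_WITH_SPILL_SIZE).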
@@ -22,10 +44,10 @@ pub struct SipHasher128 {
     // `processed`, and then repetition of that pattern until hashing is done.
     // This is the basis for the ordering of fields below. However, in practice
     // the cache miss-rate for data access is extremely low regardless of order.
-    nbuf: usize, // how many bytes in buf are valid
-    buf: [MaybeUninit<u64>; BUFFER_SIZE_ELEMS_SPILL], // unprocessed bytes le
-    state: State, // hash State
-    processed: usize, // how many bytes we've processed
+    nbuf: usize,                                         // how many bytes in buf are valid
+    buf: [MaybeUninit<u64>; BUFFER_WITH_SPILL_CAPACITY], // unprocessed bytes le
+    state: State,                                        // hash State
+    processed: usize,                                    // how many bytes we've processed
 }

 #[derive(Debug, Clone, Copy)]
@@ -64,13 +86,18 @@ macro_rules! compress {
 // Copies up to 8 bytes from source to destination. This performs better than
 // `ptr::copy_nonoverlapping` on microbenchmarks and may perform better on real
 // workloads since all of the copies have fixed sizes and avoid calling memcpy.
+//
+// This is specifically designed for copies of up to 8 bytes, because that's the
+// maximum number of bytes needed to fill an 8-byte-sized element on which
+// SipHash operates. Note that for variable-sized copies which are known to be
+// less than 8 bytes, this function will perform more work than necessary unless
+// the compiler is able to optimize the extra work away.
 #[inline]
 unsafe fn copy_nonoverlapping_small(src: *const u8, dst: *mut u8, count: usize) {
-    const COUNT_MAX: usize = 8;
-    debug_assert!(count <= COUNT_MAX);
+    debug_assert!(count <= 8);

-    if count == COUNT_MAX {
-        ptr::copy_nonoverlapping(src, dst, COUNT_MAX);
+    if count == 8 {
+        ptr::copy_nonoverlapping(src, dst, 8);
         return;
     }

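The rest of the function falls outside this hunk. As a sketch of the fixed-size technique the new comment describes (the exact chunking below is an assumption for illustration, not code from this diff), a count below 8 can be covered by at most one 4-byte, one 2-byte, and one 1-byte copy, each with a size known at compile time:

    // Hypothetical sketch: cover `count < 8` with statically sized copies so
    // the compiler emits straight-line loads/stores instead of a memcpy call.
    unsafe fn copy_small_sketch(src: *const u8, dst: *mut u8, count: usize) {
        debug_assert!(count < 8);
        let mut i = 0;
        if i + 3 < count {
            // Statically sized 4-byte copy.
            std::ptr::copy_nonoverlapping(src.add(i), dst.add(i), 4);
            i += 4;
        }
        if i + 1 < count {
            // Statically sized 2-byte copy.
            std::ptr::copy_nonoverlapping(src.add(i), dst.add(i), 2);
            i += 2;
        }
        if i < count {
            // Final single byte, if any.
            *dst.add(i) = *src.add(i);
            i += 1;
        }
        debug_assert_eq!(i, count);
    }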
@@ -116,10 +143,13 @@ unsafe fn copy_nonoverlapping_small(src: *const u8, dst: *mut u8, count: usize)
 // The buffer includes a "spill"--an extra element at the end--which simplifies
 // the integer write buffer processing path. The value that fills the buffer can
 // be written with a statically sized write that may spill over into the spill.
-// After the buffer is processed, the part of the value that spilled over can
+// After the buffer is processed, the part of the value that spilled over can be
 // written from the spill to the beginning of the buffer with another statically
-// sized write. Due to static sizes, this scheme performs better than copying
-// the exact number of bytes needed into the end and beginning of the buffer.
+// sized write. This write may copy more bytes than actually spilled over, but
+// we maintain the metadata such that any extra copied bytes will be ignored by
+// subsequent processing. Due to the static sizes, this scheme performs better
+// than copying the exact number of bytes needed into the end and beginning of
+// the buffer.
 //
 // The buffer is uninitialized, which improves performance, but may preclude
 // efficient implementation of alternative approaches. The improvement is not so
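A worked example of the spill scheme, with illustrative numbers: suppose 60 bytes are buffered (nbuf = 60) and a u64 is written. The statically sized 8-byte write at offset 60 puts 4 bytes into the last buffer element and 4 bytes into the spill element. After the 8 full elements are processed, another statically sized 8-byte write copies the spill element back to element 0; only the first 4 of those copied bytes are meaningful, and setting nbuf to 4 ensures the rest are ignored.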
@@ -142,12 +172,12 @@ unsafe fn copy_nonoverlapping_small(src: *const u8, dst: *mut u8, count: usize)
 //
 // In order to make `SipHasher128` consistent with `SipHasher` in libstd, we
 // choose to do the integer to byte sequence conversion in the platform-
-// dependent way. Clients can achieve (nearly) platform-independent hashing by
-// widening `isize` and `usize` integers to 64 bits on 32-bit systems and
-// byte-swapping integers on big-endian systems before passing them to the
-// writing functions. This causes the input byte sequence to look identical on
-// big- and little- endian systems (supposing `isize` and `usize` values can be
-// represented in 32 bits), which ensures platform-independent results.
+// dependent way. Clients can achieve platform-independent hashing by widening
+// `isize` and `usize` integers to 64 bits on 32-bit systems and byte-swapping
+// integers on big-endian systems before passing them to the writing functions.
+// This causes the input byte sequence to look identical on big- and little-
+// endian systems (supposing `isize` and `usize` values can be represented in 32
+// bits), which ensures platform-independent results.
 impl SipHasher128 {
     #[inline]
     pub fn new_with_keys(key0: u64, key1: u64) -> SipHasher128 {
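A minimal sketch of the client-side recipe that comment describes, assuming SipHasher128 exposes a libstd-`Hasher`-style `write_u64` (the helper name is hypothetical):

    // Feed a `usize` so the resulting input byte sequence is identical on
    // 32-/64-bit and big-/little-endian targets. If `write_u64` is the
    // `Hasher` trait method, `std::hash::Hasher` must be in scope.
    fn write_usize_portable(h: &mut SipHasher128, x: usize) {
        // Widen so 32-bit targets produce the same bytes as 64-bit targets.
        let wide = x as u64;
        // `to_le` byte-swaps on big-endian targets and is a no-op on
        // little-endian ones, normalizing the written byte order.
        h.write_u64(wide.to_le());
    }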
@@ -178,10 +208,10 @@ impl SipHasher128 {
         let size = mem::size_of::<T>();
         let nbuf = self.nbuf;
         debug_assert!(size <= 8);
-        debug_assert!(nbuf < BUFFER_SIZE_BYTES);
-        debug_assert!(nbuf + size < BUFFER_SIZE_BYTES_SPILL);
+        debug_assert!(nbuf < BUFFER_SIZE);
+        debug_assert!(nbuf + size < BUFFER_WITH_SPILL_SIZE);

-        if nbuf + size < BUFFER_SIZE_BYTES {
+        if nbuf + size < BUFFER_SIZE {
             unsafe {
                 // The memcpy call is optimized away because the size is known.
                 let dst = (self.buf.as_mut_ptr() as *mut u8).add(nbuf);
@@ -207,17 +237,17 @@ impl SipHasher128 {
         let size = mem::size_of::<T>();
         let nbuf = self.nbuf;
         debug_assert!(size <= 8);
-        debug_assert!(nbuf < BUFFER_SIZE_BYTES);
-        debug_assert!(nbuf + size >= BUFFER_SIZE_BYTES);
-        debug_assert!(nbuf + size < BUFFER_SIZE_BYTES_SPILL);
+        debug_assert!(nbuf < BUFFER_SIZE);
+        debug_assert!(nbuf + size >= BUFFER_SIZE);
+        debug_assert!(nbuf + size < BUFFER_WITH_SPILL_SIZE);

         // Copy first part of input into end of buffer, possibly into spill
         // element. The memcpy call is optimized away because the size is known.
         let dst = (self.buf.as_mut_ptr() as *mut u8).add(nbuf);
         ptr::copy_nonoverlapping(&x as *const _ as *const u8, dst, size);

         // Process buffer.
-        for i in 0..BUFFER_SIZE_ELEMS {
+        for i in 0..BUFFER_CAPACITY {
             let elem = self.buf.get_unchecked(i).assume_init().to_le();
             self.state.v3 ^= elem;
             Sip24Rounds::c_rounds(&mut self.state);
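The loop body is the standard SipHash compression step: each buffered element is read as a little-endian u64, XORed into v3, run through the compression rounds (two of them, for SipHash-2-4), and then XORed into v0; that final XOR falls just past the end of this hunk.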
@@ -234,18 +264,18 @@ impl SipHasher128 {
         // This function should only be called when the write fills the buffer.
         // Therefore, when size == 1, the new `self.nbuf` must be zero. The size
         // is statically known, so the branch is optimized away.
-        self.nbuf = if size == 1 { 0 } else { nbuf + size - BUFFER_SIZE_BYTES };
-        self.processed += BUFFER_SIZE_BYTES;
+        self.nbuf = if size == 1 { 0 } else { nbuf + size - BUFFER_SIZE };
+        self.processed += BUFFER_SIZE;
     }

     // A write function for byte slices.
     #[inline]
     fn slice_write(&mut self, msg: &[u8]) {
         let length = msg.len();
         let nbuf = self.nbuf;
-        debug_assert!(nbuf < BUFFER_SIZE_BYTES);
+        debug_assert!(nbuf < BUFFER_SIZE);

-        if nbuf + length < BUFFER_SIZE_BYTES {
+        if nbuf + length < BUFFER_SIZE {
             unsafe {
                 let dst = (self.buf.as_mut_ptr() as *mut u8).add(nbuf);

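Concrete numbers for the `self.nbuf` update above: with nbuf = 60 and size = 8, the new value is 60 + 8 - 64 = 4, exactly the number of bytes that spilled past the 64-byte buffer. With size = 1, filling the buffer implies nbuf was 63, so the result is always 63 + 1 - 64 = 0; the branch makes that zero explicit and, since the size is statically known, costs nothing.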
@@ -275,8 +305,8 @@ impl SipHasher128 {
     unsafe fn slice_write_process_buffer(&mut self, msg: &[u8]) {
         let length = msg.len();
         let nbuf = self.nbuf;
-        debug_assert!(nbuf < BUFFER_SIZE_BYTES);
-        debug_assert!(nbuf + length >= BUFFER_SIZE_BYTES);
+        debug_assert!(nbuf < BUFFER_SIZE);
+        debug_assert!(nbuf + length >= BUFFER_SIZE);

         // Always copy first part of input into current element of buffer.
         // This function should only be called when the write fills the buffer,
@@ -328,7 +358,7 @@ impl SipHasher128 {

     #[inline]
     pub fn finish128(mut self) -> (u64, u64) {
-        debug_assert!(self.nbuf < BUFFER_SIZE_BYTES);
+        debug_assert!(self.nbuf < BUFFER_SIZE);

         // Process full elements in buffer.
         let last = self.nbuf / ELEM_SIZE;
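Here `last` counts the full 8-byte elements remaining in the buffer: for example, with self.nbuf = 20, last = 20 / 8 = 2, leaving 4 trailing bytes for the finalization step.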

Diff for: compiler/rustc_data_structures/src/sip128/tests.rs (+2 -2)
@@ -456,12 +456,12 @@ macro_rules! test_fill_buffer {
         // Test filling and overfilling the buffer from all possible offsets
         // for a given integer type and its corresponding write method.
         const SIZE: usize = std::mem::size_of::<$type>();
-        let input = [42; BUFFER_SIZE_BYTES];
+        let input = [42; BUFFER_SIZE];
         let x = 0x01234567_89ABCDEF_76543210_FEDCBA98_u128 as $type;
         let x_bytes = &x.to_ne_bytes();

         for i in 1..=SIZE {
-            let s = &input[..BUFFER_SIZE_BYTES - i];
+            let s = &input[..BUFFER_SIZE - i];

             let mut h1 = SipHasher128::new_with_keys(7, 13);
             h1.write(s);
