|
| 1 | +use std::collections::hash_map::{DefaultHasher, RandomState}; |
| 2 | +use std::hash::{BuildHasher, Hash, Hasher}; |
| 3 | + |
/// A [Bloom filter](https://en.wikipedia.org/wiki/Bloom_filter) is a probabilistic data
/// structure that tests whether an element belongs to a set.
///
/// Its contract therefore looks very much like the one of a set such as `HashSet`:
/// items can be inserted, and the structure can be asked whether it contains an item.
trait BloomFilter<Item>
where
    Item: Hash,
{
    /// Records `item` in the filter.
    fn insert(&mut self, item: Item);

    /// Asks the filter whether `item` has been inserted.
    fn contains(&self, item: &Item) -> bool;
}
| 10 | + |
| 11 | +/// What is the point of using a Bloom Filter if it acts like a Set? |
| 12 | +/// Let's imagine we have a huge number of elements to store (like un unbounded data stream) a Set storing every element will most likely take up too much space, at some point. |
| 13 | +/// As other probabilistic data structures like Count-min Sketch, the goal of a Bloom Filter is to trade off exactitude for constant space. |
| 14 | +/// We won't have a strictly exact result of whether the value belongs to the set, but we'll use constant space instead |
| 15 | +
|
| 16 | +/// Let's start with the basic idea behind the implementation |
| 17 | +/// Let's start by trying to make a `HashSet` with constant space: |
| 18 | +/// Instead of storing every element and grow the set infinitely, let's use a vector with constant capacity `CAPACITY` |
| 19 | +/// Each element of this vector will be a boolean. |
| 20 | +/// When a new element is inserted, we hash its value and set the index at index `hash(item) % CAPACITY` to `true` |
| 21 | +/// When looking for an item, we hash its value and retrieve the boolean at index `hash(item) % CAPACITY` |
| 22 | +/// If it's `false` it's absolutely sure the item isn't present |
| 23 | +/// If it's `true` the item may be present, or maybe another one produces the same hash |
| 24 | +#[derive(Debug)] |
| 25 | +struct BasicBloomFilter<const CAPACITY: usize> { |
| 26 | + vec: [bool; CAPACITY], |
| 27 | +} |
| 28 | + |
| 29 | +impl<const CAPACITY: usize> Default for BasicBloomFilter<CAPACITY> { |
| 30 | + fn default() -> Self { |
| 31 | + Self { |
| 32 | + vec: [false; CAPACITY], |
| 33 | + } |
| 34 | + } |
| 35 | +} |
| 36 | + |
| 37 | +impl<Item: Hash, const CAPACITY: usize> BloomFilter<Item> for BasicBloomFilter<CAPACITY> { |
| 38 | + fn insert(&mut self, item: Item) { |
| 39 | + let mut hasher = DefaultHasher::new(); |
| 40 | + item.hash(&mut hasher); |
| 41 | + let idx = (hasher.finish() % CAPACITY as u64) as usize; |
| 42 | + self.vec[idx] = true; |
| 43 | + } |
| 44 | + |
| 45 | + fn contains(&self, item: &Item) -> bool { |
| 46 | + let mut hasher = DefaultHasher::new(); |
| 47 | + item.hash(&mut hasher); |
| 48 | + let idx = (hasher.finish() % CAPACITY as u64) as usize; |
| 49 | + self.vec[idx] |
| 50 | + } |
| 51 | +} |
| 52 | + |
| 53 | +/// Can we improve it? Certainly, in different ways. |
| 54 | +/// One pattern you may have identified here is that we use a "binary array" (a vector of binary values) |
| 55 | +/// For instance, we might have `[0,1,0,0,1,0]`, which is actually the binary representation of 9 |
| 56 | +/// This means we can immediately replace our `Vec<bool>` by an actual number |
| 57 | +/// What would it mean to set a `1` at index `i`? |
| 58 | +/// Imagine a `CAPACITY` of `6`. The initial value for our mask is `000000`. |
| 59 | +/// We want to store `"Bloom"`. Its hash modulo `CAPACITY` is `5`. Which means we need to set `1` at the last index. |
| 60 | +/// It can be performed by doing `000000 | 000001` |
| 61 | +/// Meaning we can hash the item value, use a modulo to find the index, and do a binary `or` between the current number and the index |
| 62 | +#[derive(Debug, Default)] |
| 63 | +struct SingleBinaryBloomFilter { |
| 64 | + fingerprint: u128, // let's use 128 bits, the equivalent of using CAPACITY=128 in the previous example |
| 65 | +} |
| 66 | + |
| 67 | +/// Given a value and a hash function, compute the hash and return the bit mask |
| 68 | +fn mask_128<T: Hash>(hasher: &mut DefaultHasher, item: T) -> u128 { |
| 69 | + item.hash(hasher); |
| 70 | + let idx = (hasher.finish() % 128) as u32; |
| 71 | + // idx is where we want to put a 1, let's convert this into a proper binary mask |
| 72 | + 2_u128.pow(idx) |
| 73 | +} |
| 74 | + |
| 75 | +impl<T: Hash> BloomFilter<T> for SingleBinaryBloomFilter { |
| 76 | + fn insert(&mut self, item: T) { |
| 77 | + self.fingerprint |= mask_128(&mut DefaultHasher::new(), &item); |
| 78 | + } |
| 79 | + |
| 80 | + fn contains(&self, item: &T) -> bool { |
| 81 | + (self.fingerprint & mask_128(&mut DefaultHasher::new(), item)) > 0 |
| 82 | + } |
| 83 | +} |
| 84 | + |
/// We may have made some progress in terms of CPU efficiency, using binary operators.
/// But we might still run into a lot of collisions with a single 128-bit number.
/// Can we use greater numbers then? Currently, our implementation is limited to 128 bits.
///
/// Should we go back to using an array, then?
/// We could! But instead of using `Vec<bool>` we could use `Vec<u8>`.
/// Each `u8` can act as a mask as we've done before, and is actually 1 byte in memory (same as
/// a boolean!).
/// That allows us to go over 128 bits, while dividing the memory footprint by 8 compared to a
/// vector of booleans.
/// That's one thing, and it involves dividing / shifting by 8 in different places.
///
/// But still, can we reduce the collisions further?
///
/// As with a count-min sketch, we can use multiple hash functions.
/// When inserting a value, we compute its hash with every hash function (`hash_i`) and perform
/// the same operation as above (the OR with the fingerprint).
/// Then when looking for a value, if **ANY** of the tests (`hash` then `AND`) returns 0, the
/// value is missing from the set, otherwise that test would have returned 1.
/// If every test returns `1`, it **may** be that the item is present, but it could also be a
/// collision.
/// This is what a Bloom Filter is about: returning `false` means the value is necessarily
/// absent, and returning `true` means it may be present.
#[derive(Debug)]
pub struct MultiBinaryBloomFilter {
    /// Number of addressable bits in the filter (always at least 1).
    filter_size: usize,
    /// Bit storage, 8 bits per entry.
    bytes: Vec<u8>,
    /// One independently keyed hash builder per hash function.
    hash_builders: Vec<RandomState>,
}

impl MultiBinaryBloomFilter {
    /// Creates an empty filter of `filter_size` bits using `hash_count` hash functions.
    ///
    /// Both dimensions are raised to `1` when given as `0`: a filter without any bit cannot be
    /// indexed, and a filter without any hash function cannot record anything.
    pub fn with_dimensions(filter_size: usize, hash_count: usize) -> Self {
        let filter_size = filter_size.max(1);
        let hash_count = hash_count.max(1);
        // We need 8 times fewer entries in the array since we are using bytes, rounded up so
        // that every bit index below `filter_size` has a byte to live in.
        let bytes_count = filter_size / 8 + usize::from(filter_size % 8 > 0);
        // Each hash function needs its own `RandomState`: `vec![RandomState::new(); n]` would
        // clone a single state `n` times, making all the hash functions identical.
        let hash_builders = (0..hash_count).map(|_| RandomState::new()).collect();
        Self {
            filter_size,
            bytes: vec![0; bytes_count],
            hash_builders,
        }
    }

    /// Creates an empty filter dimensioned for `estimated_count_of_items` insertions and a
    /// false-positive probability of at most `max_false_positive_probability`.
    ///
    /// The probability is expected to lie strictly between 0 and 1; other values yield a
    /// degenerate or impractically large filter. An estimate of `0` items is treated as `1`.
    pub fn from_estimate(
        estimated_count_of_items: usize,
        max_false_positive_probability: f64,
    ) -> Self {
        // Guard against 0 items, which would make the hash count below evaluate 0 / 0
        let items = estimated_count_of_items.max(1) as f64;
        // Check Wikipedia for these formulae:
        // bits = -n * ln(p) / ln(2)^2 and hashes = (bits / n) * ln(2)
        let optimal_filter_size =
            (-items * max_false_positive_probability.ln() / (2.0_f64.ln().powi(2))).ceil() as usize;
        let optimal_hash_count =
            ((optimal_filter_size as f64 / items) * 2.0_f64.ln()).ceil() as usize;
        Self::with_dimensions(optimal_filter_size, optimal_hash_count)
    }
}
| 133 | + |
| 134 | +impl<Item: Hash> BloomFilter<Item> for MultiBinaryBloomFilter { |
| 135 | + fn insert(&mut self, item: Item) { |
| 136 | + for builder in &self.hash_builders { |
| 137 | + let mut hasher = builder.build_hasher(); |
| 138 | + item.hash(&mut hasher); |
| 139 | + let hash = hasher.finish(); |
| 140 | + let index = hash % self.filter_size as u64; |
| 141 | + let byte_index = index as usize / 8; // this is this byte that we need to modify |
| 142 | + let bit_index = (index % 8) as u8; // we cannot only OR with value 1 this time, since we have 8 bits |
| 143 | + self.bytes[byte_index] |= 1 << bit_index; |
| 144 | + } |
| 145 | + } |
| 146 | + |
| 147 | + fn contains(&self, item: &Item) -> bool { |
| 148 | + for builder in &self.hash_builders { |
| 149 | + let mut hasher = builder.build_hasher(); |
| 150 | + item.hash(&mut hasher); |
| 151 | + let hash = hasher.finish(); |
| 152 | + let index = hash % self.filter_size as u64; |
| 153 | + let byte_index = index as usize / 8; // this is this byte that we need to modify |
| 154 | + let bit_index = (index % 8) as u8; // we cannot only OR with value 1 this time, since we have 8 bits |
| 155 | + if self.bytes[byte_index] & (1 << bit_index) == 0 { |
| 156 | + return false; |
| 157 | + } |
| 158 | + } |
| 159 | + true |
| 160 | + } |
| 161 | +} |
| 162 | + |
#[cfg(test)]
mod tests {
    use crate::data_structures::probabilistic::bloom_filter::{
        BasicBloomFilter, BloomFilter, MultiBinaryBloomFilter, SingleBinaryBloomFilter,
    };
    use quickcheck::{Arbitrary, Gen};
    use quickcheck_macros::quickcheck;
    use std::collections::HashSet;

    /// Random input for the properties below: the values to insert in a filter, and the
    /// values to look up afterwards.
    #[derive(Debug, Clone)]
    struct TestSet {
        to_insert: HashSet<i32>,
        to_test: Vec<i32>,
    }

    impl Arbitrary for TestSet {
        fn arbitrary(g: &mut Gen) -> Self {
            // Sizes below 50 are lifted by 50, so the distribution is not perfectly uniform
            let drawn = usize::arbitrary(g) % 5_000;
            let qty = if drawn < 50 { drawn + 50 } else { drawn };
            let mut to_insert = HashSet::with_capacity(qty);
            let mut to_test = Vec::with_capacity(qty);
            for _ in 0..qty {
                to_insert.insert(i32::arbitrary(g));
                to_test.push(i32::arbitrary(g));
            }
            TestSet { to_insert, to_test }
        }
    }

    /// Inserts every value of `items` into `filter`.
    fn fill<F: BloomFilter<i32>>(filter: &mut F, items: &HashSet<i32>) {
        for &item in items {
            filter.insert(item);
        }
    }

    /// Fails when `filter` denies containing a probe that is part of `inserted`.
    fn assert_no_false_negative<F: BloomFilter<i32>>(
        filter: &F,
        inserted: &HashSet<i32>,
        probes: &[i32],
    ) {
        for probe in probes {
            if !filter.contains(probe) {
                assert!(!inserted.contains(probe));
            }
        }
    }

    #[quickcheck]
    fn basic_filter_must_not_return_false_negative(TestSet { to_insert, to_test }: TestSet) {
        let mut basic_filter = BasicBloomFilter::<10_000>::default();
        fill(&mut basic_filter, &to_insert);
        assert_no_false_negative(&basic_filter, &to_insert, &to_test);
    }

    #[quickcheck]
    fn binary_filter_must_not_return_false_negative(TestSet { to_insert, to_test }: TestSet) {
        let mut binary_filter = SingleBinaryBloomFilter::default();
        fill(&mut binary_filter, &to_insert);
        assert_no_false_negative(&binary_filter, &to_insert, &to_test);
    }

    #[quickcheck]
    fn a_basic_filter_of_capacity_128_is_the_same_as_a_binary_filter(
        TestSet { to_insert, to_test }: TestSet,
    ) {
        // Change 128 to anything else here, and the test won't pass
        let mut basic_filter = BasicBloomFilter::<128>::default();
        let mut binary_filter = SingleBinaryBloomFilter::default();
        fill(&mut basic_filter, &to_insert);
        fill(&mut binary_filter, &to_insert);
        for probe in &to_test {
            // Both filters use `DefaultHasher::new()` and hold 128 bits, so they must give
            // exactly the same answers
            assert_eq!(basic_filter.contains(probe), binary_filter.contains(probe));
        }
    }

    const FALSE_POSITIVE_MAX: f64 = 0.05;

    #[quickcheck]
    fn a_multi_binary_bloom_filter_must_not_return_false_negatives(
        TestSet { to_insert, to_test }: TestSet,
    ) {
        let n = to_insert.len();
        if n == 0 {
            // Avoid dividing by 0 when adjusting the size
            return;
        }
        // The filter dimensions come from the formulae given on Wikipedia
        let mut binary_filter = MultiBinaryBloomFilter::from_estimate(n, FALSE_POSITIVE_MAX);
        for item in &to_insert {
            binary_filter.insert(*item);
        }
        let tests = to_test.len();
        let mut false_positives = 0_u32;
        for probe in &to_test {
            match (binary_filter.contains(probe), to_insert.contains(probe)) {
                // A negative answer must never concern an inserted value
                (false, inserted) => assert!(!inserted),
                // A positive answer for a value never inserted is a false positive
                (true, false) => false_positives += 1,
                (true, true) => {}
            }
        }
        let fp_rate = f64::from(false_positives) / tests as f64;
        // This isn't really a test; it keeps `fp_rate` around to print out or evaluate
        assert!(fp_rate < 1.0);
    }
}
0 commit comments