Skip to content

Commit a01d85a

Browse files
authored
Add Bloom filter (#497)
1 parent 9d03694 commit a01d85a

File tree

3 files changed

+270
-0
lines changed

3 files changed

+270
-0
lines changed

DIRECTORY.md

+1
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
* [Union Find](https://github.com/TheAlgorithms/Rust/blob/master/src/data_structures/union_find.rs)
5050
* Probabilistic Data Structures
5151
* [Count-min Sketch](https://github.com/TheAlgorithms/Rust/blob/master/src/data_structures/probabilistic/count_min_sketch.rs)
52+
* [Bloom Filter](https://github.com/TheAlgorithms/Rust/blob/master/src/data_structures/probabilistic/bloom_filter.rs)
5253
* Dynamic Programming
5354
* [Coin Change](https://github.com/TheAlgorithms/Rust/blob/master/src/dynamic_programming/coin_change.rs)
5455
* [Edit Distance => See Levenshtein Distance](https://github.com/TheAlgorithms/Rust/blob/master/src/string/levenshtein_distance.rs)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,268 @@
1+
use std::collections::hash_map::{DefaultHasher, RandomState};
2+
use std::hash::{BuildHasher, Hash, Hasher};
3+
4+
/// A [Bloom filter](https://en.wikipedia.org/wiki/Bloom_filter) is a probabilistic data structure
/// that tests whether an element belongs to a set or not.
///
/// Its contract therefore looks very close to the one of a set, for example a `HashSet`:
/// items are inserted, and membership can be queried afterwards.
trait BloomFilter<Item>
where
    Item: Hash,
{
    /// Records `item` in the filter.
    fn insert(&mut self, item: Item);

    /// Tells whether `item` may have been recorded by a previous call to `insert`.
    fn contains(&self, item: &Item) -> bool;
}
10+
11+
/// What is the point of using a Bloom Filter if it acts like a Set?
12+
/// Let's imagine we have a huge number of elements to store (like an unbounded data stream): a Set storing every element will most likely take up too much space at some point.
13+
/// Like other probabilistic data structures such as Count-min Sketch, the goal of a Bloom Filter is to trade off exactness for constant space.
14+
/// We won't have a strictly exact result of whether the value belongs to the set, but we'll use constant space instead
15+
16+
/// Let's start with the basic idea behind the implementation
17+
/// Let's start by trying to make a `HashSet` with constant space:
18+
/// Instead of storing every element and grow the set infinitely, let's use a vector with constant capacity `CAPACITY`
19+
/// Each element of this vector will be a boolean.
20+
/// When a new element is inserted, we hash its value and set the index at index `hash(item) % CAPACITY` to `true`
21+
/// When looking for an item, we hash its value and retrieve the boolean at index `hash(item) % CAPACITY`
22+
/// If it's `false` it's absolutely sure the item isn't present
23+
/// If it's `true` the item may be present, or maybe another one produces the same hash
24+
#[derive(Debug)]
25+
struct BasicBloomFilter<const CAPACITY: usize> {
26+
vec: [bool; CAPACITY],
27+
}
28+
29+
impl<const CAPACITY: usize> Default for BasicBloomFilter<CAPACITY> {
30+
fn default() -> Self {
31+
Self {
32+
vec: [false; CAPACITY],
33+
}
34+
}
35+
}
36+
37+
impl<Item: Hash, const CAPACITY: usize> BloomFilter<Item> for BasicBloomFilter<CAPACITY> {
38+
fn insert(&mut self, item: Item) {
39+
let mut hasher = DefaultHasher::new();
40+
item.hash(&mut hasher);
41+
let idx = (hasher.finish() % CAPACITY as u64) as usize;
42+
self.vec[idx] = true;
43+
}
44+
45+
fn contains(&self, item: &Item) -> bool {
46+
let mut hasher = DefaultHasher::new();
47+
item.hash(&mut hasher);
48+
let idx = (hasher.finish() % CAPACITY as u64) as usize;
49+
self.vec[idx]
50+
}
51+
}
52+
53+
/// Can we improve it? Certainly, in different ways.
54+
/// One pattern you may have identified here is that we use a "binary array" (a vector of binary values)
55+
/// For instance, we might have `[0,1,0,0,1,0]`, which is actually the binary representation of 9
56+
/// This means we can immediately replace our `Vec<bool>` by an actual number
57+
/// What would it mean to set a `1` at index `i`?
58+
/// Imagine a `CAPACITY` of `6`. The initial value for our mask is `000000`.
59+
/// We want to store `"Bloom"`. Its hash modulo `CAPACITY` is `5`. Which means we need to set `1` at the last index.
60+
/// It can be performed by doing `000000 | 000001`
61+
/// Meaning we can hash the item value, use a modulo to find the index, and do a binary `or` between the current number and the index
62+
#[derive(Debug, Default)]
63+
struct SingleBinaryBloomFilter {
64+
fingerprint: u128, // let's use 128 bits, the equivalent of using CAPACITY=128 in the previous example
65+
}
66+
67+
/// Given a value and a hash function, compute the hash and return the bit mask
68+
fn mask_128<T: Hash>(hasher: &mut DefaultHasher, item: T) -> u128 {
69+
item.hash(hasher);
70+
let idx = (hasher.finish() % 128) as u32;
71+
// idx is where we want to put a 1, let's convert this into a proper binary mask
72+
2_u128.pow(idx)
73+
}
74+
75+
impl<T: Hash> BloomFilter<T> for SingleBinaryBloomFilter {
76+
fn insert(&mut self, item: T) {
77+
self.fingerprint |= mask_128(&mut DefaultHasher::new(), &item);
78+
}
79+
80+
fn contains(&self, item: &T) -> bool {
81+
(self.fingerprint & mask_128(&mut DefaultHasher::new(), item)) > 0
82+
}
83+
}
84+
85+
/// We may have made some progress in terms of CPU efficiency, using binary operators.
/// But we might still run into a lot of collisions with a single 128-bit number.
/// Can we use greater numbers then? Currently, our implementation is limited to 128 bits.
///
/// Should we go back to using an array, then?
/// We could! But instead of using `Vec<bool>` we could use `Vec<u8>`.
/// Each `u8` can act as a mask as we've done before, and is actually 1 byte in memory (same as a boolean!)
/// That'd allow us to go over 128 bits, while dividing the memory footprint by 8 compared to booleans.
/// That's one thing, and will involve dividing / shifting by 8 in different places.
///
/// But still, can we reduce the collisions further?
///
/// As we did with count-min-sketch, we could use multiple hash functions.
/// When inserting a value, we compute its hash with every hash function (`hash_i`) and perform the same operation as above (the OR with `fingerprint`)
/// Then when looking for a value, if **ANY** of the tests (`hash` then `AND`) returns 0 then this means the value is missing from the set, otherwise it would have returned 1
/// If they all return `1`, it **may** be that the item is present, but could also be a collision
/// This is what a Bloom Filter is about: returning `false` means the value is necessarily absent, and returning true means it may be present
#[derive(Debug)]
pub struct MultiBinaryBloomFilter {
    /// Number of addressable bits in the filter (always at least 1).
    filter_size: usize,
    /// Bit storage, 8 filter bits per byte.
    bytes: Vec<u8>,
    /// One independently keyed hash function per entry.
    hash_builders: Vec<RandomState>,
}

impl MultiBinaryBloomFilter {
    /// Creates an empty filter of `filter_size` bits, probed by `hash_count` hash functions.
    ///
    /// A `filter_size` of `0` is raised to `1`, because bit positions are computed as
    /// `hash % filter_size` and a zero-sized filter would panic on the first insert or lookup.
    pub fn with_dimensions(filter_size: usize, hash_count: usize) -> Self {
        let filter_size = filter_size.max(1);
        // we need 8 times less entries in the array, since we are using bytes; round up so that
        // a size which is not a multiple of 8 still gets its last, partially used, byte
        let bytes_count = filter_size / 8 + usize::from(filter_size % 8 > 0);
        Self {
            filter_size,
            bytes: vec![0; bytes_count],
            // Each builder must come from its own `RandomState::new()` call.
            // `vec![RandomState::new(); hash_count]` would clone a single state, giving every
            // "hash function" the same keys and thus the same output.
            hash_builders: (0..hash_count).map(|_| RandomState::new()).collect(),
        }
    }

    /// Creates an empty filter dimensioned for `estimated_count_of_items` insertions and a
    /// false-positive probability of at most `max_false_positive_probability`.
    ///
    /// The probability is expected to lie strictly between `0` and `1`.
    /// An estimate of `0` items is treated as `1`, and at least one bit and one hash function
    /// are always allocated, so the resulting filter is never degenerate.
    pub fn from_estimate(
        estimated_count_of_items: usize,
        max_false_positive_probability: f64,
    ) -> Self {
        // Avoid 0 / 0 below when no item is expected
        let item_count = estimated_count_of_items.max(1) as f64;
        // Check Wikipedia for these formulae:
        //   m = -n * ln(p) / ln(2)^2     (number of bits)
        //   k = (m / n) * ln(2)          (number of hash functions)
        let optimal_filter_size =
            (-item_count * max_false_positive_probability.ln() / 2.0_f64.ln().powi(2)).ceil()
                as usize;
        let optimal_hash_count =
            ((optimal_filter_size as f64 / item_count) * 2.0_f64.ln()).ceil() as usize;
        Self::with_dimensions(optimal_filter_size, optimal_hash_count.max(1))
    }
}
133+
134+
impl<Item: Hash> BloomFilter<Item> for MultiBinaryBloomFilter {
135+
fn insert(&mut self, item: Item) {
136+
for builder in &self.hash_builders {
137+
let mut hasher = builder.build_hasher();
138+
item.hash(&mut hasher);
139+
let hash = hasher.finish();
140+
let index = hash % self.filter_size as u64;
141+
let byte_index = index as usize / 8; // this is this byte that we need to modify
142+
let bit_index = (index % 8) as u8; // we cannot only OR with value 1 this time, since we have 8 bits
143+
self.bytes[byte_index] |= 1 << bit_index;
144+
}
145+
}
146+
147+
fn contains(&self, item: &Item) -> bool {
148+
for builder in &self.hash_builders {
149+
let mut hasher = builder.build_hasher();
150+
item.hash(&mut hasher);
151+
let hash = hasher.finish();
152+
let index = hash % self.filter_size as u64;
153+
let byte_index = index as usize / 8; // this is this byte that we need to modify
154+
let bit_index = (index % 8) as u8; // we cannot only OR with value 1 this time, since we have 8 bits
155+
if self.bytes[byte_index] & (1 << bit_index) == 0 {
156+
return false;
157+
}
158+
}
159+
true
160+
}
161+
}
162+
163+
#[cfg(test)]
mod tests {
    use crate::data_structures::probabilistic::bloom_filter::{
        BasicBloomFilter, BloomFilter, MultiBinaryBloomFilter, SingleBinaryBloomFilter,
    };
    use quickcheck::{Arbitrary, Gen};
    use quickcheck_macros::quickcheck;
    use std::collections::HashSet;

    /// Random workload: a set of values to insert in a filter, and a list of values to look up.
    #[derive(Debug, Clone)]
    struct TestSet {
        to_insert: HashSet<i32>,
        to_test: Vec<i32>,
    }

    impl Arbitrary for TestSet {
        fn arbitrary(g: &mut Gen) -> Self {
            let mut qty = usize::arbitrary(g) % 5_000;
            if qty < 50 {
                qty += 50; // won't be perfectly uniformly distributed
            }
            let mut to_insert = HashSet::with_capacity(qty);
            let mut to_test = Vec::with_capacity(qty);
            for _ in 0..(qty) {
                to_insert.insert(i32::arbitrary(g));
                to_test.push(i32::arbitrary(g));
            }
            TestSet { to_insert, to_test }
        }
    }

    #[quickcheck]
    fn basic_filter_must_not_return_false_negative(TestSet { to_insert, to_test }: TestSet) {
        let mut basic_filter = BasicBloomFilter::<10_000>::default();
        for item in &to_insert {
            basic_filter.insert(*item);
        }
        for other in to_test {
            if !basic_filter.contains(&other) {
                assert!(
                    !to_insert.contains(&other),
                    "false negative for {}",
                    other
                )
            }
        }
    }

    #[quickcheck]
    fn binary_filter_must_not_return_false_negative(TestSet { to_insert, to_test }: TestSet) {
        let mut binary_filter = SingleBinaryBloomFilter::default();
        for item in &to_insert {
            binary_filter.insert(*item);
        }
        for other in to_test {
            if !binary_filter.contains(&other) {
                assert!(
                    !to_insert.contains(&other),
                    "false negative for {}",
                    other
                )
            }
        }
    }

    #[quickcheck]
    fn a_basic_filter_of_capacity_128_is_the_same_as_a_binary_filter(
        TestSet { to_insert, to_test }: TestSet,
    ) {
        let mut basic_filter = BasicBloomFilter::<128>::default(); // change 128 to anything else here, and the test won't pass
        let mut binary_filter = SingleBinaryBloomFilter::default();
        for item in &to_insert {
            basic_filter.insert(*item);
            binary_filter.insert(*item);
        }
        for other in to_test {
            // Since we use the same DefaultHasher::new(), and both have size 128, we should have exactly the same results
            assert_eq!(
                basic_filter.contains(&other),
                binary_filter.contains(&other)
            );
        }
    }

    const FALSE_POSITIVE_MAX: f64 = 0.05;

    #[quickcheck]
    fn a_multi_binary_bloom_filter_must_not_return_false_negatives(
        TestSet { to_insert, to_test }: TestSet,
    ) {
        let n = to_insert.len();
        if n == 0 {
            // avoid dividing by 0 when adjusting the size
            return;
        }
        // See Wikipedia for those formulae
        let mut binary_filter = MultiBinaryBloomFilter::from_estimate(n, FALSE_POSITIVE_MAX);
        for item in &to_insert {
            binary_filter.insert(*item);
        }
        let tests = to_test.len();
        let mut false_positives = 0;
        for other in to_test {
            if !binary_filter.contains(&other) {
                assert!(
                    !to_insert.contains(&other),
                    "false negative for {}",
                    other
                )
            } else if !to_insert.contains(&other) {
                // false positive
                false_positives += 1;
            }
        }
        let fp_rate = false_positives as f64 / tests as f64;
        assert!(fp_rate < 1.0); // This isn't really a test, but so that you have the `fp_rate` variable to print out, or evaluate
    }
}
+1
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
1+
pub mod bloom_filter;
12
pub mod count_min_sketch;

0 commit comments

Comments
 (0)