Commit 44fb98b

Add Count-Min Sketch (#485)
1 parent e01ace3 commit 44fb98b

5 files changed: +255 -0 lines changed

Cargo.toml (+4)
@@ -9,6 +9,10 @@ lazy_static = "1.4.0"
 num-bigint = { version = "0.4", optional = true }
 num-traits = { version = "0.2", optional = true }

+[dev-dependencies]
+quickcheck = "1.0"
+quickcheck_macros = "1.0"
+
 [features]
 default = ["big-math"]
 big-math = ["dep:num-bigint", "dep:num-traits"]
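The new [dev-dependencies] bring in quickcheck and quickcheck_macros for property-based testing, which the sketch's test module further down relies on. As a quick illustration of the pattern, here is a minimal, hypothetical property test in the same style; the reversing_twice_restores_the_original property is an illustrative example of ours, not something added by this commit:

#[cfg(test)]
mod quickcheck_example {
    // The attribute macro generates many random `Vec<u32>` inputs and runs the body for each of them.
    #[quickcheck_macros::quickcheck]
    fn reversing_twice_restores_the_original(xs: Vec<u32>) {
        let mut ys = xs.clone();
        ys.reverse();
        ys.reverse();
        assert_eq!(xs, ys);
    }
}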

DIRECTORY.md (+2)

@@ -47,6 +47,8 @@
 * [Treap](https://github.com/TheAlgorithms/Rust/blob/master/src/data_structures/treap.rs)
 * [Trie](https://github.com/TheAlgorithms/Rust/blob/master/src/data_structures/trie.rs)
 * [Union Find](https://github.com/TheAlgorithms/Rust/blob/master/src/data_structures/union_find.rs)
+* Probabilistic Data Structures
+  * [Count-min Sketch](https://github.com/TheAlgorithms/Rust/blob/master/src/data_structures/probabilistic/count_min_sketch.rs)
 * Dynamic Programming
 * [Coin Change](https://github.com/TheAlgorithms/Rust/blob/master/src/dynamic_programming/coin_change.rs)
 * [Edit Distance => See Levenshtein Distance](https://github.com/TheAlgorithms/Rust/blob/master/src/string/levenshtein_distance.rs)

src/data_structures/mod.rs (+1)

@@ -5,6 +5,7 @@ mod fenwick_tree;
 mod graph;
 mod heap;
 mod linked_list;
+pub mod probabilistic;
 mod queue;
 mod rb_tree;
 mod segment_tree;
src/data_structures/probabilistic/count_min_sketch.rs (+247)

@@ -0,0 +1,247 @@
use std::collections::hash_map::RandomState;
use std::fmt::{Debug, Formatter};
use std::hash::{BuildHasher, Hash, Hasher};

/// A probabilistic data structure holding an approximate count for diverse items efficiently (using constant space)
///
/// Let's imagine we want to count items from an incoming (unbounded) data stream
/// One way to do this would be to hold a frequency hashmap, counting element hashes
/// This works extremely well, but unfortunately would require a lot of memory if we have a huge diversity of incoming items in the data stream
///
/// CountMinSketch aims at solving this problem, trading the exact count for an approximate one, but going from a potentially unbounded space complexity to a constant one
/// See the implementation below for more details
///
/// Here is the definition of the different allowed operations on a CountMinSketch:
/// * increment the count of an item
/// * retrieve the count of an item
pub trait CountMinSketch {
    type Item;

    fn increment(&mut self, item: Self::Item);
    fn increment_by(&mut self, item: Self::Item, count: usize);
    fn get_count(&self, item: Self::Item) -> usize;
}

/// The common implementation of a CountMinSketch,
/// holding a DEPTH x WIDTH matrix of counts
///
/// The idea behind the implementation is the following:
/// Let's start from our problem statement above. We have a frequency map of counts, and want to reduce its space complexity
/// The immediate way to do this would be to use a vector with a fixed size, let this size be `WIDTH`
/// We would hold the count of each item `item` in the vector, at index `i = hash(item) % WIDTH` where `hash` is a hash function: `item -> usize`
/// We now have constant space.
///
/// The problem though is that we'll potentially run into a lot of collisions.
/// Taking an extreme example, if `WIDTH = 1`, all items will have the same count, which is the sum of the counts of every item
/// We could reduce the number of collisions by using a bigger `WIDTH`, but then we wouldn't be much better off than with the "big" frequency map
/// How do we improve the solution while still keeping constant space?
///
/// The idea is to use not just one vector, but multiple (`DEPTH`) ones, and attach a different `hash` function to each vector
/// This would lead to the following data structure:
///           <- WIDTH = 5 ->
/// D hash1: [0, 0, 0, 0, 0]
/// E hash2: [0, 0, 0, 0, 0]
/// P hash3: [0, 0, 0, 0, 0]
/// T hash4: [0, 0, 0, 0, 0]
/// H hash5: [0, 0, 0, 0, 0]
/// = hash6: [0, 0, 0, 0, 0]
/// 7 hash7: [0, 0, 0, 0, 0]
/// Since every row uses a different hash function, the same item will generally hash to a different index in each row.
/// Let's say we hash "TEST" and:
/// hash1("TEST") = 42 => idx = 2
/// hash2("TEST") = 26 => idx = 1
/// hash3("TEST") = 10 => idx = 0
/// hash4("TEST") = 33 => idx = 3
/// hash5("TEST") = 54 => idx = 4
/// hash6("TEST") = 11 => idx = 1
/// hash7("TEST") = 50 => idx = 0
/// This would lead our structure to become:
///           <- WIDTH = 5 ->
/// D hash1: [0, 0, 1, 0, 0]
/// E hash2: [0, 1, 0, 0, 0]
/// P hash3: [1, 0, 0, 0, 0]
/// T hash4: [0, 0, 0, 1, 0]
/// H hash5: [0, 0, 0, 0, 1]
/// = hash6: [0, 1, 0, 0, 0]
/// 7 hash7: [1, 0, 0, 0, 0]
///
/// Now say we hash "OTHER" and:
/// hash1("OTHER") = 23 => idx = 3
/// hash2("OTHER") = 11 => idx = 1
/// hash3("OTHER") = 52 => idx = 2
/// hash4("OTHER") = 25 => idx = 0
/// hash5("OTHER") = 31 => idx = 1
/// hash6("OTHER") = 24 => idx = 4
/// hash7("OTHER") = 30 => idx = 0
/// Leading our data structure to become:
///           <- WIDTH = 5 ->
/// D hash1: [0, 0, 1, 1, 0]
/// E hash2: [0, 2, 0, 0, 0]
/// P hash3: [1, 0, 1, 0, 0]
/// T hash4: [1, 0, 0, 1, 0]
/// H hash5: [0, 1, 0, 0, 1]
/// = hash6: [0, 1, 0, 0, 1]
/// 7 hash7: [2, 0, 0, 0, 0]
///
/// We can actually witness some collisions (the counts of `2` in some rows above, even though each item was inserted only once).
/// This is why, when we return the count for "TEST", we fetch its count from every row and return the minimum value.
///
/// This count can still be overestimated if we have a huge number of entries and a lot of collisions,
/// but an interesting property is that the count we return can never be underestimated.
pub struct HashCountMinSketch<Item: Hash, const WIDTH: usize, const DEPTH: usize> {
    phantom: std::marker::PhantomData<Item>, // marker so the struct can be generic over the `Item` type
    counts: [[usize; WIDTH]; DEPTH],
    hashers: [RandomState; DEPTH],
}

impl<Item: Hash, const WIDTH: usize, const DEPTH: usize> Debug
    for HashCountMinSketch<Item, WIDTH, DEPTH>
{
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("Item").field("vecs", &self.counts).finish()
    }
}

impl<T: Hash, const WIDTH: usize, const DEPTH: usize> Default
    for HashCountMinSketch<T, WIDTH, DEPTH>
{
    fn default() -> Self {
        // Build one independently (randomly) seeded hasher per row
        let hashers = std::array::from_fn(|_| RandomState::new());

        Self {
            phantom: Default::default(),
            counts: [[0; WIDTH]; DEPTH],
            hashers,
        }
    }
}

impl<Item: Hash, const WIDTH: usize, const DEPTH: usize> CountMinSketch
    for HashCountMinSketch<Item, WIDTH, DEPTH>
{
    type Item = Item;

    fn increment(&mut self, item: Self::Item) {
        self.increment_by(item, 1)
    }

    fn increment_by(&mut self, item: Self::Item, count: usize) {
        for (row, r) in self.hashers.iter().enumerate() {
            // Hash the item with this row's hasher and bump the corresponding cell
            let mut h = r.build_hasher();
            item.hash(&mut h);
            let hashed = h.finish();
            let col = (hashed % WIDTH as u64) as usize;
            self.counts[row][col] += count;
        }
    }

    fn get_count(&self, item: Self::Item) -> usize {
        // Return the minimum count across all rows: it is the one least affected by collisions
        self.hashers
            .iter()
            .enumerate()
            .map(|(row, r)| {
                let mut h = r.build_hasher();
                item.hash(&mut h);
                let hashed = h.finish();
                let col = (hashed % WIDTH as u64) as usize;
                self.counts[row][col]
            })
            .min()
            .unwrap() // safe as long as DEPTH > 0
    }
}

#[cfg(test)]
mod tests {
    use crate::data_structures::probabilistic::count_min_sketch::{
        CountMinSketch, HashCountMinSketch,
    };
    use quickcheck::{Arbitrary, Gen};
    use std::collections::HashSet;

    #[test]
    fn hash_functions_should_hash_differently() {
        let mut sketch: HashCountMinSketch<&str, 50, 50> = HashCountMinSketch::default(); // use a big DEPTH
        sketch.increment("something");
        // We want to check that our hash functions actually produce different results, so we store the indices where we encounter a count == 1 in a set
        let mut indices_of_ones: HashSet<usize> = HashSet::default();
        for counts in sketch.counts {
            let ones = counts
                .into_iter()
                .enumerate()
                .filter_map(|(idx, count)| (count == 1).then_some(idx))
                .collect::<Vec<_>>();
            assert_eq!(1, ones.len());
            indices_of_ones.insert(ones[0]);
        }
        // Given the parameters (WIDTH = 50, DEPTH = 50) it's extremely unlikely that all hash functions hash to the same index,
        // but we want to catch a bug where all hash functions would produce the same hash (or hash to the same index)
        assert!(indices_of_ones.len() > 1);
    }

    #[test]
    fn inspect_counts() {
        let mut sketch: HashCountMinSketch<&str, 5, 7> = HashCountMinSketch::default();
        sketch.increment("test");
        // Inspect internal state: every row should now hold exactly one cell with a count of 1
        for counts in sketch.counts {
            let zeroes = counts.iter().filter(|count| **count == 0).count();
            assert_eq!(4, zeroes);
            let ones = counts.iter().filter(|count| **count == 1).count();
            assert_eq!(1, ones);
        }
        sketch.increment("test");
        for counts in sketch.counts {
            let zeroes = counts.iter().filter(|count| **count == 0).count();
            assert_eq!(4, zeroes);
            let twos = counts.iter().filter(|count| **count == 2).count();
            assert_eq!(1, twos);
        }

        // This one is actually deterministic
        assert_eq!(2, sketch.get_count("test"));
    }

    #[derive(Debug, Clone, Eq, PartialEq, Hash)]
    struct TestItem {
        item: String,
        count: usize,
    }

    const MAX_STR_LEN: u8 = 30;
    const MAX_COUNT: usize = 20;

    impl Arbitrary for TestItem {
        fn arbitrary(g: &mut Gen) -> Self {
            let str_len = u8::arbitrary(g) % MAX_STR_LEN;
            let mut str = String::with_capacity(str_len as usize);
            for _ in 0..str_len {
                str.push(char::arbitrary(g));
            }
            let count = usize::arbitrary(g) % MAX_COUNT;
            TestItem { item: str, count }
        }
    }

    #[quickcheck_macros::quickcheck]
    fn must_not_underestimate_count(test_items: Vec<TestItem>) {
        let test_items = test_items.into_iter().collect::<HashSet<_>>(); // remove duplicates (they would lead to incorrect expected counts)
        let n = test_items.len();
        let mut sketch: HashCountMinSketch<String, 50, 10> = HashCountMinSketch::default();
        let mut exact_count = 0;
        for TestItem { item, count } in &test_items {
            sketch.increment_by(item.clone(), *count);
        }
        for TestItem { item, count } in test_items {
            let stored_count = sketch.get_count(item);
            assert!(stored_count >= count);
            if count == stored_count {
                exact_count += 1;
            }
        }
        if n > 20 {
            // if n is too small, the statistic isn't really relevant
            let exact_ratio = exact_count as f64 / n as f64;
            assert!(exact_ratio > 0.7); // a formal bound is hard to derive, but this threshold should hold comfortably
        }
    }
}
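For reference, a minimal usage sketch of the new API (not part of the commit itself), written as it would look from inside the crate; the WIDTH = 50 / DEPTH = 10 parameters simply mirror the ones used by the property test above:

use crate::data_structures::probabilistic::count_min_sketch::{CountMinSketch, HashCountMinSketch};

fn count_min_sketch_usage() {
    // 10 rows of 50 counters: 500 usize cells, regardless of how many distinct items are streamed in
    let mut sketch: HashCountMinSketch<&str, 50, 10> = HashCountMinSketch::default();

    sketch.increment("apple");
    sketch.increment("apple");
    sketch.increment_by("banana", 5);

    // Counts are never underestimated, but they may be overestimated when rows collide
    assert!(sketch.get_count("apple") >= 2);
    assert!(sketch.get_count("banana") >= 5);
    let _unseen = sketch.get_count("cherry"); // usually 0, possibly > 0 due to collisions
}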
src/data_structures/probabilistic/mod.rs (+1)

@@ -0,0 +1 @@
pub mod count_min_sketch;
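Not stated anywhere in the commit, but useful context for picking the two const generics: the standard Count-Min Sketch analysis (Cormode and Muthukrishnan) chooses WIDTH = ceil(e / epsilon) and DEPTH = ceil(ln(1 / delta)), which guarantees that each returned count overshoots the true count by at most epsilon * N with probability at least 1 - delta, where N is the total of all increments so far. For the WIDTH = 50, DEPTH = 10 used in the property test, that works out to roughly epsilon ≈ e / 50 ≈ 0.054 and delta ≈ e^(-10) ≈ 4.5e-5.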

0 commit comments
