|
| 1 | +use std::collections::hash_map::RandomState; |
| 2 | +use std::fmt::{Debug, Formatter}; |
| 3 | +use std::hash::{BuildHasher, Hash, Hasher}; |
| 4 | + |
| 5 | +/// A probabilistic data structure holding an approximate count for diverse items efficiently (using constant space) |
| 6 | +/// |
| 7 | +/// Let's imagine we want to count items from an incoming (unbounded) data stream |
| 8 | +/// One way to do this would be to hold a frequency hashmap, counting element hashes |
| 9 | +/// This works extremely well, but unfortunately would require a lot of memory if we have a huge diversity of incoming items in the data stream |
| 10 | +/// |
| 11 | +/// CountMinSketch aims at solving this problem, trading off the exact count for an approximate one, but getting from potentially unbounded space complexity to constant complexity |
| 12 | +/// See the implementation below for more details |
| 13 | +/// |
| 14 | +/// Here is the definition of the different allowed operations on a CountMinSketch: |
| 15 | +/// * increment the count of an item |
| 16 | +/// * retrieve the count of an item |
pub trait CountMinSketch {
    /// The type of items counted by this sketch.
    type Item;

    /// Adds 1 to the approximate count of `item`.
    fn increment(&mut self, item: Self::Item);
    /// Adds `count` to the approximate count of `item`.
    fn increment_by(&mut self, item: Self::Item, count: usize);
    /// Returns the approximate count of `item`.
    /// The result may overestimate the true count (hash collisions) but never underestimates it.
    fn get_count(&self, item: Self::Item) -> usize;
}
| 24 | + |
| 25 | +/// The common implementation of a CountMinSketch |
| 26 | +/// Holding a DEPTH x WIDTH matrix of counts |
| 27 | +/// |
| 28 | +/// The idea behind the implementation is the following: |
| 29 | +/// Let's start from our problem statement above. We have a frequency map of counts, and want to go reduce its space complexity |
| 30 | +/// The immediate way to do this would be to use a Vector with a fixed size, let this size be `WIDTH` |
| 31 | +/// We will be holding the count of each item `item` in the Vector, at index `i = hash(item) % WIDTH` where `hash` is a hash function: `item -> usize` |
| 32 | +/// We now have constant space. |
| 33 | +/// |
| 34 | +/// The problem though is that we'll potentially run into a lot of collisions. |
/// Taking an extreme example, if `WIDTH = 1`, all items share a single counter, whose value is the sum of the counts of every item
| 36 | +/// We could reduce the amount of collisions by using a bigger `WIDTH` but this wouldn't be way more efficient than the "big" frequency map |
| 37 | +/// How do we improve the solution, but still keeping constant space? |
| 38 | +/// |
| 39 | +/// The idea is to use not just one vector, but multiple (`DEPTH`) ones and attach different `hash` functions to each vector |
| 40 | +/// This would lead to the following data structure: |
| 41 | +/// <- WIDTH = 5 -> |
| 42 | +/// D hash1: [0, 0, 0, 0, 0] |
| 43 | +/// E hash2: [0, 0, 0, 0, 0] |
| 44 | +/// P hash3: [0, 0, 0, 0, 0] |
| 45 | +/// T hash4: [0, 0, 0, 0, 0] |
| 46 | +/// H hash5: [0, 0, 0, 0, 0] |
| 47 | +/// = hash6: [0, 0, 0, 0, 0] |
| 48 | +/// 7 hash7: [0, 0, 0, 0, 0] |
/// Each row uses an independently seeded hash function, so the same item will (with high probability) map to different column indices in different rows.
| 50 | +/// Let's say we hash "TEST" and: |
| 51 | +/// hash1("TEST") = 42 => idx = 2 |
| 52 | +/// hash2("TEST") = 26 => idx = 1 |
| 53 | +/// hash3("TEST") = 10 => idx = 0 |
| 54 | +/// hash4("TEST") = 33 => idx = 3 |
| 55 | +/// hash5("TEST") = 54 => idx = 4 |
| 56 | +/// hash6("TEST") = 11 => idx = 1 |
| 57 | +/// hash7("TEST") = 50 => idx = 0 |
| 58 | +/// This would lead our structure to become: |
| 59 | +/// <- WIDTH = 5 -> |
| 60 | +/// D hash1: [0, 0, 1, 0, 0] |
| 61 | +/// E hash2: [0, 1, 0, 0, 0] |
| 62 | +/// P hash3: [1, 0, 0, 0, 0] |
| 63 | +/// T hash4: [0, 0, 0, 1, 0] |
| 64 | +/// H hash5: [0, 0, 0, 0, 1] |
| 65 | +/// = hash6: [0, 1, 0, 0, 0] |
| 66 | +/// 7 hash7: [1, 0, 0, 0, 0] |
| 67 | +/// |
| 68 | +/// Now say we hash "OTHER" and: |
| 69 | +/// hash1("OTHER") = 23 => idx = 3 |
| 70 | +/// hash2("OTHER") = 11 => idx = 1 |
| 71 | +/// hash3("OTHER") = 52 => idx = 2 |
| 72 | +/// hash4("OTHER") = 25 => idx = 0 |
| 73 | +/// hash5("OTHER") = 31 => idx = 1 |
| 74 | +/// hash6("OTHER") = 24 => idx = 4 |
| 75 | +/// hash7("OTHER") = 30 => idx = 0 |
| 76 | +/// Leading our data structure to become: |
| 77 | +/// <- WIDTH = 5 -> |
| 78 | +/// D hash1: [0, 0, 1, 1, 0] |
| 79 | +/// E hash2: [0, 2, 0, 0, 0] |
| 80 | +/// P hash3: [1, 0, 1, 0, 0] |
| 81 | +/// T hash4: [1, 0, 0, 1, 0] |
| 82 | +/// H hash5: [0, 1, 0, 0, 1] |
| 83 | +/// = hash6: [0, 1, 0, 0, 1] |
| 84 | +/// 7 hash7: [2, 0, 0, 0, 0] |
| 85 | +/// |
| 86 | +/// We actually can witness some collisions (invalid counts of `2` above in some rows). |
| 87 | +/// This means that if we have to return the count for "TEST", we'd actually fetch counts from every row and return the minimum value |
| 88 | +/// |
| 89 | +/// This could potentially be overestimated if we have a huge number of entries and a lot of collisions. |
| 90 | +/// But an interesting property is that the count we return for "TEST" cannot be underestimated |
pub struct HashCountMinSketch<Item: Hash, const WIDTH: usize, const DEPTH: usize> {
    // Zero-sized marker tying the generic `Item` type to the struct without storing a value of it.
    phantom: std::marker::PhantomData<Item>, // just a marker for Item to be used
    // DEPTH rows of WIDTH counters; row `r` is indexed via `hashers[r]`.
    counts: [[usize; WIDTH]; DEPTH],
    // One independently seeded hash function per row.
    hashers: [RandomState; DEPTH],
}
| 96 | + |
| 97 | +impl<Item: Hash, const WIDTH: usize, const DEPTH: usize> Debug |
| 98 | + for HashCountMinSketch<Item, WIDTH, DEPTH> |
| 99 | +{ |
| 100 | + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { |
| 101 | + f.debug_struct("Item").field("vecs", &self.counts).finish() |
| 102 | + } |
| 103 | +} |
| 104 | + |
| 105 | +impl<T: Hash, const WIDTH: usize, const DEPTH: usize> Default |
| 106 | + for HashCountMinSketch<T, WIDTH, DEPTH> |
| 107 | +{ |
| 108 | + fn default() -> Self { |
| 109 | + let hashers = std::array::from_fn(|_| RandomState::new()); |
| 110 | + |
| 111 | + Self { |
| 112 | + phantom: Default::default(), |
| 113 | + counts: [[0; WIDTH]; DEPTH], |
| 114 | + hashers, |
| 115 | + } |
| 116 | + } |
| 117 | +} |
| 118 | + |
| 119 | +impl<Item: Hash, const WIDTH: usize, const DEPTH: usize> CountMinSketch |
| 120 | + for HashCountMinSketch<Item, WIDTH, DEPTH> |
| 121 | +{ |
| 122 | + type Item = Item; |
| 123 | + |
| 124 | + fn increment(&mut self, item: Self::Item) { |
| 125 | + self.increment_by(item, 1) |
| 126 | + } |
| 127 | + |
| 128 | + fn increment_by(&mut self, item: Self::Item, count: usize) { |
| 129 | + for (row, r) in self.hashers.iter_mut().enumerate() { |
| 130 | + let mut h = r.build_hasher(); |
| 131 | + item.hash(&mut h); |
| 132 | + let hashed = h.finish(); |
| 133 | + let col = (hashed % WIDTH as u64) as usize; |
| 134 | + self.counts[row][col] += count; |
| 135 | + } |
| 136 | + } |
| 137 | + |
| 138 | + fn get_count(&self, item: Self::Item) -> usize { |
| 139 | + self.hashers |
| 140 | + .iter() |
| 141 | + .enumerate() |
| 142 | + .map(|(row, r)| { |
| 143 | + let mut h = r.build_hasher(); |
| 144 | + item.hash(&mut h); |
| 145 | + let hashed = h.finish(); |
| 146 | + let col = (hashed % WIDTH as u64) as usize; |
| 147 | + self.counts[row][col] |
| 148 | + }) |
| 149 | + .min() |
| 150 | + .unwrap() |
| 151 | + } |
| 152 | +} |
| 153 | + |
#[cfg(test)]
mod tests {
    use crate::data_structures::probabilistic::count_min_sketch::{
        CountMinSketch, HashCountMinSketch,
    };
    use quickcheck::{Arbitrary, Gen};
    use std::collections::HashSet;

    #[test]
    fn hash_functions_should_hash_differently() {
        let mut sketch: HashCountMinSketch<&str, 50, 50> = HashCountMinSketch::default(); // use a big DEPTH
        sketch.increment("something");
        // We want to check that our hash functions actually produce different results, so we'll store the indices where we encounter a count=1 in a set
        let mut indices_of_ones: HashSet<usize> = HashSet::default();
        for counts in sketch.counts {
            let ones = counts
                .into_iter()
                .enumerate()
                .filter_map(|(idx, count)| (count == 1).then_some(idx))
                .collect::<Vec<_>>();
            assert_eq!(1, ones.len());
            indices_of_ones.insert(ones[0]);
        }
        // Given the parameters (WIDTH = 50, DEPTH = 50) it's extremely unlikely that all hash functions hash to the same index
        assert!(indices_of_ones.len() > 1); // but we want to avoid a bug where all hash functions would produce the same hash (or hash to the same index)
    }

    #[test]
    fn inspect_counts() {
        let mut sketch: HashCountMinSketch<&str, 5, 7> = HashCountMinSketch::default();
        sketch.increment("test");
        // Inspect internal state: every row must have exactly one cell at 1, the rest at 0
        for counts in sketch.counts {
            let zeroes = counts.iter().filter(|count| **count == 0).count();
            assert_eq!(4, zeroes);
            let ones = counts.iter().filter(|count| **count == 1).count();
            assert_eq!(1, ones);
        }
        sketch.increment("test");
        // The same cells must now hold 2 (same item hashes to the same columns)
        for counts in sketch.counts {
            let zeroes = counts.iter().filter(|count| **count == 0).count();
            assert_eq!(4, zeroes);
            let twos = counts.iter().filter(|count| **count == 2).count();
            assert_eq!(1, twos);
        }

        // This one is actually deterministic
        assert_eq!(2, sketch.get_count("test"));
    }

    #[derive(Debug, Clone, Eq, PartialEq, Hash)]
    struct TestItem {
        item: String,
        count: usize,
    }

    const MAX_STR_LEN: u8 = 30;
    const MAX_COUNT: usize = 20;

    impl Arbitrary for TestItem {
        fn arbitrary(g: &mut Gen) -> Self {
            let str_len = u8::arbitrary(g) % MAX_STR_LEN;
            let mut str = String::with_capacity(str_len as usize);
            for _ in 0..str_len {
                str.push(char::arbitrary(g));
            }
            let count = usize::arbitrary(g) % MAX_COUNT;
            TestItem { item: str, count }
        }
    }

    // Renamed from `must_not_understimate_count` (typo) to the intended spelling.
    #[quickcheck_macros::quickcheck]
    fn must_not_underestimate_count(test_items: Vec<TestItem>) {
        let test_items = test_items.into_iter().collect::<HashSet<_>>(); // remove duplicates (would lead to weird counts)
        let n = test_items.len();
        let mut sketch: HashCountMinSketch<String, 50, 10> = HashCountMinSketch::default();
        let mut exact_count = 0;
        for TestItem { item, count } in &test_items {
            sketch.increment_by(item.clone(), *count);
        }
        for TestItem { item, count } in test_items {
            let stored_count = sketch.get_count(item);
            // Core Count-Min Sketch guarantee: counts can only be overestimated
            assert!(stored_count >= count);
            if count == stored_count {
                exact_count += 1;
            }
        }
        if n > 20 {
            // if n is too short, the stat isn't really relevant
            let exact_ratio = exact_count as f64 / n as f64;
            assert!(exact_ratio > 0.7); // the proof is quite hard, but this should be OK
        }
    }
}
0 commit comments