Skip to content

Commit e64eb8c

Browse files
author
Hendrik Muhs
authored
[ML] Frequent Items: use a bitset for deduplication (elastic#88943)
Speed up frequent_items by using bitsets instead of lists of longs. With this change, item sets can be de-duplicated faster. Each bit is set according to the order of the top items (by count).
1 parent 87ab933 commit e64eb8c

File tree

10 files changed

+996
-367
lines changed

10 files changed

+996
-367
lines changed

docs/changelog/88943.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 88943
2+
summary: "Frequent Items: use a bitset for deduplication"
3+
area: Machine Learning
4+
type: enhancement
5+
issues: []

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/frequentitemsets/CountingItemSetTraverser.java

Lines changed: 30 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,13 @@
77

88
package org.elasticsearch.xpack.ml.aggs.frequentitemsets;
99

10+
import org.apache.logging.log4j.LogManager;
11+
import org.apache.logging.log4j.Logger;
1012
import org.apache.lucene.util.BitSet;
1113
import org.apache.lucene.util.FixedBitSet;
12-
import org.apache.lucene.util.LongsRef;
1314
import org.elasticsearch.core.Releasable;
1415
import org.elasticsearch.core.Releasables;
16+
import org.elasticsearch.xpack.ml.aggs.frequentitemsets.TransactionStore.TopItemIds;
1517

1618
import java.io.IOException;
1719
import java.util.Arrays;
@@ -30,6 +32,7 @@
3032
* if [a, b] is not in T, [a, b, c] can not be in T either
3133
*/
3234
class CountingItemSetTraverser implements Releasable {
35+
private static final Logger logger = LogManager.getLogger(CountingItemSetTraverser.class);
3336

3437
// start size and size increment for the occurrences stack
3538
private static final int OCCURENCES_SIZE_INCREMENT = 10;
@@ -48,13 +51,19 @@ class CountingItemSetTraverser implements Releasable {
4851
// growable bit set from java util
4952
private java.util.BitSet visited;
5053

51-
CountingItemSetTraverser(TransactionStore transactionStore, int cacheTraversalDepth, int cacheNumberOfTransactions, long minCount) {
54+
CountingItemSetTraverser(
55+
TransactionStore transactionStore,
56+
TopItemIds topItemIds,
57+
int cacheTraversalDepth,
58+
int cacheNumberOfTransactions,
59+
long minCount
60+
) {
5261
this.transactionStore = transactionStore;
5362

5463
boolean success = false;
5564
try {
5665
// we allocate 2 big arrays, if the 2nd allocation fails, ensure we clean up
57-
this.topItemSetTraverser = transactionStore.getTopItemIdTraverser();
66+
this.topItemSetTraverser = new ItemSetTraverser(topItemIds);
5867
this.topTransactionIds = transactionStore.getTopTransactionIds();
5968
success = true;
6069
} finally {
@@ -80,11 +89,15 @@ public boolean next(long earlyStopMinCount) throws IOException {
8089
final long totalTransactionCount = transactionStore.getTotalTransactionCount();
8190

8291
int depth = topItemSetTraverser.getNumberOfItems();
92+
long occurencesOfSingleItem = transactionStore.getItemCount(topItemSetTraverser.getItemId());
93+
8394
if (depth == 1) {
8495
// at the 1st level, we can take the count directly from the transaction store
85-
occurencesStack[0] = transactionStore.getItemCount(topItemSetTraverser.getItemId());
96+
occurencesStack[0] = occurencesOfSingleItem;
97+
return true;
98+
} else if (occurencesOfSingleItem < earlyStopMinCount) {
99+
rememberCountInStack(depth, occurencesOfSingleItem);
86100
return true;
87-
88101
// till a certain depth store results in a cache matrix
89102
} else if (depth < cacheTraversalDepth) {
90103
// get the cached skip count
@@ -187,7 +200,7 @@ public long getCount() {
187200
/**
188201
* Get the count of the item set without the last item
189202
*/
190-
public long getPreviousCount() {
203+
public long getParentCount() {
191204
if (topItemSetTraverser.getNumberOfItems() > 1) {
192205
return occurencesStack[topItemSetTraverser.getNumberOfItems() - 2];
193206
}
@@ -201,7 +214,7 @@ public boolean hasBeenVisited() {
201214
return true;
202215
}
203216

204-
public boolean hasPredecessorBeenVisited() {
217+
public boolean hasParentBeenVisited() {
205218
if (topItemSetTraverser.getNumberOfItems() > 1) {
206219
return visited.get(topItemSetTraverser.getNumberOfItems() - 2);
207220
}
@@ -214,7 +227,7 @@ public void setVisited() {
214227
}
215228
}
216229

217-
public void setPredecessorVisited() {
230+
public void setParentVisited() {
218231
if (topItemSetTraverser.getNumberOfItems() > 1) {
219232
visited.set(topItemSetTraverser.getNumberOfItems() - 2);
220233
}
@@ -228,10 +241,15 @@ public int getNumberOfItems() {
228241
}
229242

230243
/**
231-
* Get the current item set
244+
*
245+
* Get a bitset representation of the current item set
232246
*/
233-
public LongsRef getItemSet() {
234-
return topItemSetTraverser.getItemSet();
247+
public ItemSetBitSet getItemSetBitSet() {
248+
return topItemSetTraverser.getItemSetBitSet();
249+
}
250+
251+
public ItemSetBitSet getParentItemSetBitSet() {
252+
return topItemSetTraverser.getParentItemSetBitSet();
235253
}
236254

237255
/**
@@ -250,7 +268,7 @@ public boolean atLeaf() {
250268

251269
@Override
252270
public void close() {
253-
Releasables.close(topItemSetTraverser, topTransactionIds);
271+
Releasables.close(topTransactionIds);
254272
}
255273

256274
// remember the count in the stack without tracking push and pop

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/frequentitemsets/EclatMapReducer.java

Lines changed: 17 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99

1010
import org.apache.logging.log4j.LogManager;
1111
import org.apache.logging.log4j.Logger;
12-
import org.apache.lucene.util.LongsRef;
1312
import org.elasticsearch.common.io.stream.StreamInput;
1413
import org.elasticsearch.common.io.stream.StreamOutput;
1514
import org.elasticsearch.common.io.stream.Writeable;
@@ -25,6 +24,7 @@
2524
import org.elasticsearch.xcontent.ToXContent;
2625
import org.elasticsearch.xcontent.XContentBuilder;
2726
import org.elasticsearch.xpack.ml.aggs.frequentitemsets.FrequentItemSetCollector.FrequentItemSet;
27+
import org.elasticsearch.xpack.ml.aggs.frequentitemsets.TransactionStore.TopItemIds;
2828
import org.elasticsearch.xpack.ml.aggs.frequentitemsets.mr.AbstractItemSetMapReducer;
2929
import org.elasticsearch.xpack.ml.aggs.frequentitemsets.mr.ItemSetMapReduceValueSource.Field;
3030

@@ -338,17 +338,17 @@ private static EclatResult eclat(
338338
final long totalTransactionCount = transactionStore.getTotalTransactionCount();
339339
Map<String, Object> profilingInfo = null;
340340
long minCount = (long) Math.ceil(totalTransactionCount * minimumSupport);
341-
FrequentItemSetCollector collector = new FrequentItemSetCollector(transactionStore, size, minCount);
342-
long numberOfSetsChecked = 0;
343341

344342
if (profilingInfoReduce != null) {
345343
profilingInfo = new LinkedHashMap<>(profilingInfoReduce);
346344
profilingInfo.put("start_min_count_eclat", minCount);
347345
}
348346

349347
try (
348+
TopItemIds topItemIds = transactionStore.getTopItemIds();
350349
CountingItemSetTraverser setTraverser = new CountingItemSetTraverser(
351350
transactionStore,
351+
topItemIds,
352352
BITSET_CACHE_TRAVERSAL_DEPTH,
353353
(int) Math.min(MAX_BITSET_CACHE_NUMBER_OF_TRANSACTIONS, totalTransactionCount),
354354
minCount
@@ -360,7 +360,8 @@ private static EclatResult eclat(
360360
minCount,
361361
transactionStore.getTotalItemCount()
362362
);
363-
363+
FrequentItemSetCollector collector = new FrequentItemSetCollector(transactionStore, topItemIds, size, minCount);
364+
long numberOfSetsChecked = 0;
364365
long previousMinCount = 0;
365366

366367
while (setTraverser.next(minCount)) {
@@ -402,8 +403,11 @@ private static EclatResult eclat(
402403
if (setTraverser.atLeaf()
403404
&& setTraverser.hasBeenVisited() == false
404405
&& setTraverser.getCount() >= minCount
405-
&& setTraverser.getItemSet().length >= minimumSetSize) {
406-
minCount = collector.add(setTraverser.getItemSet(), setTraverser.getCount());
406+
&& setTraverser.getItemSetBitSet().cardinality() >= minimumSetSize) {
407+
408+
logger.trace("add after prune");
409+
410+
minCount = collector.add(setTraverser.getItemSetBitSet(), setTraverser.getCount());
407411
// no need to set visited, as we are on a leaf
408412
}
409413

@@ -418,19 +422,17 @@ private static EclatResult eclat(
418422
*
419423
* iff the count of the subset is higher, collect
420424
*/
421-
if (setTraverser.hasPredecessorBeenVisited() == false
422-
&& setTraverser.getItemSet().length > minimumSetSize
423-
&& setTraverser.getCount() < setTraverser.getPreviousCount()) {
425+
if (setTraverser.hasParentBeenVisited() == false
426+
&& setTraverser.getItemSetBitSet().cardinality() > minimumSetSize
427+
&& setTraverser.getCount() < setTraverser.getParentCount()) {
424428
// add the set without the last item
425429

426-
LongsRef subItemSet = setTraverser.getItemSet().clone();
427-
subItemSet.length--;
428-
minCount = collector.add(subItemSet, setTraverser.getPreviousCount());
430+
minCount = collector.add(setTraverser.getParentItemSetBitSet(), setTraverser.getParentCount());
429431
}
430432

431433
// closed set criteria: the parent is no longer of interest: either we reported it in the previous step or we found a
432434
// super set
433-
setTraverser.setPredecessorVisited();
435+
setTraverser.setParentVisited();
434436

435437
/**
436438
* Iff the traverser reached a leaf, the item set can not be further expanded, e.g. we reached [f]:
@@ -445,8 +447,8 @@ private static EclatResult eclat(
445447
*
446448
* Note: this also covers the last item, e.g. [a, x, y]
447449
*/
448-
if (setTraverser.atLeaf() && setTraverser.getItemSet().length >= minimumSetSize) {
449-
minCount = collector.add(setTraverser.getItemSet(), setTraverser.getCount());
450+
if (setTraverser.atLeaf() && setTraverser.getItemSetBitSet().cardinality() >= minimumSetSize) {
451+
minCount = collector.add(setTraverser.getItemSetBitSet(), setTraverser.getCount());
450452
// no need to set visited, as we are on a leaf
451453
}
452454

0 commit comments

Comments
 (0)