Skip to content

Commit 29287d1

Browse files
committed
Add a time series aggregation implementation to make use of the fact that docids are emitted in tsid and parent bucket ordinal order.
This is true when the parent aggregation is a date histogram (which is typical), due to the fact that TimeSeriesIndexSearcher emits docs in tsid and timestamp order. Relates to #74660
1 parent 2a33538 commit 29287d1

File tree

3 files changed

+165
-4
lines changed

3 files changed

+165
-4
lines changed

modules/aggregations/src/main/java/org/elasticsearch/aggregations/bucket/timeseries/TimeSeriesAggregationBuilder.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import org.elasticsearch.search.aggregations.AggregationBuilder;
1616
import org.elasticsearch.search.aggregations.AggregatorFactories;
1717
import org.elasticsearch.search.aggregations.AggregatorFactory;
18+
import org.elasticsearch.search.aggregations.bucket.histogram.DateHistogramAggregatorFactory;
1819
import org.elasticsearch.search.aggregations.support.AggregationContext;
1920
import org.elasticsearch.xcontent.InstantiatingObjectParser;
2021
import org.elasticsearch.xcontent.ParseField;
@@ -79,7 +80,8 @@ protected AggregatorFactory doBuild(
7980
AggregatorFactory parent,
8081
AggregatorFactories.Builder subFactoriesBuilder
8182
) throws IOException {
82-
return new TimeSeriesAggregationFactory(name, keyed, context, parent, subFactoriesBuilder, metadata);
83+
boolean expectTsidBucketInOrder = parent instanceof DateHistogramAggregatorFactory;
84+
return new TimeSeriesAggregationFactory(name, keyed, context, parent, subFactoriesBuilder, metadata, expectTsidBucketInOrder);
8385
}
8486

8587
@Override

modules/aggregations/src/main/java/org/elasticsearch/aggregations/bucket/timeseries/TimeSeriesAggregationFactory.java

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,22 +20,28 @@
2020
public class TimeSeriesAggregationFactory extends AggregatorFactory {
2121

2222
private final boolean keyed;
23+
private final boolean expectTsidBucketInOrder;
2324

2425
public TimeSeriesAggregationFactory(
2526
String name,
2627
boolean keyed,
2728
AggregationContext context,
2829
AggregatorFactory parent,
2930
AggregatorFactories.Builder subFactoriesBuilder,
30-
Map<String, Object> metadata
31-
) throws IOException {
31+
Map<String, Object> metadata,
32+
boolean expectTsidBucketInOrder) throws IOException {
3233
super(name, context, parent, subFactoriesBuilder, metadata);
3334
this.keyed = keyed;
35+
this.expectTsidBucketInOrder = expectTsidBucketInOrder;
3436
}
3537

3638
@Override
3739
protected Aggregator createInternal(Aggregator parent, CardinalityUpperBound cardinality, Map<String, Object> metadata)
3840
throws IOException {
39-
return new TimeSeriesAggregator(name, factories, keyed, context, parent, cardinality, metadata);
41+
if (expectTsidBucketInOrder) {
42+
return new TimeSeriesInOrderAggregator(name, factories, keyed, context, parent, cardinality, metadata);
43+
} else {
44+
return new TimeSeriesAggregator(name, factories, keyed, context, parent, cardinality, metadata);
45+
}
4046
}
4147
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the Elastic License
4+
* 2.0 and the Server Side Public License, v 1; you may not use this file except
5+
* in compliance with, at your election, the Elastic License 2.0 or the Server
6+
* Side Public License, v 1.
7+
*/
8+
9+
package org.elasticsearch.aggregations.bucket.timeseries;
10+
11+
import org.apache.lucene.util.BytesRef;
12+
import org.elasticsearch.common.util.BytesRefArray;
13+
import org.elasticsearch.common.util.LongObjectPagedHashMap;
14+
import org.elasticsearch.core.Releasables;
15+
import org.elasticsearch.search.aggregations.AggregationExecutionContext;
16+
import org.elasticsearch.search.aggregations.Aggregator;
17+
import org.elasticsearch.search.aggregations.AggregatorFactories;
18+
import org.elasticsearch.search.aggregations.CardinalityUpperBound;
19+
import org.elasticsearch.search.aggregations.InternalAggregation;
20+
import org.elasticsearch.search.aggregations.InternalAggregations;
21+
import org.elasticsearch.search.aggregations.LeafBucketCollector;
22+
import org.elasticsearch.search.aggregations.LeafBucketCollectorBase;
23+
import org.elasticsearch.search.aggregations.bucket.BucketsAggregator;
24+
import org.elasticsearch.search.aggregations.support.AggregationContext;
25+
26+
import java.io.IOException;
27+
import java.util.ArrayList;
28+
import java.util.List;
29+
import java.util.Map;
30+
31+
/**
 * A time-series bucket aggregator that relies on documents being collected in
 * (tsid ordinal, parent bucket ordinal) order, so each (tsid, parent bucket)
 * combination forms one contiguous run of docs. Per the commit message this
 * ordering holds when the parent aggregation is a date histogram, because
 * TimeSeriesIndexSearcher emits docs in tsid and timestamp order — confirm at
 * the factory that selects this implementation.
 */
public class TimeSeriesInOrderAggregator extends BucketsAggregator {

    // reuse tsids between owning bucket ordinals:
    private final BytesRefArray collectedTsids;
    // completed buckets, keyed by owning (parent) bucket ordinal
    private final LongObjectPagedHashMap<List<InternalBucket>> results;
    // whether the response renders buckets as a keyed map rather than an array
    private final boolean keyed;

    public TimeSeriesInOrderAggregator(
        String name,
        AggregatorFactories factories,
        boolean keyed,
        AggregationContext context,
        Aggregator parent,
        CardinalityUpperBound cardinality,
        Map<String, Object> metadata
    ) throws IOException {
        super(name, factories, context, parent, cardinality, metadata);
        this.keyed = keyed;
        // both structures are backed by BigArrays and released in doClose()
        this.results = new LongObjectPagedHashMap<>(1, context.bigArrays());
        this.collectedTsids = new BytesRefArray(1, context.bigArrays());
    }

    /**
     * Builds one InternalTimeSeries per owning bucket ordinal from the buckets
     * accumulated in {@link #results} during collection.
     */
    @Override
    public InternalAggregation[] buildAggregations(long[] owningBucketOrds) throws IOException {
        InternalAggregation[] result = new InternalAggregation[owningBucketOrds.length];
        for (int ordIdx = 0; ordIdx < owningBucketOrds.length; ordIdx++) {
            long owningOrdinal = owningBucketOrds[ordIdx];
            List<InternalBucket> internalBuckets = results.get(owningOrdinal);
            if (internalBuckets != null) {
                // spare is reused across get() calls to avoid per-bucket allocation
                BytesRef spare = new BytesRef();
                List<InternalTimeSeries.InternalBucket> buckets = new ArrayList<>(internalBuckets.size());
                for (InternalBucket internalBucket : internalBuckets) {
                    // resolve the stored offset back to the tsid bytes
                    BytesRef key = collectedTsids.get(internalBucket.tsidOffset, spare);
                    // NOTE(review): sub-aggs are built one bucket at a time; a single batched
                    // buildSubAggsForBuckets call over all ordinals may be cheaper — verify.
                    InternalAggregations internalAggregations = buildSubAggsForBuckets(new long[] { internalBucket.bucketOrd })[0];
                    buckets.add(
                        new InternalTimeSeries.InternalBucket(
                            // deep copy: 'key' points into the reused 'spare' buffer
                            BytesRef.deepCopyOf(key),
                            internalBucket.docCount,
                            internalAggregations,
                            keyed
                        )
                    );
                }
                result[ordIdx] = new InternalTimeSeries(name, buckets, keyed, metadata());
            } else {
                // no docs were collected for this owning ordinal
                result[ordIdx] = buildEmptyAggregation();
            }
        }
        return result;
    }

    @Override
    public InternalAggregation buildEmptyAggregation() {
        return new InternalTimeSeries(name, new ArrayList<>(), false, metadata());
    }

    // --- collection state: tracks the (tsid, parent bucket) run currently being accumulated ---
    private BytesRef currentTsid;
    private int currentTsidOrd = -1;
    private long currentParentBucket = -1;
    // number of docs seen so far in the current run
    private long docCount;
    // TODO use 0L as bucket ordinal and clear sub aggregations after bucket/parent bucket ordinal combination changes
    // Ideally use a constant ordinal (0) here and tsid or parent bucket change reset sub and
    // reuse the same ordinal. This is possible because a tsid / parent bucket ordinal are unique and
    // don't reappear when either one changes.
    private long bucketOrdinalGenerator;

    @Override
    protected LeafBucketCollector getLeafCollector(AggregationExecutionContext aggCtx, LeafBucketCollector sub) {
        return new LeafBucketCollectorBase(sub, null) {

            @Override
            public void collect(int doc, long bucket) throws IOException {
                // Fast path: still inside the same (tsid, parent bucket) run.
                if (currentTsidOrd == aggCtx.getTsidOrd() && currentParentBucket == bucket) {
                    docCount++;
                    sub.collect(doc, bucketOrdinalGenerator);
                    return;
                }
                // Run boundary: flush the previous bucket (if any) and move to a fresh ordinal.
                if (currentTsid != null) {
                    completeBucket();
                    bucketOrdinalGenerator++;
                }
                if (currentTsidOrd != aggCtx.getTsidOrd()) {
                    currentTsidOrd = aggCtx.getTsidOrd();
                    currentTsid = aggCtx.getTsid();

                    // each distinct tsid is appended exactly once; its offset is collectedTsids.size() - 1
                    collectedTsids.append(currentTsid);
                }
                if (currentParentBucket != bucket) {
                    currentParentBucket = bucket;
                }

                // first doc of the new run
                docCount = 1;
                sub.collect(doc, bucketOrdinalGenerator);
            }
        };
    }

    @Override
    protected void doPostCollection() {
        // flush the final in-progress run; no-op if nothing was collected
        if (currentTsid != null) {
            completeBucket();
        }
    }

    /** Records the current run as a bucket under its owning parent bucket ordinal. */
    private void completeBucket() {
        // the current tsid is always the last one appended to collectedTsids
        InternalBucket bucket = new InternalBucket(collectedTsids.size() - 1, bucketOrdinalGenerator, docCount);
        // TODO: instead of collecting all buckets, perform pipeline aggregations here:
        // (Then we don't need to keep all these buckets in memory)
        List<InternalBucket> result = results.get(currentParentBucket);
        if (result == null) {
            result = new ArrayList<>();
            results.put(currentParentBucket, result);
        }
        result.add(bucket);
    }

    @Override
    protected void doClose() {
        // release BigArrays-backed structures
        Releasables.close(results, collectedTsids);
    }

    // tsidOffset indexes into collectedTsids; bucketOrd is the ordinal used for sub-agg collection
    record InternalBucket(long tsidOffset, long bucketOrd, long docCount) {}
}

0 commit comments

Comments
 (0)