Skip to content

Commit 8779f57

Browse files
author
Hendrik Muhs
committed
[Transform] improve continuous transform date_histogram group_by with ingest timestamps (#63315)
Optimize continuous date_histogram group_by for other time fields, independent of the sync field. This allows the usage of ingest timestamps in continuous mode. Fixes #59061
1 parent db50024 commit 8779f57

File tree

14 files changed

+618
-212
lines changed

14 files changed

+618
-212
lines changed

server/src/main/java/org/elasticsearch/search/aggregations/bucket/composite/GeoTileGridValuesSourceBuilder.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ static void register(ValuesSourceRegistry.Builder builder) {
132132
private int precision = GeoTileGridAggregationBuilder.DEFAULT_PRECISION;
133133
private GeoBoundingBox geoBoundingBox = new GeoBoundingBox(new GeoPoint(Double.NaN, Double.NaN), new GeoPoint(Double.NaN, Double.NaN));
134134

135-
GeoTileGridValuesSourceBuilder(String name) {
135+
public GeoTileGridValuesSourceBuilder(String name) {
136136
super(name);
137137
}
138138

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/transform/transforms/pivot/DateHistogramGroupSource.java

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -348,9 +348,4 @@ public boolean equals(Object other) {
348348
public int hashCode() {
349349
return Objects.hash(field, scriptConfig, missingBucket, interval, timeZone);
350350
}
351-
352-
@Override
353-
public boolean supportsIncrementalBucketUpdate() {
354-
return false;
355-
}
356351
}

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/transform/transforms/pivot/GeoTileGroupSource.java

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -106,11 +106,6 @@ public static GeoTileGroupSource fromXContent(final XContentParser parser, boole
106106
return lenient ? LENIENT_PARSER.apply(parser, null) : STRICT_PARSER.apply(parser, null);
107107
}
108108

109-
@Override
110-
public boolean supportsIncrementalBucketUpdate() {
111-
return true;
112-
}
113-
114109
@Override
115110
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
116111
builder.startObject();

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/transform/transforms/pivot/HistogramGroupSource.java

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -101,9 +101,4 @@ public boolean equals(Object other) {
101101
public int hashCode() {
102102
return Objects.hash(field, scriptConfig, interval);
103103
}
104-
105-
@Override
106-
public boolean supportsIncrementalBucketUpdate() {
107-
return false;
108-
}
109104
}

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/transform/transforms/pivot/PivotConfig.java

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -110,16 +110,12 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
110110
return builder;
111111
}
112112

113-
public void toCompositeAggXContent(XContentBuilder builder, boolean forChangeDetection) throws IOException {
113+
public void toCompositeAggXContent(XContentBuilder builder) throws IOException {
114114
builder.startObject();
115115
builder.field(CompositeAggregationBuilder.SOURCES_FIELD_NAME.getPreferredName());
116116
builder.startArray();
117117

118118
for (Entry<String, SingleGroupSource> groupBy : groups.getGroups().entrySet()) {
119-
// some group source do not implement change detection or not makes no sense, skip those
120-
if (forChangeDetection && groupBy.getValue().supportsIncrementalBucketUpdate() == false) {
121-
continue;
122-
}
123119
builder.startObject();
124120
builder.startObject(groupBy.getKey());
125121
builder.field(groupBy.getValue().getType().value(), groupBy.getValue());

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/transform/transforms/pivot/SingleGroupSource.java

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -140,8 +140,6 @@ public void writeTo(StreamOutput out) throws IOException {
140140

141141
public abstract Type getType();
142142

143-
public abstract boolean supportsIncrementalBucketUpdate();
144-
145143
public String getField() {
146144
return field;
147145
}

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/transform/transforms/pivot/TermsGroupSource.java

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -50,9 +50,4 @@ public Type getType() {
5050
public static TermsGroupSource fromXContent(final XContentParser parser, boolean lenient) throws IOException {
5151
return lenient ? LENIENT_PARSER.apply(parser, null) : STRICT_PARSER.apply(parser, null);
5252
}
53-
54-
@Override
55-
public boolean supportsIncrementalBucketUpdate() {
56-
return true;
57-
}
5853
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,234 @@
1+
package org.elasticsearch.xpack.transform.integration.continuous;
2+
3+
import org.elasticsearch.action.search.SearchRequest;
4+
import org.elasticsearch.action.search.SearchResponse;
5+
import org.elasticsearch.action.support.IndicesOptions;
6+
import org.elasticsearch.client.transform.transforms.DestConfig;
7+
import org.elasticsearch.client.transform.transforms.SourceConfig;
8+
import org.elasticsearch.client.transform.transforms.TransformConfig;
9+
import org.elasticsearch.client.transform.transforms.pivot.DateHistogramGroupSource;
10+
import org.elasticsearch.client.transform.transforms.pivot.GroupConfig;
11+
import org.elasticsearch.client.transform.transforms.pivot.PivotConfig;
12+
import org.elasticsearch.client.transform.transforms.pivot.TermsGroupSource;
13+
import org.elasticsearch.common.xcontent.support.XContentMapValues;
14+
import org.elasticsearch.search.SearchHit;
15+
import org.elasticsearch.search.aggregations.AggregatorFactories;
16+
import org.elasticsearch.search.aggregations.BucketOrder;
17+
import org.elasticsearch.search.aggregations.bucket.histogram.DateHistogramAggregationBuilder;
18+
import org.elasticsearch.search.aggregations.bucket.histogram.DateHistogramInterval;
19+
import org.elasticsearch.search.aggregations.bucket.histogram.Histogram;
20+
import org.elasticsearch.search.aggregations.bucket.histogram.Histogram.Bucket;
21+
import org.elasticsearch.search.aggregations.bucket.terms.Terms;
22+
import org.elasticsearch.search.aggregations.bucket.terms.TermsAggregationBuilder;
23+
import org.elasticsearch.search.builder.SearchSourceBuilder;
24+
25+
import java.io.IOException;
26+
import java.time.ZonedDateTime;
27+
import java.util.ArrayList;
28+
import java.util.HashMap;
29+
import java.util.Iterator;
30+
import java.util.List;
31+
import java.util.Map;
32+
33+
import static org.hamcrest.Matchers.equalTo;
34+
import static org.hamcrest.Matchers.is;
35+
import static org.hamcrest.Matchers.lessThanOrEqualTo;
36+
37+
/**
 * Testcase for date histogram group_by on _different_ fields than used for sync.
 *
 * The transform groups by a {@code date_histogram} on "metric-timestamp" (e.g. an ingest
 * timestamp) while the continuous sync runs on a different time field, exercising the
 * optimization from #63315. Each iteration re-runs the equivalent aggregation directly on
 * the source index and compares it bucket-by-bucket against the transform's destination index.
 */
public class DateHistogramGroupByOtherTimeFieldIT extends ContinuousTestCase {

    // Transform id, which is also used as the destination index name.
    private static final String NAME = "continuous-date-histogram-pivot-other-timefield-test";

    // Decided randomly once per test instance: whether a secondary terms group_by on
    // "event" is added on top of the date_histogram group_by.
    private final boolean addGroupByTerms;

    public DateHistogramGroupByOtherTimeFieldIT() {
        addGroupByTerms = randomBoolean();
    }

    /**
     * Builds the transform configuration under test: a pivot with a 1-second fixed-interval
     * date_histogram group_by on "metric-timestamp" (NOT the sync field), optionally combined
     * with a terms group_by on "event", plus the aggregations shared by all continuous tests.
     *
     * NOTE(review): addCommonBuilderParameters/addCommonAggregations and the
     * CONTINUOUS_EVENTS_SOURCE_INDEX / INGEST_PIPELINE constants are inherited from
     * ContinuousTestCase, which is not visible here — behavior assumed from usage.
     */
    @Override
    public TransformConfig createConfig() {
        TransformConfig.Builder transformConfigBuilder = new TransformConfig.Builder();
        addCommonBuilderParameters(transformConfigBuilder);
        transformConfigBuilder.setSource(new SourceConfig(CONTINUOUS_EVENTS_SOURCE_INDEX));
        transformConfigBuilder.setDest(new DestConfig(NAME, INGEST_PIPELINE));
        transformConfigBuilder.setId(NAME);
        PivotConfig.Builder pivotConfigBuilder = new PivotConfig.Builder();
        GroupConfig.Builder groups = new GroupConfig.Builder().groupBy(
            "second",
            new DateHistogramGroupSource.Builder().setField("metric-timestamp")
                .setInterval(new DateHistogramGroupSource.FixedInterval(DateHistogramInterval.SECOND))
                .build()
        );
        if (addGroupByTerms) {
            groups.groupBy("event", new TermsGroupSource.Builder().setField("event").build());
        }
        pivotConfigBuilder.setGroups(groups.build());
        AggregatorFactories.Builder aggregations = new AggregatorFactories.Builder();
        addCommonAggregations(aggregations);

        pivotConfigBuilder.setAggregations(aggregations);
        transformConfigBuilder.setPivotConfig(pivotConfigBuilder.build());
        return transformConfigBuilder.build();
    }

    @Override
    public String getName() {
        return NAME;
    }

    /**
     * One verification round: runs the reference aggregation against the source index and a
     * sorted match-all search against the destination index, then delegates to the assertion
     * method matching the randomly chosen group_by layout.
     *
     * @param iteration the current continuous-test iteration, used only in failure messages
     */
    @Override
    public void testIteration(int iteration) throws IOException {
        // Reference query: aggregate the raw events the same way the transform pivots them.
        SearchRequest searchRequestSource = new SearchRequest(CONTINUOUS_EVENTS_SOURCE_INDEX).allowPartialSearchResults(false)
            .indicesOptions(IndicesOptions.LENIENT_EXPAND_OPEN);
        SearchSourceBuilder sourceBuilderSource = new SearchSourceBuilder().size(0);
        DateHistogramAggregationBuilder bySecond = new DateHistogramAggregationBuilder("second").field("metric-timestamp")
            .fixedInterval(DateHistogramInterval.SECOND)
            .order(BucketOrder.key(true));

        if (addGroupByTerms) {
            TermsAggregationBuilder terms = new TermsAggregationBuilder("event").size(1000).field("event").order(BucketOrder.key(true));
            bySecond.subAggregation(terms);
        }
        sourceBuilderSource.aggregation(bySecond);
        searchRequestSource.source(sourceBuilderSource);
        SearchResponse responseSource = search(searchRequestSource);

        // Destination query: fetch the transform's output documents, sorted so they line up
        // with the key-ordered buckets of the reference aggregation.
        SearchRequest searchRequestDest = new SearchRequest(NAME).allowPartialSearchResults(false)
            .indicesOptions(IndicesOptions.LENIENT_EXPAND_OPEN);
        SearchSourceBuilder sourceBuilderDest = new SearchSourceBuilder().size(10000).sort("second");
        if (addGroupByTerms) {
            sourceBuilderDest.sort("event");
        }

        searchRequestDest.source(sourceBuilderDest);
        SearchResponse responseDest = search(searchRequestDest);

        if (addGroupByTerms) {
            assertResultsGroupByDateHistogramAndTerms(iteration, responseSource, responseDest);
        } else {
            assertResultsGroupByDateHistogram(iteration, responseSource, responseDest);
        }
    }

    /**
     * Compares the date_histogram-only case: walks the reference buckets and the transform
     * documents in lock-step, asserting matching bucket keys and doc counts, and that the
     * transform did not needlessly rewrite documents in later runs.
     */
    private void assertResultsGroupByDateHistogram(int iteration, SearchResponse responseSource, SearchResponse responseDest) {
        List<? extends Bucket> buckets = ((Histogram) responseSource.getAggregations().get("second")).getBuckets();
        Iterator<? extends Bucket> sourceIterator = buckets.iterator();
        Iterator<SearchHit> destIterator = responseDest.getHits().iterator();

        while (sourceIterator.hasNext() && destIterator.hasNext()) {
            Bucket bucket = sourceIterator.next();
            SearchHit searchHit = destIterator.next();
            Map<String, Object> source = searchHit.getSourceAsMap();

            Long transformBucketKey = (Long) XContentMapValues.extractValue("second", source);

            // aggs return buckets with 0 doc_count while composite aggs skip over them
            while (bucket.getDocCount() == 0L) {
                assertTrue(sourceIterator.hasNext());
                bucket = sourceIterator.next();
            }
            // Histogram bucket keys are ZonedDateTime; the transform stores epoch millis.
            long bucketKey = ((ZonedDateTime) bucket.getKey()).toEpochSecond() * 1000;

            // test correctness, the results from the aggregation and the results from the transform should be the same
            assertThat(
                "Buckets did not match, source: " + source + ", expected: " + bucketKey + ", iteration: " + iteration,
                transformBucketKey,
                equalTo(bucketKey)
            );
            assertThat(
                "Doc count did not match, source: " + source + ", expected: " + bucket.getDocCount() + ", iteration: " + iteration,
                XContentMapValues.extractValue("count", source),
                equalTo(Double.valueOf(bucket.getDocCount()))
            );

            // transform should only rewrite documents that require it
            assertThat(
                "Ingest run: "
                    + XContentMapValues.extractValue(INGEST_RUN_FIELD, source)
                    + " did not match max run: "
                    + XContentMapValues.extractValue(MAX_RUN_FIELD, source)
                    + ", iteration: "
                    + iteration,
                // we use a fixed_interval of `1s`, the transform runs every `1s`, a bucket might be recalculated at the next run
                // but should NOT be recalculated for the 2nd/3rd/... run
                Double.valueOf((Integer) XContentMapValues.extractValue(INGEST_RUN_FIELD, source)) - (Double) XContentMapValues
                    .extractValue(MAX_RUN_FIELD, source),
                is(lessThanOrEqualTo(1.0))
            );

        }
        // Both sides must be fully consumed: no extra buckets, no extra documents.
        assertFalse(sourceIterator.hasNext());
        assertFalse(destIterator.hasNext());
    }

    /**
     * Compares the date_histogram + terms case: first flattens the nested
     * (second, event, count) reference buckets — skipping empty histogram buckets, which
     * composite aggs never emit — then walks both result sets in lock-step.
     */
    private void assertResultsGroupByDateHistogramAndTerms(int iteration, SearchResponse responseSource, SearchResponse responseDest) {
        List<? extends Bucket> buckets = ((Histogram) responseSource.getAggregations().get("second")).getBuckets();

        List<Map<String, Object>> flattenedBuckets = new ArrayList<>();
        for (Bucket b : buckets) {
            if (b.getDocCount() == 0) {
                continue;
            }
            long second = ((ZonedDateTime) b.getKey()).toEpochSecond() * 1000;
            List<? extends Terms.Bucket> terms = ((Terms) b.getAggregations().get("event")).getBuckets();
            for (Terms.Bucket t : terms) {
                flattenedBuckets.add(flattenedResult(second, t.getKeyAsString(), t.getDocCount()));
            }
        }

        Iterator<Map<String, Object>> sourceIterator = flattenedBuckets.iterator();
        Iterator<SearchHit> destIterator = responseDest.getHits().iterator();

        while (sourceIterator.hasNext() && destIterator.hasNext()) {
            Map<String, Object> bucket = sourceIterator.next();

            SearchHit searchHit = destIterator.next();
            Map<String, Object> source = searchHit.getSourceAsMap();

            Long transformBucketKey = (Long) XContentMapValues.extractValue("second", source);

            // test correctness, the results from the aggregation and the results from the transform should be the same
            assertThat(
                "Buckets did not match, source: " + source + ", expected: " + bucket.get("second") + ", iteration: " + iteration,
                transformBucketKey,
                equalTo(bucket.get("second"))
            );
            assertThat(
                "Doc count did not match, source: " + source + ", expected: " + bucket.get("count") + ", iteration: " + iteration,
                XContentMapValues.extractValue("count", source),
                equalTo(Double.valueOf(((Long) bucket.get("count"))))
            );
            assertThat(
                "Term did not match, source: " + source + ", expected: " + bucket.get("event") + ", iteration: " + iteration,
                XContentMapValues.extractValue("event", source),
                equalTo(bucket.get("event"))
            );

            // transform should only rewrite documents that require it
            assertThat(
                "Ingest run: "
                    + XContentMapValues.extractValue(INGEST_RUN_FIELD, source)
                    + " did not match max run: "
                    + XContentMapValues.extractValue(MAX_RUN_FIELD, source)
                    + ", iteration: "
                    + iteration,
                // we use a fixed_interval of `1s`, the transform runs every `1s`, a bucket might be recalculated at the next run
                // but should NOT be recalculated for the 2nd/3rd/... run
                Double.valueOf((Integer) XContentMapValues.extractValue(INGEST_RUN_FIELD, source)) - (Double) XContentMapValues
                    .extractValue(MAX_RUN_FIELD, source),
                is(lessThanOrEqualTo(1.0))
            );
        }
        // Both sides must be fully consumed: no extra buckets, no extra documents.
        assertFalse(sourceIterator.hasNext());
        assertFalse(destIterator.hasNext());
    }

    /**
     * Builds one flattened expectation entry for the histogram+terms comparison.
     *
     * @param second epoch-millis bucket key of the date_histogram
     * @param event  terms bucket key
     * @param count  doc count of the (second, event) combination
     * @return a map mirroring the shape of a transform destination document
     */
    private static Map<String, Object> flattenedResult(long second, String event, long count) {
        Map<String, Object> doc = new HashMap<>();
        doc.put("second", second);
        doc.put("event", event);
        doc.put("count", count);
        return doc;
    }
}

x-pack/plugin/transform/qa/single-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/transform/integration/continuous/TransformContinuousIT.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,7 @@ public void setClusterSettings() throws IOException {
125125
public void registerTestCases() {
126126
addTestCaseIfNotDisabled(new TermsGroupByIT());
127127
addTestCaseIfNotDisabled(new DateHistogramGroupByIT());
128+
addTestCaseIfNotDisabled(new DateHistogramGroupByOtherTimeFieldIT());
128129
}
129130

130131
@Before
@@ -214,6 +215,11 @@ public void testContinousEvents() throws Exception {
214215
source.append("\"location\":\"").append(randomizedLat + "," + randomizedLon).append("\",");
215216
}
216217

218+
// simulate a different timestamp that is off from the timestamp used for sync, so it can fall into the previous bucket
219+
String metric_date_string = ContinuousTestCase.STRICT_DATE_OPTIONAL_TIME_PRINTER_NANOS.withZone(ZoneId.of("UTC"))
220+
.format(runDate.minusSeconds(randomIntBetween(0, 5)).plusNanos(randomIntBetween(0, 999999)));
221+
source.append("\"metric-timestamp\":\"").append(metric_date_string).append("\",");
222+
217223
String date_string = ContinuousTestCase.STRICT_DATE_OPTIONAL_TIME_PRINTER_NANOS.withZone(ZoneId.of("UTC"))
218224
.format(runDate.plusNanos(randomIntBetween(0, 999999)));
219225

@@ -318,6 +324,9 @@ private void putIndex(String indexName, String dateType, boolean isDataStream) t
318324
.startObject("run")
319325
.field("type", "integer")
320326
.endObject()
327+
.startObject("metric-timestamp")
328+
.field("type", dateType)
329+
.endObject()
321330
.endObject()
322331
.endObject();
323332
}

x-pack/plugin/transform/src/main/java/org/elasticsearch/xpack/transform/transforms/Function.java

Lines changed: 8 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -66,9 +66,9 @@ public interface ChangeCollector {
6666
* TODO: replace the boolean with a more descriptive enum.
6767
*
6868
* @param searchResponse the response after querying for changes
69-
* @return true in case of no more changed buckets, false in case changes buckets have been collected
69+
* @return the position of the change collector, null in case the collector is exhausted
7070
*/
71-
boolean processSearchResponse(SearchResponse searchResponse);
71+
Map<String, Object> processSearchResponse(SearchResponse searchResponse);
7272

7373
/**
7474
* Build the filter query to narrow the result set given the previously collected changes.
@@ -87,18 +87,18 @@ public interface ChangeCollector {
8787
void clear();
8888

8989
/**
90-
* Get the bucket position of the changes collector.
90+
* Whether the collector optimizes change detection by narrowing the required query.
9191
*
92-
* @return the position, null in case the collector is exhausted
92+
* @return true if the collector optimizes change detection
9393
*/
94-
Map<String, Object> getBucketPosition();
94+
boolean isOptimized();
9595

9696
/**
97-
* Whether the collector optimizes change detection by narrowing the required query.
97+
* Whether the collector requires an extra query to identify the changes.
9898
*
99-
* @return true if the collector optimizes change detection
99+
* @return true if collector requires an extra query for identifying changes
100100
*/
101-
boolean isOptimized();
101+
boolean queryForChanges();
102102
}
103103

104104
/**
@@ -182,17 +182,6 @@ void preview(
182182
*/
183183
int getInitialPageSize();
184184

185-
/**
186-
* Whether this function - given its configuration - supports incremental bucket update used in continuous mode.
187-
*
188-
* If so, the indexer uses the change collector to update the continuous transform.
189-
*
190-
* TODO: simplify and remove this method if possible
191-
*
192-
* @return true if incremental bucket update is supported
193-
*/
194-
boolean supportsIncrementalBucketUpdate();
195-
196185
/**
197186
* Build the query for the next iteration
198187
*

0 commit comments

Comments
 (0)