Skip to content

Commit a74573b

Browse files
Expand segment sorter for all timeseries indices (#78639)
PR #75195 added a segment sorter on @timestamp desc for data stream indices. This PR applies the segment sorter to all indices that have a @timestamp field. The presence of a @timestamp field is a strong indication that we are dealing with a time series index. The most common type of query for time series indices is to get the latest data, that is, data sorted by @timestamp desc. This PR sorts segments by @timestamp desc, which speeds up this kind of query. Relates to #75195
1 parent 841c544 commit a74573b

File tree

6 files changed

+176
-119
lines changed

6 files changed

+176
-119
lines changed
Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
---
2+
"Test that index segments are sorted on timestamp field if @timestamp field is defined in mapping":
3+
- skip:
4+
version: " - 7.99.99"
5+
reason: "sorting segments was added in 7.16"
6+
features: allowed_warnings
7+
8+
- do:
9+
indices.create:
10+
index: test_index1
11+
body:
12+
mappings:
13+
properties:
14+
"@timestamp":
15+
type: date
16+
settings:
17+
number_of_shards: 1
18+
number_of_replicas: 0
19+
20+
# 1st segment
21+
- do:
22+
index:
23+
index: test_index1
24+
body: { "foo": "bar1", "@timestamp": "2021-08-01" }
25+
refresh: true
26+
27+
# 2nd segment
28+
- do:
29+
index:
30+
index: test_index1
31+
body: { "foo": "bar2", "@timestamp": "2021-08-02" }
32+
refresh: true
33+
34+
# test that segments are sorted by @timestamp DESC
35+
- do:
36+
search:
37+
index: test_index1
38+
body:
39+
fields: [{ "field":"@timestamp", "format":"yyyy-MM-dd" }]
40+
- match: { hits.total.value: 2 }
41+
- match: { hits.hits.0.fields.@timestamp: ["2021-08-02"] }
42+
- match: { hits.hits.1.fields.@timestamp: ["2021-08-01"] }
43+
44+
---
45+
"Test that index segments are NOT sorted on timestamp field when @timestamp field is dynamically added":
46+
- skip:
47+
version: " - 7.99.99"
48+
reason: "sorting segments was added in 7.16"
49+
features: allowed_warnings
50+
51+
- do:
52+
indices.create:
53+
index: test_index2
54+
body:
55+
settings:
56+
number_of_shards: 1
57+
number_of_replicas: 0
58+
59+
# 1st segment
60+
- do:
61+
index:
62+
index: test_index2
63+
body: { "foo": "bar1", "@timestamp": "2021-08-01" }
64+
refresh: true
65+
66+
# 2nd segment
67+
- do:
68+
index:
69+
index: test_index2
70+
body: { "foo": "bar2", "@timestamp": "2021-08-02" }
71+
refresh: true
72+
73+
# test that segments are NOT sorted by @timestamp DESC, as the field was not defined in the mapping when the index was created (it was added dynamically)
74+
- do:
75+
search:
76+
index: test_index2
77+
body:
78+
fields: [{ "field":"@timestamp", "format":"yyyy-MM-dd" }]
79+
- match: { hits.hits.0.fields.@timestamp: ["2021-08-01"] }
80+
- match: { hits.hits.1.fields.@timestamp: ["2021-08-02"] }
81+
82+
# test that after we reopen the index, segments are sorted by @timestamp DESC
83+
- do:
84+
indices.close:
85+
index: test_index2
86+
- is_true: acknowledged
87+
- do:
88+
indices.open:
89+
index: test_index2
90+
- is_true: acknowledged
91+
- do:
92+
search:
93+
index: test_index2
94+
body:
95+
fields: [{ "field":"@timestamp", "format":"yyyy-MM-dd" }]
96+
- match: { hits.total.value: 2 }
97+
- match: { hits.hits.0.fields.@timestamp: ["2021-08-02"] }
98+
- match: { hits.hits.1.fields.@timestamp: ["2021-08-01"] }
99+
100+
---
101+
"Test if segments are missing @timestamp field we don't get errors":
102+
- skip:
103+
version: " - 7.99.99"
104+
reason: "sorting segments was added in 7.16"
105+
features: allowed_warnings
106+
107+
- do:
108+
indices.create:
109+
index: test_index3
110+
body:
111+
mappings:
112+
properties:
113+
"@timestamp":
114+
type: date
115+
settings:
116+
number_of_shards: 1
117+
number_of_replicas: 0
118+
119+
# 1st segment missing @timestamp field
120+
- do:
121+
index:
122+
index: test_index3
123+
body: { "foo": "bar1"}
124+
refresh: true
125+
126+
# 2nd segment
127+
- do:
128+
index:
129+
index: test_index3
130+
body: { "foo": "bar2", "@timestamp": "2021-08-02" }
131+
refresh: true
132+
133+
- do:
134+
search:
135+
index: test_index3
136+
body:
137+
fields: [{ "field":"@timestamp", "format":"yyyy-MM-dd" }]
138+
- match: { hits.hits.0.fields.@timestamp: ["2021-08-02"] }
139+
- is_false: hits.hits.1.fields.@timestamp

server/src/main/java/org/elasticsearch/cluster/metadata/DataStream.java

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import org.apache.lucene.document.LongPoint;
1111
import org.apache.lucene.index.LeafReader;
1212
import org.apache.lucene.index.PointValues;
13+
import org.elasticsearch.ElasticsearchException;
1314
import org.elasticsearch.cluster.AbstractDiffable;
1415
import org.elasticsearch.cluster.Diff;
1516
import org.elasticsearch.common.Strings;
@@ -42,23 +43,26 @@ public final class DataStream extends AbstractDiffable<DataStream> implements To
4243

4344
public static final String BACKING_INDEX_PREFIX = ".ds-";
4445
public static final DateFormatter DATE_FORMATTER = DateFormatter.forPattern("uuuu.MM.dd");
45-
// Datastreams' leaf readers should be sorted by desc order of their timestamp field, as it allows search time optimizations
46-
public static Comparator<LeafReader> DATASTREAM_LEAF_READERS_SORTER =
46+
// Timeseries indices' leaf readers should be sorted by desc order of their timestamp field, as it allows search time optimizations
47+
public static Comparator<LeafReader> TIMESERIES_LEAF_READERS_SORTER =
4748
Comparator.comparingLong(
4849
(LeafReader r) -> {
4950
try {
5051
PointValues points = r.getPointValues(DataStream.TimestampField.FIXED_TIMESTAMP_FIELD);
5152
if (points != null) {
5253
byte[] sortValue = points.getMaxPackedValue();
5354
return LongPoint.decodeDimension(sortValue, 0);
54-
} else if (r.numDocs() == 0) {
55-
// points can be null if the segment contains only deleted documents
55+
} else {
56+
// As we apply this segment sorter to any timeseries indices,
57+
// we don't have a guarantee that all docs contain @timestamp field.
58+
// Some segments may have all docs without @timestamp field, in this
59+
// case they will be sorted last.
5660
return Long.MIN_VALUE;
5761
}
5862
} catch (IOException e) {
63+
throw new ElasticsearchException("Can't access [" +
64+
DataStream.TimestampField.FIXED_TIMESTAMP_FIELD + "] field for the index!", e);
5965
}
60-
throw new IllegalStateException("Can't access [" +
61-
DataStream.TimestampField.FIXED_TIMESTAMP_FIELD + "] field for the data stream!");
6266
})
6367
.reversed();
6468

server/src/main/java/org/elasticsearch/index/mapper/MappingLookup.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
package org.elasticsearch.index.mapper;
1010

1111
import org.apache.lucene.codecs.PostingsFormat;
12+
import org.elasticsearch.cluster.metadata.DataStream;
1213
import org.elasticsearch.index.IndexSettings;
1314
import org.elasticsearch.index.analysis.IndexAnalyzers;
1415
import org.elasticsearch.index.analysis.NamedAnalyzer;
@@ -389,6 +390,19 @@ public boolean isDataStreamTimestampFieldEnabled() {
389390
return dtfm != null && dtfm.isEnabled();
390391
}
391392

393+
/**
394+
* Returns whether this mapping contains a timestamp field that is of type date, is indexed, and has doc values.
395+
* @return {@code true} if the mapping contains a timestamp field of type date that is indexed and has doc values, {@code false} otherwise.
396+
*/
397+
public boolean hasTimestampField() {
398+
final MappedFieldType mappedFieldType = fieldTypesLookup().get(DataStream.TimestampField.FIXED_TIMESTAMP_FIELD);
399+
if (mappedFieldType instanceof DateFieldMapper.DateFieldType) {
400+
return mappedFieldType.isSearchable() && mappedFieldType.hasDocValues();
401+
} else {
402+
return false;
403+
}
404+
}
405+
392406
/**
393407
* Key for the lookup to be used in caches.
394408
*/

server/src/main/java/org/elasticsearch/index/shard/IndexShard.java

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,7 @@
186186
import java.util.stream.Collectors;
187187
import java.util.stream.StreamSupport;
188188

189-
import static org.elasticsearch.cluster.metadata.DataStream.DATASTREAM_LEAF_READERS_SORTER;
189+
import static org.elasticsearch.cluster.metadata.DataStream.TIMESERIES_LEAF_READERS_SORTER;
190190
import static org.elasticsearch.index.seqno.RetentionLeaseActions.RETAIN_ALL;
191191
import static org.elasticsearch.index.seqno.SequenceNumbers.UNASSIGNED_SEQ_NO;
192192

@@ -408,6 +408,14 @@ public Sort getIndexSort() {
408408
return indexSortSupplier.get();
409409
}
410410

411+
/**
412+
* Returns whether this shard is part of a data stream.
413+
* @return {@code true} if this shard is part of a data stream, {@code false} otherwise
414+
*/
415+
public boolean isDataStreamIndex() {
416+
return isDataStreamIndex;
417+
}
418+
411419
public ShardGetService getService() {
412420
return this.getService;
413421
}
@@ -2905,6 +2913,7 @@ private EngineConfig newEngineConfig(LongSupplier globalCheckpointSupplier) {
29052913
this.warmer.warm(reader);
29062914
}
29072915
};
2916+
final boolean isTimeseriesIndex = mapperService == null ? false : mapperService.mappingLookup().hasTimestampField();
29082917
return new EngineConfig(
29092918
shardId,
29102919
threadPool,
@@ -2928,7 +2937,7 @@ private EngineConfig newEngineConfig(LongSupplier globalCheckpointSupplier) {
29282937
replicationTracker::getRetentionLeases,
29292938
this::getOperationPrimaryTerm,
29302939
snapshotCommitSupplier,
2931-
isDataStreamIndex ? DATASTREAM_LEAF_READERS_SORTER : null);
2940+
isTimeseriesIndex ? TIMESERIES_LEAF_READERS_SORTER : null);
29322941
}
29332942

29342943
/**

x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/data_stream/131_sort_segments_migrate_to_data_stream.yml

Lines changed: 0 additions & 111 deletions
This file was deleted.

x-pack/qa/runtime-fields/build.gradle

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,8 @@ subprojects {
9696
// The error messages are different
9797
'search/330_fetch_fields/error includes field name',
9898
'search/330_fetch_fields/error includes glob pattern',
99+
// we need a @timestamp field to be defined in index mapping
100+
'search/380_sort_segments_on_timestamp/*',
99101
/////// NOT SUPPORTED ///////
100102
].join(',')
101103
}

0 commit comments

Comments
 (0)