Skip to content

Commit 3ed42f3

Browse files
Add data-stream auto-sharding APM metrics (#107593)
Add APM metrics to monitor data stream auto-sharding events. The new metrics are:
- es.auto_sharding.increase_shards.total
- es.auto_sharding.decrease_shards.total
- es.auto_sharding.cooldown_prevented_increase.total
- es.auto_sharding.cooldown_prevented_decrease.total

The first two track situations where the number of shards increases or decreases during a rollover. The latter two track situations where the auto-sharding logic recommends an increase or decrease, but the shard change does not take place because the data stream is still in a cooldown period following a recent auto-sharding increase or decrease.
1 parent 0c41cb7 commit 3ed42f3

File tree

10 files changed

+291
-22
lines changed

10 files changed

+291
-22
lines changed

docs/changelog/107593.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 107593
2+
summary: Add auto-sharding APM metrics
3+
area: Infra/Metrics
4+
type: enhancement
5+
issues: []

modules/data-streams/src/internalClusterTest/java/org/elasticsearch/datastreams/DataStreamAutoshardingIT.java

Lines changed: 61 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import org.elasticsearch.action.admin.indices.refresh.RefreshRequest;
1212
import org.elasticsearch.action.admin.indices.rollover.Condition;
1313
import org.elasticsearch.action.admin.indices.rollover.MaxDocsCondition;
14+
import org.elasticsearch.action.admin.indices.rollover.MetadataRolloverService;
1415
import org.elasticsearch.action.admin.indices.rollover.OptimalShardCountCondition;
1516
import org.elasticsearch.action.admin.indices.rollover.RolloverConditions;
1617
import org.elasticsearch.action.admin.indices.rollover.RolloverInfo;
@@ -25,6 +26,7 @@
2526
import org.elasticsearch.action.bulk.BulkRequest;
2627
import org.elasticsearch.action.bulk.BulkResponse;
2728
import org.elasticsearch.action.datastreams.CreateDataStreamAction;
29+
import org.elasticsearch.action.datastreams.autosharding.AutoShardingType;
2830
import org.elasticsearch.action.datastreams.autosharding.DataStreamAutoShardingService;
2931
import org.elasticsearch.action.index.IndexRequest;
3032
import org.elasticsearch.cluster.ClusterState;
@@ -49,7 +51,11 @@
4951
import org.elasticsearch.index.shard.ShardPath;
5052
import org.elasticsearch.index.store.StoreStats;
5153
import org.elasticsearch.plugins.Plugin;
54+
import org.elasticsearch.plugins.PluginsService;
5255
import org.elasticsearch.rest.RestStatus;
56+
import org.elasticsearch.telemetry.InstrumentType;
57+
import org.elasticsearch.telemetry.Measurement;
58+
import org.elasticsearch.telemetry.TestTelemetryPlugin;
5359
import org.elasticsearch.test.ESIntegTestCase;
5460
import org.elasticsearch.test.transport.MockTransportService;
5561
import org.elasticsearch.xcontent.XContentType;
@@ -60,14 +66,17 @@
6066
import java.nio.file.Path;
6167
import java.util.ArrayList;
6268
import java.util.Collection;
69+
import java.util.HashMap;
6370
import java.util.List;
6471
import java.util.Locale;
6572
import java.util.Map;
6673

6774
import static org.elasticsearch.action.datastreams.autosharding.DataStreamAutoShardingService.DATA_STREAMS_AUTO_SHARDING_ENABLED;
6875
import static org.elasticsearch.cluster.metadata.MetadataIndexTemplateService.DEFAULT_TIMESTAMP_FIELD;
6976
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
77+
import static org.hamcrest.Matchers.empty;
7078
import static org.hamcrest.Matchers.equalTo;
79+
import static org.hamcrest.Matchers.hasSize;
7180
import static org.hamcrest.Matchers.instanceOf;
7281
import static org.hamcrest.Matchers.is;
7382
import static org.hamcrest.Matchers.nullValue;
@@ -77,7 +86,12 @@ public class DataStreamAutoshardingIT extends ESIntegTestCase {
7786

7887
@Override
7988
protected Collection<Class<? extends Plugin>> nodePlugins() {
80-
return List.of(DataStreamsPlugin.class, MockTransportService.TestPlugin.class, TestAutoshardingPlugin.class);
89+
return List.of(
90+
DataStreamsPlugin.class,
91+
MockTransportService.TestPlugin.class,
92+
TestAutoshardingPlugin.class,
93+
TestTelemetryPlugin.class
94+
);
8195
}
8296

8397
@Before
@@ -109,6 +123,7 @@ public void testRolloverOnAutoShardCondition() throws Exception {
109123
indexDocs(dataStreamName, randomIntBetween(100, 200));
110124

111125
{
126+
resetTelemetry();
112127
ClusterState clusterStateBeforeRollover = internalCluster().getCurrentMasterNodeInstance(ClusterService.class).state();
113128
DataStream dataStreamBeforeRollover = clusterStateBeforeRollover.getMetadata().dataStreams().get(dataStreamName);
114129
String assignedShardNodeId = clusterStateBeforeRollover.routingTable()
@@ -152,11 +167,14 @@ public void testRolloverOnAutoShardCondition() throws Exception {
152167
assertThat(metConditions.get(0).value(), instanceOf(Integer.class));
153168
int autoShardingRolloverInfo = (int) metConditions.get(0).value();
154169
assertThat(autoShardingRolloverInfo, is(5));
170+
171+
assertTelemetry(MetadataRolloverService.AUTO_SHARDING_METRIC_NAMES.get(AutoShardingType.INCREASE_SHARDS));
155172
}
156173

157174
// let's do another rollover now that will not increase the number of shards because the increase shards cooldown has not lapsed,
158175
// however the rollover will use the existing/previous auto shard configuration and the new generation index will have 5 shards
159176
{
177+
resetTelemetry();
160178
ClusterState clusterStateBeforeRollover = internalCluster().getCurrentMasterNodeInstance(ClusterService.class).state();
161179
DataStream dataStreamBeforeRollover = clusterStateBeforeRollover.getMetadata().dataStreams().get(dataStreamName);
162180
String assignedShardNodeId = clusterStateBeforeRollover.routingTable()
@@ -193,6 +211,8 @@ public void testRolloverOnAutoShardCondition() throws Exception {
193211

194212
// we remained on 5 shards due to the increase shards cooldown
195213
assertThat(thirdGenerationMeta.getNumberOfShards(), is(5));
214+
215+
assertTelemetry(MetadataRolloverService.AUTO_SHARDING_METRIC_NAMES.get(AutoShardingType.COOLDOWN_PREVENTED_INCREASE));
196216
}
197217

198218
{
@@ -566,4 +586,44 @@ private static void mockStatsForIndex(
566586
}
567587
}
568588
}
589+
590+
private static void resetTelemetry() {
591+
for (PluginsService pluginsService : internalCluster().getInstances(PluginsService.class)) {
592+
final TestTelemetryPlugin telemetryPlugin = pluginsService.filterPlugins(TestTelemetryPlugin.class).findFirst().orElseThrow();
593+
telemetryPlugin.resetMeter();
594+
}
595+
}
596+
597+
private static void assertTelemetry(String expectedEmittedMetric) {
598+
Map<String, List<Measurement>> measurements = new HashMap<>();
599+
for (PluginsService pluginsService : internalCluster().getInstances(PluginsService.class)) {
600+
final TestTelemetryPlugin telemetryPlugin = pluginsService.filterPlugins(TestTelemetryPlugin.class).findFirst().orElseThrow();
601+
602+
telemetryPlugin.collect();
603+
604+
List<String> autoShardingMetrics = telemetryPlugin.getRegisteredMetrics(InstrumentType.LONG_COUNTER)
605+
.stream()
606+
.filter(metric -> metric.startsWith("es.auto_sharding."))
607+
.sorted()
608+
.toList();
609+
610+
assertEquals(autoShardingMetrics, MetadataRolloverService.AUTO_SHARDING_METRIC_NAMES.values().stream().sorted().toList());
611+
612+
for (String metricName : MetadataRolloverService.AUTO_SHARDING_METRIC_NAMES.values()) {
613+
measurements.computeIfAbsent(metricName, n -> new ArrayList<>())
614+
.addAll(telemetryPlugin.getLongCounterMeasurement(metricName));
615+
}
616+
}
617+
618+
// assert other metrics not emitted
619+
MetadataRolloverService.AUTO_SHARDING_METRIC_NAMES.values()
620+
.stream()
621+
.filter(metric -> metric.equals(expectedEmittedMetric) == false)
622+
.forEach(metric -> assertThat(measurements.get(metric), empty()));
623+
624+
assertThat(measurements.get(expectedEmittedMetric), hasSize(1));
625+
Measurement measurement = measurements.get(expectedEmittedMetric).get(0);
626+
assertThat(measurement.getLong(), is(1L));
627+
assertFalse(measurement.isDouble());
628+
}
569629
}

modules/data-streams/src/test/java/org/elasticsearch/datastreams/DataStreamGetWriteIndexTests.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
import org.elasticsearch.indices.IndicesService;
5050
import org.elasticsearch.indices.ShardLimitValidator;
5151
import org.elasticsearch.script.ScriptCompiler;
52+
import org.elasticsearch.telemetry.TestTelemetryPlugin;
5253
import org.elasticsearch.test.ClusterServiceUtils;
5354
import org.elasticsearch.test.ESTestCase;
5455
import org.elasticsearch.threadpool.TestThreadPool;
@@ -272,13 +273,15 @@ public void setup() throws Exception {
272273
indicesService,
273274
xContentRegistry()
274275
);
276+
TestTelemetryPlugin telemetryPlugin = new TestTelemetryPlugin();
275277
rolloverService = new MetadataRolloverService(
276278
testThreadPool,
277279
createIndexService,
278280
indexAliasesService,
279281
EmptySystemIndices.INSTANCE,
280282
WriteLoadForecaster.DEFAULT,
281-
clusterService
283+
clusterService,
284+
telemetryPlugin.getTelemetryProvider(Settings.EMPTY)
282285
);
283286
}
284287

modules/data-streams/src/test/java/org/elasticsearch/datastreams/MetadataDataStreamRolloverServiceTests.java

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
import org.elasticsearch.index.IndexVersion;
3232
import org.elasticsearch.index.IndexVersions;
3333
import org.elasticsearch.index.MapperTestUtils;
34+
import org.elasticsearch.telemetry.TestTelemetryPlugin;
3435
import org.elasticsearch.test.ESTestCase;
3536
import org.elasticsearch.threadpool.TestThreadPool;
3637
import org.elasticsearch.threadpool.ThreadPool;
@@ -88,14 +89,16 @@ public void testRolloverClusterStateForDataStream() throws Exception {
8889
);
8990
builder.put(dataStream);
9091
final ClusterState clusterState = ClusterState.builder(new ClusterName("test")).metadata(builder).build();
92+
final TestTelemetryPlugin telemetryPlugin = new TestTelemetryPlugin();
9193

9294
ThreadPool testThreadPool = new TestThreadPool(getTestName());
9395
try {
9496
MetadataRolloverService rolloverService = DataStreamTestHelper.getMetadataRolloverService(
9597
dataStream,
9698
testThreadPool,
9799
Set.of(createSettingsProvider(xContentRegistry())),
98-
xContentRegistry()
100+
xContentRegistry(),
101+
telemetryPlugin.getTelemetryProvider(Settings.EMPTY)
99102
);
100103
MaxDocsCondition condition = new MaxDocsCondition(randomNonNegativeLong());
101104
List<Condition<?>> metConditions = Collections.singletonList(condition);
@@ -184,14 +187,16 @@ public void testRolloverAndMigrateDataStream() throws Exception {
184187
);
185188
builder.put(dataStream);
186189
final ClusterState clusterState = ClusterState.builder(new ClusterName("test")).metadata(builder).build();
190+
final TestTelemetryPlugin telemetryPlugin = new TestTelemetryPlugin();
187191

188192
ThreadPool testThreadPool = new TestThreadPool(getTestName());
189193
try {
190194
MetadataRolloverService rolloverService = DataStreamTestHelper.getMetadataRolloverService(
191195
dataStream,
192196
testThreadPool,
193197
Set.of(createSettingsProvider(xContentRegistry())),
194-
xContentRegistry()
198+
xContentRegistry(),
199+
telemetryPlugin.getTelemetryProvider(Settings.EMPTY)
195200
);
196201
MaxDocsCondition condition = new MaxDocsCondition(randomNonNegativeLong());
197202
List<Condition<?>> metConditions = Collections.singletonList(condition);
@@ -271,14 +276,15 @@ public void testChangingIndexModeFromTimeSeriesToSomethingElseNoEffectOnExisting
271276
);
272277
builder.put(dataStream);
273278
final ClusterState clusterState = ClusterState.builder(new ClusterName("test")).metadata(builder).build();
274-
279+
final TestTelemetryPlugin telemetryPlugin = new TestTelemetryPlugin();
275280
ThreadPool testThreadPool = new TestThreadPool(getTestName());
276281
try {
277282
MetadataRolloverService rolloverService = DataStreamTestHelper.getMetadataRolloverService(
278283
dataStream,
279284
testThreadPool,
280285
Set.of(createSettingsProvider(xContentRegistry())),
281-
xContentRegistry()
286+
xContentRegistry(),
287+
telemetryPlugin.getTelemetryProvider(Settings.EMPTY)
282288
);
283289
MaxDocsCondition condition = new MaxDocsCondition(randomNonNegativeLong());
284290
List<Condition<?>> metConditions = Collections.singletonList(condition);
@@ -336,14 +342,16 @@ public void testRolloverClusterStateWithBrokenOlderTsdbDataStream() throws Excep
336342
int numberOfBackingIndices = randomIntBetween(1, 3);
337343
ClusterState clusterState = createClusterState(dataStreamName, numberOfBackingIndices, now, true);
338344
DataStream dataStream = clusterState.metadata().dataStreams().get(dataStreamName);
339-
340345
ThreadPool testThreadPool = new TestThreadPool(getTestName());
346+
final TestTelemetryPlugin telemetryPlugin = new TestTelemetryPlugin();
347+
341348
try {
342349
MetadataRolloverService rolloverService = DataStreamTestHelper.getMetadataRolloverService(
343350
dataStream,
344351
testThreadPool,
345352
Set.of(createSettingsProvider(xContentRegistry())),
346-
xContentRegistry()
353+
xContentRegistry(),
354+
telemetryPlugin.getTelemetryProvider(Settings.EMPTY)
347355
);
348356
MaxDocsCondition condition = new MaxDocsCondition(randomNonNegativeLong());
349357
List<Condition<?>> metConditions = Collections.singletonList(condition);
@@ -417,14 +425,15 @@ public void testRolloverClusterStateWithBrokenTsdbDataStream() throws Exception
417425
int numberOfBackingIndices = randomIntBetween(1, 3);
418426
ClusterState clusterState = createClusterState(dataStreamName, numberOfBackingIndices, now, false);
419427
DataStream dataStream = clusterState.metadata().dataStreams().get(dataStreamName);
420-
428+
final TestTelemetryPlugin telemetryPlugin = new TestTelemetryPlugin();
421429
ThreadPool testThreadPool = new TestThreadPool(getTestName());
422430
try {
423431
MetadataRolloverService rolloverService = DataStreamTestHelper.getMetadataRolloverService(
424432
dataStream,
425433
testThreadPool,
426434
Set.of(createSettingsProvider(xContentRegistry())),
427-
xContentRegistry()
435+
xContentRegistry(),
436+
telemetryPlugin.getTelemetryProvider(Settings.EMPTY)
428437
);
429438
MaxDocsCondition condition = new MaxDocsCondition(randomNonNegativeLong());
430439
List<Condition<?>> metConditions = Collections.singletonList(condition);

server/src/main/java/org/elasticsearch/action/admin/indices/rollover/MetadataRolloverService.java

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import org.elasticsearch.action.admin.indices.create.CreateIndexClusterStateUpdateRequest;
1414
import org.elasticsearch.action.admin.indices.create.CreateIndexRequest;
1515
import org.elasticsearch.action.datastreams.autosharding.AutoShardingResult;
16+
import org.elasticsearch.action.datastreams.autosharding.AutoShardingType;
1617
import org.elasticsearch.action.support.ActiveShardCount;
1718
import org.elasticsearch.cluster.ClusterState;
1819
import org.elasticsearch.cluster.metadata.AliasAction;
@@ -46,6 +47,8 @@
4647
import org.elasticsearch.indices.SystemIndices;
4748
import org.elasticsearch.snapshots.SnapshotInProgressException;
4849
import org.elasticsearch.snapshots.SnapshotsService;
50+
import org.elasticsearch.telemetry.TelemetryProvider;
51+
import org.elasticsearch.telemetry.metric.MeterRegistry;
4952
import org.elasticsearch.threadpool.ThreadPool;
5053

5154
import java.time.Instant;
@@ -70,15 +73,25 @@ public class MetadataRolloverService {
7073
private static final Logger logger = LogManager.getLogger(MetadataRolloverService.class);
7174
private static final Pattern INDEX_NAME_PATTERN = Pattern.compile("^.*-\\d+$");
7275
private static final List<IndexAbstraction.Type> VALID_ROLLOVER_TARGETS = List.of(ALIAS, DATA_STREAM);
73-
7476
public static final Settings HIDDEN_INDEX_SETTINGS = Settings.builder().put(IndexMetadata.SETTING_INDEX_HIDDEN, true).build();
77+
/**
 * Names of the auto-sharding APM metrics, keyed by the {@link AutoShardingType} each metric corresponds to.
 */
public static final Map<AutoShardingType, String> AUTO_SHARDING_METRIC_NAMES = Map.of(
78+
AutoShardingType.INCREASE_SHARDS,
79+
"es.auto_sharding.increase_shards.total",
80+
AutoShardingType.DECREASE_SHARDS,
81+
"es.auto_sharding.decrease_shards.total",
82+
AutoShardingType.COOLDOWN_PREVENTED_INCREASE,
83+
"es.auto_sharding.cooldown_prevented_increase.total",
84+
AutoShardingType.COOLDOWN_PREVENTED_DECREASE,
85+
"es.auto_sharding.cooldown_prevented_decrease.total"
86+
);
7587

7688
private final ThreadPool threadPool;
7789
private final MetadataCreateIndexService createIndexService;
7890
private final MetadataIndexAliasesService indexAliasesService;
7991
private final SystemIndices systemIndices;
8092
private final WriteLoadForecaster writeLoadForecaster;
8193
private final ClusterService clusterService;
94+
private final MeterRegistry meterRegistry;
8295

8396
@Inject
8497
public MetadataRolloverService(
@@ -87,14 +100,23 @@ public MetadataRolloverService(
87100
MetadataIndexAliasesService indexAliasesService,
88101
SystemIndices systemIndices,
89102
WriteLoadForecaster writeLoadForecaster,
90-
ClusterService clusterService
103+
ClusterService clusterService,
104+
TelemetryProvider telemetryProvider
91105
) {
92106
this.threadPool = threadPool;
93107
this.createIndexService = createIndexService;
94108
this.indexAliasesService = indexAliasesService;
95109
this.systemIndices = systemIndices;
96110
this.writeLoadForecaster = writeLoadForecaster;
97111
this.clusterService = clusterService;
112+
this.meterRegistry = telemetryProvider.getMeterRegistry();
113+
114+
for (var entry : AUTO_SHARDING_METRIC_NAMES.entrySet()) {
115+
final AutoShardingType type = entry.getKey();
116+
final String metricName = entry.getValue();
117+
final String description = String.format(Locale.ROOT, "auto-sharding %s counter", type.name().toLowerCase(Locale.ROOT));
118+
meterRegistry.registerLongCounter(metricName, description, "unit");
119+
}
98120
}
99121

100122
public record RolloverResult(String rolloverIndexName, String sourceIndexName, ClusterState clusterState) {
@@ -330,6 +352,13 @@ private RolloverResult rolloverDataStream(
330352
(builder, indexMetadata) -> builder.put(dataStream.rolloverFailureStore(indexMetadata.getIndex(), newGeneration))
331353
);
332354
} else {
355+
if (autoShardingResult != null) {
356+
final String metricName = AUTO_SHARDING_METRIC_NAMES.get(autoShardingResult.type());
357+
if (metricName != null) {
358+
meterRegistry.getLongCounter(metricName).increment();
359+
}
360+
}
361+
333362
DataStreamAutoShardingEvent dataStreamAutoShardingEvent = autoShardingResult == null
334363
? dataStream.getAutoShardingEvent()
335364
: switch (autoShardingResult.type()) {

server/src/main/java/org/elasticsearch/cluster/ClusterModule.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
package org.elasticsearch.cluster;
1010

11+
import org.elasticsearch.action.admin.indices.rollover.MetadataRolloverService;
1112
import org.elasticsearch.cluster.action.index.MappingUpdatedAction;
1213
import org.elasticsearch.cluster.action.shard.ShardStateAction;
1314
import org.elasticsearch.cluster.metadata.ComponentTemplateMetadata;
@@ -120,6 +121,7 @@ public class ClusterModule extends AbstractModule {
120121
final ShardsAllocator shardsAllocator;
121122
private final ShardRoutingRoleStrategy shardRoutingRoleStrategy;
122123
private final AllocationStatsService allocationStatsService;
124+
private final TelemetryProvider telemetryProvider;
123125

124126
public ClusterModule(
125127
Settings settings,
@@ -157,6 +159,7 @@ public ClusterModule(
157159
);
158160
this.metadataDeleteIndexService = new MetadataDeleteIndexService(settings, clusterService, allocationService);
159161
this.allocationStatsService = new AllocationStatsService(clusterService, clusterInfoService, shardsAllocator, writeLoadForecaster);
162+
this.telemetryProvider = telemetryProvider;
160163
}
161164

162165
static ShardRoutingRoleStrategy getShardRoutingRoleStrategy(List<ClusterPlugin> clusterPlugins) {
@@ -444,6 +447,8 @@ protected void configure() {
444447
bind(ShardsAllocator.class).toInstance(shardsAllocator);
445448
bind(ShardRoutingRoleStrategy.class).toInstance(shardRoutingRoleStrategy);
446449
bind(AllocationStatsService.class).toInstance(allocationStatsService);
450+
bind(TelemetryProvider.class).toInstance(telemetryProvider);
451+
bind(MetadataRolloverService.class).asEagerSingleton();
447452
}
448453

449454
public void setExistingShardsAllocators(GatewayAllocator gatewayAllocator) {

0 commit comments

Comments
 (0)