Commit 85713f7

Semantic Text Chunking Indexing Pressure (#125517)
We have observed many OOMs due to the memory required to inject chunked inference results for semantic_text fields. This PR uses coordinating indexing pressure to account for this memory usage. When indexing pressure memory usage exceeds the threshold set by indexing_pressure.memory.limit, chunked inference result injection will be suspended to prevent OOMs.
1 parent 235867c commit 85713f7
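
Note that indexing_pressure.memory.limit is the existing node-level indexing pressure setting, not one introduced by this commit. For reference, a sketch of how it might be tuned in elasticsearch.yml (10% of the heap is, as far as we know, the default; the value below is illustrative only):

indexing_pressure.memory.limit: 10%

Once coordinating memory crosses this threshold, the increment calls introduced below throw EsRejectedExecutionException and chunked inference result injection is suspended for the affected documents.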

File tree: 9 files changed, +621 -27 lines

docs/changelog/125517.yaml (+5)

@@ -0,0 +1,5 @@
+pr: 125517
+summary: Semantic Text Chunking Indexing Pressure
+area: Machine Learning
+type: enhancement
+issues: []

server/src/main/java/org/elasticsearch/index/IndexingPressure.java (+6 -2)

@@ -157,11 +157,15 @@ public Incremental startIncrementalCoordinating(int operations, long bytes, bool
     }
 
     public Coordinating markCoordinatingOperationStarted(int operations, long bytes, boolean forceExecution) {
-        Coordinating coordinating = new Coordinating(forceExecution);
+        Coordinating coordinating = createCoordinatingOperation(forceExecution);
         coordinating.increment(operations, bytes);
         return coordinating;
     }
 
+    public Coordinating createCoordinatingOperation(boolean forceExecution) {
+        return new Coordinating(forceExecution);
+    }
+
     public class Incremental implements Releasable {
 
         private final AtomicBoolean closed = new AtomicBoolean();
@@ -254,7 +258,7 @@ public Coordinating(boolean forceExecution) {
             this.forceExecution = forceExecution;
        }
 
-        private void increment(int operations, long bytes) {
+        public void increment(int operations, long bytes) {
            assert closed.get() == false;
            long combinedBytes = currentCombinedCoordinatingAndPrimaryBytes.addAndGet(bytes);
            long replicaWriteBytes = currentReplicaBytes.get();
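
Taken together, these changes let a caller acquire a Coordinating handle before the total byte count is known and grow it incrementally as memory is consumed; increment is made public for the same reason. A minimal usage sketch (the indexingPressure reference and byte estimate below are stand-ins, not code from this commit):

// Sketch only: acquire, grow, release.
IndexingPressure.Coordinating coordinating = indexingPressure.createCoordinatingOperation(false);
try {
    // With forceExecution == false, increment throws EsRejectedExecutionException
    // once the node's indexing_pressure.memory.limit would be exceeded.
    coordinating.increment(1, estimatedSourceBytes);
    // ... perform the memory-consuming work ...
} finally {
    // Coordinating is a Releasable; closing it returns the accounted bytes.
    coordinating.close();
}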

server/src/main/java/org/elasticsearch/node/NodeConstruction.java (+4 -2)

@@ -928,6 +928,8 @@ public Map<String, String> queryFields() {
             metadataCreateIndexService
         );
 
+        final IndexingPressure indexingLimits = new IndexingPressure(settings);
+
         PluginServiceInstances pluginServices = new PluginServiceInstances(
             client,
             clusterService,
@@ -950,7 +952,8 @@ public Map<String, String> queryFields() {
             documentParsingProvider,
             taskManager,
             projectResolver,
-            slowLogFieldProvider
+            slowLogFieldProvider,
+            indexingLimits
         );
 
         Collection<?> pluginComponents = pluginsService.flatMap(plugin -> {
@@ -983,7 +986,6 @@ public Map<String, String> queryFields() {
             .map(TerminationHandlerProvider::handler);
         terminationHandler = getSinglePlugin(terminationHandlers, TerminationHandler.class).orElse(null);
 
-        final IndexingPressure indexingLimits = new IndexingPressure(settings);
         final IncrementalBulkService incrementalBulkService = new IncrementalBulkService(client, indexingLimits);
 
         final ResponseCollectorService responseCollectorService = new ResponseCollectorService(clusterService);

server/src/main/java/org/elasticsearch/node/PluginServiceInstances.java (+3 -1)

@@ -20,6 +20,7 @@
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.env.NodeEnvironment;
 import org.elasticsearch.features.FeatureService;
+import org.elasticsearch.index.IndexingPressure;
 import org.elasticsearch.index.SlowLogFieldProvider;
 import org.elasticsearch.indices.IndicesService;
 import org.elasticsearch.indices.SystemIndices;
@@ -55,5 +56,6 @@ public record PluginServiceInstances(
     DocumentParsingProvider documentParsingProvider,
     TaskManager taskManager,
     ProjectResolver projectResolver,
-    SlowLogFieldProvider slowLogFieldProvider
+    SlowLogFieldProvider slowLogFieldProvider,
+    IndexingPressure indexingPressure
 ) implements Plugin.PluginServices {}

server/src/main/java/org/elasticsearch/plugins/Plugin.java (+6)

@@ -29,6 +29,7 @@
 import org.elasticsearch.features.FeatureService;
 import org.elasticsearch.index.IndexModule;
 import org.elasticsearch.index.IndexSettingProvider;
+import org.elasticsearch.index.IndexingPressure;
 import org.elasticsearch.index.SlowLogFieldProvider;
 import org.elasticsearch.indices.IndicesService;
 import org.elasticsearch.indices.SystemIndices;
@@ -186,6 +187,11 @@ public interface PluginServices {
          * Provider for additional SlowLog fields
          */
         SlowLogFieldProvider slowLogFieldProvider();
+
+        /**
+         * Provider for indexing pressure
+         */
+        IndexingPressure indexingPressure();
     }
 
     /**
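
With the new accessor on PluginServices, any plugin can reach the node-wide IndexingPressure instance from createComponents. A hypothetical sketch of a consumer (MyPlugin and MyMemoryAwareComponent are illustrative names; the InferencePlugin change below is the real consumer in this commit):

public class MyPlugin extends Plugin {
    @Override
    public Collection<?> createComponents(PluginServices services) {
        // Hypothetical: hand the node's indexing pressure to a component that
        // needs to account for coordinating memory usage.
        IndexingPressure indexingPressure = services.indexingPressure();
        return List.of(new MyMemoryAwareComponent(indexingPressure));
    }
}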

x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferencePlugin.java (+7 -1)

@@ -321,7 +321,13 @@ public Collection<?> createComponents(PluginServices services) {
         }
         inferenceServiceRegistry.set(serviceRegistry);
 
-        var actionFilter = new ShardBulkInferenceActionFilter(services.clusterService(), serviceRegistry, modelRegistry, getLicenseState());
+        var actionFilter = new ShardBulkInferenceActionFilter(
+            services.clusterService(),
+            serviceRegistry,
+            modelRegistry,
+            getLicenseState(),
+            services.indexingPressure()
+        );
         shardBulkInferenceActionFilter.set(actionFilter);
 
         var meterRegistry = services.telemetryProvider().getMeterRegistry();

x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/action/filter/ShardBulkInferenceActionFilter.java (+100 -11)

@@ -29,11 +29,13 @@
 import org.elasticsearch.common.settings.Setting;
 import org.elasticsearch.common.unit.ByteSizeValue;
 import org.elasticsearch.common.util.concurrent.AtomicArray;
+import org.elasticsearch.common.util.concurrent.EsRejectedExecutionException;
 import org.elasticsearch.common.xcontent.XContentHelper;
 import org.elasticsearch.common.xcontent.support.XContentMapValues;
 import org.elasticsearch.core.Nullable;
 import org.elasticsearch.core.Releasable;
 import org.elasticsearch.core.TimeValue;
+import org.elasticsearch.index.IndexingPressure;
 import org.elasticsearch.index.mapper.InferenceMetadataFieldsMapper;
 import org.elasticsearch.inference.ChunkInferenceInput;
 import org.elasticsearch.inference.ChunkedInference;
@@ -109,18 +111,21 @@ public class ShardBulkInferenceActionFilter implements MappedActionFilter {
     private final InferenceServiceRegistry inferenceServiceRegistry;
     private final ModelRegistry modelRegistry;
     private final XPackLicenseState licenseState;
+    private final IndexingPressure indexingPressure;
     private volatile long batchSizeInBytes;
 
     public ShardBulkInferenceActionFilter(
         ClusterService clusterService,
         InferenceServiceRegistry inferenceServiceRegistry,
         ModelRegistry modelRegistry,
-        XPackLicenseState licenseState
+        XPackLicenseState licenseState,
+        IndexingPressure indexingPressure
     ) {
         this.clusterService = clusterService;
         this.inferenceServiceRegistry = inferenceServiceRegistry;
         this.modelRegistry = modelRegistry;
         this.licenseState = licenseState;
+        this.indexingPressure = indexingPressure;
         this.batchSizeInBytes = INDICES_INFERENCE_BATCH_SIZE.get(clusterService.getSettings()).getBytes();
         clusterService.getClusterSettings().addSettingsUpdateConsumer(INDICES_INFERENCE_BATCH_SIZE, this::setBatchSize);
     }
@@ -146,8 +151,15 @@ public <Request extends ActionRequest, Response extends ActionResponse> void app
         BulkShardRequest bulkShardRequest = (BulkShardRequest) request;
         var fieldInferenceMetadata = bulkShardRequest.consumeInferenceFieldMap();
         if (fieldInferenceMetadata != null && fieldInferenceMetadata.isEmpty() == false) {
-            Runnable onInferenceCompletion = () -> chain.proceed(task, action, request, listener);
-            processBulkShardRequest(fieldInferenceMetadata, bulkShardRequest, onInferenceCompletion);
+            // Maintain coordinating indexing pressure from inference until the indexing operations are complete
+            IndexingPressure.Coordinating coordinatingIndexingPressure = indexingPressure.createCoordinatingOperation(false);
+            Runnable onInferenceCompletion = () -> chain.proceed(
+                task,
+                action,
+                request,
+                ActionListener.releaseAfter(listener, coordinatingIndexingPressure)
+            );
+            processBulkShardRequest(fieldInferenceMetadata, bulkShardRequest, onInferenceCompletion, coordinatingIndexingPressure);
             return;
         }
     }
@@ -157,12 +169,14 @@ public <Request extends ActionRequest, Response extends ActionResponse> void app
     private void processBulkShardRequest(
         Map<String, InferenceFieldMetadata> fieldInferenceMap,
         BulkShardRequest bulkShardRequest,
-        Runnable onCompletion
+        Runnable onCompletion,
+        IndexingPressure.Coordinating coordinatingIndexingPressure
     ) {
         final ProjectMetadata project = clusterService.state().getMetadata().getProject();
         var index = project.index(bulkShardRequest.index());
         boolean useLegacyFormat = InferenceMetadataFieldsMapper.isEnabled(index.getSettings()) == false;
-        new AsyncBulkShardInferenceAction(useLegacyFormat, fieldInferenceMap, bulkShardRequest, onCompletion).run();
+        new AsyncBulkShardInferenceAction(useLegacyFormat, fieldInferenceMap, bulkShardRequest, onCompletion, coordinatingIndexingPressure)
+            .run();
     }
 
     private record InferenceProvider(InferenceService service, Model model) {}
@@ -232,18 +246,21 @@ private class AsyncBulkShardInferenceAction implements Runnable {
         private final BulkShardRequest bulkShardRequest;
         private final Runnable onCompletion;
         private final AtomicArray<FieldInferenceResponseAccumulator> inferenceResults;
+        private final IndexingPressure.Coordinating coordinatingIndexingPressure;
 
         private AsyncBulkShardInferenceAction(
             boolean useLegacyFormat,
             Map<String, InferenceFieldMetadata> fieldInferenceMap,
             BulkShardRequest bulkShardRequest,
-            Runnable onCompletion
+            Runnable onCompletion,
+            IndexingPressure.Coordinating coordinatingIndexingPressure
         ) {
             this.useLegacyFormat = useLegacyFormat;
             this.fieldInferenceMap = fieldInferenceMap;
            this.bulkShardRequest = bulkShardRequest;
            this.inferenceResults = new AtomicArray<>(bulkShardRequest.items().length);
            this.onCompletion = onCompletion;
+            this.coordinatingIndexingPressure = coordinatingIndexingPressure;
        }
 
        @Override
@@ -431,9 +448,9 @@ public void onFailure(Exception exc) {
          */
         private long addFieldInferenceRequests(BulkItemRequest item, int itemIndex, Map<String, List<FieldInferenceRequest>> requestsMap) {
             boolean isUpdateRequest = false;
-            final IndexRequest indexRequest;
+            final IndexRequestWithIndexingPressure indexRequest;
             if (item.request() instanceof IndexRequest ir) {
-                indexRequest = ir;
+                indexRequest = new IndexRequestWithIndexingPressure(ir);
             } else if (item.request() instanceof UpdateRequest updateRequest) {
                 isUpdateRequest = true;
                 if (updateRequest.script() != null) {
@@ -447,13 +464,13 @@ private long addFieldInferenceRequests(BulkItemRequest item, int itemIndex, Map<
                     );
                     return 0;
                 }
-                indexRequest = updateRequest.doc();
+                indexRequest = new IndexRequestWithIndexingPressure(updateRequest.doc());
             } else {
                 // ignore delete request
                 return 0;
             }
 
-            final Map<String, Object> docMap = indexRequest.sourceAsMap();
+            final Map<String, Object> docMap = indexRequest.getIndexRequest().sourceAsMap();
             long inputLength = 0;
             for (var entry : fieldInferenceMap.values()) {
                 String field = entry.getName();
@@ -489,6 +506,10 @@ private long addFieldInferenceRequests(BulkItemRequest item, int itemIndex, Map<
                  * This ensures that the field is treated as intentionally cleared,
                  * preventing any unintended carryover of prior inference results.
                  */
+                if (incrementIndexingPressure(indexRequest, itemIndex) == false) {
+                    return inputLength;
+                }
+
                 var slot = ensureResponseAccumulatorSlot(itemIndex);
                 slot.addOrUpdateResponse(
                     new FieldInferenceResponse(field, sourceField, null, order++, 0, null, EMPTY_CHUNKED_INFERENCE)
@@ -510,6 +531,7 @@ private long addFieldInferenceRequests(BulkItemRequest item, int itemIndex, Map<
                     }
                     continue;
                 }
+
                 var slot = ensureResponseAccumulatorSlot(itemIndex);
                 final List<String> values;
                 try {
@@ -527,7 +549,10 @@ private long addFieldInferenceRequests(BulkItemRequest item, int itemIndex, Map<
                 List<FieldInferenceRequest> requests = requestsMap.computeIfAbsent(inferenceId, k -> new ArrayList<>());
                 int offsetAdjustment = 0;
                 for (String v : values) {
-                    inputLength += v.length();
+                    if (incrementIndexingPressure(indexRequest, itemIndex) == false) {
+                        return inputLength;
+                    }
+
                     if (v.isBlank()) {
                         slot.addOrUpdateResponse(
                             new FieldInferenceResponse(field, sourceField, v, order++, 0, null, EMPTY_CHUNKED_INFERENCE)
@@ -536,6 +561,7 @@ private long addFieldInferenceRequests(BulkItemRequest item, int itemIndex, Map<
                         requests.add(
                             new FieldInferenceRequest(itemIndex, field, sourceField, v, order++, offsetAdjustment, chunkingSettings)
                         );
+                        inputLength += v.length();
                     }
 
                     // When using the inference metadata fields format, all the input values are concatenated so that the
@@ -545,9 +571,54 @@ private long addFieldInferenceRequests(BulkItemRequest item, int itemIndex, Map<
                     }
                 }
             }
+
             return inputLength;
         }
 
+        private static class IndexRequestWithIndexingPressure {
+            private final IndexRequest indexRequest;
+            private boolean indexingPressureIncremented;
+
+            private IndexRequestWithIndexingPressure(IndexRequest indexRequest) {
+                this.indexRequest = indexRequest;
+                this.indexingPressureIncremented = false;
+            }
+
+            private IndexRequest getIndexRequest() {
+                return indexRequest;
+            }
+
+            private boolean isIndexingPressureIncremented() {
+                return indexingPressureIncremented;
+            }
+
+            private void setIndexingPressureIncremented() {
+                this.indexingPressureIncremented = true;
+            }
+        }
+
+        private boolean incrementIndexingPressure(IndexRequestWithIndexingPressure indexRequest, int itemIndex) {
+            boolean success = true;
+            if (indexRequest.isIndexingPressureIncremented() == false) {
+                try {
+                    // Track operation count as one operation per document source update
+                    coordinatingIndexingPressure.increment(1, indexRequest.getIndexRequest().source().ramBytesUsed());
+                    indexRequest.setIndexingPressureIncremented();
+                } catch (EsRejectedExecutionException e) {
+                    addInferenceResponseFailure(
+                        itemIndex,
+                        new InferenceException(
+                            "Insufficient memory available to update source on document [" + indexRequest.getIndexRequest().id() + "]",
+                            e
+                        )
+                    );
+                    success = false;
+                }
+            }
+
+            return success;
        }
+
         private FieldInferenceResponseAccumulator ensureResponseAccumulatorSlot(int id) {
             FieldInferenceResponseAccumulator acc = inferenceResults.get(id);
             if (acc == null) {
@@ -624,6 +695,7 @@ private void applyInferenceResponses(BulkItemRequest item, FieldInferenceRespons
                 inferenceFieldsMap.put(fieldName, result);
             }
 
+            BytesReference originalSource = indexRequest.source();
             if (useLegacyFormat) {
                 var newDocMap = indexRequest.sourceAsMap();
                 for (var entry : inferenceFieldsMap.entrySet()) {
@@ -636,6 +708,23 @@ private void applyInferenceResponses(BulkItemRequest item, FieldInferenceRespons
                     indexRequest.source(builder);
                 }
             }
+            long modifiedSourceSize = indexRequest.source().ramBytesUsed();
+
+            // Add the indexing pressure from the source modifications.
+            // Don't increment operation count because we count one source update as one operation, and we already accounted for those
+            // in addFieldInferenceRequests.
+            try {
+                coordinatingIndexingPressure.increment(0, modifiedSourceSize - originalSource.ramBytesUsed());
+            } catch (EsRejectedExecutionException e) {
+                indexRequest.source(originalSource, indexRequest.getContentType());
+                item.abort(
+                    item.index(),
+                    new InferenceException(
+                        "Insufficient memory available to insert inference results into document [" + indexRequest.id() + "]",
+                        e
+                    )
+                );
+            }
         }
     }
 
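For readers skimming the diff: accounting happens in two phases against a single Coordinating handle that lives for the duration of the bulk request. A condensed, illustrative sketch of the lifecycle (simplified names, not the filter's actual control flow):

IndexingPressure.Coordinating pressure = indexingPressure.createCoordinatingOperation(false);
try {
    // Phase 1 (addFieldInferenceRequests): one operation per document,
    // sized by the original source bytes.
    pressure.increment(1, indexRequest.source().ramBytesUsed());

    long before = indexRequest.source().ramBytesUsed();
    // ... run chunked inference and inject the results into the source ...
    long after = indexRequest.source().ramBytesUsed();

    // Phase 2 (applyInferenceResponses): account only for the growth caused by
    // injecting the inference results; no additional operation is counted.
    pressure.increment(0, after - before);
} catch (EsRejectedExecutionException e) {
    // Over indexing_pressure.memory.limit: the individual document is failed
    // (or the item aborted and its source restored) instead of risking an OOM.
}
// The handle is released via ActionListener.releaseAfter(listener, pressure)
// once the bulk request completes.

The design point is that exceeding the limit degrades individual documents with a rejection error rather than destabilizing the whole node.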
