Skip to content

Commit 3f49eef

Browse files
[FEATURE][ML] Write data frame configuration to process (#35914)
1 parent 6e3f832 commit 3f49eef

9 files changed

+180
-21
lines changed

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearning.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -406,7 +406,7 @@ public Collection<Object> createComponents(Client client, ClusterService cluster
406406
new BlackHoleAutodetectProcess(job.getId());
407407
// factor of 1.0 makes renormalization a no-op
408408
normalizerProcessFactory = (jobId, quantilesState, bucketSpan, executorService) -> new MultiplyingNormalizerProcess(1.0);
409-
analyticsProcessFactory = (jobId, executorService) -> null;
409+
analyticsProcessFactory = (jobId, analyticsProcessConfig, executorService) -> null;
410410
}
411411
NormalizerFactory normalizerFactory = new NormalizerFactory(normalizerProcessFactory,
412412
threadPool.executor(MachineLearning.UTILITY_THREAD_POOL_NAME));

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportRunAnalyticsAction.java

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
import org.elasticsearch.action.admin.indices.create.CreateIndexAction;
1111
import org.elasticsearch.action.admin.indices.create.CreateIndexRequest;
1212
import org.elasticsearch.action.admin.indices.create.CreateIndexResponse;
13+
import org.elasticsearch.action.admin.indices.refresh.RefreshAction;
14+
import org.elasticsearch.action.admin.indices.refresh.RefreshRequest;
1315
import org.elasticsearch.action.support.ActionFilters;
1416
import org.elasticsearch.action.support.HandledTransportAction;
1517
import org.elasticsearch.action.support.master.AcknowledgedResponse;
@@ -25,6 +27,7 @@
2527
import org.elasticsearch.env.Environment;
2628
import org.elasticsearch.index.IndexNotFoundException;
2729
import org.elasticsearch.index.IndexSortConfig;
30+
import org.elasticsearch.index.reindex.BulkByScrollResponse;
2831
import org.elasticsearch.index.reindex.ReindexAction;
2932
import org.elasticsearch.index.reindex.ReindexRequest;
3033
import org.elasticsearch.script.Script;
@@ -114,18 +117,23 @@ private boolean isMlNode(DiscoveryNode node) {
114117
private void reindexDataframeAndStartAnalysis(String index, ActionListener<AcknowledgedResponse> listener) {
115118
final String destinationIndex = index + "_copy";
116119

120+
ActionListener<BulkByScrollResponse> reindexCompletedListener = ActionListener.wrap(
121+
bulkResponse -> {
122+
client.execute(RefreshAction.INSTANCE, new RefreshRequest(destinationIndex), ActionListener.wrap(
123+
refreshResponse -> {
124+
runPipelineAnalytics(destinationIndex, listener);
125+
}, listener::onFailure
126+
));
127+
}, listener::onFailure
128+
);
129+
117130
ActionListener<CreateIndexResponse> copyIndexCreatedListener = ActionListener.wrap(
118131
createIndexResponse -> {
119132
ReindexRequest reindexRequest = new ReindexRequest();
120133
reindexRequest.setSourceIndices(index);
121134
reindexRequest.setDestIndex(destinationIndex);
122135
reindexRequest.setScript(new Script("ctx._source." + DataFrameFields.ID + " = ctx._id"));
123-
client.execute(ReindexAction.INSTANCE, reindexRequest, ActionListener.wrap(
124-
bulkResponse -> {
125-
runPipelineAnalytics(destinationIndex, listener);
126-
},
127-
listener::onFailure
128-
));
136+
client.execute(ReindexAction.INSTANCE, reindexRequest, reindexCompletedListener);
129137
}, listener::onFailure
130138
);
131139

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the Elastic License;
4+
* you may not use this file except in compliance with the Elastic License.
5+
*/
6+
package org.elasticsearch.xpack.ml.analytics;
7+
8+
import org.elasticsearch.common.ParseField;
9+
import org.elasticsearch.common.xcontent.ToXContentObject;
10+
import org.elasticsearch.common.xcontent.XContentBuilder;
11+
import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper;
12+
13+
import java.io.IOException;
14+
15+
public class DataFrameAnalysis implements ToXContentObject {
16+
17+
private static final ParseField NAME = new ParseField("name");
18+
19+
private final String name;
20+
21+
public DataFrameAnalysis(String name) {
22+
this.name = ExceptionsHelper.requireNonNull(name, NAME.getPreferredName());
23+
}
24+
25+
@Override
26+
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
27+
builder.startObject();
28+
builder.field(NAME.getPreferredName(), name);
29+
builder.endObject();
30+
return builder;
31+
}
32+
}

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/analytics/DataFrameDataExtractor.java

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,4 +189,25 @@ public String[] getFieldNamesArray() {
189189
List<String> fieldNames = getFieldNames();
190190
return fieldNames.toArray(new String[fieldNames.size()]);
191191
}
192+
193+
public DataSummary collectDataSummary() {
194+
SearchRequestBuilder searchRequestBuilder = new SearchRequestBuilder(client, SearchAction.INSTANCE)
195+
.setIndices(context.indices)
196+
.setSize(0)
197+
.setQuery(context.query);
198+
199+
SearchResponse searchResponse = executeSearchRequest(searchRequestBuilder);
200+
return new DataSummary(searchResponse.getHits().getTotalHits(), context.extractedFields.getAllFields().size());
201+
}
202+
203+
public static class DataSummary {
204+
205+
public final long rows;
206+
public final long cols;
207+
208+
public DataSummary(long rows, long cols) {
209+
this.rows = rows;
210+
this.cols = cols;
211+
}
212+
}
192213
}

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/analytics/process/AnalyticsBuilder.java

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,19 @@
55
*/
66
package org.elasticsearch.xpack.ml.analytics.process;
77

8+
import org.elasticsearch.common.Strings;
9+
import org.elasticsearch.common.xcontent.ToXContent;
10+
import org.elasticsearch.common.xcontent.XContentBuilder;
11+
import org.elasticsearch.common.xcontent.json.JsonXContent;
12+
import org.elasticsearch.env.Environment;
813
import org.elasticsearch.xpack.ml.process.NativeController;
914
import org.elasticsearch.xpack.ml.process.ProcessPipes;
1015

1116
import java.io.IOException;
17+
import java.io.OutputStreamWriter;
18+
import java.nio.charset.StandardCharsets;
19+
import java.nio.file.Files;
20+
import java.nio.file.Path;
1221
import java.util.ArrayList;
1322
import java.util.List;
1423
import java.util.Objects;
@@ -19,13 +28,21 @@ public class AnalyticsBuilder {
1928
private static final String ANALYTICS_PATH = "./" + ANALYTICS;
2029

2130
private static final String LENGTH_ENCODED_INPUT_ARG = "--lengthEncodedInput";
31+
private static final String CONFIG_ARG = "--config=";
2232

33+
private final Environment env;
2334
private final NativeController nativeController;
2435
private final ProcessPipes processPipes;
36+
private final AnalyticsProcessConfig config;
37+
private final List<Path> filesToDelete;
2538

26-
public AnalyticsBuilder(NativeController nativeController, ProcessPipes processPipes) {
39+
/**
 * Creates a builder for the analytics process command.
 *
 * @param env              environment whose temp directory receives the generated config file
 * @param nativeController controller used to start the process
 * @param processPipes     pipes associated with the process (usage is outside this constructor)
 * @param config           process configuration that is written to the config file
 * @param filesToDelete    list to which the path of every generated config file is added
 * @throws NullPointerException if any argument is {@code null}; the message names the argument
 */
public AnalyticsBuilder(Environment env, NativeController nativeController, ProcessPipes processPipes, AnalyticsProcessConfig config,
                        List<Path> filesToDelete) {
    // Name each argument so a failed check identifies which one was null.
    this.env = Objects.requireNonNull(env, "env");
    this.nativeController = Objects.requireNonNull(nativeController, "nativeController");
    this.processPipes = Objects.requireNonNull(processPipes, "processPipes");
    this.config = Objects.requireNonNull(config, "config");
    this.filesToDelete = Objects.requireNonNull(filesToDelete, "filesToDelete");
}
3047

3148
public void build() throws IOException {
@@ -34,10 +51,24 @@ public void build() throws IOException {
3451
nativeController.startProcess(command);
3552
}
3653

37-
List<String> buildAnalyticsCommand() {
54+
List<String> buildAnalyticsCommand() throws IOException {
3855
List<String> command = new ArrayList<>();
3956
command.add(ANALYTICS_PATH);
4057
command.add(LENGTH_ENCODED_INPUT_ARG);
58+
addConfigFile(command);
4159
return command;
4260
}
61+
62+
private void addConfigFile(List<String> command) throws IOException {
63+
Path configFile = Files.createTempFile(env.tmpFile(), "analysis", ".conf");
64+
filesToDelete.add(configFile);
65+
try (OutputStreamWriter osw = new OutputStreamWriter(Files.newOutputStream(configFile),StandardCharsets.UTF_8);
66+
XContentBuilder jsonBuilder = JsonXContent.contentBuilder()) {
67+
68+
config.toXContent(jsonBuilder, ToXContent.EMPTY_PARAMS);
69+
osw.write(Strings.toString(jsonBuilder));
70+
}
71+
72+
command.add(CONFIG_ARG + configFile.toString());
73+
}
4374
}
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the Elastic License;
4+
* you may not use this file except in compliance with the Elastic License.
5+
*/
6+
package org.elasticsearch.xpack.ml.analytics.process;
7+
8+
import org.elasticsearch.common.unit.ByteSizeValue;
9+
import org.elasticsearch.common.xcontent.ToXContentObject;
10+
import org.elasticsearch.common.xcontent.XContentBuilder;
11+
import org.elasticsearch.xpack.ml.analytics.DataFrameAnalysis;
12+
13+
import java.io.IOException;
14+
import java.util.Objects;
15+
16+
public class AnalyticsProcessConfig implements ToXContentObject {
17+
18+
private static final String ROWS = "rows";
19+
private static final String COLS = "cols";
20+
private static final String MEMORY_LIMIT = "memory_limit";
21+
private static final String THREADS = "threads";
22+
private static final String ANALYSIS = "analysis";
23+
24+
private final long rows;
25+
private final long cols;
26+
private final ByteSizeValue memoryLimit;
27+
private final int threads;
28+
private final DataFrameAnalysis analysis;
29+
30+
31+
public AnalyticsProcessConfig(long rows, long cols, ByteSizeValue memoryLimit, int threads, DataFrameAnalysis analysis) {
32+
this.rows = rows;
33+
this.cols = cols;
34+
this.memoryLimit = Objects.requireNonNull(memoryLimit);
35+
this.threads = threads;
36+
this.analysis = Objects.requireNonNull(analysis);
37+
}
38+
39+
@Override
40+
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
41+
builder.startObject();
42+
builder.field(ROWS, rows);
43+
builder.field(COLS, cols);
44+
builder.field(MEMORY_LIMIT, memoryLimit.getBytes());
45+
builder.field(THREADS, threads);
46+
builder.field(ANALYSIS, analysis);
47+
builder.endObject();
48+
return builder;
49+
}
50+
}

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/analytics/process/AnalyticsProcessFactory.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,9 @@ public interface AnalyticsProcessFactory {
1313
* Create an implementation of {@link AnalyticsProcess}
1414
*
1515
* @param jobId The job id
16+
* @param analyticsProcessConfig The process configuration
1617
* @param executorService Executor service used to start the async tasks a job needs to operate the analytical process
1718
* @return The process
1819
*/
19-
AnalyticsProcess createAnalyticsProcess(String jobId, ExecutorService executorService);
20+
AnalyticsProcess createAnalyticsProcess(String jobId, AnalyticsProcessConfig analyticsProcessConfig, ExecutorService executorService);
2021
}

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/analytics/process/AnalyticsProcessManager.java

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,13 @@
88
import org.apache.logging.log4j.LogManager;
99
import org.apache.logging.log4j.Logger;
1010
import org.elasticsearch.client.Client;
11+
import org.elasticsearch.common.unit.ByteSizeUnit;
12+
import org.elasticsearch.common.unit.ByteSizeValue;
1113
import org.elasticsearch.env.Environment;
1214
import org.elasticsearch.threadpool.ThreadPool;
1315
import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper;
1416
import org.elasticsearch.xpack.ml.MachineLearning;
17+
import org.elasticsearch.xpack.ml.analytics.DataFrameAnalysis;
1518
import org.elasticsearch.xpack.ml.analytics.DataFrameDataExtractor;
1619

1720
import java.io.IOException;
@@ -39,7 +42,7 @@ public AnalyticsProcessManager(Client client, Environment environment, ThreadPoo
3942

4043
public void processData(String jobId, DataFrameDataExtractor dataExtractor) {
4144
threadPool.generic().execute(() -> {
42-
AnalyticsProcess process = createProcess(jobId);
45+
AnalyticsProcess process = createProcess(jobId, dataExtractor);
4346
try {
4447
// Fake header
4548
process.writeRecord(dataExtractor.getFieldNamesArray());
@@ -69,13 +72,20 @@ public void processData(String jobId, DataFrameDataExtractor dataExtractor) {
6972
});
7073
}
7174

72-
private AnalyticsProcess createProcess(String jobId) {
75+
private AnalyticsProcess createProcess(String jobId, DataFrameDataExtractor dataExtractor) {
7376
// TODO We should rename the thread pool to reflect its more general use now, e.g. JOB_PROCESS_THREAD_POOL_NAME
7477
ExecutorService executorService = threadPool.executor(MachineLearning.AUTODETECT_THREAD_POOL_NAME);
75-
AnalyticsProcess process = processFactory.createAnalyticsProcess(jobId, executorService);
78+
AnalyticsProcess process = processFactory.createAnalyticsProcess(jobId, createProcessConfig(dataExtractor), executorService);
7679
if (process.isProcessAlive() == false) {
7780
throw ExceptionsHelper.serverError("Failed to start analytics process");
7881
}
7982
return process;
8083
}
84+
85+
private AnalyticsProcessConfig createProcessConfig(DataFrameDataExtractor dataExtractor) {
86+
DataFrameDataExtractor.DataSummary dataSummary = dataExtractor.collectDataSummary();
87+
AnalyticsProcessConfig config = new AnalyticsProcessConfig(dataSummary.rows, dataSummary.cols,
88+
new ByteSizeValue(1, ByteSizeUnit.GB), 1, new DataFrameAnalysis("outliers"));
89+
return config;
90+
}
8191
}

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/analytics/process/NativeAnalyticsProcessFactory.java

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,10 @@
1616
import org.elasticsearch.xpack.ml.utils.NamedPipeHelper;
1717

1818
import java.io.IOException;
19+
import java.nio.file.Path;
1920
import java.time.Duration;
20-
import java.util.Collections;
21+
import java.util.ArrayList;
22+
import java.util.List;
2123
import java.util.Objects;
2224
import java.util.concurrent.ExecutorService;
2325

@@ -37,15 +39,17 @@ public NativeAnalyticsProcessFactory(Environment env, NativeController nativeCon
3739
}
3840

3941
@Override
40-
public AnalyticsProcess createAnalyticsProcess(String jobId, ExecutorService executorService) {
42+
public AnalyticsProcess createAnalyticsProcess(String jobId, AnalyticsProcessConfig analyticsProcessConfig,
43+
ExecutorService executorService) {
44+
List<Path> filesToDelete = new ArrayList<>();
4145
ProcessPipes processPipes = new ProcessPipes(env, NAMED_PIPE_HELPER, AnalyticsBuilder.ANALYTICS, jobId,
42-
true, false, true, true, false, false);
46+
true, false, true, true, false, false);
4347

44-
createNativeProcess(jobId, processPipes);
48+
createNativeProcess(jobId, analyticsProcessConfig, filesToDelete, processPipes);
4549

4650
NativeAnalyticsProcess analyticsProcess = new NativeAnalyticsProcess(jobId, processPipes.getLogStream().get(),
47-
processPipes.getProcessInStream().get(), processPipes.getProcessOutStream().get(), null, 0,
48-
Collections.emptyList(), () -> {});
51+
processPipes.getProcessInStream().get(), processPipes.getProcessOutStream().get(), null, 0,
52+
filesToDelete, () -> {});
4953

5054

5155
try {
@@ -61,8 +65,10 @@ public AnalyticsProcess createAnalyticsProcess(String jobId, ExecutorService exe
6165
}
6266
}
6367

64-
private void createNativeProcess(String jobId, ProcessPipes processPipes) {
65-
AnalyticsBuilder analyticsBuilder = new AnalyticsBuilder(nativeController, processPipes);
68+
private void createNativeProcess(String jobId, AnalyticsProcessConfig analyticsProcessConfig, List<Path> filesToDelete,
69+
ProcessPipes processPipes) {
70+
AnalyticsBuilder analyticsBuilder = new AnalyticsBuilder(env, nativeController, processPipes, analyticsProcessConfig,
71+
filesToDelete);
6672
try {
6773
analyticsBuilder.build();
6874
processPipes.connectStreams(PROCESS_STARTUP_TIMEOUT);

0 commit comments

Comments
 (0)