Skip to content

Commit 6c313a9

Browse files
author
Hendrik Muhs
committed
This implementation lazily (on the first forecast request) checks for available
disk space and creates a subfolder for storing data outside of Lucene indexes, but as part of the ES data paths. Details: - tmp storage is managed and does not allow allocation if disk space is below a threshold (5GB at the moment) - tmp storage is supposed to be managed by the native component, but in case this fails cleanup is provided: - on job close - on process crash - after node crash, on restart - available space is re-checked for every forecast call (the native component has to check again before writing) Note: The first path that has enough space is chosen on job open (job close/reopen triggers a new search)
1 parent b5a793b commit 6c313a9

File tree

10 files changed

+406
-26
lines changed

10 files changed

+406
-26
lines changed

x-pack/docs/en/ml/forecasting.asciidoc

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -59,10 +59,7 @@ For more information about any of these functions, see <<ml-functions>>.
5959
* Forecasts run concurrently with real-time {ml} analysis. That is to say, {ml}
6060
analysis does not stop while forecasts are generated. Forecasts can have an
6161
impact on {ml} jobs, however, especially in terms of memory usage. For this
62-
reason, forecasts run only if the model memory status is acceptable and the
63-
snapshot models for the forecast do not require more than 20 MB. If these memory
64-
limits are reached, consider splitting the job into multiple smaller jobs and
65-
creating forecasts for these.
62+
reason, forecasts run only if the model memory status is acceptable.
6663
* The job must be open when you create a forecast. Otherwise, an error occurs.
6764
* If there is insufficient data to generate any meaningful predictions, an
6865
error occurs. In general, forecasts that are created early in the learning phase

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearning.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -286,7 +286,8 @@ public List<Setting<?>> getSettings() {
286286
DataCountsReporter.ACCEPTABLE_PERCENTAGE_DATE_PARSE_ERRORS_SETTING,
287287
DataCountsReporter.ACCEPTABLE_PERCENTAGE_OUT_OF_ORDER_ERRORS_SETTING,
288288
AutodetectProcessManager.MAX_RUNNING_JOBS_PER_NODE,
289-
AutodetectProcessManager.MAX_OPEN_JOBS_PER_NODE));
289+
AutodetectProcessManager.MAX_OPEN_JOBS_PER_NODE,
290+
AutodetectProcessManager.MIN_DISK_SPACE_OFF_HEAP));
290291
}
291292

292293
public Settings additionalSettings() {
@@ -403,6 +404,9 @@ public Collection<Object> createComponents(Client client, ClusterService cluster
403404
// This object's constructor attaches to the license state, so there's no need to retain another reference to it
404405
new InvalidLicenseEnforcer(settings, getLicenseState(), threadPool, datafeedManager, autodetectProcessManager);
405406

407+
// run node startup tasks
408+
autodetectProcessManager.onNodeStartup();
409+
406410
return Arrays.asList(
407411
mlLifeCycleService,
408412
jobProvider,

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportForecastJobAction.java

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
import org.elasticsearch.common.inject.Inject;
1616
import org.elasticsearch.common.io.stream.StreamInput;
1717
import org.elasticsearch.common.settings.Settings;
18+
import org.elasticsearch.common.unit.ByteSizeUnit;
19+
import org.elasticsearch.common.unit.ByteSizeValue;
1820
import org.elasticsearch.common.unit.TimeValue;
1921
import org.elasticsearch.threadpool.ThreadPool;
2022
import org.elasticsearch.transport.TransportService;
@@ -28,6 +30,7 @@
2830
import org.elasticsearch.xpack.ml.job.process.autodetect.params.ForecastParams;
2931

3032
import java.io.IOException;
33+
import java.nio.file.Path;
3134
import java.util.List;
3235
import java.util.function.Consumer;
3336

@@ -36,6 +39,8 @@
3639
public class TransportForecastJobAction extends TransportJobTaskAction<ForecastJobAction.Request,
3740
ForecastJobAction.Response> {
3841

42+
private static final ByteSizeValue FORECAST_LOCAL_STORAGE_LIMIT = new ByteSizeValue(500, ByteSizeUnit.MB);
43+
3944
private final JobProvider jobProvider;
4045
@Inject
4146
public TransportForecastJobAction(Settings settings, TransportService transportService, ThreadPool threadPool,
@@ -73,6 +78,13 @@ protected void taskOperation(ForecastJobAction.Request request, TransportOpenJob
7378
paramsBuilder.expiresIn(request.getExpiresIn());
7479
}
7580

81+
// tmp storage might be null, we do not log here, because it might not be
82+
// required
83+
Path tmpStorage = processManager.tryGetTmpStorage(task, FORECAST_LOCAL_STORAGE_LIMIT);
84+
if (tmpStorage != null) {
85+
paramsBuilder.tmpStorage(tmpStorage.toString());
86+
}
87+
7688
ForecastParams params = paramsBuilder.build();
7789
processManager.forecastJob(task, params, e -> {
7890
if (e == null) {
Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the Elastic License;
4+
* you may not use this file except in compliance with the Elastic License.
5+
*/
6+
7+
package org.elasticsearch.xpack.ml.job.process;
8+
9+
import org.apache.logging.log4j.Logger;
10+
import org.elasticsearch.common.logging.Loggers;
11+
import org.elasticsearch.common.unit.ByteSizeValue;
12+
import org.elasticsearch.core.internal.io.IOUtils;
13+
import org.elasticsearch.env.Environment;
14+
15+
import java.io.IOException;
16+
import java.nio.file.Files;
17+
import java.nio.file.Path;
18+
19+
/**
20+
* Provide storage for native components.
21+
*/
22+
public class NativeStorageProvider {
23+
24+
private static final Logger LOGGER = Loggers.getLogger(NativeStorageProvider.class);
25+
26+
27+
private static final String LOCAL_STORAGE_SUBFOLDER = "ml-local-data";
28+
private static final String LOCAL_STORAGE_TMP_FOLDER = "tmp";
29+
30+
private final Environment environment;
31+
32+
// do not allow any usage below this threshold
33+
private final ByteSizeValue minLocalStorageAvailable;
34+
35+
public NativeStorageProvider(Environment environment, ByteSizeValue minDiskSpaceOffHeap) {
36+
this.environment = environment;
37+
this.minLocalStorageAvailable = minDiskSpaceOffHeap;
38+
}
39+
40+
/**
41+
* Removes any temporary storage leftovers.
42+
*
43+
* Removes all temp files and folder which might be there as a result of an
44+
* unclean node shutdown or broken clients.
45+
*
46+
* Do not call while there are running jobs.
47+
*
48+
* @throws IOException if cleanup fails
49+
*/
50+
public void cleanupLocalTmpStorageInCaseOfUncleanShutdown() throws IOException {
51+
for (Path p : environment.dataFiles()) {
52+
IOUtils.rm(p.resolve(LOCAL_STORAGE_SUBFOLDER).resolve(LOCAL_STORAGE_TMP_FOLDER));
53+
}
54+
}
55+
56+
/**
57+
* Tries to find local storage for storing temporary data.
58+
*
59+
* @param uniqueIdentifier An identifier to be used as sub folder
60+
* @param requestedSize The maximum size required
61+
* @return Path for temporary storage if available, null otherwise
62+
*/
63+
public Path tryGetLocalTmpStorage(String uniqueIdentifier, ByteSizeValue requestedSize) {
64+
for (Path path : environment.dataFiles()) {
65+
try {
66+
if (getUsableSpace(path) >= requestedSize.getBytes() + minLocalStorageAvailable.getBytes()) {
67+
Path tmpDirectory = path.resolve(LOCAL_STORAGE_SUBFOLDER).resolve(LOCAL_STORAGE_TMP_FOLDER).resolve(uniqueIdentifier);
68+
Files.createDirectories(tmpDirectory);
69+
return tmpDirectory;
70+
}
71+
} catch (IOException e) {
72+
LOGGER.debug("Failed to obtain information about path [{}]: {}", path, e);
73+
}
74+
75+
}
76+
LOGGER.debug("Failed to find native storage for [{}], returning null", uniqueIdentifier);
77+
return null;
78+
}
79+
80+
public boolean localTmpStorageHasEnoughSpace(Path path, ByteSizeValue requestedSize) {
81+
Path realPath = path.toAbsolutePath();
82+
for (Path p : environment.dataFiles()) {
83+
try {
84+
if (realPath.startsWith(p.resolve(LOCAL_STORAGE_SUBFOLDER).resolve(LOCAL_STORAGE_TMP_FOLDER))) {
85+
return getUsableSpace(p) >= requestedSize.getBytes() + minLocalStorageAvailable.getBytes();
86+
}
87+
} catch (IOException e) {
88+
LOGGER.debug("Failed to optain information about path [{}]: {}", path, e);
89+
}
90+
}
91+
92+
LOGGER.debug("Not enough space left for path [{}]", path);
93+
return false;
94+
}
95+
96+
/**
97+
* Delete temporary storage, previously allocated
98+
*
99+
* @param path
100+
* Path to temporary storage
101+
* @throws IOException
102+
* if path can not be cleaned up
103+
*/
104+
public void cleanupLocalTmpStorage(Path path) throws IOException {
105+
// do not allow to breakout from the tmp storage provided
106+
Path realPath = path.toAbsolutePath();
107+
for (Path p : environment.dataFiles()) {
108+
if (realPath.startsWith(p.resolve(LOCAL_STORAGE_SUBFOLDER).resolve(LOCAL_STORAGE_TMP_FOLDER))) {
109+
IOUtils.rm(path);
110+
}
111+
}
112+
}
113+
114+
long getUsableSpace(Path path) throws IOException {
115+
long freeSpaceInBytes = Environment.getFileStore(path).getUsableSpace();
116+
117+
/* See: https://bugs.openjdk.java.net/browse/JDK-8162520 */
118+
if (freeSpaceInBytes < 0) {
119+
freeSpaceInBytes = Long.MAX_VALUE;
120+
}
121+
return freeSpaceInBytes;
122+
}
123+
}

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/process/autodetect/AutodetectProcessManager.java

Lines changed: 66 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
import org.elasticsearch.common.xcontent.XContentElasticsearchExtension;
99
import org.elasticsearch.core.internal.io.IOUtils;
10+
import org.apache.logging.log4j.message.ParameterizedMessage;
1011
import org.elasticsearch.ElasticsearchStatusException;
1112
import org.elasticsearch.action.ActionListener;
1213
import org.elasticsearch.client.Client;
@@ -15,11 +16,12 @@
1516
import org.elasticsearch.common.component.AbstractComponent;
1617
import org.elasticsearch.common.settings.Setting;
1718
import org.elasticsearch.common.settings.Settings;
19+
import org.elasticsearch.common.unit.ByteSizeUnit;
20+
import org.elasticsearch.common.unit.ByteSizeValue;
1821
import org.elasticsearch.common.util.concurrent.AbstractRunnable;
1922
import org.elasticsearch.common.util.concurrent.EsRejectedExecutionException;
2023
import org.elasticsearch.common.util.concurrent.ThreadContext;
2124
import org.elasticsearch.common.xcontent.NamedXContentRegistry;
22-
import org.elasticsearch.common.xcontent.XContentBuilder;
2325
import org.elasticsearch.common.xcontent.XContentType;
2426
import org.elasticsearch.env.Environment;
2527
import org.elasticsearch.index.analysis.AnalysisRegistry;
@@ -47,6 +49,7 @@
4749
import org.elasticsearch.xpack.ml.job.persistence.JobResultsPersister;
4850
import org.elasticsearch.xpack.ml.job.persistence.StateStreamer;
4951
import org.elasticsearch.xpack.ml.job.process.DataCountsReporter;
52+
import org.elasticsearch.xpack.ml.job.process.NativeStorageProvider;
5053
import org.elasticsearch.xpack.ml.job.process.autodetect.output.AutoDetectResultProcessor;
5154
import org.elasticsearch.xpack.ml.job.process.autodetect.params.DataLoadParams;
5255
import org.elasticsearch.xpack.ml.job.process.autodetect.params.FlushJobParams;
@@ -59,6 +62,7 @@
5962

6063
import java.io.IOException;
6164
import java.io.InputStream;
65+
import java.nio.file.Path;
6266
import java.time.Duration;
6367
import java.time.ZonedDateTime;
6468
import java.util.Date;
@@ -96,6 +100,10 @@ public class AutodetectProcessManager extends AbstractComponent {
96100
public static final Setting<Integer> MAX_OPEN_JOBS_PER_NODE =
97101
Setting.intSetting("xpack.ml.max_open_jobs", MAX_RUNNING_JOBS_PER_NODE, 1, Property.NodeScope);
98102

103+
// Undocumented setting for integration test purposes
104+
public static final Setting<ByteSizeValue> MIN_DISK_SPACE_OFF_HEAP =
105+
Setting.byteSizeSetting("xpack.ml.min_disk_space_off_heap", new ByteSizeValue(5, ByteSizeUnit.GB), Property.NodeScope);
106+
99107
private final Client client;
100108
private final Environment environment;
101109
private final ThreadPool threadPool;
@@ -107,8 +115,12 @@ public class AutodetectProcessManager extends AbstractComponent {
107115
private final JobResultsPersister jobResultsPersister;
108116
private final JobDataCountsPersister jobDataCountsPersister;
109117

118+
private NativeStorageProvider nativeStorageProvider;
110119
private final ConcurrentMap<Long, ProcessContext> processByAllocation = new ConcurrentHashMap<>();
111120

121+
// a map that manages the allocation of temporary space to jobs
122+
private final ConcurrentMap<String, Path> nativeTmpStorage = new ConcurrentHashMap<>();
123+
112124
private final int maxAllowedRunningJobs;
113125

114126
private final NamedXContentRegistry xContentRegistry;
@@ -133,6 +145,15 @@ public AutodetectProcessManager(Environment environment, Settings settings, Clie
133145
this.jobResultsPersister = jobResultsPersister;
134146
this.jobDataCountsPersister = jobDataCountsPersister;
135147
this.auditor = auditor;
148+
this.nativeStorageProvider = new NativeStorageProvider(environment, MIN_DISK_SPACE_OFF_HEAP.get(settings));
149+
}
150+
151+
public void onNodeStartup() {
152+
try {
153+
nativeStorageProvider.cleanupLocalTmpStorageInCaseOfUncleanShutdown();
154+
} catch (Exception e) {
155+
logger.warn("Failed to cleanup native storage from previous invocation", e);
156+
}
136157
}
137158

138159
public synchronized void closeAllJobsOnThisNode(String reason) throws IOException {
@@ -251,17 +272,40 @@ public void flushJob(JobTask jobTask, FlushJobParams params, ActionListener<Flus
251272
});
252273
}
253274

275+
/**
276+
* Request temporary storage to be used for the job
277+
*
278+
* @param jobTask The job task
279+
* @param requestedSize requested size
280+
* @return a Path to local storage or null if storage is not available
281+
*/
282+
public Path tryGetTmpStorage(JobTask jobTask, ByteSizeValue requestedSize) {
283+
String jobId = jobTask.getJobId();
284+
Path path = nativeTmpStorage.get(jobId);
285+
if (path == null) {
286+
path = nativeStorageProvider.tryGetLocalTmpStorage(jobId, requestedSize);
287+
if (path != null) {
288+
nativeTmpStorage.put(jobId, path);
289+
}
290+
} else if (!nativeStorageProvider.localTmpStorageHasEnoughSpace(path, requestedSize)) {
291+
// the previous tmp location ran out of disk space, do not allow further usage
292+
return null;
293+
}
294+
return path;
295+
}
296+
254297
/**
255298
* Do a forecast for the running job.
256299
*
257300
* @param jobTask The job task
258301
* @param params Forecast parameters
259302
*/
260303
public void forecastJob(JobTask jobTask, ForecastParams params, Consumer<Exception> handler) {
261-
logger.debug("Forecasting job {}", jobTask.getJobId());
304+
String jobId = jobTask.getJobId();
305+
logger.debug("Forecasting job {}", jobId);
262306
AutodetectCommunicator communicator = getOpenAutodetectCommunicator(jobTask);
263307
if (communicator == null) {
264-
String message = String.format(Locale.ROOT, "Cannot forecast because job [%s] is not open", jobTask.getJobId());
308+
String message = String.format(Locale.ROOT, "Cannot forecast because job [%s] is not open", jobId);
265309
logger.debug(message);
266310
handler.accept(ExceptionsHelper.conflictStatusException(message));
267311
return;
@@ -271,7 +315,7 @@ public void forecastJob(JobTask jobTask, ForecastParams params, Consumer<Excepti
271315
if (e == null) {
272316
handler.accept(null);
273317
} else {
274-
String msg = String.format(Locale.ROOT, "[%s] exception while forecasting job", jobTask.getJobId());
318+
String msg = String.format(Locale.ROOT, "[%s] exception while forecasting job", jobId);
275319
logger.error(msg, e);
276320
handler.accept(ExceptionsHelper.serverError(msg, e));
277321
}
@@ -477,6 +521,11 @@ private Runnable onProcessCrash(JobTask jobTask) {
477521
}
478522
}
479523
setJobState(jobTask, JobState.FAILED);
524+
try {
525+
removeTmpStorage(jobTask.getJobId());
526+
} catch (IOException e) {
527+
logger.error(new ParameterizedMessage("[{}] Failed to delete temporary files", jobTask.getJobId()), e);
528+
}
480529
};
481530
}
482531

@@ -535,6 +584,12 @@ public void closeJob(JobTask jobTask, boolean restart, String reason) {
535584
// thread that gets into this method blocks until the first thread has finished closing the job
536585
processContext.unlock();
537586
}
587+
// delete any tmp storage
588+
try {
589+
removeTmpStorage(jobId);
590+
} catch (IOException e) {
591+
logger.error(new ParameterizedMessage("[{}]Failed to delete temporary files", jobId), e);
592+
}
538593
}
539594

540595
int numberOfOpenJobs() {
@@ -613,6 +668,13 @@ public Optional<Tuple<DataCounts, ModelSizeStats>> getStatistics(JobTask jobTask
613668
return Optional.of(new Tuple<>(communicator.getDataCounts(), communicator.getModelSizeStats()));
614669
}
615670

671+
private void removeTmpStorage(String jobId) throws IOException {
672+
Path path = nativeTmpStorage.get(jobId);
673+
if (path != null) {
674+
nativeStorageProvider.cleanupLocalTmpStorage(path);
675+
}
676+
}
677+
616678
ExecutorService createAutodetectExecutorService(ExecutorService executorService) {
617679
AutodetectWorkerExecutorService autoDetectWorkerExecutor = new AutodetectWorkerExecutorService(threadPool.getThreadContext());
618680
executorService.submit(autoDetectWorkerExecutor::start);

0 commit comments

Comments
 (0)