[7.x] [ML] adding ml autoscaling integration test (#65638) #65775

Merged
merged 3 commits on Dec 3, 2020
GetAutoscalingCapacityAction.java
@@ -112,6 +112,9 @@ public int hashCode() {
return Objects.hash(results);
}

public Map<String, AutoscalingDeciderResults> getResults() {
return results;
}
}

}
AnalysisLimits.java
@@ -87,17 +87,17 @@ public AnalysisLimits(Long categorizationExamplesLimit) {
this(DEFAULT_MODEL_MEMORY_LIMIT_MB, categorizationExamplesLimit);
}

- public AnalysisLimits(Long modelMemoryLimit, Long categorizationExamplesLimit) {
- if (modelMemoryLimit != null && modelMemoryLimit < 1) {
- String msg = Messages.getMessage(Messages.JOB_CONFIG_MODEL_MEMORY_LIMIT_TOO_LOW, modelMemoryLimit, "1 MiB");
+ public AnalysisLimits(Long modelMemoryLimitMb, Long categorizationExamplesLimit) {
+ if (modelMemoryLimitMb != null && modelMemoryLimitMb < 1) {
+ String msg = Messages.getMessage(Messages.JOB_CONFIG_MODEL_MEMORY_LIMIT_TOO_LOW, modelMemoryLimitMb, "1 MiB");
throw ExceptionsHelper.badRequestException(msg);
}
if (categorizationExamplesLimit != null && categorizationExamplesLimit < 0) {
String msg = Messages.getMessage(Messages.JOB_CONFIG_FIELD_VALUE_TOO_LOW, CATEGORIZATION_EXAMPLES_LIMIT, 0,
categorizationExamplesLimit);
throw ExceptionsHelper.badRequestException(msg);
}
- this.modelMemoryLimit = modelMemoryLimit;
+ this.modelMemoryLimit = modelMemoryLimitMb;
this.categorizationExamplesLimit = categorizationExamplesLimit;
}

AutoscalingIT.java (new file)
@@ -0,0 +1,178 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/

package org.elasticsearch.xpack.ml.integration;

import org.elasticsearch.action.admin.cluster.node.info.NodeInfo;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.xpack.autoscaling.Autoscaling;
import org.elasticsearch.xpack.autoscaling.action.GetAutoscalingCapacityAction;
import org.elasticsearch.xpack.autoscaling.action.PutAutoscalingPolicyAction;
import org.elasticsearch.xpack.autoscaling.capacity.AutoscalingDeciderResult;
import org.elasticsearch.xpack.autoscaling.capacity.AutoscalingDeciderResults;
import org.elasticsearch.xpack.core.ml.job.config.AnalysisConfig;
import org.elasticsearch.xpack.core.ml.job.config.AnalysisLimits;
import org.elasticsearch.xpack.core.ml.job.config.DataDescription;
import org.elasticsearch.xpack.core.ml.job.config.Detector;
import org.elasticsearch.xpack.core.ml.job.config.Job;
import org.elasticsearch.xpack.ml.MachineLearning;
import org.elasticsearch.xpack.ml.autoscaling.MlAutoscalingDeciderService;
import org.elasticsearch.xpack.ml.autoscaling.NativeMemoryCapacity;

import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.stream.Collectors;

import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.hasKey;

public class AutoscalingIT extends MlNativeAutodetectIntegTestCase {

private static final long BASIC_REQUIREMENT_MB = 10;
private static final long NATIVE_PROCESS_OVERHEAD_MB = 30;
private static final long BASELINE_OVERHEAD_MB = BASIC_REQUIREMENT_MB + NATIVE_PROCESS_OVERHEAD_MB;

// This test assumes that xpack.ml.max_machine_memory_percent is 30
// and that xpack.ml.use_auto_machine_memory_percent is false
public void testMLAutoscalingCapacity() {
SortedMap<String, Settings> deciders = new TreeMap<>();
deciders.put(MlAutoscalingDeciderService.NAME,
Settings.builder().put(MlAutoscalingDeciderService.DOWN_SCALE_DELAY.getKey(), TimeValue.ZERO).build());
final PutAutoscalingPolicyAction.Request request = new PutAutoscalingPolicyAction.Request(
"ml_test",
new TreeSet<>(),
Contributor (henningandersen):

@benwtrent do you know why this did not work? It is preventing me from getting #66082 merged.

Member Author (benwtrent):

@henningandersen when I tried to supply "ml" it complained saying it was not really a valid node role. Now, this may have been due to the ML plugin not being loaded correctly? The easiest way to check is to simply put ml in there in 7.x and try to run the test. I will double check.

Contributor (henningandersen):

Yeah, I did that and got the validation error. I only wanted to be sure I was not chasing something you had already investigated deeply.

Member Author (benwtrent):

@henningandersen

 REPRODUCE WITH: ./gradlew ':x-pack:plugin:ml:qa:native-multi-node-tests:javaRestTest' --tests "org.elasticsearch.xpack.ml.integration.AutoscalingIT.testMLAutoscalingCapacity" -Dtests.seed=93A328B876AF5108 -Dtests.security.manager=true -Dtests.locale=pl -Dtests.timezone=Atlantic/Faroe -Druntime.java=11
  2> org.elasticsearch.action.ActionRequestValidationException: Validation Failed: 1: ml;
        at __randomizedtesting.SeedInfo.seed([93A328B876AF5108:D856B3907EB37506]:0)
        at org.elasticsearch.xpack.autoscaling.action.PutAutoscalingPolicyAction$Request.validate(PutAutoscalingPolicyAction.java:150)
        at org.elasticsearch.action.TransportActionNodeProxy.execute(TransportActionNodeProxy.java:42)
        at org.elasticsearch.client.transport.TransportProxyClient.lambda$execute$0(TransportProxyClient.java:55)
        at org.elasticsearch.client.transport.TransportClientNodesService.execute(TransportClientNodesService.java:253)
        at org.elasticsearch.client.transport.TransportProxyClient.execute(TransportProxyClient.java:55)
        at org.elasticsearch.client.transport.TransportClient.doExecute(TransportClient.java:391)
        at org.elasticsearch.client.support.AbstractClient.execute(AbstractClient.java:412)
        at org.elasticsearch.client.FilterClient.doExecute(FilterClient.java:65)
        at org.elasticsearch.client.support.AbstractClient.execute(AbstractClient.java:412)
        at org.elasticsearch.client.support.AbstractClient.execute(AbstractClient.java:401)
        at org.elasticsearch.xpack.ml.integration.AutoscalingIT.testMLAutoscalingCapacity(AutoscalingIT.java:57)

I THINK the node role format has changed in master vs 7.x. Passing ml as the only role works fine in master.

Contributor (henningandersen):

Ahhhh, the validation runs in the transport client, and this causes the issue, since the client does not know about the ml plugin. Using a core role like master passes that validation (but with my PR the test then fails validation).

Contributor (henningandersen):

Let me open a PR to move the validation out of the request in 7.x.
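The mechanism described above can be shown in isolation. A minimal sketch of the failure mode, assuming 7.x validates requested role names against the set of roles the local JVM knows about: built-in roles are always present, while plugin roles such as ml exist only where the ML plugin is loaded, which is never the case inside the transport client. The role set, method names, and exception below are illustrative stand-ins, not the actual Elasticsearch code.

    import java.util.Set;

    public class RoleValidationSketch {
        // Stand-in for the role names known to a transport client without the
        // ML plugin on its classpath (illustrative subset of built-in roles).
        private static final Set<String> KNOWN_ROLE_NAMES =
            Set.of("master", "data", "ingest", "remote_cluster_client");

        static void validate(Set<String> requestedRoles) {
            for (String role : requestedRoles) {
                if (KNOWN_ROLE_NAMES.contains(role) == false) {
                    // Same shape as the "Validation Failed: 1: ml;" message in the log above
                    throw new IllegalArgumentException("Validation Failed: 1: " + role + ";");
                }
            }
        }

        public static void main(String[] args) {
            validate(Set.of("master")); // passes: core roles are always known
            try {
                validate(Set.of("ml")); // throws: the client never registered the ml role
            } catch (IllegalArgumentException e) {
                System.out.println(e.getMessage()); // Validation Failed: 1: ml;
            }
        }
    }

This is consistent with the observation above that passing ml as the only role works fine on master, where the request is validated on a node that has the plugin loaded rather than in the transport client.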

deciders
);
assertAcked(client().execute(PutAutoscalingPolicyAction.INSTANCE, request).actionGet());

assertMlCapacity(
client().execute(
GetAutoscalingCapacityAction.INSTANCE,
new GetAutoscalingCapacityAction.Request()
).actionGet(),
"Requesting scale down as tier and/or node size could be smaller",
0L,
0L);

putJob("job1", 100);
putJob("job2", 200);
openJob("job1");
openJob("job2");
long expectedTierBytes = (long)Math.ceil(
ByteSizeValue.ofMb(100 + BASELINE_OVERHEAD_MB + 200 + BASELINE_OVERHEAD_MB).getBytes() * 100 / 30.0
);
long expectedNodeBytes = (long)Math.ceil(ByteSizeValue.ofMb(200 + BASELINE_OVERHEAD_MB).getBytes() * 100 / 30.0);

assertMlCapacity(
client().execute(
GetAutoscalingCapacityAction.INSTANCE,
new GetAutoscalingCapacityAction.Request()
).actionGet(),
"Requesting scale down as tier and/or node size could be smaller",
expectedTierBytes,
expectedNodeBytes);

putJob("bigjob1", 60_000);
putJob("bigjob2", 50_000);
openJob("bigjob1");
openJob("bigjob2");
List<DiscoveryNode> mlNodes = admin()
.cluster()
.prepareNodesInfo()
.all()
.get()
.getNodes()
.stream()
.map(NodeInfo::getNode)
.filter(MachineLearning::isMlNode)
.collect(Collectors.toList());
NativeMemoryCapacity currentScale = MlAutoscalingDeciderService.currentScale(mlNodes, 30, false);
expectedTierBytes = (long)Math.ceil(
(ByteSizeValue.ofMb(50_000 + BASIC_REQUIREMENT_MB + 60_000 + BASIC_REQUIREMENT_MB).getBytes()
+ currentScale.getTier()
) * 100 / 30.0
);
expectedNodeBytes = (long)Math.ceil(ByteSizeValue.ofMb(60_000 + BASELINE_OVERHEAD_MB).getBytes() * 100 / 30.0);


assertMlCapacity(
client().execute(
GetAutoscalingCapacityAction.INSTANCE,
new GetAutoscalingCapacityAction.Request()
).actionGet(),
"requesting scale up as number of jobs in queues exceeded configured limit",
expectedTierBytes,
expectedNodeBytes);

expectedTierBytes = (long)Math.ceil(
ByteSizeValue.ofMb(100 + BASELINE_OVERHEAD_MB + 200 + BASELINE_OVERHEAD_MB).getBytes() * 100 / 30.0
);
expectedNodeBytes = (long)Math.ceil(ByteSizeValue.ofMb(200 + BASELINE_OVERHEAD_MB).getBytes() * 100 / 30.0);
closeJob("bigjob1");
closeJob("bigjob2");

assertMlCapacity(
client().execute(
GetAutoscalingCapacityAction.INSTANCE,
new GetAutoscalingCapacityAction.Request()
).actionGet(),
"Requesting scale down as tier and/or node size could be smaller",
expectedTierBytes,
expectedNodeBytes);
closeJob("job1");
closeJob("job2");

assertMlCapacity(
client().execute(
GetAutoscalingCapacityAction.INSTANCE,
new GetAutoscalingCapacityAction.Request()
).actionGet(),
"Requesting scale down as tier and/or node size could be smaller",
0L,
0L);
}

private void assertMlCapacity(GetAutoscalingCapacityAction.Response capacity, String reason, long tierBytes, long nodeBytes) {
assertThat(capacity.getResults(), hasKey("ml_test"));
AutoscalingDeciderResults autoscalingDeciderResults = capacity.getResults().get("ml_test");
assertThat(autoscalingDeciderResults.results(), hasKey("ml"));

AutoscalingDeciderResult autoscalingDeciderResult = autoscalingDeciderResults.results().get("ml");
assertThat(autoscalingDeciderResult.reason().summary(), containsString(reason));
assertThat(autoscalingDeciderResult.requiredCapacity().tier().memory().getBytes(), equalTo(tierBytes));
assertThat(autoscalingDeciderResult.requiredCapacity().node().memory().getBytes(), equalTo(nodeBytes));
}

private void putJob(String jobId, long limitMb) {
Job.Builder job =
new Job.Builder(jobId)
.setAllowLazyOpen(true)
.setAnalysisLimits(new AnalysisLimits(limitMb, null))
.setAnalysisConfig(
new AnalysisConfig.Builder((List<Detector>) null)
.setBucketSpan(TimeValue.timeValueHours(1))
.setDetectors(
Collections.singletonList(
new Detector.Builder("count", null)
.setPartitionFieldName("user")
.build())))
.setDataDescription(
new DataDescription.Builder()
.setTimeFormat("epoch"));

registerJob(job);
putJob(job);
}
}
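For reference, the expected-capacity arithmetic in the test above can be reproduced by hand: job model-memory limits are native ML memory, while the decider answers in total node and tier memory, so the test scales native requirements up by 100/30 to undo the 30% xpack.ml.max_machine_memory_percent cut. A standalone sketch of the numbers for the two small jobs follows; the constants mirror the test, and this is an illustration, not Elasticsearch code.

    public class CapacityMath {
        public static void main(String[] args) {
            long mb = 1024L * 1024L;
            long baselineOverheadMb = 10 + 30; // per-job requirement + native process overhead

            // Tier: job1 (100 MB limit) and job2 (200 MB limit), each plus baseline overhead
            long nativeTierBytes = (100 + baselineOverheadMb + 200 + baselineOverheadMb) * mb;
            long expectedTierBytes = (long) Math.ceil(nativeTierBytes * 100 / 30.0);

            // Node: must fit the single largest job plus its overhead
            long nativeNodeBytes = (200 + baselineOverheadMb) * mb;
            long expectedNodeBytes = (long) Math.ceil(nativeNodeBytes * 100 / 30.0);

            System.out.println(expectedTierBytes); // 1328196267 bytes (~1.24 GiB)
            System.out.println(expectedNodeBytes); // 838860800 bytes (800 MiB)
        }
    }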
MlNativeIntegTestCase.java
@@ -6,6 +6,10 @@
package org.elasticsearch.xpack.ml.integration;

import org.elasticsearch.action.admin.cluster.node.tasks.list.ListTasksRequest;
import org.elasticsearch.cluster.NamedDiff;
import org.elasticsearch.xpack.autoscaling.Autoscaling;
import org.elasticsearch.xpack.autoscaling.AutoscalingMetadata;
import org.elasticsearch.xpack.autoscaling.capacity.AutoscalingDeciderResult;
import org.elasticsearch.xpack.core.action.CreateDataStreamAction;
import org.elasticsearch.action.admin.indices.template.put.PutComposableIndexTemplateAction;
import org.elasticsearch.action.support.master.AcknowledgedResponse;
@@ -72,6 +76,9 @@
import org.elasticsearch.xpack.core.slm.history.SnapshotLifecycleTemplateRegistry;
import org.elasticsearch.xpack.datastreams.DataStreamsPlugin;
import org.elasticsearch.xpack.ilm.IndexLifecycle;
import org.elasticsearch.xpack.ml.LocalStateMachineLearning;
import org.elasticsearch.xpack.ml.MachineLearning;
import org.elasticsearch.xpack.ml.autoscaling.MlScalingReason;

import java.io.IOException;
import java.net.URISyntaxException;
@@ -106,7 +113,10 @@ protected NamedXContentRegistry xContentRegistry() {
protected Collection<Class<? extends Plugin>> nodePlugins() {
return Arrays.asList(
LocalStateCompositeXPackPlugin.class,
MachineLearning.class,
Netty4Plugin.class,
Autoscaling.class,
ReindexPlugin.class,
// The monitoring plugin requires script and gsub processors to be loaded
IngestCommonPlugin.class,
// The monitoring plugin script processor references painless. Include this for script compilation.
@@ -121,6 +131,8 @@ protected Collection<Class<? extends Plugin>> nodePlugins() {
protected Collection<Class<? extends Plugin>> transportClientPlugins() {
return Arrays.asList(
XPackClientPlugin.class,
Autoscaling.class,
MachineLearning.class,
Netty4Plugin.class,
ReindexPlugin.class,
// ILM is required for .ml-state template index settings
@@ -144,6 +156,7 @@ protected Settings externalClusterClientSettings() {
builder.put(SecurityField.USER_SETTING.getKey(), "x_pack_rest_user:" + SecuritySettingsSourceField.TEST_PASSWORD_SECURE_STRING);
builder.put(XPackSettings.MACHINE_LEARNING_ENABLED.getKey(), true);
builder.put(XPackSettings.WATCHER_ENABLED.getKey(), false);
builder.put(Autoscaling.AUTOSCALING_ENABLED_SETTING.getKey(), true);
builder.put(LifecycleSettings.LIFECYCLE_HISTORY_INDEX_ENABLED_SETTING.getKey(), false);
builder.put(LifecycleSettings.SLM_HISTORY_INDEX_ENABLED_SETTING.getKey(), false);
builder.put("xpack.security.transport.ssl.enabled", true);
@@ -236,18 +249,25 @@ protected void ensureClusterStateConsistency() throws IOException {
entries.add(new NamedWriteableRegistry.Entry(LifecycleAction.class, DeleteAction.NAME, DeleteAction::new));
entries.add(new NamedWriteableRegistry.Entry(LifecycleAction.class, RolloverAction.NAME, RolloverAction::new));
entries.add(new NamedWriteableRegistry.Entry(PersistentTaskParams.class, MlTasks.DATAFEED_TASK_NAME,
StartDatafeedAction.DatafeedParams::new));
entries.add(new NamedWriteableRegistry.Entry(PersistentTaskParams.class, MlTasks.DATA_FRAME_ANALYTICS_TASK_NAME,
StartDataFrameAnalyticsAction.TaskParams::new));
entries.add(new NamedWriteableRegistry.Entry(PersistentTaskParams.class, MlTasks.JOB_TASK_NAME,
OpenJobAction.JobParams::new));
- entries.add(new NamedWriteableRegistry.Entry(PersistentTaskParams.class, MlTasks.JOB_SNAPSHOT_UPGRADE_TASK_NAME,
- SnapshotUpgradeTaskParams::new));
entries.add(new NamedWriteableRegistry.Entry(PersistentTaskState.class, JobTaskState.NAME, JobTaskState::new));
entries.add(new NamedWriteableRegistry.Entry(PersistentTaskState.class, DatafeedState.NAME, DatafeedState::fromStream));
entries.add(new NamedWriteableRegistry.Entry(PersistentTaskState.class, DataFrameAnalyticsTaskState.NAME,
DataFrameAnalyticsTaskState::new));
entries.add(new NamedWriteableRegistry.Entry(ClusterState.Custom.class, TokenMetadata.TYPE, TokenMetadata::new));
entries.add(new NamedWriteableRegistry.Entry(Metadata.Custom.class, AutoscalingMetadata.NAME, AutoscalingMetadata::new));
entries.add(new NamedWriteableRegistry.Entry(NamedDiff.class,
AutoscalingMetadata.NAME,
AutoscalingMetadata.AutoscalingMetadataDiff::new));
entries.add(new NamedWriteableRegistry.Entry(
AutoscalingDeciderResult.Reason.class,
MlScalingReason.NAME,
MlScalingReason::new
));
final NamedWriteableRegistry namedWriteableRegistry = new NamedWriteableRegistry(entries);
ClusterState masterClusterState = client().admin().cluster().prepareState().all().get().getState();
byte[] masterClusterStateBytes = ClusterState.Builder.toBytes(masterClusterState);
MachineLearning.java
@@ -435,7 +435,7 @@ public Set<DiscoveryNodeRole> getRoles() {
false,
Property.NodeScope);
public static final Setting<Integer> MAX_LAZY_ML_NODES =
- Setting.intSetting("xpack.ml.max_lazy_ml_nodes", 0, 0, 3, Property.Dynamic, Property.NodeScope);
+ Setting.intSetting("xpack.ml.max_lazy_ml_nodes", 0, 0, Property.Dynamic, Property.NodeScope);

// Before 8.0.0 this needs to match the max allowed value for xpack.ml.max_open_jobs,
// as the current node could be running in a cluster where some nodes are still using
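The one-line change above switches from the bounded Setting.intSetting overload (key, default, min, max, properties) to the unbounded one (key, default, min, properties), lifting the previous hard cap of 3 on xpack.ml.max_lazy_ml_nodes, presumably so an autoscaled cluster is not limited to three lazily added ML nodes. A toy sketch of the difference in accepted values (illustrative, not the Setting class itself):

    public class BoundedSettingSketch {
        // Mimics an int setting with a lower bound and an optional upper bound.
        static int parse(int value, int min, Integer max) {
            if (value < min) throw new IllegalArgumentException("must be >= " + min);
            if (max != null && value > max) throw new IllegalArgumentException("must be <= " + max);
            return value;
        }

        public static void main(String[] args) {
            System.out.println(parse(5, 0, null)); // accepted once the cap is gone
            try {
                parse(5, 0, 3); // rejected under the old bounded setting
            } catch (IllegalArgumentException e) {
                System.out.println(e.getMessage()); // must be <= 3
            }
        }
    }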
MlAutoscalingDeciderService.java
@@ -63,9 +63,10 @@ public class MlAutoscalingDeciderService implements AutoscalingDeciderService,
private static final String MEMORY_STALE = "unable to make scaling decision as job memory requirements are stale";
private static final long NO_SCALE_DOWN_POSSIBLE = -1L;

+ public static final String NAME = "ml";
public static final Setting<Integer> NUM_ANOMALY_JOBS_IN_QUEUE = Setting.intSetting("num_anomaly_jobs_in_queue", 0, 0);
public static final Setting<Integer> NUM_ANALYTICS_JOBS_IN_QUEUE = Setting.intSetting("num_analytics_jobs_in_queue", 0, 0);
- public static final Setting<TimeValue> DOWN_SCALE_DELAY = Setting.timeSetting("num_analytics_jobs_in_queue", TimeValue.ZERO);
+ public static final Setting<TimeValue> DOWN_SCALE_DELAY = Setting.timeSetting("down_scale_delay", TimeValue.timeValueHours(1));

private final NodeLoadDetector nodeLoadDetector;
private final MlMemoryTracker mlMemoryTracker;
@@ -156,7 +157,8 @@ static Optional<NativeMemoryCapacity> requiredCapacityForUnassignedJobs(List<Str
}
jobSizes.sort(Comparator.comparingLong(Long::longValue).reversed());
long tierMemory = 0L;
- long nodeMemory = jobSizes.get(0);
+ // Node memory needs to be AT LEAST the size of the largest job + the required overhead.
+ long nodeMemory = jobSizes.get(0) + MachineLearning.NATIVE_EXECUTABLE_CODE_OVERHEAD.getBytes();
Iterator<Long> iter = jobSizes.iterator();
while (jobSizes.size() > maxNumInQueue && iter.hasNext()) {
tierMemory += iter.next();
@@ -226,7 +228,9 @@ private boolean newScaleDownCheck() {
return scaleDownDetected == NO_SCALE_DOWN_POSSIBLE;
}

- NativeMemoryCapacity currentScale(final List<DiscoveryNode> machineLearningNodes) {
+ public static NativeMemoryCapacity currentScale(final List<DiscoveryNode> machineLearningNodes,
+ int maxMachineMemoryPercent,
+ boolean useAuto) {
long[] mlMemory = machineLearningNodes.stream()
.mapToLong(node -> NativeMemoryCalculator.allowedBytesForMl(node, maxMachineMemoryPercent, useAuto).orElse(0L))
.toArray();
@@ -244,6 +248,10 @@ NativeMemoryCapacity currentScale(final List<DiscoveryNode> machineLearningNodes
);
}

NativeMemoryCapacity currentScale(final List<DiscoveryNode> machineLearningNodes) {
return currentScale(machineLearningNodes, maxMachineMemoryPercent, useAuto);
}

@Override
public void offMaster() {
isMaster = false;
@@ -567,14 +575,16 @@ Optional<AutoscalingDeciderResult> checkForScaleDown(List<DiscoveryNode> nodes,
return Optional.empty();
}
long currentlyNecessaryTier = nodeLoads.stream().mapToLong(NodeLoad::getAssignedJobMemory).sum();
+ // The required NATIVE node memory is the largest job and our static overhead.
+ long currentlyNecessaryNode = largestJob == 0 ? 0 : largestJob + MachineLearning.NATIVE_EXECUTABLE_CODE_OVERHEAD.getBytes();
// We consider a scale down if we are not fully utilizing the tier
// Or our largest job could be on a smaller node (meaning the same size tier but smaller nodes are possible).
- if (currentlyNecessaryTier < currentCapacity.getTier() || largestJob < currentCapacity.getNode()) {
+ if (currentlyNecessaryTier < currentCapacity.getTier() || currentlyNecessaryNode < currentCapacity.getNode()) {
NativeMemoryCapacity nativeMemoryCapacity = new NativeMemoryCapacity(
currentlyNecessaryTier,
- largestJob,
+ currentlyNecessaryNode,
// If our newly suggested native capacity is the same, we can use the previously stored jvm size
- largestJob == currentCapacity.getNode() ? currentCapacity.getJvmSize() : null);
+ currentlyNecessaryNode == currentCapacity.getNode() ? currentCapacity.getJvmSize() : null);
return Optional.of(
new AutoscalingDeciderResult(
nativeMemoryCapacity.autoscalingCapacity(maxMachineMemoryPercent, useAuto),
@@ -588,7 +598,7 @@ Optional<AutoscalingDeciderResult> checkForScaleDown(List<DiscoveryNode> nodes,

@Override
public String name() {
- return "ml";
+ return NAME;
}

@Override
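Both behavioral fixes in this file fold MachineLearning.NATIVE_EXECUTABLE_CODE_OVERHEAD into the per-node requirement: a node must fit its single largest job plus the static native-code overhead, whether sizing for queued jobs or deciding on a scale down. A self-contained sketch of the queue-based sizing rule, assuming a 30 MB overhead to match the test constants (names and structure are illustrative, not the actual decider code):

    import java.util.ArrayList;
    import java.util.Comparator;
    import java.util.List;

    public class QueueSizingSketch {
        private static final long NATIVE_CODE_OVERHEAD_BYTES = 30L * 1024 * 1024;

        // Tier must hold every job that may not stay queued; the node must hold
        // the single largest job plus the native executable overhead.
        static long[] requiredCapacity(List<Long> unassignedJobBytes, int maxNumInQueue) {
            List<Long> sizes = new ArrayList<>(unassignedJobBytes);
            sizes.sort(Comparator.reverseOrder());
            long nodeBytes = sizes.isEmpty() ? 0 : sizes.get(0) + NATIVE_CODE_OVERHEAD_BYTES;
            long tierBytes = 0;
            // Pull jobs off the queue (largest first) until only maxNumInQueue remain.
            while (sizes.size() > maxNumInQueue) {
                tierBytes += sizes.remove(0);
            }
            return new long[] { tierBytes, nodeBytes };
        }

        public static void main(String[] args) {
            long mb = 1024L * 1024L;
            long[] cap = requiredCapacity(List.of(60_000 * mb, 50_000 * mb), 0);
            System.out.println(cap[0]); // tier: both jobs' native memory
            System.out.println(cap[1]); // node: largest job + 30 MB overhead
        }
    }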
MlScalingReason.java
@@ -21,7 +21,7 @@

public class MlScalingReason implements AutoscalingDeciderResult.Reason {

- static final String NAME = "ml";
+ public static final String NAME = MlAutoscalingDeciderService.NAME;
static final String WAITING_ANALYTICS_JOBS = "waiting_analytics_jobs";
static final String WAITING_ANOMALY_JOBS = "waiting_anomaly_jobs";
static final String CONFIGURATION = "configuration";