|
| 1 | +[role="xpack"] |
| 2 | +[testenv="platinum"] |
| 3 | +[[ml-dfa-analysis-objects]] |
| 4 | +=== Analysis configuration objects |
| 5 | + |
| 6 | +{dfanalytics-cap} resources contain `analysis` objects. For example, when you |
| 7 | +create a {dfanalytics-job}, you must define the type of analysis it performs. |
| 8 | +This page lists all the available parameters that you can use in the `analysis` |
| 9 | +object grouped by {dfanalytics} types. |
| 10 | + |
| 11 | + |
| 12 | +[discrete] |
| 13 | +[[oldetection-resources]] |
| 14 | +==== {oldetection-cap} configuration objects |
| 15 | + |
| 16 | +An `outlier_detection` configuration object has the following properties: |
| 17 | + |
| 18 | +`compute_feature_influence`:: |
| 19 | +(Optional, boolean) |
| 20 | +include::{docdir}/ml/ml-shared.asciidoc[tag=compute-feature-influence] |
| 21 | + |
| 22 | +`feature_influence_threshold`:: |
| 23 | +(Optional, double) |
| 24 | +include::{docdir}/ml/ml-shared.asciidoc[tag=feature-influence-threshold] |
| 25 | + |
| 26 | +`method`:: |
| 27 | +(Optional, string) |
| 28 | +include::{docdir}/ml/ml-shared.asciidoc[tag=method] |
| 29 | + |
| 30 | +`n_neighbors`:: |
| 31 | +(Optional, integer) |
| 32 | +include::{docdir}/ml/ml-shared.asciidoc[tag=n-neighbors] |
| 33 | + |
| 34 | +`outlier_fraction`:: |
| 35 | +(Optional, double) |
| 36 | +include::{docdir}/ml/ml-shared.asciidoc[tag=outlier-fraction] |
| 37 | + |
| 38 | +`standardization_enabled`:: |
| 39 | +(Optional, boolean) |
| 40 | +include::{docdir}/ml/ml-shared.asciidoc[tag=standardization-enabled] |
| 41 | + |
| 42 | + |
| 43 | +[discrete] |
| 44 | +[[regression-resources]] |
| 45 | +==== {regression-cap} configuration objects |
| 46 | + |
| 47 | +[source,console] |
| 48 | +-------------------------------------------------- |
| 49 | +PUT _ml/data_frame/analytics/house_price_regression_analysis |
| 50 | +{ |
| 51 | + "source": { |
| 52 | + "index": "houses_sold_last_10_yrs" <1> |
| 53 | + }, |
| 54 | + "dest": { |
| 55 | + "index": "house_price_predictions" <2> |
| 56 | + }, |
| 57 | + "analysis": |
| 58 | + { |
| 59 | + "regression": { <3> |
| 60 | + "dependent_variable": "price" <4> |
| 61 | + } |
| 62 | + } |
| 63 | +} |
| 64 | +-------------------------------------------------- |
| 65 | +// TEST[skip:TBD] |
| 66 | + |
| 67 | +<1> Training data is taken from source index `houses_sold_last_10_yrs`. |
| 68 | +<2> Analysis results will be output to destination index |
| 69 | +`house_price_predictions`. |
| 70 | +<3> The regression analysis configuration object. |
| 71 | +<4> Regression analysis uses the `price` field as the dependent variable. As |
| 72 | +no other parameters are specified, it trains on 100% of the eligible data, |
| 73 | +stores its prediction in the destination index field `price_prediction`, and |
| 74 | +uses built-in hyperparameter optimization to minimize the validation error. |
| 75 | + |
| 76 | + |
| 77 | +[discrete] |
| 78 | +[[regression-resources-standard]] |
| 79 | +===== Standard parameters |
| 80 | + |
| 81 | +`dependent_variable`:: |
| 82 | +(Required, string) |
| 83 | +include::{docdir}/ml/ml-shared.asciidoc[tag=dependent-variable] |
| 84 | ++ |
| 85 | +-- |
| 86 | +The data type of the field must be numeric. |
| 87 | +-- |
| 88 | + |
| 89 | +`prediction_field_name`:: |
| 90 | +(Optional, string) |
| 91 | +include::{docdir}/ml/ml-shared.asciidoc[tag=prediction-field-name] |
| 92 | + |
| 93 | +`training_percent`:: |
| 94 | +(Optional, integer) |
| 95 | +include::{docdir}/ml/ml-shared.asciidoc[tag=training-percent] |
| 96 | + |
| 97 | +`randomize_seed`:: |
| 98 | +(Optional, long) |
| 99 | +include::{docdir}/ml/ml-shared.asciidoc[tag=randomize-seed] |
| 100 | + |
| 101 | + |
| 102 | +[discrete] |
| 103 | +[[regression-resources-advanced]] |
| 104 | +===== Advanced parameters |
| 105 | + |
| 106 | +Advanced parameters are for fine-tuning {reganalysis}. If you do not supply |
| 107 | +these parameters, their values are set automatically by |
| 108 | +<<ml-hyperparam-optimization,hyperparameter optimization>> |
| 109 | +to give the minimum validation error. It is highly recommended to use the |
| 110 | +default values unless you fully understand the function of these |
| 111 | +parameters. |
| 112 | + |
| 113 | +`eta`:: |
| 114 | +(Optional, double) |
| 115 | +include::{docdir}/ml/ml-shared.asciidoc[tag=eta] |
| 116 | + |
| 117 | +`feature_bag_fraction`:: |
| 118 | +(Optional, double) |
| 119 | +include::{docdir}/ml/ml-shared.asciidoc[tag=feature-bag-fraction] |
| 120 | + |
| 121 | +`maximum_number_trees`:: |
| 122 | +(Optional, integer) |
| 123 | +include::{docdir}/ml/ml-shared.asciidoc[tag=maximum-number-trees] |
| 124 | + |
| 125 | +`gamma`:: |
| 126 | +(Optional, double) |
| 127 | +include::{docdir}/ml/ml-shared.asciidoc[tag=gamma] |
| 128 | + |
| 129 | +`lambda`:: |
| 130 | +(Optional, double) |
| 131 | +include::{docdir}/ml/ml-shared.asciidoc[tag=lambda] |
| 132 | + |
| 133 | + |
| 134 | +[discrete] |
| 135 | +[[classification-resources]] |
| 136 | +==== {classification-cap} configuration objects |
| 137 | + |
| 138 | + |
| 139 | +[discrete] |
| 140 | +[[classification-resources-standard]] |
| 141 | +===== Standard parameters |
| 142 | + |
| 143 | +`dependent_variable`:: |
| 144 | +(Required, string) |
| 145 | +include::{docdir}/ml/ml-shared.asciidoc[tag=dependent-variable] |
| 146 | ++ |
| 147 | +-- |
| 148 | +The data type of the field must be numeric (`integer`, `short`, `long`, `byte`), |
| 149 | +categorical (`ip`, `keyword`, `text`), or boolean. |
| 150 | +-- |
| 151 | + |
| 152 | +`num_top_classes`:: |
| 153 | +(Optional, integer) |
| 154 | +include::{docdir}/ml/ml-shared.asciidoc[tag=num-top-classes] |
| 155 | + |
| 156 | +`prediction_field_name`:: |
| 157 | +(Optional, string) |
| 158 | +include::{docdir}/ml/ml-shared.asciidoc[tag=prediction-field-name] |
| 159 | + |
| 160 | +`training_percent`:: |
| 161 | +(Optional, integer) |
| 162 | +include::{docdir}/ml/ml-shared.asciidoc[tag=training-percent] |
| 163 | + |
| 164 | +`randomize_seed`:: |
| 165 | +(Optional, long) |
| 166 | +include::{docdir}/ml/ml-shared.asciidoc[tag=randomize-seed] |
| 167 | + |
| 168 | + |
| 169 | +[discrete] |
| 170 | +[[classification-resources-advanced]] |
| 171 | +===== Advanced parameters |
| 172 | + |
| 173 | +Advanced parameters are for fine-tuning {classanalysis}. If you do not supply |
| 174 | +these parameters, their values are set automatically by |
| 175 | +<<ml-hyperparam-optimization,hyperparameter optimization>> |
| 176 | +to give the minimum validation error. It is highly recommended to use the |
| 177 | +default values unless you fully understand the function of these |
| 178 | +parameters. |
| 179 | + |
| 180 | +`eta`:: |
| 181 | +(Optional, double) |
| 182 | +include::{docdir}/ml/ml-shared.asciidoc[tag=eta] |
| 183 | + |
| 184 | +`feature_bag_fraction`:: |
| 185 | +(Optional, double) |
| 186 | +include::{docdir}/ml/ml-shared.asciidoc[tag=feature-bag-fraction] |
| 187 | + |
| 188 | +`maximum_number_trees`:: |
| 189 | +(Optional, integer) |
| 190 | +include::{docdir}/ml/ml-shared.asciidoc[tag=maximum-number-trees] |
| 191 | + |
| 192 | +`gamma`:: |
| 193 | +(Optional, double) |
| 194 | +include::{docdir}/ml/ml-shared.asciidoc[tag=gamma] |
| 195 | + |
| 196 | +`lambda`:: |
| 197 | +(Optional, double) |
| 198 | +include::{docdir}/ml/ml-shared.asciidoc[tag=lambda] |
| 199 | + |
| 200 | +[discrete] |
| 201 | +[[ml-hyperparam-optimization]] |
| 202 | +==== Hyperparameter optimization |
| 203 | + |
| 204 | +If you don't supply {regression} or {classification} parameters, hyperparameter |
| 205 | +optimization is performed by default to set a value for the undefined |
| 206 | +parameters. The starting point is calculated for data-dependent parameters by |
| 207 | +examining the loss on the training data. Subject to the size constraint, this |
| 208 | +operation provides an upper bound on the improvement in validation loss. |
| 209 | + |
| 210 | +The optimization uses a fixed number of rounds, which depends on the number of |
| 211 | +parameters being optimized. It starts with a random search, followed by |
| 212 | +Bayesian optimization that targets maximum expected improvement. If you |
| 213 | +override any parameters, the optimization uses the values you provided for |
| 214 | +the overridden parameters and calculates the values of the remaining |
| 215 | +parameters accordingly. The number of rounds is reduced correspondingly. The |
| 216 | +validation error is estimated in each round by using 4-fold cross |
| 217 | +validation. |
0 commit comments