diff --git a/docs/reference/transform/index.asciidoc b/docs/reference/transform/index.asciidoc index 595cbdef56d98..bdf506c6f695b 100644 --- a/docs/reference/transform/index.asciidoc +++ b/docs/reference/transform/index.asciidoc @@ -15,6 +15,7 @@ your data. * <> * <> * <> +* <> * <> * <> @@ -24,5 +25,6 @@ include::checkpoints.asciidoc[] include::api-quickref.asciidoc[] include::ecommerce-tutorial.asciidoc[] include::examples.asciidoc[] +include::painless-examples.asciidoc[] include::troubleshooting.asciidoc[] include::limitations.asciidoc[] \ No newline at end of file diff --git a/docs/reference/transform/painless-examples.asciidoc b/docs/reference/transform/painless-examples.asciidoc new file mode 100644 index 0000000000000..8a2a4ec7386e8 --- /dev/null +++ b/docs/reference/transform/painless-examples.asciidoc @@ -0,0 +1,329 @@ +[role="xpack"] +[testenv="basic"] +[[transform-painless-examples]] +=== Painless examples for {transforms} +++++ +Painless examples for {transforms} +++++ + +These examples demonstrate how to use Painless in {transforms}. You can learn +more about the Painless scripting language in the +{painless}/painless-guide.html[Painless guide]. + +* <> +* <> +* <> +* <> + + +[discrete] +[[painless-top-hits]] +==== Getting top hits by using scripted metric + +This snippet shows how to find the latest document, in other words the document +with the latest timestamp. From a technical perspective, it helps to achieve +the function of a <> by using +scripted metric aggregation which provides a metric output. 
+ +[source,js] +-------------------------------------------------- +"latest_doc": { + "scripted_metric": { + "init_script": "state.timestamp_latest = 0L; state.last_doc = ''", <1> + "map_script": """ <2> + def current_date = doc['@timestamp'].getValue().toInstant().toEpochMilli(); + if (current_date > state.timestamp_latest) + {state.timestamp_latest = current_date; + state.last_doc = new HashMap(params['_source']);} + """, + "combine_script": "return state", <3> + "reduce_script": """ <4> + def last_doc = ''; + def timestamp_latest = 0L; + for (s in states) {if (s.timestamp_latest > (timestamp_latest)) + {timestamp_latest = s.timestamp_latest; last_doc = s.last_doc;}} + return last_doc + """ + } +} +-------------------------------------------------- +// NOTCONSOLE + +<1> The `init_script` creates a long type `timestamp_latest` and a string type +`last_doc` in the `state` object. +<2> The `map_script` defines `current_date` based on the timestamp of the +document, then compares `current_date` with `state.timestamp_latest`, finally +returns `state.last_doc` from the shard. By using `new HashMap(...)` we copy the +source document, this is important whenever you want to pass the full source +object from one phase to the next. +<3> The `combine_script` returns `state` from each shard. +<4> The `reduce_script` iterates through the value of `s.timestamp_latest` +returned by each shard and returns the document with the latest timestamp +(`last_doc`). In the response, the top hit (in other words, the `latest_doc`) is +nested below the `latest_doc` field. + +Check the +<> +for detailed explanation on the respective scripts. 
+ +You can retrieve the last value in a similar way: + +[source,js] +-------------------------------------------------- +"latest_value": { + "scripted_metric": { + "init_script": "state.timestamp_latest = 0L; state.last_value = ''", + "map_script": """ + def current_date = doc['date'].getValue().toInstant().toEpochMilli(); + if (current_date > state.timestamp_latest) + {state.timestamp_latest = current_date; + state.last_value = params['_source']['value'];} + """, + "combine_script": "return state", + "reduce_script": """ + def last_value = ''; + def timestamp_latest = 0L; + for (s in states) {if (s.timestamp_latest > (timestamp_latest)) + {timestamp_latest = s.timestamp_latest; last_value = s.last_value;}} + return last_value + """ + } +} +-------------------------------------------------- +// NOTCONSOLE + + +[discrete] +[[painless-time-features]] +==== Getting time features as scripted fields + +This snippet shows how to extract time based features by using Painless. The +snippet uses an index where `@timestamp` is defined as a `date` type field. + +[source,js] +-------------------------------------------------- +"script_fields": { + "hour_of_day": { <1> + "script": { + "lang": "painless", + "source": """ + ZonedDateTime date = doc['@timestamp'].value; <2> + return date.getHour(); <3> + """ + } + }, + "month_of_year": { <4> + "script": { + "lang": "painless", + "source": """ + ZonedDateTime date = doc['@timestamp'].value; <5> + return date.getMonthValue(); <6> + """ + } + } + } +-------------------------------------------------- +// NOTCONSOLE + +<1> Contains the Painless script that returns the hour of the day. +<2> Sets `date` based on the timestamp of the document. +<3> Returns the hour value from `date`. +<4> Contains the Painless script that returns the month of the year. +<5> Sets `date` based on the timestamp of the document. +<6> Returns the month value from `date`. 
+ + +[discrete] +[[painless-group-by]] +==== Using Painless in `group_by` + +It is possible to base the `group_by` property of a {transform} on the output of +a script. The following example uses the {kib} sample web logs dataset. The goal +here is to make the {transform} output easier to understand through normalizing +the value of the fields that the data is grouped by. + +[source,console] +-------------------------------------------------- +POST _transform/_preview +{ + "source": { + "index": [ <1> + "kibana_sample_data_logs" + ] + }, + "pivot": { + "group_by": { + "agent": { + "terms": { + "script": { <2> + "source": """String agent = doc['agent.keyword'].value; + if (agent.contains("MSIE")) { + return "internet explorer"; + } else if (agent.contains("AppleWebKit")) { + return "safari"; + } else if (agent.contains('Firefox')) { + return "firefox"; + } else { return agent }""", + "lang": "painless" + } + } + } + }, + "aggregations": { <3> + "200": { + "filter": { + "term": { + "response": "200" + } + } + }, + "404": { + "filter": { + "term": { + "response": "404" + } + } + }, + "503": { + "filter": { + "term": { + "response": "503" + } + } + } + } + }, + "dest": { <4> + "index": "pivot_logs" + } +} +-------------------------------------------------- +// TEST[skip:setup kibana sample data] + +<1> Specifies the source index or indices. +<2> The script defines an `agent` string based on the `agent` field of the +documents, then iterates through the values. If an `agent` field contains +"MSIE", then the script returns "internet explorer". If it contains +`AppleWebKit`, it returns "safari". It returns "firefox" if the field value +contains "Firefox". Finally, in every other case, the value of the field is +returned. +<3> The aggregations object contains filters that narrow down the results to +documents that contain `200`, `404`, or `503` values in the `response` field. +<4> Specifies the destination index of the {transform}. 
+ +The API returns the following result: + +[source,js] +-------------------------------------------------- +{ + "preview" : [ + { + "agent" : "firefox", + "200" : 4931, + "404" : 259, + "503" : 172 + }, + { + "agent" : "internet explorer", + "200" : 3674, + "404" : 210, + "503" : 126 + }, + { + "agent" : "safari", + "200" : 4227, + "404" : 332, + "503" : 143 + } + ], + "mappings" : { + "properties" : { + "200" : { + "type" : "long" + }, + "agent" : { + "type" : "keyword" + }, + "404" : { + "type" : "long" + }, + "503" : { + "type" : "long" + } + } + } +} +-------------------------------------------------- +// NOTCONSOLE + +You can see that the `agent` values are simplified so it is easier to interpret +them. The table below shows how normalization modifies the output of the +{transform} in our example compared to the non-normalized values. + +[width="50%"] + +|=== +| Non-normalized `agent` value | Normalized `agent` value + +| "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)" | "internet explorer" +| "Mozilla/5.0 (X11; Linux i686) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.696.50 Safari/534.24" | "safari" +| "Mozilla/5.0 (X11; Linux x86_64; rv:6.0a1) Gecko/20110421 Firefox/6.0a1" | "firefox" +|=== + + +[discrete] +[[painless-bucket-script]] +==== Getting duration by using bucket script + +This example shows you how to get the duration of a session by client IP from a +data log by using +{ref}/search-aggregations-pipeline-bucket-script-aggregation.html[bucket script]. +The example uses the {kib} sample web logs dataset. 
+ +[source,console] +-------------------------------------------------- +PUT _data_frame/transforms/data_log +{ + "source": { + "index": "kibana_sample_data_logs" + }, + "dest": { + "index": "data-logs-by-client" + }, + "pivot": { + "group_by": { + "machine.os": {"terms": {"field": "machine.os.keyword"}}, + "machine.ip": {"terms": {"field": "clientip"}} + }, + "aggregations": { + "time_frame.lte": { + "max": { + "field": "timestamp" + } + }, + "time_frame.gte": { + "min": { + "field": "timestamp" + } + }, + "time_length": { <1> + "bucket_script": { + "buckets_path": { <2> + "min": "time_frame.gte.value", + "max": "time_frame.lte.value" + }, + "script": "params.max - params.min" <3> + } + } + } + } +} +-------------------------------------------------- +// TEST[skip:setup kibana sample data] + +<1> To define the length of the sessions, we use a bucket script. +<2> The bucket path is a map of script variables and their associated path to +the buckets you want to use for the variable. In this particular case, `min` and +`max` are variables mapped to `time_frame.gte.value` and `time_frame.lte.value`. +<3> Finally, the script subtracts the start date of the session from the end +date which results in the duration of the session.