Skip to content

Commit 273c35f

Browse files
authored
Add Cumulative Cardinality agg (and Data Science plugin) (#43661)
This adds a pipeline aggregation that calculates the cumulative cardinality of a field. It does this by iteratively merging in the HLL sketch from consecutive buckets and emitting the cardinality up to that point. This is useful for things like finding the total "new" users that have visited a website (as opposed to "repeat" visitors). This is a Basic+ aggregation and adds a new Data Science plugin to house it and future advanced analytics/data science aggregations.
1 parent f4703be commit 273c35f

30 files changed

+1653
-4
lines changed

distribution/build.gradle

+1
Original file line numberDiff line numberDiff line change
@@ -428,6 +428,7 @@ task run(type: RunTask) {
428428
setting 'xpack.monitoring.enabled', 'true'
429429
setting 'xpack.sql.enabled', 'true'
430430
setting 'xpack.rollup.enabled', 'true'
431+
setting 'xpack.data-science.enabled', 'true'
431432
keystoreSetting 'bootstrap.password', 'password'
432433
}
433434
}

docs/build.gradle

+36
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,42 @@ buildRestTests.setups['sales'] = '''
218218
{"index":{}}
219219
{"date": "2015/03/01 00:00:00", "price": 175, "promoted": false, "rating": 2, "type": "t-shirt"}'''
220220

221+
// Used by cumulative cardinality aggregation docs
222+
buildRestTests.setups['user_hits'] = '''
223+
- do:
224+
indices.create:
225+
index: user_hits
226+
body:
227+
settings:
228+
number_of_shards: 1
229+
number_of_replicas: 0
230+
mappings:
231+
properties:
232+
user_id:
233+
type: keyword
234+
timestamp:
235+
type: date
236+
- do:
237+
bulk:
238+
index: user_hits
239+
refresh: true
240+
body: |
241+
{"index":{}}
242+
{"timestamp": "2019-01-01T13:00:00", "user_id": "1"}
243+
{"index":{}}
244+
{"timestamp": "2019-01-01T13:00:00", "user_id": "2"}
245+
{"index":{}}
246+
{"timestamp": "2019-01-02T13:00:00", "user_id": "1"}
247+
{"index":{}}
248+
{"timestamp": "2019-01-02T13:00:00", "user_id": "3"}
249+
{"index":{}}
250+
{"timestamp": "2019-01-03T13:00:00", "user_id": "1"}
251+
{"index":{}}
252+
{"timestamp": "2019-01-03T13:00:00", "user_id": "2"}
253+
{"index":{}}
254+
{"timestamp": "2019-01-03T13:00:00", "user_id": "4"}'''
255+
256+
221257
// Dummy bank account data used by getting-started.asciidoc
222258
buildRestTests.setups['bank'] = '''
223259
- do:
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,235 @@
1+
[role="xpack"]
2+
[testenv="basic"]
3+
[[search-aggregations-pipeline-cumulative-cardinality-aggregation]]
4+
=== Cumulative Cardinality Aggregation
5+
6+
A parent pipeline aggregation which calculates the Cumulative Cardinality in a parent histogram (or date_histogram)
7+
aggregation. The specified metric must be a cardinality aggregation and the enclosing histogram
8+
must have `min_doc_count` set to `0` (default for `histogram` aggregations).
9+
10+
The `cumulative_cardinality` agg is useful for finding "total new items", like the number of new visitors to your
11+
website each day. A regular cardinality aggregation will tell you how many unique visitors came each day, but doesn't
12+
differentiate between "new" or "repeat" visitors. The Cumulative Cardinality aggregation can be used to determine
13+
how many of each day's unique visitors are "new".
14+
15+
==== Syntax
16+
17+
A `cumulative_cardinality` aggregation looks like this in isolation:
18+
19+
[source,js]
20+
--------------------------------------------------
21+
{
22+
"cumulative_cardinality": {
23+
"buckets_path": "my_cardinality_agg"
24+
}
25+
}
26+
--------------------------------------------------
27+
// NOTCONSOLE
28+
29+
[[cumulative-cardinality-params]]
30+
.`cumulative_cardinality` Parameters
31+
[options="header"]
32+
|===
33+
|Parameter Name |Description |Required |Default Value
34+
|`buckets_path` |The path to the cardinality aggregation we wish to find the cumulative cardinality for (see <<buckets-path-syntax>> for more
35+
details) |Required |
36+
|`format` |format to apply to the output value of this aggregation |Optional |`null`
37+
|===
38+
39+
The following snippet calculates the cumulative cardinality of the total daily `users`:
40+
41+
[source,js]
42+
--------------------------------------------------
43+
GET /user_hits/_search
44+
{
45+
"size": 0,
46+
"aggs" : {
47+
"users_per_day" : {
48+
"date_histogram" : {
49+
"field" : "timestamp",
50+
"calendar_interval" : "day"
51+
},
52+
"aggs": {
53+
"distinct_users": {
54+
"cardinality": {
55+
"field": "user_id"
56+
}
57+
},
58+
"total_new_users": {
59+
"cumulative_cardinality": {
60+
"buckets_path": "distinct_users" <1>
61+
}
62+
}
63+
}
64+
}
65+
}
66+
}
67+
--------------------------------------------------
68+
// CONSOLE
69+
// TEST[setup:user_hits]
70+
71+
<1> `buckets_path` instructs this aggregation to use the output of the `distinct_users` aggregation for the cumulative cardinality
72+
73+
And the following may be the response:
74+
75+
[source,js]
76+
--------------------------------------------------
77+
{
78+
"took": 11,
79+
"timed_out": false,
80+
"_shards": ...,
81+
"hits": ...,
82+
"aggregations": {
83+
"users_per_day": {
84+
"buckets": [
85+
{
86+
"key_as_string": "2019-01-01T00:00:00.000Z",
87+
"key": 1546300800000,
88+
"doc_count": 2,
89+
"distinct_users": {
90+
"value": 2
91+
},
92+
"total_new_users": {
93+
"value": 2
94+
}
95+
},
96+
{
97+
"key_as_string": "2019-01-02T00:00:00.000Z",
98+
"key": 1546387200000,
99+
"doc_count": 2,
100+
"distinct_users": {
101+
"value": 2
102+
},
103+
"total_new_users": {
104+
"value": 3
105+
}
106+
},
107+
{
108+
"key_as_string": "2019-01-03T00:00:00.000Z",
109+
"key": 1546473600000,
110+
"doc_count": 3,
111+
"distinct_users": {
112+
"value": 3
113+
},
114+
"total_new_users": {
115+
"value": 4
116+
}
117+
}
118+
]
119+
}
120+
}
121+
}
122+
--------------------------------------------------
123+
// TESTRESPONSE[s/"took": 11/"took": $body.took/]
124+
// TESTRESPONSE[s/"_shards": \.\.\./"_shards": $body._shards/]
125+
// TESTRESPONSE[s/"hits": \.\.\./"hits": $body.hits/]
126+
127+
128+
Note how the second day, `2019-01-02`, has two distinct users but the `total_new_users` metric generated by the
129+
cumulative pipeline agg only increments to three. This means that only one of the two users that day were
130+
new, the other had already been seen in the previous day. This happens again on the third day, where only
131+
one of three users is completely new.
132+
133+
==== Incremental cumulative cardinality
134+
135+
The `cumulative_cardinality` agg will show you the total, distinct count since the beginning of the time period
136+
being queried. Sometimes, however, it is useful to see the "incremental" count. Meaning, how many new users
137+
are added each day, rather than the total cumulative count.
138+
139+
This can be accomplished by adding a `derivative` aggregation to our query:
140+
141+
[source,js]
142+
--------------------------------------------------
143+
GET /user_hits/_search
144+
{
145+
"size": 0,
146+
"aggs" : {
147+
"users_per_day" : {
148+
"date_histogram" : {
149+
"field" : "timestamp",
150+
"calendar_interval" : "day"
151+
},
152+
"aggs": {
153+
"distinct_users": {
154+
"cardinality": {
155+
"field": "user_id"
156+
}
157+
},
158+
"total_new_users": {
159+
"cumulative_cardinality": {
160+
"buckets_path": "distinct_users"
161+
}
162+
},
163+
"incremental_new_users": {
164+
"derivative": {
165+
"buckets_path": "total_new_users"
166+
}
167+
}
168+
}
169+
}
170+
}
171+
}
172+
--------------------------------------------------
173+
// CONSOLE
174+
// TEST[setup:user_hits]
175+
176+
177+
And the following may be the response:
178+
179+
[source,js]
180+
--------------------------------------------------
181+
{
182+
"took": 11,
183+
"timed_out": false,
184+
"_shards": ...,
185+
"hits": ...,
186+
"aggregations": {
187+
"users_per_day": {
188+
"buckets": [
189+
{
190+
"key_as_string": "2019-01-01T00:00:00.000Z",
191+
"key": 1546300800000,
192+
"doc_count": 2,
193+
"distinct_users": {
194+
"value": 2
195+
},
196+
"total_new_users": {
197+
"value": 2
198+
}
199+
},
200+
{
201+
"key_as_string": "2019-01-02T00:00:00.000Z",
202+
"key": 1546387200000,
203+
"doc_count": 2,
204+
"distinct_users": {
205+
"value": 2
206+
},
207+
"total_new_users": {
208+
"value": 3
209+
},
210+
"incremental_new_users": {
211+
"value": 1.0
212+
}
213+
},
214+
{
215+
"key_as_string": "2019-01-03T00:00:00.000Z",
216+
"key": 1546473600000,
217+
"doc_count": 3,
218+
"distinct_users": {
219+
"value": 3
220+
},
221+
"total_new_users": {
222+
"value": 4
223+
},
224+
"incremental_new_users": {
225+
"value": 1.0
226+
}
227+
}
228+
]
229+
}
230+
}
231+
}
232+
--------------------------------------------------
233+
// TESTRESPONSE[s/"took": 11/"took": $body.took/]
234+
// TESTRESPONSE[s/"_shards": \.\.\./"_shards": $body._shards/]
235+
// TESTRESPONSE[s/"hits": \.\.\./"hits": $body.hits/]

docs/reference/rest-api/info.asciidoc

+4
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,10 @@ Example response:
7171
"available" : true,
7272
"enabled" : true
7373
},
74+
"data_science" : {
75+
"available" : true,
76+
"enabled" : true
77+
},
7478
"flattened" : {
7579
"available" : true,
7680
"enabled" : true

server/src/main/java/org/elasticsearch/search/aggregations/metrics/InternalCardinality.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ public long getValue() {
8080
return counts == null ? 0 : counts.cardinality(0);
8181
}
8282

83-
HyperLogLogPlusPlus getCounts() {
83+
public HyperLogLogPlusPlus getCounts() {
8484
return counts;
8585
}
8686

x-pack/plugin/core/src/main/java/org/elasticsearch/license/XPackLicenseState.java

+12
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,9 @@ public class XPackLicenseState {
7070
"Creating and Starting rollup jobs will no longer be allowed.",
7171
"Stopping/Deleting existing jobs, RollupCaps API and RollupSearch continue to function."
7272
});
73+
messages.put(XPackField.DATA_SCIENCE, new String[] {
74+
"Aggregations provided by Data Science plugin are no longer usable."
75+
});
7376
EXPIRATION_MESSAGES = Collections.unmodifiableMap(messages);
7477
}
7578

@@ -744,6 +747,15 @@ public boolean isSpatialAllowed() {
744747
return localStatus.active;
745748
}
746749

750+
/**
751+
* Datascience is always available as long as there is a valid license
752+
*
753+
* @return true if the license is active
754+
*/
755+
public synchronized boolean isDataScienceAllowed() {
756+
return status.active;
757+
}
758+
747759
public synchronized boolean isTrialLicense() {
748760
return status.mode == OperationMode.TRIAL;
749761
}

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/XPackClientPlugin.java

+4-1
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
import org.elasticsearch.xpack.core.dataframe.transforms.DataFrameTransformState;
5050
import org.elasticsearch.xpack.core.dataframe.transforms.SyncConfig;
5151
import org.elasticsearch.xpack.core.dataframe.transforms.TimeSyncConfig;
52+
import org.elasticsearch.xpack.core.datascience.DataScienceFeatureSetUsage;
5253
import org.elasticsearch.xpack.core.deprecation.DeprecationInfoAction;
5354
import org.elasticsearch.xpack.core.flattened.FlattenedFeatureSetUsage;
5455
import org.elasticsearch.xpack.core.frozen.FrozenIndicesFeatureSetUsage;
@@ -509,7 +510,9 @@ public List<NamedWriteableRegistry.Entry> getNamedWriteables() {
509510
// Frozen indices
510511
new NamedWriteableRegistry.Entry(XPackFeatureSet.Usage.class, XPackField.FROZEN_INDICES, FrozenIndicesFeatureSetUsage::new),
511512
// Spatial
512-
new NamedWriteableRegistry.Entry(XPackFeatureSet.Usage.class, XPackField.SPATIAL, SpatialFeatureSetUsage::new)
513+
new NamedWriteableRegistry.Entry(XPackFeatureSet.Usage.class, XPackField.SPATIAL, SpatialFeatureSetUsage::new),
514+
// data science
515+
new NamedWriteableRegistry.Entry(XPackFeatureSet.Usage.class, XPackField.DATA_SCIENCE, DataScienceFeatureSetUsage::new)
513516
);
514517
}
515518

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/XPackField.java

+2
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,8 @@ public final class XPackField {
4747
public static final String FROZEN_INDICES = "frozen_indices";
4848
/** Name constant for spatial features. */
4949
public static final String SPATIAL = "spatial";
50+
/** Name constant for the data science plugin. */
51+
public static final String DATA_SCIENCE = "data_science";
5052

5153
private XPackField() {}
5254

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/XPackSettings.java

+4
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,10 @@ private XPackSettings() {
121121
/** Setting for enabling or disabling vectors. Defaults to true. */
122122
public static final Setting<Boolean> VECTORS_ENABLED = Setting.boolSetting("xpack.vectors.enabled", true, Setting.Property.NodeScope);
123123

124+
/** Setting for enabling or disabling data science plugin. Defaults to true. */
125+
public static final Setting<Boolean> DATA_SCIENCE_ENABLED = Setting.boolSetting("xpack.datascience.enabled",
126+
true, Setting.Property.NodeScope);
127+
124128
/*
125129
* SSL settings. These are the settings that are specifically registered for SSL. Many are private as we do not explicitly use them
126130
* but instead parse based on a prefix (eg *.ssl.*)

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/action/XPackInfoFeatureAction.java

+2-1
Original file line numberDiff line numberDiff line change
@@ -38,10 +38,11 @@ public class XPackInfoFeatureAction extends ActionType<XPackInfoFeatureResponse>
3838
public static final XPackInfoFeatureAction VOTING_ONLY = new XPackInfoFeatureAction(XPackField.VOTING_ONLY);
3939
public static final XPackInfoFeatureAction FROZEN_INDICES = new XPackInfoFeatureAction(XPackField.FROZEN_INDICES);
4040
public static final XPackInfoFeatureAction SPATIAL = new XPackInfoFeatureAction(XPackField.SPATIAL);
41+
public static final XPackInfoFeatureAction DATA_SCIENCE = new XPackInfoFeatureAction(XPackField.DATA_SCIENCE);
4142

4243
public static final List<XPackInfoFeatureAction> ALL = Arrays.asList(
4344
SECURITY, MONITORING, WATCHER, GRAPH, MACHINE_LEARNING, LOGSTASH, SQL, ROLLUP, INDEX_LIFECYCLE, CCR, DATA_FRAME, FLATTENED,
44-
VECTORS, VOTING_ONLY, FROZEN_INDICES, SPATIAL
45+
VECTORS, VOTING_ONLY, FROZEN_INDICES, SPATIAL, DATA_SCIENCE
4546
);
4647

4748
private XPackInfoFeatureAction(String name) {

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/action/XPackUsageFeatureAction.java

+2-1
Original file line numberDiff line numberDiff line change
@@ -38,10 +38,11 @@ public class XPackUsageFeatureAction extends ActionType<XPackUsageFeatureRespons
3838
public static final XPackUsageFeatureAction VOTING_ONLY = new XPackUsageFeatureAction(XPackField.VOTING_ONLY);
3939
public static final XPackUsageFeatureAction FROZEN_INDICES = new XPackUsageFeatureAction(XPackField.FROZEN_INDICES);
4040
public static final XPackUsageFeatureAction SPATIAL = new XPackUsageFeatureAction(XPackField.SPATIAL);
41+
public static final XPackUsageFeatureAction DATA_SCIENCE = new XPackUsageFeatureAction(XPackField.DATA_SCIENCE);
4142

4243
public static final List<XPackUsageFeatureAction> ALL = Arrays.asList(
4344
SECURITY, MONITORING, WATCHER, GRAPH, MACHINE_LEARNING, LOGSTASH, SQL, ROLLUP, INDEX_LIFECYCLE, CCR, DATA_FRAME, FLATTENED,
44-
VECTORS, VOTING_ONLY, FROZEN_INDICES, SPATIAL
45+
VECTORS, VOTING_ONLY, FROZEN_INDICES, SPATIAL, DATA_SCIENCE
4546
);
4647

4748
private XPackUsageFeatureAction(String name) {

0 commit comments

Comments
 (0)