
Commit 4a280a6

Add Cumulative Cardinality agg (and Data Science plugin) (elastic#43661)
This adds a pipeline aggregation that calculates the cumulative cardinality of a field. It does this by iteratively merging in the HLL sketch from consecutive buckets and emitting the cardinality up to that point. This is useful for things like finding the total number of "new" users that have visited a website (as opposed to "repeat" visitors). This is a Basic+ aggregation and adds a new Data Science plugin to house it and future advanced analytics/data science aggregations.
1 parent 5fbb572 commit 4a280a6
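
To make the mechanism described in the commit message concrete, here is a minimal, illustrative Java sketch of the idea: keep a running union of each bucket's distinct values and report its size after every bucket. A HashSet stands in for the HyperLogLog++ sketch that the real aggregation merges, and the class and variable names are hypothetical, not part of the plugin.

import java.util.*;

// Minimal sketch of cumulative cardinality, using the sample data from the
// docs added in this commit. A HashSet stands in for the HyperLogLog++ sketch
// that the real aggregation merges bucket-by-bucket; all names here are
// illustrative, not the plugin's actual API.
public class CumulativeCardinalitySketch {
    public static void main(String[] args) {
        // Per-day user_id values, mirroring the user_hits test fixture below.
        Map<String, List<String>> usersPerDay = new LinkedHashMap<>();
        usersPerDay.put("2019-01-01", Arrays.asList("1", "2"));
        usersPerDay.put("2019-01-02", Arrays.asList("1", "3"));
        usersPerDay.put("2019-01-03", Arrays.asList("1", "2", "4"));

        Set<String> runningUnion = new HashSet<>();   // plays the role of the merged HLL sketch
        for (Map.Entry<String, List<String>> day : usersPerDay.entrySet()) {
            long distinctToday = new HashSet<>(day.getValue()).size();  // per-bucket cardinality
            runningUnion.addAll(day.getValue());                        // "merge" this bucket's values
            long cumulative = runningUnion.size();                      // cardinality up to this point
            System.out.printf("%s distinct_users=%d total_new_users=%d%n",
                    day.getKey(), distinctToday, cumulative);
        }
        // Expected output: 2/2, 2/3, 3/4 -- matching the documented response below.
    }
}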

File tree

26 files changed: +1580 −2 lines


distribution/build.gradle (+1)

@@ -425,6 +425,7 @@ task run(type: RunTask) {
     setting 'xpack.monitoring.enabled', 'true'
     setting 'xpack.sql.enabled', 'true'
     setting 'xpack.rollup.enabled', 'true'
+    setting 'xpack.data-science.enabled', 'true'
     keystoreSetting 'bootstrap.password', 'password'
   }
 }

docs/build.gradle (+36)

@@ -218,6 +218,42 @@ buildRestTests.setups['sales'] = '''
   {"index":{}}
   {"date": "2015/03/01 00:00:00", "price": 175, "promoted": false, "rating": 2, "type": "t-shirt"}'''
 
+// Used by cumulative cardinality aggregation docs
+buildRestTests.setups['user_hits'] = '''
+  - do:
+      indices.create:
+        index: user_hits
+        body:
+          settings:
+            number_of_shards: 1
+            number_of_replicas: 0
+          mappings:
+            properties:
+              user_id:
+                type: keyword
+              timestamp:
+                type: date
+  - do:
+      bulk:
+        index: user_hits
+        refresh: true
+        body: |
+          {"index":{}}
+          {"timestamp": "2019-01-01T13:00:00", "user_id": "1"}
+          {"index":{}}
+          {"timestamp": "2019-01-01T13:00:00", "user_id": "2"}
+          {"index":{}}
+          {"timestamp": "2019-01-02T13:00:00", "user_id": "1"}
+          {"index":{}}
+          {"timestamp": "2019-01-02T13:00:00", "user_id": "3"}
+          {"index":{}}
+          {"timestamp": "2019-01-03T13:00:00", "user_id": "1"}
+          {"index":{}}
+          {"timestamp": "2019-01-03T13:00:00", "user_id": "2"}
+          {"index":{}}
+          {"timestamp": "2019-01-03T13:00:00", "user_id": "4"}'''
+
+
 // Dummy bank account data used by getting-started.asciidoc
 buildRestTests.setups['bank'] = '''
   - do:
(new file, +235)

@@ -0,0 +1,235 @@
[role="xpack"]
[testenv="basic"]
[[search-aggregations-pipeline-cumulative-cardinality-aggregation]]
=== Cumulative Cardinality Aggregation

A parent pipeline aggregation which calculates the Cumulative Cardinality in a parent histogram (or date_histogram)
aggregation. The specified metric must be a cardinality aggregation and the enclosing histogram
must have `min_doc_count` set to `0` (the default for `histogram` aggregations).

The `cumulative_cardinality` agg is useful for finding "total new items", like the number of new visitors to your
website each day. A regular cardinality aggregation will tell you how many unique visitors came each day, but it doesn't
differentiate between "new" and "repeat" visitors. The Cumulative Cardinality aggregation can be used to determine
how many of each day's unique visitors are "new".

==== Syntax

A `cumulative_cardinality` aggregation looks like this in isolation:

[source,js]
--------------------------------------------------
{
  "cumulative_cardinality": {
    "buckets_path": "my_cardinality_agg"
  }
}
--------------------------------------------------
// NOTCONSOLE

[[cumulative-cardinality-params]]
.`cumulative_cardinality` Parameters
[options="header"]
|===
|Parameter Name |Description |Required |Default Value
|`buckets_path` |The path to the cardinality aggregation we wish to find the cumulative cardinality for (see <<buckets-path-syntax>> for more
details) |Required |
|`format` |Format to apply to the output value of this aggregation |Optional |`null`
|===

The following snippet calculates the cumulative cardinality of the total daily `users`:

[source,js]
--------------------------------------------------
GET /user_hits/_search
{
    "size": 0,
    "aggs" : {
        "users_per_day" : {
            "date_histogram" : {
                "field" : "timestamp",
                "calendar_interval" : "day"
            },
            "aggs": {
                "distinct_users": {
                    "cardinality": {
                        "field": "user_id"
                    }
                },
                "total_new_users": {
                    "cumulative_cardinality": {
                        "buckets_path": "distinct_users" <1>
                    }
                }
            }
        }
    }
}
--------------------------------------------------
// CONSOLE
// TEST[setup:user_hits]

<1> `buckets_path` instructs this aggregation to use the output of the `distinct_users` aggregation for the cumulative cardinality

And the following may be the response:

[source,js]
--------------------------------------------------
{
   "took": 11,
   "timed_out": false,
   "_shards": ...,
   "hits": ...,
   "aggregations": {
      "users_per_day": {
         "buckets": [
            {
               "key_as_string": "2019-01-01T00:00:00.000Z",
               "key": 1546300800000,
               "doc_count": 2,
               "distinct_users": {
                  "value": 2
               },
               "total_new_users": {
                  "value": 2
               }
            },
            {
               "key_as_string": "2019-01-02T00:00:00.000Z",
               "key": 1546387200000,
               "doc_count": 2,
               "distinct_users": {
                  "value": 2
               },
               "total_new_users": {
                  "value": 3
               }
            },
            {
               "key_as_string": "2019-01-03T00:00:00.000Z",
               "key": 1546473600000,
               "doc_count": 3,
               "distinct_users": {
                  "value": 3
               },
               "total_new_users": {
                  "value": 4
               }
            }
         ]
      }
   }
}
--------------------------------------------------
// TESTRESPONSE[s/"took": 11/"took": $body.took/]
// TESTRESPONSE[s/"_shards": \.\.\./"_shards": $body._shards/]
// TESTRESPONSE[s/"hits": \.\.\./"hits": $body.hits/]


Note how the second day, `2019-01-02`, has two distinct users but the `total_new_users` metric generated by the
cumulative pipeline agg only increments to three. This means that only one of the two users that day was
new; the other had already been seen on the previous day. This happens again on the third day, where only
one of the three users is completely new.

==== Incremental cumulative cardinality

The `cumulative_cardinality` agg will show you the total, distinct count since the beginning of the time period
being queried. Sometimes, however, it is useful to see the "incremental" count, that is, how many new users
are added each day rather than the total cumulative count.

This can be accomplished by adding a `derivative` aggregation to our query:

[source,js]
--------------------------------------------------
GET /user_hits/_search
{
    "size": 0,
    "aggs" : {
        "users_per_day" : {
            "date_histogram" : {
                "field" : "timestamp",
                "calendar_interval" : "day"
            },
            "aggs": {
                "distinct_users": {
                    "cardinality": {
                        "field": "user_id"
                    }
                },
                "total_new_users": {
                    "cumulative_cardinality": {
                        "buckets_path": "distinct_users"
                    }
                },
                "incremental_new_users": {
                    "derivative": {
                        "buckets_path": "total_new_users"
                    }
                }
            }
        }
    }
}
--------------------------------------------------
// CONSOLE
// TEST[setup:user_hits]


And the following may be the response:

[source,js]
--------------------------------------------------
{
   "took": 11,
   "timed_out": false,
   "_shards": ...,
   "hits": ...,
   "aggregations": {
      "users_per_day": {
         "buckets": [
            {
               "key_as_string": "2019-01-01T00:00:00.000Z",
               "key": 1546300800000,
               "doc_count": 2,
               "distinct_users": {
                  "value": 2
               },
               "total_new_users": {
                  "value": 2
               }
            },
            {
               "key_as_string": "2019-01-02T00:00:00.000Z",
               "key": 1546387200000,
               "doc_count": 2,
               "distinct_users": {
                  "value": 2
               },
               "total_new_users": {
                  "value": 3
               },
               "incremental_new_users": {
                  "value": 1.0
               }
            },
            {
               "key_as_string": "2019-01-03T00:00:00.000Z",
               "key": 1546473600000,
               "doc_count": 3,
               "distinct_users": {
                  "value": 3
               },
               "total_new_users": {
                  "value": 4
               },
               "incremental_new_users": {
                  "value": 1.0
               }
            }
         ]
      }
   }
}
--------------------------------------------------
// TESTRESPONSE[s/"took": 11/"took": $body.took/]
// TESTRESPONSE[s/"_shards": \.\.\./"_shards": $body._shards/]
// TESTRESPONSE[s/"hits": \.\.\./"hits": $body.hits/]
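
As a quick check of the incremental example above, the derivative step reduces to a first difference over the cumulative series. The sketch below is illustrative only (the method name is made up); it reproduces the 1.0 values in the documented response.

// Hedged illustration of the "incremental new users" step: the derivative
// pipeline is simply a first difference over the cumulative cardinality
// series. Values mirror the documented response; the method name is
// illustrative, not part of Elasticsearch.
import java.util.Arrays;

public class IncrementalNewUsers {
    static double[] firstDifference(long[] cumulative) {
        double[] delta = new double[cumulative.length - 1];
        for (int i = 1; i < cumulative.length; i++) {
            delta[i - 1] = cumulative[i] - cumulative[i - 1];
        }
        return delta;
    }

    public static void main(String[] args) {
        long[] totalNewUsers = {2, 3, 4};   // cumulative_cardinality per day, as in the response above
        System.out.println(Arrays.toString(firstDifference(totalNewUsers)));
        // Prints [1.0, 1.0]: one genuinely new user on each of the second and third days.
    }
}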

docs/reference/rest-api/info.asciidoc (+4)

@@ -71,6 +71,10 @@ Example response:
     "available" : true,
     "enabled" : true
   },
+  "data_science" : {
+    "available" : true,
+    "enabled" : true
+  },
   "flattened" : {
     "available" : true,
     "enabled" : true

server/src/main/java/org/elasticsearch/search/aggregations/metrics/InternalCardinality.java (+1 -1)

@@ -80,7 +80,7 @@ public long getValue() {
         return counts == null ? 0 : counts.cardinality(0);
     }
 
-    HyperLogLogPlusPlus getCounts() {
+    public HyperLogLogPlusPlus getCounts() {
         return counts;
     }
 
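The visibility change above is what lets code outside this package, such as the new pipeline aggregation, read each bucket's HLL sketch via getCounts() and fold it into a running sketch. The following is a hedged sketch of that reduce loop; SketchLike, BucketCardinality, and their methods are illustrative stand-ins, not the real HyperLogLogPlusPlus or pipeline aggregator API.

import java.util.List;

// Hedged sketch of why getCounts() needs to be public: a pipeline reducer in
// another package pulls each bucket's sketch and folds it into a running one.
// SketchLike, BucketCardinality, and the method names below are illustrative
// stand-ins, not the real HyperLogLogPlusPlus or pipeline aggregator API.
public class CumulativeReduceSketch {

    interface SketchLike {
        void merge(SketchLike other);   // union this sketch with another
        long cardinality();             // estimated distinct count so far
        SketchLike copy();              // an independent copy of this sketch's contents
    }

    /** One histogram bucket's cardinality result, exposing its sketch (cf. getCounts()). */
    static final class BucketCardinality {
        private final SketchLike counts;
        BucketCardinality(SketchLike counts) { this.counts = counts; }
        public SketchLike getCounts() { return counts; }
    }

    /** Emit the cumulative cardinality after each bucket, in bucket order. */
    static long[] cumulativeCardinality(List<BucketCardinality> buckets) {
        long[] result = new long[buckets.size()];
        SketchLike running = null;
        for (int i = 0; i < buckets.size(); i++) {
            SketchLike bucketSketch = buckets.get(i).getCounts();
            if (running == null) {
                running = bucketSketch.copy();   // start from a copy of the first bucket's sketch
            } else {
                running.merge(bucketSketch);     // union in the next bucket's sketch
            }
            result[i] = running.cardinality();   // cumulative distinct count up to this bucket
        }
        return result;
    }
}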
x-pack/plugin/core/src/main/java/org/elasticsearch/license/XPackLicenseState.java (+12)

@@ -70,6 +70,9 @@ public class XPackLicenseState {
             "Creating and Starting rollup jobs will no longer be allowed.",
             "Stopping/Deleting existing jobs, RollupCaps API and RollupSearch continue to function."
         });
+        messages.put(XPackField.DATA_SCIENCE, new String[] {
+            "Aggregations provided by Data Science plugin are no longer usable."
+        });
         EXPIRATION_MESSAGES = Collections.unmodifiableMap(messages);
     }
 
@@ -744,6 +747,15 @@ public boolean isSpatialAllowed() {
         return localStatus.active;
     }
 
+    /**
+     * Datascience is always available as long as there is a valid license
+     *
+     * @return true if the license is active
+     */
+    public synchronized boolean isDataScienceAllowed() {
+        return status.active;
+    }
+
     public synchronized boolean isTrialLicense() {
         return status.mode == OperationMode.TRIAL;
     }
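
One plausible way the new license check could be consumed is a guard that rejects data science aggregations once the license lapses. The wrapper interface and exception below are hypothetical; only the isDataScienceAllowed() name comes from the diff above.

// Hedged sketch of consuming the new license check: refuse to run the
// aggregation when the license has lapsed. The LicenseChecker wrapper and the
// exception text are illustrative; only isDataScienceAllowed() mirrors the
// diff above.
public class DataScienceLicenseGuard {

    /** Minimal stand-in for the slice of XPackLicenseState used here. */
    interface LicenseChecker {
        boolean isDataScienceAllowed();
    }

    private final LicenseChecker licenseState;

    public DataScienceLicenseGuard(LicenseChecker licenseState) {
        this.licenseState = licenseState;
    }

    /** Throw before running a data-science aggregation if the license does not allow it. */
    public void ensureAllowed() {
        if (licenseState.isDataScienceAllowed() == false) {
            throw new IllegalStateException(
                "current license is non-compliant for [data_science] aggregations");
        }
    }
}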

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/XPackClientPlugin.java (+4 -1)

@@ -55,6 +55,7 @@
 import org.elasticsearch.xpack.core.dataframe.transforms.DataFrameTransformState;
 import org.elasticsearch.xpack.core.dataframe.transforms.SyncConfig;
 import org.elasticsearch.xpack.core.dataframe.transforms.TimeSyncConfig;
+import org.elasticsearch.xpack.core.datascience.DataScienceFeatureSetUsage;
 import org.elasticsearch.xpack.core.deprecation.DeprecationInfoAction;
 import org.elasticsearch.xpack.core.flattened.FlattenedFeatureSetUsage;
 import org.elasticsearch.xpack.core.frozen.FrozenIndicesFeatureSetUsage;

@@ -544,7 +545,9 @@ public List<NamedWriteableRegistry.Entry> getNamedWriteables() {
                 // Frozen indices
                 new NamedWriteableRegistry.Entry(XPackFeatureSet.Usage.class, XPackField.FROZEN_INDICES, FrozenIndicesFeatureSetUsage::new),
                 // Spatial
-                new NamedWriteableRegistry.Entry(XPackFeatureSet.Usage.class, XPackField.SPATIAL, SpatialFeatureSetUsage::new)
+                new NamedWriteableRegistry.Entry(XPackFeatureSet.Usage.class, XPackField.SPATIAL, SpatialFeatureSetUsage::new),
+                // data science
+                new NamedWriteableRegistry.Entry(XPackFeatureSet.Usage.class, XPackField.DATA_SCIENCE, DataScienceFeatureSetUsage::new)
         );
     }
 
x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/XPackField.java (+2)

@@ -47,6 +47,8 @@ public final class XPackField {
     public static final String FROZEN_INDICES = "frozen_indices";
     /** Name constant for spatial features. */
     public static final String SPATIAL = "spatial";
+    /** Name constant for the data science plugin. */
+    public static final String DATA_SCIENCE = "data_science";
 
     private XPackField() {}
 
x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/XPackSettings.java (+5)

@@ -126,6 +126,10 @@ private XPackSettings() {
     /** Setting for enabling or disabling vectors. Defaults to true. */
     public static final Setting<Boolean> VECTORS_ENABLED = Setting.boolSetting("xpack.vectors.enabled", true, Setting.Property.NodeScope);
 
+    /** Setting for enabling or disabling data science plugin. Defaults to true. */
+    public static final Setting<Boolean> DATA_SCIENCE_ENABLED = Setting.boolSetting("xpack.data-science.enabled",
+        true, Setting.Property.NodeScope);
+
     public static final List<String> DEFAULT_SUPPORTED_PROTOCOLS;
 
     static {

@@ -258,6 +262,7 @@ public static List<Setting<?>> getAllSettings() {
         settings.add(DATA_FRAME_ENABLED);
         settings.add(FLATTENED_ENABLED);
         settings.add(VECTORS_ENABLED);
+        settings.add(DATA_SCIENCE_ENABLED);
         return Collections.unmodifiableList(settings);
     }
 
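The new setting follows the standard Setting<Boolean> pattern, so a component can read it from the node's Settings and skip registration when it is disabled. The sketch below is an assumption about how a consumer might use it; only the setting definition mirrors the diff above.

import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.common.settings.Settings;

// Hedged sketch of consuming the new node setting: read the Boolean flag from
// the node's Settings and skip registration when it is disabled. The
// registration framing is illustrative; only the setting definition mirrors
// the diff above.
public class DataScienceSettingCheck {

    // Same shape as XPackSettings.DATA_SCIENCE_ENABLED in the diff above.
    static final Setting<Boolean> DATA_SCIENCE_ENABLED =
        Setting.boolSetting("xpack.data-science.enabled", true, Setting.Property.NodeScope);

    public static void main(String[] args) {
        Settings nodeSettings = Settings.builder()
            .put("xpack.data-science.enabled", false)   // e.g. set in elasticsearch.yml
            .build();

        if (DATA_SCIENCE_ENABLED.get(nodeSettings)) {
            System.out.println("registering data science aggregations");
        } else {
            System.out.println("xpack.data-science.enabled=false: skipping registration");
        }
    }
}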