Skip to content

Commit 26df354

Browse files
committed
Implement Rare Terms aggregation
Relates: #4001. This commit implements the rare terms aggregation. Rare terms buckets expose only key and doc_count, so a new RareTermsBucket<TKey> type is used.
1 parent ee84e1a commit 26df354

File tree

7 files changed

+266
-0
lines changed

7 files changed

+266
-0
lines changed

src/Nest/Aggregations/AggregateDictionary.cs

+26
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,20 @@ public TermsAggregate<TKey> Terms<TKey>(string key)
178178

179179
/// <summary>
/// Looks up the adjacency matrix aggregate stored under <paramref name="key"/> and returns it
/// as a multi-bucket aggregate of string-keyed buckets.
/// </summary>
/// <param name="key">The key used to look up the aggregate.</param>
/// <returns>The result of <c>GetMultiKeyedBucketAggregate&lt;string&gt;(key)</c>.</returns>
public MultiBucketAggregate<KeyedBucket<string>> AdjacencyMatrix(string key)
{
    return GetMultiKeyedBucketAggregate<string>(key);
}
180180

181+
/// <summary>
/// Looks up the rare terms aggregate stored under <paramref name="key"/> and exposes its buckets
/// as <see cref="RareTermsBucket{TKey}"/> instances.
/// </summary>
/// <typeparam name="TKey">The type each bucket key is converted to.</typeparam>
/// <param name="key">The key used to look up the aggregate.</param>
/// <returns>
/// The aggregate with its buckets materialized into a list and its meta copied across,
/// or <c>null</c> when no <see cref="BucketAggregate"/> is found for <paramref name="key"/>.
/// </returns>
public MultiBucketAggregate<RareTermsBucket<TKey>> RareTerms<TKey>(string key)
{
    var aggregate = TryGet<BucketAggregate>(key);
    if (aggregate == null)
        return null;

    // Materialize eagerly: GetRareTermsBuckets is a lazy iterator.
    var rareTermsBuckets = GetRareTermsBuckets<TKey>(aggregate.Items).ToList();

    return new MultiBucketAggregate<RareTermsBucket<TKey>>
    {
        Buckets = rareTermsBuckets,
        Meta = aggregate.Meta
    };
}
192+
193+
/// <summary>
/// Looks up the rare terms aggregate stored under <paramref name="key"/> with bucket keys typed as
/// <see cref="string"/>. Shorthand for <c>RareTerms&lt;string&gt;(key)</c>.
/// </summary>
/// <param name="key">The key used to look up the aggregate.</param>
public MultiBucketAggregate<RareTermsBucket<string>> RareTerms(string key)
{
    return RareTerms<string>(key);
}
194+
181195
/// <summary>
/// Looks up the range aggregate stored under <paramref name="key"/> as a multi-bucket aggregate
/// of <see cref="RangeBucket"/>.
/// </summary>
/// <param name="key">The key used to look up the aggregate.</param>
public MultiBucketAggregate<RangeBucket> Range(string key)
{
    return GetMultiBucketAggregate<RangeBucket>(key);
}
182196

183197
/// <summary>
/// Looks up the date range aggregate stored under <paramref name="key"/> as a multi-bucket aggregate
/// of <see cref="RangeBucket"/>.
/// </summary>
/// <param name="key">The key used to look up the aggregate.</param>
public MultiBucketAggregate<RangeBucket> DateRange(string key)
{
    return GetMultiBucketAggregate<RangeBucket>(key);
}
@@ -275,5 +289,17 @@ private IEnumerable<SignificantTermsBucket<TKey>> GetSignificantTermsBuckets<TKe
275289
Score = bucket.Score
276290
};
277291
}
292+
293+
/// <summary>
/// Projects untyped keyed buckets into <see cref="RareTermsBucket{TKey}"/> instances, converting
/// each bucket key to <typeparamref name="TKey"/>.
/// </summary>
/// <typeparam name="TKey">The type each bucket key is converted to.</typeparam>
/// <param name="items">The buckets to project; every item must be a <c>KeyedBucket&lt;object&gt;</c>.</param>
/// <returns>
/// A lazily evaluated sequence: nothing is cast or converted until the result is enumerated.
/// A missing document count is reported as <c>0</c>.
/// </returns>
private IEnumerable<RareTermsBucket<TKey>> GetRareTermsBuckets<TKey>(IEnumerable<IBucket> items)
{
    var buckets = items.Cast<KeyedBucket<object>>();

    foreach (var bucket in buckets)
        yield return new RareTermsBucket<TKey>(bucket.BackingDictionary)
        {
            // Convert with the invariant culture: the two-argument Convert.ChangeType overload uses the
            // current thread culture, which makes numeric <-> string key conversion locale-dependent
            // (e.g. 1.5 becoming "1,5").
            Key = (TKey)Convert.ChangeType(bucket.Key, typeof(TKey), System.Globalization.CultureInfo.InvariantCulture),
            DocCount = bucket.DocCount.GetValueOrDefault(0)
        };
}
278304
}
279305
}

src/Nest/Aggregations/AggregationContainer.cs

+12
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,9 @@ public interface IAggregationContainer
208208
[DataMember(Name = "range")]
209209
IRangeAggregation Range { get; set; }
210210

211+
[DataMember(Name = "rare_terms")]
212+
IRareTermsAggregation RareTerms { get; set; }
213+
211214
[DataMember(Name = "reverse_nested")]
212215
IReverseNestedAggregation ReverseNested { get; set; }
213216

@@ -339,6 +342,8 @@ public class AggregationContainer : IAggregationContainer
339342

340343
public IRangeAggregation Range { get; set; }
341344

345+
public IRareTermsAggregation RareTerms { get; set; }
346+
342347
public IReverseNestedAggregation ReverseNested { get; set; }
343348

344349
public ISamplerAggregation Sampler { get; set; }
@@ -481,6 +486,8 @@ public class AggregationContainerDescriptor<T> : DescriptorBase<AggregationConta
481486

482487
IRangeAggregation IAggregationContainer.Range { get; set; }
483488

489+
IRareTermsAggregation IAggregationContainer.RareTerms { get; set; }
490+
484491
IReverseNestedAggregation IAggregationContainer.ReverseNested { get; set; }
485492

486493
ISamplerAggregation IAggregationContainer.Sampler { get; set; }
@@ -638,6 +645,11 @@ Func<RangeAggregationDescriptor<T>, IRangeAggregation> selector
638645
) =>
639646
_SetInnerAggregation(name, selector, (a, d) => a.Range = d);
640647

648+
/// <summary>
/// Adds a rare terms aggregation named <paramref name="name"/> to this container.
/// </summary>
/// <param name="name">The name of the aggregation.</param>
/// <param name="selector">Builds the aggregation from a <see cref="RareTermsAggregationDescriptor{T}"/>.</param>
/// <returns>The result of <c>_SetInnerAggregation</c>, which is handed the name, the selector and the assignment.</returns>
public AggregationContainerDescriptor<T> RareTerms(
    string name,
    Func<RareTermsAggregationDescriptor<T>, IRareTermsAggregation> selector
)
{
    return _SetInnerAggregation(name, selector, (container, rareTerms) => container.RareTerms = rareTerms);
}
652+
641653
public AggregationContainerDescriptor<T> Stats(string name,
642654
Func<StatsAggregationDescriptor<T>, IStatsAggregation> selector
643655
) =>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.Linq.Expressions;
4+
using System.Runtime.Serialization;
5+
using Elasticsearch.Net.Utf8Json;
6+
7+
namespace Nest
8+
{
9+
/// <summary>
10+
/// A multi-bucket value source based aggregation which finds "rare" terms — terms that are at the long-tail of the distribution
11+
/// and are not frequent. Conceptually, this is like a terms aggregation that is sorted by _count ascending.
12+
/// </summary>
13+
[InterfaceDataContract, ReadAs(typeof(RareTermsAggregation))]
public interface IRareTermsAggregation : IBucketAggregation
{
    /// <summary>
    /// The terms to leave out of the aggregation. Serialized as <c>exclude</c>.
    /// </summary>
    [DataMember(Name = "exclude")]
    TermsExclude Exclude { get; set; }

    /// <summary>
    /// The field in which to look for rare terms. Serialized as <c>field</c>.
    /// </summary>
    [DataMember(Name = "field")]
    Field Field { get; set; }

    /// <summary>
    /// The terms to keep in the aggregation. Serialized as <c>include</c>.
    /// </summary>
    [DataMember(Name = "include")]
    TermsInclude Include { get; set; }

    /// <summary>
    /// The upper bound on the number of documents a term may appear in and still count as rare.
    /// Defaults to <c>1</c>. Serialized as <c>max_doc_count</c>.
    /// </summary>
    [DataMember(Name = "max_doc_count")]
    long? MaximumDocumentCount { get; set; }

    /// <summary>
    /// The value to use for documents that do not have the aggregated field.
    /// Serialized as <c>missing</c>.
    /// </summary>
    [DataMember(Name = "missing")]
    object Missing { get; set; }

    /// <summary>
    /// The precision of the internal CuckooFilters. A smaller precision gives a better approximation
    /// at the cost of higher memory usage. Cannot be smaller than 0.00001. Defaults to 0.01.
    /// Serialized as <c>precision</c>.
    /// </summary>
    [DataMember(Name = "precision")]
    double? Precision { get; set; }
}
55+
56+
/// <inheritdoc cref="IRareTermsAggregation"/>
57+
public class RareTermsAggregation : BucketAggregationBase, IRareTermsAggregation
{
    /// <summary>
    /// Creates an instance without a name. Only accessible from within this assembly.
    /// </summary>
    internal RareTermsAggregation() { }

    /// <summary>
    /// Creates a rare terms aggregation with the given name.
    /// </summary>
    /// <param name="name">The aggregation name, passed on to the base constructor.</param>
    public RareTermsAggregation(string name)
        : base(name)
    {
    }

    /// <inheritdoc />
    public TermsExclude Exclude { get; set; }

    /// <inheritdoc />
    public Field Field { get; set; }

    /// <inheritdoc />
    public TermsInclude Include { get; set; }

    /// <inheritdoc />
    public long? MaximumDocumentCount { get; set; }

    /// <inheritdoc />
    public object Missing { get; set; }

    /// <inheritdoc />
    public double? Precision { get; set; }

    /// <summary>
    /// Stores this aggregation in the <c>RareTerms</c> property of the given container.
    /// </summary>
    internal override void WrapInContainer(AggregationContainer c)
    {
        c.RareTerms = this;
    }
}
78+
79+
/// <inheritdoc cref="IRareTermsAggregation"/>
80+
public class RareTermsAggregationDescriptor<T>
    : BucketAggregationDescriptorBase<RareTermsAggregationDescriptor<T>, IRareTermsAggregation, T>, IRareTermsAggregation
    where T : class
{
    // Backing state, reachable only through the IRareTermsAggregation interface.
    TermsExclude IRareTermsAggregation.Exclude { get; set; }
    Field IRareTermsAggregation.Field { get; set; }
    TermsInclude IRareTermsAggregation.Include { get; set; }
    long? IRareTermsAggregation.MaximumDocumentCount { get; set; }
    object IRareTermsAggregation.Missing { get; set; }
    double? IRareTermsAggregation.Precision { get; set; }

    /// <inheritdoc cref="IRareTermsAggregation.Field" />
    public RareTermsAggregationDescriptor<T> Field(Field field)
    {
        return Assign(field, (agg, value) => agg.Field = value);
    }

    /// <inheritdoc cref="IRareTermsAggregation.Field" />
    public RareTermsAggregationDescriptor<T> Field<TValue>(Expression<Func<T, TValue>> field)
    {
        return Assign(field, (agg, value) => agg.Field = value);
    }

    /// <inheritdoc cref="IRareTermsAggregation.MaximumDocumentCount" />
    public RareTermsAggregationDescriptor<T> MaximumDocumentCount(long? maximumDocumentCount)
    {
        return Assign(maximumDocumentCount, (agg, value) => agg.MaximumDocumentCount = value);
    }

    /// <inheritdoc cref="IRareTermsAggregation.Include" />
    public RareTermsAggregationDescriptor<T> Include(long partition, long numberOfPartitions)
    {
        var include = new TermsInclude(partition, numberOfPartitions);
        return Assign(include, (agg, value) => agg.Include = value);
    }

    /// <inheritdoc cref="IRareTermsAggregation.Include" />
    public RareTermsAggregationDescriptor<T> Include(string includePattern)
    {
        var include = new TermsInclude(includePattern);
        return Assign(include, (agg, value) => agg.Include = value);
    }

    /// <inheritdoc cref="IRareTermsAggregation.Include" />
    public RareTermsAggregationDescriptor<T> Include(IEnumerable<string> values)
    {
        var include = new TermsInclude(values);
        return Assign(include, (agg, value) => agg.Include = value);
    }

    /// <inheritdoc cref="IRareTermsAggregation.Exclude" />
    public RareTermsAggregationDescriptor<T> Exclude(string excludePattern)
    {
        var exclude = new TermsExclude(excludePattern);
        return Assign(exclude, (agg, value) => agg.Exclude = value);
    }

    /// <inheritdoc cref="IRareTermsAggregation.Exclude" />
    public RareTermsAggregationDescriptor<T> Exclude(IEnumerable<string> values)
    {
        var exclude = new TermsExclude(values);
        return Assign(exclude, (agg, value) => agg.Exclude = value);
    }

    /// <inheritdoc cref="IRareTermsAggregation.Missing" />
    public RareTermsAggregationDescriptor<T> Missing(object missing)
    {
        return Assign(missing, (agg, value) => agg.Missing = value);
    }

    /// <inheritdoc cref="IRareTermsAggregation.Precision" />
    public RareTermsAggregationDescriptor<T> Precision(double? precision)
    {
        return Assign(precision, (agg, value) => agg.Precision = value);
    }
}
127+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
using System.Collections.Generic;
2+
3+
namespace Nest
4+
{
5+
/// <summary>
/// A bucket of a rare terms aggregation. Beyond what <see cref="BucketBase"/> provides,
/// it carries only a key and a document count.
/// </summary>
/// <typeparam name="TKey">The type of the bucket key.</typeparam>
public class RareTermsBucket<TKey> : BucketBase
{
    /// <summary>
    /// Creates a bucket over the given keyed aggregates, which are passed on to <see cref="BucketBase"/>.
    /// </summary>
    /// <param name="dict">The aggregates, keyed by name, handed to the base constructor.</param>
    public RareTermsBucket(IReadOnlyDictionary<string, IAggregate> dict)
        : base(dict)
    {
    }

    /// <summary>
    /// The document count of this bucket.
    /// </summary>
    public long DocCount { get; set; }

    /// <summary>
    /// The key of this bucket.
    /// </summary>
    public TKey Key { get; set; }
}
13+
}

src/Nest/Aggregations/Visitor/AggregationVisitor.cs

+4
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,8 @@ public interface IAggregationVisitor
7474

7575
/// <summary>
/// Visits a range aggregation.
/// </summary>
/// <param name="aggregation">The aggregation being visited.</param>
void Visit(IRangeAggregation aggregation);
7676

77+
/// <summary>
/// Visits a rare terms aggregation.
/// </summary>
/// <param name="aggregation">The aggregation being visited.</param>
void Visit(IRareTermsAggregation aggregation);
78+
7779
/// <summary>
/// Visits a terms aggregation.
/// </summary>
/// <param name="aggregation">The aggregation being visited.</param>
void Visit(ITermsAggregation aggregation);
7880

7981
/// <summary>
/// Visits a significant terms aggregation.
/// </summary>
/// <param name="aggregation">The aggregation being visited.</param>
void Visit(ISignificantTermsAggregation aggregation);
@@ -201,6 +203,8 @@ public virtual void Visit(ISignificantTermsAggregation aggregation) { }
201203

202204
/// <summary>
/// Visits a range aggregation. This implementation does nothing; override it to act on the aggregation.
/// </summary>
/// <param name="aggregation">The aggregation being visited.</param>
public virtual void Visit(IRangeAggregation aggregation) { }
203205

206+
/// <summary>
/// Visits a rare terms aggregation. This implementation does nothing; override it to act on the aggregation.
/// </summary>
/// <param name="aggregation">The aggregation being visited.</param>
public virtual void Visit(IRareTermsAggregation aggregation) { }
207+
204208
/// <summary>
/// Visits a nested aggregation. This implementation does nothing; override it to act on the aggregation.
/// </summary>
/// <param name="aggregation">The aggregation being visited.</param>
public virtual void Visit(INestedAggregation aggregation) { }
205209

206210
/// <summary>
/// Visits a parent aggregation. This implementation does nothing; override it to act on the aggregation.
/// </summary>
/// <param name="aggregation">The aggregation being visited.</param>
public virtual void Visit(IParentAggregation aggregation) { }

src/Nest/Aggregations/Visitor/AggregationWalker.cs

+5
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,11 @@ public void Walk(IAggregationContainer aggregation, IAggregationVisitor visitor)
128128
v.Visit(d);
129129
Accept(v, d.Aggregations);
130130
});
131+
AcceptAggregation(aggregation.RareTerms, visitor, (v, d) =>
132+
{
133+
v.Visit(d);
134+
Accept(v, d.Aggregations);
135+
});
131136
AcceptAggregation(aggregation.ReverseNested, visitor, (v, d) =>
132137
{
133138
v.Visit(d);
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using FluentAssertions;
4+
using Nest;
5+
using Tests.Core.Extensions;
6+
using Tests.Core.ManagedElasticsearch.Clusters;
7+
using Tests.Domain;
8+
using Tests.Framework.EndpointTests.TestState;
9+
10+
namespace Tests.Aggregations.Bucket.RareTerms
11+
{
12+
/**
13+
* A multi-bucket value source based aggregation which finds "rare" terms — terms that are at the long-tail of the
14+
* distribution and are not frequent. Conceptually, this is like a terms aggregation that is sorted by _count ascending.
15+
* As noted in the terms aggregation docs, actually ordering a terms agg by count ascending has unbounded error.
16+
* Instead, you should use the rare_terms aggregation.
17+
*
18+
* See the Elasticsearch documentation on {ref_current}/search-aggregations-bucket-rare-terms-aggregation.html[rare terms aggregation] for more detail.
19+
*/
20+
public class RareTermsAggregationUsageTests : AggregationUsageTestBase
{
    public RareTermsAggregationUsageTests(ReadOnlyCluster i, EndpointUsage usage)
        : base(i, usage)
    {
    }

    // The JSON the "names" aggregation is expected to serialize to.
    protected override object AggregationJson => new
    {
        names = new
        {
            meta = new
            {
                foo = "bar"
            },
            rare_terms = new
            {
                field = "name",
                max_doc_count = 5,
                missing = "n/a",
                precision = 0.001
            }
        }
    };

    // The same aggregation expressed with the fluent descriptor API.
    protected override Func<AggregationContainerDescriptor<Project>, IAggregationContainer> FluentAggs => aggs => aggs
        .RareTerms("names", rare => rare
            .Field(p => p.Name)
            .Missing("n/a")
            .MaximumDocumentCount(5)
            .Precision(0.001)
            .Meta(meta => meta
                .Add("foo", "bar")
            )
        );

    // The same aggregation expressed with the object initializer API.
    protected override AggregationDictionary InitializerAggs =>
        new RareTermsAggregation("names")
        {
            Field = Infer.Field<Project>(p => p.Name),
            MaximumDocumentCount = 5,
            Precision = 0.001,
            Missing = "n/a",
            Meta = new Dictionary<string, object> { ["foo"] = "bar" }
        };

    protected override void ExpectResponse(ISearchResponse<Project> response)
    {
        response.ShouldBeValid();

        var aggregate = response.Aggregations.RareTerms("names");
        aggregate.Should().NotBeNull();

        // At least one bucket is expected, each with a non-empty key and a count of one or more.
        var buckets = aggregate.Buckets;
        buckets.Should().NotBeNull();
        buckets.Count.Should().BeGreaterThan(0);
        foreach (var bucket in buckets)
        {
            bucket.Key.Should().NotBeNullOrEmpty();
            bucket.DocCount.Should().BeGreaterOrEqualTo(1);
        }

        // The meta sent with the request should be echoed back unchanged.
        var meta = aggregate.Meta;
        meta.Should().NotBeNull().And.HaveCount(1);
        meta["foo"].Should().Be("bar");
    }
}
79+
}

0 commit comments

Comments
 (0)