Skip to content

Commit 511baca

Browse files
Add limits for ngram and shingle settings (#27411)
Create index-level settings: max_ngram_diff - maximum allowed difference between max_gram and min_gram in NGramTokenFilter/NGramTokenizer. Default is 1. max_shingle_diff - maximum allowed difference between max_shingle_size and min_shingle_size in ShingleTokenFilter. Default is 3. Log a warning when trying to create NGramTokenFilter, NGramTokenizer, ShingleTokenFilter where difference between max_size and min_size exceeds the setting's value. Closes #25887
1 parent 10ac2fd commit 511baca

File tree

15 files changed

+122
-0
lines changed

15 files changed

+122
-0
lines changed

core/src/main/java/org/elasticsearch/common/settings/IndexScopedSettings.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,8 @@ public final class IndexScopedSettings extends AbstractScopedSettings {
114114
IndexSettings.MAX_INNER_RESULT_WINDOW_SETTING,
115115
IndexSettings.MAX_DOCVALUE_FIELDS_SEARCH_SETTING,
116116
IndexSettings.MAX_SCRIPT_FIELDS_SETTING,
117+
IndexSettings.MAX_NGRAM_DIFF_SETTING,
118+
IndexSettings.MAX_SHINGLE_DIFF_SETTING,
117119
IndexSettings.MAX_RESCORE_WINDOW_SETTING,
118120
IndexSettings.MAX_ADJACENCY_MATRIX_FILTERS_SETTING,
119121
IndexSettings.INDEX_TRANSLOG_SYNC_INTERVAL_SETTING,

core/src/main/java/org/elasticsearch/index/IndexSettings.java

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,27 @@ public final class IndexSettings {
120120
*/
121121
public static final Setting<Integer> MAX_INNER_RESULT_WINDOW_SETTING =
122122
Setting.intSetting("index.max_inner_result_window", 100, 1, Property.Dynamic, Property.IndexScope);
123+
124+
/**
125+
* Index setting describing, for NGramTokenizer and NGramTokenFilter,
126+
* the maximum difference between
127+
* max_gram (maximum length of characters in a gram) and
128+
* min_gram (minimum length of characters in a gram).
129+
* The default value is 1 as this is the default difference in NGramTokenizer,
130+
* and is defensive as it prevents generating too many index terms.
131+
*/
132+
public static final Setting<Integer> MAX_NGRAM_DIFF_SETTING =
133+
Setting.intSetting("index.max_ngram_diff", 1, 0, Property.Dynamic, Property.IndexScope);
134+
135+
/**
136+
* Index setting describing, for ShingleTokenFilter,
137+
* the maximum difference between
138+
* max_shingle_size and min_shingle_size.
139+
* The default value of 3 is defensive as it prevents generating too many tokens.
140+
*/
141+
public static final Setting<Integer> MAX_SHINGLE_DIFF_SETTING =
142+
Setting.intSetting("index.max_shingle_diff", 3, 0, Property.Dynamic, Property.IndexScope);
143+
123144
/**
124145
* Index setting describing the maximum value of allowed `docvalue_fields`that can be retrieved
125146
* per search request. The default maximum of 100 is defensive for the reason that retrieving
@@ -252,6 +273,8 @@ public final class IndexSettings {
252273
private volatile int maxRescoreWindow;
253274
private volatile int maxDocvalueFields;
254275
private volatile int maxScriptFields;
276+
private volatile int maxNgramDiff;
277+
private volatile int maxShingleDiff;
255278
private volatile boolean TTLPurgeDisabled;
256279
/**
257280
* The maximum number of refresh listeners allows on this shard.
@@ -355,6 +378,8 @@ public IndexSettings(final IndexMetaData indexMetaData, final Settings nodeSetti
355378
maxRescoreWindow = scopedSettings.get(MAX_RESCORE_WINDOW_SETTING);
356379
maxDocvalueFields = scopedSettings.get(MAX_DOCVALUE_FIELDS_SEARCH_SETTING);
357380
maxScriptFields = scopedSettings.get(MAX_SCRIPT_FIELDS_SETTING);
381+
maxNgramDiff = scopedSettings.get(MAX_NGRAM_DIFF_SETTING);
382+
maxShingleDiff = scopedSettings.get(MAX_SHINGLE_DIFF_SETTING);
358383
TTLPurgeDisabled = scopedSettings.get(INDEX_TTL_DISABLE_PURGE_SETTING);
359384
maxRefreshListeners = scopedSettings.get(MAX_REFRESH_LISTENERS_PER_SHARD);
360385
maxSlicesPerScroll = scopedSettings.get(MAX_SLICES_PER_SCROLL);
@@ -386,6 +411,8 @@ public IndexSettings(final IndexMetaData indexMetaData, final Settings nodeSetti
386411
scopedSettings.addSettingsUpdateConsumer(MAX_RESCORE_WINDOW_SETTING, this::setMaxRescoreWindow);
387412
scopedSettings.addSettingsUpdateConsumer(MAX_DOCVALUE_FIELDS_SEARCH_SETTING, this::setMaxDocvalueFields);
388413
scopedSettings.addSettingsUpdateConsumer(MAX_SCRIPT_FIELDS_SETTING, this::setMaxScriptFields);
414+
scopedSettings.addSettingsUpdateConsumer(MAX_NGRAM_DIFF_SETTING, this::setMaxNgramDiff);
415+
scopedSettings.addSettingsUpdateConsumer(MAX_SHINGLE_DIFF_SETTING, this::setMaxShingleDiff);
389416
scopedSettings.addSettingsUpdateConsumer(INDEX_WARMER_ENABLED_SETTING, this::setEnableWarmer);
390417
scopedSettings.addSettingsUpdateConsumer(INDEX_GC_DELETES_SETTING, this::setGCDeletes);
391418
scopedSettings.addSettingsUpdateConsumer(INDEX_TRANSLOG_FLUSH_THRESHOLD_SIZE_SETTING, this::setTranslogFlushThresholdSize);
@@ -654,6 +681,20 @@ private void setMaxDocvalueFields(int maxDocvalueFields) {
654681
this.maxDocvalueFields = maxDocvalueFields;
655682
}
656683

684+
/**
685+
* Returns the maximum allowed difference between max and min length of ngram
686+
*/
687+
public int getMaxNgramDiff() { return this.maxNgramDiff; }
688+
689+
/**
 * Stores a new value for the maximum allowed ngram length difference.
 */
private void setMaxNgramDiff(int maxNgramDiff) {
    this.maxNgramDiff = maxNgramDiff;
}
690+
691+
/**
692+
* Returns the maximum allowed difference between max and min shingle_size
693+
*/
694+
public int getMaxShingleDiff() { return this.maxShingleDiff; }
695+
696+
/**
 * Stores a new value for the maximum allowed shingle size difference.
 */
private void setMaxShingleDiff(int maxShingleDiff) {
    this.maxShingleDiff = maxShingleDiff;
}
697+
657698
/**
658699
* Returns the maximum number of allowed script_fields to retrieve in a search request
659700
*/

core/src/main/java/org/elasticsearch/index/analysis/NGramTokenizerFactory.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,8 +84,14 @@ static CharMatcher parseTokenChars(List<String> characterClasses) {
8484

8585
public NGramTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
8686
super(indexSettings, name, settings);
87+
int maxAllowedNgramDiff = indexSettings.getMaxNgramDiff();
8788
this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
8889
this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
90+
int ngramDiff = maxGram - minGram;
91+
if (ngramDiff > maxAllowedNgramDiff) {
92+
deprecationLogger.deprecated("Deprecated big difference between max_gram and min_gram in NGram Tokenizer,"
93+
+ "expected difference must be less than or equal to: [" + maxAllowedNgramDiff + "]");
94+
}
8995
this.matcher = parseTokenChars(settings.getAsList("token_chars"));
9096
}
9197

core/src/main/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactory.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,17 @@ public class ShingleTokenFilterFactory extends AbstractTokenFilterFactory {
3232

3333
public ShingleTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
3434
super(indexSettings, name, settings);
35+
int maxAllowedShingleDiff = indexSettings.getMaxShingleDiff();
3536
Integer maxShingleSize = settings.getAsInt("max_shingle_size", ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
3637
Integer minShingleSize = settings.getAsInt("min_shingle_size", ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE);
3738
Boolean outputUnigrams = settings.getAsBooleanLenientForPreEs6Indices(indexSettings.getIndexVersionCreated(), "output_unigrams", true, deprecationLogger);
3839
Boolean outputUnigramsIfNoShingles = settings.getAsBooleanLenientForPreEs6Indices(indexSettings.getIndexVersionCreated(), "output_unigrams_if_no_shingles", false, deprecationLogger);
40+
41+
int shingleDiff = maxShingleSize - minShingleSize + (outputUnigrams ? 1 : 0);
42+
if (shingleDiff > maxAllowedShingleDiff) {
43+
deprecationLogger.deprecated("Deprecated big difference between maxShingleSize and minShingleSize in Shingle TokenFilter,"
44+
+ "expected difference must be less than or equal to: [" + maxAllowedShingleDiff + "]");
45+
}
3946
String tokenSeparator = settings.get("token_separator", ShingleFilter.DEFAULT_TOKEN_SEPARATOR);
4047
String fillerToken = settings.get("filler_token", ShingleFilter.DEFAULT_FILLER_TOKEN);
4148
factory = new Factory("shingle", minShingleSize, maxShingleSize, outputUnigrams, outputUnigramsIfNoShingles, tokenSeparator, fillerToken);

core/src/test/java/org/elasticsearch/search/query/SearchQueryIT.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
import org.elasticsearch.common.settings.Settings;
3131
import org.elasticsearch.common.xcontent.XContentFactory;
3232
import org.elasticsearch.common.xcontent.XContentType;
33+
import org.elasticsearch.index.IndexSettings;
3334
import org.elasticsearch.index.query.BoolQueryBuilder;
3435
import org.elasticsearch.index.query.MatchQueryBuilder;
3536
import org.elasticsearch.index.query.MultiMatchQueryBuilder;
@@ -1802,6 +1803,7 @@ public void testSearchEmptyDoc() {
18021803
public void testNGramCopyField() {
18031804
CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder()
18041805
.put(indexSettings())
1806+
.put(IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey(), 9)
18051807
.put("index.analysis.analyzer.my_ngram_analyzer.type", "custom")
18061808
.put("index.analysis.analyzer.my_ngram_analyzer.tokenizer", "my_ngram_tokenizer")
18071809
.put("index.analysis.tokenizer.my_ngram_tokenizer.type", "nGram")

core/src/test/java/org/elasticsearch/search/suggest/SuggestSearchIT.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
import org.elasticsearch.common.settings.Settings;
2929
import org.elasticsearch.common.xcontent.XContentBuilder;
3030
import org.elasticsearch.common.xcontent.XContentFactory;
31+
import org.elasticsearch.index.IndexSettings;
3132
import org.elasticsearch.plugins.Plugin;
3233
import org.elasticsearch.plugins.ScriptPlugin;
3334
import org.elasticsearch.script.ScriptContext;
@@ -683,6 +684,7 @@ public void testDifferentShardSize() throws Exception {
683684
public void testShardFailures() throws IOException, InterruptedException {
684685
CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder()
685686
.put(indexSettings())
687+
.put(IndexSettings.MAX_SHINGLE_DIFF_SETTING.getKey(), 4)
686688
.put("index.analysis.analyzer.suggest.tokenizer", "standard")
687689
.putList("index.analysis.analyzer.suggest.filter", "standard", "lowercase", "shingler")
688690
.put("index.analysis.filter.shingler.type", "shingle")
@@ -743,6 +745,7 @@ public void testEmptyShards() throws IOException, InterruptedException {
743745
endObject();
744746
assertAcked(prepareCreate("test").setSettings(Settings.builder()
745747
.put(indexSettings())
748+
.put(IndexSettings.MAX_SHINGLE_DIFF_SETTING.getKey(), 4)
746749
.put("index.analysis.analyzer.suggest.tokenizer", "standard")
747750
.putList("index.analysis.analyzer.suggest.filter", "standard", "lowercase", "shingler")
748751
.put("index.analysis.filter.shingler.type", "shingle")

docs/reference/analysis/tokenfilters/ngram-tokenfilter.asciidoc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,6 @@ type:
1313
|`max_gram` |Defaults to `2`.
1414
|============================
1515

16+
The index level setting `index.max_ngram_diff` controls the maximum allowed
17+
difference between `max_gram` and `min_gram`.
18+

docs/reference/analysis/tokenfilters/shingle-tokenfilter.asciidoc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,3 +38,5 @@ used if the position increment is greater than one when a `stop` filter is used
3838
together with the `shingle` filter. Defaults to `"_"`
3939
|=======================================================================
4040

41+
The index level setting `index.max_shingle_diff` controls the maximum allowed
42+
difference between `max_shingle_size` and `min_shingle_size`.

docs/reference/analysis/tokenizers/ngram-tokenizer.asciidoc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,9 @@ value. The smaller the length, the more documents will match but the lower
198198
the quality of the matches. The longer the length, the more specific the
199199
matches. A tri-gram (length `3`) is a good place to start.
200200

201+
The index level setting `index.max_ngram_diff` controls the maximum allowed
202+
difference between `max_gram` and `min_gram`.
203+
201204
[float]
202205
=== Example configuration
203206

docs/reference/index-modules.asciidoc

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,16 @@ specific index module:
144144
The maximum number of `script_fields` that are allowed in a query.
145145
Defaults to `32`.
146146

147+
`index.max_ngram_diff`::
148+
149+
The maximum allowed difference between `max_gram` and `min_gram` for NGramTokenizer and NGramTokenFilter.
150+
Defaults to `1`.
151+
152+
`index.max_shingle_diff`::
153+
154+
The maximum allowed difference between `max_shingle_size` and `min_shingle_size` for ShingleTokenFilter.
155+
Defaults to `3`.
156+
147157
`index.blocks.read_only`::
148158

149159
Set to `true` to make the index and index metadata read only, `false` to

docs/reference/migration/migrate_6_0/indices.asciidoc

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,3 +73,19 @@ has been removed in index expressions.
7373
Translog files are now kept for up to 12 hours (by default), with a maximum size of `512mb` (default), and
7474
are no longer deleted on `flush`. This is to increase the chance of doing an operation based recovery when
7575
bringing up replicas up to speed.
76+
77+
==== Limit to the difference between max_gram and min_gram in NGramTokenFilter and NGramTokenizer
78+
79+
To safeguard against creating too many index terms, the difference between
80+
`max_gram` and `min_gram` in NGramTokenFilter and NGramTokenizer has been
81+
limited to 1. This default limit can be changed with the index setting
82+
index.max_ngram_diff. Note that if the limit is exceeded a deprecation
83+
warning is logged.
84+
85+
==== Limit to the difference between max_shingle_size and min_shingle_size in ShingleTokenFilter
86+
87+
To safeguard against creating too many tokens, the difference between
88+
max_shingle_size and min_shingle_size in ShingleTokenFilter has been
89+
limited to 3. This default limit can be changed with the index setting
90+
index.max_shingle_diff. Note that if the limit is exceeded a deprecation
91+
warning will be logged.

modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/NGramTokenFilterFactory.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
2828

2929

30+
3031
public class NGramTokenFilterFactory extends AbstractTokenFilterFactory {
3132

3233
private final int minGram;
@@ -36,8 +37,14 @@ public class NGramTokenFilterFactory extends AbstractTokenFilterFactory {
3637

3738
NGramTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
3839
super(indexSettings, name, settings);
40+
int maxAllowedNgramDiff = indexSettings.getMaxNgramDiff();
3941
this.minGram = settings.getAsInt("min_gram", NGramTokenFilter.DEFAULT_MIN_NGRAM_SIZE);
4042
this.maxGram = settings.getAsInt("max_gram", NGramTokenFilter.DEFAULT_MAX_NGRAM_SIZE);
43+
int ngramDiff = maxGram - minGram;
44+
if (ngramDiff > maxAllowedNgramDiff) {
45+
deprecationLogger.deprecated("Deprecated big difference between max_gram and min_gram in NGram Tokenizer,"
46+
+ "expected difference must be less than or equal to: [" + maxAllowedNgramDiff + "]");
47+
}
4148
}
4249

4350
@Override

modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/HighlighterWithAnalyzersTests.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121

2222
import org.elasticsearch.action.search.SearchResponse;
2323
import org.elasticsearch.common.settings.Settings;
24+
import org.elasticsearch.index.IndexSettings;
2425
import org.elasticsearch.index.query.Operator;
2526
import org.elasticsearch.plugins.Plugin;
2627
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
@@ -66,6 +67,7 @@ public void testNgramHighlightingWithBrokenPositions() throws IOException {
6667
.endObject())
6768
.setSettings(Settings.builder()
6869
.put(indexSettings())
70+
.put(IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey(), 19)
6971
.put("analysis.tokenizer.autocomplete.max_gram", 20)
7072
.put("analysis.tokenizer.autocomplete.min_gram", 1)
7173
.put("analysis.tokenizer.autocomplete.token_chars", "letter,digit")

modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/30_tokenizers.yml

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,23 @@
2727
- match: { detail.tokenizer.tokens.2.token: od }
2828

2929
---
30+
"nGram_exception":
31+
- skip:
32+
version: " - 6.0.99"
33+
reason: starting from version 6.1.0 this produces a warning
34+
features: "warnings"
35+
- do:
36+
indices.analyze:
37+
body:
38+
text: good
39+
explain: true
40+
tokenizer:
41+
type: nGram
42+
min_gram: 2
43+
max_gram: 4
44+
warnings:
45+
- "Deprecated big difference between max_gram and min_gram in NGram Tokenizer,expected difference must be less than or equal to: [1]"
46+
---
3047
"simple_pattern":
3148
- do:
3249
indices.analyze:

modules/analysis-common/src/test/resources/rest-api-spec/test/search.query/30_ngram_highligthing.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
settings:
77
number_of_shards: 1
88
number_of_replicas: 0
9+
index.max_ngram_diff: 19
910
analysis:
1011
tokenizer:
1112
my_ngramt:

0 commit comments

Comments
 (0)