Skip to content

Commit 0234287

Browse files
Add limits for ngram and shingle settings
Create index-level settings: max_ngram_diff - maximum allowed difference between max_gram and min_gram in NGramTokenFilter/NGramTokenizer. Default is 1. max_shingle_diff - maximum allowed difference between max_shingle_size and min_shingle_size in ShingleTokenFilter. Default is 3. Throw an IllegalArgumentException when trying to create NGramTokenFilter, NGramTokenizer, ShingleTokenFilter where difference between max_size and min_size exceeds the settings value. Closes elastic#25887
1 parent 9abc26e commit 0234287

File tree

13 files changed

+161
-1
lines changed

13 files changed

+161
-1
lines changed

core/src/main/java/org/elasticsearch/common/settings/IndexScopedSettings.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,8 @@ public final class IndexScopedSettings extends AbstractScopedSettings {
113113
IndexSettings.MAX_INNER_RESULT_WINDOW_SETTING,
114114
IndexSettings.MAX_DOCVALUE_FIELDS_SEARCH_SETTING,
115115
IndexSettings.MAX_SCRIPT_FIELDS_SETTING,
116+
IndexSettings.MAX_NGRAM_DIFF_SETTING,
117+
IndexSettings.MAX_SHINGLE_DIFF_SETTING,
116118
IndexSettings.MAX_RESCORE_WINDOW_SETTING,
117119
IndexSettings.MAX_ADJACENCY_MATRIX_FILTERS_SETTING,
118120
IndexSettings.INDEX_TRANSLOG_SYNC_INTERVAL_SETTING,
@@ -150,6 +152,7 @@ public final class IndexScopedSettings extends AbstractScopedSettings {
150152
EngineConfig.INDEX_CODEC_SETTING,
151153
EngineConfig.INDEX_OPTIMIZE_AUTO_GENERATED_IDS,
152154
IndexMetaData.SETTING_WAIT_FOR_ACTIVE_SHARDS,
155+
153156
// validate that built-in similarities don't get redefined
154157
Setting.groupSetting("index.similarity.", (s) -> {
155158
Map<String, Settings> groups = s.getAsGroups();

core/src/main/java/org/elasticsearch/index/IndexSettings.java

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,26 @@ public final class IndexSettings {
107107
public static final Setting<Integer> MAX_SCRIPT_FIELDS_SETTING =
108108
Setting.intSetting("index.max_script_fields", 32, 0, Property.Dynamic, Property.IndexScope);
109109

110+
/**
111+
* Index setting that defines, for NGramTokenizer and NGramTokenFilter,
112+
* the maximum difference between
113+
* max_gram (maximum length of characters in a gram) and
114+
* min_gram (minimum length of characters in a gram).
115+
* The default value is 1, as this is the default difference in NGramTokenizer,
116+
* and is defensive as it prevents generating too many index terms.
117+
*/
118+
public static final Setting<Integer> MAX_NGRAM_DIFF_SETTING =
119+
Setting.intSetting("index.max_ngram_diff", 1, 0, Property.Dynamic, Property.IndexScope);
120+
121+
/**
122+
* Index setting that defines, for ShingleTokenFilter,
123+
* the maximum difference between
124+
* max_shingle_size and min_shingle_size.
125+
* The default value of 3 is defensive, as it prevents generating too many tokens.
126+
*/
127+
public static final Setting<Integer> MAX_SHINGLE_DIFF_SETTING =
128+
Setting.intSetting("index.max_shingle_diff", 3, 0, Property.Dynamic, Property.IndexScope);
129+
110130
/**
111131
* Index setting describing the maximum value of allowed `docvalue_fields`that can be retrieved
112132
* per search request. The default maximum of 100 is defensive for the reason that retrieving
@@ -239,6 +259,8 @@ public final class IndexSettings {
239259
private volatile int maxRescoreWindow;
240260
private volatile int maxDocvalueFields;
241261
private volatile int maxScriptFields;
262+
private volatile int maxNgramDiff;
263+
private volatile int maxShingleDiff;
242264
private volatile boolean TTLPurgeDisabled;
243265
/**
244266
* The maximum number of refresh listeners allows on this shard.
@@ -342,6 +364,8 @@ public IndexSettings(final IndexMetaData indexMetaData, final Settings nodeSetti
342364
maxRescoreWindow = scopedSettings.get(MAX_RESCORE_WINDOW_SETTING);
343365
maxDocvalueFields = scopedSettings.get(MAX_DOCVALUE_FIELDS_SEARCH_SETTING);
344366
maxScriptFields = scopedSettings.get(MAX_SCRIPT_FIELDS_SETTING);
367+
maxNgramDiff = scopedSettings.get(MAX_NGRAM_DIFF_SETTING);
368+
maxShingleDiff = scopedSettings.get(MAX_SHINGLE_DIFF_SETTING);
345369
TTLPurgeDisabled = scopedSettings.get(INDEX_TTL_DISABLE_PURGE_SETTING);
346370
maxRefreshListeners = scopedSettings.get(MAX_REFRESH_LISTENERS_PER_SHARD);
347371
maxSlicesPerScroll = scopedSettings.get(MAX_SLICES_PER_SCROLL);
@@ -373,6 +397,8 @@ public IndexSettings(final IndexMetaData indexMetaData, final Settings nodeSetti
373397
scopedSettings.addSettingsUpdateConsumer(MAX_RESCORE_WINDOW_SETTING, this::setMaxRescoreWindow);
374398
scopedSettings.addSettingsUpdateConsumer(MAX_DOCVALUE_FIELDS_SEARCH_SETTING, this::setMaxDocvalueFields);
375399
scopedSettings.addSettingsUpdateConsumer(MAX_SCRIPT_FIELDS_SETTING, this::setMaxScriptFields);
400+
scopedSettings.addSettingsUpdateConsumer(MAX_NGRAM_DIFF_SETTING, this::setMaxNgramDiff);
401+
scopedSettings.addSettingsUpdateConsumer(MAX_SHINGLE_DIFF_SETTING, this::setMaxShingleDiff);
376402
scopedSettings.addSettingsUpdateConsumer(INDEX_WARMER_ENABLED_SETTING, this::setEnableWarmer);
377403
scopedSettings.addSettingsUpdateConsumer(INDEX_GC_DELETES_SETTING, this::setGCDeletes);
378404
scopedSettings.addSettingsUpdateConsumer(INDEX_TRANSLOG_FLUSH_THRESHOLD_SIZE_SETTING, this::setTranslogFlushThresholdSize);
@@ -641,6 +667,20 @@ private void setMaxDocvalueFields(int maxDocvalueFields) {
641667
this.maxDocvalueFields = maxDocvalueFields;
642668
}
643669

670+
/**
671+
* Returns the maximum allowed difference between max and min length of ngram
672+
*/
673+
public int getMaxNgramDiff() { return this.maxNgramDiff; }
674+
675+
/**
 * Stores a new value for the maximum allowed difference between max and min ngram length.
 * Registered as the settings-update consumer for {@link #MAX_NGRAM_DIFF_SETTING}.
 */
private void setMaxNgramDiff(int maxNgramDiff) {
    this.maxNgramDiff = maxNgramDiff;
}
676+
677+
/**
678+
* Returns the maximum allowed difference between max and min shingle_size
679+
*/
680+
public int getMaxShingleDiff() { return this.maxShingleDiff; }
681+
682+
/**
 * Stores a new value for the maximum allowed difference between max and min shingle size.
 * Registered as the settings-update consumer for {@link #MAX_SHINGLE_DIFF_SETTING}.
 */
private void setMaxShingleDiff(int maxShingleDiff) {
    this.maxShingleDiff = maxShingleDiff;
}
683+
644684
/**
645685
* Returns the maximum number of allowed script_fields to retrieve in a search request
646686
*/

core/src/main/java/org/elasticsearch/index/analysis/NGramTokenizerFactory.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,8 +84,16 @@ static CharMatcher parseTokenChars(List<String> characterClasses) {
8484

8585
public NGramTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
8686
super(indexSettings, name, settings);
87+
int maxAllowedNgramDiff = indexSettings.getMaxNgramDiff();
8788
this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
8889
this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
90+
int ngramDiff = maxGram - minGram;
91+
if (ngramDiff > maxAllowedNgramDiff) {
92+
throw new IllegalArgumentException(
93+
"The difference between max_gram and min_gram in NGram Tokenizer must be less than or equal to: [" + maxAllowedNgramDiff
94+
+ "] but was [" + ngramDiff + "]. This limit can be set by changing the ["
95+
+ IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey() + "] index level setting.");
96+
}
8997
this.matcher = parseTokenChars(settings.getAsList("token_chars"));
9098
}
9199

core/src/main/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactory.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,16 @@ public class ShingleTokenFilterFactory extends AbstractTokenFilterFactory {
3232

3333
public ShingleTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
3434
super(indexSettings, name, settings);
35+
int maxAllowedShingleDiff = indexSettings.getMaxShingleDiff();
3536
Integer maxShingleSize = settings.getAsInt("max_shingle_size", ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
3637
Integer minShingleSize = settings.getAsInt("min_shingle_size", ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE);
38+
int shingleDiff = maxShingleSize - minShingleSize;
39+
if (shingleDiff > maxAllowedShingleDiff) {
40+
throw new IllegalArgumentException(
41+
"The difference between max_shingle_size and min_shingle_size in Shingle Token Filter must be less than or equal to: ["
42+
+ maxAllowedShingleDiff + "] but was [" + shingleDiff + "]. This limit can be set by changing the ["
43+
+ IndexSettings.MAX_SHINGLE_DIFF_SETTING.getKey() + "] index level setting.");
44+
}
3745
Boolean outputUnigrams = settings.getAsBoolean("output_unigrams", true);
3846
Boolean outputUnigramsIfNoShingles = settings.getAsBoolean("output_unigrams_if_no_shingles", false);
3947
String tokenSeparator = settings.get("token_separator", ShingleFilter.DEFAULT_TOKEN_SEPARATOR);

core/src/test/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactoryTests.java

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
import org.apache.lucene.analysis.Tokenizer;
2828
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
2929
import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
30+
import org.elasticsearch.index.IndexSettings;
3031
import org.elasticsearch.test.ESTestCase;
3132
import org.elasticsearch.test.ESTokenStreamTestCase;
3233

@@ -102,4 +103,25 @@ public void testDisableGraph() throws IOException {
102103
assertFalse(stream.hasAttribute(DisableGraphAttribute.class));
103104
}
104105
}
106+
107+
/*`
108+
* test that throws an error when trying to get a ShingleTokenFilter where difference between max_shingle_size and min_shingle_size
109+
* is greater than the allowed value of max_shingle_diff
110+
*/
111+
public void testMaxShingleDiffException() throws Exception{
112+
String RESOURCE2 = "/org/elasticsearch/index/analysis/shingle_analysis2.json";
113+
int maxAllowedShingleDiff = 3;
114+
int shingleDiff = 8;
115+
try {
116+
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE2);
117+
analysis.tokenFilter.get("shingle");
118+
fail();
119+
} catch (IllegalArgumentException ex) {
120+
assertEquals(
121+
"The difference between max_shingle_size and min_shingle_size in Shingle Token Filter must be less than or equal to: ["
122+
+ maxAllowedShingleDiff + "] but was [" + shingleDiff + "]. This limit can be set by changing the ["
123+
+ IndexSettings.MAX_SHINGLE_DIFF_SETTING.getKey() + "] index level setting.",
124+
ex.getMessage());
125+
}
126+
}
105127
}

core/src/test/java/org/elasticsearch/search/query/SearchQueryIT.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
import org.elasticsearch.common.settings.Settings;
3131
import org.elasticsearch.common.xcontent.XContentFactory;
3232
import org.elasticsearch.common.xcontent.XContentType;
33+
import org.elasticsearch.index.IndexSettings;
3334
import org.elasticsearch.index.query.BoolQueryBuilder;
3435
import org.elasticsearch.index.query.MatchQueryBuilder;
3536
import org.elasticsearch.index.query.MultiMatchQueryBuilder;
@@ -1802,6 +1803,7 @@ public void testSearchEmptyDoc() {
18021803
public void testNGramCopyField() {
18031804
CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder()
18041805
.put(indexSettings())
1806+
.put(IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey(), 9)
18051807
.put("index.analysis.analyzer.my_ngram_analyzer.type", "custom")
18061808
.put("index.analysis.analyzer.my_ngram_analyzer.tokenizer", "my_ngram_tokenizer")
18071809
.put("index.analysis.tokenizer.my_ngram_tokenizer.type", "nGram")
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
{
2+
"index":{
3+
"analysis":{
4+
"filter":{
5+
"shingle_filler":{
6+
"type":"shingle",
7+
"max_shingle_size" : 10,
8+
"min_shingle_size" : 2,
9+
"output_unigrams" : false,
10+
"filler_token" : "FILLER"
11+
}
12+
}
13+
}
14+
}
15+
}

docs/reference/index-modules.asciidoc

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,16 @@ specific index module:
144144
The maximum number of `script_fields` that are allowed in a query.
145145
Defaults to `32`.
146146

147+
`index.max_ngram_diff`::
148+
149+
The maximum allowed difference between min_gram and max_gram for NGramTokenizer and NGramTokenFilter.
150+
Defaults to `1`.
151+
152+
`index.max_shingle_diff`::
153+
154+
The maximum allowed difference between max_shingle_size and min_shingle_size for ShingleTokenFilter.
155+
Defaults to `3`.
156+
147157
`index.blocks.read_only`::
148158

149159
Set to `true` to make the index and index metadata read only, `false` to

modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/NGramTokenFilterFactory.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,16 @@ public class NGramTokenFilterFactory extends AbstractTokenFilterFactory {
3636

3737
NGramTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
3838
super(indexSettings, name, settings);
39+
int maxAllowedNgramDiff = indexSettings.getMaxNgramDiff();
3940
this.minGram = settings.getAsInt("min_gram", NGramTokenFilter.DEFAULT_MIN_NGRAM_SIZE);
4041
this.maxGram = settings.getAsInt("max_gram", NGramTokenFilter.DEFAULT_MAX_NGRAM_SIZE);
42+
int ngramDiff = maxGram - minGram;
43+
if (ngramDiff > maxAllowedNgramDiff) {
44+
throw new IllegalArgumentException(
45+
"The difference between max_gram and min_gram in NGram Tokenizer must be less than or equal to: [" + maxAllowedNgramDiff
46+
+ "] but was [" + ngramDiff + "]. This limit can be set by changing the ["
47+
+ IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey() + "] index level setting.");
48+
}
4149
}
4250

4351
@Override

modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/HighlighterWithAnalyzersTests.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121

2222
import org.elasticsearch.action.search.SearchResponse;
2323
import org.elasticsearch.common.settings.Settings;
24+
import org.elasticsearch.index.IndexSettings;
2425
import org.elasticsearch.index.query.Operator;
2526
import org.elasticsearch.plugins.Plugin;
2627
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
@@ -66,6 +67,7 @@ public void testNgramHighlightingWithBrokenPositions() throws IOException {
6667
.endObject())
6768
.setSettings(Settings.builder()
6869
.put(indexSettings())
70+
.put(IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey(), 19)
6971
.put("analysis.tokenizer.autocomplete.max_gram", 20)
7072
.put("analysis.tokenizer.autocomplete.min_gram", 1)
7173
.put("analysis.tokenizer.autocomplete.token_chars", "letter,digit")

modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/NGramTokenizerFactoryTests.java

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,8 @@ public void testParseTokenChars() {
7676
public void testNoTokenChars() throws IOException {
7777
final Index index = new Index("test", "_na_");
7878
final String name = "ngr";
79-
final Settings indexSettings = newAnalysisSettingsBuilder().build();
79+
final Settings indexSettings = newAnalysisSettingsBuilder().put(IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey(), 2).build();
80+
8081
final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 4)
8182
.putList("token_chars", new String[0]).build();
8283
Tokenizer tokenizer = new NGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings)
@@ -152,6 +153,34 @@ public void testBackwardsCompatibilityEdgeNgramTokenFilter() throws Exception {
152153
}
153154

154155

156+
/*`
157+
* test that throws an error when trying to get a NGramTokenizer where difference between max_gram and min_gram
158+
* is greater than the allowed value of max_ngram_diff
159+
*/
160+
public void testMaxNGramDiffException() throws Exception{
161+
final Index index = new Index("test", "_na_");
162+
final String name = "ngr";
163+
final Settings indexSettings = newAnalysisSettingsBuilder().build();
164+
IndexSettings indexProperties = IndexSettingsModule.newIndexSettings(index, indexSettings);
165+
166+
int maxAllowedNgramDiff = indexProperties.getMaxNgramDiff();
167+
int ngramDiff = maxAllowedNgramDiff + 1;
168+
int min_gram = 2;
169+
int max_gram = min_gram + ngramDiff;
170+
171+
final Settings settings = newAnalysisSettingsBuilder().put("min_gram", min_gram).put("max_gram", max_gram).build();
172+
try {
173+
new NGramTokenizerFactory(indexProperties, null, name, settings).create();
174+
fail();
175+
} catch (IllegalArgumentException ex) {
176+
assertEquals(
177+
"The difference between max_gram and min_gram in NGram Tokenizer must be less than or equal to: [" + maxAllowedNgramDiff
178+
+ "] but was [" + ngramDiff + "]. This limit can be set by changing the ["
179+
+ IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey() + "] index level setting.",
180+
ex.getMessage());
181+
}
182+
}
183+
155184
private Version randomVersion(Random random) throws IllegalArgumentException, IllegalAccessException {
156185
Field[] declaredFields = Version.class.getFields();
157186
List<Field> versionFields = new ArrayList<>();

modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/30_tokenizers.yml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,18 @@
2727
- match: { detail.tokenizer.tokens.2.token: od }
2828

2929
---
30+
"nGram_exception":
31+
- do:
32+
catch: /The difference between max_gram and min_gram in NGram Tokenizer must be less than or equal to[:] \[1\] but was \[2\]\. This limit can be set by changing the \[index.max_ngram_diff\] index level setting\./
33+
indices.analyze:
34+
body:
35+
text: good
36+
explain: true
37+
tokenizer:
38+
type: nGram
39+
min_gram: 2
40+
max_gram: 4
41+
---
3042
"simple_pattern":
3143
- do:
3244
indices.analyze:

modules/analysis-common/src/test/resources/rest-api-spec/test/search.query/30_ngram_highligthing.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
settings:
77
number_of_shards: 1
88
number_of_replicas: 0
9+
index.max_ngram_diff: 19
910
analysis:
1011
tokenizer:
1112
my_ngramt:

0 commit comments

Comments
 (0)