Skip to content

Commit 148376c

Browse files
Add limits for ngram and shingle settings (#27211)
* Add limits for ngram and shingle settings (#27211) Create index-level settings: max_ngram_diff - maximum allowed difference between max_gram and min_gram in NGramTokenFilter/NGramTokenizer. Default is 1. max_shingle_diff - maximum allowed difference between max_shingle_size and min_shingle_size in ShingleTokenFilter. Default is 3. Throw an IllegalArgumentException when trying to create NGramTokenFilter, NGramTokenizer, ShingleTokenFilter where difference between max_size and min_size exceeds the settings value. Closes #25887
1 parent 2fc6c64 commit 148376c

File tree

17 files changed

+193
-1
lines changed

17 files changed

+193
-1
lines changed

core/src/main/java/org/elasticsearch/common/settings/IndexScopedSettings.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,8 @@ public final class IndexScopedSettings extends AbstractScopedSettings {
114114
IndexSettings.MAX_INNER_RESULT_WINDOW_SETTING,
115115
IndexSettings.MAX_DOCVALUE_FIELDS_SEARCH_SETTING,
116116
IndexSettings.MAX_SCRIPT_FIELDS_SETTING,
117+
IndexSettings.MAX_NGRAM_DIFF_SETTING,
118+
IndexSettings.MAX_SHINGLE_DIFF_SETTING,
117119
IndexSettings.MAX_RESCORE_WINDOW_SETTING,
118120
IndexSettings.MAX_ADJACENCY_MATRIX_FILTERS_SETTING,
119121
IndexSettings.INDEX_TRANSLOG_SYNC_INTERVAL_SETTING,
@@ -150,6 +152,7 @@ public final class IndexScopedSettings extends AbstractScopedSettings {
150152
EngineConfig.INDEX_CODEC_SETTING,
151153
EngineConfig.INDEX_OPTIMIZE_AUTO_GENERATED_IDS,
152154
IndexMetaData.SETTING_WAIT_FOR_ACTIVE_SHARDS,
155+
153156
// validate that built-in similarities don't get redefined
154157
Setting.groupSetting("index.similarity.", (s) -> {
155158
Map<String, Settings> groups = s.getAsGroups();

core/src/main/java/org/elasticsearch/index/IndexSettings.java

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,26 @@ public final class IndexSettings {
107107
public static final Setting<Integer> MAX_SCRIPT_FIELDS_SETTING =
108108
Setting.intSetting("index.max_script_fields", 32, 0, Property.Dynamic, Property.IndexScope);
109109

110+
/**
111+
* Index setting describing, for NGramTokenizer and NGramTokenFilter,
112+
* the maximum difference between
113+
* max_gram (maximum length of characters in a gram) and
114+
* min_gram (minimum length of characters in a gram).
115+
* The default value is 1, as this is the default difference in NGramTokenizer,
116+
* and is defensive as it prevents generating too many index terms.
117+
*/
118+
public static final Setting<Integer> MAX_NGRAM_DIFF_SETTING =
119+
Setting.intSetting("index.max_ngram_diff", 1, 0, Property.Dynamic, Property.IndexScope);
120+
121+
/**
122+
* Index setting describing, for ShingleTokenFilter,
123+
* the maximum difference between
124+
* max_shingle_size and min_shingle_size.
125+
* The default value of 3 is defensive, as it prevents generating too many tokens.
126+
*/
127+
public static final Setting<Integer> MAX_SHINGLE_DIFF_SETTING =
128+
Setting.intSetting("index.max_shingle_diff", 3, 0, Property.Dynamic, Property.IndexScope);
129+
110130
/**
111131
* Index setting describing the maximum value of allowed `docvalue_fields`that can be retrieved
112132
* per search request. The default maximum of 100 is defensive for the reason that retrieving
@@ -239,6 +259,8 @@ public final class IndexSettings {
239259
private volatile int maxRescoreWindow;
240260
private volatile int maxDocvalueFields;
241261
private volatile int maxScriptFields;
262+
private volatile int maxNgramDiff;
263+
private volatile int maxShingleDiff;
242264
private volatile boolean TTLPurgeDisabled;
243265
/**
244266
* The maximum number of refresh listeners allows on this shard.
@@ -342,6 +364,8 @@ public IndexSettings(final IndexMetaData indexMetaData, final Settings nodeSetti
342364
maxRescoreWindow = scopedSettings.get(MAX_RESCORE_WINDOW_SETTING);
343365
maxDocvalueFields = scopedSettings.get(MAX_DOCVALUE_FIELDS_SEARCH_SETTING);
344366
maxScriptFields = scopedSettings.get(MAX_SCRIPT_FIELDS_SETTING);
367+
maxNgramDiff = scopedSettings.get(MAX_NGRAM_DIFF_SETTING);
368+
maxShingleDiff = scopedSettings.get(MAX_SHINGLE_DIFF_SETTING);
345369
TTLPurgeDisabled = scopedSettings.get(INDEX_TTL_DISABLE_PURGE_SETTING);
346370
maxRefreshListeners = scopedSettings.get(MAX_REFRESH_LISTENERS_PER_SHARD);
347371
maxSlicesPerScroll = scopedSettings.get(MAX_SLICES_PER_SCROLL);
@@ -373,6 +397,8 @@ public IndexSettings(final IndexMetaData indexMetaData, final Settings nodeSetti
373397
scopedSettings.addSettingsUpdateConsumer(MAX_RESCORE_WINDOW_SETTING, this::setMaxRescoreWindow);
374398
scopedSettings.addSettingsUpdateConsumer(MAX_DOCVALUE_FIELDS_SEARCH_SETTING, this::setMaxDocvalueFields);
375399
scopedSettings.addSettingsUpdateConsumer(MAX_SCRIPT_FIELDS_SETTING, this::setMaxScriptFields);
400+
scopedSettings.addSettingsUpdateConsumer(MAX_NGRAM_DIFF_SETTING, this::setMaxNgramDiff);
401+
scopedSettings.addSettingsUpdateConsumer(MAX_SHINGLE_DIFF_SETTING, this::setMaxShingleDiff);
376402
scopedSettings.addSettingsUpdateConsumer(INDEX_WARMER_ENABLED_SETTING, this::setEnableWarmer);
377403
scopedSettings.addSettingsUpdateConsumer(INDEX_GC_DELETES_SETTING, this::setGCDeletes);
378404
scopedSettings.addSettingsUpdateConsumer(INDEX_TRANSLOG_FLUSH_THRESHOLD_SIZE_SETTING, this::setTranslogFlushThresholdSize);
@@ -641,6 +667,20 @@ private void setMaxDocvalueFields(int maxDocvalueFields) {
641667
this.maxDocvalueFields = maxDocvalueFields;
642668
}
643669

670+
/**
671+
* Returns the maximum allowed difference between max and min length of ngram
672+
*/
673+
public int getMaxNgramDiff() { return this.maxNgramDiff; }
674+
675+
/**
 * Stores a new value for the max ngram diff limit.
 *
 * @param maxNgramDiff the new maximum allowed difference between max and min ngram length
 */
private void setMaxNgramDiff(int maxNgramDiff) {
    this.maxNgramDiff = maxNgramDiff;
}
676+
677+
/**
678+
* Returns the maximum allowed difference between max and min shingle_size
679+
*/
680+
public int getMaxShingleDiff() { return this.maxShingleDiff; }
681+
682+
/**
 * Stores a new value for the max shingle diff limit.
 *
 * @param maxShingleDiff the new maximum allowed difference between max and min shingle_size
 */
private void setMaxShingleDiff(int maxShingleDiff) {
    this.maxShingleDiff = maxShingleDiff;
}
683+
644684
/**
645685
* Returns the maximum number of allowed script_fields to retrieve in a search request
646686
*/

core/src/main/java/org/elasticsearch/index/analysis/NGramTokenizerFactory.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121

2222
import org.apache.lucene.analysis.Tokenizer;
2323
import org.apache.lucene.analysis.ngram.NGramTokenizer;
24+
import org.elasticsearch.Version;
2425
import org.elasticsearch.common.settings.Settings;
2526
import org.elasticsearch.env.Environment;
2627
import org.elasticsearch.index.IndexSettings;
@@ -84,8 +85,21 @@ static CharMatcher parseTokenChars(List<String> characterClasses) {
8485

8586
public NGramTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
8687
super(indexSettings, name, settings);
88+
int maxAllowedNgramDiff = indexSettings.getMaxNgramDiff();
8789
this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
8890
this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
91+
int ngramDiff = maxGram - minGram;
92+
if (ngramDiff > maxAllowedNgramDiff) {
93+
if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0_alpha1)) {
94+
throw new IllegalArgumentException(
95+
"The difference between max_gram and min_gram in NGram Tokenizer must be less than or equal to: ["
96+
+ maxAllowedNgramDiff + "] but was [" + ngramDiff + "]. This limit can be set by changing the ["
97+
+ IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey() + "] index level setting.");
98+
} else {
99+
deprecationLogger.deprecated("Deprecated big difference between max_gram and min_gram in NGram Tokenizer,"
100+
+ "expected difference must be less than or equal to: [" + maxAllowedNgramDiff + "]");
101+
}
102+
}
89103
this.matcher = parseTokenChars(settings.getAsList("token_chars"));
90104
}
91105

core/src/main/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactory.java

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import org.apache.lucene.analysis.TokenStream;
2323
import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
2424
import org.apache.lucene.analysis.shingle.ShingleFilter;
25+
import org.elasticsearch.Version;
2526
import org.elasticsearch.common.settings.Settings;
2627
import org.elasticsearch.env.Environment;
2728
import org.elasticsearch.index.IndexSettings;
@@ -32,9 +33,24 @@ public class ShingleTokenFilterFactory extends AbstractTokenFilterFactory {
3233

3334
public ShingleTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
3435
super(indexSettings, name, settings);
36+
int maxAllowedShingleDiff = indexSettings.getMaxShingleDiff();
3537
Integer maxShingleSize = settings.getAsInt("max_shingle_size", ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
3638
Integer minShingleSize = settings.getAsInt("min_shingle_size", ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE);
3739
Boolean outputUnigrams = settings.getAsBoolean("output_unigrams", true);
40+
41+
int shingleDiff = maxShingleSize - minShingleSize + (outputUnigrams ? 1 : 0);
42+
if (shingleDiff > maxAllowedShingleDiff) {
43+
if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0_alpha1)) {
44+
throw new IllegalArgumentException(
45+
"In Shingle TokenFilter the difference between max_shingle_size and min_shingle_size (and +1 if outputting unigrams)"
46+
+ " must be less than or equal to: [" + maxAllowedShingleDiff + "] but was [" + shingleDiff + "]. This limit"
47+
+ " can be set by changing the [" + IndexSettings.MAX_SHINGLE_DIFF_SETTING.getKey() + "] index level setting.");
48+
} else {
49+
deprecationLogger.deprecated("Deprecated big difference between maxShingleSize and minShingleSize in Shingle TokenFilter,"
50+
+ "expected difference must be less than or equal to: [" + maxAllowedShingleDiff + "]");
51+
}
52+
}
53+
3854
Boolean outputUnigramsIfNoShingles = settings.getAsBoolean("output_unigrams_if_no_shingles", false);
3955
String tokenSeparator = settings.get("token_separator", ShingleFilter.DEFAULT_TOKEN_SEPARATOR);
4056
String fillerToken = settings.get("filler_token", ShingleFilter.DEFAULT_FILLER_TOKEN);

core/src/test/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactoryTests.java

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
import org.apache.lucene.analysis.Tokenizer;
2828
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
2929
import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
30+
import org.elasticsearch.index.IndexSettings;
3031
import org.elasticsearch.test.ESTestCase;
3132
import org.elasticsearch.test.ESTokenStreamTestCase;
3233

@@ -102,4 +103,25 @@ public void testDisableGraph() throws IOException {
102103
assertFalse(stream.hasAttribute(DisableGraphAttribute.class));
103104
}
104105
}
106+
107+
/*`
108+
* test that throws an error when trying to get a ShingleTokenFilter where difference between max_shingle_size and min_shingle_size
109+
* is greater than the allowed value of max_shingle_diff
110+
*/
111+
public void testMaxShingleDiffException() throws Exception{
112+
String RESOURCE2 = "/org/elasticsearch/index/analysis/shingle_analysis2.json";
113+
int maxAllowedShingleDiff = 3;
114+
int shingleDiff = 8;
115+
try {
116+
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE2);
117+
analysis.tokenFilter.get("shingle");
118+
fail();
119+
} catch (IllegalArgumentException ex) {
120+
assertEquals(
121+
"In Shingle TokenFilter the difference between max_shingle_size and min_shingle_size (and +1 if outputting unigrams)"
122+
+ " must be less than or equal to: [" + maxAllowedShingleDiff + "] but was [" + shingleDiff + "]. This limit"
123+
+ " can be set by changing the [" + IndexSettings.MAX_SHINGLE_DIFF_SETTING.getKey() + "] index level setting.",
124+
ex.getMessage());
125+
}
126+
}
105127
}

core/src/test/java/org/elasticsearch/search/query/SearchQueryIT.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
import org.elasticsearch.common.settings.Settings;
3131
import org.elasticsearch.common.xcontent.XContentFactory;
3232
import org.elasticsearch.common.xcontent.XContentType;
33+
import org.elasticsearch.index.IndexSettings;
3334
import org.elasticsearch.index.query.BoolQueryBuilder;
3435
import org.elasticsearch.index.query.MatchQueryBuilder;
3536
import org.elasticsearch.index.query.MultiMatchQueryBuilder;
@@ -1802,6 +1803,7 @@ public void testSearchEmptyDoc() {
18021803
public void testNGramCopyField() {
18031804
CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder()
18041805
.put(indexSettings())
1806+
.put(IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey(), 9)
18051807
.put("index.analysis.analyzer.my_ngram_analyzer.type", "custom")
18061808
.put("index.analysis.analyzer.my_ngram_analyzer.tokenizer", "my_ngram_tokenizer")
18071809
.put("index.analysis.tokenizer.my_ngram_tokenizer.type", "nGram")

core/src/test/java/org/elasticsearch/search/suggest/SuggestSearchIT.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
import org.elasticsearch.common.settings.Settings;
2929
import org.elasticsearch.common.xcontent.XContentBuilder;
3030
import org.elasticsearch.common.xcontent.XContentFactory;
31+
import org.elasticsearch.index.IndexSettings;
3132
import org.elasticsearch.plugins.Plugin;
3233
import org.elasticsearch.plugins.ScriptPlugin;
3334
import org.elasticsearch.script.ScriptContext;
@@ -683,6 +684,7 @@ public void testDifferentShardSize() throws Exception {
683684
public void testShardFailures() throws IOException, InterruptedException {
684685
CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder()
685686
.put(indexSettings())
687+
.put(IndexSettings.MAX_SHINGLE_DIFF_SETTING.getKey(), 4)
686688
.put("index.analysis.analyzer.suggest.tokenizer", "standard")
687689
.putList("index.analysis.analyzer.suggest.filter", "standard", "lowercase", "shingler")
688690
.put("index.analysis.filter.shingler.type", "shingle")
@@ -743,6 +745,7 @@ public void testEmptyShards() throws IOException, InterruptedException {
743745
endObject();
744746
assertAcked(prepareCreate("test").setSettings(Settings.builder()
745747
.put(indexSettings())
748+
.put(IndexSettings.MAX_SHINGLE_DIFF_SETTING.getKey(), 4)
746749
.put("index.analysis.analyzer.suggest.tokenizer", "standard")
747750
.putList("index.analysis.analyzer.suggest.filter", "standard", "lowercase", "shingler")
748751
.put("index.analysis.filter.shingler.type", "shingle")
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
{
2+
"index":{
3+
"analysis":{
4+
"filter":{
5+
"shingle_filler":{
6+
"type":"shingle",
7+
"max_shingle_size" : 10,
8+
"min_shingle_size" : 2,
9+
"output_unigrams" : false,
10+
"filler_token" : "FILLER"
11+
}
12+
}
13+
}
14+
}
15+
}

docs/reference/analysis/tokenfilters/ngram-tokenfilter.asciidoc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,6 @@ type:
1313
|`max_gram` |Defaults to `2`.
1414
|============================
1515

16+
The index level setting `index.max_ngram_diff` controls the maximum allowed
17+
difference between `max_gram` and `min_gram`.
18+

docs/reference/analysis/tokenfilters/shingle-tokenfilter.asciidoc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,3 +38,5 @@ used if the position increment is greater than one when a `stop` filter is used
3838
together with the `shingle` filter. Defaults to `"_"`
3939
|=======================================================================
4040

41+
The index level setting `index.max_shingle_diff` controls the maximum allowed
42+
difference between `max_shingle_size` and `min_shingle_size` (plus one when `output_unigrams` is enabled).

docs/reference/analysis/tokenizers/ngram-tokenizer.asciidoc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,9 @@ value. The smaller the length, the more documents will match but the lower
198198
the quality of the matches. The longer the length, the more specific the
199199
matches. A tri-gram (length `3`) is a good place to start.
200200

201+
The index level setting `index.max_ngram_diff` controls the maximum allowed
202+
difference between `max_gram` and `min_gram`.
203+
201204
[float]
202205
=== Example configuration
203206

docs/reference/index-modules.asciidoc

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,16 @@ specific index module:
144144
The maximum number of `script_fields` that are allowed in a query.
145145
Defaults to `32`.
146146

147+
`index.max_ngram_diff`::
148+
149+
The maximum allowed difference between min_gram and max_gram for NGramTokenizer and NGramTokenFilter.
150+
Defaults to `1`.
151+
152+
`index.max_shingle_diff`::
153+
154+
The maximum allowed difference between max_shingle_size and min_shingle_size for ShingleTokenFilter.
155+
Defaults to `3`.
156+
147157
`index.blocks.read_only`::
148158

149159
Set to `true` to make the index and index metadata read only, `false` to

modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/NGramTokenFilterFactory.java

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@
2525
import org.elasticsearch.env.Environment;
2626
import org.elasticsearch.index.IndexSettings;
2727
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
28+
import org.elasticsearch.Version;
29+
2830

2931

3032
public class NGramTokenFilterFactory extends AbstractTokenFilterFactory {
@@ -36,8 +38,21 @@ public class NGramTokenFilterFactory extends AbstractTokenFilterFactory {
3638

3739
NGramTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
3840
super(indexSettings, name, settings);
41+
int maxAllowedNgramDiff = indexSettings.getMaxNgramDiff();
3942
this.minGram = settings.getAsInt("min_gram", NGramTokenFilter.DEFAULT_MIN_NGRAM_SIZE);
4043
this.maxGram = settings.getAsInt("max_gram", NGramTokenFilter.DEFAULT_MAX_NGRAM_SIZE);
44+
int ngramDiff = maxGram - minGram;
45+
if (ngramDiff > maxAllowedNgramDiff) {
46+
if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0_alpha1)) {
47+
throw new IllegalArgumentException(
48+
"The difference between max_gram and min_gram in NGram Tokenizer must be less than or equal to: ["
49+
+ maxAllowedNgramDiff + "] but was [" + ngramDiff + "]. This limit can be set by changing the ["
50+
+ IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey() + "] index level setting.");
51+
} else {
52+
deprecationLogger.deprecated("Deprecated big difference between max_gram and min_gram in NGram Tokenizer,"
53+
+ "expected difference must be less than or equal to: [" + maxAllowedNgramDiff + "]");
54+
}
55+
}
4156
}
4257

4358
@Override

modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/HighlighterWithAnalyzersTests.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121

2222
import org.elasticsearch.action.search.SearchResponse;
2323
import org.elasticsearch.common.settings.Settings;
24+
import org.elasticsearch.index.IndexSettings;
2425
import org.elasticsearch.index.query.Operator;
2526
import org.elasticsearch.plugins.Plugin;
2627
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
@@ -66,6 +67,7 @@ public void testNgramHighlightingWithBrokenPositions() throws IOException {
6667
.endObject())
6768
.setSettings(Settings.builder()
6869
.put(indexSettings())
70+
.put(IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey(), 19)
6971
.put("analysis.tokenizer.autocomplete.max_gram", 20)
7072
.put("analysis.tokenizer.autocomplete.min_gram", 1)
7173
.put("analysis.tokenizer.autocomplete.token_chars", "letter,digit")

0 commit comments

Comments
 (0)