Skip to content

Commit e26f6f1

Browse files
Add limits for ngram and shingle settings
Create index-level settings: max_ngram_diff - maximum allowed difference between max_gram and min_gram in NGramTokenFilter/NGramTokenizer. Default is 1. max_shingle_diff - maximum allowed difference betweenmax_shingle_size and min_shingle_size in ShingleTokenFilter. Default is 3. Throw an IllegalArgumentException from v7.X when trying to create NGramTokenFilter, NGramTokenizer, ShingleTokenFilter where difference between max_size and min_size exceeds the settings value. Create a deprecated warning for versions < 7.X for this case. Closes elastic#25887
1 parent 0234287 commit e26f6f1

File tree

10 files changed

+59
-27
lines changed

10 files changed

+59
-27
lines changed

core/src/main/java/org/elasticsearch/index/analysis/NGramTokenizerFactory.java

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121

2222
import org.apache.lucene.analysis.Tokenizer;
2323
import org.apache.lucene.analysis.ngram.NGramTokenizer;
24+
import org.elasticsearch.Version;
2425
import org.elasticsearch.common.settings.Settings;
2526
import org.elasticsearch.env.Environment;
2627
import org.elasticsearch.index.IndexSettings;
@@ -89,10 +90,15 @@ public NGramTokenizerFactory(IndexSettings indexSettings, Environment environmen
8990
this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
9091
int ngramDiff = maxGram - minGram;
9192
if (ngramDiff > maxAllowedNgramDiff) {
92-
throw new IllegalArgumentException(
93-
"The difference between max_gram and min_gram in NGram Tokenizer must be less than or equal to: [" + maxAllowedNgramDiff
94-
+ "] but was [" + ngramDiff + "]. This limit can be set by changing the ["
95-
+ IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey() + "] index level setting.");
93+
if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0_alpha1)) {
94+
throw new IllegalArgumentException(
95+
"The difference between max_gram and min_gram in NGram Tokenizer must be less than or equal to: ["
96+
+ maxAllowedNgramDiff + "] but was [" + ngramDiff + "]. This limit can be set by changing the ["
97+
+ IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey() + "] index level setting.");
98+
} else {
99+
deprecationLogger.deprecated("Deprecated big difference between max_gram and min_gram in NGram Tokenizer,"
100+
+ "expected difference must be less than or equal to: [" + maxAllowedNgramDiff + "]");
101+
}
96102
}
97103
this.matcher = parseTokenChars(settings.getAsList("token_chars"));
98104
}

core/src/main/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactory.java

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import org.apache.lucene.analysis.TokenStream;
2323
import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
2424
import org.apache.lucene.analysis.shingle.ShingleFilter;
25+
import org.elasticsearch.Version;
2526
import org.elasticsearch.common.settings.Settings;
2627
import org.elasticsearch.env.Environment;
2728
import org.elasticsearch.index.IndexSettings;
@@ -35,14 +36,21 @@ public ShingleTokenFilterFactory(IndexSettings indexSettings, Environment enviro
3536
int maxAllowedShingleDiff = indexSettings.getMaxShingleDiff();
3637
Integer maxShingleSize = settings.getAsInt("max_shingle_size", ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
3738
Integer minShingleSize = settings.getAsInt("min_shingle_size", ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE);
38-
int shingleDiff = maxShingleSize - minShingleSize;
39+
Boolean outputUnigrams = settings.getAsBoolean("output_unigrams", true);
40+
41+
int shingleDiff = maxShingleSize - minShingleSize + (outputUnigrams ? 1 : 0);
3942
if (shingleDiff > maxAllowedShingleDiff) {
40-
throw new IllegalArgumentException(
41-
"The difference between max_shingle_size and min_shingle_size in Shingle Token Filter must be less than or equal to: ["
42-
+ maxAllowedShingleDiff + "] but was [" + shingleDiff + "]. This limit can be set by changing the ["
43-
+ IndexSettings.MAX_SHINGLE_DIFF_SETTING.getKey() + "] index level setting.");
43+
if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0_alpha1)) {
44+
throw new IllegalArgumentException(
45+
"In Shingle TokenFilter the difference between max_shingle_size and min_shingle_size (and +1 if outputting unigrams)"
46+
+ " must be less than or equal to: [" + maxAllowedShingleDiff + "] but was [" + shingleDiff + "]. This limit"
47+
+ " can be set by changing the [" + IndexSettings.MAX_SHINGLE_DIFF_SETTING.getKey() + "] index level setting.");
48+
} else {
49+
deprecationLogger.deprecated("Deprecated big difference between maxShingleSize and minShingleSize in Shingle TokenFilter,"
50+
+ "expected difference must be less than or equal to: [" + maxAllowedShingleDiff + "]");
51+
}
4452
}
45-
Boolean outputUnigrams = settings.getAsBoolean("output_unigrams", true);
53+
4654
Boolean outputUnigramsIfNoShingles = settings.getAsBoolean("output_unigrams_if_no_shingles", false);
4755
String tokenSeparator = settings.get("token_separator", ShingleFilter.DEFAULT_TOKEN_SEPARATOR);
4856
String fillerToken = settings.get("filler_token", ShingleFilter.DEFAULT_FILLER_TOKEN);

core/src/test/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactoryTests.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -118,9 +118,9 @@ public void testMaxShingleDiffException() throws Exception{
118118
fail();
119119
} catch (IllegalArgumentException ex) {
120120
assertEquals(
121-
"The difference between max_shingle_size and min_shingle_size in Shingle Token Filter must be less than or equal to: ["
122-
+ maxAllowedShingleDiff + "] but was [" + shingleDiff + "]. This limit can be set by changing the ["
123-
+ IndexSettings.MAX_SHINGLE_DIFF_SETTING.getKey() + "] index level setting.",
121+
"In Shingle TokenFilter the difference between max_shingle_size and min_shingle_size (and +1 if outputting unigrams)"
122+
+ " must be less than or equal to: [" + maxAllowedShingleDiff + "] but was [" + shingleDiff + "]. This limit"
123+
+ " can be set by changing the [" + IndexSettings.MAX_SHINGLE_DIFF_SETTING.getKey() + "] index level setting.",
124124
ex.getMessage());
125125
}
126126
}

core/src/test/java/org/elasticsearch/search/suggest/SuggestSearchIT.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
import org.elasticsearch.common.settings.Settings;
2929
import org.elasticsearch.common.xcontent.XContentBuilder;
3030
import org.elasticsearch.common.xcontent.XContentFactory;
31+
import org.elasticsearch.index.IndexSettings;
3132
import org.elasticsearch.plugins.Plugin;
3233
import org.elasticsearch.plugins.ScriptPlugin;
3334
import org.elasticsearch.script.ScriptContext;
@@ -683,6 +684,7 @@ public void testDifferentShardSize() throws Exception {
683684
public void testShardFailures() throws IOException, InterruptedException {
684685
CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder()
685686
.put(indexSettings())
687+
.put(IndexSettings.MAX_SHINGLE_DIFF_SETTING.getKey(), 4)
686688
.put("index.analysis.analyzer.suggest.tokenizer", "standard")
687689
.putList("index.analysis.analyzer.suggest.filter", "standard", "lowercase", "shingler")
688690
.put("index.analysis.filter.shingler.type", "shingle")
@@ -743,6 +745,7 @@ public void testEmptyShards() throws IOException, InterruptedException {
743745
endObject();
744746
assertAcked(prepareCreate("test").setSettings(Settings.builder()
745747
.put(indexSettings())
748+
.put(IndexSettings.MAX_SHINGLE_DIFF_SETTING.getKey(), 4)
746749
.put("index.analysis.analyzer.suggest.tokenizer", "standard")
747750
.putList("index.analysis.analyzer.suggest.filter", "standard", "lowercase", "shingler")
748751
.put("index.analysis.filter.shingler.type", "shingle")

docs/reference/analysis/tokenfilters/ngram-tokenfilter.asciidoc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,6 @@ type:
1313
|`max_gram` |Defaults to `2`.
1414
|============================
1515

16+
The index level setting `index.max_ngram_diff` controls the maximum allowed
17+
difference between `max_gram` and `min_gram`.
18+

docs/reference/analysis/tokenfilters/shingle-tokenfilter.asciidoc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,3 +38,5 @@ used if the position increment is greater than one when a `stop` filter is used
3838
together with the `shingle` filter. Defaults to `"_"`
3939
|=======================================================================
4040

41+
The index level setting `index.max_shingle_diff` controls the maximum allowed
42+
difference between `max_shingle_size` and `min_shingle_size`.

docs/reference/analysis/tokenizers/ngram-tokenizer.asciidoc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,9 @@ value. The smaller the length, the more documents will match but the lower
198198
the quality of the matches. The longer the length, the more specific the
199199
matches. A tri-gram (length `3`) is a good place to start.
200200

201+
The index level setting `index.max_ngram_diff` controls the maximum allowed
202+
difference between `max_gram` and `min_gram`.
203+
201204
[float]
202205
=== Example configuration
203206

modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/NGramTokenFilterFactory.java

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@
2525
import org.elasticsearch.env.Environment;
2626
import org.elasticsearch.index.IndexSettings;
2727
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
28+
import org.elasticsearch.Version;
29+
2830

2931

3032
public class NGramTokenFilterFactory extends AbstractTokenFilterFactory {
@@ -41,10 +43,15 @@ public class NGramTokenFilterFactory extends AbstractTokenFilterFactory {
4143
this.maxGram = settings.getAsInt("max_gram", NGramTokenFilter.DEFAULT_MAX_NGRAM_SIZE);
4244
int ngramDiff = maxGram - minGram;
4345
if (ngramDiff > maxAllowedNgramDiff) {
44-
throw new IllegalArgumentException(
45-
"The difference between max_gram and min_gram in NGram Tokenizer must be less than or equal to: [" + maxAllowedNgramDiff
46-
+ "] but was [" + ngramDiff + "]. This limit can be set by changing the ["
47-
+ IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey() + "] index level setting.");
46+
if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0_alpha1)) {
47+
throw new IllegalArgumentException(
48+
"The difference between max_gram and min_gram in NGram Tokenizer must be less than or equal to: ["
49+
+ maxAllowedNgramDiff + "] but was [" + ngramDiff + "]. This limit can be set by changing the ["
50+
+ IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey() + "] index level setting.");
51+
} else {
52+
deprecationLogger.deprecated("Deprecated big difference between max_gram and min_gram in NGram Tokenizer,"
53+
+ "expected difference must be less than or equal to: [" + maxAllowedNgramDiff + "]");
54+
}
4855
}
4956
}
5057

modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/NGramTokenizerFactoryTests.java

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -169,16 +169,13 @@ public void testMaxNGramDiffException() throws Exception{
169169
int max_gram = min_gram + ngramDiff;
170170

171171
final Settings settings = newAnalysisSettingsBuilder().put("min_gram", min_gram).put("max_gram", max_gram).build();
172-
try {
173-
new NGramTokenizerFactory(indexProperties, null, name, settings).create();
174-
fail();
175-
} catch (IllegalArgumentException ex) {
176-
assertEquals(
177-
"The difference between max_gram and min_gram in NGram Tokenizer must be less than or equal to: [" + maxAllowedNgramDiff
178-
+ "] but was [" + ngramDiff + "]. This limit can be set by changing the ["
179-
+ IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey() + "] index level setting.",
180-
ex.getMessage());
181-
}
172+
IllegalArgumentException ex = expectThrows(IllegalArgumentException.class, () ->
173+
new NGramTokenizerFactory(indexProperties, null, name, settings).create());
174+
assertEquals(
175+
"The difference between max_gram and min_gram in NGram Tokenizer must be less than or equal to: ["
176+
+ maxAllowedNgramDiff + "] but was [" + ngramDiff + "]. This limit can be set by changing the ["
177+
+ IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey() + "] index level setting.",
178+
ex.getMessage());
182179
}
183180

184181
private Version randomVersion(Random random) throws IllegalArgumentException, IllegalAccessException {

modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/30_tokenizers.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@
2828

2929
---
3030
"nGram_exception":
31+
- skip:
32+
version: " - 6.99.99"
33+
reason: only starting from version 7.x this throws an error
3134
- do:
3235
catch: /The difference between max_gram and min_gram in NGram Tokenizer must be less than or equal to[:] \[1\] but was \[2\]\. This limit can be set by changing the \[index.max_ngram_diff\] index level setting\./
3336
indices.analyze:

0 commit comments

Comments
 (0)