Skip to content

Add limits for ngram and shingle settings #27411

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,8 @@ public final class IndexScopedSettings extends AbstractScopedSettings {
IndexSettings.MAX_INNER_RESULT_WINDOW_SETTING,
IndexSettings.MAX_DOCVALUE_FIELDS_SEARCH_SETTING,
IndexSettings.MAX_SCRIPT_FIELDS_SETTING,
IndexSettings.MAX_NGRAM_DIFF_SETTING,
IndexSettings.MAX_SHINGLE_DIFF_SETTING,
IndexSettings.MAX_RESCORE_WINDOW_SETTING,
IndexSettings.MAX_ADJACENCY_MATRIX_FILTERS_SETTING,
IndexSettings.INDEX_TRANSLOG_SYNC_INTERVAL_SETTING,
Expand Down
41 changes: 41 additions & 0 deletions core/src/main/java/org/elasticsearch/index/IndexSettings.java
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,27 @@ public final class IndexSettings {
*/
public static final Setting<Integer> MAX_INNER_RESULT_WINDOW_SETTING =
Setting.intSetting("index.max_inner_result_window", 100, 1, Property.Dynamic, Property.IndexScope);

/**
* Index setting describing for NGramTokenizer and NGramTokenFilter
* the maximum difference between
* max_gram (maximum length of characters in a gram) and
* min_gram (minimum length of characters in a gram).
* The default value is 1 as this is the default difference in NGramTokenizer,
* and is defensive as it prevents generating too many index terms.
*/
public static final Setting<Integer> MAX_NGRAM_DIFF_SETTING =
Setting.intSetting("index.max_ngram_diff", 1, 0, Property.Dynamic, Property.IndexScope);

/**
* Index setting describing for ShingleTokenFilter
* the maximum difference between
* max_shingle_size and min_shingle_size.
* The default value of 3 is defensive as it prevents generating too many tokens.
*/
public static final Setting<Integer> MAX_SHINGLE_DIFF_SETTING =
Setting.intSetting("index.max_shingle_diff", 3, 0, Property.Dynamic, Property.IndexScope);

/**
* Index setting describing the maximum value of allowed `docvalue_fields` that can be retrieved
* per search request. The default maximum of 100 is defensive for the reason that retrieving
Expand Down Expand Up @@ -252,6 +273,8 @@ public final class IndexSettings {
private volatile int maxRescoreWindow;
private volatile int maxDocvalueFields;
private volatile int maxScriptFields;
private volatile int maxNgramDiff;
private volatile int maxShingleDiff;
private volatile boolean TTLPurgeDisabled;
/**
* The maximum number of refresh listeners allows on this shard.
Expand Down Expand Up @@ -355,6 +378,8 @@ public IndexSettings(final IndexMetaData indexMetaData, final Settings nodeSetti
maxRescoreWindow = scopedSettings.get(MAX_RESCORE_WINDOW_SETTING);
maxDocvalueFields = scopedSettings.get(MAX_DOCVALUE_FIELDS_SEARCH_SETTING);
maxScriptFields = scopedSettings.get(MAX_SCRIPT_FIELDS_SETTING);
maxNgramDiff = scopedSettings.get(MAX_NGRAM_DIFF_SETTING);
maxShingleDiff = scopedSettings.get(MAX_SHINGLE_DIFF_SETTING);
TTLPurgeDisabled = scopedSettings.get(INDEX_TTL_DISABLE_PURGE_SETTING);
maxRefreshListeners = scopedSettings.get(MAX_REFRESH_LISTENERS_PER_SHARD);
maxSlicesPerScroll = scopedSettings.get(MAX_SLICES_PER_SCROLL);
Expand Down Expand Up @@ -386,6 +411,8 @@ public IndexSettings(final IndexMetaData indexMetaData, final Settings nodeSetti
scopedSettings.addSettingsUpdateConsumer(MAX_RESCORE_WINDOW_SETTING, this::setMaxRescoreWindow);
scopedSettings.addSettingsUpdateConsumer(MAX_DOCVALUE_FIELDS_SEARCH_SETTING, this::setMaxDocvalueFields);
scopedSettings.addSettingsUpdateConsumer(MAX_SCRIPT_FIELDS_SETTING, this::setMaxScriptFields);
scopedSettings.addSettingsUpdateConsumer(MAX_NGRAM_DIFF_SETTING, this::setMaxNgramDiff);
scopedSettings.addSettingsUpdateConsumer(MAX_SHINGLE_DIFF_SETTING, this::setMaxShingleDiff);
scopedSettings.addSettingsUpdateConsumer(INDEX_WARMER_ENABLED_SETTING, this::setEnableWarmer);
scopedSettings.addSettingsUpdateConsumer(INDEX_GC_DELETES_SETTING, this::setGCDeletes);
scopedSettings.addSettingsUpdateConsumer(INDEX_TRANSLOG_FLUSH_THRESHOLD_SIZE_SETTING, this::setTranslogFlushThresholdSize);
Expand Down Expand Up @@ -654,6 +681,20 @@ private void setMaxDocvalueFields(int maxDocvalueFields) {
this.maxDocvalueFields = maxDocvalueFields;
}

/**
 * Returns the maximum allowed difference between the configured max_gram and
 * min_gram of an ngram tokenizer/filter ({@code index.max_ngram_diff}).
 */
public int getMaxNgramDiff() {
    return maxNgramDiff;
}

private void setMaxNgramDiff(int maxNgramDiff) { this.maxNgramDiff = maxNgramDiff; }

/**
 * Returns the maximum allowed difference between the configured max_shingle_size
 * and min_shingle_size of a shingle token filter ({@code index.max_shingle_diff}).
 */
public int getMaxShingleDiff() {
    return maxShingleDiff;
}

private void setMaxShingleDiff(int maxShingleDiff) { this.maxShingleDiff = maxShingleDiff; }

/**
* Returns the maximum number of allowed script_fields to retrieve in a search request
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,8 +84,14 @@ static CharMatcher parseTokenChars(List<String> characterClasses) {

public NGramTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
    super(indexSettings, name, settings);
    this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
    this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
    // Warn (but do not fail) when the requested gram-length spread exceeds the
    // index-level cap (index.max_ngram_diff); large spreads can explode the term count.
    final int allowedDiff = indexSettings.getMaxNgramDiff();
    if (maxGram - minGram > allowedDiff) {
        deprecationLogger.deprecated("Deprecated big difference between max_gram and min_gram in NGram Tokenizer,"
            + "expected difference must be less than or equal to: [" + allowedDiff + "]");
    }
    this.matcher = parseTokenChars(settings.getAsList("token_chars"));
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,17 @@ public class ShingleTokenFilterFactory extends AbstractTokenFilterFactory {

public ShingleTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings);
int maxAllowedShingleDiff = indexSettings.getMaxShingleDiff();
Integer maxShingleSize = settings.getAsInt("max_shingle_size", ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
Integer minShingleSize = settings.getAsInt("min_shingle_size", ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE);
Boolean outputUnigrams = settings.getAsBooleanLenientForPreEs6Indices(indexSettings.getIndexVersionCreated(), "output_unigrams", true, deprecationLogger);
Boolean outputUnigramsIfNoShingles = settings.getAsBooleanLenientForPreEs6Indices(indexSettings.getIndexVersionCreated(), "output_unigrams_if_no_shingles", false, deprecationLogger);

int shingleDiff = maxShingleSize - minShingleSize + (outputUnigrams ? 1 : 0);
if (shingleDiff > maxAllowedShingleDiff) {
deprecationLogger.deprecated("Deprecated big difference between maxShingleSize and minShingleSize in Shingle TokenFilter,"
+ "expected difference must be less than or equal to: [" + maxAllowedShingleDiff + "]");
}
String tokenSeparator = settings.get("token_separator", ShingleFilter.DEFAULT_TOKEN_SEPARATOR);
String fillerToken = settings.get("filler_token", ShingleFilter.DEFAULT_FILLER_TOKEN);
factory = new Factory("shingle", minShingleSize, maxShingleSize, outputUnigrams, outputUnigramsIfNoShingles, tokenSeparator, fillerToken);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.MatchQueryBuilder;
import org.elasticsearch.index.query.MultiMatchQueryBuilder;
Expand Down Expand Up @@ -1802,6 +1803,7 @@ public void testSearchEmptyDoc() {
public void testNGramCopyField() {
CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder()
.put(indexSettings())
.put(IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey(), 9)
.put("index.analysis.analyzer.my_ngram_analyzer.type", "custom")
.put("index.analysis.analyzer.my_ngram_analyzer.tokenizer", "my_ngram_tokenizer")
.put("index.analysis.tokenizer.my_ngram_tokenizer.type", "nGram")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.plugins.ScriptPlugin;
import org.elasticsearch.script.ScriptContext;
Expand Down Expand Up @@ -683,6 +684,7 @@ public void testDifferentShardSize() throws Exception {
public void testShardFailures() throws IOException, InterruptedException {
CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder()
.put(indexSettings())
.put(IndexSettings.MAX_SHINGLE_DIFF_SETTING.getKey(), 4)
.put("index.analysis.analyzer.suggest.tokenizer", "standard")
.putList("index.analysis.analyzer.suggest.filter", "standard", "lowercase", "shingler")
.put("index.analysis.filter.shingler.type", "shingle")
Expand Down Expand Up @@ -743,6 +745,7 @@ public void testEmptyShards() throws IOException, InterruptedException {
endObject();
assertAcked(prepareCreate("test").setSettings(Settings.builder()
.put(indexSettings())
.put(IndexSettings.MAX_SHINGLE_DIFF_SETTING.getKey(), 4)
.put("index.analysis.analyzer.suggest.tokenizer", "standard")
.putList("index.analysis.analyzer.suggest.filter", "standard", "lowercase", "shingler")
.put("index.analysis.filter.shingler.type", "shingle")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,6 @@ type:
|`max_gram` |Defaults to `2`.
|============================

The index level setting `index.max_ngram_diff` controls the maximum allowed
difference between `max_gram` and `min_gram`.

Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,5 @@ used if the position increment is greater than one when a `stop` filter is used
together with the `shingle` filter. Defaults to `"_"`
|=======================================================================

The index level setting `index.max_shingle_diff` controls the maximum allowed
difference between `max_shingle_size` and `min_shingle_size`.
3 changes: 3 additions & 0 deletions docs/reference/analysis/tokenizers/ngram-tokenizer.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,9 @@ value. The smaller the length, the more documents will match but the lower
the quality of the matches. The longer the length, the more specific the
matches. A tri-gram (length `3`) is a good place to start.

The index level setting `index.max_ngram_diff` controls the maximum allowed
difference between `max_gram` and `min_gram`.

[float]
=== Example configuration

Expand Down
10 changes: 10 additions & 0 deletions docs/reference/index-modules.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,16 @@ specific index module:
The maximum number of `script_fields` that are allowed in a query.
Defaults to `32`.

`index.max_ngram_diff`::

The maximum allowed difference between min_gram and max_gram for NGramTokenizer and NGramTokenFilter.
Defaults to `1`.

`index.max_shingle_diff`::

The maximum allowed difference between max_shingle_size and min_shingle_size for ShingleTokenFilter.
Defaults to `3`.

`index.blocks.read_only`::

Set to `true` to make the index and index metadata read only, `false` to
Expand Down
16 changes: 16 additions & 0 deletions docs/reference/migration/migrate_6_0/indices.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -73,3 +73,19 @@ has been removed in index expressions.
Translog files are now kept for up to 12 hours (by default), with a maximum size of `512mb` (default), and
are no longer deleted on `flush`. This is to increase the chance of doing an operation based recovery when
bringing up replicas up to speed.

==== Limit to the difference between max_gram and min_gram in NGramTokenFilter and NGramTokenizer

To safeguard against creating too many index terms, the difference between
max_gram and min_gram in NGramTokenFilter and NGramTokenizer has been
limited to 1. This default limit can be changed with the index setting
index.max_ngram_diff. Note that if the limit is exceeded a deprecation
warning is logged.

==== Limit to the difference between max_shingle_size and min_shingle_size in ShingleTokenFilter

To safeguard against creating too many tokens, the difference between
max_shingle_size and min_shingle_size in ShingleTokenFilter has been
limited to 3. This default limit can be changed with the index setting
index.max_shingle_diff. Note that if the limit is exceeded a deprecation
warning will be logged.
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;



public class NGramTokenFilterFactory extends AbstractTokenFilterFactory {

private final int minGram;
Expand All @@ -36,8 +37,14 @@ public class NGramTokenFilterFactory extends AbstractTokenFilterFactory {

NGramTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
    super(indexSettings, name, settings);
    this.minGram = settings.getAsInt("min_gram", NGramTokenFilter.DEFAULT_MIN_NGRAM_SIZE);
    this.maxGram = settings.getAsInt("max_gram", NGramTokenFilter.DEFAULT_MAX_NGRAM_SIZE);
    // Emit a deprecation warning when the gram spread exceeds the index-level
    // cap (index.max_ngram_diff). NOTE(review): the message says "NGram Tokenizer"
    // even though this is the token filter — apparently copied from
    // NGramTokenizerFactory; left byte-identical since REST tests assert the exact text.
    final int allowedDiff = indexSettings.getMaxNgramDiff();
    if (maxGram - minGram > allowedDiff) {
        deprecationLogger.deprecated("Deprecated big difference between max_gram and min_gram in NGram Tokenizer,"
            + "expected difference must be less than or equal to: [" + allowedDiff + "]");
    }
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.query.Operator;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
Expand Down Expand Up @@ -66,6 +67,7 @@ public void testNgramHighlightingWithBrokenPositions() throws IOException {
.endObject())
.setSettings(Settings.builder()
.put(indexSettings())
.put(IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey(), 19)
.put("analysis.tokenizer.autocomplete.max_gram", 20)
.put("analysis.tokenizer.autocomplete.min_gram", 1)
.put("analysis.tokenizer.autocomplete.token_chars", "letter,digit")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,23 @@
- match: { detail.tokenizer.tokens.2.token: od }

---
"nGram_exception":
- skip:
version: " - 6.0.99"
reason: starting from version 6.1.0 this produces a warning
features: "warnings"
- do:
indices.analyze:
body:
text: good
explain: true
tokenizer:
type: nGram
min_gram: 2
max_gram: 4
warnings:
- "Deprecated big difference between max_gram and min_gram in NGram Tokenizer,expected difference must be less than or equal to: [1]"
---
"simple_pattern":
- do:
indices.analyze:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
settings:
number_of_shards: 1
number_of_replicas: 0
index.max_ngram_diff: 19
analysis:
tokenizer:
my_ngramt:
Expand Down