Add analysis modes to restrict token filter use contexts (elastic#36103)

Christoph Büscher · web-flow · commit ef18d3fb5b21 · 2019-03-12T23:46:55.000+01:00
Currently token filter settings are treated as fixed once they are declared and used in an analyzer. This is done to prevent changes in analyzers that are already used actively to index documents, since changes to the analysis chain could corrupt the index. However, it would be safe to allow updates to token filters at search time ("search_analyzer"). This change introduces a new property of token filters that allows to mark them as only being usable at search or at index time. Any analyzer that uses these tokenfilters inherits that property and can be rejected if they are used in other contexts. This is a first step towards making specific token filters (e.g. synonym filter) updateable. Relates to elastic#29051
diff --git a/server/src/main/java/org/elasticsearch/index/analysis/AnalysisMode.java b/server/src/main/java/org/elasticsearch/index/analysis/AnalysisMode.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+/**
+ * Enum representing the mode in which token filters and analyzers are allowed to operate.
+ * While most token filters are allowed both in index and search time analyzers, some are
+ * restricted to be used only at index time, others at search time.
+ */
+public enum AnalysisMode {
+
+    /**
+     * AnalysisMode representing analysis components that can be used only at index time
+     */
+    INDEX_TIME("index time") {
+        @Override
+        public AnalysisMode merge(AnalysisMode other) {
+            if (other == AnalysisMode.SEARCH_TIME) {
+                throw new IllegalStateException("Cannot merge SEARCH_TIME and INDEX_TIME analysis mode.");
+            }
+            return AnalysisMode.INDEX_TIME;
+        }
+    },
+    /**
+     * AnalysisMode representing analysis components that can be used only at search time
+     */
+    SEARCH_TIME("search time") {
+        @Override
+        public AnalysisMode merge(AnalysisMode other) {
+            if (other == AnalysisMode.INDEX_TIME) {
+                throw new IllegalStateException("Cannot merge SEARCH_TIME and INDEX_TIME analysis mode.");
+            }
+            return AnalysisMode.SEARCH_TIME;
+        }
+    },
+    /**
+     * AnalysisMode representing analysis components that can be used both at index and search time
+     */
+    ALL("all") {
+        @Override
+        public AnalysisMode merge(AnalysisMode other) {
+            return other;
+        }
+    };
+
+    private String readableName;
+
+    AnalysisMode(String name) {
+        this.readableName = name;
+    }
+
+    public String getReadableName() {
+        return this.readableName;
+    }
+
+    /**
+     * Returns a mode that is compatible with both this mode and the other mode, that is:
+     * <ul>
+     * <li>ALL.merge(INDEX_TIME) == INDEX_TIME</li>
+     * <li>ALL.merge(SEARCH_TIME) == SEARCH_TIME</li>
+     * <li>INDEX_TIME.merge(SEARCH_TIME) throws an {@link IllegalStateException}</li>
+     * </ul>
+     */
+    abstract AnalysisMode merge(AnalysisMode other);
+}
diff --git a/server/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java b/server/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java
@@ -20,11 +20,11 @@
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
-import org.elasticsearch.Version;
-import org.elasticsearch.core.internal.io.IOUtils;
 import org.elasticsearch.ElasticsearchException;
+import org.elasticsearch.Version;
 import org.elasticsearch.cluster.metadata.IndexMetaData;
 import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.core.internal.io.IOUtils;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.index.IndexSettings;
@@ -456,6 +456,7 @@ public IndexAnalyzers build(IndexSettings indexSettings,
         if (defaultAnalyzer == null) {
             throw new IllegalArgumentException("no default analyzer configured");
         }
+        defaultAnalyzer.checkAllowedInMode(AnalysisMode.ALL);
         if (analyzers.containsKey("default_index")) {
             throw new IllegalArgumentException("setting [index.analysis.analyzer.default_index] is not supported anymore, use " +
                 "[index.analysis.analyzer.default] instead for index [" + index.getName() + "]");
diff --git a/server/src/main/java/org/elasticsearch/index/analysis/CustomAnalyzer.java b/server/src/main/java/org/elasticsearch/index/analysis/CustomAnalyzer.java
@@ -36,6 +36,7 @@ public final class CustomAnalyzer extends Analyzer {
 
     private final int positionIncrementGap;
     private final int offsetGap;
+    private final AnalysisMode analysisMode;
 
     public CustomAnalyzer(String tokenizerName, TokenizerFactory tokenizerFactory, CharFilterFactory[] charFilters,
             TokenFilterFactory[] tokenFilters) {
@@ -50,6 +51,12 @@ public CustomAnalyzer(String tokenizerName, TokenizerFactory tokenizerFactory, C
         this.tokenFilters = tokenFilters;
         this.positionIncrementGap = positionIncrementGap;
         this.offsetGap = offsetGap;
+        // merge and transfer token filter analysis modes with analyzer
+        AnalysisMode mode = AnalysisMode.ALL;
+        for (TokenFilterFactory f : tokenFilters) {
+            mode = mode.merge(f.getAnalysisMode());
+        }
+        this.analysisMode = mode;
     }
 
     /**
@@ -84,6 +91,10 @@ public int getOffsetGap(String field) {
         return this.offsetGap;
     }
 
+    public AnalysisMode getAnalysisMode() {
+        return this.analysisMode;
+    }
+
     @Override
     protected TokenStreamComponents createComponents(String fieldName) {
         Tokenizer tokenizer = tokenizerFactory.create();
diff --git a/server/src/main/java/org/elasticsearch/index/analysis/NamedAnalyzer.java b/server/src/main/java/org/elasticsearch/index/analysis/NamedAnalyzer.java
@@ -21,7 +21,10 @@
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.DelegatingAnalyzerWrapper;
+import org.elasticsearch.index.mapper.MapperException;
 
+import java.util.ArrayList;
+import java.util.List;
 import java.util.Objects;
 
 /**
@@ -34,6 +37,7 @@ public class NamedAnalyzer extends DelegatingAnalyzerWrapper {
     private final AnalyzerScope scope;
     private final Analyzer analyzer;
     private final int positionIncrementGap;
+    private final AnalysisMode analysisMode;
 
     public NamedAnalyzer(NamedAnalyzer analyzer, int positionIncrementGap) {
         this(analyzer.name(), analyzer.scope(), analyzer.analyzer(), positionIncrementGap);
@@ -43,12 +47,17 @@ public NamedAnalyzer(String name, AnalyzerScope scope, Analyzer analyzer) {
         this(name, scope, analyzer, Integer.MIN_VALUE);
     }
 
-    public NamedAnalyzer(String name, AnalyzerScope scope, Analyzer analyzer, int positionIncrementGap) {
+    NamedAnalyzer(String name, AnalyzerScope scope, Analyzer analyzer, int positionIncrementGap) {
         super(ERROR_STRATEGY);
         this.name = name;
         this.scope = scope;
         this.analyzer = analyzer;
         this.positionIncrementGap = positionIncrementGap;
+        if (analyzer instanceof org.elasticsearch.index.analysis.CustomAnalyzer) {
+            this.analysisMode = ((org.elasticsearch.index.analysis.CustomAnalyzer) analyzer).getAnalysisMode();
+        } else {
+            this.analysisMode = AnalysisMode.ALL;
+        }
     }
 
     /**
@@ -65,6 +74,13 @@ public AnalyzerScope scope() {
         return this.scope;
     }
 
+    /**
+     * Returns whether this analyzer can be updated
+     */
+    public AnalysisMode getAnalysisMode() {
+        return this.analysisMode;
+    }
+
     /**
      * The actual analyzer.
      */
@@ -85,9 +101,37 @@ public int getPositionIncrementGap(String fieldName) {
         return super.getPositionIncrementGap(fieldName);
     }
 
+    /**
+     * Checks the wrapped analyzer for the provided restricted {@link AnalysisMode} and throws
+     * an error if the analyzer is not allowed to run in that mode. The error contains more detailed information about
+     * the offending filters that caused the analyzer to not be allowed in this mode.
+     */
+    public void checkAllowedInMode(AnalysisMode mode) {
+        Objects.requireNonNull(mode);
+        if (this.getAnalysisMode() == AnalysisMode.ALL) {
+            return; // everything allowed if this analyzer is in ALL mode
+        }
+        if (this.getAnalysisMode() != mode) {
+            if (analyzer instanceof CustomAnalyzer) {
+                TokenFilterFactory[] tokenFilters = ((CustomAnalyzer) analyzer).tokenFilters();
+                List<String> offendingFilters = new ArrayList<>();
+                for (TokenFilterFactory tokenFilter : tokenFilters) {
+                    if (tokenFilter.getAnalysisMode() != mode) {
+                        offendingFilters.add(tokenFilter.name());
+                    }
+                }
+                throw new MapperException("analyzer [" + name + "] contains filters " + offendingFilters
+                        + " that are not allowed to run in " + mode.getReadableName() + " mode.");
+            } else {
+                throw new MapperException(
+                        "analyzer [" + name + "] contains components that are not allowed to run in " + mode.getReadableName() + " mode.");
+            }
+        }
+    }
+
     @Override
     public String toString() {
-        return "analyzer name[" + name + "], analyzer [" + analyzer + "]";
+        return "analyzer name[" + name + "], analyzer [" + analyzer + "], analysisMode [" + analysisMode + "]";
     }
 
     /** It is an error if this is ever used, it means we screwed up! */
diff --git a/server/src/main/java/org/elasticsearch/index/analysis/TokenFilterFactory.java b/server/src/main/java/org/elasticsearch/index/analysis/TokenFilterFactory.java
@@ -74,6 +74,15 @@ default TokenFilterFactory getSynonymFilter() {
         return this;
     }
 
+    /**
+     * Get the {@link AnalysisMode} this filter is allowed to be used in. The default is
+     * {@link AnalysisMode#ALL}. Instances need to override this method to define their
+     * own restrictions.
+     */
+    default AnalysisMode getAnalysisMode() {
+        return AnalysisMode.ALL;
+    }
+
     /**
      * A TokenFilterFactory that does no filtering to its TokenStream
      */
diff --git a/server/src/main/java/org/elasticsearch/index/mapper/TypeParsers.java b/server/src/main/java/org/elasticsearch/index/mapper/TypeParsers.java
@@ -23,6 +23,7 @@
 import org.elasticsearch.ElasticsearchParseException;
 import org.elasticsearch.common.time.DateFormatter;
 import org.elasticsearch.common.xcontent.support.XContentMapValues;
+import org.elasticsearch.index.analysis.AnalysisMode;
 import org.elasticsearch.index.analysis.NamedAnalyzer;
 import org.elasticsearch.index.similarity.SimilarityProvider;
 
@@ -80,18 +81,37 @@ private static void parseAnalyzersAndTermVectors(FieldMapper.Builder builder, St
                 if (analyzer == null) {
                     throw new MapperParsingException("analyzer [" + propNode.toString() + "] not found for field [" + name + "]");
                 }
+                analyzer.checkAllowedInMode(AnalysisMode.SEARCH_TIME);
                 searchAnalyzer = analyzer;
                 iterator.remove();
             } else if (propName.equals("search_quote_analyzer")) {
                 NamedAnalyzer analyzer = parserContext.getIndexAnalyzers().get(propNode.toString());
                 if (analyzer == null) {
                     throw new MapperParsingException("analyzer [" + propNode.toString() + "] not found for field [" + name + "]");
                 }
+                analyzer.checkAllowedInMode(AnalysisMode.SEARCH_TIME);
                 searchQuoteAnalyzer = analyzer;
                 iterator.remove();
             }
         }
 
+        // check analyzers are allowed to work in the respective AnalysisMode
+        {
+            if (indexAnalyzer != null) {
+                if (searchAnalyzer == null) {
+                    indexAnalyzer.checkAllowedInMode(AnalysisMode.ALL);
+                } else {
+                    indexAnalyzer.checkAllowedInMode(AnalysisMode.INDEX_TIME);
+                }
+            }
+            if (searchAnalyzer != null) {
+                searchAnalyzer.checkAllowedInMode(AnalysisMode.SEARCH_TIME);
+            }
+            if (searchQuoteAnalyzer != null) {
+                searchQuoteAnalyzer.checkAllowedInMode(AnalysisMode.SEARCH_TIME);
+            }
+        }
+
         if (indexAnalyzer == null && searchAnalyzer != null) {
             throw new MapperParsingException("analyzer on field [" + name + "] must be set when search_analyzer is set");
         }
diff --git a/server/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java b/server/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java
@@ -20,6 +20,8 @@
 package org.elasticsearch.index.analysis;
 
 import com.carrotsearch.randomizedtesting.generators.RandomPicks;
+
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.MockTokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.en.EnglishAnalyzer;
@@ -31,6 +33,7 @@
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.env.TestEnvironment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.mapper.MapperException;
 import org.elasticsearch.indices.analysis.AnalysisModule;
 import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
 import org.elasticsearch.indices.analysis.PreBuiltAnalyzers;
@@ -102,6 +105,29 @@ public void testOverrideDefaultAnalyzer() throws IOException {
         assertThat(indexAnalyzers.getDefaultSearchQuoteAnalyzer().analyzer(), instanceOf(EnglishAnalyzer.class));
     }
 
+    public void testOverrideDefaultAnalyzerWithoutAnalysisModeAll() throws IOException {
+        Version version = VersionUtils.randomVersion(random());
+        Settings settings = Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, version).build();
+        TokenFilterFactory tokenFilter = new AbstractTokenFilterFactory(IndexSettingsModule.newIndexSettings("index", settings),
+                "my_filter", Settings.EMPTY) {
+            @Override
+            public AnalysisMode getAnalysisMode() {
+                return randomFrom(AnalysisMode.SEARCH_TIME, AnalysisMode.INDEX_TIME);
+            }
+
+            @Override
+            public TokenStream create(TokenStream tokenStream) {
+                return null;
+            }
+        };
+        Analyzer analyzer = new CustomAnalyzer("tokenizerName", null, new CharFilterFactory[0], new TokenFilterFactory[] { tokenFilter });
+        MapperException ex = expectThrows(MapperException.class,
+                () -> emptyRegistry.build(IndexSettingsModule.newIndexSettings("index", settings),
+                        singletonMap("default", new PreBuiltAnalyzerProvider("my_analyzer", AnalyzerScope.INDEX, analyzer)), emptyMap(),
+                        emptyMap(), emptyMap(), emptyMap()));
+        assertEquals("analyzer [my_analyzer] contains filters [my_filter] that are not allowed to run in all mode.", ex.getMessage());
+    }
+
     public void testOverrideDefaultIndexAnalyzerIsUnsupported() {
         Version version = VersionUtils.randomVersionBetween(random(), Version.V_6_0_0_alpha1, Version.CURRENT);
         Settings settings = Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, version).build();
diff --git a/server/src/test/java/org/elasticsearch/index/analysis/NamedAnalyzerTests.java b/server/src/test/java/org/elasticsearch/index/analysis/NamedAnalyzerTests.java
diff --git a/server/src/test/java/org/elasticsearch/index/mapper/TypeParsersTests.java b/server/src/test/java/org/elasticsearch/index/mapper/TypeParsersTests.java