Add query param to limit highlighting to specified length #67325

Merged (30 commits, Feb 16, 2021)

Commits:
- ed824b0  Add option to limit highlighting to max offset (matriv, Jan 12, 2021)
- 2625d29  fix bwc (matriv, Jan 12, 2021)
- adef3e7  fix version check for bwc (matriv, Jan 12, 2021)
- e93d428  skip for previous versions (matriv, Jan 12, 2021)
- 1b5404a  Add suggestion to use new query param to the exception message (matriv, Jan 14, 2021)
- be553cf  Merge remote-tracking branch 'upstream/master' into limitHighlighting (matriv, Jan 14, 2021)
- 7de3245  reword error message and adjust tests (matriv, Jan 14, 2021)
- e06e510  change wrapping method, fix AnnotatedTextHighlighter (matriv, Jan 14, 2021)
- b5482e3  remove uneccessary override of method (matriv, Jan 14, 2021)
- 29606b0  Merge remote-tracking branch 'upstream/master' into limitHighlighting (matriv, Jan 14, 2021)
- e3e6a19  remove unused import (matriv, Jan 14, 2021)
- 9af770e  Merge remote-tracking branch 'upstream/master' into limitHighlighting (matriv, Jan 14, 2021)
- ce3b7ef  revert remove mode (matriv, Jan 14, 2021)
- 7bffec8  Merge remote-tracking branch 'upstream/master' into limitHighlighting (matriv, Feb 1, 2021)
- 294b1ec  Merge remote-tracking branch 'upstream/master' into limitHighlighting (matriv, Feb 4, 2021)
- 18bdb67  Merge remote-tracking branch 'upstream/master' into limitHighlighting (matriv, Feb 5, 2021)
- af1b108  Merge remote-tracking branch 'upstream/master' into limitHighlighting (matriv, Feb 5, 2021)
- f533369  Merge remote-tracking branch 'upstream/master' into limitHighlighting (matriv, Feb 11, 2021)
- 1e51405  fix licence (matriv, Feb 11, 2021)
- c0fbd91  Change semantics of the new setting. (matriv, Feb 11, 2021)
- 91ca994  revert import order change (matriv, Feb 11, 2021)
- 75e3d42  fix checkstyle (matriv, Feb 11, 2021)
- 95246cf  fix checkstyle (matriv, Feb 11, 2021)
- 410d914  add more tests (matriv, Feb 11, 2021)
- 7a53828  Merge remote-tracking branch 'upstream/master' into limitHighlighting (matriv, Feb 11, 2021)
- 79cb5ac  fix checkstyle (matriv, Feb 12, 2021)
- c9a82a9  Merge remote-tracking branch 'upstream/master' into limitHighlighting (matriv, Feb 12, 2021)
- b7ae8cc  address comments (matriv, Feb 15, 2021)
- c547825  Merge remote-tracking branch 'upstream/master' into limitHighlighting (matriv, Feb 15, 2021)
- edb2bd8  rephrase error message (matriv, Feb 15, 2021)
docs/reference/index-modules.asciidoc (1 addition, 0 deletions)

@@ -223,6 +223,7 @@ specific index module:
The maximum number of tokens that can be produced using the `_analyze` API.
Defaults to `10000`.

[[index-max-analyzed-offset]]
`index.highlight.max_analyzed_offset`::

The maximum number of characters that will be analyzed for a highlight request.
docs/reference/search/search-your-data/highlighting.asciidoc (10 additions, 2 deletions)

@@ -117,7 +117,7 @@ needs highlighting. The `plain` highlighter always uses plain highlighting.
Plain highlighting for large texts may require a substantial amount of time and memory.
To protect against this, the maximum number of text characters that will be analyzed is
limited to 1000000 by default. This limit can be changed
for a particular index with the index setting <<index-max-analyzed-offset,`index.highlight.max_analyzed_offset`>>.
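For instance, raising the limit on a single index is a one-setting update (the index name `my-index` is illustrative):

```console
PUT /my-index/_settings
{
  "index.highlight.max_analyzed_offset": 2000000
}
```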

[discrete]
[[highlighting-settings]]
@@ -242,6 +242,14 @@ require_field_match:: By default, only fields that contain a query match are
highlighted. Set `require_field_match` to `false` to highlight all fields.
Defaults to `true`.

limit_to_max_analyzed_offset:: By default, the number of characters analyzed
for a highlight request is bounded by the
<<index-max-analyzed-offset,`index.highlight.max_analyzed_offset`>> index
setting, and an error is returned when a field exceeds this limit. If this
option is set to `true`, analysis for highlighting instead stops at the
configured maximum: the rest of the text is not processed and therefore not
highlighted, and no error is returned.
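A request body that opts into truncation instead of an error could look like this (the `comment` field name is illustrative):

```json
{
  "query": { "match": { "comment": "fox" } },
  "highlight": {
    "limit_to_max_analyzed_offset": true,
    "fields": { "comment": {} }
  }
}
```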

tags_schema:: Set to `styled` to use the built-in tag schema. The `styled`
schema defines the following `pre_tags` and defines `post_tags` as
`</em>`.
@@ -1119,4 +1127,4 @@ using the passage's `matchStarts` and `matchEnds` information:
I'll be the <em>only</em> <em>fox</em> in the world for you.

These formatted strings are the final result of the highlighter, returned
to the user.
@@ -42,6 +42,7 @@
import org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter;
import org.apache.lucene.search.uhighlight.Snippet;
import org.apache.lucene.search.uhighlight.SplittingBreakIterator;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
import org.apache.lucene.store.Directory;
import org.elasticsearch.common.Strings;
import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedHighlighterAnalyzer;
@@ -59,78 +60,97 @@

public class AnnotatedTextHighlighterTests extends ESTestCase {

private void assertHighlightOneDoc(String fieldName, String[] markedUpInputs,
Query query, Locale locale, BreakIterator breakIterator,
int noMatchSize, String[] expectedPassages) throws Exception {

assertHighlightOneDoc(fieldName, markedUpInputs, query, locale, breakIterator, noMatchSize, expectedPassages,
Integer.MAX_VALUE, false);
}

private void assertHighlightOneDoc(String fieldName, String[] markedUpInputs,
                                   Query query, Locale locale, BreakIterator breakIterator,
                                   int noMatchSize, String[] expectedPassages,
                                   int maxAnalyzedOffset, boolean limitToMaxAnalyzedOffset) throws Exception {

Directory dir = null;
DirectoryReader reader = null;
try {
dir = newDirectory();

// Annotated fields wrap the usual analyzer with one that injects extra tokens
Analyzer wrapperAnalyzer = new AnnotationAnalyzerWrapper(new StandardAnalyzer());
IndexWriterConfig iwc = newIndexWriterConfig(wrapperAnalyzer);
iwc.setMergePolicy(newTieredMergePolicy(random()));
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
FieldType ft = new FieldType(TextField.TYPE_STORED);
if (randomBoolean()) {
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
} else {
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
}
ft.freeze();
Document doc = new Document();
for (String input : markedUpInputs) {
Field field = new Field(fieldName, "", ft);
field.setStringValue(input);
doc.add(field);
}
iw.addDocument(doc);
reader = iw.getReader();
IndexSearcher searcher = newSearcher(reader);
iw.close();

AnnotatedText[] annotations = new AnnotatedText[markedUpInputs.length];
for (int i = 0; i < markedUpInputs.length; i++) {
annotations[i] = AnnotatedText.parse(markedUpInputs[i]);
}
AnnotatedHighlighterAnalyzer hiliteAnalyzer = new AnnotatedHighlighterAnalyzer(wrapperAnalyzer);
hiliteAnalyzer.setAnnotations(annotations);
AnnotatedPassageFormatter passageFormatter = new AnnotatedPassageFormatter(new DefaultEncoder());
passageFormatter.setAnnotations(annotations);

ArrayList<Object> plainTextForHighlighter = new ArrayList<>(annotations.length);
for (int i = 0; i < annotations.length; i++) {
plainTextForHighlighter.add(annotations[i].textMinusMarkup);
}

TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 1, Sort.INDEXORDER);
assertThat(topDocs.totalHits.value, equalTo(1L));
String rawValue = Strings.collectionToDelimitedString(plainTextForHighlighter, String.valueOf(MULTIVAL_SEP_CHAR));
CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(
searcher,
hiliteAnalyzer,
UnifiedHighlighter.OffsetSource.ANALYSIS,
passageFormatter,
locale,
breakIterator,
"index",
"text",
query,
noMatchSize,
expectedPassages.length,
name -> "text".equals(name),
maxAnalyzedOffset,
limitToMaxAnalyzedOffset
);
highlighter.setFieldMatcher((name) -> "text".equals(name));
final Snippet[] snippets = highlighter.highlightField(getOnlyLeafReader(reader), topDocs.scoreDocs[0].doc, () -> rawValue);
assertEquals(expectedPassages.length, snippets.length);
for (int i = 0; i < snippets.length; i++) {
assertEquals(expectedPassages[i], snippets[i].getText());
}
} finally {
[Review comment, Contributor] You can use try-with-resources here I think?

[matriv, Contributor Author, Jan 12, 2021] Thought about it, but we would need
two tries, something like this:
https://gist.github.com/matriv/ca6ca5fe191c8e0b1e49af282ec90669
so I preferred the traditional way, but happy to change if you prefer
try-with-resources.

if (reader != null) {
reader.close();
}
if (dir != null) {
dir.close();
}
}
}

public void testAnnotatedTextStructuredMatch() throws Exception {
// Check that a structured token eg a URL can be highlighted in a query
// on marked-up
@@ -202,4 +222,33 @@ public void testBadAnnotation() throws Exception {
assertHighlightOneDoc("text", markedUpInputs, query, Locale.ROOT, breakIterator, 0, expectedPassages);
}

public void testExceedMaxAnalyzedOffset() throws Exception {
TermQuery query = new TermQuery(new Term("text", "exceeds"));
BreakIterator breakIterator = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR);
assertHighlightOneDoc("text", new String[] { "[Short Text](Short+Text)" }, query, Locale.ROOT, breakIterator, 0, new String[] {});

IllegalArgumentException e = expectThrows(
IllegalArgumentException.class,
() -> assertHighlightOneDoc(
"text",
new String[] { "[Long Text exceeds](Long+Text+exceeds) MAX analyzed offset)" },
query,
Locale.ROOT,
breakIterator,
0,
new String[] {},
15,
false
)
);
assertEquals(
"The length of [text] field of [0] doc of [index] index has exceeded [15] - maximum allowed to be analyzed for "
+ "highlighting. This maximum can be set by changing the [index.highlight.max_analyzed_offset] index level setting. "
+ "For large texts, indexing with offsets or term vectors is recommended!",
e.getMessage()
);

assertHighlightOneDoc("text", new String[] { "[Long Text exceeds](Long+Text+exceeds) MAX analyzed offset" },
query, Locale.ROOT, breakIterator, 0, new String[] {"Long Text [exceeds](_hit_term=exceeds) MAX analyzed offset"}, 15, true);
}
}
@@ -6,7 +6,7 @@ setup:
body:
settings:
number_of_shards: 1
index.highlight.max_analyzed_offset: 10
index.highlight.max_analyzed_offset: 20
mappings:
properties:
field1:
@@ -37,6 +37,16 @@ setup:
body: {"query" : {"match" : {"field1" : "fox"}}, "highlight" : {"type" : "unified", "fields" : {"field1" : {}}}}
- match: { error.root_cause.0.type: "illegal_argument_exception" }

---
"Unified highlighter on a field WITHOUT OFFSETS exceeding index.highlight.max_analyzed_offset with limit_to_max_analyzed_offset=true should SUCCEED":

- do:
search:
rest_total_hits_as_int: true
index: test1
body: {"query" : {"match" : {"field1" : "fox"}}, "highlight" : {"type" : "unified", "fields" : {"field1" : {}}, "limit_to_max_analyzed_offset": "true"}}
- match: {hits.hits.0.highlight.field1.0: "The quick brown <em>fox</em> went to the forest and saw another fox."}


---
"Plain highlighter on a field WITHOUT OFFSETS exceeding index.highlight.max_analyzed_offset should FAIL":
@@ -46,9 +56,19 @@ setup:
search:
rest_total_hits_as_int: true
index: test1
body: {"query" : {"match" : {"field1" : "fox"}}, "highlight" : {"type" : "plain", "fields" : {"field1" : {}}}}
- match: { error.root_cause.0.type: "illegal_argument_exception" }

---
"Plain highlighter on a field WITHOUT OFFSETS exceeding index.highlight.max_analyzed_offset with limit_to_max_analyzed_offset=true should SUCCEED":

- do:
search:
rest_total_hits_as_int: true
index: test1
body: {"query" : {"match" : {"field1" : "fox"}}, "highlight" : {"type" : "plain", "fields" : {"field1" : {}}, "limit_to_max_analyzed_offset": "true"}}
- match: {hits.hits.0.highlight.field1.0: "The quick brown <em>fox</em> went to the forest and saw another fox."}


---
"Unified highlighter on a field WITH OFFSETS exceeding index.highlight.max_analyzed_offset should SUCCEED":
@@ -71,3 +91,13 @@ setup:
index: test1
body: {"query" : {"match" : {"field2" : "fox"}}, "highlight" : {"type" : "plain", "fields" : {"field2" : {}}}}
- match: { error.root_cause.0.type: "illegal_argument_exception" }

---
"Plain highlighter on a field WITH OFFSETS exceeding index.highlight.max_analyzed_offset with limit_to_max_analyzed_offset=true should SUCCEED":

- do:
search:
rest_total_hits_as_int: true
index: test1
body: {"query" : {"match" : {"field2" : "fox"}}, "highlight" : {"type" : "plain", "fields" : {"field2" : {}}, "limit_to_max_analyzed_offset": "true"}}
- match: {hits.hits.0.highlight.field2.0: "The quick brown <em>fox</em> went to the forest and saw another fox."}
@@ -65,6 +65,7 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
private final int noMatchSize;
private final FieldHighlighter fieldHighlighter;
private final int maxAnalyzedOffset;
private final boolean limitToMaxAnalyzedOffset;

/**
* Creates a new instance of {@link CustomUnifiedHighlighter}
@@ -96,7 +97,8 @@ public CustomUnifiedHighlighter(IndexSearcher searcher,
int noMatchSize,
int maxPassages,
Predicate<String> fieldMatcher,
int maxAnalyzedOffset,
boolean limitToMaxAnalyzedOffset) throws IOException {
super(searcher, analyzer);
this.offsetSource = offsetSource;
this.breakIterator = breakIterator;
@@ -107,6 +109,7 @@ public CustomUnifiedHighlighter(IndexSearcher searcher,
this.noMatchSize = noMatchSize;
this.setFieldMatcher(fieldMatcher);
this.maxAnalyzedOffset = maxAnalyzedOffset;
this.limitToMaxAnalyzedOffset = limitToMaxAnalyzedOffset;
fieldHighlighter = getFieldHighlighter(field, query, extractTerms(query), maxPassages);
}

@@ -123,7 +126,7 @@ public Snippet[] highlightField(LeafReader reader, int docId, CheckedSupplier<St
return null;
}
int fieldValueLength = fieldValue.length();
if (limitToMaxAnalyzedOffset == false && offsetSource == OffsetSource.ANALYSIS && fieldValueLength > maxAnalyzedOffset) {
throw new IllegalArgumentException(
"The length of ["
+ field
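Stripped of the Lucene details, the guard that `limitToMaxAnalyzedOffset` changes can be sketched in Python. The names mirror the Java fields, and modeling the limit as plain string truncation is an assumption: the Java change only skips the error and lets analysis stop at the configured offset.

```python
# Sketch of the guard in CustomUnifiedHighlighter.highlightField (assumed
# semantics): with the flag off, fields that require analysis and exceed the
# limit are rejected; with it on, analysis simply stops at the limit.

ANALYSIS = "analysis"  # stands in for UnifiedHighlighter.OffsetSource.ANALYSIS

def check_highlight(field_value: str, offset_source: str,
                    max_analyzed_offset: int,
                    limit_to_max_analyzed_offset: bool) -> str:
    if (not limit_to_max_analyzed_offset
            and offset_source == ANALYSIS
            and len(field_value) > max_analyzed_offset):
        raise ValueError(
            f"The length of the field has exceeded [{max_analyzed_offset}] - "
            "maximum allowed to be analyzed for highlighting"
        )
    # With the flag set, only the first max_analyzed_offset characters are fed
    # to analysis; the remainder is left unprocessed (and unhighlighted).
    if limit_to_max_analyzed_offset and len(field_value) > max_analyzed_offset:
        return field_value[:max_analyzed_offset]
    return field_value
```

This mirrors the three YAML test cases above: short fields pass, long analyzed fields fail by default, and the same long fields succeed with truncated analysis once the flag is set.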