Skip to content

Commit 29d8c2f

Browse files
Added lenient option for WordnetSynonymParser
-- also added more documentation
1 parent bdc0ce1 commit 29d8c2f

File tree

8 files changed

+258
-20
lines changed

8 files changed

+258
-20
lines changed

docs/reference/analysis/tokenfilters/synonym-graph-tokenfilter.asciidoc

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,40 @@ The above configures a `search_synonyms` filter, with a path of
5454
Additional settings are:
5555

5656
* `expand` (defaults to `true`).
57-
* `lenient` (defaults to `false`). If `true` ignores exceptions while parsing the synonym configuration.
57+
* `lenient` (defaults to `false`). If `true`, ignores exceptions while parsing the synonym configuration. It is important
58+
to note that only those synonym rules which cannot be parsed are ignored. For instance, consider the following request:
59+
60+
[source,js]
61+
--------------------------------------------------
62+
PUT /test_index
63+
{
64+
"settings": {
65+
"index" : {
66+
"analysis" : {
67+
"analyzer" : {
68+
"synonym" : {
69+
"tokenizer" : "standard",
70+
"filter" : ["my_stop", "synonym_graph"]
71+
}
72+
},
73+
"filter" : {
74+
"my_stop": {
75+
"type" : "stop",
76+
"stopwords": ["bar"]
77+
},
78+
"synonym_graph" : {
79+
"type" : "synonym_graph",
80+
"lenient": true,
81+
"synonyms" : ["foo, bar => baz"]
82+
}
83+
}
84+
}
85+
}
86+
}
87+
}
88+
--------------------------------------------------
89+
// CONSOLE
90+
With the above request, the word `bar` is skipped but the mapping `foo => baz` is still added.
5891

5992
[float]
6093
==== `tokenizer` and `ignore_case` are deprecated

docs/reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc

Lines changed: 37 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,13 +35,47 @@ The above configures a `synonym` filter, with a path of
3535
`analysis/synonym.txt` (relative to the `config` location). The
3636
`synonym` analyzer is then configured with the filter.
3737

38+
This filter tokenizes synonyms with whatever tokenizer and token filters
39+
appear before it in the chain.
40+
3841
Additional settings are:
3942

4043
* `expand` (defaults to `true`).
41-
* `lenient` (defaults to `false`). If `true` ignores exceptions while parsing the synonym configuration.
44+
* `lenient` (defaults to `false`). If `true`, ignores exceptions while parsing the synonym configuration. It is important
45+
to note that only those synonym rules which cannot be parsed are ignored. For instance, consider the following request:
46+
47+
[source,js]
48+
--------------------------------------------------
49+
PUT /test_index
50+
{
51+
"settings": {
52+
"index" : {
53+
"analysis" : {
54+
"analyzer" : {
55+
"synonym" : {
56+
"tokenizer" : "standard",
57+
"filter" : ["my_stop", "synonym"]
58+
}
59+
},
60+
"filter" : {
61+
"my_stop": {
62+
"type" : "stop",
63+
"stopwords": ["bar"]
64+
},
65+
"synonym" : {
66+
"type" : "synonym",
67+
"lenient": true,
68+
"synonyms" : ["foo, bar => baz"]
69+
}
70+
}
71+
}
72+
}
73+
}
74+
}
75+
--------------------------------------------------
76+
// CONSOLE
77+
With the above request, the word `bar` is skipped but the mapping `foo => baz` is still added.
4278

43-
This filter tokenize synonyms with whatever tokenizer and token filters
44-
appear before it in the chain.
4579

4680
[float]
4781
==== `tokenizer` and `ignore_case` are deprecated

server/src/main/java/org/elasticsearch/index/analysis/ElasticsearchSynonymParser.java renamed to server/src/main/java/org/elasticsearch/index/analysis/ESSolrSynonymParser.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,13 +28,13 @@
2828

2929
import java.io.IOException;
3030

31-
public class ElasticsearchSynonymParser extends SolrSynonymParser {
31+
public class ESSolrSynonymParser extends SolrSynonymParser {
3232

3333
private final boolean lenient;
3434
private static final Logger logger =
35-
Loggers.getLogger(ElasticsearchSynonymParser.class, "ElasticsearchSynonymParser");
35+
Loggers.getLogger(ESSolrSynonymParser.class, "ESSolrSynonymParser");
3636

37-
public ElasticsearchSynonymParser(boolean dedup, boolean expand, boolean lenient, Analyzer analyzer) {
37+
public ESSolrSynonymParser(boolean dedup, boolean expand, boolean lenient, Analyzer analyzer) {
3838
super(dedup, expand, analyzer);
3939
this.lenient = lenient;
4040
}
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
/*
2+
* Licensed to Elasticsearch under one or more contributor
3+
* license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright
5+
* ownership. Elasticsearch licenses this file to you under
6+
* the Apache License, Version 2.0 (the "License"); you may
7+
* not use this file except in compliance with the License.
8+
* You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.elasticsearch.index.analysis;
21+
22+
import org.apache.logging.log4j.Logger;
23+
import org.apache.lucene.analysis.Analyzer;
24+
import org.apache.lucene.analysis.synonym.WordnetSynonymParser;
25+
import org.apache.lucene.util.CharsRef;
26+
import org.apache.lucene.util.CharsRefBuilder;
27+
import org.elasticsearch.common.logging.Loggers;
28+
29+
import java.io.IOException;
30+
31+
public class ESWordnetSynonymParser extends WordnetSynonymParser {
32+
33+
private final boolean lenient;
34+
private static final Logger logger =
35+
Loggers.getLogger(ESSolrSynonymParser.class, "ESWordnetSynonymParser");
36+
37+
public ESWordnetSynonymParser(boolean dedup, boolean expand, boolean lenient, Analyzer analyzer) {
38+
super(dedup, expand, analyzer);
39+
this.lenient = lenient;
40+
}
41+
42+
@Override
43+
public void add(CharsRef input, CharsRef output, boolean includeOrig) {
44+
// This condition follows up on the overridden analyze method. In case lenient was set to true and there was an
45+
// exception during super.analyze we return a zero-length CharsRef for that word which caused an exception. When
46+
// the synonym mappings for the words are added using the add method we skip the ones that were left empty by
47+
// analyze i.e., in the case when lenient is set we only add those combinations which are non-zero-length. The
48+
// else would happen only in the case when the input or output is empty and lenient is set, in which case we
49+
// quietly ignore it. For more details on the control-flow see SolrSynonymParser::addInternal.
50+
if (lenient == false || (input.length > 0 && output.length > 0)) {
51+
super.add(input, output, includeOrig);
52+
}
53+
}
54+
55+
@Override
56+
public CharsRef analyze(String text, CharsRefBuilder reuse) throws IOException {
57+
try {
58+
return super.analyze(text, reuse);
59+
} catch (IllegalArgumentException ex) {
60+
if (lenient) {
61+
logger.info("Synonym rule for [" + text + "] was ignored");
62+
return new CharsRef("");
63+
} else {
64+
throw ex;
65+
}
66+
}
67+
}
68+
}

server/src/main/java/org/elasticsearch/index/analysis/SynonymGraphTokenFilterFactory.java

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323
import org.apache.lucene.analysis.TokenStream;
2424
import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
2525
import org.apache.lucene.analysis.synonym.SynonymMap;
26-
import org.apache.lucene.analysis.synonym.WordnetSynonymParser;
2726
import org.elasticsearch.common.settings.Settings;
2827
import org.elasticsearch.env.Environment;
2928
import org.elasticsearch.index.IndexSettings;
@@ -57,11 +56,11 @@ public Factory(String name, final Analyzer analyzerForParseSynonym, Reader rules
5756
try {
5857
SynonymMap.Builder parser;
5958
if ("wordnet".equalsIgnoreCase(format)) {
60-
parser = new WordnetSynonymParser(true, expand, analyzerForParseSynonym);
61-
((WordnetSynonymParser) parser).parse(rulesReader);
59+
parser = new ESWordnetSynonymParser(true, expand, lenient, analyzerForParseSynonym);
60+
((ESWordnetSynonymParser) parser).parse(rulesReader);
6261
} else {
63-
parser = new ElasticsearchSynonymParser(true, expand, lenient, analyzerForParseSynonym);
64-
((ElasticsearchSynonymParser) parser).parse(rulesReader);
62+
parser = new ESSolrSynonymParser(true, expand, lenient, analyzerForParseSynonym);
63+
((ESSolrSynonymParser) parser).parse(rulesReader);
6564
}
6665
synonymMap = parser.build();
6766
} catch (Exception e) {

server/src/main/java/org/elasticsearch/index/analysis/SynonymTokenFilterFactory.java

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323
import org.apache.lucene.analysis.TokenStream;
2424
import org.apache.lucene.analysis.synonym.SynonymFilter;
2525
import org.apache.lucene.analysis.synonym.SynonymMap;
26-
import org.apache.lucene.analysis.synonym.WordnetSynonymParser;
2726
import org.elasticsearch.common.settings.Settings;
2827
import org.elasticsearch.env.Environment;
2928
import org.elasticsearch.index.IndexSettings;
@@ -94,11 +93,11 @@ public Factory(String name, Analyzer analyzerForParseSynonym, Reader rulesReader
9493
try {
9594
SynonymMap.Builder parser;
9695
if ("wordnet".equalsIgnoreCase(format)) {
97-
parser = new WordnetSynonymParser(true, expand, analyzerForParseSynonym);
98-
((WordnetSynonymParser) parser).parse(rulesReader);
96+
parser = new ESWordnetSynonymParser(true, expand, lenient, analyzerForParseSynonym);
97+
((ESWordnetSynonymParser) parser).parse(rulesReader);
9998
} else {
100-
parser = new ElasticsearchSynonymParser(true, expand, lenient, analyzerForParseSynonym);
101-
((ElasticsearchSynonymParser) parser).parse(rulesReader);
99+
parser = new ESSolrSynonymParser(true, expand, lenient, analyzerForParseSynonym);
100+
((ESSolrSynonymParser) parser).parse(rulesReader);
102101
}
103102
synonymMap = parser.build();
104103
} catch (Exception e) {

server/src/test/java/org/elasticsearch/index/analysis/ElasticsearchSynonymParserTests.java renamed to server/src/test/java/org/elasticsearch/index/analysis/ESSolrSynonymParserTests.java

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919

2020
package org.elasticsearch.index.analysis;
2121

22+
import org.apache.lucene.analysis.CharArraySet;
23+
import org.apache.lucene.analysis.StopFilter;
2224
import org.apache.lucene.analysis.TokenStream;
2325
import org.apache.lucene.analysis.Tokenizer;
2426
import org.apache.lucene.analysis.standard.StandardAnalyzer;
@@ -33,10 +35,10 @@
3335

3436
import static org.hamcrest.Matchers.containsString;
3537

36-
public class ElasticsearchSynonymParserTests extends ESTokenStreamTestCase {
38+
public class ESSolrSynonymParserTests extends ESTokenStreamTestCase {
3739

3840
public void testLenientParser() throws IOException, ParseException {
39-
ElasticsearchSynonymParser parser = new ElasticsearchSynonymParser(true, false, true, new StandardAnalyzer());
41+
ESSolrSynonymParser parser = new ESSolrSynonymParser(true, false, true, new StandardAnalyzer());
4042
String rules =
4143
"&,and\n" +
4244
"come,advance,approach\n";
@@ -49,8 +51,23 @@ public void testLenientParser() throws IOException, ParseException {
4951
assertTokenStreamContents(ts, new String[]{"come", "quietly", "then", "come", "destroy"});
5052
}
5153

54+
public void testLenientParserWithSomeIncorrectLines() throws IOException, ParseException {
55+
CharArraySet stopSet = new CharArraySet(1, true);
56+
stopSet.add("bar");
57+
ESSolrSynonymParser parser =
58+
new ESSolrSynonymParser(true, false, true, new StandardAnalyzer(stopSet));
59+
String rules = "foo,bar,baz";
60+
StringReader rulesReader = new StringReader(rules);
61+
parser.parse(rulesReader);
62+
SynonymMap synonymMap = parser.build();
63+
Tokenizer tokenizer = new StandardTokenizer();
64+
tokenizer.setReader(new StringReader("first word is foo, then bar and lastly baz"));
65+
TokenStream ts = new SynonymFilter(new StopFilter(tokenizer, stopSet), synonymMap, false);
66+
assertTokenStreamContents(ts, new String[]{"first", "word", "is", "foo", "then", "and", "lastly", "foo"});
67+
}
68+
5269
public void testNonLenientParser() {
53-
ElasticsearchSynonymParser parser = new ElasticsearchSynonymParser(true, false, false, new StandardAnalyzer());
70+
ESSolrSynonymParser parser = new ESSolrSynonymParser(true, false, false, new StandardAnalyzer());
5471
String rules =
5572
"&,and=>and\n" +
5673
"come,advance,approach\n";
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.elasticsearch.test.ESTokenStreamTestCase;

import java.io.IOException;
import java.io.StringReader;
import java.text.ParseException;

import static org.hamcrest.Matchers.containsString;

/**
 * Tests for {@link ESWordnetSynonymParser}, covering the {@code lenient} flag
 * in both modes. Rules are written in the WordNet prolog format:
 * {@code s(synset_id, word_number, 'word', pos, sense_number, tag_count).}
 */
public class ESWordnetSynonymParserTests extends ESTokenStreamTestCase {

    // Lenient mode: the '&' rule is dropped by StandardAnalyzer (punctuation-only
    // token), but the remaining well-formed rules still produce synonym mappings.
    public void testLenientParser() throws IOException, ParseException {
        ESWordnetSynonymParser parser = new ESWordnetSynonymParser(true, false, true, new StandardAnalyzer());
        String rules =
            "s(100000001,1,'&',a,1,0).\n" +
            "s(100000001,2,'and',a,1,0).\n" +
            "s(100000002,1,'come',v,1,0).\n" +
            "s(100000002,2,'advance',v,1,0).\n" +
            "s(100000002,3,'approach',v,1,0).";
        StringReader rulesReader = new StringReader(rules);
        parser.parse(rulesReader);
        SynonymMap synonymMap = parser.build();
        Tokenizer tokenizer = new StandardTokenizer();
        tokenizer.setReader(new StringReader("approach quietly then advance & destroy"));
        TokenStream ts = new SynonymFilter(tokenizer, synonymMap, false);
        // "approach" and "advance" map to "come"; "&" produced no mapping.
        assertTokenStreamContents(ts, new String[]{"come", "quietly", "then", "come", "destroy"});
    }

    // Lenient mode with a stop word in the rules: only the rule containing "bar"
    // (removed by the stop filter during analysis) is skipped; the mapping between
    // the surviving terms ("foo" <-> "baz") is still added.
    public void testLenientParserWithSomeIncorrectLines() throws IOException, ParseException {
        CharArraySet stopSet = new CharArraySet(1, true);
        stopSet.add("bar");
        ESWordnetSynonymParser parser =
            new ESWordnetSynonymParser(true, false, true, new StandardAnalyzer(stopSet));
        String rules =
            "s(100000001,1,'foo',v,1,0).\n" +
            "s(100000001,2,'bar',v,1,0).\n" +
            "s(100000001,3,'baz',v,1,0).";
        StringReader rulesReader = new StringReader(rules);
        parser.parse(rulesReader);
        SynonymMap synonymMap = parser.build();
        Tokenizer tokenizer = new StandardTokenizer();
        tokenizer.setReader(new StringReader("first word is foo, then bar and lastly baz"));
        TokenStream ts = new SynonymFilter(new StopFilter(tokenizer, stopSet), synonymMap, false);
        // "bar" is stopped out of the stream; "baz" is rewritten to "foo" via the synonym map.
        assertTokenStreamContents(ts, new String[]{"first", "word", "is", "foo", "then", "and", "lastly", "foo"});
    }

    // Non-lenient mode (default): an unparsable rule must fail the whole
    // configuration with a ParseException pointing at the offending line.
    public void testNonLenientParser() {
        ESWordnetSynonymParser parser = new ESWordnetSynonymParser(true, false, false, new StandardAnalyzer());
        String rules =
            "s(100000001,1,'&',a,1,0).\n" +
            "s(100000001,2,'and',a,1,0).\n" +
            "s(100000002,1,'come',v,1,0).\n" +
            "s(100000002,2,'advance',v,1,0).\n" +
            "s(100000002,3,'approach',v,1,0).";
        StringReader rulesReader = new StringReader(rules);
        ParseException ex = expectThrows(ParseException.class, () -> parser.parse(rulesReader));
        assertThat(ex.getMessage(), containsString("Invalid synonym rule at line 1"));
    }

}

0 commit comments

Comments
 (0)