From aa9cbfbf688a6dafab1eff8378fee24d35250ccc Mon Sep 17 00:00:00 2001 From: Sohaib Iftikhar Date: Thu, 21 Jun 2018 00:55:55 +0200 Subject: [PATCH 1/6] Added lenient flag for synonym-tokenfilter. Relates to #30968 --- .../tokenfilters/synonym-tokenfilter.asciidoc | 8 ++- .../index/analysis/ElasticSynonymParser.java | 62 +++++++++++++++++++ .../SynonymGraphTokenFilterFactory.java | 5 +- .../analysis/SynonymTokenFilterFactory.java | 7 ++- .../analysis/ElasticSynonymParserTests.java | 61 ++++++++++++++++++ 5 files changed, 135 insertions(+), 8 deletions(-) create mode 100644 server/src/main/java/org/elasticsearch/index/analysis/ElasticSynonymParser.java create mode 100644 server/src/test/java/org/elasticsearch/index/analysis/ElasticSynonymParserTests.java diff --git a/docs/reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc index 68d3f444b2d82..942fbbfd54ff5 100644 --- a/docs/reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc @@ -33,8 +33,12 @@ PUT /test_index The above configures a `synonym` filter, with a path of `analysis/synonym.txt` (relative to the `config` location). The -`synonym` analyzer is then configured with the filter. Additional -settings is: `expand` (defaults to `true`). +`synonym` analyzer is then configured with the filter. + +Additional settings are: + +* `expand` (defaults to `true`). +* `lenient` (defaults to `false`). If `true` ignores exceptions while parsing the synonym configuration. This filter tokenize synonyms with whatever tokenizer and token filters appear before it in the chain. diff --git a/server/src/main/java/org/elasticsearch/index/analysis/ElasticSynonymParser.java b/server/src/main/java/org/elasticsearch/index/analysis/ElasticSynonymParser.java new file mode 100644 index 0000000000000..7168e27262317 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/index/analysis/ElasticSynonymParser.java @@ -0,0 +1,62 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis; + +import org.apache.logging.log4j.Logger; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.synonym.SolrSynonymParser; +import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util.CharsRefBuilder; +import org.elasticsearch.common.logging.Loggers; + +import java.io.IOException; + +public class ElasticSynonymParser extends SolrSynonymParser { + + private boolean lenient; + private final Logger logger; + + public ElasticSynonymParser(boolean dedup, boolean expand, boolean lenient, Analyzer analyzer) { + super(dedup, expand, analyzer); + this.lenient = lenient; + logger = Loggers.getLogger(getClass(), "ElasticSynonymParser"); + } + + @Override + public void add(CharsRef input, CharsRef output, boolean includeOrig) { + if (!lenient || (input.length > 0 && output.length > 0)) { + super.add(input, output, includeOrig); + } // the else would happen only in the case for lenient in which case we quietly ignore it + } + + @Override + public CharsRef analyze(String text, CharsRefBuilder reuse) throws IOException { + try { + return super.analyze(text, reuse); + } catch (IllegalArgumentException ex) { + if (lenient) { + logger.info("Synonym rule for [" + text + "] was ignored"); + return new CharsRef(""); + } else { + throw ex; + } + } + } +} diff --git a/server/src/main/java/org/elasticsearch/index/analysis/SynonymGraphTokenFilterFactory.java b/server/src/main/java/org/elasticsearch/index/analysis/SynonymGraphTokenFilterFactory.java index 2f7964f63d632..196aa7d03ae69 100644 --- a/server/src/main/java/org/elasticsearch/index/analysis/SynonymGraphTokenFilterFactory.java +++ b/server/src/main/java/org/elasticsearch/index/analysis/SynonymGraphTokenFilterFactory.java @@ -21,7 +21,6 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.synonym.SolrSynonymParser; import org.apache.lucene.analysis.synonym.SynonymGraphFilter; import org.apache.lucene.analysis.synonym.SynonymMap; import org.apache.lucene.analysis.synonym.WordnetSynonymParser; @@ -61,8 +60,8 @@ public Factory(String name, final Analyzer analyzerForParseSynonym, Reader rules parser = new WordnetSynonymParser(true, expand, analyzerForParseSynonym); ((WordnetSynonymParser) parser).parse(rulesReader); } else { - parser = new SolrSynonymParser(true, expand, analyzerForParseSynonym); - ((SolrSynonymParser) parser).parse(rulesReader); + parser = new ElasticSynonymParser(true, expand, lenient, analyzerForParseSynonym); + ((ElasticSynonymParser) parser).parse(rulesReader); } synonymMap = parser.build(); } catch (Exception e) { diff --git a/server/src/main/java/org/elasticsearch/index/analysis/SynonymTokenFilterFactory.java b/server/src/main/java/org/elasticsearch/index/analysis/SynonymTokenFilterFactory.java index 56bae57198829..69479b201ba1c 100644 --- a/server/src/main/java/org/elasticsearch/index/analysis/SynonymTokenFilterFactory.java +++ b/server/src/main/java/org/elasticsearch/index/analysis/SynonymTokenFilterFactory.java @@ -21,7 +21,6 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.synonym.SolrSynonymParser; import org.apache.lucene.analysis.synonym.SynonymFilter; import org.apache.lucene.analysis.synonym.SynonymMap; import org.apache.lucene.analysis.synonym.WordnetSynonymParser; @@ -38,6 +37,7 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory { protected final String format; protected final boolean expand; + protected final boolean lenient; protected final Settings settings; public SynonymTokenFilterFactory(IndexSettings indexSettings, Environment env, AnalysisRegistry analysisRegistry, @@ -52,6 +52,7 @@ public SynonymTokenFilterFactory(IndexSettings indexSettings, Environment env, A } this.expand = settings.getAsBoolean("expand", true); + this.lenient = settings.getAsBoolean("lenient", false); this.format = settings.get("format", ""); } @@ -96,8 +97,8 @@ public Factory(String name, Analyzer analyzerForParseSynonym, Reader rulesReader parser = new WordnetSynonymParser(true, expand, analyzerForParseSynonym); ((WordnetSynonymParser) parser).parse(rulesReader); } else { - parser = new SolrSynonymParser(true, expand, analyzerForParseSynonym); - ((SolrSynonymParser) parser).parse(rulesReader); + parser = new ElasticSynonymParser(true, expand, lenient, analyzerForParseSynonym); + ((ElasticSynonymParser) parser).parse(rulesReader); } synonymMap = parser.build(); } catch (Exception e) { diff --git a/server/src/test/java/org/elasticsearch/index/analysis/ElasticSynonymParserTests.java b/server/src/test/java/org/elasticsearch/index/analysis/ElasticSynonymParserTests.java new file mode 100644 index 0000000000000..1017ad1fccb1d --- /dev/null +++ b/server/src/test/java/org/elasticsearch/index/analysis/ElasticSynonymParserTests.java @@ -0,0 +1,61 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.analysis.synonym.SynonymFilter; +import org.apache.lucene.analysis.synonym.SynonymMap; +import org.elasticsearch.test.ESTokenStreamTestCase; + +import java.io.IOException; +import java.io.StringReader; +import java.text.ParseException; + +import static org.hamcrest.Matchers.containsString; + +public class ElasticSynonymParserTests extends ESTokenStreamTestCase { + + public void testLenientParser() throws IOException, ParseException { + ElasticSynonymParser parser = new ElasticSynonymParser(true, false, true, new StandardAnalyzer()); + String rules = + "&,and\n" + + "come,advance,approach\n"; + StringReader rulesReader = new StringReader(rules); + parser.parse(rulesReader); + SynonymMap synonymMap = parser.build(); + Tokenizer tokenizer = new StandardTokenizer(); + tokenizer.setReader(new StringReader("approach quietly then advance & destroy")); + TokenStream ts = new SynonymFilter(tokenizer, synonymMap, false); + assertTokenStreamContents(ts, new String[]{"come", "quietly", "then", "come", "destroy"}); + } + + public void testNonLenientParser() { + ElasticSynonymParser parser = new ElasticSynonymParser(true, false, false, new StandardAnalyzer()); + String rules = + "&,and=>and\n" + + "come,advance,approach\n"; + StringReader rulesReader = new StringReader(rules); + ParseException ex = expectThrows(ParseException.class, () -> parser.parse(rulesReader)); + assertThat(ex.getMessage(), containsString("Invalid synonym rule at line 1")); + } +} From 48be26afe98d4161f350fb6c51d14edc7b9d890e Mon Sep 17 00:00:00 2001 From: Sohaib Iftikhar Date: Fri, 22 Jun 2018 18:03:17 +0200 Subject: [PATCH 2/6] added docs for synonym-graph-tokenfilter -- Also made lenient final -- changed from !lenient to lenient == false --- .../tokenfilters/synonym-graph-tokenfilter.asciidoc | 6 +++++- .../elasticsearch/index/analysis/ElasticSynonymParser.java | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/docs/reference/analysis/tokenfilters/synonym-graph-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/synonym-graph-tokenfilter.asciidoc index a8f2108b57a0c..0980e029e80b2 100644 --- a/docs/reference/analysis/tokenfilters/synonym-graph-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/synonym-graph-tokenfilter.asciidoc @@ -50,7 +50,11 @@ PUT /test_index The above configures a `search_synonyms` filter, with a path of `analysis/synonym.txt` (relative to the `config` location). The `search_synonyms` analyzer is then configured with the filter. -Additional settings are: `expand` (defaults to `true`). + +Additional settings are: + +* `expand` (defaults to `true`). +* `lenient` (defaults to `false`). If `true` ignores exceptions while parsing the synonym configuration. [float] ==== `tokenizer` and `ignore_case` are deprecated diff --git a/server/src/main/java/org/elasticsearch/index/analysis/ElasticSynonymParser.java b/server/src/main/java/org/elasticsearch/index/analysis/ElasticSynonymParser.java index 7168e27262317..ef48b45210918 100644 --- a/server/src/main/java/org/elasticsearch/index/analysis/ElasticSynonymParser.java +++ b/server/src/main/java/org/elasticsearch/index/analysis/ElasticSynonymParser.java @@ -30,7 +30,7 @@ public class ElasticSynonymParser extends SolrSynonymParser { - private boolean lenient; + private final boolean lenient; private final Logger logger; public ElasticSynonymParser(boolean dedup, boolean expand, boolean lenient, Analyzer analyzer) { @@ -41,7 +41,7 @@ public ElasticSynonymParser(boolean dedup, boolean expand, boolean lenient, Anal @Override public void add(CharsRef input, CharsRef output, boolean includeOrig) { - if (!lenient || (input.length > 0 && output.length > 0)) { + if (lenient == false || (input.length > 0 && output.length > 0)) { super.add(input, output, includeOrig); } // the else would happen only in the case for lenient in which case we quietly ignore it } From bdc0ce167cc397f130e6f41466a252aa81173766 Mon Sep 17 00:00:00 2001 From: Sohaib Iftikhar Date: Fri, 22 Jun 2018 23:00:58 +0200 Subject: [PATCH 3/6] Changes after review (1) -- Renamed to ElasticsearchSynonymParser -- Added explanation for ElasticsearchSynonymParser::add method -- Changed ElasticsearchSynonymParser::logger instance to static --- ...rser.java => ElasticsearchSynonymParser.java} | 16 +++++++++++----- .../analysis/SynonymGraphTokenFilterFactory.java | 4 ++-- .../analysis/SynonymTokenFilterFactory.java | 4 ++-- ...java => ElasticsearchSynonymParserTests.java} | 6 +++--- 4 files changed, 18 insertions(+), 12 deletions(-) rename server/src/main/java/org/elasticsearch/index/analysis/{ElasticSynonymParser.java => ElasticsearchSynonymParser.java} (65%) rename server/src/test/java/org/elasticsearch/index/analysis/{ElasticSynonymParserTests.java => ElasticsearchSynonymParserTests.java} (88%) diff --git a/server/src/main/java/org/elasticsearch/index/analysis/ElasticSynonymParser.java b/server/src/main/java/org/elasticsearch/index/analysis/ElasticsearchSynonymParser.java similarity index 65% rename from server/src/main/java/org/elasticsearch/index/analysis/ElasticSynonymParser.java rename to server/src/main/java/org/elasticsearch/index/analysis/ElasticsearchSynonymParser.java index ef48b45210918..6609c114ed15a 100644 --- a/server/src/main/java/org/elasticsearch/index/analysis/ElasticSynonymParser.java +++ b/server/src/main/java/org/elasticsearch/index/analysis/ElasticsearchSynonymParser.java @@ -28,22 +28,28 @@ import java.io.IOException; -public class ElasticSynonymParser extends SolrSynonymParser { +public class ElasticsearchSynonymParser extends SolrSynonymParser { private final boolean lenient; - private final Logger logger; + private static final Logger logger = + Loggers.getLogger(ElasticsearchSynonymParser.class, "ElasticsearchSynonymParser"); - public ElasticSynonymParser(boolean dedup, boolean expand, boolean lenient, Analyzer analyzer) { + public ElasticsearchSynonymParser(boolean dedup, boolean expand, boolean lenient, Analyzer analyzer) { super(dedup, expand, analyzer); this.lenient = lenient; - logger = Loggers.getLogger(getClass(), "ElasticSynonymParser"); } @Override public void add(CharsRef input, CharsRef output, boolean includeOrig) { + // This condition follows up on the overridden analyze method. In case lenient was set to true and there was an + // exception during super.analyze we return a zero-length CharsRef for that word which caused an exception. When + // the synonym mappings for the words are added using the add method we skip the ones that were left empty by + // analyze i.e., in the case when lenient is set we only add those combinations which are non-zero-length. The + // else would happen only in the case when the input or output is empty and lenient is set, in which case we + // quietly ignore it. For more details on the control-flow see SolrSynonymParser::addInternal. if (lenient == false || (input.length > 0 && output.length > 0)) { super.add(input, output, includeOrig); - } // the else would happen only in the case for lenient in which case we quietly ignore it + } } @Override diff --git a/server/src/main/java/org/elasticsearch/index/analysis/SynonymGraphTokenFilterFactory.java b/server/src/main/java/org/elasticsearch/index/analysis/SynonymGraphTokenFilterFactory.java index 196aa7d03ae69..803994e01dbe0 100644 --- a/server/src/main/java/org/elasticsearch/index/analysis/SynonymGraphTokenFilterFactory.java +++ b/server/src/main/java/org/elasticsearch/index/analysis/SynonymGraphTokenFilterFactory.java @@ -60,8 +60,8 @@ public Factory(String name, final Analyzer analyzerForParseSynonym, Reader rules parser = new WordnetSynonymParser(true, expand, analyzerForParseSynonym); ((WordnetSynonymParser) parser).parse(rulesReader); } else { - parser = new ElasticSynonymParser(true, expand, lenient, analyzerForParseSynonym); - ((ElasticSynonymParser) parser).parse(rulesReader); + parser = new ElasticsearchSynonymParser(true, expand, lenient, analyzerForParseSynonym); + ((ElasticsearchSynonymParser) parser).parse(rulesReader); } synonymMap = parser.build(); } catch (Exception e) { diff --git a/server/src/main/java/org/elasticsearch/index/analysis/SynonymTokenFilterFactory.java b/server/src/main/java/org/elasticsearch/index/analysis/SynonymTokenFilterFactory.java index 69479b201ba1c..a0ebbed82e231 100644 --- a/server/src/main/java/org/elasticsearch/index/analysis/SynonymTokenFilterFactory.java +++ b/server/src/main/java/org/elasticsearch/index/analysis/SynonymTokenFilterFactory.java @@ -97,8 +97,8 @@ public Factory(String name, Analyzer analyzerForParseSynonym, Reader rulesReader parser = new WordnetSynonymParser(true, expand, analyzerForParseSynonym); ((WordnetSynonymParser) parser).parse(rulesReader); } else { - parser = new ElasticSynonymParser(true, expand, lenient, analyzerForParseSynonym); - ((ElasticSynonymParser) parser).parse(rulesReader); + parser = new ElasticsearchSynonymParser(true, expand, lenient, analyzerForParseSynonym); + ((ElasticsearchSynonymParser) parser).parse(rulesReader); } synonymMap = parser.build(); } catch (Exception e) { diff --git a/server/src/test/java/org/elasticsearch/index/analysis/ElasticSynonymParserTests.java b/server/src/test/java/org/elasticsearch/index/analysis/ElasticsearchSynonymParserTests.java similarity index 88% rename from server/src/test/java/org/elasticsearch/index/analysis/ElasticSynonymParserTests.java rename to server/src/test/java/org/elasticsearch/index/analysis/ElasticsearchSynonymParserTests.java index 1017ad1fccb1d..9ca4c9ff842d9 100644 --- a/server/src/test/java/org/elasticsearch/index/analysis/ElasticSynonymParserTests.java +++ b/server/src/test/java/org/elasticsearch/index/analysis/ElasticsearchSynonymParserTests.java @@ -33,10 +33,10 @@ import static org.hamcrest.Matchers.containsString; -public class ElasticSynonymParserTests extends ESTokenStreamTestCase { +public class ElasticsearchSynonymParserTests extends ESTokenStreamTestCase { public void testLenientParser() throws IOException, ParseException { - ElasticSynonymParser parser = new ElasticSynonymParser(true, false, true, new StandardAnalyzer()); + ElasticsearchSynonymParser parser = new ElasticsearchSynonymParser(true, false, true, new StandardAnalyzer()); String rules = "&,and\n" + "come,advance,approach\n"; @@ -50,7 +50,7 @@ public void testLenientParser() throws IOException, ParseException { } public void testNonLenientParser() { - ElasticSynonymParser parser = new ElasticSynonymParser(true, false, false, new StandardAnalyzer()); + ElasticsearchSynonymParser parser = new ElasticsearchSynonymParser(true, false, false, new StandardAnalyzer()); String rules = "&,and=>and\n" + "come,advance,approach\n"; From 29d8c2fe0428ec298a43fb817f4acdb831558ebe Mon Sep 17 00:00:00 2001 From: Sohaib Iftikhar Date: Tue, 26 Jun 2018 03:09:01 +0200 Subject: [PATCH 4/6] Added lenient option for WordnetSynonymParser -- also added more documentation --- .../synonym-graph-tokenfilter.asciidoc | 35 +++++++- .../tokenfilters/synonym-tokenfilter.asciidoc | 40 ++++++++- ...ymParser.java => ESSolrSynonymParser.java} | 6 +- .../analysis/ESWordnetSynonymParser.java | 68 ++++++++++++++ .../SynonymGraphTokenFilterFactory.java | 9 +- .../analysis/SynonymTokenFilterFactory.java | 9 +- ...sts.java => ESSolrSynonymParserTests.java} | 23 ++++- .../analysis/ESWordnetSynonymParserTests.java | 88 +++++++++++++++++++ 8 files changed, 258 insertions(+), 20 deletions(-) rename server/src/main/java/org/elasticsearch/index/analysis/{ElasticsearchSynonymParser.java => ESSolrSynonymParser.java} (90%) create mode 100644 server/src/main/java/org/elasticsearch/index/analysis/ESWordnetSynonymParser.java rename server/src/test/java/org/elasticsearch/index/analysis/{ElasticsearchSynonymParserTests.java => ESSolrSynonymParserTests.java} (65%) create mode 100644 server/src/test/java/org/elasticsearch/index/analysis/ESWordnetSynonymParserTests.java diff --git a/docs/reference/analysis/tokenfilters/synonym-graph-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/synonym-graph-tokenfilter.asciidoc index 0980e029e80b2..7846f6a9038cf 100644 --- a/docs/reference/analysis/tokenfilters/synonym-graph-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/synonym-graph-tokenfilter.asciidoc @@ -54,7 +54,40 @@ The above configures a `search_synonyms` filter, with a path of Additional settings are: * `expand` (defaults to `true`). -* `lenient` (defaults to `false`). If `true` ignores exceptions while parsing the synonym configuration. +* `lenient` (defaults to `false`). If `true` ignores exceptions while parsing the synonym configuration. It is important +to note that only those synonym rules which cannot get parsed are ignored. For instance consider the following request: + +[source,js] +-------------------------------------------------- +PUT /test_index +{ + "settings": { + "index" : { + "analysis" : { + "analyzer" : { + "synonym" : { + "tokenizer" : "standard", + "filter" : ["my_stop", "synonym_graph"] + } + }, + "filter" : { + "my_stop": { + "type" : "stop", + "stopwords": ["bar"] + }, + "synonym_graph" : { + "type" : "synonym_graph", + "lenient": true, + "synonyms" : ["foo, bar => baz"] + } + } + } + } + } +} +-------------------------------------------------- +// CONSOLE +With the above request the word `bar` gets skipped but a mapping `foo => baz` is still added. [float] ==== `tokenizer` and `ignore_case` are deprecated diff --git a/docs/reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc index 942fbbfd54ff5..cfa6445492250 100644 --- a/docs/reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc @@ -35,13 +35,47 @@ The above configures a `synonym` filter, with a path of `analysis/synonym.txt` (relative to the `config` location). The `synonym` analyzer is then configured with the filter. +This filter tokenize synonyms with whatever tokenizer and token filters +appear before it in the chain. + Additional settings are: * `expand` (defaults to `true`). -* `lenient` (defaults to `false`). If `true` ignores exceptions while parsing the synonym configuration. +* `lenient` (defaults to `false`). If `true` ignores exceptions while parsing the synonym configuration. It is important +to note that only those synonym rules which cannot get parsed are ignored. For instance consider the following request: + +[source,js] +-------------------------------------------------- +PUT /test_index +{ + "settings": { + "index" : { + "analysis" : { + "analyzer" : { + "synonym" : { + "tokenizer" : "standard", + "filter" : ["my_stop", "synonym"] + } + }, + "filter" : { + "my_stop": { + "type" : "stop", + "stopwords": ["bar"] + }, + "synonym" : { + "type" : "synonym", + "lenient": true, + "synonyms" : ["foo, bar => baz"] + } + } + } + } + } +} +-------------------------------------------------- +// CONSOLE +With the above request the word `bar` gets skipped but a mapping `foo => baz` is still added. -This filter tokenize synonyms with whatever tokenizer and token filters -appear before it in the chain. [float] ==== `tokenizer` and `ignore_case` are deprecated diff --git a/server/src/main/java/org/elasticsearch/index/analysis/ElasticsearchSynonymParser.java b/server/src/main/java/org/elasticsearch/index/analysis/ESSolrSynonymParser.java similarity index 90% rename from server/src/main/java/org/elasticsearch/index/analysis/ElasticsearchSynonymParser.java rename to server/src/main/java/org/elasticsearch/index/analysis/ESSolrSynonymParser.java index 6609c114ed15a..bcc249f8a8a51 100644 --- a/server/src/main/java/org/elasticsearch/index/analysis/ElasticsearchSynonymParser.java +++ b/server/src/main/java/org/elasticsearch/index/analysis/ESSolrSynonymParser.java @@ -28,13 +28,13 @@ import java.io.IOException; -public class ElasticsearchSynonymParser extends SolrSynonymParser { +public class ESSolrSynonymParser extends SolrSynonymParser { private final boolean lenient; private static final Logger logger = - Loggers.getLogger(ElasticsearchSynonymParser.class, "ElasticsearchSynonymParser"); + Loggers.getLogger(ESSolrSynonymParser.class, "ESSolrSynonymParser"); - public ElasticsearchSynonymParser(boolean dedup, boolean expand, boolean lenient, Analyzer analyzer) { + public ESSolrSynonymParser(boolean dedup, boolean expand, boolean lenient, Analyzer analyzer) { super(dedup, expand, analyzer); this.lenient = lenient; } diff --git a/server/src/main/java/org/elasticsearch/index/analysis/ESWordnetSynonymParser.java b/server/src/main/java/org/elasticsearch/index/analysis/ESWordnetSynonymParser.java new file mode 100644 index 0000000000000..3764820c4343d --- /dev/null +++ b/server/src/main/java/org/elasticsearch/index/analysis/ESWordnetSynonymParser.java @@ -0,0 +1,68 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis; + +import org.apache.logging.log4j.Logger; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.synonym.WordnetSynonymParser; +import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util.CharsRefBuilder; +import org.elasticsearch.common.logging.Loggers; + +import java.io.IOException; + +public class ESWordnetSynonymParser extends WordnetSynonymParser { + + private final boolean lenient; + private static final Logger logger = + Loggers.getLogger(ESSolrSynonymParser.class, "ESWordnetSynonymParser"); + + public ESWordnetSynonymParser(boolean dedup, boolean expand, boolean lenient, Analyzer analyzer) { + super(dedup, expand, analyzer); + this.lenient = lenient; + } + + @Override + public void add(CharsRef input, CharsRef output, boolean includeOrig) { + // This condition follows up on the overridden analyze method. In case lenient was set to true and there was an + // exception during super.analyze we return a zero-length CharsRef for that word which caused an exception. When + // the synonym mappings for the words are added using the add method we skip the ones that were left empty by + // analyze i.e., in the case when lenient is set we only add those combinations which are non-zero-length. The + // else would happen only in the case when the input or output is empty and lenient is set, in which case we + // quietly ignore it. For more details on the control-flow see SolrSynonymParser::addInternal. + if (lenient == false || (input.length > 0 && output.length > 0)) { + super.add(input, output, includeOrig); + } + } + + @Override + public CharsRef analyze(String text, CharsRefBuilder reuse) throws IOException { + try { + return super.analyze(text, reuse); + } catch (IllegalArgumentException ex) { + if (lenient) { + logger.info("Synonym rule for [" + text + "] was ignored"); + return new CharsRef(""); + } else { + throw ex; + } + } + } +} diff --git a/server/src/main/java/org/elasticsearch/index/analysis/SynonymGraphTokenFilterFactory.java b/server/src/main/java/org/elasticsearch/index/analysis/SynonymGraphTokenFilterFactory.java index 803994e01dbe0..24dcb6d33fe84 100644 --- a/server/src/main/java/org/elasticsearch/index/analysis/SynonymGraphTokenFilterFactory.java +++ b/server/src/main/java/org/elasticsearch/index/analysis/SynonymGraphTokenFilterFactory.java @@ -23,7 +23,6 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.synonym.SynonymGraphFilter; import org.apache.lucene.analysis.synonym.SynonymMap; -import org.apache.lucene.analysis.synonym.WordnetSynonymParser; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; @@ -57,11 +56,11 @@ public Factory(String name, final Analyzer analyzerForParseSynonym, Reader rules try { SynonymMap.Builder parser; if ("wordnet".equalsIgnoreCase(format)) { - parser = new WordnetSynonymParser(true, expand, analyzerForParseSynonym); - ((WordnetSynonymParser) parser).parse(rulesReader); + parser = new ESWordnetSynonymParser(true, expand, lenient, analyzerForParseSynonym); + ((ESWordnetSynonymParser) parser).parse(rulesReader); } else { - parser = new ElasticsearchSynonymParser(true, expand, lenient, analyzerForParseSynonym); - ((ElasticsearchSynonymParser) parser).parse(rulesReader); + parser = new ESSolrSynonymParser(true, expand, lenient, analyzerForParseSynonym); + ((ESSolrSynonymParser) parser).parse(rulesReader); } synonymMap = parser.build(); } catch (Exception e) { diff --git a/server/src/main/java/org/elasticsearch/index/analysis/SynonymTokenFilterFactory.java b/server/src/main/java/org/elasticsearch/index/analysis/SynonymTokenFilterFactory.java index a0ebbed82e231..61c9aba7a3eaf 100644 --- a/server/src/main/java/org/elasticsearch/index/analysis/SynonymTokenFilterFactory.java +++ b/server/src/main/java/org/elasticsearch/index/analysis/SynonymTokenFilterFactory.java @@ -23,7 +23,6 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.synonym.SynonymFilter; import org.apache.lucene.analysis.synonym.SynonymMap; -import org.apache.lucene.analysis.synonym.WordnetSynonymParser; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; @@ -94,11 +93,11 @@ public Factory(String name, Analyzer analyzerForParseSynonym, Reader rulesReader try { SynonymMap.Builder parser; if ("wordnet".equalsIgnoreCase(format)) { - parser = new WordnetSynonymParser(true, expand, analyzerForParseSynonym); - ((WordnetSynonymParser) parser).parse(rulesReader); + parser = new ESWordnetSynonymParser(true, expand, lenient, analyzerForParseSynonym); + ((ESWordnetSynonymParser) parser).parse(rulesReader); } else { - parser = new ElasticsearchSynonymParser(true, expand, lenient, analyzerForParseSynonym); - ((ElasticsearchSynonymParser) parser).parse(rulesReader); + parser = new ESSolrSynonymParser(true, expand, lenient, analyzerForParseSynonym); + ((ESSolrSynonymParser) parser).parse(rulesReader); } synonymMap = parser.build(); } catch (Exception e) { diff --git a/server/src/test/java/org/elasticsearch/index/analysis/ElasticsearchSynonymParserTests.java b/server/src/test/java/org/elasticsearch/index/analysis/ESSolrSynonymParserTests.java similarity index 65% rename from server/src/test/java/org/elasticsearch/index/analysis/ElasticsearchSynonymParserTests.java rename to server/src/test/java/org/elasticsearch/index/analysis/ESSolrSynonymParserTests.java index 9ca4c9ff842d9..31aa1a9be2512 100644 --- a/server/src/test/java/org/elasticsearch/index/analysis/ElasticsearchSynonymParserTests.java +++ b/server/src/test/java/org/elasticsearch/index/analysis/ESSolrSynonymParserTests.java @@ -19,6 +19,8 @@ package org.elasticsearch.index.analysis; +import org.apache.lucene.analysis.CharArraySet; +import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.standard.StandardAnalyzer; @@ -33,10 +35,10 @@ import static org.hamcrest.Matchers.containsString; -public class ElasticsearchSynonymParserTests extends ESTokenStreamTestCase { +public class ESSolrSynonymParserTests extends ESTokenStreamTestCase { public void testLenientParser() throws IOException, ParseException { - ElasticsearchSynonymParser parser = new ElasticsearchSynonymParser(true, false, true, new StandardAnalyzer()); + ESSolrSynonymParser parser = new ESSolrSynonymParser(true, false, true, new StandardAnalyzer()); String rules = "&,and\n" + "come,advance,approach\n"; @@ -49,8 +51,23 @@ public void testLenientParser() throws IOException, ParseException { assertTokenStreamContents(ts, new String[]{"come", "quietly", "then", "come", "destroy"}); } + public void testLenientParserWithSomeIncorrectLines() throws IOException, ParseException { + CharArraySet stopSet = new CharArraySet(1, true); + stopSet.add("bar"); + ESSolrSynonymParser parser = + new ESSolrSynonymParser(true, false, true, new StandardAnalyzer(stopSet)); + String rules = "foo,bar,baz"; + StringReader rulesReader = new StringReader(rules); + parser.parse(rulesReader); + SynonymMap synonymMap = parser.build(); + Tokenizer tokenizer = new StandardTokenizer(); + tokenizer.setReader(new StringReader("first word is foo, then bar and lastly baz")); + TokenStream ts = new SynonymFilter(new StopFilter(tokenizer, stopSet), synonymMap, false); + assertTokenStreamContents(ts, new String[]{"first", "word", "is", "foo", "then", "and", "lastly", "foo"}); + } + public void testNonLenientParser() { - ElasticsearchSynonymParser parser = new ElasticsearchSynonymParser(true, false, false, new StandardAnalyzer()); + ESSolrSynonymParser parser = new ESSolrSynonymParser(true, false, false, new StandardAnalyzer()); String rules = "&,and=>and\n" + "come,advance,approach\n"; diff --git a/server/src/test/java/org/elasticsearch/index/analysis/ESWordnetSynonymParserTests.java b/server/src/test/java/org/elasticsearch/index/analysis/ESWordnetSynonymParserTests.java new file mode 100644 index 0000000000000..6d0fd8944d4c4 --- /dev/null +++ b/server/src/test/java/org/elasticsearch/index/analysis/ESWordnetSynonymParserTests.java @@ -0,0 +1,88 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.CharArraySet; +import org.apache.lucene.analysis.StopFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.analysis.synonym.SynonymFilter; +import org.apache.lucene.analysis.synonym.SynonymMap; +import org.elasticsearch.test.ESTokenStreamTestCase; + +import java.io.IOException; +import java.io.StringReader; +import java.text.ParseException; + +import static org.hamcrest.Matchers.containsString; + +public class ESWordnetSynonymParserTests extends ESTokenStreamTestCase { + + public void testLenientParser() throws IOException, ParseException { + ESWordnetSynonymParser parser = new ESWordnetSynonymParser(true, false, true, new StandardAnalyzer()); + String rules = + "s(100000001,1,'&',a,1,0).\n" + + "s(100000001,2,'and',a,1,0).\n" + + "s(100000002,1,'come',v,1,0).\n" + + "s(100000002,2,'advance',v,1,0).\n" + + "s(100000002,3,'approach',v,1,0)."; + StringReader rulesReader = new StringReader(rules); + parser.parse(rulesReader); + SynonymMap synonymMap = parser.build(); + Tokenizer tokenizer = new StandardTokenizer(); + tokenizer.setReader(new StringReader("approach quietly then advance & destroy")); + TokenStream ts = new SynonymFilter(tokenizer, synonymMap, false); + assertTokenStreamContents(ts, new String[]{"come", "quietly", "then", "come", "destroy"}); + } + + public void testLenientParserWithSomeIncorrectLines() throws IOException, ParseException { + CharArraySet stopSet = new CharArraySet(1, true); + stopSet.add("bar"); + ESWordnetSynonymParser parser = + new ESWordnetSynonymParser(true, false, true, new StandardAnalyzer(stopSet)); + String rules = + "s(100000001,1,'foo',v,1,0).\n" + + "s(100000001,2,'bar',v,1,0).\n" + + "s(100000001,3,'baz',v,1,0)."; + StringReader rulesReader = new StringReader(rules); + parser.parse(rulesReader); + SynonymMap synonymMap = parser.build(); + Tokenizer tokenizer = new StandardTokenizer(); + tokenizer.setReader(new StringReader("first word is foo, then bar and lastly baz")); + TokenStream ts = new SynonymFilter(new StopFilter(tokenizer, stopSet), synonymMap, false); + assertTokenStreamContents(ts, new String[]{"first", "word", "is", "foo", "then", "and", "lastly", "foo"}); + } + + public void testNonLenientParser() { + ESWordnetSynonymParser parser = new ESWordnetSynonymParser(true, false, false, new StandardAnalyzer()); + String rules = + "s(100000001,1,'&',a,1,0).\n" + + "s(100000001,2,'and',a,1,0).\n" + + "s(100000002,1,'come',v,1,0).\n" + + "s(100000002,2,'advance',v,1,0).\n" + + "s(100000002,3,'approach',v,1,0)."; + StringReader rulesReader = new StringReader(rules); + ParseException ex = expectThrows(ParseException.class, () -> parser.parse(rulesReader)); + assertThat(ex.getMessage(), containsString("Invalid synonym rule at line 1")); + } + +} From 0780c744a0c7ee8e8deda490c344b8b862d1a055 Mon Sep 17 00:00:00 2001 From: Sohaib Iftikhar Date: Tue, 26 Jun 2018 19:47:52 +0200 Subject: [PATCH 5/6] Added additional documentation --- .../analysis/tokenfilters/synonym-graph-tokenfilter.asciidoc | 4 +++- .../analysis/tokenfilters/synonym-tokenfilter.asciidoc | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/reference/analysis/tokenfilters/synonym-graph-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/synonym-graph-tokenfilter.asciidoc index 7846f6a9038cf..e70273109bd1b 100644 --- a/docs/reference/analysis/tokenfilters/synonym-graph-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/synonym-graph-tokenfilter.asciidoc @@ -87,7 +87,9 @@ PUT /test_index } -------------------------------------------------- // CONSOLE -With the above request the word `bar` gets skipped but a mapping `foo => baz` is still added. +With the above request the word `bar` gets skipped but a mapping `foo => baz` is still added. However, if the mapping +being added was "foo, baz => bar" or "bar, foo, baz" nothing would get added to the synonym list. This is because the +target word for the mapping is itself eliminated because it was a stop word. [float] ==== `tokenizer` and `ignore_case` are deprecated diff --git a/docs/reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc index cfa6445492250..e28cdbfe419d1 100644 --- a/docs/reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc @@ -74,7 +74,9 @@ PUT /test_index } -------------------------------------------------- // CONSOLE -With the above request the word `bar` gets skipped but a mapping `foo => baz` is still added. +With the above request the word `bar` gets skipped but a mapping `foo => baz` is still added. However, if the mapping +being added was "foo, baz => bar" or "bar, foo, baz" nothing would get added to the synonym list. This is because the +target word for the mapping is itself eliminated because it was a stop word. [float] From 198ebedf44e45f60850c49995a8709135df10063 Mon Sep 17 00:00:00 2001 From: Sohaib Iftikhar Date: Wed, 4 Jul 2018 01:52:16 +0200 Subject: [PATCH 6/6] Improved documentation --- .../tokenfilters/synonym-graph-tokenfilter.asciidoc | 7 +++++-- .../analysis/tokenfilters/synonym-tokenfilter.asciidoc | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/docs/reference/analysis/tokenfilters/synonym-graph-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/synonym-graph-tokenfilter.asciidoc index e70273109bd1b..8be5647e10f27 100644 --- a/docs/reference/analysis/tokenfilters/synonym-graph-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/synonym-graph-tokenfilter.asciidoc @@ -88,8 +88,11 @@ PUT /test_index -------------------------------------------------- // CONSOLE With the above request the word `bar` gets skipped but a mapping `foo => baz` is still added. However, if the mapping -being added was "foo, baz => bar" or "bar, foo, baz" nothing would get added to the synonym list. This is because the -target word for the mapping is itself eliminated because it was a stop word. +being added was "foo, baz => bar" nothing would get added to the synonym list. This is because the target word for the +mapping is itself eliminated because it was a stop word. Similarly, if the mapping was "bar, foo, baz" and `expand` was +set to `false` no mapping would get added as when `expand=false` the target mapping is the first word. However, if +`expand=true` then the mappings added would be equivalent to `foo, baz => foo, baz` i.e, all mappings other than the +stop word. [float] ==== `tokenizer` and `ignore_case` are deprecated diff --git a/docs/reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc index e28cdbfe419d1..5a6a84493ab60 100644 --- a/docs/reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc @@ -75,8 +75,11 @@ PUT /test_index -------------------------------------------------- // CONSOLE With the above request the word `bar` gets skipped but a mapping `foo => baz` is still added. However, if the mapping -being added was "foo, baz => bar" or "bar, foo, baz" nothing would get added to the synonym list. This is because the -target word for the mapping is itself eliminated because it was a stop word. +being added was "foo, baz => bar" nothing would get added to the synonym list. This is because the target word for the +mapping is itself eliminated because it was a stop word. Similarly, if the mapping was "bar, foo, baz" and `expand` was +set to `false` no mapping would get added as when `expand=false` the target mapping is the first word. However, if +`expand=true` then the mappings added would be equivalent to `foo, baz => foo, baz` i.e, all mappings other than the +stop word. [float]