Skip to content

Commit 569d0c0

Browse files
authored
Improve similarity integration. (elastic#29187)
This improves the way similarities are plugged in in order to: - reject the classic similarity on 7.x indices and emit a deprecation warning otherwise - reject unkwown parameters on 7.x indices and emit a deprecation warning otherwise Even though this breaks the plugin API, I'd like to backport to 7.x so that users can get deprecation warnings when they are doing something that will become unsupported in the future. Closes elastic#23208 Closes elastic#29035
1 parent 8cdd950 commit 569d0c0

26 files changed

+513
-813
lines changed

docs/reference/index-modules/similarity.asciidoc

+2-16
Original file line numberDiff line numberDiff line change
@@ -82,20 +82,6 @@ This similarity has the following options:
8282

8383
Type name: `BM25`
8484

85-
[float]
86-
[[classic-similarity]]
87-
==== Classic similarity
88-
89-
The classic similarity that is based on the TF/IDF model. This
90-
similarity has the following option:
91-
92-
`discount_overlaps`::
93-
Determines whether overlap tokens (Tokens with
94-
0 position increment) are ignored when computing norm. By default this
95-
is true, meaning overlap tokens do not count when computing norms.
96-
97-
Type name: `classic`
98-
9985
[float]
10086
[[dfr]]
10187
==== DFR similarity
@@ -541,7 +527,7 @@ PUT /index
541527
"index": {
542528
"similarity": {
543529
"default": {
544-
"type": "classic"
530+
"type": "boolean"
545531
}
546532
}
547533
}
@@ -563,7 +549,7 @@ PUT /index/_settings
563549
"index": {
564550
"similarity": {
565551
"default": {
566-
"type": "classic"
552+
"type": "boolean"
567553
}
568554
}
569555
}

docs/reference/mapping/params/similarity.asciidoc

+2-7
Original file line numberDiff line numberDiff line change
@@ -44,13 +44,9 @@ PUT my_index
4444
"default_field": { <1>
4545
"type": "text"
4646
},
47-
"classic_field": {
48-
"type": "text",
49-
"similarity": "classic" <2>
50-
},
5147
"boolean_sim_field": {
5248
"type": "text",
53-
"similarity": "boolean" <3>
49+
"similarity": "boolean" <2>
5450
}
5551
}
5652
}
@@ -59,5 +55,4 @@ PUT my_index
5955
--------------------------------------------------
6056
// CONSOLE
6157
<1> The `default_field` uses the `BM25` similarity.
62-
<2> The `classic_field` uses the `classic` similarity (ie TF/IDF).
63-
<3> The `boolean_sim_field` uses the `boolean` similarity.
58+
<2> The `boolean_sim_field` uses the `boolean` similarity.

docs/reference/migration/migrate_7_0/mappings.asciidoc

+13
Original file line numberDiff line numberDiff line change
@@ -24,3 +24,16 @@ the index setting `index.mapping.nested_objects.limit`.
2424
==== The `update_all_types` option has been removed
2525

2626
This option is useless now that all indices have at most one type.
27+
28+
=== The `classic` similarity has been removed
29+
30+
The `classic` similarity relied on coordination factors for scoring to be good
31+
in presence of stopwords in the query. This feature has been removed from
32+
Lucene, which means that the `classic` similarity now produces scores of lower
33+
quality. It is advised to switch to `BM25` instead, which is widely accepted
34+
as a better alternative.
35+
36+
=== Similarities fail when unsupported options are provided
37+
38+
An error will now be thrown when unknown configuration options are provided
39+
to similarities. Such unknown parameters were ignored before.

modules/parent-join/src/test/java/org/elasticsearch/join/query/HasChildQueryBuilderTests.java

+1-3
Original file line numberDiff line numberDiff line change
@@ -336,9 +336,7 @@ public void testNonDefaultSimilarity() throws Exception {
336336
hasChildQuery(CHILD_DOC, new TermQueryBuilder("custom_string", "value"), ScoreMode.None);
337337
HasChildQueryBuilder.LateParsingQuery query = (HasChildQueryBuilder.LateParsingQuery) hasChildQueryBuilder.toQuery(shardContext);
338338
Similarity expected = SimilarityService.BUILT_IN.get(similarity)
339-
.create(similarity, Settings.EMPTY,
340-
Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build(), null)
341-
.get();
339+
.apply(Settings.EMPTY, Version.CURRENT, null);
342340
assertThat(((PerFieldSimilarityWrapper) query.getSimilarity()).get("custom_string"), instanceOf(expected.getClass()));
343341
}
344342

modules/parent-join/src/test/java/org/elasticsearch/join/query/LegacyHasChildQueryBuilderTests.java

+2-4
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ protected Collection<Class<? extends Plugin>> getPlugins() {
8787

8888
@Override
8989
protected void initializeAdditionalMappings(MapperService mapperService) throws IOException {
90-
similarity = randomFrom("classic", "BM25");
90+
similarity = randomFrom("boolean", "BM25");
9191
// TODO: use a single type when inner hits have been changed to work with join field,
9292
// this test randomly generates queries with inner hits
9393
mapperService.merge(PARENT_TYPE, new CompressedXContent(Strings.toString(PutMappingRequest.buildFromSimplifiedDef(PARENT_TYPE,
@@ -323,9 +323,7 @@ public void testNonDefaultSimilarity() throws Exception {
323323
hasChildQuery(CHILD_TYPE, new TermQueryBuilder("custom_string", "value"), ScoreMode.None);
324324
HasChildQueryBuilder.LateParsingQuery query = (HasChildQueryBuilder.LateParsingQuery) hasChildQueryBuilder.toQuery(shardContext);
325325
Similarity expected = SimilarityService.BUILT_IN.get(similarity)
326-
.create(similarity, Settings.EMPTY,
327-
Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build(), null)
328-
.get();
326+
.apply(Settings.EMPTY, Version.CURRENT, null);
329327
assertThat(((PerFieldSimilarityWrapper) query.getSimilarity()).get("custom_string"), instanceOf(expected.getClass()));
330328
}
331329

server/src/main/java/org/elasticsearch/cluster/metadata/MetaDataIndexUpgradeService.java

+7-4
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,9 @@
2020

2121
import org.apache.logging.log4j.message.ParameterizedMessage;
2222
import org.apache.lucene.analysis.Analyzer;
23+
import org.apache.lucene.search.similarities.Similarity;
2324
import org.elasticsearch.Version;
25+
import org.elasticsearch.common.TriFunction;
2426
import org.elasticsearch.common.component.AbstractComponent;
2527
import org.elasticsearch.common.settings.IndexScopedSettings;
2628
import org.elasticsearch.common.settings.Settings;
@@ -31,8 +33,8 @@
3133
import org.elasticsearch.index.analysis.NamedAnalyzer;
3234
import org.elasticsearch.index.mapper.MapperService;
3335
import org.elasticsearch.index.similarity.SimilarityService;
34-
import org.elasticsearch.index.similarity.SimilarityProvider;
3536
import org.elasticsearch.indices.mapper.MapperRegistry;
37+
import org.elasticsearch.script.ScriptService;
3638

3739
import java.util.AbstractMap;
3840
import java.util.Collection;
@@ -142,22 +144,23 @@ private void checkMappingsCompatibility(IndexMetaData indexMetaData) {
142144

143145
IndexSettings indexSettings = new IndexSettings(indexMetaData, this.settings);
144146

145-
final Map<String, SimilarityProvider.Factory> similarityMap = new AbstractMap<String, SimilarityProvider.Factory>() {
147+
final Map<String, TriFunction<Settings, Version, ScriptService, Similarity>> similarityMap
148+
= new AbstractMap<String, TriFunction<Settings, Version, ScriptService, Similarity>>() {
146149
@Override
147150
public boolean containsKey(Object key) {
148151
return true;
149152
}
150153

151154
@Override
152-
public SimilarityProvider.Factory get(Object key) {
155+
public TriFunction<Settings, Version, ScriptService, Similarity> get(Object key) {
153156
assert key instanceof String : "key must be a string but was: " + key.getClass();
154157
return SimilarityService.BUILT_IN.get(SimilarityService.DEFAULT_SIMILARITY);
155158
}
156159

157160
// this entrySet impl isn't fully correct but necessary as SimilarityService will iterate
158161
// over all similarities
159162
@Override
160-
public Set<Entry<String, SimilarityProvider.Factory>> entrySet() {
163+
public Set<Entry<String, TriFunction<Settings, Version, ScriptService, Similarity>>> entrySet() {
161164
return Collections.emptySet();
162165
}
163166
};

server/src/main/java/org/elasticsearch/index/IndexModule.java

+15-9
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,13 @@
1919

2020
package org.elasticsearch.index;
2121

22+
import org.apache.lucene.search.similarities.Similarity;
23+
import org.apache.lucene.search.similarities.BM25Similarity;
2224
import org.apache.lucene.util.SetOnce;
25+
import org.elasticsearch.Version;
2326
import org.elasticsearch.client.Client;
2427
import org.elasticsearch.common.Strings;
28+
import org.elasticsearch.common.TriFunction;
2529
import org.elasticsearch.common.io.stream.NamedWriteableRegistry;
2630
import org.elasticsearch.common.settings.Setting;
2731
import org.elasticsearch.common.settings.Setting.Property;
@@ -39,9 +43,6 @@
3943
import org.elasticsearch.index.shard.IndexSearcherWrapper;
4044
import org.elasticsearch.index.shard.IndexingOperationListener;
4145
import org.elasticsearch.index.shard.SearchOperationListener;
42-
import org.elasticsearch.index.shard.ShardId;
43-
import org.elasticsearch.index.similarity.BM25SimilarityProvider;
44-
import org.elasticsearch.index.similarity.SimilarityProvider;
4546
import org.elasticsearch.index.similarity.SimilarityService;
4647
import org.elasticsearch.index.store.IndexStore;
4748
import org.elasticsearch.indices.IndicesQueryCache;
@@ -68,10 +69,10 @@
6869
/**
6970
* IndexModule represents the central extension point for index level custom implementations like:
7071
* <ul>
71-
* <li>{@link SimilarityProvider} - New {@link SimilarityProvider} implementations can be registered through
72-
* {@link #addSimilarity(String, SimilarityProvider.Factory)} while existing Providers can be referenced through Settings under the
72+
* <li>{@link Similarity} - New {@link Similarity} implementations can be registered through
73+
* {@link #addSimilarity(String, TriFunction)} while existing Providers can be referenced through Settings under the
7374
* {@link IndexModule#SIMILARITY_SETTINGS_PREFIX} prefix along with the "type" value. For example, to reference the
74-
* {@link BM25SimilarityProvider}, the configuration <tt>"index.similarity.my_similarity.type : "BM25"</tt> can be used.</li>
75+
* {@link BM25Similarity}, the configuration <tt>"index.similarity.my_similarity.type : "BM25"</tt> can be used.</li>
7576
* <li>{@link IndexStore} - Custom {@link IndexStore} instances can be registered via {@link #addIndexStore(String, Function)}</li>
7677
* <li>{@link IndexEventListener} - Custom {@link IndexEventListener} instances can be registered via
7778
* {@link #addIndexEventListener(IndexEventListener)}</li>
@@ -107,7 +108,7 @@ public final class IndexModule {
107108
final SetOnce<EngineFactory> engineFactory = new SetOnce<>();
108109
private SetOnce<IndexSearcherWrapperFactory> indexSearcherWrapper = new SetOnce<>();
109110
private final Set<IndexEventListener> indexEventListeners = new HashSet<>();
110-
private final Map<String, SimilarityProvider.Factory> similarities = new HashMap<>();
111+
private final Map<String, TriFunction<Settings, Version, ScriptService, Similarity>> similarities = new HashMap<>();
111112
private final Map<String, Function<IndexSettings, IndexStore>> storeTypes = new HashMap<>();
112113
private final SetOnce<BiFunction<IndexSettings, IndicesQueryCache, QueryCache>> forceQueryCacheProvider = new SetOnce<>();
113114
private final List<SearchOperationListener> searchOperationListeners = new ArrayList<>();
@@ -246,12 +247,17 @@ public void addIndexStore(String type, Function<IndexSettings, IndexStore> provi
246247

247248

248249
/**
249-
* Registers the given {@link SimilarityProvider} with the given name
250+
* Registers the given {@link Similarity} with the given name.
251+
* The function takes as parameters:<ul>
252+
* <li>settings for this similarity
253+
* <li>version of Elasticsearch when the index was created
254+
* <li>ScriptService, for script-based similarities
255+
* </ul>
250256
*
251257
* @param name Name of the SimilarityProvider
252258
* @param similarity SimilarityProvider to register
253259
*/
254-
public void addSimilarity(String name, SimilarityProvider.Factory similarity) {
260+
public void addSimilarity(String name, TriFunction<Settings, Version, ScriptService, Similarity> similarity) {
255261
ensureNotFrozen();
256262
if (similarities.containsKey(name) || SimilarityService.BUILT_IN.containsKey(name)) {
257263
throw new IllegalArgumentException("similarity for name: [" + name + " is already registered");

server/src/main/java/org/elasticsearch/index/similarity/AbstractSimilarityProvider.java

-82
This file was deleted.

server/src/main/java/org/elasticsearch/index/similarity/BM25SimilarityProvider.java

-59
This file was deleted.

0 commit comments

Comments
 (0)