19
19
20
20
package org .elasticsearch .index .similarity ;
21
21
22
+ import org .apache .logging .log4j .LogManager ;
23
+ import org .apache .lucene .index .BinaryDocValues ;
24
+ import org .apache .lucene .index .FieldInfos ;
25
+ import org .apache .lucene .index .FieldInvertState ;
26
+ import org .apache .lucene .index .Fields ;
27
+ import org .apache .lucene .index .LeafMetaData ;
28
+ import org .apache .lucene .index .LeafReader ;
29
+ import org .apache .lucene .index .NumericDocValues ;
30
+ import org .apache .lucene .index .PointValues ;
31
+ import org .apache .lucene .index .SortedDocValues ;
32
+ import org .apache .lucene .index .SortedNumericDocValues ;
33
+ import org .apache .lucene .index .SortedSetDocValues ;
34
+ import org .apache .lucene .index .StoredFieldVisitor ;
35
+ import org .apache .lucene .index .Terms ;
36
+ import org .apache .lucene .search .CollectionStatistics ;
37
+ import org .apache .lucene .search .Explanation ;
38
+ import org .apache .lucene .search .TermStatistics ;
22
39
import org .apache .lucene .search .similarities .BM25Similarity ;
23
40
import org .apache .lucene .search .similarities .BooleanSimilarity ;
24
41
import org .apache .lucene .search .similarities .ClassicSimilarity ;
25
42
import org .apache .lucene .search .similarities .PerFieldSimilarityWrapper ;
26
43
import org .apache .lucene .search .similarities .Similarity ;
44
+ import org .apache .lucene .search .similarities .Similarity .SimScorer ;
45
+ import org .apache .lucene .search .similarities .Similarity .SimWeight ;
46
+ import org .apache .lucene .util .Bits ;
47
+ import org .apache .lucene .util .BytesRef ;
27
48
import org .elasticsearch .Version ;
28
49
import org .elasticsearch .common .TriFunction ;
29
50
import org .elasticsearch .common .logging .DeprecationLogger ;
30
- import org .elasticsearch .common .logging .Loggers ;
31
51
import org .elasticsearch .common .settings .Settings ;
32
52
import org .elasticsearch .index .AbstractIndexComponent ;
33
53
import org .elasticsearch .index .IndexModule ;
36
56
import org .elasticsearch .index .mapper .MapperService ;
37
57
import org .elasticsearch .script .ScriptService ;
38
58
59
+ import java .io .IOException ;
60
+ import java .io .UncheckedIOException ;
39
61
import java .util .Collections ;
40
62
import java .util .HashMap ;
41
63
import java .util .Map ;
44
66
45
67
public final class SimilarityService extends AbstractIndexComponent {
46
68
47
- private static final DeprecationLogger DEPRECATION_LOGGER = new DeprecationLogger (Loggers .getLogger (SimilarityService .class ));
69
+ private static final DeprecationLogger DEPRECATION_LOGGER = new DeprecationLogger (LogManager .getLogger (SimilarityService .class ));
48
70
public static final String DEFAULT_SIMILARITY = "BM25" ;
49
71
private static final String CLASSIC_SIMILARITY = "classic" ;
50
72
private static final Map <String , Function <Version , Supplier <Similarity >>> DEFAULTS ;
@@ -120,7 +142,8 @@ public SimilarityService(IndexSettings indexSettings, ScriptService scriptServic
120
142
}
121
143
TriFunction <Settings , Version , ScriptService , Similarity > defaultFactory = BUILT_IN .get (typeName );
122
144
TriFunction <Settings , Version , ScriptService , Similarity > factory = similarities .getOrDefault (typeName , defaultFactory );
123
- final Similarity similarity = factory .apply (providerSettings , indexSettings .getIndexVersionCreated (), scriptService );
145
+ Similarity similarity = factory .apply (providerSettings , indexSettings .getIndexVersionCreated (), scriptService );
146
+ validateSimilarity (indexSettings .getIndexVersionCreated (), similarity );
124
147
providers .put (name , () -> similarity );
125
148
}
126
149
for (Map .Entry <String , Function <Version , Supplier <Similarity >>> entry : DEFAULTS .entrySet ()) {
@@ -140,7 +163,7 @@ public Similarity similarity(MapperService mapperService) {
140
163
defaultSimilarity ;
141
164
}
142
165
143
-
166
+
144
167
public SimilarityProvider getSimilarity (String name ) {
145
168
Supplier <Similarity > sim = similarities .get (name );
146
169
if (sim == null ) {
@@ -171,4 +194,231 @@ public Similarity get(String name) {
171
194
return (fieldType != null && fieldType .similarity () != null ) ? fieldType .similarity ().get () : defaultSimilarity ;
172
195
}
173
196
}
197
+
198
+ static void validateSimilarity (Version indexCreatedVersion , Similarity similarity ) {
199
+ try {
200
+ validateScoresArePositive (indexCreatedVersion , similarity );
201
+ validateScoresDoNotDecreaseWithFreq (indexCreatedVersion , similarity );
202
+ validateScoresDoNotIncreaseWithNorm (indexCreatedVersion , similarity );
203
+ } catch (IOException e ) {
204
+ throw new UncheckedIOException (e );
205
+ }
206
+ }
207
+
208
+ private static class SingleNormLeafReader extends LeafReader {
209
+
210
+ private final long norm ;
211
+
212
+ SingleNormLeafReader (long norm ) {
213
+ this .norm = norm ;
214
+ }
215
+
216
+ @ Override
217
+ public CacheHelper getCoreCacheHelper () {
218
+ return null ;
219
+ }
220
+
221
+ @ Override
222
+ public Terms terms (String field ) throws IOException {
223
+ throw new UnsupportedOperationException ();
224
+ }
225
+
226
+ @ Override
227
+ public NumericDocValues getNumericDocValues (String field ) throws IOException {
228
+ throw new UnsupportedOperationException ();
229
+ }
230
+
231
+ @ Override
232
+ public BinaryDocValues getBinaryDocValues (String field ) throws IOException {
233
+ throw new UnsupportedOperationException ();
234
+ }
235
+
236
+ @ Override
237
+ public SortedDocValues getSortedDocValues (String field ) throws IOException {
238
+ throw new UnsupportedOperationException ();
239
+ }
240
+
241
+ @ Override
242
+ public SortedNumericDocValues getSortedNumericDocValues (String field ) throws IOException {
243
+ throw new UnsupportedOperationException ();
244
+ }
245
+
246
+ @ Override
247
+ public SortedSetDocValues getSortedSetDocValues (String field ) throws IOException {
248
+ throw new UnsupportedOperationException ();
249
+ }
250
+
251
+ @ Override
252
+ public NumericDocValues getNormValues (String field ) throws IOException {
253
+ return new NumericDocValues () {
254
+
255
+ int doc = -1 ;
256
+
257
+ @ Override
258
+ public long longValue () throws IOException {
259
+ return norm ;
260
+ }
261
+
262
+ @ Override
263
+ public boolean advanceExact (int target ) throws IOException {
264
+ doc = target ;
265
+ return true ;
266
+ }
267
+
268
+ @ Override
269
+ public int docID () {
270
+ return doc ;
271
+ }
272
+
273
+ @ Override
274
+ public int nextDoc () throws IOException {
275
+ return advance (doc + 1 );
276
+ }
277
+
278
+ @ Override
279
+ public int advance (int target ) throws IOException {
280
+ if (target == 0 ) {
281
+ return doc = 0 ;
282
+ } else {
283
+ return doc = NO_MORE_DOCS ;
284
+ }
285
+ }
286
+
287
+ @ Override
288
+ public long cost () {
289
+ return 1 ;
290
+ }
291
+
292
+ };
293
+ }
294
+
295
+ @ Override
296
+ public FieldInfos getFieldInfos () {
297
+ throw new UnsupportedOperationException ();
298
+ }
299
+
300
+ @ Override
301
+ public Bits getLiveDocs () {
302
+ return null ;
303
+ }
304
+
305
+ @ Override
306
+ public PointValues getPointValues (String field ) throws IOException {
307
+ throw new UnsupportedOperationException ();
308
+ }
309
+
310
+ @ Override
311
+ public void checkIntegrity () throws IOException {}
312
+
313
+ @ Override
314
+ public LeafMetaData getMetaData () {
315
+ return new LeafMetaData (
316
+ org .apache .lucene .util .Version .LATEST .major ,
317
+ org .apache .lucene .util .Version .LATEST ,
318
+ null );
319
+ }
320
+
321
+ @ Override
322
+ public Fields getTermVectors (int docID ) throws IOException {
323
+ throw new UnsupportedOperationException ();
324
+ }
325
+
326
+ @ Override
327
+ public int numDocs () {
328
+ return 1 ;
329
+ }
330
+
331
+ @ Override
332
+ public int maxDoc () {
333
+ return 1 ;
334
+ }
335
+
336
+ @ Override
337
+ public void document (int docID , StoredFieldVisitor visitor ) throws IOException {
338
+ throw new UnsupportedOperationException ();
339
+ }
340
+
341
+ @ Override
342
+ protected void doClose () throws IOException {
343
+ }
344
+
345
+ @ Override
346
+ public CacheHelper getReaderCacheHelper () {
347
+ throw new UnsupportedOperationException ();
348
+ }
349
+
350
+ }
351
+
352
+ private static void validateScoresArePositive (Version indexCreatedVersion , Similarity similarity ) throws IOException {
353
+ CollectionStatistics collectionStats = new CollectionStatistics ("some_field" , 1200 , 1100 , 3000 , 2000 );
354
+ TermStatistics termStats = new TermStatistics (new BytesRef ("some_value" ), 100 , 130 );
355
+ SimWeight simWeight = similarity .computeWeight (2f , collectionStats , termStats );
356
+ FieldInvertState state = new FieldInvertState (indexCreatedVersion .luceneVersion .major ,
357
+ "some_field" , 20 , 20 , 0 , 50 ); // length = 20, no overlap
358
+ final long norm = similarity .computeNorm (state );
359
+ LeafReader reader = new SingleNormLeafReader (norm );
360
+ SimScorer scorer = similarity .simScorer (simWeight , reader .getContext ());
361
+ for (int freq = 1 ; freq <= 10 ; ++freq ) {
362
+ float score = scorer .score (0 , freq );
363
+ if (score < 0 ) {
364
+ DEPRECATION_LOGGER .deprecated ("Similarities should not return negative scores:\n " +
365
+ scorer .explain (0 , Explanation .match (freq , "term freq" )));
366
+ break ;
367
+ }
368
+ }
369
+ }
370
+
371
+ private static void validateScoresDoNotDecreaseWithFreq (Version indexCreatedVersion , Similarity similarity ) throws IOException {
372
+ CollectionStatistics collectionStats = new CollectionStatistics ("some_field" , 1200 , 1100 , 3000 , 2000 );
373
+ TermStatistics termStats = new TermStatistics (new BytesRef ("some_value" ), 100 , 130 );
374
+ SimWeight simWeight = similarity .computeWeight (2f , collectionStats , termStats );
375
+ FieldInvertState state = new FieldInvertState (indexCreatedVersion .luceneVersion .major ,
376
+ "some_field" , 20 , 20 , 0 , 50 ); // length = 20, no overlap
377
+ final long norm = similarity .computeNorm (state );
378
+ LeafReader reader = new SingleNormLeafReader (norm );
379
+ SimScorer scorer = similarity .simScorer (simWeight , reader .getContext ());
380
+ float previousScore = Float .NEGATIVE_INFINITY ;
381
+ for (int freq = 1 ; freq <= 10 ; ++freq ) {
382
+ float score = scorer .score (0 , freq );
383
+ if (score < previousScore ) {
384
+ DEPRECATION_LOGGER .deprecated ("Similarity scores should not decrease when term frequency increases:\n " +
385
+ scorer .explain (0 , Explanation .match (freq - 1 , "term freq" )) + "\n " +
386
+ scorer .explain (0 , Explanation .match (freq , "term freq" )));
387
+ break ;
388
+ }
389
+ previousScore = score ;
390
+ }
391
+ }
392
+
393
+ private static void validateScoresDoNotIncreaseWithNorm (Version indexCreatedVersion , Similarity similarity ) throws IOException {
394
+ CollectionStatistics collectionStats = new CollectionStatistics ("some_field" , 1200 , 1100 , 3000 , 2000 );
395
+ TermStatistics termStats = new TermStatistics (new BytesRef ("some_value" ), 100 , 130 );
396
+ SimWeight simWeight = similarity .computeWeight (2f , collectionStats , termStats );
397
+
398
+ SimScorer previousScorer = null ;
399
+ long previousNorm = 0 ;
400
+ float previousScore = Float .POSITIVE_INFINITY ;
401
+ for (int length = 1 ; length <= 10 ; ++length ) {
402
+ FieldInvertState state = new FieldInvertState (indexCreatedVersion .luceneVersion .major ,
403
+ "some_field" , length , length , 0 , 50 ); // length = 20, no overlap
404
+ final long norm = similarity .computeNorm (state );
405
+ if (Long .compareUnsigned (previousNorm , norm ) > 0 ) {
406
+ // esoteric similarity, skip this check
407
+ break ;
408
+ }
409
+ LeafReader reader = new SingleNormLeafReader (norm );
410
+ SimScorer scorer = similarity .simScorer (simWeight , reader .getContext ());
411
+ float score = scorer .score (0 , 1 );
412
+ if (score > previousScore ) {
413
+ DEPRECATION_LOGGER .deprecated ("Similarity scores should not increase when norm increases:\n " +
414
+ previousScorer .explain (0 , Explanation .match (1 , "term freq" )) + "\n " +
415
+ scorer .explain (0 , Explanation .match (1 , "term freq" )));
416
+ break ;
417
+ }
418
+ previousScorer = scorer ;
419
+ previousScore = score ;
420
+ previousNorm = norm ;
421
+ }
422
+ }
423
+
174
424
}
0 commit comments