Centralize Lucene files extensions in one place #71416


Merged (10 commits, Apr 12, 2021)
@@ -16,6 +16,7 @@
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.xcontent.ToXContentFragment;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.index.store.LuceneFilesExtensions;

import java.io.IOException;

@@ -35,40 +36,6 @@ public class SegmentsStats implements Writeable, ToXContentFragment {
private long bitsetMemoryInBytes;
private ImmutableOpenMap<String, Long> fileSizes = ImmutableOpenMap.of();

/*
* A map to provide a best-effort approach describing Lucene index files.
*
* Ideally this should be in sync to what the current version of Lucene is using, but it's harmless to leave extensions out,
* they'll just miss a proper description in the stats
*/
static final ImmutableOpenMap<String, String> FILE_DESCRIPTIONS = ImmutableOpenMap.<String, String>builder()
.fPut("si", "Segment Info")
.fPut("fnm", "Fields")
.fPut("fdm", "Field Metadata")
.fPut("fdx", "Field Index")
.fPut("fdt", "Field Data")
.fPut("tmd", "Term Dictionary Metadata")
.fPut("tim", "Term Dictionary")
.fPut("tip", "Term Index")
.fPut("doc", "Frequencies")
.fPut("pos", "Positions")
.fPut("pay", "Payloads")
.fPut("nvd", "Norms")
.fPut("nvm", "Norms metadata")
.fPut("kdm", "Points Metadata")
.fPut("kdi", "Points Index")
.fPut("kdm", "Points Metadata")
.fPut("kdi", "Points Index") // old extension
.fPut("kdd", "Points") // old extension
.fPut("dvd", "DocValues")
.fPut("dvm", "DocValues Metadata")
.fPut("tvm", "Term Vector Metadata")
.fPut("tvx", "Term Vector Index")
.fPut("tvd", "Term Vector Documents")
.fPut("tvf", "Term Vector Fields")
.fPut("liv", "Live Documents")
.build();

public SegmentsStats() {}

public SegmentsStats(StreamInput in) throws IOException {
@@ -321,7 +288,8 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
for (ObjectObjectCursor<String, Long> entry : fileSizes) {
builder.startObject(entry.key);
builder.humanReadableField(Fields.SIZE_IN_BYTES, Fields.SIZE, new ByteSizeValue(entry.value));
builder.field(Fields.DESCRIPTION, FILE_DESCRIPTIONS.getOrDefault(entry.key, "Others"));
LuceneFilesExtensions extension = LuceneFilesExtensions.fromExtension(entry.key);
builder.field(Fields.DESCRIPTION, extension != null ? extension.getDescription() : "Others");
builder.endObject();
}
builder.endObject();
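As an aside, here is a minimal sketch of the lookup pattern this hunk switches to. It assumes the new LuceneFilesExtensions enum (added later in this PR) is on the classpath; the class and method names below are illustrative only, not part of the change.

import org.elasticsearch.index.store.LuceneFilesExtensions;

public class SegmentFileDescriptions {

    // Mirrors the new toXContent logic: known extensions get their description, and
    // anything the enum does not track falls back to "Others", as the old
    // FILE_DESCRIPTIONS.getOrDefault(...) call did. Note that fromExtension() also
    // asserts on unknown extensions, so with -ea enabled a missing extension fails
    // loudly in tests instead of silently being reported as "Others".
    static String describe(String fileExtension) {
        final LuceneFilesExtensions extension = LuceneFilesExtensions.fromExtension(fileExtension);
        return extension != null ? extension.getDescription() : "Others";
    }

    public static void main(String[] args) {
        System.out.println(describe("dvd")); // DocValues
        System.out.println(describe("tmd")); // Term Dictionary Metadata
    }
}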
@@ -58,7 +58,7 @@ public Directory newDirectory(IndexSettings indexSettings, ShardPath path) throw

protected Directory newFSDirectory(Path location, LockFactory lockFactory, IndexSettings indexSettings) throws IOException {
final String storeType =
indexSettings.getSettings().get(IndexModule.INDEX_STORE_TYPE_SETTING.getKey(), IndexModule.Type.FS.getSettingsKey());
indexSettings.getSettings().get(IndexModule.INDEX_STORE_TYPE_SETTING.getKey(), IndexModule.Type.FS.getSettingsKey());
IndexModule.Type type;
if (IndexModule.Type.FS.match(storeType)) {
type = IndexModule.defaultStoreType(IndexModule.NODE_STORE_ALLOW_MMAP.get(indexSettings.getNodeSettings()));
@@ -89,7 +89,7 @@ protected Directory newFSDirectory(Path location, LockFactory lockFactory, Index
}

public static MMapDirectory setPreload(MMapDirectory mMapDirectory, LockFactory lockFactory,
Set<String> preLoadExtensions) throws IOException {
Set<String> preLoadExtensions) throws IOException {
assert mMapDirectory.getPreload() == false;
if (preLoadExtensions.isEmpty() == false) {
if (preLoadExtensions.contains("*")) {
@@ -145,35 +145,14 @@ boolean useDelegate(String name, IOContext ioContext) {
return false;
}

String extension = FileSwitchDirectory.getExtension(name);
switch(extension) {
// Norms, doc values and term dictionaries are typically performance-sensitive and hot in the page
// cache, so we use mmap, which provides better performance.
case "nvd":
case "dvd":
case "tim":
// We want to open the terms index and KD-tree index off-heap to save memory, but this only performs
// well if using mmap.
case "tip":
// dim files only apply up to lucene 8.x indices. It can be removed once we are in lucene 10
case "dim":
case "kdd":
case "kdi":
// Compound files are tricky because they store all the information for the segment. Benchmarks
// suggested that not mapping them hurts performance.
case "cfs":
// MMapDirectory has special logic to read long[] arrays in little-endian order that helps speed
// up the decoding of postings. The same logic applies to positions (.pos) of offsets (.pay) but we
// are not mmaping them as queries that leverage positions are more costly and the decoding of postings
// tends to be less a bottleneck.
case "doc":
return true;
default:
return false;
}
final LuceneFilesExtensions extension = LuceneFilesExtensions.fromExtension(FileSwitchDirectory.getExtension(name));
if (extension == null || extension.shouldMmap() == false) {
// Other files are either less performance-sensitive (e.g. stored field index, norms metadata)
// or are large and have a random access pattern and mmap leads to page cache trashing
// (e.g. stored fields and term vectors).
return false;
}
return true;
}

MMapDirectory getDelegate() {
@@ -0,0 +1,136 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/

package org.elasticsearch.index.store;

import org.elasticsearch.common.Nullable;

import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;

public enum LuceneFilesExtensions {

CFE("cfe", "Compound Files Entries", true, false),
// Compound files are tricky because they store all the information for the segment. Benchmarks
// suggested that not mapping them hurts performance.
CFS("cfs", "Compound Files", false, true),
CMP("cmp", "Completion Index", true, false),
Member Author: Introduced in this PR, this file will be treated as a metadata one by searchable snapshots.

Contributor: Does it mean that this file will be downloaded eagerly even if the query doesn't need it? That doesn't sound like the right trade-off to me.

Member Author: No, it means that this file will be cached a bit differently to speed up searchable snapshot shard recoveries. For example, if this file is smaller than or equal to 64KB it will be fully cached as a doc in the .snapshot-blob-cache system index, which is later retrieved when opening the Directory.

Contributor: Ah sorry, I was confused in my previous comment; I had missed that this file is read eagerly when opening an index.

DII("dii", "Points Index", false, false),
// dim files only apply up to lucene 8.x indices. It can be removed once we are in lucene 10
DIM("dim", "Points", false, true),
// MMapDirectory has special logic to read long[] arrays in little-endian order that helps speed
// up the decoding of postings. The same logic applies to positions (.pos) of offsets (.pay) but we
// are not mmaping them as queries that leverage positions are more costly and the decoding of postings
// tends to be less a bottleneck.
DOC("doc", "Frequencies", false, true),
// Doc values are typically performance-sensitive and hot in the page
// cache, so we use mmap, which provides better performance.
DVD("dvd", "DocValues", false, true),
DVM("dvm", "DocValues Metadata", true, false),
FDM("fdm", "Field Metadata", true, false),
FDT("fdt", "Field Data", false, false),
FDX("fdx", "Field Index", false, false),
FNM("fnm", "Fields", true, false),
// old extension
KDD("kdd", "Points", false, true),
// old extension
KDI("kdi", "Points Index", false, true),
// Lucene 8.6 point format metadata file
KDM("kdm", "Points Metadata", true, false),
LIV("liv", "Live Documents", false, false),
Contributor: With the current implementation of live docs, they are fully read when opening an index; should we treat them as metadata?

Member Author: IIRC we did not flag liv files as metadata since we were expecting most indices to use soft-deletes, and also because in my mind liv files can become large (?).

Contributor: .liv files can indeed be quite large, but so can .cmp files.

LKP("lkp", "Completion Dictionary", false, false),
// Norms are typically performance-sensitive and hot in the page
// cache, so we use mmap, which provides better performance.
NVD("nvd", "Norms", false, true),
NVM("nvm", "Norms Metadata", true, false),
PAY("pay", "Payloads", false, false),
POS("pos", "Positions", false, false),
SI("si", "Segment Info", true, false),
// Term dictionaries are typically performance-sensitive and hot in the page
// cache, so we use mmap, which provides better performance.
TIM("tim", "Term Dictionary", false, true),
// We want to open the terms index and KD-tree index off-heap to save memory, but this only performs
// well if using mmap.
TIP("tip", "Term Index", false, true),
// Lucene 8.6 terms metadata file
TMD("tmd", "Term Dictionary Metadata", true, false),
// Temporary Lucene file
TMP("tmp", "Temporary File", false, false),
TVD("tvd", "Term Vector Documents", false, false),
TVF("tvf", "Term Vector Fields", false, false),
TVM("tvm", "Term Vector Metadata", true, false),
TVX("tvx", "Term Vector Index", false, false),
VEC("vec", "Vector Data", false, false),
// Lucene 9.0 indexed vectors metadata
VEM("vem","Vector Metadata", true, false);
Member Author: In this PR the Segments Stats API uses this list of extensions instead of a specific, more limited one. If this PR is merged, the API will return more file types, which I think is good.

Contributor: ++ that's great

/**
* Lucene file's extension.
*/
private final String extension;

/**
* Short description of the Lucene file
*/
private final String description;

/**
* Some Lucene files should be memory-mapped when applicable.
*/
private final boolean mmap;

/**
* Some Lucene files are considered as "metadata" files and should therefore be fully cached when applicable. Those files are usually
* fully read by Lucene when a Directory is opened. For non-metadata files Lucene usually only reads the header and footer checksums.
*/
private final boolean metadata;

LuceneFilesExtensions(String extension, String description, boolean metadata, boolean mmap) {
this.description = Objects.requireNonNull(description);
this.extension = Objects.requireNonNull(extension);
this.metadata = metadata;
this.mmap = mmap;
}

public String getDescription() {
return description;
}

public String getExtension() {
return extension;
}

public boolean isMetadata() {
return metadata;
}

public boolean shouldMmap() {
return mmap;
}

private static final Map<String, LuceneFilesExtensions> extensions;
static {
final Map<String, LuceneFilesExtensions> map = new HashMap<>(values().length);
for (LuceneFilesExtensions extension : values()) {
map.put(extension.extension, extension);
}
extensions = Collections.unmodifiableMap(map);
}

@Nullable
public static LuceneFilesExtensions fromExtension(String ext) {
if (ext != null && ext.isEmpty() == false) {
final LuceneFilesExtensions extension = extensions.get(ext);
assert extension != null: "unknown Lucene file extension [" + ext + ']';
Member Author: This is a best effort to catch any missing extension.

return extension;
}
return null;
}
}
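To tie the pieces together, here is a hedged usage sketch of the new enum covering the three flags consumed elsewhere in this PR: the description surfaced by the Segments Stats API, the mmap hint used by the hybrid directory, and the metadata flag discussed in the review comments above for searchable snapshots. The class name and the example file name are illustrative only.

import org.apache.lucene.store.FileSwitchDirectory;
import org.elasticsearch.index.store.LuceneFilesExtensions;

public class LuceneFilesExtensionsUsage {
    public static void main(String[] args) {
        // Resolve the extension of a segment file name, as the hybrid directory's useDelegate(...) does.
        final String ext = FileSwitchDirectory.getExtension("_0.dvd"); // "dvd"
        final LuceneFilesExtensions extension = LuceneFilesExtensions.fromExtension(ext);
        if (extension != null) {
            // "DocValues": the description reported by the segment stats
            System.out.println(extension.getDescription());
            // true: doc values are hot in the page cache, so the hybrid directory mmaps them
            System.out.println(extension.shouldMmap());
            // false: .dvd files are not metadata files, so per the javadoc above usually only
            // their header and footer checksums are read rather than the whole file being cached
            System.out.println(extension.isMetadata());
        }
    }
}

Centralizing these flags in one enum means a new Lucene file extension only has to be registered once, and the assert in fromExtension() surfaces any omission during tests.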
@@ -22,6 +22,7 @@
import org.apache.lucene.index.NoMergePolicy;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.elasticsearch.index.store.LuceneFilesExtensions;
import org.elasticsearch.test.ESTestCase;

public class SegmentsStatsTests extends ESTestCase {
@@ -58,7 +59,7 @@ public void testFileExtensionDescriptions() throws Exception {
}
if (extension != null) {
assertNotNull("extension [" + extension + "] was not contained in the known segment stats files",
SegmentsStats.FILE_DESCRIPTIONS.get(extension));
LuceneFilesExtensions.fromExtension(extension));
}
}
}
@@ -42,7 +42,7 @@ public void testPreload() throws IOException {
doTestPreload("*");
Settings build = Settings.builder()
.put(IndexModule.INDEX_STORE_TYPE_SETTING.getKey(), IndexModule.Type.HYBRIDFS.name().toLowerCase(Locale.ROOT))
.putList(IndexModule.INDEX_STORE_PRE_LOAD_SETTING.getKey(), "dvd", "bar")
.putList(IndexModule.INDEX_STORE_PRE_LOAD_SETTING.getKey(), "dvd", "tmp")
Contributor: Why was this change necessary? bar is not a valid extension, just like tmp?

Member Author: bar is not a valid extension, while tmp is used by Lucene for temporary files. I've added

// Temporary Lucene file
TMP("tmp", "Temporary File", false, false),

for this purpose.

.build();
try (Directory directory = newDirectory(build)) {
assertTrue(FsDirectoryFactory.isHybridFs(directory));
@@ -56,12 +56,12 @@
assertTrue(hybridDirectory.useDelegate("foo.kdd", newIOContext(random())));
assertTrue(hybridDirectory.useDelegate("foo.kdi", newIOContext(random())));
assertFalse(hybridDirectory.useDelegate("foo.kdi", Store.READONCE_CHECKSUM));
assertFalse(hybridDirectory.useDelegate("foo.bar", newIOContext(random())));
assertFalse(hybridDirectory.useDelegate("foo.tmp", newIOContext(random())));
MMapDirectory delegate = hybridDirectory.getDelegate();
assertThat(delegate, Matchers.instanceOf(FsDirectoryFactory.PreLoadMMapDirectory.class));
FsDirectoryFactory.PreLoadMMapDirectory preLoadMMapDirectory = (FsDirectoryFactory.PreLoadMMapDirectory) delegate;
assertTrue(preLoadMMapDirectory.useDelegate("foo.dvd"));
assertTrue(preLoadMMapDirectory.useDelegate("foo.bar"));
assertTrue(preLoadMMapDirectory.useDelegate("foo.tmp"));
}
}

@@ -98,12 +98,12 @@ private void doTestPreload(String...preload) throws IOException {
assertFalse(preLoadMMapDirectory.useDelegate("XXX"));
assertFalse(preLoadMMapDirectory.getPreload());
preLoadMMapDirectory.close();
expectThrows(AlreadyClosedException.class, () -> preLoadMMapDirectory.getDelegate().openInput("foo.bar",
expectThrows(AlreadyClosedException.class, () -> preLoadMMapDirectory.getDelegate().openInput("foo.tmp",
IOContext.DEFAULT));
}
}
expectThrows(AlreadyClosedException.class, () -> directory.openInput(randomBoolean() && preload.length != 0 ?
"foo." + preload[0] : "foo.bar", IOContext.DEFAULT));
"foo." + preload[0] : "foo.tmp", IOContext.DEFAULT));
}

public void testStoreDirectory() throws IOException {
@@ -15,6 +15,7 @@
import org.elasticsearch.common.util.concurrent.EsExecutors;
import org.elasticsearch.common.util.concurrent.EsThreadPoolExecutor;
import org.elasticsearch.common.util.concurrent.ThreadContext;
import org.elasticsearch.index.store.LuceneFilesExtensions;
import org.elasticsearch.test.ESTestCase;
import org.junit.AfterClass;
import org.junit.BeforeClass;
@@ -196,33 +197,6 @@ private String randomUniqueSliceName() {
}

protected static String randomFileExtension() {
return randomFrom(
".cfe",
".cfs",
".dii",
".dim",
".doc",
".dvd",
".dvm",
".fdt",
".fdx",
".fdm",
".fnm",
".kdd",
".kdi",
".kdm",
".liv",
".nvd",
".nvm",
".pay",
".pos",
".tim",
".tip",
".tmd",
".tvd",
".tvx",
".vec",
".vem"
);
return '.' + randomFrom(LuceneFilesExtensions.values()).getExtension();
}
}