Skip to content

Commit 472a1fc

Browse files
authored
[7.x] Centralize Lucene files extensions in one place (#71568)
Elasticsearch enumerates Lucene file extensions for various purposes: grouping files in segment stats under a description, mapping files in memory through HybridDirectory, or adjusting the caching strategy for Lucene files in searchable snapshots. But when a new extension is handled somewhere (for example, added to the list of files to mmap), it is easy to forget to add it in the other places. This commit is an attempt to centralize all Lucene file extensions known to Elasticsearch in a single place. Backport of #71416
1 parent 964b626 commit 472a1fc

File tree

7 files changed

+161
-146
lines changed

7 files changed

+161
-146
lines changed

server/src/main/java/org/elasticsearch/index/engine/SegmentsStats.java

Lines changed: 3 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
import org.elasticsearch.common.unit.ByteSizeValue;
1717
import org.elasticsearch.common.xcontent.ToXContentFragment;
1818
import org.elasticsearch.common.xcontent.XContentBuilder;
19+
import org.elasticsearch.index.store.LuceneFilesExtensions;
1920

2021
import java.io.IOException;
2122

@@ -35,40 +36,6 @@ public class SegmentsStats implements Writeable, ToXContentFragment {
3536
private long bitsetMemoryInBytes;
3637
private ImmutableOpenMap<String, Long> fileSizes = ImmutableOpenMap.of();
3738

38-
/*
39-
* A map to provide a best-effort approach describing Lucene index files.
40-
*
41-
* Ideally this should be in sync to what the current version of Lucene is using, but it's harmless to leave extensions out,
42-
* they'll just miss a proper description in the stats
43-
*/
44-
static final ImmutableOpenMap<String, String> FILE_DESCRIPTIONS = ImmutableOpenMap.<String, String>builder()
45-
.fPut("si", "Segment Info")
46-
.fPut("fnm", "Fields")
47-
.fPut("fdm", "Field Metadata")
48-
.fPut("fdx", "Field Index")
49-
.fPut("fdt", "Field Data")
50-
.fPut("tmd", "Term Dictionary Metadata")
51-
.fPut("tim", "Term Dictionary")
52-
.fPut("tip", "Term Index")
53-
.fPut("doc", "Frequencies")
54-
.fPut("pos", "Positions")
55-
.fPut("pay", "Payloads")
56-
.fPut("nvd", "Norms")
57-
.fPut("nvm", "Norms metadata")
58-
.fPut("kdm", "Points Metadata")
59-
.fPut("kdi", "Points Index")
60-
.fPut("kdm", "Points Metadata")
61-
.fPut("kdi", "Points Index") // old extension
62-
.fPut("kdd", "Points") // old extension
63-
.fPut("dvd", "DocValues")
64-
.fPut("dvm", "DocValues Metadata")
65-
.fPut("tvm", "Term Vector Metadata")
66-
.fPut("tvx", "Term Vector Index")
67-
.fPut("tvd", "Term Vector Documents")
68-
.fPut("tvf", "Term Vector Fields")
69-
.fPut("liv", "Live Documents")
70-
.build();
71-
7239
public SegmentsStats() {}
7340

7441
public SegmentsStats(StreamInput in) throws IOException {
@@ -321,7 +288,8 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
321288
for (ObjectObjectCursor<String, Long> entry : fileSizes) {
322289
builder.startObject(entry.key);
323290
builder.humanReadableField(Fields.SIZE_IN_BYTES, Fields.SIZE, new ByteSizeValue(entry.value));
324-
builder.field(Fields.DESCRIPTION, FILE_DESCRIPTIONS.getOrDefault(entry.key, "Others"));
291+
LuceneFilesExtensions extension = LuceneFilesExtensions.fromExtension(entry.key);
292+
builder.field(Fields.DESCRIPTION, extension != null ? extension.getDescription() : "Others");
325293
builder.endObject();
326294
}
327295
builder.endObject();

server/src/main/java/org/elasticsearch/index/store/FsDirectoryFactory.java

Lines changed: 6 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ public Directory newDirectory(IndexSettings indexSettings, ShardPath path) throw
5858

5959
protected Directory newFSDirectory(Path location, LockFactory lockFactory, IndexSettings indexSettings) throws IOException {
6060
final String storeType =
61-
indexSettings.getSettings().get(IndexModule.INDEX_STORE_TYPE_SETTING.getKey(), IndexModule.Type.FS.getSettingsKey());
61+
indexSettings.getSettings().get(IndexModule.INDEX_STORE_TYPE_SETTING.getKey(), IndexModule.Type.FS.getSettingsKey());
6262
IndexModule.Type type;
6363
if (IndexModule.Type.FS.match(storeType)) {
6464
type = IndexModule.defaultStoreType(IndexModule.NODE_STORE_ALLOW_MMAP.get(indexSettings.getNodeSettings()));
@@ -89,7 +89,7 @@ protected Directory newFSDirectory(Path location, LockFactory lockFactory, Index
8989
}
9090

9191
public static MMapDirectory setPreload(MMapDirectory mMapDirectory, LockFactory lockFactory,
92-
Set<String> preLoadExtensions) throws IOException {
92+
Set<String> preLoadExtensions) throws IOException {
9393
assert mMapDirectory.getPreload() == false;
9494
if (preLoadExtensions.isEmpty() == false) {
9595
if (preLoadExtensions.contains("*")) {
@@ -145,35 +145,14 @@ boolean useDelegate(String name, IOContext ioContext) {
145145
return false;
146146
}
147147

148-
String extension = FileSwitchDirectory.getExtension(name);
149-
switch(extension) {
150-
// Norms, doc values and term dictionaries are typically performance-sensitive and hot in the page
151-
// cache, so we use mmap, which provides better performance.
152-
case "nvd":
153-
case "dvd":
154-
case "tim":
155-
// We want to open the terms index and KD-tree index off-heap to save memory, but this only performs
156-
// well if using mmap.
157-
case "tip":
158-
// dim files only apply up to lucene 8.x indices. It can be removed once we are in lucene 10
159-
case "dim":
160-
case "kdd":
161-
case "kdi":
162-
// Compound files are tricky because they store all the information for the segment. Benchmarks
163-
// suggested that not mapping them hurts performance.
164-
case "cfs":
165-
// MMapDirectory has special logic to read long[] arrays in little-endian order that helps speed
166-
// up the decoding of postings. The same logic applies to positions (.pos) of offsets (.pay) but we
167-
// are not mmaping them as queries that leverage positions are more costly and the decoding of postings
168-
// tends to be less a bottleneck.
169-
case "doc":
170-
return true;
148+
final LuceneFilesExtensions extension = LuceneFilesExtensions.fromExtension(FileSwitchDirectory.getExtension(name));
149+
if (extension == null || extension.shouldMmap() == false) {
171150
// Other files are either less performance-sensitive (e.g. stored field index, norms metadata)
172151
// or are large and have a random access pattern and mmap leads to page cache thrashing
173152
// (e.g. stored fields and term vectors).
174-
default:
175-
return false;
153+
return false;
176154
}
155+
return true;
177156
}
178157

179158
MMapDirectory getDelegate() {
Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the Elastic License
4+
* 2.0 and the Server Side Public License, v 1; you may not use this file except
5+
* in compliance with, at your election, the Elastic License 2.0 or the Server
6+
* Side Public License, v 1.
7+
*/
8+
9+
package org.elasticsearch.index.store;
10+
11+
import org.elasticsearch.common.Nullable;
12+
13+
import java.util.Collections;
14+
import java.util.HashMap;
15+
import java.util.Map;
16+
import java.util.Objects;
17+
18+
public enum LuceneFilesExtensions {
19+
20+
CFE("cfe", "Compound Files Entries", true, false),
21+
// Compound files are tricky because they store all the information for the segment. Benchmarks
22+
// suggested that not mapping them hurts performance.
23+
CFS("cfs", "Compound Files", false, true),
24+
CMP("cmp", "Completion Index", true, false),
25+
DII("dii", "Points Index", false, false),
26+
// dim files only apply up to lucene 8.x indices. It can be removed once we are in lucene 10
27+
DIM("dim", "Points", false, true),
28+
// MMapDirectory has special logic to read long[] arrays in little-endian order that helps speed
29+
// up the decoding of postings. The same logic applies to positions (.pos) and offsets (.pay) but we
30+
// are not mmapping them as queries that leverage positions are more costly and the decoding of postings
31+
// tends to be less of a bottleneck.
32+
DOC("doc", "Frequencies", false, true),
33+
// Doc values are typically performance-sensitive and hot in the page
34+
// cache, so we use mmap, which provides better performance.
35+
DVD("dvd", "DocValues", false, true),
36+
DVM("dvm", "DocValues Metadata", true, false),
37+
FDM("fdm", "Field Metadata", true, false),
38+
FDT("fdt", "Field Data", false, false),
39+
FDX("fdx", "Field Index", false, false),
40+
FNM("fnm", "Fields", true, false),
41+
// old extension
42+
KDD("kdd", "Points", false, true),
43+
// old extension
44+
KDI("kdi", "Points Index", false, true),
45+
// Lucene 8.6 point format metadata file
46+
KDM("kdm", "Points Metadata", true, false),
47+
LIV("liv", "Live Documents", false, false),
48+
LKP("lkp", "Completion Dictionary", false, false),
49+
// Norms are typically performance-sensitive and hot in the page
50+
// cache, so we use mmap, which provides better performance.
51+
NVD("nvd", "Norms", false, true),
52+
NVM("nvm", "Norms Metadata", true, false),
53+
PAY("pay", "Payloads", false, false),
54+
POS("pos", "Positions", false, false),
55+
SI("si", "Segment Info", true, false),
56+
// Term dictionaries are typically performance-sensitive and hot in the page
57+
// cache, so we use mmap, which provides better performance.
58+
TIM("tim", "Term Dictionary", false, true),
59+
// We want to open the terms index and KD-tree index off-heap to save memory, but this only performs
60+
// well if using mmap.
61+
TIP("tip", "Term Index", false, true),
62+
// Lucene 8.6 terms metadata file
63+
TMD("tmd", "Term Dictionary Metadata", true, false),
64+
// Temporary Lucene file
65+
TMP("tmp", "Temporary File", false, false),
66+
TVD("tvd", "Term Vector Documents", false, false),
67+
TVF("tvf", "Term Vector Fields", false, false),
68+
TVM("tvm", "Term Vector Metadata", true, false),
69+
TVX("tvx", "Term Vector Index", false, false),
70+
VEC("vec", "Vector Data", false, false),
71+
// Lucene 9.0 indexed vectors metadata
72+
VEM("vem","Vector Metadata", true, false);
73+
74+
/**
75+
* Lucene file's extension.
76+
*/
77+
private final String extension;
78+
79+
/**
80+
* Short description of the Lucene file
81+
*/
82+
private final String description;
83+
84+
/**
85+
* Some Lucene files should be memory-mapped when applicable.
86+
*/
87+
private final boolean mmap;
88+
89+
/**
90+
* Some Lucene files are considered as "metadata" files and should therefore be fully cached when applicable. Those files are usually
91+
* fully read by Lucene when a Directory is opened. For non-metadata files Lucene usually only reads the header and footer checksums.
92+
*/
93+
private final boolean metadata;
94+
95+
LuceneFilesExtensions(String extension, String description, boolean metadata, boolean mmap) {
96+
this.description = Objects.requireNonNull(description);
97+
this.extension = Objects.requireNonNull(extension);
98+
this.metadata = metadata;
99+
this.mmap = mmap;
100+
}
101+
102+
public String getDescription() {
103+
return description;
104+
}
105+
106+
public String getExtension() {
107+
return extension;
108+
}
109+
110+
public boolean isMetadata() {
111+
return metadata;
112+
}
113+
114+
public boolean shouldMmap() {
115+
return mmap;
116+
}
117+
118+
private static final Map<String, LuceneFilesExtensions> extensions;
119+
static {
120+
final Map<String, LuceneFilesExtensions> map = new HashMap<>(values().length);
121+
for (LuceneFilesExtensions extension : values()) {
122+
map.put(extension.extension, extension);
123+
}
124+
extensions = Collections.unmodifiableMap(map);
125+
}
126+
127+
@Nullable
128+
public static LuceneFilesExtensions fromExtension(String ext) {
129+
if (ext != null && ext.isEmpty() == false) {
130+
final LuceneFilesExtensions extension = extensions.get(ext);
131+
assert extension != null: "unknown Lucene file extension [" + ext + ']';
132+
return extension;
133+
}
134+
return null;
135+
}
136+
}

server/src/test/java/org/elasticsearch/index/engine/SegmentsStatsTests.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import org.apache.lucene.index.NoMergePolicy;
2323
import org.apache.lucene.index.Term;
2424
import org.apache.lucene.store.Directory;
25+
import org.elasticsearch.index.store.LuceneFilesExtensions;
2526
import org.elasticsearch.test.ESTestCase;
2627

2728
public class SegmentsStatsTests extends ESTestCase {
@@ -58,7 +59,7 @@ public void testFileExtensionDescriptions() throws Exception {
5859
}
5960
if (extension != null) {
6061
assertNotNull("extension [" + extension + "] was not contained in the known segment stats files",
61-
SegmentsStats.FILE_DESCRIPTIONS.get(extension));
62+
LuceneFilesExtensions.fromExtension(extension));
6263
}
6364
}
6465
}

server/src/test/java/org/elasticsearch/index/store/FsDirectoryFactoryTests.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ public void testPreload() throws IOException {
4242
doTestPreload("*");
4343
Settings build = Settings.builder()
4444
.put(IndexModule.INDEX_STORE_TYPE_SETTING.getKey(), IndexModule.Type.HYBRIDFS.name().toLowerCase(Locale.ROOT))
45-
.putList(IndexModule.INDEX_STORE_PRE_LOAD_SETTING.getKey(), "dvd", "bar")
45+
.putList(IndexModule.INDEX_STORE_PRE_LOAD_SETTING.getKey(), "dvd", "tmp")
4646
.build();
4747
try (Directory directory = newDirectory(build)) {
4848
assertTrue(FsDirectoryFactory.isHybridFs(directory));
@@ -56,12 +56,12 @@ public void testPreload() throws IOException {
5656
assertTrue(hybridDirectory.useDelegate("foo.kdd", newIOContext(random())));
5757
assertTrue(hybridDirectory.useDelegate("foo.kdi", newIOContext(random())));
5858
assertFalse(hybridDirectory.useDelegate("foo.kdi", Store.READONCE_CHECKSUM));
59-
assertFalse(hybridDirectory.useDelegate("foo.bar", newIOContext(random())));
59+
assertFalse(hybridDirectory.useDelegate("foo.tmp", newIOContext(random())));
6060
MMapDirectory delegate = hybridDirectory.getDelegate();
6161
assertThat(delegate, Matchers.instanceOf(FsDirectoryFactory.PreLoadMMapDirectory.class));
6262
FsDirectoryFactory.PreLoadMMapDirectory preLoadMMapDirectory = (FsDirectoryFactory.PreLoadMMapDirectory) delegate;
6363
assertTrue(preLoadMMapDirectory.useDelegate("foo.dvd"));
64-
assertTrue(preLoadMMapDirectory.useDelegate("foo.bar"));
64+
assertTrue(preLoadMMapDirectory.useDelegate("foo.tmp"));
6565
}
6666
}
6767

@@ -98,12 +98,12 @@ private void doTestPreload(String...preload) throws IOException {
9898
assertFalse(preLoadMMapDirectory.useDelegate("XXX"));
9999
assertFalse(preLoadMMapDirectory.getPreload());
100100
preLoadMMapDirectory.close();
101-
expectThrows(AlreadyClosedException.class, () -> preLoadMMapDirectory.getDelegate().openInput("foo.bar",
101+
expectThrows(AlreadyClosedException.class, () -> preLoadMMapDirectory.getDelegate().openInput("foo.tmp",
102102
IOContext.DEFAULT));
103103
}
104104
}
105105
expectThrows(AlreadyClosedException.class, () -> directory.openInput(randomBoolean() && preload.length != 0 ?
106-
"foo." + preload[0] : "foo.bar", IOContext.DEFAULT));
106+
"foo." + preload[0] : "foo.tmp", IOContext.DEFAULT));
107107
}
108108

109109
public void testStoreDirectory() throws IOException {

test/framework/src/main/java/org/elasticsearch/common/lucene/store/ESIndexInputTestCase.java

Lines changed: 2 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import org.elasticsearch.common.util.concurrent.EsExecutors;
1616
import org.elasticsearch.common.util.concurrent.EsThreadPoolExecutor;
1717
import org.elasticsearch.common.util.concurrent.ThreadContext;
18+
import org.elasticsearch.index.store.LuceneFilesExtensions;
1819
import org.elasticsearch.test.ESTestCase;
1920
import org.junit.AfterClass;
2021
import org.junit.BeforeClass;
@@ -196,33 +197,6 @@ private String randomUniqueSliceName() {
196197
}
197198

198199
protected static String randomFileExtension() {
199-
return randomFrom(
200-
".cfe",
201-
".cfs",
202-
".dii",
203-
".dim",
204-
".doc",
205-
".dvd",
206-
".dvm",
207-
".fdt",
208-
".fdx",
209-
".fdm",
210-
".fnm",
211-
".kdd",
212-
".kdi",
213-
".kdm",
214-
".liv",
215-
".nvd",
216-
".nvm",
217-
".pay",
218-
".pos",
219-
".tim",
220-
".tip",
221-
".tmd",
222-
".tvd",
223-
".tvx",
224-
".vec",
225-
".vem"
226-
);
200+
return '.' + randomFrom(LuceneFilesExtensions.values()).getExtension();
227201
}
228202
}

0 commit comments

Comments
 (0)