Centralize Lucene files extensions in one place #71416


Merged (10 commits, Apr 12, 2021)
@@ -16,6 +16,7 @@
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.xcontent.ToXContentFragment;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.index.store.LuceneFilesExtensions;

import java.io.IOException;

@@ -35,40 +36,6 @@ public class SegmentsStats implements Writeable, ToXContentFragment {
private long bitsetMemoryInBytes;
private ImmutableOpenMap<String, Long> fileSizes = ImmutableOpenMap.of();

/*
* A map to provide a best-effort approach describing Lucene index files.
*
* Ideally this should be in sync to what the current version of Lucene is using, but it's harmless to leave extensions out,
* they'll just miss a proper description in the stats
*/
static final ImmutableOpenMap<String, String> FILE_DESCRIPTIONS = ImmutableOpenMap.<String, String>builder()
.fPut("si", "Segment Info")
.fPut("fnm", "Fields")
.fPut("fdm", "Field Metadata")
.fPut("fdx", "Field Index")
.fPut("fdt", "Field Data")
.fPut("tmd", "Term Dictionary Metadata")
.fPut("tim", "Term Dictionary")
.fPut("tip", "Term Index")
.fPut("doc", "Frequencies")
.fPut("pos", "Positions")
.fPut("pay", "Payloads")
.fPut("nvd", "Norms")
.fPut("nvm", "Norms metadata")
.fPut("kdm", "Points Metadata")
.fPut("kdi", "Points Index")
.fPut("kdm", "Points Metadata")
.fPut("kdi", "Points Index") // old extension
.fPut("kdd", "Points") // old extension
.fPut("dvd", "DocValues")
.fPut("dvm", "DocValues Metadata")
.fPut("tvm", "Term Vector Metadata")
.fPut("tvx", "Term Vector Index")
.fPut("tvd", "Term Vector Documents")
.fPut("tvf", "Term Vector Fields")
.fPut("liv", "Live Documents")
.build();

public SegmentsStats() {}

public SegmentsStats(StreamInput in) throws IOException {
@@ -321,7 +288,8 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
for (ObjectObjectCursor<String, Long> entry : fileSizes) {
builder.startObject(entry.key);
builder.humanReadableField(Fields.SIZE_IN_BYTES, Fields.SIZE, new ByteSizeValue(entry.value));
builder.field(Fields.DESCRIPTION, FILE_DESCRIPTIONS.getOrDefault(entry.key, "Others"));
LuceneFilesExtensions extension = LuceneFilesExtensions.fromExtension(entry.key);
builder.field(Fields.DESCRIPTION, extension != null ? extension.getDescription() : "Others");
builder.endObject();
}
builder.endObject();
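As an aside, here is a minimal sketch of the lookup pattern this hunk switches to. It assumes the new LuceneFilesExtensions enum (added later in this PR) is on the classpath; the class and method names below are illustrative only, not part of the change.

import org.elasticsearch.index.store.LuceneFilesExtensions;

public class SegmentFileDescriptions {

    // Mirrors the new toXContent logic: known extensions get their description, and
    // anything the enum does not track falls back to "Others", as the old
    // FILE_DESCRIPTIONS.getOrDefault(...) call did. Note that fromExtension() also
    // asserts on unknown extensions, so with -ea enabled a missing extension fails
    // loudly in tests instead of silently being reported as "Others".
    static String describe(String fileExtension) {
        final LuceneFilesExtensions extension = LuceneFilesExtensions.fromExtension(fileExtension);
        return extension != null ? extension.getDescription() : "Others";
    }

    public static void main(String[] args) {
        System.out.println(describe("dvd")); // DocValues
        System.out.println(describe("tmd")); // Term Dictionary Metadata
    }
}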
@@ -58,7 +58,7 @@ public Directory newDirectory(IndexSettings indexSettings, ShardPath path) throw

protected Directory newFSDirectory(Path location, LockFactory lockFactory, IndexSettings indexSettings) throws IOException {
final String storeType =
indexSettings.getSettings().get(IndexModule.INDEX_STORE_TYPE_SETTING.getKey(), IndexModule.Type.FS.getSettingsKey());
indexSettings.getSettings().get(IndexModule.INDEX_STORE_TYPE_SETTING.getKey(), IndexModule.Type.FS.getSettingsKey());
IndexModule.Type type;
if (IndexModule.Type.FS.match(storeType)) {
type = IndexModule.defaultStoreType(IndexModule.NODE_STORE_ALLOW_MMAP.get(indexSettings.getNodeSettings()));
@@ -89,7 +89,7 @@ protected Directory newFSDirectory(Path location, LockFactory lockFactory, Index
}

public static MMapDirectory setPreload(MMapDirectory mMapDirectory, LockFactory lockFactory,
Set<String> preLoadExtensions) throws IOException {
Set<String> preLoadExtensions) throws IOException {
assert mMapDirectory.getPreload() == false;
if (preLoadExtensions.isEmpty() == false) {
if (preLoadExtensions.contains("*")) {
@@ -145,35 +145,14 @@ boolean useDelegate(String name, IOContext ioContext) {
return false;
}

String extension = FileSwitchDirectory.getExtension(name);
switch(extension) {
// Norms, doc values and term dictionaries are typically performance-sensitive and hot in the page
// cache, so we use mmap, which provides better performance.
case "nvd":
case "dvd":
case "tim":
// We want to open the terms index and KD-tree index off-heap to save memory, but this only performs
// well if using mmap.
case "tip":
// dim files only apply up to lucene 8.x indices. It can be removed once we are in lucene 10
case "dim":
case "kdd":
case "kdi":
// Compound files are tricky because they store all the information for the segment. Benchmarks
// suggested that not mapping them hurts performance.
case "cfs":
// MMapDirectory has special logic to read long[] arrays in little-endian order that helps speed
// up the decoding of postings. The same logic applies to positions (.pos) of offsets (.pay) but we
// are not mmaping them as queries that leverage positions are more costly and the decoding of postings
// tends to be less a bottleneck.
case "doc":
return true;
default:
return false;
}
final LuceneFilesExtensions extension = LuceneFilesExtensions.fromExtension(FileSwitchDirectory.getExtension(name));
if (extension == null || extension.shouldMmap() == false) {
// Other files are either less performance-sensitive (e.g. stored field index, norms metadata)
// or are large and have a random access pattern and mmap leads to page cache trashing
// (e.g. stored fields and term vectors).
return false;
}
return true;
}

MMapDirectory getDelegate() {
@@ -0,0 +1,136 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/

package org.elasticsearch.index.store;

import org.elasticsearch.common.Nullable;

import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;

public enum LuceneFilesExtensions {

CFE("cfe", "Compound Files Entries", true, false),
// Compound files are tricky because they store all the information for the segment. Benchmarks
// suggested that not mapping them hurts performance.
CFS("cfs", "Compound Files", false, true),
CMP("cmp", "Completion Index", true, false),
Member Author: Introduced in this PR, this file will be treated as a metadata one by searchable snapshots.

Contributor: Does it mean that this file will be downloaded eagerly even if the query doesn't need it? That doesn't sound like the right trade-off to me.

Member Author: No, it means that this file will be cached a bit differently to speed up searchable snapshot shard recoveries. For example, if this file is smaller than or equal to 64KB it will be fully cached as a doc in the .snapshot-blob-cache system index, which is later retrieved when opening the Directory.

Contributor: Ah sorry, I was confused in my previous comment; I had missed that this file is read eagerly when opening an index.

DII("dii", "Points Index", false, false),
// dim files only apply up to lucene 8.x indices. It can be removed once we are in lucene 10
DIM("dim", "Points", false, true),
// MMapDirectory has special logic to read long[] arrays in little-endian order that helps speed
// up the decoding of postings. The same logic applies to positions (.pos) of offsets (.pay) but we
// are not mmaping them as queries that leverage positions are more costly and the decoding of postings
// tends to be less a bottleneck.
DOC("doc", "Frequencies", false, true),
// Doc values are typically performance-sensitive and hot in the page
// cache, so we use mmap, which provides better performance.
DVD("dvd", "DocValues", false, true),
DVM("dvm", "DocValues Metadata", true, false),
FDM("fdm", "Field Metadata", true, false),
FDT("fdt", "Field Data", false, false),
FDX("fdx", "Field Index", false, false),
FNM("fnm", "Fields", true, false),
// old extension
KDD("kdd", "Points", false, true),
// old extension
KDI("kdi", "Points Index", false, true),
// Lucene 8.6 point format metadata file
KDM("kdm", "Points Metadata", true, false),
LIV("liv", "Live Documents", false, false),
Contributor: With the current implementation of live docs, they are fully read when opening an index; should we treat them as metadata?

Member Author: IIRC we did not flag liv files as metadata since we were expecting most indices to use soft-deletes, and also because in my mind liv files can become large (?).

Contributor: .liv files can indeed be quite large, but so can .cmp files.

LKP("lkp", "Completion Dictionary", false, false),
// Norms are typically performance-sensitive and hot in the page
// cache, so we use mmap, which provides better performance.
NVD("nvd", "Norms", false, true),
NVM("nvm", "Norms Metadata", true, false),
PAY("pay", "Payloads", false, false),
POS("pos", "Positions", false, false),
SI("si", "Segment Info", true, false),
// Term dictionaries are typically performance-sensitive and hot in the page
// cache, so we use mmap, which provides better performance.
TIM("tim", "Term Dictionary", false, true),
// We want to open the terms index and KD-tree index off-heap to save memory, but this only performs
// well if using mmap.
TIP("tip", "Term Index", false, true),
// Lucene 8.6 terms metadata file
TMD("tmd", "Term Dictionary Metadata", true, false),
// Temporary Lucene file
TMP("tmp", "Temporary File", false, false),
TVD("tvd", "Term Vector Documents", false, false),
TVF("tvf", "Term Vector Fields", false, false),
TVM("tvm", "Term Vector Metadata", true, false),
TVX("tvx", "Term Vector Index", false, false),
VEC("vec", "Vector Data", false, false),
// Lucene 9.0 indexed vectors metadata
VEM("vem","Vector Metadata", true, false);
Member Author: In this PR the Segments Stats API uses this list of extensions instead of a specific, more limited one. If this PR is merged, the API will return more file types, which I think is good.

Contributor: ++ that's great

/**
* Lucene file's extension.
*/
private final String extension;

/**
* Short description of the Lucene file
*/
private final String description;

/**
* Some Lucene files should be memory-mapped when applicable.
*/
private final boolean mmap;

/**
* Some Lucene files are considered as "metadata" files and should therefore be fully cached when applicable. Those files are usually
* fully read by Lucene when a Directory is opened. For non-metadata files Lucene usually only reads the header and footer checksums.
*/
private final boolean metadata;

LuceneFilesExtensions(String extension, String description, boolean metadata, boolean mmap) {
this.description = Objects.requireNonNull(description);
this.extension = Objects.requireNonNull(extension);
this.metadata = metadata;
this.mmap = mmap;
}

public String getDescription() {
return description;
}

public String getExtension() {
return extension;
}

public boolean isMetadata() {
return metadata;
}

public boolean shouldMmap() {
return mmap;
}

private static final Map<String, LuceneFilesExtensions> extensions;
static {
final Map<String, LuceneFilesExtensions> map = new HashMap<>(values().length);
for (LuceneFilesExtensions extension : values()) {
map.put(extension.extension, extension);
}
extensions = Collections.unmodifiableMap(map);
}

@Nullable
public static LuceneFilesExtensions fromExtension(String ext) {
if (ext != null && ext.isEmpty() == false) {
final LuceneFilesExtensions extension = extensions.get(ext);
assert extension != null: "unknown Lucene file extension [" + ext + ']';
Member Author: This is a best effort to catch any missing extension.

return extension;
}
return null;
}
}
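To tie the pieces together, here is a hedged usage sketch of the new enum covering the three flags consumed elsewhere in this PR: the description surfaced by the Segments Stats API, the mmap hint used by the hybrid directory, and the metadata flag discussed in the review comments above for searchable snapshots. The class name and the example file name are illustrative only.

import org.apache.lucene.store.FileSwitchDirectory;
import org.elasticsearch.index.store.LuceneFilesExtensions;

public class LuceneFilesExtensionsUsage {
    public static void main(String[] args) {
        // Resolve the extension of a segment file name, as the hybrid directory's useDelegate(...) does.
        final String ext = FileSwitchDirectory.getExtension("_0.dvd"); // "dvd"
        final LuceneFilesExtensions extension = LuceneFilesExtensions.fromExtension(ext);
        if (extension != null) {
            // "DocValues": the description reported by the segment stats
            System.out.println(extension.getDescription());
            // true: doc values are hot in the page cache, so the hybrid directory mmaps them
            System.out.println(extension.shouldMmap());
            // false: .dvd files are not metadata files, so per the javadoc above usually only
            // their header and footer checksums are read rather than the whole file being cached
            System.out.println(extension.isMetadata());
        }
    }
}

Centralizing these flags in one enum means a new Lucene file extension only has to be registered once, and the assert in fromExtension() surfaces any omission during tests.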
@@ -22,6 +22,7 @@
import org.apache.lucene.index.NoMergePolicy;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.elasticsearch.index.store.LuceneFilesExtensions;
import org.elasticsearch.test.ESTestCase;

public class SegmentsStatsTests extends ESTestCase {
@@ -58,7 +59,7 @@ public void testFileExtensionDescriptions() throws Exception {
}
if (extension != null) {
assertNotNull("extension [" + extension + "] was not contained in the known segment stats files",
SegmentsStats.FILE_DESCRIPTIONS.get(extension));
LuceneFilesExtensions.fromExtension(extension));
}
}
}
@@ -42,7 +42,7 @@ public void testPreload() throws IOException {
doTestPreload("*");
Settings build = Settings.builder()
.put(IndexModule.INDEX_STORE_TYPE_SETTING.getKey(), IndexModule.Type.HYBRIDFS.name().toLowerCase(Locale.ROOT))
.putList(IndexModule.INDEX_STORE_PRE_LOAD_SETTING.getKey(), "dvd", "bar")
.putList(IndexModule.INDEX_STORE_PRE_LOAD_SETTING.getKey(), "dvd", "tmp")
Contributor: Why was this change necessary? bar is not a valid extension, just like tmp?

Member Author: bar is not a valid extension, while tmp is used by Lucene for temporary files. I've added

// Temporary Lucene file
TMP("tmp", "Temporary File", false, false),

for this purpose.

.build();
try (Directory directory = newDirectory(build)) {
assertTrue(FsDirectoryFactory.isHybridFs(directory));
@@ -56,12 +56,12 @@
assertTrue(hybridDirectory.useDelegate("foo.kdd", newIOContext(random())));
assertTrue(hybridDirectory.useDelegate("foo.kdi", newIOContext(random())));
assertFalse(hybridDirectory.useDelegate("foo.kdi", Store.READONCE_CHECKSUM));
assertFalse(hybridDirectory.useDelegate("foo.bar", newIOContext(random())));
assertFalse(hybridDirectory.useDelegate("foo.tmp", newIOContext(random())));
MMapDirectory delegate = hybridDirectory.getDelegate();
assertThat(delegate, Matchers.instanceOf(FsDirectoryFactory.PreLoadMMapDirectory.class));
FsDirectoryFactory.PreLoadMMapDirectory preLoadMMapDirectory = (FsDirectoryFactory.PreLoadMMapDirectory) delegate;
assertTrue(preLoadMMapDirectory.useDelegate("foo.dvd"));
assertTrue(preLoadMMapDirectory.useDelegate("foo.bar"));
assertTrue(preLoadMMapDirectory.useDelegate("foo.tmp"));
}
}

@@ -98,12 +98,12 @@ private void doTestPreload(String...preload) throws IOException {
assertFalse(preLoadMMapDirectory.useDelegate("XXX"));
assertFalse(preLoadMMapDirectory.getPreload());
preLoadMMapDirectory.close();
expectThrows(AlreadyClosedException.class, () -> preLoadMMapDirectory.getDelegate().openInput("foo.bar",
expectThrows(AlreadyClosedException.class, () -> preLoadMMapDirectory.getDelegate().openInput("foo.tmp",
IOContext.DEFAULT));
}
}
expectThrows(AlreadyClosedException.class, () -> directory.openInput(randomBoolean() && preload.length != 0 ?
"foo." + preload[0] : "foo.bar", IOContext.DEFAULT));
"foo." + preload[0] : "foo.tmp", IOContext.DEFAULT));
}

public void testStoreDirectory() throws IOException {
@@ -15,6 +15,7 @@
import org.elasticsearch.common.util.concurrent.EsExecutors;
import org.elasticsearch.common.util.concurrent.EsThreadPoolExecutor;
import org.elasticsearch.common.util.concurrent.ThreadContext;
import org.elasticsearch.index.store.LuceneFilesExtensions;
import org.elasticsearch.test.ESTestCase;
import org.junit.AfterClass;
import org.junit.BeforeClass;
@@ -196,33 +197,6 @@ private String randomUniqueSliceName() {
}

protected static String randomFileExtension() {
return randomFrom(
".cfe",
".cfs",
".dii",
".dim",
".doc",
".dvd",
".dvm",
".fdt",
".fdx",
".fdm",
".fnm",
".kdd",
".kdi",
".kdm",
".liv",
".nvd",
".nvm",
".pay",
".pos",
".tim",
".tip",
".tmd",
".tvd",
".tvx",
".vec",
".vem"
);
return '.' + randomFrom(LuceneFilesExtensions.values()).getExtension();
}
}