-
Notifications
You must be signed in to change notification settings - Fork 25.2k
Centralize Lucene files extensions in one place #71416
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
be21bf5
a991bf9
939c553
66dec29
9bdcae9
7ff0eea
b4eb5b3
11d4483
bd17456
b3fc0ae
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,136 @@ | ||
/* | ||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one | ||
* or more contributor license agreements. Licensed under the Elastic License | ||
* 2.0 and the Server Side Public License, v 1; you may not use this file except | ||
* in compliance with, at your election, the Elastic License 2.0 or the Server | ||
* Side Public License, v 1. | ||
*/ | ||
|
||
package org.elasticsearch.index.store; | ||
|
||
import org.elasticsearch.common.Nullable; | ||
|
||
import java.util.Collections; | ||
import java.util.HashMap; | ||
import java.util.Map; | ||
import java.util.Objects; | ||
|
||
public enum LuceneFilesExtensions { | ||
|
||
CFE("cfe", "Compound Files Entries", true, false), | ||
// Compound files are tricky because they store all the information for the segment. Benchmarks | ||
// suggested that not mapping them hurts performance. | ||
CFS("cfs", "Compound Files", false, true), | ||
CMP("cmp", "Completion Index", true, false), | ||
DII("dii", "Points Index", false, false), | ||
// dim files only apply up to lucene 8.x indices. It can be removed once we are in lucene 10 | ||
DIM("dim", "Points", false, true), | ||
// MMapDirectory has special logic to read long[] arrays in little-endian order that helps speed | ||
// up the decoding of postings. The same logic applies to positions (.pos) of offsets (.pay) but we | ||
// are not mmaping them as queries that leverage positions are more costly and the decoding of postings | ||
// tends to be less a bottleneck. | ||
DOC("doc", "Frequencies", false, true), | ||
// Doc values are typically performance-sensitive and hot in the page | ||
// cache, so we use mmap, which provides better performance. | ||
DVD("dvd", "DocValues", false, true), | ||
DVM("dvm", "DocValues Metadata", true, false), | ||
FDM("fdm", "Field Metadata", true, false), | ||
FDT("fdt", "Field Data", false, false), | ||
FDX("fdx", "Field Index", false, false), | ||
FNM("fnm", "Fields", true, false), | ||
// old extension | ||
KDD("kdd", "Points", false, true), | ||
// old extension | ||
KDI("kdi", "Points Index", false, true), | ||
// Lucene 8.6 point format metadata file | ||
KDM("kdm", "Points Metadata", true, false), | ||
LIV("liv", "Live Documents", false, false), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. With the current implementation of live docs, they are fully read when opening an index, should we treat them as metadata? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. IIRC we did not flag There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
LKP("lkp", "Completion Dictionary", false, false), | ||
// Norms are typically performance-sensitive and hot in the page | ||
// cache, so we use mmap, which provides better performance. | ||
NVD("nvd", "Norms", false, true), | ||
NVM("nvm", "Norms Metadata", true, false), | ||
PAY("pay", "Payloads", false, false), | ||
POS("pos", "Positions", false, false), | ||
SI("si", "Segment Info", true, false), | ||
// Term dictionaries are typically performance-sensitive and hot in the page | ||
// cache, so we use mmap, which provides better performance. | ||
TIM("tim", "Term Dictionary", false, true), | ||
// We want to open the terms index and KD-tree index off-heap to save memory, but this only performs | ||
// well if using mmap. | ||
TIP("tip", "Term Index", false, true), | ||
// Lucene 8.6 terms metadata file | ||
TMD("tmd", "Term Dictionary Metadata", true, false), | ||
// Temporary Lucene file | ||
TMP("tmp", "Temporary File", false, false), | ||
TVD("tvd", "Term Vector Documents", false, false), | ||
TVF("tvf", "Term Vector Fields", false, false), | ||
TVM("tvm", "Term Vector Metadata", true, false), | ||
TVX("tvx", "Term Vector Index", false, false), | ||
VEC("vec", "Vector Data", false, false), | ||
// Lucene 9.0 indexed vectors metadata | ||
VEM("vem","Vector Metadata", true, false); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In this PR the Segments Stats API uses this list of extensions instead of a specific, more limited one. If this PR is merged then the API will return more file types which I think is good. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ++ that's great |
||
|
||
/** | ||
* Lucene file's extension. | ||
*/ | ||
private final String extension; | ||
|
||
/** | ||
* Short description of the Lucene file | ||
*/ | ||
private final String description; | ||
|
||
/** | ||
* Some Lucene files should be memory-mapped when applicable. | ||
*/ | ||
private final boolean mmap; | ||
|
||
/** | ||
* Some Lucene files are considered as "metadata" files and should therefore be fully cached when applicable. Those files are usually | ||
* fully read by Lucene when a Directory is opened. For non-metadata files Lucene usually only reads the header and footer checksums. | ||
*/ | ||
private final boolean metadata; | ||
|
||
LuceneFilesExtensions(String extension, String description, boolean metadata, boolean mmap) { | ||
this.description = Objects.requireNonNull(description); | ||
this.extension = Objects.requireNonNull(extension); | ||
this.metadata = metadata; | ||
this.mmap = mmap; | ||
} | ||
|
||
public String getDescription() { | ||
return description; | ||
} | ||
|
||
public String getExtension() { | ||
return extension; | ||
} | ||
|
||
public boolean isMetadata() { | ||
return metadata; | ||
} | ||
|
||
public boolean shouldMmap() { | ||
return mmap; | ||
} | ||
|
||
private static final Map<String, LuceneFilesExtensions> extensions; | ||
static { | ||
final Map<String, LuceneFilesExtensions> map = new HashMap<>(values().length); | ||
for (LuceneFilesExtensions extension : values()) { | ||
map.put(extension.extension, extension); | ||
} | ||
extensions = Collections.unmodifiableMap(map); | ||
} | ||
|
||
@Nullable | ||
public static LuceneFilesExtensions fromExtension(String ext) { | ||
if (ext != null && ext.isEmpty() == false) { | ||
final LuceneFilesExtensions extension = extensions.get(ext); | ||
assert extension != null: "unknown Lucene file extension [" + ext + ']'; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is a best effort to catch any missing extension |
||
return extension; | ||
} | ||
return null; | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -42,7 +42,7 @@ public void testPreload() throws IOException { | |
doTestPreload("*"); | ||
Settings build = Settings.builder() | ||
.put(IndexModule.INDEX_STORE_TYPE_SETTING.getKey(), IndexModule.Type.HYBRIDFS.name().toLowerCase(Locale.ROOT)) | ||
.putList(IndexModule.INDEX_STORE_PRE_LOAD_SETTING.getKey(), "dvd", "bar") | ||
.putList(IndexModule.INDEX_STORE_PRE_LOAD_SETTING.getKey(), "dvd", "tmp") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why way this change necessary? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
for this purpose |
||
.build(); | ||
try (Directory directory = newDirectory(build)) { | ||
assertTrue(FsDirectoryFactory.isHybridFs(directory)); | ||
|
@@ -56,12 +56,12 @@ public void testPreload() throws IOException { | |
assertTrue(hybridDirectory.useDelegate("foo.kdd", newIOContext(random()))); | ||
assertTrue(hybridDirectory.useDelegate("foo.kdi", newIOContext(random()))); | ||
assertFalse(hybridDirectory.useDelegate("foo.kdi", Store.READONCE_CHECKSUM)); | ||
assertFalse(hybridDirectory.useDelegate("foo.bar", newIOContext(random()))); | ||
assertFalse(hybridDirectory.useDelegate("foo.tmp", newIOContext(random()))); | ||
MMapDirectory delegate = hybridDirectory.getDelegate(); | ||
assertThat(delegate, Matchers.instanceOf(FsDirectoryFactory.PreLoadMMapDirectory.class)); | ||
FsDirectoryFactory.PreLoadMMapDirectory preLoadMMapDirectory = (FsDirectoryFactory.PreLoadMMapDirectory) delegate; | ||
assertTrue(preLoadMMapDirectory.useDelegate("foo.dvd")); | ||
assertTrue(preLoadMMapDirectory.useDelegate("foo.bar")); | ||
assertTrue(preLoadMMapDirectory.useDelegate("foo.tmp")); | ||
} | ||
} | ||
|
||
|
@@ -98,12 +98,12 @@ private void doTestPreload(String...preload) throws IOException { | |
assertFalse(preLoadMMapDirectory.useDelegate("XXX")); | ||
assertFalse(preLoadMMapDirectory.getPreload()); | ||
preLoadMMapDirectory.close(); | ||
expectThrows(AlreadyClosedException.class, () -> preLoadMMapDirectory.getDelegate().openInput("foo.bar", | ||
expectThrows(AlreadyClosedException.class, () -> preLoadMMapDirectory.getDelegate().openInput("foo.tmp", | ||
IOContext.DEFAULT)); | ||
} | ||
} | ||
expectThrows(AlreadyClosedException.class, () -> directory.openInput(randomBoolean() && preload.length != 0 ? | ||
"foo." + preload[0] : "foo.bar", IOContext.DEFAULT)); | ||
"foo." + preload[0] : "foo.tmp", IOContext.DEFAULT)); | ||
} | ||
|
||
public void testStoreDirectory() throws IOException { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Introduced in this PR, this file will be treated as a metadata one by searchable snapshots.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does it mean that this file will be downloaded eagerly even if the query doesn't need it? This doesn't sound like the right trade-off to me?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No, it means that this file will be cached a bit differently to speed up searchable snapshots shards recoveries. For example if this file is smaller or equal to 64KB it will be fully cached into a doc in the
.snapshot-blob-cache
system index that will be later retrieved when opening the Directory.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah sorry I was confused in my previous comment, I had missed that this file was read eagerly when opening an index.