Skip to content

Commit 0251ff4

Browse files
committed
Die with dignity while merging
If an out of memory error is thrown while merging, today we quietly rewrap it into a merge exception and the out of memory error is lost. Instead, we need to rethrow out of memory errors, and in fact any fatal error here, and let those go uncaught so that the node is torn down. This commit causes this to be the case. Relates #27265
1 parent fc09a09 commit 0251ff4

File tree

8 files changed

+592
-332
lines changed

8 files changed

+592
-332
lines changed

core/src/main/java/org/elasticsearch/bootstrap/ElasticsearchUncaughtExceptionHandler.java

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121

2222
import org.apache.logging.log4j.Logger;
2323
import org.apache.logging.log4j.message.ParameterizedMessage;
24-
import org.apache.lucene.index.MergePolicy;
2524
import org.elasticsearch.common.SuppressForbidden;
2625
import org.elasticsearch.common.logging.Loggers;
2726

@@ -68,11 +67,7 @@ public void uncaughtException(Thread t, Throwable e) {
6867

6968
// visible for testing
7069
static boolean isFatalUncaught(Throwable e) {
71-
return isFatalCause(e) || (e instanceof MergePolicy.MergeException && isFatalCause(e.getCause()));
72-
}
73-
74-
private static boolean isFatalCause(Throwable cause) {
75-
return cause instanceof Error;
70+
return e instanceof Error;
7671
}
7772

7873
// visible for testing

core/src/main/java/org/elasticsearch/index/engine/InternalEngine.java

Lines changed: 62 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1224,23 +1224,15 @@ public IndexCommit acquireIndexCommit(final boolean flushFirst) throws EngineExc
12241224
}
12251225
}
12261226

1227-
@SuppressWarnings("finally")
12281227
private boolean failOnTragicEvent(AlreadyClosedException ex) {
12291228
final boolean engineFailed;
12301229
// if we are already closed due to some tragic exception
12311230
// we need to fail the engine. it might have already been failed before
12321231
// but we are double-checking it's failed and closed
12331232
if (indexWriter.isOpen() == false && indexWriter.getTragicException() != null) {
1234-
if (indexWriter.getTragicException() instanceof Error) {
1235-
try {
1236-
logger.error("tragic event in index writer", ex);
1237-
} finally {
1238-
throw (Error) indexWriter.getTragicException();
1239-
}
1240-
} else {
1241-
failEngine("already closed by tragic event on the index writer", (Exception) indexWriter.getTragicException());
1242-
engineFailed = true;
1243-
}
1233+
maybeDie("tragic event in index writer", indexWriter.getTragicException());
1234+
failEngine("already closed by tragic event on the index writer", (Exception) indexWriter.getTragicException());
1235+
engineFailed = true;
12441236
} else if (translog.isOpen() == false && translog.getTragicException() != null) {
12451237
failEngine("already closed by tragic event on the translog", translog.getTragicException());
12461238
engineFailed = true;
@@ -1381,34 +1373,43 @@ private long loadCurrentVersionFromIndex(Term uid) throws IOException {
13811373
// pkg-private for testing
13821374
IndexWriter createWriter(boolean create) throws IOException {
13831375
try {
1384-
final IndexWriterConfig iwc = new IndexWriterConfig(engineConfig.getAnalyzer());
1385-
iwc.setCommitOnClose(false); // we by default don't commit on close
1386-
iwc.setOpenMode(create ? IndexWriterConfig.OpenMode.CREATE : IndexWriterConfig.OpenMode.APPEND);
1387-
iwc.setIndexDeletionPolicy(deletionPolicy);
1388-
// with tests.verbose, lucene sets this up: plumb to align with filesystem stream
1389-
boolean verbose = false;
1390-
try {
1391-
verbose = Boolean.parseBoolean(System.getProperty("tests.verbose"));
1392-
} catch (Exception ignore) {
1393-
}
1394-
iwc.setInfoStream(verbose ? InfoStream.getDefault() : new LoggerInfoStream(logger));
1395-
iwc.setMergeScheduler(mergeScheduler);
1396-
MergePolicy mergePolicy = config().getMergePolicy();
1397-
// Give us the opportunity to upgrade old segments while performing
1398-
// background merges
1399-
mergePolicy = new ElasticsearchMergePolicy(mergePolicy);
1400-
iwc.setMergePolicy(mergePolicy);
1401-
iwc.setSimilarity(engineConfig.getSimilarity());
1402-
iwc.setRAMBufferSizeMB(engineConfig.getIndexingBufferSize().getMbFrac());
1403-
iwc.setCodec(engineConfig.getCodec());
1404-
iwc.setUseCompoundFile(true); // always use compound on flush - reduces # of file-handles on refresh
1405-
return new IndexWriter(store.directory(), iwc);
1376+
final IndexWriterConfig iwc = getIndexWriterConfig(create);
1377+
return createWriter(store.directory(), iwc);
14061378
} catch (LockObtainFailedException ex) {
14071379
logger.warn("could not lock IndexWriter", ex);
14081380
throw ex;
14091381
}
14101382
}
14111383

1384+
IndexWriter createWriter(Directory directory, IndexWriterConfig iwc) throws IOException {
1385+
return new IndexWriter(directory, iwc);
1386+
}
1387+
1388+
private IndexWriterConfig getIndexWriterConfig(boolean create) {
1389+
final IndexWriterConfig iwc = new IndexWriterConfig(engineConfig.getAnalyzer());
1390+
iwc.setCommitOnClose(false); // we by default don't commit on close
1391+
iwc.setOpenMode(create ? IndexWriterConfig.OpenMode.CREATE : IndexWriterConfig.OpenMode.APPEND);
1392+
iwc.setIndexDeletionPolicy(deletionPolicy);
1393+
// with tests.verbose, lucene sets this up: plumb to align with filesystem stream
1394+
boolean verbose = false;
1395+
try {
1396+
verbose = Boolean.parseBoolean(System.getProperty("tests.verbose"));
1397+
} catch (Exception ignore) {
1398+
}
1399+
iwc.setInfoStream(verbose ? InfoStream.getDefault() : new LoggerInfoStream(logger));
1400+
iwc.setMergeScheduler(mergeScheduler);
1401+
MergePolicy mergePolicy = config().getMergePolicy();
1402+
// Give us the opportunity to upgrade old segments while performing
1403+
// background merges
1404+
mergePolicy = new ElasticsearchMergePolicy(mergePolicy);
1405+
iwc.setMergePolicy(mergePolicy);
1406+
iwc.setSimilarity(engineConfig.getSimilarity());
1407+
iwc.setRAMBufferSizeMB(engineConfig.getIndexingBufferSize().getMbFrac());
1408+
iwc.setCodec(engineConfig.getCodec());
1409+
iwc.setUseCompoundFile(true); // always use compound on flush - reduces # of file-handles on refresh
1410+
return iwc;
1411+
}
1412+
14121413
/** Extended SearcherFactory that warms the segments if needed when acquiring a new searcher */
14131414
static final class SearchFactory extends EngineSearcherFactory {
14141415
private final Engine.Warmer warmer;
@@ -1539,7 +1540,6 @@ protected void doRun() throws Exception {
15391540

15401541
@Override
15411542
protected void handleMergeException(final Directory dir, final Throwable exc) {
1542-
logger.error("failed to merge", exc);
15431543
engineConfig.getThreadPool().generic().execute(new AbstractRunnable() {
15441544
@Override
15451545
public void onFailure(Exception e) {
@@ -1548,13 +1548,39 @@ public void onFailure(Exception e) {
15481548

15491549
@Override
15501550
protected void doRun() throws Exception {
1551-
MergePolicy.MergeException e = new MergePolicy.MergeException(exc, dir);
1552-
failEngine("merge failed", e);
1551+
/*
1552+
* We do this on another thread rather than the merge thread that we are initially called on so that we have complete
1553+
* confidence that the call stack does not contain catch statements that would cause the error that might be thrown
1554+
* here from being caught and never reaching the uncaught exception handler.
1555+
*/
1556+
maybeDie("fatal error while merging", exc);
1557+
logger.error("failed to merge", exc);
1558+
failEngine("merge failed", new MergePolicy.MergeException(exc, dir));
15531559
}
15541560
});
15551561
}
15561562
}
15571563

1564+
/**
1565+
* If the specified throwable is a fatal error, this throwable will be thrown. Callers should ensure that there are no catch statements
1566+
* that would catch an error in the stack as the fatal error here should go uncaught and be handled by the uncaught exception handler
1567+
* that we install during bootstrap. If the specified throwable is indeed a fatal error, the specified message will attempt to be logged
1568+
* before throwing the fatal error. If the specified throwable is not a fatal error, this method is a no-op.
1569+
*
1570+
* @param maybeMessage the message to maybe log
1571+
* @param maybeFatal the throwable that is maybe fatal
1572+
*/
1573+
@SuppressWarnings("finally")
1574+
private void maybeDie(final String maybeMessage, final Throwable maybeFatal) {
1575+
if (maybeFatal instanceof Error) {
1576+
try {
1577+
logger.error(maybeMessage, maybeFatal);
1578+
} finally {
1579+
throw (Error) maybeFatal;
1580+
}
1581+
}
1582+
}
1583+
15581584
private void commitIndexWriter(IndexWriter writer, Translog translog, String syncId) throws IOException {
15591585
ensureCanFlush();
15601586
try {

core/src/test/java/org/elasticsearch/bootstrap/ElasticsearchUncaughtExceptionHandlerTests.java

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919

2020
package org.elasticsearch.bootstrap;
2121

22-
import org.apache.lucene.index.MergePolicy;
2322
import org.elasticsearch.test.ESTestCase;
2423
import org.junit.Before;
2524

@@ -131,7 +130,6 @@ void onNonFatalUncaught(String threadName, Throwable t) {
131130
}
132131

133132
public void testIsFatalCause() {
134-
assertFatal(new MergePolicy.MergeException(new OutOfMemoryError(), null));
135133
assertFatal(new OutOfMemoryError());
136134
assertFatal(new StackOverflowError());
137135
assertFatal(new InternalError());

0 commit comments

Comments
 (0)