@@ -214,7 +214,7 @@ public class IndexShard extends AbstractIndexShardComponent implements IndicesCl
214
214
protected volatile IndexShardState state ;
215
215
private volatile long pendingPrimaryTerm ; // see JavaDocs for getPendingPrimaryTerm
216
216
private final Object engineMutex = new Object ();
217
- private final EngineReference currentEngineReference = new EngineReference ();
217
+ private final AtomicReference < Engine > currentEngineReference = new AtomicReference <> ();
218
218
final EngineFactory engineFactory ;
219
219
220
220
private final IndexingOperationListener indexingOperationListeners ;
@@ -1191,20 +1191,23 @@ public Engine.IndexCommitRef acquireSafeIndexCommit() throws EngineException {
1191
1191
* @throws java.nio.file.NoSuchFileException if one or more files referenced by a commit are not present.
1192
1192
*/
1193
1193
public Store .MetadataSnapshot snapshotStoreMetadata () throws IOException {
1194
+ assert Thread .holdsLock (mutex ) == false : "snapshotting store metadata under mutex" ;
1194
1195
Engine .IndexCommitRef indexCommit = null ;
1195
1196
store .incRef ();
1196
1197
try {
1197
- Engine engine ;
1198
1198
synchronized (engineMutex ) {
1199
1199
// if the engine is not running, we can access the store directly, but we need to make sure no one starts
1200
- // the engine on us. If the engine is running, we can get a snapshot via the deletion policy which is initialized.
1201
- // That can be done out of mutex, since the engine can be closed half way.
1202
- engine = getEngineOrNull ();
1203
- if (engine == null ) {
1200
+ // the engine on us. If the engine is running, we can get a snapshot via the deletion policy of the engine.
1201
+ synchronized (mutex ) {
1202
+ final Engine engine = getEngineOrNull ();
1203
+ if (engine != null ) {
1204
+ indexCommit = engine .acquireLastIndexCommit (false );
1205
+ }
1206
+ }
1207
+ if (indexCommit == null ) {
1204
1208
return store .getMetadata (null , true );
1205
1209
}
1206
1210
}
1207
- indexCommit = engine .acquireLastIndexCommit (false );
1208
1211
return store .getMetadata (indexCommit .getIndexCommit ());
1209
1212
} finally {
1210
1213
store .decRef ();
@@ -1318,14 +1321,15 @@ public void close(String reason, boolean flushEngine) throws IOException {
1318
1321
try {
1319
1322
changeState (IndexShardState .CLOSED , reason );
1320
1323
} finally {
1324
+ final Engine engine = this .currentEngineReference .getAndSet (null );
1321
1325
try {
1322
- if (flushEngine ) {
1323
- currentEngineReference .flushAndClose ();
1326
+ if (engine != null && flushEngine ) {
1327
+ engine .flushAndClose ();
1324
1328
}
1325
1329
} finally {
1326
1330
// playing safe here and close the engine even if the above succeeds - close can be called multiple times
1327
1331
// Also closing refreshListeners to prevent us from accumulating any more listeners
1328
- IOUtils .close (currentEngineReference , globalCheckpointListeners , refreshListeners );
1332
+ IOUtils .close (engine , globalCheckpointListeners , refreshListeners );
1329
1333
indexShardOperationPermits .close ();
1330
1334
}
1331
1335
}
@@ -1347,7 +1351,7 @@ public IndexShard postRecovery(String reason)
1347
1351
// we need to refresh again to expose all operations that were indexed until now. Otherwise
1348
1352
// we may not expose operations that were indexed with a refresh listener that was immediately
1349
1353
// responded to in addRefreshListener.
1350
- refresh ("post_recovery" );
1354
+ getEngine (). refresh ("post_recovery" );
1351
1355
return this ;
1352
1356
}
1353
1357
@@ -1420,7 +1424,9 @@ public long recoverLocallyUpToGlobalCheckpoint() {
1420
1424
getEngine ().recoverFromTranslog (translogRecoveryRunner , globalCheckpoint );
1421
1425
logger .trace ("shard locally recovered up to {}" , getEngine ().getSeqNoStats (globalCheckpoint ));
1422
1426
} finally {
1423
- currentEngineReference .swapReference (null );
1427
+ synchronized (mutex ) {
1428
+ IOUtils .close (currentEngineReference .getAndSet (null ));
1429
+ }
1424
1430
}
1425
1431
} catch (Exception e ) {
1426
1432
logger .debug (new ParameterizedMessage ("failed to recover shard locally up to global checkpoint {}" , globalCheckpoint ), e );
@@ -1579,7 +1585,7 @@ public void openEngineAndSkipTranslogRecovery() throws IOException {
1579
1585
}
1580
1586
1581
1587
private void innerOpenEngineAndTranslog (LongSupplier globalCheckpointSupplier ) throws IOException {
1582
- assert Thread .holdsLock (mutex ) == false : "opening engine under mutex [" + Thread . currentThread () + "] " ;
1588
+ assert Thread .holdsLock (mutex ) == false : "opening engine under mutex" ;
1583
1589
if (state != IndexShardState .RECOVERING ) {
1584
1590
throw new IndexShardNotRecoveringException (shardId , state );
1585
1591
}
@@ -1593,19 +1599,25 @@ private void innerOpenEngineAndTranslog(LongSupplier globalCheckpointSupplier) t
1593
1599
: "expected empty set of retention leases with recovery source [" + recoveryState .getRecoverySource ()
1594
1600
+ "] but got " + getRetentionLeases ();
1595
1601
synchronized (engineMutex ) {
1596
- assert currentEngineReference .get () == null : "engine is running" ;
1597
1602
// we must create a new engine under mutex (see IndexShard#snapshotStoreMetadata).
1598
- Engine newEngine = engineFactory .newReadWriteEngine (config );
1599
- onNewEngine ( newEngine ) ;
1603
+ final Engine newEngine = engineFactory .newReadWriteEngine (config );
1604
+ boolean success = false ;
1600
1605
try {
1601
- currentEngineReference .swapReference (newEngine );
1602
- newEngine = null ;
1606
+ synchronized (mutex ) {
1607
+ verifyNotClosed ();
1608
+ assert currentEngineReference .get () == null : "engine is running" ;
1609
+ onNewEngine (newEngine );
1610
+ currentEngineReference .set (newEngine );
1611
+ // We set active because we are now writing operations to the engine; this way,
1612
+ // if we go idle after some time and become inactive, we still give sync'd flush a chance to run.
1613
+ active .set (true );
1614
+ success = true ;
1615
+ }
1603
1616
} finally {
1604
- IOUtils .close (newEngine );
1617
+ if (success == false ) {
1618
+ newEngine .close ();
1619
+ }
1605
1620
}
1606
- // We set active because we are now writing operations to the engine; this way,
1607
- // if we go idle after some time and become inactive, we still give sync'd flush a chance to run.
1608
- active .set (true );
1609
1621
}
1610
1622
// time elapses after the engine is created above (pulling the config settings) until we set the engine reference, during
1611
1623
// which settings changes could possibly have happened, so here we forcefully push any config changes to the new engine.
@@ -1628,6 +1640,7 @@ private boolean assertSequenceNumbersInCommit() throws IOException {
1628
1640
}
1629
1641
1630
1642
/**
 * Hooks the refresh listeners up to the given engine: its {@code getTranslogLastWriteLocation}
 * becomes their current refresh location supplier.
 * The calling thread must hold {@code engineMutex}; this is asserted on entry.
 *
 * @param newEngine engine whose translog last-write location the refresh listeners will read
 */
private void onNewEngine (Engine newEngine ) {
1643
+ assert Thread .holdsLock (engineMutex );
1631
1644
refreshListeners .setCurrentRefreshLocationSupplier (newEngine ::getTranslogLastWriteLocation );
1632
1645
}
1633
1646
@@ -1637,7 +1650,7 @@ private void onNewEngine(Engine newEngine) {
1637
1650
public void performRecoveryRestart () throws IOException {
// Restarts recovery: while holding mutex, drops and closes the current engine, then resets the recovery stage.
1638
1651
synchronized (mutex ) {
1639
1652
// No refresh listeners may be pending at this point (assertion only).
assert refreshListeners .pendingCount () == 0 : "we can't restart with pending listeners" ;
1640
- currentEngineReference .swapReference (null );
1653
// Clear the engine reference and close whatever engine it held, in a single getAndSet step.
// NOTE(review): getAndSet returns null when no engine was installed - presumably IOUtils.close tolerates null; confirm.
+ IOUtils . close ( currentEngineReference .getAndSet (null ) );
1641
1654
resetRecoveryStage ();
1642
1655
}
1643
1656
}
@@ -2671,8 +2684,10 @@ private DocumentMapperForType docMapper(String type) {
2671
2684
private EngineConfig newEngineConfig (LongSupplier globalCheckpointSupplier ) {
2672
2685
final Sort indexSort = indexSortSupplier .get ();
2673
2686
final Engine .Warmer warmer = reader -> {
2674
- assert Thread .holdsLock (mutex ) == false : "warming engine under mutex [" + Thread .currentThread () + "]" ;
2675
- this .warmer .warm (reader );
2687
+ assert Thread .holdsLock (mutex ) == false : "warming engine under mutex" ;
2688
+ if (this .warmer != null ) {
2689
+ this .warmer .warm (reader );
2690
+ }
2676
2691
};
2677
2692
return new EngineConfig (shardId , shardRouting .allocationId ().getId (),
2678
2693
threadPool , indexSettings , warmer , store , indexSettings .getMergePolicy (),
@@ -3303,7 +3318,7 @@ public ParsedDocument newNoopTombstoneDoc(String reason) {
3303
3318
* Rollback the current engine to the safe commit, then replay local translog up to the global checkpoint.
3304
3319
*/
3305
3320
void resetEngineToGlobalCheckpoint () throws IOException {
3306
- assert Thread .holdsLock (mutex ) == false : "resetting engine under mutex [" + Thread . currentThread () + "] " ;
3321
+ assert Thread .holdsLock (engineMutex ) == false : "resetting engine under mutex" ;
3307
3322
assert getActiveOperationsCount () == OPERATIONS_BLOCKED
3308
3323
: "resetting engine without blocking operations; active operations are [" + getActiveOperations () + ']' ;
3309
3324
sync (); // persist the global checkpoint to disk
@@ -3318,41 +3333,46 @@ assert getActiveOperationsCount() == OPERATIONS_BLOCKED
3318
3333
synchronized (engineMutex ) {
3319
3334
// we must create both new read-only engine and new read-write engine under engineMutex to ensure snapshotStoreMetadata,
3320
3335
// acquireXXXCommit and close works.
3321
- Engine readOnlyEngine =
3336
+ final Engine readOnlyEngine =
3322
3337
new ReadOnlyEngine (newEngineConfig (replicationTracker ), seqNoStats , translogStats , false , Function .identity ()) {
3323
3338
// Forwards to the engine held in newEngineReference while holding mutex; the flushFirst argument
// is not used and false is always passed to the delegate.
// NOTE(review): newEngineReference is set by the enclosing code while holding engineMutex, after the
// synchronized (mutex) block that installs this read-only engine has exited. Confirm that
// newEngineReference.get() cannot return null when this method runs under mutex only.
@ Override
3324
3339
public IndexCommitRef acquireLastIndexCommit (boolean flushFirst ) {
3325
- synchronized (engineMutex ) {
3340
+ synchronized (mutex ) {
3326
3341
// ignore flushFirst since we flushed above and we do not want to interfere with ongoing translog replay
3327
3342
return newEngineReference .get ().acquireLastIndexCommit (false );
3328
3343
}
3329
3344
}
3330
3345
3331
3346
// Forwards to the engine held in newEngineReference while holding mutex.
// NOTE(review): newEngineReference is set by the enclosing code while holding engineMutex, after the
// synchronized (mutex) block that installs this read-only engine has exited. Confirm that
// newEngineReference.get() cannot return null when this method runs under mutex only.
@ Override
3332
3347
public IndexCommitRef acquireSafeIndexCommit () {
3333
- synchronized (engineMutex ) {
3348
+ synchronized (mutex ) {
3334
3349
return newEngineReference .get ().acquireSafeIndexCommit ();
3335
3350
}
3336
3351
}
3337
3352
3338
3353
// Closes this read-only engine via super.close() and, unless the engine held in newEngineReference
// is the one currently installed in currentEngineReference, passes that engine to IOUtils.close as well.
@ Override
3339
3354
public void close () throws IOException {
3340
- Engine newEngine ;
3341
- synchronized (engineMutex ) {
3342
- newEngine = newEngineReference .get ();
3343
- if (newEngine == currentEngineReference .get ()) {
3344
- // we successfully installed the new engine so do not close it.
3345
- newEngine = null ;
3346
- }
3355
// NOTE(review): the visible failure path in the enclosing method calls readOnlyEngine.close() from a
// finally block that is outside synchronized (mutex). Unless the caller already holds mutex, the
// assertion below would fail on that path - confirm and either take mutex there or relax the assert.
+ assert Thread .holdsLock (mutex );
3356
+
3357
+ Engine newEngine = newEngineReference .get ();
3358
+ if (newEngine == currentEngineReference .get ()) {
3359
+ // we successfully installed the new engine so do not close it.
3360
+ newEngine = null ;
3347
3361
}
3348
3362
// NOTE(review): newEngine may be null here (set to null just above) - presumably IOUtils.close skips nulls; confirm.
IOUtils .close (super ::close , newEngine );
3349
3363
}
3350
3364
};
3365
+ boolean success = false ;
3351
3366
try {
3352
- currentEngineReference .swapReference (readOnlyEngine );
3353
- readOnlyEngine = null ;
3367
+ synchronized (mutex ) {
3368
+ verifyNotClosed ();
3369
+ IOUtils .close (currentEngineReference .getAndSet (readOnlyEngine ));
3370
+ success = true ;
3371
+ }
3354
3372
} finally {
3355
- IOUtils .close (readOnlyEngine );
3373
+ if (success == false ) {
3374
+ readOnlyEngine .close ();
3375
+ }
3356
3376
}
3357
3377
newEngineReference .set (engineFactory .newReadWriteEngine (newEngineConfig (replicationTracker )));
3358
3378
onNewEngine (newEngineReference .get ());
@@ -3362,14 +3382,9 @@ public void close() throws IOException {
3362
3382
// TODO: add dedicated recovery stats for the reset translog
3363
3383
});
3364
3384
newEngineReference .get ().recoverFromTranslog (translogRunner , globalCheckpoint );
3365
- synchronized (engineMutex ) {
3366
- Engine newEngine = newEngineReference .get ();
3367
- try {
3368
- currentEngineReference .swapReference (newEngine );
3369
- newEngine = null ;
3370
- } finally {
3371
- IOUtils .close (newEngine );
3372
- }
3385
+ synchronized (mutex ) {
3386
+ verifyNotClosed ();
3387
+ IOUtils .close (currentEngineReference .getAndSet (newEngineReference .get ()));
3373
3388
// We set active because we are now writing operations to the engine; this way,
3374
3389
// if we go idle after some time and become inactive, we still give sync'd flush a chance to run.
3375
3390
active .set (true );
0 commit comments