Commit 8066e23

Use MultiFileTransfer in CCR remote recovery (#44514)
Relates #44468
1 parent 458de91 commit 8066e23

File tree: 4 files changed (+106 -93 lines)
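This commit replaces the hand-rolled chunk pipeline in CcrRepository (a LocalCheckpointTracker for throttling plus an AtomicReference for error propagation) with the MultiFileTransfer abstraction that peer recovery already uses. To make that possible, MultiFileTransfer and its ChunkRequest interface become public, and the sendChunkRequest hook is renamed to executeChunkRequest. MultiFileWriter additionally becomes ref-counted so concurrent chunk writes can complete safely around close().

For orientation, the sketch below shows the rough shape of a MultiFileTransfer consumer after this change. It is illustrative only: ExampleFileSender and SimpleChunk are hypothetical names, and the constructor arguments simply mirror the ones CcrRepository passes in the diff further down.

import java.io.IOException;
import java.util.List;

import org.apache.logging.log4j.Logger;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.common.util.concurrent.ThreadContext;
import org.elasticsearch.index.store.StoreFileMetaData;
import org.elasticsearch.indices.recovery.MultiFileTransfer;

// Hypothetical consumer of the now-public MultiFileTransfer; the base class drives up to
// maxConcurrentFileChunks outstanding chunk requests and calls back into these hooks.
public class ExampleFileSender extends MultiFileTransfer<ExampleFileSender.SimpleChunk> {

    ExampleFileSender(Logger logger, ThreadContext threadContext, ActionListener<Void> listener,
                      int maxConcurrentFileChunks, List<StoreFileMetaData> files) {
        super(logger, threadContext, listener, maxConcurrentFileChunks, files);
    }

    @Override
    protected void onNewFile(StoreFileMetaData md) {
        // reset per-file state (e.g. a read offset) before the first chunk of a new file
    }

    @Override
    protected SimpleChunk nextChunkRequest(StoreFileMetaData md) throws IOException {
        // describe the next chunk of the current file; this toy example sends one chunk per file
        return new SimpleChunk(md, true);
    }

    @Override
    protected void executeChunkRequest(SimpleChunk request, ActionListener<Void> listener) {
        // a real implementation performs the network call here and completes the
        // listener once the chunk has been sent or written
        listener.onResponse(null);
    }

    @Override
    protected void handleError(StoreFileMetaData md, Exception e) throws Exception {
        throw e; // surface the failure; implementations may mark the store corrupted first
    }

    @Override
    public void close() {
        // release per-transfer resources, e.g. a MultiFileWriter
    }

    static class SimpleChunk implements MultiFileTransfer.ChunkRequest {
        final StoreFileMetaData md;
        final boolean last;

        SimpleChunk(StoreFileMetaData md, boolean last) {
            this.md = md;
            this.last = last;
        }

        @Override
        public boolean lastChunk() {
            return last;
        }
    }
}

A caller constructs the sender and calls start(); the listener passed to the constructor is completed once every file has been transferred or a failure has been handled.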

server/src/main/java/org/elasticsearch/indices/recovery/MultiFileTransfer.java (+4 -4)

@@ -57,7 +57,7 @@
  * one of the networking threads which receive/handle the responses of the current pending file chunk requests. This process will continue
  * until all chunk requests are sent/responded.
  */
-abstract class MultiFileTransfer<Request extends MultiFileTransfer.ChunkRequest> implements Closeable {
+public abstract class MultiFileTransfer<Request extends MultiFileTransfer.ChunkRequest> implements Closeable {
     private Status status = Status.PROCESSING;
     private final Logger logger;
     private final ActionListener<Void> listener;
@@ -121,7 +121,7 @@ private void handleItems(List<Tuple<FileChunkResponseItem, Consumer<Exception>>>
                 return;
             }
             final long requestSeqId = requestSeqIdTracker.generateSeqNo();
-            sendChunkRequest(request.v2(), ActionListener.wrap(
+            executeChunkRequest(request.v2(), ActionListener.wrap(
                 r -> addItem(requestSeqId, request.v1(), null),
                 e -> addItem(requestSeqId, request.v1(), e)));
         }
@@ -179,7 +179,7 @@ private Tuple<StoreFileMetaData, Request> getNextRequest() throws Exception {

     protected abstract Request nextChunkRequest(StoreFileMetaData md) throws IOException;

-    protected abstract void sendChunkRequest(Request request, ActionListener<Void> listener);
+    protected abstract void executeChunkRequest(Request request, ActionListener<Void> listener);

     protected abstract void handleError(StoreFileMetaData md, Exception e) throws Exception;

@@ -195,7 +195,7 @@ private static class FileChunkResponseItem {
         }
     }

-    protected interface ChunkRequest {
+    public interface ChunkRequest {
        /**
         * @return {@code true} if this chunk request is the last chunk of the current file
         */

server/src/main/java/org/elasticsearch/indices/recovery/MultiFileWriter.java (+14 -1)

@@ -27,9 +27,11 @@
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.bytes.BytesReference;
 import org.elasticsearch.common.lease.Releasable;
+import org.elasticsearch.common.util.concurrent.AbstractRefCounted;
 import org.elasticsearch.common.util.concurrent.ConcurrentCollections;
 import org.elasticsearch.index.store.Store;
 import org.elasticsearch.index.store.StoreFileMetaData;
+import org.elasticsearch.transport.Transports;

 import java.io.IOException;
 import java.util.Arrays;
@@ -39,10 +41,12 @@
 import java.util.Map;
 import java.util.PriorityQueue;
 import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.atomic.AtomicBoolean;

-public class MultiFileWriter implements Releasable {
+public class MultiFileWriter extends AbstractRefCounted implements Releasable {

     public MultiFileWriter(Store store, RecoveryState.Index indexState, String tempFilePrefix, Logger logger, Runnable ensureOpen) {
+        super("multi_file_writer");
         this.store = store;
         this.indexState = indexState;
         this.tempFilePrefix = tempFilePrefix;
@@ -51,6 +55,7 @@ public MultiFileWriter(Store store, RecoveryState.Index indexState, String tempF
     }

     private final Runnable ensureOpen;
+    private final AtomicBoolean closed = new AtomicBoolean(false);
     private final Logger logger;
     private final Store store;
     private final RecoveryState.Index indexState;
@@ -64,6 +69,7 @@ public MultiFileWriter(Store store, RecoveryState.Index indexState, String tempF

     public void writeFileChunk(StoreFileMetaData fileMetaData, long position, BytesReference content, boolean lastChunk)
         throws IOException {
+        assert Transports.assertNotTransportThread("multi_file_writer");
         final FileChunkWriter writer = fileChunkWriters.computeIfAbsent(fileMetaData.name(), name -> new FileChunkWriter());
         writer.writeChunk(new FileChunk(fileMetaData, content, position, lastChunk));
     }
@@ -138,6 +144,13 @@ private void innerWriteFileChunk(StoreFileMetaData fileMetaData, long position,

     @Override
     public void close() {
+        if (closed.compareAndSet(false, true)) {
+            decRef();
+        }
+    }
+
+    @Override
+    protected void closeInternal() {
         fileChunkWriters.clear();
         // clean open index outputs
         Iterator<Map.Entry<String, IndexOutput>> iterator = openIndexOutputs.entrySet().iterator();
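With this change MultiFileWriter is ref-counted: close() now only flips the closed flag and drops the initial reference, while the old cleanup body moves to closeInternal(), which AbstractRefCounted invokes once the last reference is released. Writers running on other threads can therefore pin the instance around a chunk write, as CcrRepository does below. A minimal illustration of that discipline (the helper class and method are hypothetical, not part of the commit):

import java.io.IOException;

import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.lease.Releasable;
import org.elasticsearch.index.store.StoreFileMetaData;
import org.elasticsearch.indices.recovery.MultiFileWriter;

// Hypothetical helper showing the incRef/decRef pattern the ref-counted writer enables.
final class RefCountedWriteExample {

    static void writeChunkSafely(MultiFileWriter writer, StoreFileMetaData md, long position,
                                 BytesReference content, boolean lastChunk) throws IOException {
        writer.incRef();                            // pin the writer for the duration of this write
        try (Releasable ignored = writer::decRef) { // reference released even if the write throws
            writer.writeFileChunk(md, position, content, lastChunk);
        }
        // close() elsewhere marks the writer closed and drops the initial reference;
        // closeInternal() cleans up temp files and index outputs only after the last decRef().
    }
}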

server/src/main/java/org/elasticsearch/indices/recovery/RecoverySourceHandler.java (+1 -1)

@@ -890,7 +890,7 @@ protected FileChunk nextChunkRequest(StoreFileMetaData md) throws IOException {
         }

         @Override
-        protected void sendChunkRequest(FileChunk request, ActionListener<Void> listener) {
+        protected void executeChunkRequest(FileChunk request, ActionListener<Void> listener) {
             cancellableThreads.checkForCancel();
             recoveryTarget.writeFileChunk(
                 request.md, request.position, request.content, request.lastChunk, translogOps.getAsInt(), listener);

x-pack/plugin/ccr/src/main/java/org/elasticsearch/xpack/ccr/repository/CcrRepository.java (+87 -87)

@@ -12,7 +12,6 @@
 import org.apache.logging.log4j.Logger;
 import org.apache.logging.log4j.message.ParameterizedMessage;
 import org.apache.lucene.index.IndexCommit;
-import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.ElasticsearchSecurityException;
 import org.elasticsearch.ExceptionsHelper;
 import org.elasticsearch.action.ActionListener;
@@ -21,6 +20,7 @@
 import org.elasticsearch.action.admin.indices.mapping.put.PutMappingRequest;
 import org.elasticsearch.action.support.ListenerTimeouts;
 import org.elasticsearch.action.support.PlainActionFuture;
+import org.elasticsearch.action.support.ThreadedActionListener;
 import org.elasticsearch.client.Client;
 import org.elasticsearch.cluster.ClusterName;
 import org.elasticsearch.cluster.metadata.IndexMetaData;
@@ -31,18 +31,16 @@
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.UUIDs;
 import org.elasticsearch.common.collect.ImmutableOpenMap;
-import org.elasticsearch.common.collect.Tuple;
 import org.elasticsearch.common.component.AbstractLifecycleComponent;
+import org.elasticsearch.common.lease.Releasable;
 import org.elasticsearch.common.metrics.CounterMetric;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.unit.ByteSizeValue;
 import org.elasticsearch.common.unit.TimeValue;
-import org.elasticsearch.common.util.concurrent.AbstractRunnable;
 import org.elasticsearch.common.util.concurrent.ThreadContext;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.index.engine.EngineException;
 import org.elasticsearch.index.mapper.MapperService;
-import org.elasticsearch.index.seqno.LocalCheckpointTracker;
 import org.elasticsearch.index.seqno.RetentionLeaseAlreadyExistsException;
 import org.elasticsearch.index.seqno.RetentionLeaseInvalidRetainingSeqNoException;
 import org.elasticsearch.index.seqno.RetentionLeaseNotFoundException;
@@ -54,6 +52,7 @@
 import org.elasticsearch.index.snapshots.blobstore.SnapshotFiles;
 import org.elasticsearch.index.store.Store;
 import org.elasticsearch.index.store.StoreFileMetaData;
+import org.elasticsearch.indices.recovery.MultiFileTransfer;
 import org.elasticsearch.indices.recovery.MultiFileWriter;
 import org.elasticsearch.indices.recovery.RecoveryState;
 import org.elasticsearch.repositories.IndexId;
@@ -87,12 +86,11 @@
 import java.util.Map;
 import java.util.Optional;
 import java.util.Set;
-import java.util.concurrent.atomic.AtomicReference;
 import java.util.function.LongConsumer;
 import java.util.function.Supplier;
+import java.util.stream.Collectors;

 import static org.elasticsearch.index.seqno.RetentionLeaseActions.RETAIN_ALL;
-import static org.elasticsearch.index.seqno.SequenceNumbers.NO_OPS_PERFORMED;
 import static org.elasticsearch.xpack.ccr.CcrRetentionLeases.retentionLeaseId;
 import static org.elasticsearch.xpack.ccr.CcrRetentionLeases.syncAddRetentionLease;
 import static org.elasticsearch.xpack.ccr.CcrRetentionLeases.syncRenewRetentionLease;
@@ -473,97 +471,82 @@ void restoreFiles(Store store) {
         }

         @Override
-        protected void restoreFiles(List<FileInfo> filesToRecover, Store store) throws IOException {
+        protected void restoreFiles(List<FileInfo> filesToRecover, Store store) {
             logger.trace("[{}] starting CCR restore of {} files", shardId, filesToRecover);
+            final PlainActionFuture<Void> restoreFilesFuture = new PlainActionFuture<>();
+            final List<StoreFileMetaData> mds = filesToRecover.stream().map(FileInfo::metadata).collect(Collectors.toList());
+            final MultiFileTransfer<FileChunk> multiFileTransfer = new MultiFileTransfer<>(
+                logger, threadPool.getThreadContext(), restoreFilesFuture, ccrSettings.getMaxConcurrentFileChunks(), mds) {

-            try (MultiFileWriter multiFileWriter = new MultiFileWriter(store, recoveryState.getIndex(), "", logger, () -> {
-            })) {
-                final LocalCheckpointTracker requestSeqIdTracker = new LocalCheckpointTracker(NO_OPS_PERFORMED, NO_OPS_PERFORMED);
-                final AtomicReference<Tuple<StoreFileMetaData, Exception>> error = new AtomicReference<>();
+                final MultiFileWriter multiFileWriter = new MultiFileWriter(store, recoveryState.getIndex(), "", logger, () -> {});
+                long offset = 0;

-                for (FileInfo fileInfo : filesToRecover) {
-                    final long fileLength = fileInfo.length();
-                    long offset = 0;
-                    while (offset < fileLength && error.get() == null) {
-                        final long requestSeqId = requestSeqIdTracker.generateSeqNo();
-                        try {
-                            requestSeqIdTracker.waitForProcessedOpsToComplete(requestSeqId - ccrSettings.getMaxConcurrentFileChunks());
-
-                            if (error.get() != null) {
-                                requestSeqIdTracker.markSeqNoAsProcessed(requestSeqId);
-                                break;
-                            }
-
-                            final int bytesRequested = Math.toIntExact(
-                                Math.min(ccrSettings.getChunkSize().getBytes(), fileLength - offset));
-                            offset += bytesRequested;
-
-                            final GetCcrRestoreFileChunkRequest request =
-                                new GetCcrRestoreFileChunkRequest(node, sessionUUID, fileInfo.name(), bytesRequested);
-                            logger.trace("[{}] [{}] fetching chunk for file [{}], expected offset: {}, size: {}", shardId, snapshotId,
-                                fileInfo.name(), offset, bytesRequested);
-
-                            TimeValue timeout = ccrSettings.getRecoveryActionTimeout();
-                            ActionListener<GetCcrRestoreFileChunkAction.GetCcrRestoreFileChunkResponse> listener =
-                                ListenerTimeouts.wrapWithTimeout(threadPool, ActionListener.wrap(
-                                    r -> threadPool.generic().execute(new AbstractRunnable() {
-                                        @Override
-                                        public void onFailure(Exception e) {
-                                            error.compareAndSet(null, Tuple.tuple(fileInfo.metadata(), e));
-                                            requestSeqIdTracker.markSeqNoAsProcessed(requestSeqId);
-                                        }
-
-                                        @Override
-                                        protected void doRun() throws Exception {
-                                            final int actualChunkSize = r.getChunk().length();
-                                            logger.trace("[{}] [{}] got response for file [{}], offset: {}, length: {}", shardId,
-                                                snapshotId, fileInfo.name(), r.getOffset(), actualChunkSize);
-                                            final long nanosPaused = ccrSettings.getRateLimiter().maybePause(actualChunkSize);
-                                            throttleListener.accept(nanosPaused);
-                                            final boolean lastChunk = r.getOffset() + actualChunkSize >= fileLength;
-                                            multiFileWriter.writeFileChunk(fileInfo.metadata(), r.getOffset(), r.getChunk(), lastChunk);
-                                            requestSeqIdTracker.markSeqNoAsProcessed(requestSeqId);
-                                        }
-                                    }),
-                                    e -> {
-                                        error.compareAndSet(null, Tuple.tuple(fileInfo.metadata(), e));
-                                        requestSeqIdTracker.markSeqNoAsProcessed(requestSeqId);
-                                    }
-                                ), timeout, ThreadPool.Names.GENERIC, GetCcrRestoreFileChunkAction.NAME);
-                            remoteClient.execute(GetCcrRestoreFileChunkAction.INSTANCE, request, listener);
-                        } catch (Exception e) {
-                            error.compareAndSet(null, Tuple.tuple(fileInfo.metadata(), e));
-                            requestSeqIdTracker.markSeqNoAsProcessed(requestSeqId);
-                        }
-                    }
+                @Override
+                protected void onNewFile(StoreFileMetaData md) {
+                    offset = 0;
                 }

-                try {
-                    requestSeqIdTracker.waitForProcessedOpsToComplete(requestSeqIdTracker.getMaxSeqNo());
-                } catch (InterruptedException e) {
-                    Thread.currentThread().interrupt();
-                    throw new ElasticsearchException(e);
+                @Override
+                protected FileChunk nextChunkRequest(StoreFileMetaData md) {
+                    final int bytesRequested = Math.toIntExact(Math.min(ccrSettings.getChunkSize().getBytes(), md.length() - offset));
+                    offset += bytesRequested;
+                    return new FileChunk(md, bytesRequested, offset == md.length());
                 }
-                if (error.get() != null) {
-                    handleError(store, error.get().v2());
+
+                @Override
+                protected void executeChunkRequest(FileChunk request, ActionListener<Void> listener) {
+                    final ActionListener<GetCcrRestoreFileChunkAction.GetCcrRestoreFileChunkResponse> threadedListener
+                        = new ThreadedActionListener<>(logger, threadPool, ThreadPool.Names.GENERIC, ActionListener.wrap(
+                            r -> {
+                                writeFileChunk(request.md, r);
+                                listener.onResponse(null);
+                            }, listener::onFailure), false);
+
+                    remoteClient.execute(GetCcrRestoreFileChunkAction.INSTANCE,
+                        new GetCcrRestoreFileChunkRequest(node, sessionUUID, request.md.name(), request.bytesRequested),
+                        ListenerTimeouts.wrapWithTimeout(threadPool, threadedListener, ccrSettings.getRecoveryActionTimeout(),
+                            ThreadPool.Names.GENERIC, GetCcrRestoreFileChunkAction.NAME));
                 }
-            }

-            logger.trace("[{}] completed CCR restore", shardId);
-        }
+                private void writeFileChunk(StoreFileMetaData md,
+                                            GetCcrRestoreFileChunkAction.GetCcrRestoreFileChunkResponse r) throws Exception {
+                    final int actualChunkSize = r.getChunk().length();
+                    logger.trace("[{}] [{}] got response for file [{}], offset: {}, length: {}",
+                        shardId, snapshotId, md.name(), r.getOffset(), actualChunkSize);
+                    final long nanosPaused = ccrSettings.getRateLimiter().maybePause(actualChunkSize);
+                    throttleListener.accept(nanosPaused);
+                    multiFileWriter.incRef();
+                    try (Releasable ignored = multiFileWriter::decRef) {
+                        final boolean lastChunk = r.getOffset() + actualChunkSize >= md.length();
+                        multiFileWriter.writeFileChunk(md, r.getOffset(), r.getChunk(), lastChunk);
+                    } catch (Exception e) {
+                        handleError(md, e);
+                        throw e;
+                    }
+                }
+
+                @Override
+                protected void handleError(StoreFileMetaData md, Exception e) throws Exception {
+                    final IOException corruptIndexException;
+                    if ((corruptIndexException = ExceptionsHelper.unwrapCorruption(e)) != null) {
+                        try {
+                            store.markStoreCorrupted(corruptIndexException);
+                        } catch (IOException ioe) {
+                            logger.warn("store cannot be marked as corrupted", e);
+                        }
+                        throw corruptIndexException;
+                    }
+                    throw e;
+                }

-        private void handleError(Store store, Exception e) throws IOException {
-            final IOException corruptIndexException;
-            if ((corruptIndexException = ExceptionsHelper.unwrapCorruption(e)) != null) {
-                try {
-                    store.markStoreCorrupted(corruptIndexException);
-                } catch (IOException ioe) {
-                    logger.warn("store cannot be marked as corrupted", e);
+                @Override
+                public void close() {
+                    multiFileWriter.close();
                 }
-                throw corruptIndexException;
-            } else {
-                ExceptionsHelper.reThrowIfNotNull(e);
-            }
+            };
+            multiFileTransfer.start();
+            restoreFilesFuture.actionGet();
+            logger.trace("[{}] completed CCR restore", shardId);
         }

         @Override
@@ -572,5 +555,22 @@ public void close() {
             ClearCcrRestoreSessionAction.ClearCcrRestoreSessionResponse response =
                 remoteClient.execute(ClearCcrRestoreSessionAction.INSTANCE, clearRequest).actionGet(ccrSettings.getRecoveryActionTimeout());
         }
+
+        private static class FileChunk implements MultiFileTransfer.ChunkRequest {
+            final StoreFileMetaData md;
+            final int bytesRequested;
+            final boolean lastChunk;
+
+            FileChunk(StoreFileMetaData md, int bytesRequested, boolean lastChunk) {
+                this.md = md;
+                this.bytesRequested = bytesRequested;
+                this.lastChunk = lastChunk;
+            }
+
+            @Override
+            public boolean lastChunk() {
+                return lastChunk;
+            }
+        }
     }
 }
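Note on the new control flow in restoreFiles: the anonymous MultiFileTransfer subclass is kicked off with multiFileTransfer.start(), and restoreFilesFuture.actionGet() blocks until the transfer completes or fails, so the method keeps its previous synchronous behaviour while chunk fetches and writes run concurrently on generic threads via ThreadedActionListener.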
