Commit d0a4bad

Use MultiFileTransfer in CCR remote recovery (#44514)
Relates #44468
1 parent: 547e399

File tree: 4 files changed (+106, -93 lines)


server/src/main/java/org/elasticsearch/indices/recovery/MultiFileTransfer.java (+4, -4)

@@ -57,7 +57,7 @@
  * one of the networking threads which receive/handle the responses of the current pending file chunk requests. This process will continue
  * until all chunk requests are sent/responded.
  */
-abstract class MultiFileTransfer<Request extends MultiFileTransfer.ChunkRequest> implements Closeable {
+public abstract class MultiFileTransfer<Request extends MultiFileTransfer.ChunkRequest> implements Closeable {
     private Status status = Status.PROCESSING;
     private final Logger logger;
     private final ActionListener<Void> listener;
@@ -121,7 +121,7 @@ private void handleItems(List<Tuple<FileChunkResponseItem, Consumer<Exception>>>
                 return;
             }
             final long requestSeqId = requestSeqIdTracker.generateSeqNo();
-            sendChunkRequest(request.v2(), ActionListener.wrap(
+            executeChunkRequest(request.v2(), ActionListener.wrap(
                 r -> addItem(requestSeqId, request.v1(), null),
                 e -> addItem(requestSeqId, request.v1(), e)));
         }
@@ -179,7 +179,7 @@ private Tuple<StoreFileMetaData, Request> getNextRequest() throws Exception {

     protected abstract Request nextChunkRequest(StoreFileMetaData md) throws IOException;

-    protected abstract void sendChunkRequest(Request request, ActionListener<Void> listener);
+    protected abstract void executeChunkRequest(Request request, ActionListener<Void> listener);

     protected abstract void handleError(StoreFileMetaData md, Exception e) throws Exception;

@@ -195,7 +195,7 @@ private static class FileChunkResponseItem {
         }
     }

-    protected interface ChunkRequest {
+    public interface ChunkRequest {
         /**
          * @return {@code true} if this chunk request is the last chunk of the current file
         */
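
Because MultiFileTransfer and ChunkRequest are now public and the hook is renamed to executeChunkRequest, code outside the recovery package can reuse this transfer loop, which is what the CcrRepository change below does. A minimal hypothetical sketch of such a subclass follows; it assumes the constructor shape used by CcrRepository later in this commit and that the hooks visible in this diff (onNewFile, nextChunkRequest, executeChunkRequest, handleError, close) are the ones to implement. The class name, chunk size, and chunking logic are illustrative only and not part of the commit.

// Hypothetical example, not part of the commit: a subclass built only from the hooks visible above.
import org.apache.logging.log4j.Logger;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.common.util.concurrent.ThreadContext;
import org.elasticsearch.index.store.StoreFileMetaData;
import org.elasticsearch.indices.recovery.MultiFileTransfer;

import java.io.IOException;
import java.util.List;

class ExampleFileTransfer extends MultiFileTransfer<ExampleFileTransfer.ExampleChunk> {

    private long offset; // read position within the file currently being transferred

    ExampleFileTransfer(Logger logger, ThreadContext threadContext, ActionListener<Void> listener,
                        int maxConcurrentChunks, List<StoreFileMetaData> files) {
        super(logger, threadContext, listener, maxConcurrentChunks, files); // assumed constructor shape
    }

    @Override
    protected void onNewFile(StoreFileMetaData md) {
        offset = 0; // reset per-file state before the first chunk of this file is requested
    }

    @Override
    protected ExampleChunk nextChunkRequest(StoreFileMetaData md) throws IOException {
        final int length = Math.toIntExact(Math.min(1 << 20, md.length() - offset)); // 1 MiB chunks, illustrative
        offset += length;
        return new ExampleChunk(md, length, offset == md.length());
    }

    @Override
    protected void executeChunkRequest(ExampleChunk request, ActionListener<Void> listener) {
        // renamed from sendChunkRequest: transfer this chunk, then complete the listener
        listener.onResponse(null);
    }

    @Override
    protected void handleError(StoreFileMetaData md, Exception e) throws Exception {
        throw e; // surface the failure for the file that was being transferred
    }

    @Override
    public void close() {
        // release any per-transfer resources
    }

    static final class ExampleChunk implements MultiFileTransfer.ChunkRequest {
        final StoreFileMetaData md;
        final int length;
        final boolean lastChunk;

        ExampleChunk(StoreFileMetaData md, int length, boolean lastChunk) {
            this.md = md;
            this.length = length;
            this.lastChunk = lastChunk;
        }

        @Override
        public boolean lastChunk() {
            return lastChunk; // true for the final chunk of the current file
        }
    }
}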

server/src/main/java/org/elasticsearch/indices/recovery/MultiFileWriter.java (+14, -1)

@@ -27,9 +27,11 @@
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.bytes.BytesReference;
 import org.elasticsearch.common.lease.Releasable;
+import org.elasticsearch.common.util.concurrent.AbstractRefCounted;
 import org.elasticsearch.common.util.concurrent.ConcurrentCollections;
 import org.elasticsearch.index.store.Store;
 import org.elasticsearch.index.store.StoreFileMetaData;
+import org.elasticsearch.transport.Transports;

 import java.io.IOException;
 import java.util.Arrays;
@@ -39,10 +41,12 @@
 import java.util.Map;
 import java.util.PriorityQueue;
 import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.atomic.AtomicBoolean;

-public class MultiFileWriter implements Releasable {
+public class MultiFileWriter extends AbstractRefCounted implements Releasable {

     public MultiFileWriter(Store store, RecoveryState.Index indexState, String tempFilePrefix, Logger logger, Runnable ensureOpen) {
+        super("multi_file_writer");
         this.store = store;
         this.indexState = indexState;
         this.tempFilePrefix = tempFilePrefix;
@@ -51,6 +55,7 @@ public MultiFileWriter(Store store, RecoveryState.Index indexState, String tempF
     }

     private final Runnable ensureOpen;
+    private final AtomicBoolean closed = new AtomicBoolean(false);
     private final Logger logger;
     private final Store store;
     private final RecoveryState.Index indexState;
@@ -64,6 +69,7 @@ public MultiFileWriter(Store store, RecoveryState.Index indexState, String tempF

     public void writeFileChunk(StoreFileMetaData fileMetaData, long position, BytesReference content, boolean lastChunk)
         throws IOException {
+        assert Transports.assertNotTransportThread("multi_file_writer");
         final FileChunkWriter writer = fileChunkWriters.computeIfAbsent(fileMetaData.name(), name -> new FileChunkWriter());
         writer.writeChunk(new FileChunk(fileMetaData, content, position, lastChunk));
     }
@@ -138,6 +144,13 @@ private void innerWriteFileChunk(StoreFileMetaData fileMetaData, long position,

     @Override
     public void close() {
+        if (closed.compareAndSet(false, true)) {
+            decRef();
+        }
+    }
+
+    @Override
+    protected void closeInternal() {
         fileChunkWriters.clear();
         // clean open index outputs
         Iterator<Map.Entry<String, IndexOutput>> iterator = openIndexOutputs.entrySet().iterator();
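
With MultiFileWriter now extending AbstractRefCounted, close() merely releases the initial reference (the closed flag makes it idempotent) and the real cleanup moves to closeInternal(), which runs only when the last reference is released. The point of this, used by the CCR restore later in this commit, is that a chunk writer can hold its own reference for the duration of a write so a concurrent close() cannot tear down the temporary files mid-write. A small illustrative helper (not part of the commit) showing that call sequence:

// Illustrative only: guard an in-flight chunk write against a concurrent close().
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.lease.Releasable;
import org.elasticsearch.index.store.StoreFileMetaData;
import org.elasticsearch.indices.recovery.MultiFileWriter;

import java.io.IOException;

final class GuardedChunkWrite {
    static void writeGuarded(MultiFileWriter writer, StoreFileMetaData md,
                             long position, BytesReference content, boolean lastChunk) throws IOException {
        writer.incRef();                              // keep the writer's resources alive while we write
        try (Releasable ignored = writer::decRef) {   // release our reference when the write finishes
            writer.writeFileChunk(md, position, content, lastChunk);
        }
        // close() elsewhere only drops the initial reference; closeInternal() runs after the last decRef()
    }
}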

server/src/main/java/org/elasticsearch/indices/recovery/RecoverySourceHandler.java (+1, -1)

@@ -893,7 +893,7 @@ protected FileChunk nextChunkRequest(StoreFileMetaData md) throws IOException {
         }

         @Override
-        protected void sendChunkRequest(FileChunk request, ActionListener<Void> listener) {
+        protected void executeChunkRequest(FileChunk request, ActionListener<Void> listener) {
             cancellableThreads.checkForCancel();
             recoveryTarget.writeFileChunk(
                 request.md, request.position, request.content, request.lastChunk, translogOps.getAsInt(), listener);

x-pack/plugin/ccr/src/main/java/org/elasticsearch/xpack/ccr/repository/CcrRepository.java (+87, -87)

@@ -12,7 +12,6 @@
 import org.apache.logging.log4j.Logger;
 import org.apache.logging.log4j.message.ParameterizedMessage;
 import org.apache.lucene.index.IndexCommit;
-import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.ElasticsearchSecurityException;
 import org.elasticsearch.ExceptionsHelper;
 import org.elasticsearch.action.ActionListener;
@@ -21,6 +20,7 @@
 import org.elasticsearch.action.admin.indices.mapping.put.PutMappingRequest;
 import org.elasticsearch.action.support.ListenerTimeouts;
 import org.elasticsearch.action.support.PlainActionFuture;
+import org.elasticsearch.action.support.ThreadedActionListener;
 import org.elasticsearch.client.Client;
 import org.elasticsearch.cluster.ClusterName;
 import org.elasticsearch.cluster.metadata.IndexMetaData;
@@ -31,18 +31,16 @@
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.UUIDs;
 import org.elasticsearch.common.collect.ImmutableOpenMap;
-import org.elasticsearch.common.collect.Tuple;
 import org.elasticsearch.common.component.AbstractLifecycleComponent;
+import org.elasticsearch.common.lease.Releasable;
 import org.elasticsearch.common.metrics.CounterMetric;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.unit.ByteSizeValue;
 import org.elasticsearch.common.unit.TimeValue;
-import org.elasticsearch.common.util.concurrent.AbstractRunnable;
 import org.elasticsearch.common.util.concurrent.ThreadContext;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.index.engine.EngineException;
 import org.elasticsearch.index.mapper.MapperService;
-import org.elasticsearch.index.seqno.LocalCheckpointTracker;
 import org.elasticsearch.index.seqno.RetentionLeaseAlreadyExistsException;
 import org.elasticsearch.index.seqno.RetentionLeaseInvalidRetainingSeqNoException;
 import org.elasticsearch.index.seqno.RetentionLeaseNotFoundException;
@@ -54,6 +52,7 @@
 import org.elasticsearch.index.snapshots.blobstore.SnapshotFiles;
 import org.elasticsearch.index.store.Store;
 import org.elasticsearch.index.store.StoreFileMetaData;
+import org.elasticsearch.indices.recovery.MultiFileTransfer;
 import org.elasticsearch.indices.recovery.MultiFileWriter;
 import org.elasticsearch.indices.recovery.RecoveryState;
 import org.elasticsearch.repositories.IndexId;
@@ -87,12 +86,11 @@
 import java.util.Map;
 import java.util.Optional;
 import java.util.Set;
-import java.util.concurrent.atomic.AtomicReference;
 import java.util.function.LongConsumer;
 import java.util.function.Supplier;
+import java.util.stream.Collectors;

 import static org.elasticsearch.index.seqno.RetentionLeaseActions.RETAIN_ALL;
-import static org.elasticsearch.index.seqno.SequenceNumbers.NO_OPS_PERFORMED;
 import static org.elasticsearch.xpack.ccr.CcrRetentionLeases.retentionLeaseId;
 import static org.elasticsearch.xpack.ccr.CcrRetentionLeases.syncAddRetentionLease;
 import static org.elasticsearch.xpack.ccr.CcrRetentionLeases.syncRenewRetentionLease;
@@ -477,97 +475,82 @@ void restoreFiles(Store store) {
         }

         @Override
-        protected void restoreFiles(List<FileInfo> filesToRecover, Store store) throws IOException {
+        protected void restoreFiles(List<FileInfo> filesToRecover, Store store) {
             logger.trace("[{}] starting CCR restore of {} files", shardId, filesToRecover);
+            final PlainActionFuture<Void> restoreFilesFuture = new PlainActionFuture<>();
+            final List<StoreFileMetaData> mds = filesToRecover.stream().map(FileInfo::metadata).collect(Collectors.toList());
+            final MultiFileTransfer<FileChunk> multiFileTransfer = new MultiFileTransfer<FileChunk>(
+                logger, threadPool.getThreadContext(), restoreFilesFuture, ccrSettings.getMaxConcurrentFileChunks(), mds) {

-            try (MultiFileWriter multiFileWriter = new MultiFileWriter(store, recoveryState.getIndex(), "", logger, () -> {
-            })) {
-                final LocalCheckpointTracker requestSeqIdTracker = new LocalCheckpointTracker(NO_OPS_PERFORMED, NO_OPS_PERFORMED);
-                final AtomicReference<Tuple<StoreFileMetaData, Exception>> error = new AtomicReference<>();
+                final MultiFileWriter multiFileWriter = new MultiFileWriter(store, recoveryState.getIndex(), "", logger, () -> {});
+                long offset = 0;

-                for (FileInfo fileInfo : filesToRecover) {
-                    final long fileLength = fileInfo.length();
-                    long offset = 0;
-                    while (offset < fileLength && error.get() == null) {
-                        final long requestSeqId = requestSeqIdTracker.generateSeqNo();
-                        try {
-                            requestSeqIdTracker.waitForProcessedOpsToComplete(requestSeqId - ccrSettings.getMaxConcurrentFileChunks());
-
-                            if (error.get() != null) {
-                                requestSeqIdTracker.markSeqNoAsProcessed(requestSeqId);
-                                break;
-                            }
-
-                            final int bytesRequested = Math.toIntExact(
-                                Math.min(ccrSettings.getChunkSize().getBytes(), fileLength - offset));
-                            offset += bytesRequested;
-
-                            final GetCcrRestoreFileChunkRequest request =
-                                new GetCcrRestoreFileChunkRequest(node, sessionUUID, fileInfo.name(), bytesRequested);
-                            logger.trace("[{}] [{}] fetching chunk for file [{}], expected offset: {}, size: {}", shardId, snapshotId,
-                                fileInfo.name(), offset, bytesRequested);
-
-                            TimeValue timeout = ccrSettings.getRecoveryActionTimeout();
-                            ActionListener<GetCcrRestoreFileChunkAction.GetCcrRestoreFileChunkResponse> listener =
-                                ListenerTimeouts.wrapWithTimeout(threadPool, ActionListener.wrap(
-                                    r -> threadPool.generic().execute(new AbstractRunnable() {
-                                        @Override
-                                        public void onFailure(Exception e) {
-                                            error.compareAndSet(null, Tuple.tuple(fileInfo.metadata(), e));
-                                            requestSeqIdTracker.markSeqNoAsProcessed(requestSeqId);
-                                        }
-
-                                        @Override
-                                        protected void doRun() throws Exception {
-                                            final int actualChunkSize = r.getChunk().length();
-                                            logger.trace("[{}] [{}] got response for file [{}], offset: {}, length: {}", shardId,
-                                                snapshotId, fileInfo.name(), r.getOffset(), actualChunkSize);
-                                            final long nanosPaused = ccrSettings.getRateLimiter().maybePause(actualChunkSize);
-                                            throttleListener.accept(nanosPaused);
-                                            final boolean lastChunk = r.getOffset() + actualChunkSize >= fileLength;
-                                            multiFileWriter.writeFileChunk(fileInfo.metadata(), r.getOffset(), r.getChunk(), lastChunk);
-                                            requestSeqIdTracker.markSeqNoAsProcessed(requestSeqId);
-                                        }
-                                    }),
-                                    e -> {
-                                        error.compareAndSet(null, Tuple.tuple(fileInfo.metadata(), e));
-                                        requestSeqIdTracker.markSeqNoAsProcessed(requestSeqId);
-                                    }
-                                ), timeout, ThreadPool.Names.GENERIC, GetCcrRestoreFileChunkAction.NAME);
-                            remoteClient.execute(GetCcrRestoreFileChunkAction.INSTANCE, request, listener);
-                        } catch (Exception e) {
-                            error.compareAndSet(null, Tuple.tuple(fileInfo.metadata(), e));
-                            requestSeqIdTracker.markSeqNoAsProcessed(requestSeqId);
-                        }
-                    }
+                @Override
+                protected void onNewFile(StoreFileMetaData md) {
+                    offset = 0;
                 }

-                try {
-                    requestSeqIdTracker.waitForProcessedOpsToComplete(requestSeqIdTracker.getMaxSeqNo());
-                } catch (InterruptedException e) {
-                    Thread.currentThread().interrupt();
-                    throw new ElasticsearchException(e);
+                @Override
+                protected FileChunk nextChunkRequest(StoreFileMetaData md) {
+                    final int bytesRequested = Math.toIntExact(Math.min(ccrSettings.getChunkSize().getBytes(), md.length() - offset));
+                    offset += bytesRequested;
+                    return new FileChunk(md, bytesRequested, offset == md.length());
                 }
-                if (error.get() != null) {
-                    handleError(store, error.get().v2());
+
+                @Override
+                protected void executeChunkRequest(FileChunk request, ActionListener<Void> listener) {
+                    final ActionListener<GetCcrRestoreFileChunkAction.GetCcrRestoreFileChunkResponse> threadedListener
+                        = new ThreadedActionListener<>(logger, threadPool, ThreadPool.Names.GENERIC, ActionListener.wrap(
+                            r -> {
+                                writeFileChunk(request.md, r);
+                                listener.onResponse(null);
+                            }, listener::onFailure), false);
+
+                    remoteClient.execute(GetCcrRestoreFileChunkAction.INSTANCE,
+                        new GetCcrRestoreFileChunkRequest(node, sessionUUID, request.md.name(), request.bytesRequested),
+                        ListenerTimeouts.wrapWithTimeout(threadPool, threadedListener, ccrSettings.getRecoveryActionTimeout(),
+                            ThreadPool.Names.GENERIC, GetCcrRestoreFileChunkAction.NAME));
                 }
-            }

-            logger.trace("[{}] completed CCR restore", shardId);
-        }
+                private void writeFileChunk(StoreFileMetaData md,
+                                            GetCcrRestoreFileChunkAction.GetCcrRestoreFileChunkResponse r) throws Exception {
+                    final int actualChunkSize = r.getChunk().length();
+                    logger.trace("[{}] [{}] got response for file [{}], offset: {}, length: {}",
+                        shardId, snapshotId, md.name(), r.getOffset(), actualChunkSize);
+                    final long nanosPaused = ccrSettings.getRateLimiter().maybePause(actualChunkSize);
+                    throttleListener.accept(nanosPaused);
+                    multiFileWriter.incRef();
+                    try (Releasable ignored = multiFileWriter::decRef) {
+                        final boolean lastChunk = r.getOffset() + actualChunkSize >= md.length();
+                        multiFileWriter.writeFileChunk(md, r.getOffset(), r.getChunk(), lastChunk);
+                    } catch (Exception e) {
+                        handleError(md, e);
+                        throw e;
+                    }
+                }
+
+                @Override
+                protected void handleError(StoreFileMetaData md, Exception e) throws Exception {
+                    final IOException corruptIndexException;
+                    if ((corruptIndexException = ExceptionsHelper.unwrapCorruption(e)) != null) {
+                        try {
+                            store.markStoreCorrupted(corruptIndexException);
+                        } catch (IOException ioe) {
+                            logger.warn("store cannot be marked as corrupted", e);
+                        }
+                        throw corruptIndexException;
+                    }
+                    throw e;
+                }

-        private void handleError(Store store, Exception e) throws IOException {
-            final IOException corruptIndexException;
-            if ((corruptIndexException = ExceptionsHelper.unwrapCorruption(e)) != null) {
-                try {
-                    store.markStoreCorrupted(corruptIndexException);
-                } catch (IOException ioe) {
-                    logger.warn("store cannot be marked as corrupted", e);
+                @Override
+                public void close() {
+                    multiFileWriter.close();
                 }
-                throw corruptIndexException;
-            } else {
-                ExceptionsHelper.reThrowIfNotNull(e);
-            }
+            };
+            multiFileTransfer.start();
+            restoreFilesFuture.actionGet();
+            logger.trace("[{}] completed CCR restore", shardId);
         }

         @Override
@@ -576,5 +559,22 @@ public void close() {
             ClearCcrRestoreSessionAction.ClearCcrRestoreSessionResponse response =
                 remoteClient.execute(ClearCcrRestoreSessionAction.INSTANCE, clearRequest).actionGet(ccrSettings.getRecoveryActionTimeout());
         }
+
+        private static class FileChunk implements MultiFileTransfer.ChunkRequest {
+            final StoreFileMetaData md;
+            final int bytesRequested;
+            final boolean lastChunk;
+
+            FileChunk(StoreFileMetaData md, int bytesRequested, boolean lastChunk) {
+                this.md = md;
+                this.bytesRequested = bytesRequested;
+                this.lastChunk = lastChunk;
+            }
+
+            @Override
+            public boolean lastChunk() {
+                return lastChunk;
+            }
+        }
     }
 }
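
Read together with the MultiFileTransfer change, the new restoreFiles() reduces to a simple driver: build the metadata list, hand it to the transfer along with a future-backed listener, start it, and block on the future. A hypothetical driver using the ExampleFileTransfer sketched after the MultiFileTransfer diff (the calls to start() and actionGet() mirror the commit; everything else is illustrative, not part of the commit):

// Hypothetical driver, not part of the commit; mirrors the control flow of the new restoreFiles().
import org.apache.logging.log4j.Logger;
import org.elasticsearch.action.support.PlainActionFuture;
import org.elasticsearch.common.util.concurrent.ThreadContext;
import org.elasticsearch.index.store.StoreFileMetaData;

import java.util.List;

final class ExampleTransferDriver {
    static void transferAll(Logger logger, ThreadContext threadContext,
                            List<StoreFileMetaData> files, int maxConcurrentChunks) {
        final PlainActionFuture<Void> future = new PlainActionFuture<>();
        final ExampleFileTransfer transfer =
            new ExampleFileTransfer(logger, threadContext, future, maxConcurrentChunks, files);
        transfer.start();    // requests up to maxConcurrentChunks chunks at a time, file by file
        future.actionGet();  // returns when every file has been transferred, or rethrows the first failure
    }
}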
