Skip to content

core: use exponential backoff for name resolution #4105

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Mar 6, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion core/src/main/java/io/grpc/NameResolver.java
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,10 @@
* <p>The addresses and attributes of a target may be changed over time, thus the caller registers a
* {@link Listener} to receive continuous updates.
*
* <p>A {@code NameResolver} does not need to automatically re-resolve on failure. Instead, the
* {@link Listener} is responsible for eventually (after an appropriate backoff period) invoking
* {@link #refresh()}.
*
* @since 1.0.0
*/
@ExperimentalApi("https://github.com/grpc/grpc-java/issues/1770")
Expand Down Expand Up @@ -136,7 +140,8 @@ public interface Listener {
void onAddresses(List<EquivalentAddressGroup> servers, Attributes attributes);

/**
* Handles an error from the resolver.
* Handles an error from the resolver. The listener is responsible for eventually invoking
* {@link #refresh()} to re-attempt resolution.
*
* @param error a non-OK status
* @since 1.0.0
Expand Down
43 changes: 0 additions & 43 deletions core/src/main/java/io/grpc/internal/DnsNameResolver.java
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,6 @@
import java.util.Collections;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
Expand Down Expand Up @@ -79,29 +76,22 @@ final class DnsNameResolver extends NameResolver {
private final String authority;
private final String host;
private final int port;
private final Resource<ScheduledExecutorService> timerServiceResource;
private final Resource<ExecutorService> executorResource;
private final ProxyDetector proxyDetector;
@GuardedBy("this")
private boolean shutdown;
@GuardedBy("this")
private ScheduledExecutorService timerService;
@GuardedBy("this")
private ExecutorService executor;
@GuardedBy("this")
private ScheduledFuture<?> resolutionTask;
@GuardedBy("this")
private boolean resolving;
@GuardedBy("this")
private Listener listener;

DnsNameResolver(@Nullable String nsAuthority, String name, Attributes params,
Resource<ScheduledExecutorService> timerServiceResource,
Resource<ExecutorService> executorResource,
ProxyDetector proxyDetector) {
// TODO: if a DNS server is provided as nsAuthority, use it.
// https://www.captechconsulting.com/blogs/accessing-the-dusty-corners-of-dns-with-java
this.timerServiceResource = timerServiceResource;
this.executorResource = executorResource;
// Must prepend a "//" to the name when constructing a URI, otherwise it will be treated as an
// opaque URI, thus the authority and host of the resulted URI would be null.
Expand Down Expand Up @@ -131,7 +121,6 @@ public final String getServiceAuthority() {
@Override
public final synchronized void start(Listener listener) {
Preconditions.checkState(this.listener == null, "already started");
timerService = SharedResourceHolder.get(timerServiceResource);
executor = SharedResourceHolder.get(executorResource);
this.listener = Preconditions.checkNotNull(listener, "listener");
resolve();
Expand All @@ -148,11 +137,6 @@ public final synchronized void refresh() {
public void run() {
Listener savedListener;
synchronized (DnsNameResolver.this) {
// If this task is started by refresh(), there might already be a scheduled task.
if (resolutionTask != null) {
resolutionTask.cancel(false);
resolutionTask = null;
}
if (shutdown) {
return;
}
Expand All @@ -171,16 +155,6 @@ public void run() {
try {
resolvedInetAddrs = delegateResolver.resolve(host);
} catch (Exception e) {
synchronized (DnsNameResolver.this) {
if (shutdown) {
return;
}
// Because timerService is the single-threaded GrpcUtil.TIMER_SERVICE in production,
// we need to delegate the blocking work to the executor
resolutionTask =
timerService.schedule(new LogExceptionRunnable(resolutionRunnableOnExecutor),
1, TimeUnit.MINUTES);
}
savedListener.onError(
Status.UNAVAILABLE.withDescription("Unable to resolve host " + host).withCause(e));
return;
Expand All @@ -207,17 +181,6 @@ public void run() {
}
};

private final Runnable resolutionRunnableOnExecutor = new Runnable() {
@Override
public void run() {
synchronized (DnsNameResolver.this) {
if (!shutdown) {
executor.execute(resolutionRunnable);
}
}
}
};

@GuardedBy("this")
private void resolve() {
if (resolving || shutdown) {
Expand All @@ -232,12 +195,6 @@ public final synchronized void shutdown() {
return;
}
shutdown = true;
if (resolutionTask != null) {
resolutionTask.cancel(false);
}
if (timerService != null) {
timerService = SharedResourceHolder.release(timerServiceResource, timerService);
}
if (executor != null) {
executor = SharedResourceHolder.release(executorResource, executor);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,12 @@ public DnsNameResolver newNameResolver(URI targetUri, Attributes params) {
Preconditions.checkArgument(targetPath.startsWith("/"),
"the path component (%s) of the target (%s) must start with '/'", targetPath, targetUri);
String name = targetPath.substring(1);
return new DnsNameResolver(targetUri.getAuthority(), name, params, GrpcUtil.TIMER_SERVICE,
GrpcUtil.SHARED_CHANNEL_EXECUTOR, GrpcUtil.getProxyDetector());
return new DnsNameResolver(
targetUri.getAuthority(),
name,
params,
GrpcUtil.SHARED_CHANNEL_EXECUTOR,
GrpcUtil.getProxyDetector());
} else {
return null;
}
Expand Down
128 changes: 100 additions & 28 deletions core/src/main/java/io/grpc/internal/ManagedChannelImpl.java
Original file line number Diff line number Diff line change
Expand Up @@ -344,6 +344,7 @@ private void shutdownNameResolverAndLoadBalancer(boolean verifyActive) {
checkState(lbHelper != null, "lbHelper is null");
}
if (nameResolver != null) {
cancelNameResolverBackoff();
nameResolver.shutdown();
nameResolver = null;
nameResolverStarted = false;
Expand Down Expand Up @@ -429,6 +430,46 @@ public void run() {
idleTimeoutMillis, TimeUnit.MILLISECONDS);
}

// Run from channelExecutor
@VisibleForTesting
class NameResolverRefresh implements Runnable {
// Only mutated from channelExecutor
boolean cancelled;

@Override
public void run() {
if (cancelled) {
// Race detected: this task was scheduled on channelExecutor before
// cancelNameResolverBackoff() could cancel the timer.
return;
}
nameResolverRefreshFuture = null;
nameResolverRefresh = null;
if (nameResolver != null) {
nameResolver.refresh();
}
}
}

// Must be used from channelExecutor
@Nullable private ScheduledFuture<?> nameResolverRefreshFuture;
// Must be used from channelExecutor
@Nullable private NameResolverRefresh nameResolverRefresh;
// The policy to control backoff between name resolution attempts. Non-null when an attempt is
// scheduled. Must be used from channelExecutor
@Nullable private BackoffPolicy nameResolverBackoffPolicy;

// Must be run from channelExecutor
private void cancelNameResolverBackoff() {
if (nameResolverRefreshFuture != null) {
nameResolverRefreshFuture.cancel(false);
nameResolverRefresh.cancelled = true;
nameResolverRefreshFuture = null;
nameResolverRefresh = null;
nameResolverBackoffPolicy = null;
}
}

private final ClientTransportProvider transportProvider = new ClientTransportProvider() {
@Override
public ClientTransport get(PickSubchannelArgs args) {
Expand Down Expand Up @@ -799,24 +840,28 @@ public void run() {

@Override
public void resetConnectBackoff() {
channelExecutor.executeLater(
new Runnable() {
@Override
public void run() {
if (shutdown.get()) {
return;
}
if (nameResolverStarted) {
nameResolver.refresh();
}
for (InternalSubchannel subchannel : subchannels) {
subchannel.resetConnectBackoff();
}
for (OobChannel oobChannel : oobChannels) {
oobChannel.resetConnectBackoff();
}
}
}).drain();
channelExecutor
.executeLater(
new Runnable() {
@Override
public void run() {
if (shutdown.get()) {
return;
}
if (nameResolverRefreshFuture != null) {
checkState(nameResolverStarted, "name resolver must be started");
cancelNameResolverBackoff();
nameResolver.refresh();
}
for (InternalSubchannel subchannel : subchannels) {
subchannel.resetConnectBackoff();
}
for (OobChannel oobChannel : oobChannels) {
oobChannel.resetConnectBackoff();
}
}
})
.drain();
}

@Override
Expand Down Expand Up @@ -1132,6 +1177,8 @@ public void run() {
return;
}

nameResolverBackoffPolicy = null;

try {
if (retryEnabled) {
retryPolicies = getRetryPolicies(config);
Expand All @@ -1156,16 +1203,41 @@ public void onError(final Status error) {
checkArgument(!error.isOk(), "the error status must not be OK");
logger.log(Level.WARNING, "[{0}] Failed to resolve name. status={1}",
new Object[] {getLogId(), error});
channelExecutor.executeLater(new Runnable() {
@Override
public void run() {
// Call LB only if it's not shutdown. If LB is shutdown, lbHelper won't match.
if (NameResolverListenerImpl.this.helper != ManagedChannelImpl.this.lbHelper) {
return;
}
balancer.handleNameResolutionError(error);
}
}).drain();
channelExecutor
.executeLater(
new Runnable() {
@Override
public void run() {
// Call LB only if it's not shutdown. If LB is shutdown, lbHelper won't match.
if (NameResolverListenerImpl.this.helper != ManagedChannelImpl.this.lbHelper) {
return;
}
balancer.handleNameResolutionError(error);
if (nameResolverRefreshFuture != null) {
// The name resolver may invoke onError multiple times, but we only want to
// schedule one backoff attempt
// TODO(ericgribkoff) Update contract of NameResolver.Listener or decide if we
// want to reset the backoff interval upon repeated onError() calls
return;
}
if (nameResolverBackoffPolicy == null) {
nameResolverBackoffPolicy = backoffPolicyProvider.get();
}
long delayNanos = nameResolverBackoffPolicy.nextBackoffNanos();
if (logger.isLoggable(Level.FINE)) {
logger.log(
Level.FINE,
"[{0}] Scheduling DNS resolution backoff for {1} ns",
new Object[] {logId, delayNanos});
}
nameResolverRefresh = new NameResolverRefresh();
nameResolverRefreshFuture =
transportFactory
.getScheduledExecutorService()
.schedule(nameResolverRefresh, delayNanos, TimeUnit.NANOSECONDS);
}
})
.drain();
}
}

Expand Down
Loading