Skip to content

Commit 08d39c6

Browse files
committed
Race condition in TarantoolClientImpl
A state of a client is a set of the following flags: {READING, WRITING, RECONNECT, CLOSED}. Let's name a state when no flags are set UNINITIALIZED. A reader thread sets READING, performs reading until an error or an interruption, drops READING and tries to trigger reconnection (if a state allows, see below). A writer do quite same things, but with the WRITING flag. The key point here is that a reconnection is triggered from a reader/writer thread and only when certain conditions are met. The prerequisite to set RECONNECT and signal (unpark) a connector thread is that a client has UNINITIALIZED state. There are several problems here: - Say, a reader stalls a bit after dropping READING, then a writer drops WRITING and trigger reconnection. Then reader wokes up and set RECONNECT again. - Calling unpark() N times for a connector thread when it is alive doesn't lead to skipping next N park() calls, so the problem above is not just about extra reconnection, but lead the connector thread to be stuck. - Say, a reader stalls just before setting READING. A writer is hit by an IO error and triggers reconnection (set RECONNECT, unpark connector). Then the reader wakes up and set READING+RECONNECT state that disallows a connector thread to proceed further (it expects pure RECONNECT). Even when the reader drops READING it will not wake up (unpark) the connector thread, because RECONNECT was already set (state is not UNINITIALIZED). This commit introduces several changes that eliminate the problems above: - Use ReentrantLock + Condition instead of park() / unpark() to never miss signals to reconnect, does not matter whether a connector is parked. - Ensure a reader and a writer threads from one generation (that are created on the same reconnection iteration) triggers reconnection once. - Hold RECONNECT state most of time a connector works (while acquiring a new socket, connecting and reading Tarantool greeting) and prevent to set READING/WRITING while RECONNECT is set. Fixes: #142 Affects: #34, #136
1 parent 06755d5 commit 08d39c6

File tree

2 files changed

+141
-70
lines changed

2 files changed

+141
-70
lines changed

Diff for: src/main/java/org/tarantool/TarantoolClientImpl.java

+138-66
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
import java.util.concurrent.atomic.AtomicInteger;
2323
import java.util.concurrent.atomic.AtomicReference;
2424
import java.util.concurrent.locks.Condition;
25-
import java.util.concurrent.locks.LockSupport;
2625
import java.util.concurrent.locks.ReentrantLock;
2726

2827

@@ -70,10 +69,12 @@ public class TarantoolClientImpl extends TarantoolBase<Future<?>> implements Tar
7069
@Override
7170
public void run() {
7271
while (!Thread.currentThread().isInterrupted()) {
73-
if (state.compareAndSet(StateHelper.RECONNECT, 0)) {
74-
reconnect(0, thumbstone);
72+
reconnect(0, thumbstone);
73+
try {
74+
state.awaitReconnection();
75+
} catch (InterruptedException e) {
76+
Thread.currentThread().interrupt();
7577
}
76-
LockSupport.park(state);
7778
}
7879
}
7980
});
@@ -139,16 +140,13 @@ protected void reconnect(int retry, Throwable lastError) {
139140
protected void connect(final SocketChannel channel) throws Exception {
140141
try {
141142
TarantoolGreeting greeting = ProtoUtils.connect(channel,
142-
config.username, config.password);
143+
config.username, config.password);
143144
this.serverVersion = greeting.getServerVersion();
144145
} catch (IOException e) {
145-
try {
146-
channel.close();
147-
} catch (IOException ignored) {
148-
}
149-
146+
closeChannel(channel);
150147
throw new CommunicationException("Couldn't connect to tarantool", e);
151148
}
149+
152150
channel.configureBlocking(false);
153151
this.channel = channel;
154152
this.readChannel = new ReadableViaSelectorChannel(channel);
@@ -164,42 +162,42 @@ protected void connect(final SocketChannel channel) throws Exception {
164162
}
165163

166164
protected void startThreads(String threadName) throws InterruptedException {
167-
final CountDownLatch init = new CountDownLatch(2);
168-
reader = new Thread(new Runnable() {
169-
@Override
170-
public void run() {
171-
init.countDown();
172-
if (state.acquire(StateHelper.READING)) {
173-
try {
174-
readThread();
175-
} finally {
176-
state.release(StateHelper.READING);
177-
if (state.compareAndSet(0, StateHelper.RECONNECT))
178-
LockSupport.unpark(connector);
165+
final CountDownLatch ioThreadStarted = new CountDownLatch(2);
166+
final AtomicInteger leftIoThreads = new AtomicInteger(2);
167+
reader = new Thread(() -> {
168+
ioThreadStarted.countDown();
169+
if (state.acquire(StateHelper.READING)) {
170+
try {
171+
readThread();
172+
} finally {
173+
state.release(StateHelper.READING);
174+
// only last of two IO-threads can signal for reconnection
175+
if (leftIoThreads.decrementAndGet() == 0) {
176+
state.trySignalForReconnection();
179177
}
180178
}
181179
}
182180
});
183-
writer = new Thread(new Runnable() {
184-
@Override
185-
public void run() {
186-
init.countDown();
187-
if (state.acquire(StateHelper.WRITING)) {
188-
try {
189-
writeThread();
190-
} finally {
191-
state.release(StateHelper.WRITING);
192-
if (state.compareAndSet(0, StateHelper.RECONNECT))
193-
LockSupport.unpark(connector);
181+
writer = new Thread(() -> {
182+
ioThreadStarted.countDown();
183+
if (state.acquire(StateHelper.WRITING)) {
184+
try {
185+
writeThread();
186+
} finally {
187+
state.release(StateHelper.WRITING);
188+
// only last of two IO-threads can signal for reconnection
189+
if (leftIoThreads.decrementAndGet() == 0) {
190+
state.trySignalForReconnection();
194191
}
195192
}
196193
}
197194
});
195+
state.release(StateHelper.RECONNECT);
198196

199197
configureThreads(threadName);
200198
reader.start();
201199
writer.start();
202-
init.await();
200+
ioThreadStarted.await();
203201
}
204202

205203
protected void configureThreads(String threadName) {
@@ -337,25 +335,21 @@ private boolean directWrite(ByteBuffer buffer) throws InterruptedException, IOEx
337335
}
338336

339337
protected void readThread() {
340-
try {
341-
while (!Thread.currentThread().isInterrupted()) {
342-
try {
343-
TarantoolPacket packet = ProtoUtils.readPacket(readChannel);
338+
while (!Thread.currentThread().isInterrupted()) {
339+
try {
340+
TarantoolPacket packet = ProtoUtils.readPacket(readChannel);
344341

345-
Map<Integer, Object> headers = packet.getHeaders();
342+
Map<Integer, Object> headers = packet.getHeaders();
346343

347-
Long syncId = (Long) headers.get(Key.SYNC.getId());
348-
TarantoolOp<?> future = futures.remove(syncId);
349-
stats.received++;
350-
wait.decrementAndGet();
351-
complete(packet, future);
352-
} catch (Exception e) {
353-
die("Cant read answer", e);
354-
return;
355-
}
344+
Long syncId = (Long) headers.get(Key.SYNC.getId());
345+
TarantoolOp<?> future = futures.remove(syncId);
346+
stats.received++;
347+
wait.decrementAndGet();
348+
complete(packet, future);
349+
} catch (Exception e) {
350+
die("Cant read answer", e);
351+
return;
356352
}
357-
} catch (Exception e) {
358-
die("Cant init thread", e);
359353
}
360354
}
361355

@@ -498,7 +492,7 @@ public TarantoolClientOps<Integer, List<?>, Object, List<?>> syncOps() {
498492

499493
@Override
500494
public TarantoolClientOps<Integer, List<?>, Object, Future<List<?>>> asyncOps() {
501-
return (TarantoolClientOps)this;
495+
return (TarantoolClientOps) this;
502496
}
503497

504498
@Override
@@ -514,7 +508,7 @@ public TarantoolClientOps<Integer, List<?>, Object, Long> fireAndForgetOps() {
514508

515509
@Override
516510
public TarantoolSQLOps<Object, Long, List<Map<String, Object>>> sqlSyncOps() {
517-
return new TarantoolSQLOps<Object, Long, List<Map<String,Object>>>() {
511+
return new TarantoolSQLOps<Object, Long, List<Map<String, Object>>>() {
518512

519513
@Override
520514
public Long update(String sql, Object... bind) {
@@ -530,7 +524,7 @@ public List<Map<String, Object>> query(String sql, Object... bind) {
530524

531525
@Override
532526
public TarantoolSQLOps<Object, Future<Long>, Future<List<Map<String, Object>>>> sqlAsyncOps() {
533-
return new TarantoolSQLOps<Object, Future<Long>, Future<List<Map<String,Object>>>>() {
527+
return new TarantoolSQLOps<Object, Future<Long>, Future<List<Map<String, Object>>>>() {
534528
@Override
535529
public Future<Long> update(String sql, Object... bind) {
536530
return (Future<Long>) exec(Code.EXECUTE, Key.SQL_TEXT, sql, Key.SQL_BIND, bind);
@@ -618,6 +612,7 @@ public TarantoolClientStats getStats() {
618612
* Manages state changes.
619613
*/
620614
protected final class StateHelper {
615+
static final int UNINITIALIZED = 0;
621616
static final int READING = 1;
622617
static final int WRITING = 2;
623618
static final int ALIVE = READING | WRITING;
@@ -627,10 +622,22 @@ protected final class StateHelper {
627622
private final AtomicInteger state;
628623

629624
private final AtomicReference<CountDownLatch> nextAliveLatch =
630-
new AtomicReference<CountDownLatch>(new CountDownLatch(1));
625+
new AtomicReference<>(new CountDownLatch(1));
631626

632627
private final CountDownLatch closedLatch = new CountDownLatch(1);
633628

629+
/**
630+
* The condition variable to signal a reconnection is needed from reader /
631+
* writer threads and waiting for that signal from the reconnection thread.
632+
*
633+
* The lock variable to access this condition.
634+
*
635+
* @see #awaitReconnection()
636+
* @see #trySignalForReconnection()
637+
*/
638+
protected final ReentrantLock connectorLock = new ReentrantLock();
639+
protected final Condition reconnectRequired = connectorLock.newCondition();
640+
634641
protected StateHelper(int state) {
635642
this.state = new AtomicInteger(state);
636643
}
@@ -639,35 +646,60 @@ protected int getState() {
639646
return state.get();
640647
}
641648

649+
/**
650+
* Set CLOSED state, drop RECONNECT state.
651+
*/
642652
protected boolean close() {
643-
for (;;) {
653+
for (; ; ) {
644654
int st = getState();
655+
656+
/* CLOSED is the terminal state. */
645657
if ((st & CLOSED) == CLOSED)
646658
return false;
659+
660+
/* Drop RECONNECT, set CLOSED. */
647661
if (compareAndSet(st, (st & ~RECONNECT) | CLOSED))
648662
return true;
649663
}
650664
}
651665

666+
/**
667+
* Move from a current state to a give one.
668+
*
669+
* Some moves are forbidden.
670+
*/
652671
protected boolean acquire(int mask) {
653-
for (;;) {
654-
int st = getState();
655-
if ((st & CLOSED) == CLOSED)
672+
for (; ; ) {
673+
int currentState = getState();
674+
675+
/* CLOSED is the terminal state. */
676+
if ((currentState & CLOSED) == CLOSED) {
656677
return false;
678+
}
679+
680+
/* Don't move to READING, WRITING or ALIVE from RECONNECT. */
681+
if ((currentState & RECONNECT) > mask) {
682+
return false;
683+
}
657684

658-
if ((st & mask) != 0)
685+
/* Cannot move from a state to the same state. */
686+
if ((currentState & mask) != 0) {
659687
throw new IllegalStateException("State is already " + mask);
688+
}
660689

661-
if (compareAndSet(st, st | mask))
690+
/* Set acquired state. */
691+
if (compareAndSet(currentState, currentState | mask)) {
662692
return true;
693+
}
663694
}
664695
}
665696

666697
protected void release(int mask) {
667-
for (;;) {
698+
for (; ; ) {
668699
int st = getState();
669-
if (compareAndSet(st, st & ~mask))
700+
if (compareAndSet(st, st & ~mask)) {
670701
return;
702+
}
671703
}
672704
}
673705

@@ -686,10 +718,18 @@ protected boolean compareAndSet(int expect, int update) {
686718
return true;
687719
}
688720

721+
/**
722+
* Reconnection uses another way to await state via receiving a signal
723+
* instead of latches.
724+
*/
689725
protected void awaitState(int state) throws InterruptedException {
690-
CountDownLatch latch = getStateLatch(state);
691-
if (latch != null) {
692-
latch.await();
726+
if (state == RECONNECT) {
727+
awaitReconnection();
728+
} else {
729+
CountDownLatch latch = getStateLatch(state);
730+
if (latch != null) {
731+
latch.await();
732+
}
693733
}
694734
}
695735

@@ -709,10 +749,42 @@ private CountDownLatch getStateLatch(int state) {
709749
CountDownLatch latch = nextAliveLatch.get();
710750
/* It may happen so that an error is detected but the state is still alive.
711751
Wait for the 'next' alive state in such cases. */
712-
return (getState() == ALIVE && thumbstone == null) ? null : latch;
752+
return (getState() == ALIVE && thumbstone == null) ? null : latch;
713753
}
714754
return null;
715755
}
756+
757+
/**
758+
* Blocks until a reconnection signal will be received.
759+
*
760+
* @see #trySignalForReconnection()
761+
*/
762+
private void awaitReconnection() throws InterruptedException {
763+
connectorLock.lock();
764+
try {
765+
while (getState() != StateHelper.RECONNECT) {
766+
reconnectRequired.await();
767+
}
768+
} finally {
769+
connectorLock.unlock();
770+
}
771+
}
772+
773+
/**
774+
* Signals to the connector that reconnection process can be performed.
775+
*
776+
* @see #awaitReconnection()
777+
*/
778+
private void trySignalForReconnection() {
779+
if (compareAndSet(StateHelper.UNINITIALIZED, StateHelper.RECONNECT)) {
780+
connectorLock.lock();
781+
try {
782+
reconnectRequired.signal();
783+
} finally {
784+
connectorLock.unlock();
785+
}
786+
}
787+
}
716788
}
717789

718790
protected static class TarantoolOp<V> extends CompletableFuture<V> {

Diff for: src/test/java/org/tarantool/ClientReconnectIT.java

+3-4
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020

2121
import static org.junit.jupiter.api.Assertions.assertEquals;
2222
import static org.junit.jupiter.api.Assertions.assertFalse;
23-
import static org.junit.jupiter.api.Assertions.assertNull;
2423
import static org.junit.jupiter.api.Assertions.assertNotNull;
2524
import static org.junit.jupiter.api.Assertions.assertThrows;
2625
import static org.junit.jupiter.api.Assertions.assertTrue;
@@ -227,13 +226,13 @@ public void run() {
227226
public void testLongParallelCloseReconnects() {
228227
int numThreads = 4;
229228
int numClients = 4;
230-
int timeBudget = 30*1000;
229+
int timeBudget = 30 * 1000;
231230

232231
SocketChannelProvider provider = new TestSocketChannelProvider(host,
233232
port, RESTART_TIMEOUT).setSoLinger(0);
234233

235234
final AtomicReferenceArray<TarantoolClient> clients =
236-
new AtomicReferenceArray<TarantoolClient>(numClients);
235+
new AtomicReferenceArray<>(numClients);
237236

238237
for (int idx = 0; idx < clients.length(); idx++) {
239238
clients.set(idx, makeClient(provider));
@@ -301,7 +300,7 @@ public void run() {
301300

302301
// Wait for all threads to finish.
303302
try {
304-
assertTrue(latch.await(RESTART_TIMEOUT, TimeUnit.MILLISECONDS));
303+
assertTrue(latch.await(RESTART_TIMEOUT * 2, TimeUnit.MILLISECONDS));
305304
} catch (InterruptedException e) {
306305
fail(e);
307306
}

0 commit comments

Comments
 (0)