Skip to content

Commit 5e37e04

Browse files
author
Achille
authored
revisit wire corruption detection (#324)
* revisit wire corruption detection * PR comments
1 parent 5341c5f commit 5e37e04

File tree

1 file changed

+34
-32
lines changed

1 file changed

+34
-32
lines changed

conn.go

+34-32
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,9 @@ type Conn struct {
4444
// base network connection
4545
conn net.Conn
4646

47+
// number of inflight requests on the connection.
48+
inflight int32
49+
4750
// offset management (synchronized on the mutex field)
4851
mutex sync.Mutex
4952
offset int64
@@ -1236,6 +1239,18 @@ func (c *Conn) writeOperation(write func(time.Time, int32) error, read func(time
12361239
return c.do(&c.wdeadline, write, read)
12371240
}
12381241

1242+
func (c *Conn) enter() {
1243+
atomic.AddInt32(&c.inflight, +1)
1244+
}
1245+
1246+
func (c *Conn) leave() {
1247+
atomic.AddInt32(&c.inflight, -1)
1248+
}
1249+
1250+
func (c *Conn) concurrency() int {
1251+
return int(atomic.LoadInt32(&c.inflight))
1252+
}
1253+
12391254
func (c *Conn) do(d *connDeadline, write func(time.Time, int32) error, read func(time.Time, int) error) error {
12401255
id, err := c.doRequest(d, write)
12411256
if err != nil {
@@ -1261,6 +1276,7 @@ func (c *Conn) do(d *connDeadline, write func(time.Time, int32) error, read func
12611276
}
12621277

12631278
func (c *Conn) doRequest(d *connDeadline, write func(time.Time, int32) error) (id int32, err error) {
1279+
c.enter()
12641280
c.wlock.Lock()
12651281
c.correlationID++
12661282
id = c.correlationID
@@ -1272,67 +1288,53 @@ func (c *Conn) doRequest(d *connDeadline, write func(time.Time, int32) error) (i
12721288
// recoverable state so we're better off just giving up at this point to
12731289
// avoid any risk of corrupting the following operations.
12741290
c.conn.Close()
1291+
c.leave()
12751292
}
12761293

12771294
c.wlock.Unlock()
12781295
return
12791296
}
12801297

12811298
func (c *Conn) waitResponse(d *connDeadline, id int32) (deadline time.Time, size int, lock *sync.Mutex, err error) {
1282-
// I applied exactly zero scientific process to choose this value,
1283-
// it seemed to worked fine in practice tho.
1284-
//
1285-
// My guess is 100 iterations where the goroutine gets descheduled
1286-
// by calling runtime.Gosched() may end up on a wait of ~10ms to ~1s
1287-
// (if the programs is heavily CPU bound and has lots of goroutines),
1288-
// so it should allow for bailing quickly without taking too much risk
1289-
// to get false positives.
1290-
const maxAttempts = 100
1291-
var lastID int32
1292-
1293-
for attempt := 0; attempt < maxAttempts; {
1299+
for {
12941300
var rsz int32
12951301
var rid int32
12961302

12971303
c.rlock.Lock()
12981304
deadline = d.setConnReadDeadline(c.conn)
1305+
rsz, rid, err = c.peekResponseSizeAndID()
12991306

1300-
if rsz, rid, err = c.peekResponseSizeAndID(); err != nil {
1307+
if err != nil {
13011308
d.unsetConnReadDeadline()
13021309
c.conn.Close()
13031310
c.rlock.Unlock()
1304-
return
1311+
break
13051312
}
13061313

13071314
if id == rid {
13081315
c.skipResponseSizeAndID()
13091316
size, lock = int(rsz-4), &c.rlock
1310-
return
1317+
// Don't unlock the read mutex to yield ownership to the caller.
1318+
break
1319+
}
1320+
1321+
if c.concurrency() == 1 {
1322+
// If the goroutine is the only one waiting on this connection it
1323+
// should be impossible to read a correlation id different from the
1324+
// one it expects. This is a sign that the data we are reading on
1325+
// the wire is corrupted and the connection needs to be closed.
1326+
err = io.ErrNoProgress
1327+
c.rlock.Unlock()
1328+
break
13111329
}
13121330

13131331
// Optimistically release the read lock if a response has already
13141332
// been received but the current operation is not the target for it.
13151333
c.rlock.Unlock()
13161334
runtime.Gosched()
1317-
1318-
// This check is a safety mechanism, if we make too many loop
1319-
// iterations and always draw the same id then we could be facing
1320-
// corrupted data on the wire, or the goroutine(s) sharing ownership
1321-
// of this connection may have panicked and therefore will not be able
1322-
// to participate in consuming bytes from the wire. To prevent entering
1323-
// an infinite loop which reads the same value over and over we bail
1324-
// with the uncommon io.ErrNoProgress error which should give a good
1325-
// enough signal about what is going wrong.
1326-
if rid != lastID {
1327-
attempt++
1328-
} else {
1329-
attempt = 0
1330-
}
1331-
1332-
lastID = rid
13331335
}
13341336

1335-
err = io.ErrNoProgress
1337+
c.leave()
13361338
return
13371339
}
13381340

0 commit comments

Comments
 (0)