Skip to content

Commit d9377ea

Browse files
authored
fix: potential deadlock on channel shutdown (#278)
1 parent ebbea58 commit d9377ea

File tree

1 file changed

+91
-38
lines changed

1 file changed

+91
-38
lines changed

transport/graphsync/graphsync.go

+91-38
Original file line numberDiff line numberDiff line change
@@ -952,15 +952,22 @@ func (c *dtChannel) open(
952952
if c.gsKey != nil {
953953
// Cancel the existing graphsync request
954954
completed := c.completed
955-
err := c.cancelRequest(ctx)
955+
errch := c.cancelRequest(ctx)
956+
957+
// Wait for the complete callback to be called
958+
err := waitForCompleteHook(ctx, completed)
956959
if err != nil {
957-
return nil, xerrors.Errorf("restarting graphsync request: %w", err)
960+
return nil, xerrors.Errorf("%s: waiting for cancelled graphsync request to complete: %w", chid, err)
958961
}
959962

960-
// Wait for the cancel to go through
961-
err = waitForCancelComplete(ctx, completed)
963+
// Wait for the cancel request method to complete
964+
select {
965+
case err = <-errch:
966+
case <-ctx.Done():
967+
err = xerrors.Errorf("timed out waiting for graphsync request to be cancelled")
968+
}
962969
if err != nil {
963-
return nil, xerrors.Errorf("waiting for cancelled graphsync request to complete: %w", err)
970+
return nil, xerrors.Errorf("%s: restarting graphsync request: %w", chid, err)
964971
}
965972
}
966973

@@ -1003,7 +1010,7 @@ func (c *dtChannel) open(
10031010
}, nil
10041011
}
10051012

1006-
func waitForCancelComplete(ctx context.Context, completed chan struct{}) error {
1013+
func waitForCompleteHook(ctx context.Context, completed chan struct{}) error {
10071014
// Wait for the cancel to propagate through to graphsync, and for
10081015
// the graphsync request to complete
10091016
select {
@@ -1139,30 +1146,30 @@ func (c *dtChannel) resume(msg datatransfer.Message) error {
11391146
}
11401147

11411148
func (c *dtChannel) close(ctx context.Context) error {
1149+
var errch chan error
11421150
c.lk.Lock()
1143-
defer c.lk.Unlock()
1144-
1145-
// Check if the channel was already cancelled
1146-
if c.gsKey == nil {
1147-
return nil
1148-
}
1149-
1150-
// If it's a graphsync request
1151-
if c.gsKey.p == c.peerID {
1152-
// Cancel the request
1153-
return c.cancelRequest(ctx)
1151+
{
1152+
// Check if the channel was already cancelled
1153+
if c.gsKey != nil {
1154+
// Check whether it's a graphsync request or response
1155+
if c.gsKey.p == c.peerID {
1156+
// Cancel the request
1157+
errch = c.cancelRequest(ctx)
1158+
} else {
1159+
// Cancel the response
1160+
errch = c.cancelResponse()
1161+
}
1162+
}
11541163
}
1164+
c.lk.Unlock()
11551165

1156-
// It's a graphsync response
1157-
1158-
// If the requester already cancelled, bail out
1159-
if c.requesterCancelled {
1160-
return nil
1166+
// Wait for the cancel message to complete
1167+
select {
1168+
case err := <-errch:
1169+
return err
1170+
case <-ctx.Done():
1171+
return ctx.Err()
11611172
}
1162-
1163-
// Cancel the response
1164-
log.Debugf("%s: cancelling response", c.channelID)
1165-
return c.gs.CancelResponse(c.gsKey.p, c.gsKey.requestID)
11661173
}
11671174

11681175
// Called when the responder gets a cancel message from the requester
@@ -1217,34 +1224,80 @@ func (c *dtChannel) cleanup() {
12171224
}
12181225

12191226
func (c *dtChannel) shutdown(ctx context.Context) error {
1227+
// Cancel the graphsync request
12201228
c.lk.Lock()
1221-
defer c.lk.Unlock()
1229+
errch := c.cancelRequest(ctx)
1230+
c.lk.Unlock()
12221231

1223-
// Cancel the graphsync request
1224-
return c.cancelRequest(ctx)
1232+
// Wait for the cancel message to complete
1233+
select {
1234+
case err := <-errch:
1235+
return err
1236+
case <-ctx.Done():
1237+
return ctx.Err()
1238+
}
12251239
}
12261240

12271241
// Cancel the graphsync request.
12281242
// Note: must be called under the lock.
1229-
func (c *dtChannel) cancelRequest(ctx context.Context) error {
1243+
func (c *dtChannel) cancelRequest(ctx context.Context) chan error {
1244+
errch := make(chan error, 1)
1245+
12301246
// Check that the request has not already been cancelled
12311247
if c.gsKey == nil {
1232-
return nil
1248+
errch <- nil
1249+
return errch
12331250
}
12341251

1235-
log.Debugf("%s: cancelling request", c.channelID)
1236-
err := c.gs.CancelRequest(ctx, c.gsKey.requestID)
1237-
if err != nil {
1252+
// Clear the graphsync key to indicate that the request has been cancelled
1253+
gsKey := c.gsKey
1254+
c.gsKey = nil
1255+
1256+
go func() {
1257+
log.Debugf("%s: cancelling request", c.channelID)
1258+
err := c.gs.CancelRequest(ctx, gsKey.requestID)
1259+
12381260
// Ignore "request not found" errors
1239-
if !xerrors.Is(graphsync.RequestNotFoundErr{}, err) {
1240-
return xerrors.Errorf("cancelling graphsync request for channel %s: %w", c.channelID, err)
1261+
if err != nil && !xerrors.Is(graphsync.RequestNotFoundErr{}, err) {
1262+
errch <- xerrors.Errorf("cancelling graphsync request for channel %s: %w", c.channelID, err)
1263+
} else {
1264+
errch <- nil
12411265
}
1266+
}()
1267+
1268+
return errch
1269+
}
1270+
1271+
func (c *dtChannel) cancelResponse() chan error {
1272+
errch := make(chan error, 1)
1273+
1274+
// Check if the requester already sent a cancel message,
1275+
// or the response has already been cancelled
1276+
if c.requesterCancelled || c.gsKey == nil {
1277+
errch <- nil
1278+
return errch
12421279
}
12431280

1244-
// Clear the graphsync key to indicate that the request has been cancelled
1281+
// Clear the graphsync key to indicate that the response has been cancelled
1282+
gsKey := c.gsKey
12451283
c.gsKey = nil
12461284

1247-
return nil
1285+
// Cancel the response in a go-routine to avoid locking when the channel's
1286+
// event queue is drained (potentially calling hooks which take the channel
1287+
// lock)
1288+
go func() {
1289+
log.Debugf("%s: cancelling response", c.channelID)
1290+
err := c.gs.CancelResponse(gsKey.p, gsKey.requestID)
1291+
1292+
// Ignore "request not found" errors
1293+
if err != nil && !xerrors.Is(graphsync.RequestNotFoundErr{}, err) {
1294+
errch <- xerrors.Errorf("%s: cancelling response: %w", c.channelID, err)
1295+
} else {
1296+
errch <- nil
1297+
}
1298+
}()
1299+
1300+
return errch
12481301
}
12491302

12501303
// Used in graphsync callbacks to map from graphsync request to the

0 commit comments

Comments
 (0)