Skip to content

Commit 858db1a

Browse files
committed
http2: surface errors occurring very early in a client conn's lifetime
When we create a new connection for a request, the request should fail if the connection attempt fails. There is a race condition which can cause this to not happen: - net/http sends a request to a http2.Transport - the http2.Transport returns ErrNoCachedConn - net/http creates a new tls.Conn and passes it to the http2.Transport - the http2.Transport adds the conn to its connection pool - the connection immediately encounters an error - the http2.Transport removes the conn from its connection pool - net/http resends the request to the http2.Transport - the http2.Transport returns ErrNoCachedConn, and the process repeates If the request is sent to the http2.Transport before the connection encounters an error, then the request fails. But otherwise, we get stuck in an infinite loop of the http2.Transport asking for a new connection, receiving one, and throwing it away. To fix this, leave a dead connection in the pool for a short while if it has never had a request sent to it. If a dead connection is used for a new request, return an error and remove the connection from the pool. Change-Id: I64eb15a8f1512a6bda52db423072b945fab6f4b5 Reviewed-on: https://go-review.googlesource.com/c/net/+/625398 Reviewed-by: Brad Fitzpatrick <[email protected]> LUCI-TryBot-Result: Go LUCI <[email protected]> Reviewed-by: Jonathan Amsterdam <[email protected]>
1 parent 0aa844c commit 858db1a

File tree

4 files changed

+219
-20
lines changed

4 files changed

+219
-20
lines changed

http2/clientconn_test.go

+23-7
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ package http2
1010
import (
1111
"bytes"
1212
"context"
13+
"crypto/tls"
1314
"fmt"
1415
"io"
1516
"net/http"
@@ -112,27 +113,40 @@ func newTestClientConnFromClientConn(t *testing.T, cc *ClientConn) *testClientCo
112113
cc: cc,
113114
group: cc.t.transportTestHooks.group.(*synctestGroup),
114115
}
115-
cli, srv := synctestNetPipe(tc.group)
116+
117+
// srv is the side controlled by the test.
118+
var srv *synctestNetConn
119+
if cc.tconn == nil {
120+
// If cc.tconn is nil, we're being called with a new conn created by the
121+
// Transport's client pool. This path skips dialing the server, and we
122+
// create a test connection pair here.
123+
cc.tconn, srv = synctestNetPipe(tc.group)
124+
} else {
125+
// If cc.tconn is non-nil, we're in a test which provides a conn to the
126+
// Transport via a TLSNextProto hook. Extract the test connection pair.
127+
if tc, ok := cc.tconn.(*tls.Conn); ok {
128+
// Unwrap any *tls.Conn to the underlying net.Conn,
129+
// to avoid dealing with encryption in tests.
130+
cc.tconn = tc.NetConn()
131+
}
132+
srv = cc.tconn.(*synctestNetConn).peer
133+
}
134+
116135
srv.SetReadDeadline(tc.group.Now())
117136
srv.autoWait = true
118137
tc.netconn = srv
119138
tc.enc = hpack.NewEncoder(&tc.encbuf)
120-
121-
// all writes and reads are finished.
122-
//
123-
// cli is the ClientConn's side, srv is the side controlled by the test.
124-
cc.tconn = cli
125139
tc.fr = NewFramer(srv, srv)
126140
tc.testConnFramer = testConnFramer{
127141
t: t,
128142
fr: tc.fr,
129143
dec: hpack.NewDecoder(initialHeaderTableSize, nil),
130144
}
131-
132145
tc.fr.SetMaxReadFrameSize(10 << 20)
133146
t.Cleanup(func() {
134147
tc.closeWrite()
135148
})
149+
136150
return tc
137151
}
138152

@@ -503,6 +517,8 @@ func newTestTransport(t *testing.T, opts ...any) *testTransport {
503517
o(tr.t1)
504518
case func(*Transport):
505519
o(tr)
520+
case *Transport:
521+
tr = o
506522
}
507523
}
508524
tt.tr = tr

http2/netconn_test.go

+8-2
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,11 @@ func synctestNetPipe(group *synctestGroup) (r, w *synctestNetConn) {
2828
s2addr := net.TCPAddrFromAddrPort(netip.MustParseAddrPort("127.0.0.1:8001"))
2929
s1 := newSynctestNetConnHalf(s1addr)
3030
s2 := newSynctestNetConnHalf(s2addr)
31-
return &synctestNetConn{group: group, loc: s1, rem: s2},
32-
&synctestNetConn{group: group, loc: s2, rem: s1}
31+
r = &synctestNetConn{group: group, loc: s1, rem: s2}
32+
w = &synctestNetConn{group: group, loc: s2, rem: s1}
33+
r.peer = w
34+
w.peer = r
35+
return r, w
3336
}
3437

3538
// A synctestNetConn is one endpoint of the connection created by synctestNetPipe.
@@ -43,6 +46,9 @@ type synctestNetConn struct {
4346

4447
// When set, group.Wait is automatically called before reads and after writes.
4548
autoWait bool
49+
50+
// peer is the other endpoint.
51+
peer *synctestNetConn
4652
}
4753

4854
// Read reads data from the connection.

http2/transport.go

+75-11
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,20 @@ func (t *Transport) markNewGoroutine() {
202202
}
203203
}
204204

205+
func (t *Transport) now() time.Time {
206+
if t != nil && t.transportTestHooks != nil {
207+
return t.transportTestHooks.group.Now()
208+
}
209+
return time.Now()
210+
}
211+
212+
func (t *Transport) timeSince(when time.Time) time.Duration {
213+
if t != nil && t.transportTestHooks != nil {
214+
return t.now().Sub(when)
215+
}
216+
return time.Since(when)
217+
}
218+
205219
// newTimer creates a new time.Timer, or a synthetic timer in tests.
206220
func (t *Transport) newTimer(d time.Duration) timer {
207221
if t.transportTestHooks != nil {
@@ -343,7 +357,7 @@ type ClientConn struct {
343357
t *Transport
344358
tconn net.Conn // usually *tls.Conn, except specialized impls
345359
tlsState *tls.ConnectionState // nil only for specialized impls
346-
reused uint32 // whether conn is being reused; atomic
360+
atomicReused uint32 // whether conn is being reused; atomic
347361
singleUse bool // whether being used for a single http.Request
348362
getConnCalled bool // used by clientConnPool
349363

@@ -609,7 +623,7 @@ func (t *Transport) RoundTripOpt(req *http.Request, opt RoundTripOpt) (*http.Res
609623
t.vlogf("http2: Transport failed to get client conn for %s: %v", addr, err)
610624
return nil, err
611625
}
612-
reused := !atomic.CompareAndSwapUint32(&cc.reused, 0, 1)
626+
reused := !atomic.CompareAndSwapUint32(&cc.atomicReused, 0, 1)
613627
traceGotConn(req, cc, reused)
614628
res, err := cc.RoundTrip(req)
615629
if err != nil && retry <= 6 {
@@ -634,6 +648,22 @@ func (t *Transport) RoundTripOpt(req *http.Request, opt RoundTripOpt) (*http.Res
634648
}
635649
}
636650
}
651+
if err == errClientConnNotEstablished {
652+
// This ClientConn was created recently,
653+
// this is the first request to use it,
654+
// and the connection is closed and not usable.
655+
//
656+
// In this state, cc.idleTimer will remove the conn from the pool
657+
// when it fires. Stop the timer and remove it here so future requests
658+
// won't try to use this connection.
659+
//
660+
// If the timer has already fired and we're racing it, the redundant
661+
// call to MarkDead is harmless.
662+
if cc.idleTimer != nil {
663+
cc.idleTimer.Stop()
664+
}
665+
t.connPool().MarkDead(cc)
666+
}
637667
if err != nil {
638668
t.vlogf("RoundTrip failure: %v", err)
639669
return nil, err
@@ -652,9 +682,10 @@ func (t *Transport) CloseIdleConnections() {
652682
}
653683

654684
var (
655-
errClientConnClosed = errors.New("http2: client conn is closed")
656-
errClientConnUnusable = errors.New("http2: client conn not usable")
657-
errClientConnGotGoAway = errors.New("http2: Transport received Server's graceful shutdown GOAWAY")
685+
errClientConnClosed = errors.New("http2: client conn is closed")
686+
errClientConnUnusable = errors.New("http2: client conn not usable")
687+
errClientConnNotEstablished = errors.New("http2: client conn could not be established")
688+
errClientConnGotGoAway = errors.New("http2: Transport received Server's graceful shutdown GOAWAY")
658689
)
659690

660691
// shouldRetryRequest is called by RoundTrip when a request fails to get
@@ -793,6 +824,7 @@ func (t *Transport) newClientConn(c net.Conn, singleUse bool) (*ClientConn, erro
793824
pingTimeout: conf.PingTimeout,
794825
pings: make(map[[8]byte]chan struct{}),
795826
reqHeaderMu: make(chan struct{}, 1),
827+
lastActive: t.now(),
796828
}
797829
var group synctestGroupInterface
798830
if t.transportTestHooks != nil {
@@ -1041,6 +1073,16 @@ func (cc *ClientConn) idleStateLocked() (st clientConnIdleState) {
10411073
!cc.doNotReuse &&
10421074
int64(cc.nextStreamID)+2*int64(cc.pendingRequests) < math.MaxInt32 &&
10431075
!cc.tooIdleLocked()
1076+
1077+
// If this connection has never been used for a request and is closed,
1078+
// then let it take a request (which will fail).
1079+
//
1080+
// This avoids a situation where an error early in a connection's lifetime
1081+
// goes unreported.
1082+
if cc.nextStreamID == 1 && cc.streamsReserved == 0 && cc.closed {
1083+
st.canTakeNewRequest = true
1084+
}
1085+
10441086
return
10451087
}
10461088

@@ -1062,7 +1104,7 @@ func (cc *ClientConn) tooIdleLocked() bool {
10621104
// times are compared based on their wall time. We don't want
10631105
// to reuse a connection that's been sitting idle during
10641106
// VM/laptop suspend if monotonic time was also frozen.
1065-
return cc.idleTimeout != 0 && !cc.lastIdle.IsZero() && time.Since(cc.lastIdle.Round(0)) > cc.idleTimeout
1107+
return cc.idleTimeout != 0 && !cc.lastIdle.IsZero() && cc.t.timeSince(cc.lastIdle.Round(0)) > cc.idleTimeout
10661108
}
10671109

10681110
// onIdleTimeout is called from a time.AfterFunc goroutine. It will
@@ -1706,7 +1748,12 @@ func (cs *clientStream) cleanupWriteRequest(err error) {
17061748
// Must hold cc.mu.
17071749
func (cc *ClientConn) awaitOpenSlotForStreamLocked(cs *clientStream) error {
17081750
for {
1709-
cc.lastActive = time.Now()
1751+
if cc.closed && cc.nextStreamID == 1 && cc.streamsReserved == 0 {
1752+
// This is the very first request sent to this connection.
1753+
// Return a fatal error which aborts the retry loop.
1754+
return errClientConnNotEstablished
1755+
}
1756+
cc.lastActive = cc.t.now()
17101757
if cc.closed || !cc.canTakeNewRequestLocked() {
17111758
return errClientConnUnusable
17121759
}
@@ -2253,10 +2300,10 @@ func (cc *ClientConn) forgetStreamID(id uint32) {
22532300
if len(cc.streams) != slen-1 {
22542301
panic("forgetting unknown stream id")
22552302
}
2256-
cc.lastActive = time.Now()
2303+
cc.lastActive = cc.t.now()
22572304
if len(cc.streams) == 0 && cc.idleTimer != nil {
22582305
cc.idleTimer.Reset(cc.idleTimeout)
2259-
cc.lastIdle = time.Now()
2306+
cc.lastIdle = cc.t.now()
22602307
}
22612308
// Wake up writeRequestBody via clientStream.awaitFlowControl and
22622309
// wake up RoundTrip if there is a pending request.
@@ -2316,7 +2363,6 @@ func isEOFOrNetReadError(err error) bool {
23162363

23172364
func (rl *clientConnReadLoop) cleanup() {
23182365
cc := rl.cc
2319-
cc.t.connPool().MarkDead(cc)
23202366
defer cc.closeConn()
23212367
defer close(cc.readerDone)
23222368

@@ -2340,6 +2386,24 @@ func (rl *clientConnReadLoop) cleanup() {
23402386
}
23412387
cc.closed = true
23422388

2389+
// If the connection has never been used, and has been open for only a short time,
2390+
// leave it in the connection pool for a little while.
2391+
//
2392+
// This avoids a situation where new connections are constantly created,
2393+
// added to the pool, fail, and are removed from the pool, without any error
2394+
// being surfaced to the user.
2395+
const unusedWaitTime = 5 * time.Second
2396+
idleTime := cc.t.now().Sub(cc.lastActive)
2397+
if atomic.LoadUint32(&cc.atomicReused) == 0 && idleTime < unusedWaitTime {
2398+
cc.idleTimer = cc.t.afterFunc(unusedWaitTime-idleTime, func() {
2399+
cc.t.connPool().MarkDead(cc)
2400+
})
2401+
} else {
2402+
cc.mu.Unlock() // avoid any deadlocks in MarkDead
2403+
cc.t.connPool().MarkDead(cc)
2404+
cc.mu.Lock()
2405+
}
2406+
23432407
for _, cs := range cc.streams {
23442408
select {
23452409
case <-cs.peerClosed:
@@ -3332,7 +3396,7 @@ func traceGotConn(req *http.Request, cc *ClientConn, reused bool) {
33323396
cc.mu.Lock()
33333397
ci.WasIdle = len(cc.streams) == 0 && reused
33343398
if ci.WasIdle && !cc.lastActive.IsZero() {
3335-
ci.IdleTime = time.Since(cc.lastActive)
3399+
ci.IdleTime = cc.t.timeSince(cc.lastActive)
33363400
}
33373401
cc.mu.Unlock()
33383402

http2/transport_test.go

+113
Original file line numberDiff line numberDiff line change
@@ -5638,3 +5638,116 @@ func TestTransportConnBecomesUnresponsive(t *testing.T) {
56385638
rt2.wantStatus(200)
56395639
rt2.response().Body.Close()
56405640
}
5641+
5642+
// Test that the Transport can use a conn provided to it by a TLSNextProto hook.
5643+
func TestTransportTLSNextProtoConnOK(t *testing.T) {
5644+
t1 := &http.Transport{}
5645+
t2, _ := ConfigureTransports(t1)
5646+
tt := newTestTransport(t, t2)
5647+
5648+
// Create a new, fake connection and pass it to the Transport via the TLSNextProto hook.
5649+
cli, _ := synctestNetPipe(tt.group)
5650+
cliTLS := tls.Client(cli, tlsConfigInsecure)
5651+
go func() {
5652+
tt.group.Join()
5653+
t1.TLSNextProto["h2"]("dummy.tld", cliTLS)
5654+
}()
5655+
tt.sync()
5656+
tc := tt.getConn()
5657+
tc.greet()
5658+
5659+
// Send a request on the Transport.
5660+
// It uses the conn we provided.
5661+
req := must(http.NewRequest("GET", "https://dummy.tld/", nil))
5662+
rt := tt.roundTrip(req)
5663+
tc.wantHeaders(wantHeader{
5664+
streamID: 1,
5665+
endStream: true,
5666+
header: http.Header{
5667+
":authority": []string{"dummy.tld"},
5668+
":method": []string{"GET"},
5669+
":path": []string{"/"},
5670+
},
5671+
})
5672+
tc.writeHeaders(HeadersFrameParam{
5673+
StreamID: 1,
5674+
EndHeaders: true,
5675+
EndStream: true,
5676+
BlockFragment: tc.makeHeaderBlockFragment(
5677+
":status", "200",
5678+
),
5679+
})
5680+
rt.wantStatus(200)
5681+
rt.wantBody(nil)
5682+
}
5683+
5684+
// Test the case where a conn provided via a TLSNextProto hook immediately encounters an error.
5685+
func TestTransportTLSNextProtoConnImmediateFailureUsed(t *testing.T) {
5686+
t1 := &http.Transport{}
5687+
t2, _ := ConfigureTransports(t1)
5688+
tt := newTestTransport(t, t2)
5689+
5690+
// Create a new, fake connection and pass it to the Transport via the TLSNextProto hook.
5691+
cli, _ := synctestNetPipe(tt.group)
5692+
cliTLS := tls.Client(cli, tlsConfigInsecure)
5693+
go func() {
5694+
tt.group.Join()
5695+
t1.TLSNextProto["h2"]("dummy.tld", cliTLS)
5696+
}()
5697+
tt.sync()
5698+
tc := tt.getConn()
5699+
5700+
// The connection encounters an error before we send a request that uses it.
5701+
tc.closeWrite()
5702+
5703+
// Send a request on the Transport.
5704+
//
5705+
// It should fail, because we have no usable connections, but not with ErrNoCachedConn.
5706+
req := must(http.NewRequest("GET", "https://dummy.tld/", nil))
5707+
rt := tt.roundTrip(req)
5708+
if err := rt.err(); err == nil || errors.Is(err, ErrNoCachedConn) {
5709+
t.Fatalf("RoundTrip with broken conn: got %v, want an error other than ErrNoCachedConn", err)
5710+
}
5711+
5712+
// Send the request again.
5713+
// This time it should fail with ErrNoCachedConn,
5714+
// because the dead conn has been removed from the pool.
5715+
rt = tt.roundTrip(req)
5716+
if err := rt.err(); !errors.Is(err, ErrNoCachedConn) {
5717+
t.Fatalf("RoundTrip after broken conn is used: got %v, want ErrNoCachedConn", err)
5718+
}
5719+
}
5720+
5721+
// Test the case where a conn provided via a TLSNextProto hook immediately encounters an error,
5722+
// but no requests are sent which would use the bad connection.
5723+
func TestTransportTLSNextProtoConnImmediateFailureUnused(t *testing.T) {
5724+
t1 := &http.Transport{}
5725+
t2, _ := ConfigureTransports(t1)
5726+
tt := newTestTransport(t, t2)
5727+
5728+
// Create a new, fake connection and pass it to the Transport via the TLSNextProto hook.
5729+
cli, _ := synctestNetPipe(tt.group)
5730+
cliTLS := tls.Client(cli, tlsConfigInsecure)
5731+
go func() {
5732+
tt.group.Join()
5733+
t1.TLSNextProto["h2"]("dummy.tld", cliTLS)
5734+
}()
5735+
tt.sync()
5736+
tc := tt.getConn()
5737+
5738+
// The connection encounters an error before we send a request that uses it.
5739+
tc.closeWrite()
5740+
5741+
// Some time passes.
5742+
// The dead connection is removed from the pool.
5743+
tc.advance(10 * time.Second)
5744+
5745+
// Send a request on the Transport.
5746+
//
5747+
// It should fail with ErrNoCachedConn, because the pool contains no conns.
5748+
req := must(http.NewRequest("GET", "https://dummy.tld/", nil))
5749+
rt := tt.roundTrip(req)
5750+
if err := rt.err(); !errors.Is(err, ErrNoCachedConn) {
5751+
t.Fatalf("RoundTrip after broken conn expires: got %v, want ErrNoCachedConn", err)
5752+
}
5753+
}

0 commit comments

Comments
 (0)