Skip to content

Commit 94a1d0c

Browse files
committed
*: LeaseTimeToLive returns error if leader changed
The old leader demotes lessor and all the leases' expire time will be updated. Instead of returning incorrect remaining TTL, we should return errors to force client retry. Cherry-pick: d3bb6f6 Signed-off-by: Wei Fu <[email protected]>
1 parent b78b214 commit 94a1d0c

File tree

8 files changed

+112
-1
lines changed

8 files changed

+112
-1
lines changed

build.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ GOFAIL_VERSION=$(cd tools/mod && go list -m -f '{{.Version}}' go.etcd.io/gofail)
2727
toggle_failpoints() {
2828
mode="$1"
2929
if command -v gofail >/dev/null 2>&1; then
30-
run gofail "$mode" server/etcdserver/ server/mvcc/ server/wal/ server/mvcc/backend/
30+
run gofail "$mode" server/etcdserver/ server/lease/leasehttp server/mvcc/ server/wal/ server/mvcc/backend/
3131
if [[ "$mode" == "enable" ]]; then
3232
go get go.etcd.io/gofail@"${GOFAIL_VERSION}"
3333
cd ./server && go get go.etcd.io/gofail@"${GOFAIL_VERSION}"

go.sum

+2
Original file line numberDiff line numberDiff line change
@@ -310,6 +310,8 @@ github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9dec
310310
go.etcd.io/bbolt v1.3.2/go.mod h1:IbVyRI1SCnLcuJnV2u8VeU0CEYM7e686BmAb1XKL+uU=
311311
go.etcd.io/bbolt v1.3.9 h1:8x7aARPEXiXbHmtUwAIv7eV2fQFHrLLavdiJ3uzJXoI=
312312
go.etcd.io/bbolt v1.3.9/go.mod h1:zaO32+Ti0PK1ivdPtgMESzuzL2VPoIG1PCQNvOdo/dE=
313+
go.etcd.io/gofail v0.1.0 h1:XItAMIhOojXFQMgrxjnd2EIIHun/d5qL0Pf7FzVTkFg=
314+
go.etcd.io/gofail v0.1.0/go.mod h1:VZBCXYGZhHAinaBiiqYvuDynvahNsAyLFwB3kEHKz1M=
313315
go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU=
314316
go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8=
315317
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.46.0 h1:PzIubN4/sjByhDRHLviCjJuweBXWFZWhghjg7cS28+M=

server/etcdserver/v3_server.go

+12
Original file line numberDiff line numberDiff line change
@@ -377,6 +377,9 @@ func (s *EtcdServer) leaseTimeToLive(ctx context.Context, r *pb.LeaseTimeToLiveR
377377
if err := s.waitAppliedIndex(); err != nil {
378378
return nil, err
379379
}
380+
381+
// gofail: var beforeLookupWhenLeaseTimeToLive struct{}
382+
380383
// primary; timetolive directly from leader
381384
le := s.lessor.Lookup(lease.LeaseID(r.ID))
382385
if le == nil {
@@ -392,6 +395,15 @@ func (s *EtcdServer) leaseTimeToLive(ctx context.Context, r *pb.LeaseTimeToLiveR
392395
}
393396
resp.Keys = kbs
394397
}
398+
399+
// The leasor could be demoted if leader changed during lookup.
400+
// We should return error to force retry instead of returning
401+
// incorrect remaining TTL.
402+
if le.Demoted() {
403+
// NOTE: lease.ErrNotPrimary is not retryable error for
404+
// client. Instead, uses ErrLeaderChanged.
405+
return nil, ErrLeaderChanged
406+
}
395407
return resp, nil
396408
}
397409

server/lease/leasehttp/http.go

+11
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,9 @@ func (h *leaseHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
103103
http.Error(w, ErrLeaseHTTPTimeout.Error(), http.StatusRequestTimeout)
104104
return
105105
}
106+
107+
// gofail: var beforeLookupWhenForwardLeaseTimeToLive struct{}
108+
106109
l := h.l.Lookup(lease.LeaseID(lreq.LeaseTimeToLiveRequest.ID))
107110
if l == nil {
108111
http.Error(w, lease.ErrLeaseNotFound.Error(), http.StatusNotFound)
@@ -126,6 +129,14 @@ func (h *leaseHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
126129
resp.LeaseTimeToLiveResponse.Keys = kbs
127130
}
128131

132+
// The leasor could be demoted if leader changed during lookup.
133+
// We should return error to force retry instead of returning
134+
// incorrect remaining TTL.
135+
if l.Demoted() {
136+
http.Error(w, lease.ErrNotPrimary.Error(), http.StatusInternalServerError)
137+
return
138+
}
139+
129140
v, err = resp.Marshal()
130141
if err != nil {
131142
http.Error(w, err.Error(), http.StatusInternalServerError)

server/lease/lessor.go

+7
Original file line numberDiff line numberDiff line change
@@ -900,6 +900,13 @@ func (l *Lease) forever() {
900900
l.expiry = forever
901901
}
902902

903+
// Demoted returns true if the lease's expiry has been reset to forever.
904+
func (l *Lease) Demoted() bool {
905+
l.expiryMu.Lock()
906+
defer l.expiryMu.Unlock()
907+
return l.expiry == forever
908+
}
909+
903910
// Keys returns all the keys attached to the lease.
904911
func (l *Lease) Keys() []string {
905912
l.mu.RLock()

tests/go.mod

+1
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ require (
3636
go.etcd.io/etcd/pkg/v3 v3.5.13
3737
go.etcd.io/etcd/raft/v3 v3.5.13
3838
go.etcd.io/etcd/server/v3 v3.5.13
39+
go.etcd.io/gofail v0.1.0
3940
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.46.0
4041
go.opentelemetry.io/otel v1.20.0
4142
go.opentelemetry.io/otel/sdk v1.20.0

tests/go.sum

+2
Original file line numberDiff line numberDiff line change
@@ -304,6 +304,8 @@ github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9dec
304304
go.etcd.io/bbolt v1.3.2/go.mod h1:IbVyRI1SCnLcuJnV2u8VeU0CEYM7e686BmAb1XKL+uU=
305305
go.etcd.io/bbolt v1.3.9 h1:8x7aARPEXiXbHmtUwAIv7eV2fQFHrLLavdiJ3uzJXoI=
306306
go.etcd.io/bbolt v1.3.9/go.mod h1:zaO32+Ti0PK1ivdPtgMESzuzL2VPoIG1PCQNvOdo/dE=
307+
go.etcd.io/gofail v0.1.0 h1:XItAMIhOojXFQMgrxjnd2EIIHun/d5qL0Pf7FzVTkFg=
308+
go.etcd.io/gofail v0.1.0/go.mod h1:VZBCXYGZhHAinaBiiqYvuDynvahNsAyLFwB3kEHKz1M=
307309
go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU=
308310
go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8=
309311
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.46.0 h1:PzIubN4/sjByhDRHLviCjJuweBXWFZWhghjg7cS28+M=

tests/integration/v3_lease_test.go

+76
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,14 @@ import (
2222
"testing"
2323
"time"
2424

25+
"github.com/stretchr/testify/assert"
26+
"github.com/stretchr/testify/require"
2527
pb "go.etcd.io/etcd/api/v3/etcdserverpb"
2628
"go.etcd.io/etcd/api/v3/mvccpb"
2729
"go.etcd.io/etcd/api/v3/v3rpc/rpctypes"
2830
"go.etcd.io/etcd/client/pkg/v3/testutil"
31+
clientv3 "go.etcd.io/etcd/client/v3"
32+
gofail "go.etcd.io/gofail/runtime"
2933

3034
"google.golang.org/grpc/codes"
3135
"google.golang.org/grpc/metadata"
@@ -1056,6 +1060,78 @@ func TestV3LeaseRecoverKeyWithMutipleLease(t *testing.T) {
10561060
}
10571061
}
10581062

1063+
func TestV3LeaseTimeToLiveWithLeaderChanged(t *testing.T) {
1064+
t.Run("normal", func(subT *testing.T) {
1065+
testV3LeaseTimeToLiveWithLeaderChanged(subT, "beforeLookupWhenLeaseTimeToLive")
1066+
})
1067+
1068+
t.Run("forward", func(subT *testing.T) {
1069+
testV3LeaseTimeToLiveWithLeaderChanged(subT, "beforeLookupWhenForwardLeaseTimeToLive")
1070+
})
1071+
}
1072+
1073+
func testV3LeaseTimeToLiveWithLeaderChanged(t *testing.T, fpName string) {
1074+
if len(gofail.List()) == 0 {
1075+
t.Skip("please run 'make gofail-enable' before running the test")
1076+
}
1077+
1078+
BeforeTest(t)
1079+
1080+
clus := NewClusterV3(t, &ClusterConfig{Size: 3})
1081+
defer clus.Terminate(t)
1082+
1083+
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
1084+
defer cancel()
1085+
1086+
oldLeadIdx := clus.WaitLeader(t)
1087+
followerIdx := (oldLeadIdx + 1) % 3
1088+
1089+
followerMemberID := clus.Members[followerIdx].ID()
1090+
1091+
oldLeadC := clus.Client(oldLeadIdx)
1092+
1093+
leaseResp, err := oldLeadC.Grant(ctx, 100)
1094+
require.NoError(t, err)
1095+
1096+
require.NoError(t, gofail.Enable(fpName, `sleep("3s")`))
1097+
t.Cleanup(func() {
1098+
terr := gofail.Disable(fpName)
1099+
if terr != nil && terr != gofail.ErrDisabled {
1100+
t.Fatalf("failed to disable %s: %v", fpName, terr)
1101+
}
1102+
})
1103+
1104+
readyCh := make(chan struct{})
1105+
errCh := make(chan error, 1)
1106+
1107+
var targetC *clientv3.Client
1108+
switch fpName {
1109+
case "beforeLookupWhenLeaseTimeToLive":
1110+
targetC = oldLeadC
1111+
case "beforeLookupWhenForwardLeaseTimeToLive":
1112+
targetC = clus.Client((oldLeadIdx + 2) % 3)
1113+
default:
1114+
t.Fatalf("unsupported %s failpoint", fpName)
1115+
}
1116+
1117+
go func() {
1118+
<-readyCh
1119+
time.Sleep(1 * time.Second)
1120+
1121+
_, merr := oldLeadC.MoveLeader(ctx, uint64(followerMemberID))
1122+
assert.NoError(t, gofail.Disable(fpName))
1123+
errCh <- merr
1124+
}()
1125+
1126+
close(readyCh)
1127+
1128+
ttlResp, err := targetC.TimeToLive(ctx, leaseResp.ID)
1129+
require.NoError(t, err)
1130+
require.GreaterOrEqual(t, int64(100), ttlResp.TTL)
1131+
1132+
require.NoError(t, <-errCh)
1133+
}
1134+
10591135
// acquireLeaseAndKey creates a new lease and creates an attached key.
10601136
func acquireLeaseAndKey(clus *ClusterV3, key string) (int64, error) {
10611137
// create lease

0 commit comments

Comments
 (0)