Skip to content

Commit cccaf3c

Browse files
committed
UPSTREAM: <carry>: add audit annotations to track etcd state
1 parent 309f240 commit cccaf3c

File tree

1 file changed

+22
-0
lines changed

1 file changed

+22
-0
lines changed

staging/src/k8s.io/apiserver/pkg/storage/etcd3/etcd3retry/retry_etcdclient.go

+22
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@ package etcd3retry
22

33
import (
44
"context"
5+
"k8s.io/apiserver/pkg/audit"
6+
"strings"
57
"time"
68

79
etcdrpc "go.etcd.io/etcd/api/v3/v3rpc/rpctypes"
@@ -32,11 +34,18 @@ func NewRetryingEtcdStorage(delegate storage.Interface) storage.Interface {
3234
return &retryClient{Interface: delegate}
3335
}
3436

37+
func addEtcdAccessAuditAnnotation(ctx context.Context) {
38+
// add an audit annotation indicating we reached out to etcd. This allows our post-processing to exclude requests
39+
// that don't attempt to access etcd from, "how reliably is etcd" calculations.
40+
audit.AddAuditAnnotation(ctx, "apiserver.internal.openshift.io/etcd-access", time.Now().Format(time.RFC3339))
41+
}
42+
3543
// Create adds a new object at a key unless it already exists. 'ttl' is time-to-live
3644
// in seconds (0 means forever). If no error is returned and out is not nil, out will be
3745
// set to the read value from database.
3846
func (c *retryClient) Create(ctx context.Context, key string, obj, out runtime.Object, ttl uint64) error {
3947
return OnError(ctx, DefaultRetry, IsRetriableEtcdError, func() error {
48+
addEtcdAccessAuditAnnotation(ctx)
4049
return c.Interface.Create(ctx, key, obj, out, ttl)
4150
})
4251
}
@@ -45,6 +54,7 @@ func (c *retryClient) Create(ctx context.Context, key string, obj, out runtime.O
4554
// If key didn't exist, it will return NotFound storage error.
4655
func (c *retryClient) Delete(ctx context.Context, key string, out runtime.Object, preconditions *storage.Preconditions, validateDeletion storage.ValidateObjectFunc, cachedExistingObject runtime.Object) error {
4756
return OnError(ctx, DefaultRetry, IsRetriableEtcdError, func() error {
57+
addEtcdAccessAuditAnnotation(ctx)
4858
return c.Interface.Delete(ctx, key, out, preconditions, validateDeletion, cachedExistingObject)
4959
})
5060
}
@@ -59,6 +69,7 @@ func (c *retryClient) Delete(ctx context.Context, key string, out runtime.Object
5969
func (c *retryClient) Watch(ctx context.Context, key string, opts storage.ListOptions) (watch.Interface, error) {
6070
var ret watch.Interface
6171
err := OnError(ctx, DefaultRetry, IsRetriableEtcdError, func() error {
72+
addEtcdAccessAuditAnnotation(ctx)
6273
var innerErr error
6374
ret, innerErr = c.Interface.Watch(ctx, key, opts)
6475
return innerErr
@@ -73,6 +84,7 @@ func (c *retryClient) Watch(ctx context.Context, key string, opts storage.ListOp
7384
// match 'opts.ResourceVersion' according 'opts.ResourceVersionMatch'.
7485
func (c *retryClient) Get(ctx context.Context, key string, opts storage.GetOptions, objPtr runtime.Object) error {
7586
return OnError(ctx, DefaultRetry, IsRetriableEtcdError, func() error {
87+
addEtcdAccessAuditAnnotation(ctx)
7688
return c.Interface.Get(ctx, key, opts, objPtr)
7789
})
7890
}
@@ -85,6 +97,7 @@ func (c *retryClient) Get(ctx context.Context, key string, opts storage.GetOptio
8597
// match 'opts.ResourceVersion' according 'opts.ResourceVersionMatch'.
8698
func (c *retryClient) GetList(ctx context.Context, key string, opts storage.ListOptions, listObj runtime.Object) error {
8799
return OnError(ctx, DefaultRetry, IsRetriableEtcdError, func() error {
100+
addEtcdAccessAuditAnnotation(ctx)
88101
return c.Interface.GetList(ctx, key, opts, listObj)
89102
})
90103
}
@@ -126,6 +139,7 @@ func (c *retryClient) GetList(ctx context.Context, key string, opts storage.List
126139
func (c *retryClient) GuaranteedUpdate(ctx context.Context, key string, destination runtime.Object, ignoreNotFound bool,
127140
preconditions *storage.Preconditions, tryUpdate storage.UpdateFunc, cachedExistingObject runtime.Object) error {
128141
return OnError(ctx, DefaultRetry, IsRetriableEtcdError, func() error {
142+
addEtcdAccessAuditAnnotation(ctx)
129143
return c.Interface.GuaranteedUpdate(ctx, key, destination, ignoreNotFound, preconditions, tryUpdate, cachedExistingObject)
130144
})
131145
}
@@ -153,6 +167,8 @@ func OnError(ctx context.Context, backoff wait.Backoff, retriable func(error) (s
153167
var retry bool
154168
var retryCounter int
155169
err := backoffWithRequestContext(ctx, backoff, func() (bool, error) {
170+
startTime := time.Now()
171+
156172
err := fn()
157173
if retry {
158174
klog.V(1).Infof("etcd retry - counter: %v, lastErrLabel: %s lastError: %v, error: %v", retryCounter, lastErrLabel, lastErr, err)
@@ -162,6 +178,12 @@ func OnError(ctx context.Context, backoff wait.Backoff, retriable func(error) (s
162178
return true, nil
163179
}
164180

181+
// add an audit annotation if we hit a no leader condition so we can track this failure in post-processing CI steps.
182+
// We only mark the first time through. Hopefully there's enough traffic that it doesn't matter
183+
if strings.Contains(err.Error(), "no leader") {
184+
audit.AddAuditAnnotation(ctx, "apiserver.internal.openshift.io/no-leader", startTime.Format(time.RFC3339))
185+
}
186+
165187
lastErrLabel, retry = retriable(err)
166188
if retry {
167189
lastErr = err

0 commit comments

Comments
 (0)