Skip to content

Commit fd0e055

Browse files
committed
Add state to snapshot create and configurable retry logic
Signed-off-by: Grant Griffiths <[email protected]>
1 parent 89889f0 commit fd0e055

File tree

7 files changed

+172
-47
lines changed

7 files changed

+172
-47
lines changed

cmd/csi-snapshotter/main.go

+4
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,8 @@ var (
6262
csiAddress = flag.String("csi-address", "/run/csi/socket", "Address of the CSI driver socket.")
6363
createSnapshotContentRetryCount = flag.Int("create-snapshotcontent-retrycount", 5, "Number of retries when we create a snapshot content object for a snapshot.")
6464
createSnapshotContentInterval = flag.Duration("create-snapshotcontent-interval", 10*time.Second, "Interval between retries when we create a snapshot content object for a snapshot.")
65+
retryIntervalStart = flag.Duration("retry-interval-start", time.Second, "Initial retry interval of failed snapshot creation. It doubles with each failure, up to retry-interval-max.")
66+
retryIntervalMax = flag.Duration("retry-interval-max", 5*time.Minute, "Maximum retry interval of failed snapshot creation.")
6567
resyncPeriod = flag.Duration("resync-period", 60*time.Second, "Resync interval of the controller.")
6668
snapshotNamePrefix = flag.String("snapshot-name-prefix", "snapshot", "Prefix to apply to the name of a created snapshot")
6769
snapshotNameUUIDLength = flag.Int("snapshot-name-uuid-length", -1, "Length in characters for the generated uuid of a created snapshot. Defaults behavior is to NOT truncate.")
@@ -176,6 +178,8 @@ func main() {
176178
coreFactory.Core().V1().PersistentVolumeClaims(),
177179
*createSnapshotContentRetryCount,
178180
*createSnapshotContentInterval,
181+
*retryIntervalStart,
182+
*retryIntervalMax,
179183
snapShotter,
180184
*csiTimeout,
181185
*resyncPeriod,

pkg/controller/csi_handler.go

+5-4
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ import (
3030

3131
// Handler is responsible for handling VolumeSnapshot events from informer.
3232
type Handler interface {
33-
CreateSnapshot(snapshot *crdv1.VolumeSnapshot, volume *v1.PersistentVolume, parameters map[string]string, snapshotterCredentials map[string]string) (string, string, time.Time, int64, bool, error)
33+
CreateSnapshot(snapshot *crdv1.VolumeSnapshot, volume *v1.PersistentVolume, parameters map[string]string, snapshotterCredentials map[string]string) (string, string, time.Time, int64, bool, snapshotter.SnapshottingState, error)
3434
DeleteSnapshot(content *crdv1.VolumeSnapshotContent, snapshotterCredentials map[string]string) error
3535
GetSnapshotStatus(content *crdv1.VolumeSnapshotContent) (bool, time.Time, int64, error)
3636
}
@@ -58,19 +58,20 @@ func NewCSIHandler(
5858
}
5959
}
6060

61-
func (handler *csiHandler) CreateSnapshot(snapshot *crdv1.VolumeSnapshot, volume *v1.PersistentVolume, parameters map[string]string, snapshotterCredentials map[string]string) (string, string, time.Time, int64, bool, error) {
61+
func (handler *csiHandler) CreateSnapshot(snapshot *crdv1.VolumeSnapshot, volume *v1.PersistentVolume, parameters map[string]string, snapshotterCredentials map[string]string) (string, string, time.Time, int64, bool, snapshotter.SnapshottingState, error) {
6262

6363
ctx, cancel := context.WithTimeout(context.Background(), handler.timeout)
6464
defer cancel()
6565

6666
snapshotName, err := makeSnapshotName(handler.snapshotNamePrefix, string(snapshot.UID), handler.snapshotNameUUIDLength)
6767
if err != nil {
68-
return "", "", time.Time{}, 0, false, err
68+
return "", "", time.Time{}, 0, false, snapshotter.SnapshottingFinished, err
6969
}
7070
newParameters, err := removePrefixedParameters(parameters)
7171
if err != nil {
72-
return "", "", time.Time{}, 0, false, fmt.Errorf("failed to remove CSI Parameters of prefixed keys: %v", err)
72+
return "", "", time.Time{}, 0, false, snapshotter.SnapshottingFinished, fmt.Errorf("failed to remove CSI Parameters of prefixed keys: %v", err)
7373
}
74+
7475
return handler.snapshotter.CreateSnapshot(ctx, snapshotName, volume, newParameters, snapshotterCredentials)
7576
}
7677

pkg/controller/framework_test.go

+8-4
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ import (
3535
snapshotscheme "github.com/kubernetes-csi/external-snapshotter/pkg/client/clientset/versioned/scheme"
3636
informers "github.com/kubernetes-csi/external-snapshotter/pkg/client/informers/externalversions"
3737
storagelisters "github.com/kubernetes-csi/external-snapshotter/pkg/client/listers/volumesnapshot/v1beta1"
38+
"github.com/kubernetes-csi/external-snapshotter/pkg/snapshotter"
3839
v1 "k8s.io/api/core/v1"
3940
storagev1 "k8s.io/api/storage/v1"
4041
"k8s.io/apimachinery/pkg/api/resource"
@@ -757,6 +758,8 @@ func newTestController(kubeClient kubernetes.Interface, clientset clientset.Inte
757758
coreFactory.Core().V1().PersistentVolumeClaims(),
758759
3,
759760
5*time.Millisecond,
761+
5*time.Millisecond,
762+
10*time.Second,
760763
fakeSnapshot,
761764
5*time.Millisecond,
762765
60*time.Second,
@@ -1372,10 +1375,10 @@ type fakeSnapshotter struct {
13721375
t *testing.T
13731376
}
13741377

1375-
func (f *fakeSnapshotter) CreateSnapshot(ctx context.Context, snapshotName string, volume *v1.PersistentVolume, parameters map[string]string, snapshotterCredentials map[string]string) (string, string, time.Time, int64, bool, error) {
1378+
func (f *fakeSnapshotter) CreateSnapshot(ctx context.Context, snapshotName string, volume *v1.PersistentVolume, parameters map[string]string, snapshotterCredentials map[string]string) (string, string, time.Time, int64, bool, snapshotter.SnapshottingState, error) {
13761379
if f.createCallCounter >= len(f.createCalls) {
13771380
f.t.Errorf("Unexpected CSI Create Snapshot call: snapshotName=%s, volume=%v, index: %d, calls: %+v", snapshotName, volume.Name, f.createCallCounter, f.createCalls)
1378-
return "", "", time.Time{}, 0, false, fmt.Errorf("unexpected call")
1381+
return "", "", time.Time{}, 0, false, snapshotter.SnapshottingFinished, fmt.Errorf("unexpected call")
13791382
}
13801383
call := f.createCalls[f.createCallCounter]
13811384
f.createCallCounter++
@@ -1402,9 +1405,10 @@ func (f *fakeSnapshotter) CreateSnapshot(ctx context.Context, snapshotName strin
14021405
}
14031406

14041407
if err != nil {
1405-
return "", "", time.Time{}, 0, false, fmt.Errorf("unexpected call")
1408+
return "", "", time.Time{}, 0, false, snapshotter.SnapshottingFinished, fmt.Errorf("unexpected call")
14061409
}
1407-
return call.driverName, call.snapshotId, call.creationTime, call.size, call.readyToUse, call.err
1410+
1411+
return call.driverName, call.snapshotId, call.creationTime, call.size, call.readyToUse, snapshotter.SnapshottingFinished, call.err
14081412
}
14091413

14101414
func (f *fakeSnapshotter) DeleteSnapshot(ctx context.Context, snapshotID string, snapshotterCredentials map[string]string) error {

pkg/controller/snapshot_controller.go

+37-19
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,8 @@ package controller
1818

1919
import (
2020
"fmt"
21-
"strings"
22-
"time"
23-
2421
crdv1 "github.com/kubernetes-csi/external-snapshotter/pkg/apis/volumesnapshot/v1beta1"
22+
"github.com/kubernetes-csi/external-snapshotter/pkg/snapshotter"
2523
v1 "k8s.io/api/core/v1"
2624
storagev1 "k8s.io/api/storage/v1"
2725
apierrs "k8s.io/apimachinery/pkg/api/errors"
@@ -34,6 +32,8 @@ import (
3432
"k8s.io/kubernetes/pkg/util/goroutinemap"
3533
"k8s.io/kubernetes/pkg/util/goroutinemap/exponentialbackoff"
3634
"k8s.io/kubernetes/pkg/util/slice"
35+
"strings"
36+
"time"
3737
)
3838

3939
// ==================================================================
@@ -363,20 +363,37 @@ func (ctrl *csiSnapshotController) storeContentUpdate(content interface{}) (bool
363363

364364
// createSnapshot starts new asynchronous operation to create snapshot
365365
func (ctrl *csiSnapshotController) createSnapshot(snapshot *crdv1.VolumeSnapshot) error {
366-
klog.V(5).Infof("createSnapshot[%s]: started", snapshotKey(snapshot))
367-
opName := fmt.Sprintf("create-%s[%s]", snapshotKey(snapshot), string(snapshot.UID))
366+
key := snapshotKey(snapshot)
367+
klog.V(5).Infof("createSnapshot[%s]: started", key)
368+
opName := fmt.Sprintf("create-%s[%s]", key, string(snapshot.UID))
368369
ctrl.scheduleOperation(opName, func() error {
369-
snapshotObj, err := ctrl.createSnapshotOperation(snapshot)
370+
snapshotObj, state, err := ctrl.createSnapshotOperation(snapshot)
370371
if err != nil {
371372
ctrl.updateSnapshotErrorStatusWithEvent(snapshot, v1.EventTypeWarning, "SnapshotCreationFailed", fmt.Sprintf("Failed to create snapshot: %v", err))
372373
klog.Errorf("createSnapshot [%s]: error occurred in createSnapshotOperation: %v", opName, err)
374+
375+
// Handle state:
376+
if state == snapshotter.SnapshottingFinished {
377+
// Snapshotting finished, remove obj from snapshotsInProgress.
378+
ctrl.snapshotsInProgress.Delete(key)
379+
} else if state == snapshotter.SnapshottingInBackground {
380+
// Snapshotting still in progress.
381+
klog.V(4).Infof("createSnapshot [%s]: Temporary error received, adding Snapshot back in queue: %v", key, err)
382+
ctrl.snapshotsInProgress.Store(key, snapshotObj)
383+
} else {
384+
// State is SnapshottingNoChange. Don't change snapshotsInProgress.
385+
}
386+
373387
return err
374388
}
389+
390+
// If no errors, update the snapshot.
375391
_, updateErr := ctrl.storeSnapshotUpdate(snapshotObj)
376392
if updateErr != nil {
377393
// We will get a "snapshot update" event soon, so this is not a big error
378-
klog.V(4).Infof("createSnapshot [%s]: cannot update internal cache: %v", snapshotKey(snapshotObj), updateErr)
394+
klog.V(4).Infof("createSnapshot [%s]: cannot update internal cache: %v", key, updateErr)
379395
}
396+
380397
return nil
381398
})
382399
return nil
@@ -588,7 +605,7 @@ func (ctrl *csiSnapshotController) checkandUpdateBoundSnapshotStatusOperation(sn
588605
if err != nil {
589606
return nil, err
590607
}
591-
driverName, snapshotID, creationTime, size, readyToUse, err = ctrl.handler.CreateSnapshot(snapshot, volume, class.Parameters, snapshotterCredentials)
608+
driverName, snapshotID, creationTime, size, readyToUse, _, err = ctrl.handler.CreateSnapshot(snapshot, volume, class.Parameters, snapshotterCredentials)
592609
if err != nil {
593610
klog.Errorf("checkandUpdateBoundSnapshotStatusOperation: failed to call create snapshot to check whether the snapshot is ready to use %q", err)
594611
return nil, err
@@ -622,35 +639,35 @@ func (ctrl *csiSnapshotController) checkandUpdateBoundSnapshotStatusOperation(sn
622639
// 2. Update VolumeSnapshot status with creationtimestamp information
623640
// 3. Create the VolumeSnapshotContent object with the snapshot id information.
624641
// 4. Bind the VolumeSnapshot and VolumeSnapshotContent object
625-
func (ctrl *csiSnapshotController) createSnapshotOperation(snapshot *crdv1.VolumeSnapshot) (*crdv1.VolumeSnapshot, error) {
642+
func (ctrl *csiSnapshotController) createSnapshotOperation(snapshot *crdv1.VolumeSnapshot) (*crdv1.VolumeSnapshot, snapshotter.SnapshottingState, error) {
626643
klog.Infof("createSnapshot: Creating snapshot %s through the plugin ...", snapshotKey(snapshot))
627644

628645
if snapshot.Status != nil && snapshot.Status.Error != nil && snapshot.Status.Error.Message != nil && !isControllerUpdateFailError(snapshot.Status.Error) {
629646
klog.V(4).Infof("error is already set in snapshot, do not retry to create: %s", *snapshot.Status.Error.Message)
630-
return snapshot, nil
647+
return snapshot, snapshotter.SnapshottingNoChange, nil
631648
}
632649

633650
// If PVC is not being deleted and finalizer is not added yet, a finalizer should be added.
634651
klog.V(5).Infof("createSnapshotOperation: Check if PVC is not being deleted and add Finalizer for source of snapshot [%s] if needed", snapshot.Name)
635652
err := ctrl.ensureSnapshotSourceFinalizer(snapshot)
636653
if err != nil {
637654
klog.Errorf("createSnapshotOperation failed to add finalizer for source of snapshot %s", err)
638-
return nil, err
655+
return nil, snapshotter.SnapshottingNoChange, err
639656
}
640657

641658
class, volume, contentName, snapshotterSecretRef, err := ctrl.getCreateSnapshotInput(snapshot)
642659
if err != nil {
643-
return nil, fmt.Errorf("failed to get input parameters to create snapshot %s: %q", snapshot.Name, err)
660+
return nil, snapshotter.SnapshottingNoChange, fmt.Errorf("failed to get input parameters to create snapshot %s: %q", snapshot.Name, err)
644661
}
645662

646663
snapshotterCredentials, err := getCredentials(ctrl.client, snapshotterSecretRef)
647664
if err != nil {
648-
return nil, err
665+
return nil, snapshotter.SnapshottingNoChange, err
649666
}
650667

651-
driverName, snapshotID, creationTime, size, readyToUse, err := ctrl.handler.CreateSnapshot(snapshot, volume, class.Parameters, snapshotterCredentials)
668+
driverName, snapshotID, creationTime, size, readyToUse, state, err := ctrl.handler.CreateSnapshot(snapshot, volume, class.Parameters, snapshotterCredentials)
652669
if err != nil {
653-
return nil, fmt.Errorf("failed to take snapshot of the volume, %s: %q", volume.Name, err)
670+
return nil, state, fmt.Errorf("failed to take snapshot of the volume, %s: %q", volume.Name, err)
654671
}
655672

656673
klog.V(5).Infof("Created snapshot: driver %s, snapshotId %s, creationTime %v, size %d, readyToUse %t", driverName, snapshotID, creationTime, size, readyToUse)
@@ -667,12 +684,12 @@ func (ctrl *csiSnapshotController) createSnapshotOperation(snapshot *crdv1.Volum
667684
}
668685

669686
if err != nil {
670-
return nil, err
687+
return nil, snapshotter.SnapshottingInBackground, err
671688
}
672689
// Create VolumeSnapshotContent in the database
673690
snapshotRef, err := ref.GetReference(scheme.Scheme, snapshot)
674691
if err != nil {
675-
return nil, err
692+
return nil, snapshotter.SnapshottingInBackground, err
676693
}
677694

678695
timestamp := creationTime.UnixNano()
@@ -730,9 +747,10 @@ func (ctrl *csiSnapshotController) createSnapshotOperation(snapshot *crdv1.Volum
730747
strerr := fmt.Sprintf("Error creating volume snapshot content object for snapshot %s: %v.", snapshotKey(snapshot), err)
731748
klog.Error(strerr)
732749
ctrl.eventRecorder.Event(newSnapshot, v1.EventTypeWarning, "CreateSnapshotContentFailed", strerr)
733-
return nil, newControllerUpdateError(snapshotKey(snapshot), err.Error())
750+
return nil, snapshotter.SnapshottingInBackground, newControllerUpdateError(snapshotKey(snapshot), err.Error())
734751
}
735-
return newSnapshot, nil
752+
753+
return newSnapshot, snapshotter.SnapshottingFinished, nil
736754
}
737755

738756
// Delete a snapshot

pkg/controller/snapshot_controller_base.go

+52-12
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ package controller
1818

1919
import (
2020
"fmt"
21+
"sync"
2122
"time"
2223

2324
crdv1 "github.com/kubernetes-csi/external-snapshotter/pkg/apis/volumesnapshot/v1beta1"
@@ -50,6 +51,9 @@ type csiSnapshotController struct {
5051
snapshotQueue workqueue.RateLimitingInterface
5152
contentQueue workqueue.RateLimitingInterface
5253

54+
// Map of snapshot key (as produced by snapshotKey) -> *crdv1.VolumeSnapshot for all snapshots in progress in the background.
55+
snapshotsInProgress sync.Map
56+
5357
snapshotLister storagelisters.VolumeSnapshotLister
5458
snapshotListerSynced cache.InformerSynced
5559
contentLister storagelisters.VolumeSnapshotContentLister
@@ -68,6 +72,8 @@ type csiSnapshotController struct {
6872

6973
createSnapshotContentRetryCount int
7074
createSnapshotContentInterval time.Duration
75+
retryIntervalStart time.Duration
76+
retryIntervalMax time.Duration
7177
resyncPeriod time.Duration
7278
}
7379

@@ -82,6 +88,8 @@ func NewCSISnapshotController(
8288
pvcInformer coreinformers.PersistentVolumeClaimInformer,
8389
createSnapshotContentRetryCount int,
8490
createSnapshotContentInterval time.Duration,
91+
retryIntervalStart time.Duration,
92+
retryIntervalMax time.Duration,
8593
snapshotter snapshotter.Snapshotter,
8694
timeout time.Duration,
8795
resyncPeriod time.Duration,
@@ -103,10 +111,12 @@ func NewCSISnapshotController(
103111
runningOperations: goroutinemap.NewGoRoutineMap(true),
104112
createSnapshotContentRetryCount: createSnapshotContentRetryCount,
105113
createSnapshotContentInterval: createSnapshotContentInterval,
114+
retryIntervalStart: retryIntervalStart,
115+
retryIntervalMax: retryIntervalMax,
106116
resyncPeriod: resyncPeriod,
107117
snapshotStore: cache.NewStore(cache.DeletionHandlingMetaNamespaceKeyFunc),
108118
contentStore: cache.NewStore(cache.DeletionHandlingMetaNamespaceKeyFunc),
109-
snapshotQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "csi-snapshotter-snapshot"),
119+
snapshotQueue: workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(retryIntervalStart, retryIntervalMax), "csi-snapshotter-snapshot"),
110120
contentQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "csi-snapshotter-content"),
111121
}
112122

@@ -215,22 +225,38 @@ func (ctrl *csiSnapshotController) snapshotWorker() {
215225
klog.Errorf("error getting namespace & name of snapshot %q to get snapshot from informer: %v", key, err)
216226
return false
217227
}
218-
snapshot, err := ctrl.snapshotLister.VolumeSnapshots(namespace).Get(name)
219-
if err == nil {
220-
// The volume snapshot still exists in informer cache, the event must have
221-
// been add/update/sync
228+
229+
// Attempt to get snapshot from the informer
230+
var snapshot *crdv1.VolumeSnapshot
231+
snapshot, err = ctrl.snapshotLister.VolumeSnapshots(namespace).Get(name)
232+
if err != nil && !errors.IsNotFound(err) {
233+
klog.V(2).Infof("error getting snapshot %q from informer: %v", key, err)
234+
return false
235+
} else if errors.IsNotFound(err) {
236+
// Check snapshotsInProgress for the snapshot if not found from the informer
237+
inProgressObj, ok := ctrl.snapshotsInProgress.Load(key)
238+
if ok {
239+
snapshot, ok = inProgressObj.(*crdv1.VolumeSnapshot)
240+
if !ok {
241+
klog.Errorf("expected vs, got %+v", inProgressObj)
242+
return false
243+
}
244+
}
245+
246+
}
247+
248+
if snapshot != nil {
249+
// If the volume snapshot still exists in informer cache, the event must have
250+
// been add/update/sync. Otherwise, the volume snapshot was still in progress.
222251
newSnapshot, err := ctrl.checkAndUpdateSnapshotClass(snapshot)
223252
if err == nil {
224253
klog.V(5).Infof("passed checkAndUpdateSnapshotClass for snapshot %q", key)
225254
ctrl.updateSnapshot(newSnapshot)
226255
}
227256
return false
228257
}
229-
if err != nil && !errors.IsNotFound(err) {
230-
klog.V(2).Infof("error getting snapshot %q from informer: %v", key, err)
231-
return false
232-
}
233-
// The snapshot is not in informer cache, the event must have been "delete"
258+
259+
// The snapshot is neither in the informer cache nor in progress, so the event must have been "delete"
234260
vsObj, found, err := ctrl.snapshotStore.GetByKey(key)
235261
if err != nil {
236262
klog.V(2).Infof("error getting snapshot %q from cache: %v", key, err)
@@ -251,6 +277,10 @@ func (ctrl *csiSnapshotController) snapshotWorker() {
251277
if err == nil {
252278
ctrl.deleteSnapshot(newSnapshot)
253279
}
280+
281+
ctrl.snapshotQueue.Forget(keyObj)
282+
ctrl.snapshotsInProgress.Delete(key)
283+
254284
return false
255285
}
256286

@@ -377,12 +407,22 @@ func (ctrl *csiSnapshotController) updateSnapshot(snapshot *crdv1.VolumeSnapshot
377407
}
378408
err = ctrl.syncSnapshot(snapshot)
379409
if err != nil {
410+
sKey := snapshotKey(snapshot)
411+
412+
// if the snapshot has been deleted, remove from snapshots in progress
413+
if _, exists, _ := ctrl.snapshotStore.Get(sKey); !exists {
414+
ctrl.snapshotsInProgress.Delete(sKey)
415+
} else {
416+
// otherwise, add back to the snapshot queue to retry.
417+
ctrl.snapshotQueue.AddRateLimited(sKey)
418+
}
419+
380420
if errors.IsConflict(err) {
381421
// Version conflict error happens quite often and the controller
382422
// recovers from it easily.
383-
klog.V(3).Infof("could not sync claim %q: %+v", snapshotKey(snapshot), err)
423+
klog.V(3).Infof("could not sync claim %q: %+v", sKey, err)
384424
} else {
385-
klog.Errorf("could not sync volume %q: %+v", snapshotKey(snapshot), err)
425+
klog.Errorf("could not sync volume %q: %+v", sKey, err)
386426
}
387427
}
388428
}

0 commit comments

Comments
 (0)