@@ -23,6 +23,8 @@ import (
23
23
24
24
crdv1 "github.com/kubernetes-csi/external-snapshotter/v2/pkg/apis/volumesnapshot/v1beta1"
25
25
"github.com/kubernetes-csi/external-snapshotter/v2/pkg/utils"
26
+ codes "google.golang.org/grpc/codes"
27
+ "google.golang.org/grpc/status"
26
28
v1 "k8s.io/api/core/v1"
27
29
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
28
30
"k8s.io/klog"
@@ -81,6 +83,11 @@ func (ctrl *csiSnapshotSideCarController) syncContent(content *crdv1.VolumeSnaps
81
83
// or ListSnapshots CSI methods over and over again for
82
84
// performance reasons.
83
85
if content .Status != nil && content .Status .ReadyToUse != nil && * content .Status .ReadyToUse == true {
86
+ // Try to remove AnnVolumeSnapshotBeingCreated if it is not removed yet for some reason
87
+ err := ctrl .removeAnnVolumeSnapshotBeingCreated (content )
88
+ if err != nil {
89
+ return fmt .Errorf ("failed to remove VolumeSnapshotBeingCreated annotation from the content %s: %q" , content .Name , err )
90
+ }
84
91
return nil
85
92
}
86
93
ctrl .checkandUpdateContentStatus (content )
@@ -126,10 +133,10 @@ func (ctrl *csiSnapshotSideCarController) createSnapshot(content *crdv1.VolumeSn
126
133
klog .V (5 ).Infof ("createSnapshot for content [%s]: started" , content .Name )
127
134
opName := fmt .Sprintf ("create-%s" , content .Name )
128
135
ctrl .scheduleOperation (opName , func () error {
129
- contentObj , err := ctrl .createSnapshotOperation (content )
136
+ contentObj , err := ctrl .createSnapshotWrapper (content )
130
137
if err != nil {
131
138
ctrl .updateContentErrorStatusWithEvent (content , v1 .EventTypeWarning , "SnapshotCreationFailed" , fmt .Sprintf ("Failed to create snapshot: %v" , err ))
132
- klog .Errorf ("createSnapshot [%s]: error occurred in createSnapshotOperation : %v" , opName , err )
139
+ klog .Errorf ("createSnapshot [%s]: error occurred in createSnapshotWrapper : %v" , opName , err )
133
140
return err
134
141
}
135
142
@@ -276,75 +283,80 @@ func (ctrl *csiSnapshotSideCarController) checkandUpdateContentStatusOperation(c
276
283
}
277
284
driverName = content .Spec .Driver
278
285
snapshotID = * content .Spec .Source .SnapshotHandle
279
- } else {
280
- class , snapshotterCredentials , err := ctrl .getCSISnapshotInput (content )
281
- if err != nil {
282
- return nil , fmt .Errorf ("failed to get input parameters to create snapshot %s: %q" , content .Name , err )
286
+
287
+ klog .V (5 ).Infof ("checkandUpdateContentStatusOperation: driver %s, snapshotId %s, creationTime %v, size %d, readyToUse %t" , driverName , snapshotID , creationTime , size , readyToUse )
288
+
289
+ if creationTime .IsZero () {
290
+ creationTime = time .Now ()
283
291
}
284
292
285
- driverName , snapshotID , creationTime , size , readyToUse , err = ctrl .handler . CreateSnapshot (content , class . Parameters , snapshotterCredentials )
293
+ updatedContent , err : = ctrl .updateSnapshotContentStatus (content , snapshotID , readyToUse , creationTime . UnixNano (), size )
286
294
if err != nil {
287
- klog .Errorf ("checkandUpdateContentStatusOperation: failed to call create snapshot to check whether the snapshot is ready to use %q" , err )
288
295
return nil , err
289
296
}
297
+ return updatedContent , nil
298
+ } else {
299
+ return ctrl .createSnapshotWrapper (content )
290
300
}
291
- klog .V (5 ).Infof ("checkandUpdateContentStatusOperation: driver %s, snapshotId %s, creationTime %v, size %d, readyToUse %t" , driverName , snapshotID , creationTime , size , readyToUse )
292
-
293
- if creationTime .IsZero () {
294
- creationTime = time .Now ()
295
- }
296
-
297
- updateContent , err := ctrl .updateSnapshotContentStatus (content , snapshotID , readyToUse , creationTime .UnixNano (), size )
298
- if err != nil {
299
- return nil , err
300
- }
301
- return updateContent , nil
302
301
}
303
302
304
- // The function goes through the whole snapshot creation process.
305
- // 1. Trigger the snapshot through csi storage provider.
306
- // 2. Update VolumeSnapshot status with creationtimestamp information
307
- // 3. Create the VolumeSnapshotContent object with the snapshot id information.
308
- // 4. Bind the VolumeSnapshot and VolumeSnapshotContent object
309
- func (ctrl * csiSnapshotSideCarController ) createSnapshotOperation (content * crdv1.VolumeSnapshotContent ) (* crdv1.VolumeSnapshotContent , error ) {
310
- klog .Infof ("createSnapshotOperation: Creating snapshot for content %s through the plugin ..." , content .Name )
311
-
312
- // content.Status will be created for the first time after a snapshot
313
- // is created by the CSI driver. If content.Status is not nil,
314
- // we should update content status without creating snapshot again.
315
- if content .Status != nil && content .Status .Error != nil && content .Status .Error .Message != nil && ! isControllerUpdateFailError (content .Status .Error ) {
316
- klog .V (4 ).Infof ("error is already set in snapshot, do not retry to create: %s" , * content .Status .Error .Message )
317
- return content , nil
318
- }
303
+ // This is a wrapper function for the snapshot creation process.
304
+ func (ctrl * csiSnapshotSideCarController ) createSnapshotWrapper (content * crdv1.VolumeSnapshotContent ) (* crdv1.VolumeSnapshotContent , error ) {
305
+ klog .Infof ("createSnapshotWrapper: Creating snapshot for content %s through the plugin ..." , content .Name )
319
306
320
307
class , snapshotterCredentials , err := ctrl .getCSISnapshotInput (content )
321
308
if err != nil {
322
309
return nil , fmt .Errorf ("failed to get input parameters to create snapshot for content %s: %q" , content .Name , err )
323
310
}
324
311
312
+ // NOTE(xyang): handle create timeout
313
+ // Add an annotation to indicate the snapshot creation request has been
314
+ // sent to the storage system and the controller is waiting for a response.
315
+ // The annotation will be removed after the storage system has responded with
316
+ // success or permanent failure. If the request times out, annotation will
317
+ // remain on the content to avoid potential leaking of a snapshot resource on
318
+ // the storage system.
319
+ err = ctrl .setAnnVolumeSnapshotBeingCreated (content )
320
+ if err != nil {
321
+ return nil , fmt .Errorf ("failed to add VolumeSnapshotBeingCreated annotation on the content %s: %q" , content .Name , err )
322
+ }
323
+
325
324
driverName , snapshotID , creationTime , size , readyToUse , err := ctrl .handler .CreateSnapshot (content , class .Parameters , snapshotterCredentials )
326
325
if err != nil {
326
+ // NOTE(xyang): handle create timeout
327
+ // If it is a final error, remove annotation to indicate
328
+ // storage system has responded with an error
329
+ klog .Infof ("createSnapshotWrapper: CreateSnapshot for content %s returned error: %v" , content .Name , err )
330
+ if isCSIFinalError (err ) {
331
+ err = ctrl .removeAnnVolumeSnapshotBeingCreated (content )
332
+ if err != nil {
333
+ return nil , fmt .Errorf ("failed to remove VolumeSnapshotBeingCreated annotation from the content %s: %q" , content .Name , err )
334
+ }
335
+ }
336
+
327
337
return nil , fmt .Errorf ("failed to take snapshot of the volume, %s: %q" , * content .Spec .Source .VolumeHandle , err )
328
338
}
329
- if driverName != class .Driver {
330
- return nil , fmt .Errorf ("failed to take snapshot of the volume, %s: driver name %s returned from the driver is different from driver %s in snapshot class" , * content .Spec .Source .VolumeHandle , driverName , class .Driver )
331
- }
332
339
333
340
klog .V (5 ).Infof ("Created snapshot: driver %s, snapshotId %s, creationTime %v, size %d, readyToUse %t" , driverName , snapshotID , creationTime , size , readyToUse )
334
341
335
- timestamp := creationTime .UnixNano ()
336
- newContent , err := ctrl .updateSnapshotContentStatus (content , snapshotID , readyToUse , timestamp , size )
342
+ if creationTime .IsZero () {
343
+ creationTime = time .Now ()
344
+ }
345
+
346
+ newContent , err := ctrl .updateSnapshotContentStatus (content , snapshotID , readyToUse , creationTime .UnixNano (), size )
337
347
if err != nil {
338
- strerr := fmt . Sprintf ("error updating volume snapshot content status for snapshot %s: %v." , content .Name , err )
339
- klog . Error ( strerr )
348
+ klog . Errorf ("error updating status for volume snapshot content %s: %v." , content .Name , err )
349
+ return nil , fmt . Errorf ( "error updating status for volume snapshot content %s: %v." , content . Name , err )
340
350
} else {
341
351
content = newContent
342
352
}
343
353
344
- // Update content in the cache store
345
- _ , err = ctrl .storeContentUpdate (content )
354
+ // NOTE(xyang): handle create timeout
355
+ // Remove annotation to indicate storage system has successfully
356
+ // cut the snapshot
357
+ err = ctrl .removeAnnVolumeSnapshotBeingCreated (content )
346
358
if err != nil {
347
- klog .Errorf ("failed to update content store %v" , err )
359
+ return nil , fmt .Errorf ("failed to remove VolumeSnapshotBeingCreated annotation on the content %s: %q" , content . Name , err )
348
360
}
349
361
350
362
return content , nil
@@ -564,9 +576,99 @@ func (ctrl *csiSnapshotSideCarController) shouldDelete(content *crdv1.VolumeSnap
564
576
if content .Spec .Source .SnapshotHandle != nil && content .Spec .VolumeSnapshotRef .UID == "" {
565
577
return true
566
578
}
567
- // 2) shouldDelete returns true if AnnVolumeSnapshotBeingDeleted annotation is set
579
+
580
+ // NOTE(xyang): Handle create snapshot timeout
581
+ // 2) shouldDelete returns false if AnnVolumeSnapshotBeingCreated
582
+ // annotation is set. This indicates a CreateSnapshot CSI RPC has
583
+ // not responded with success or failure.
584
+ // We need to keep waiting for a response from the CSI driver.
585
+ if metav1 .HasAnnotation (content .ObjectMeta , utils .AnnVolumeSnapshotBeingCreated ) {
586
+ return false
587
+ }
588
+
589
+ // 3) shouldDelete returns true if AnnVolumeSnapshotBeingDeleted annotation is set
568
590
if metav1 .HasAnnotation (content .ObjectMeta , utils .AnnVolumeSnapshotBeingDeleted ) {
569
591
return true
570
592
}
571
593
return false
572
594
}
595
+
596
+ // setAnnVolumeSnapshotBeingCreated sets VolumeSnapshotBeingCreated annotation
597
+ // on VolumeSnapshotContent
598
+ // If set, it indicates snapshot is being created
599
+ func (ctrl * csiSnapshotSideCarController ) setAnnVolumeSnapshotBeingCreated (content * crdv1.VolumeSnapshotContent ) error {
600
+ if metav1 .HasAnnotation (content .ObjectMeta , utils .AnnVolumeSnapshotBeingCreated ) {
601
+ // the annotation already exists, return directly
602
+ return nil
603
+ }
604
+
605
+ // Set AnnVolumeSnapshotBeingCreated
606
+ klog .V (5 ).Infof ("setAnnVolumeSnapshotBeingCreated: set annotation [%s:yes] on content [%s]." , utils .AnnVolumeSnapshotBeingCreated , content .Name )
607
+ contentClone := content .DeepCopy ()
608
+ metav1 .SetMetaDataAnnotation (& contentClone .ObjectMeta , utils .AnnVolumeSnapshotBeingCreated , "yes" )
609
+
610
+ updatedContent , err := ctrl .clientset .SnapshotV1beta1 ().VolumeSnapshotContents ().Update (contentClone )
611
+ if err != nil {
612
+ return newControllerUpdateError (content .Name , err .Error ())
613
+ }
614
+ // update content if update is successful
615
+ content = updatedContent
616
+
617
+ _ , err = ctrl .storeContentUpdate (content )
618
+ if err != nil {
619
+ klog .V (4 ).Infof ("setAnnVolumeSnapshotBeingCreated for content [%s]: cannot update internal cache %v" , content .Name , err )
620
+ }
621
+ klog .V (5 ).Infof ("setAnnVolumeSnapshotBeingCreated: volume snapshot content %+v" , content )
622
+
623
+ return nil
624
+ }
625
+
626
+ // removeAnnVolumeSnapshotBeingCreated removes the VolumeSnapshotBeingCreated
627
+ // annotation from a content if there exists one.
628
+ func (ctrl csiSnapshotSideCarController ) removeAnnVolumeSnapshotBeingCreated (content * crdv1.VolumeSnapshotContent ) error {
629
+ if ! metav1 .HasAnnotation (content .ObjectMeta , utils .AnnVolumeSnapshotBeingCreated ) {
630
+ // the annotation does not exist, return directly
631
+ return nil
632
+ }
633
+ contentClone := content .DeepCopy ()
634
+ delete (contentClone .ObjectMeta .Annotations , utils .AnnVolumeSnapshotBeingCreated )
635
+
636
+ updatedContent , err := ctrl .clientset .SnapshotV1beta1 ().VolumeSnapshotContents ().Update (contentClone )
637
+ if err != nil {
638
+ return newControllerUpdateError (content .Name , err .Error ())
639
+ }
640
+ // update content if update is successful
641
+ content = updatedContent
642
+
643
+ klog .V (5 ).Infof ("Removed VolumeSnapshotBeingCreated annotation from volume snapshot content %s" , content .Name )
644
+ _ , err = ctrl .storeContentUpdate (content )
645
+ if err != nil {
646
+ klog .Errorf ("failed to update content store %v" , err )
647
+ }
648
+ return nil
649
+ }
650
+
651
+ // This function checks if the error is final
652
+ func isCSIFinalError (err error ) bool {
653
+ // Sources:
654
+ // https://github.com/grpc/grpc/blob/master/doc/statuscodes.md
655
+ // https://github.com/container-storage-interface/spec/blob/master/spec.md
656
+ st , ok := status .FromError (err )
657
+ if ! ok {
658
+ // This is not gRPC error. The operation must have failed before gRPC
659
+ // method was called, otherwise we would get gRPC error.
660
+ // We don't know if any previous CreateSnapshot is in progress, be on the safe side.
661
+ return false
662
+ }
663
+ switch st .Code () {
664
+ case codes .Canceled , // gRPC: Client Application cancelled the request
665
+ codes .DeadlineExceeded , // gRPC: Timeout
666
+ codes .Unavailable , // gRPC: Server shutting down, TCP connection broken - previous CreateSnapshot() may be still in progress.
667
+ codes .ResourceExhausted , // gRPC: Server temporarily out of resources - previous CreateSnapshot() may be still in progress.
668
+ codes .Aborted : // CSI: Operation pending for Snapshot
669
+ return false
670
+ }
671
+ // All other errors mean that creating snapshot either did not
672
+ // even start or failed. It is for sure not in progress.
673
+ return true
674
+ }
0 commit comments