Skip to content

Wrap all GRPC errors in status, fix semantics of NotFound errors #368

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Aug 15, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions Gopkg.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Gopkg.toml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@

[[constraint]]
name = "github.com/kubernetes-csi/csi-test"
version = "v2.0.1"
version = "2.2.0"

[[constraint]]
branch = "master"
Expand Down
4 changes: 4 additions & 0 deletions pkg/common/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -144,3 +144,7 @@ func GetDeviceName(volKey *meta.Key) (string, error) {
func CreateNodeID(project, zone, name string) string {
return fmt.Sprintf(nodeIDFmt, project, zone, name)
}

func CreateZonalVolumeID(project, zone, name string) string {
return fmt.Sprintf(volIDZonalFmt, project, zone, name)
}
4 changes: 2 additions & 2 deletions pkg/gce-cloud-provider/compute/fake-gce.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ func (cloud *FakeCloudProvider) RepairUnderspecifiedVolumeKey(ctx context.Contex
return volumeKey, nil
}
}
return nil, fmt.Errorf("Couldn't repair unspecified volume information")
return nil, notFoundError()
case meta.Regional:
if volumeKey.Region != common.UnspecifiedValue {
return volumeKey, nil
Expand Down Expand Up @@ -375,7 +375,7 @@ func (cloud *FakeCloudProvider) ResizeDisk(ctx context.Context, volKey *meta.Key

disk.setSizeGb(common.BytesToGb(requestBytes))

return requestBytes, nil
return common.BytesToGb(requestBytes), nil

}

Expand Down
25 changes: 20 additions & 5 deletions pkg/gce-cloud-provider/compute/gce-compute.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ func (cloud *CloudProvider) RepairUnderspecifiedVolumeKey(ctx context.Context, v
}
switch volumeKey.Type() {
case meta.Zonal:
foundZone := ""
if volumeKey.Zone == common.UnspecifiedValue {
// list all zones, try to get disk in each zone
zones, err := cloud.ListZones(ctx, region)
Expand All @@ -80,13 +81,27 @@ func (cloud *CloudProvider) RepairUnderspecifiedVolumeKey(ctx context.Context, v
}
for _, zone := range zones {
_, err := cloud.getZonalDiskOrError(ctx, zone, volumeKey.Name)
if err == nil {
// If there is no error we have found a disk
volumeKey.Zone = zone
return volumeKey, nil
if err != nil {
if IsGCENotFoundError(err) {
// Couldn't find the disk in this zone so we keep
// looking
continue
}
// There is some miscellaneous error getting disk from zone
// so we return error immediately
return nil, err
}
if len(foundZone) > 0 {
return nil, fmt.Errorf("found disk %s in more than one zone: %s and %s", volumeKey.Name, foundZone, zone)
}
foundZone = zone
}

if len(foundZone) == 0 {
return nil, notFoundError()
}
return nil, fmt.Errorf("volume zone unspecified and unable to find in any of these zones %v", zones)
volumeKey.Zone = foundZone
return volumeKey, nil
}
return volumeKey, nil
case meta.Regional:
Expand Down
11 changes: 9 additions & 2 deletions pkg/gce-cloud-provider/compute/gce.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,14 @@ package gcecloudprovider
import (
"context"
"fmt"
"golang.org/x/oauth2/google"
"gopkg.in/gcfg.v1"
"net/http"
"os"
"runtime"
"time"

"golang.org/x/oauth2/google"
"gopkg.in/gcfg.v1"

"cloud.google.com/go/compute/metadata"
"golang.org/x/oauth2"
beta "google.golang.org/api/compute/v0.beta"
Expand Down Expand Up @@ -242,3 +243,9 @@ func IsGCEError(err error, reason string) bool {
}
return false
}

// IsGCENotFoundError returns true if the error is a googleapi.Error with
// notFound reason
func IsGCENotFoundError(err error) bool {
return IsGCEError(err, "notFound")
}
48 changes: 35 additions & 13 deletions pkg/gce-pd-csi-driver/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -224,14 +224,19 @@ func (gceCS *GCEControllerServer) DeleteVolume(ctx context.Context, req *csi.Del

volKey, err := common.VolumeIDToKey(volumeID)
if err != nil {
// Cannot find volume associated with this ID because can't even get the name or zone
// This is a success according to the spec
// Cannot find volume associated with this ID because VolumeID is not in
// correct format, this is a success according to the Spec
klog.Warningf("DeleteVolume treating volume as deleted because volume id %s is invalid: %v", volumeID, err)
return &csi.DeleteVolumeResponse{}, nil
}

volKey, err = gceCS.CloudProvider.RepairUnderspecifiedVolumeKey(ctx, volKey)
if err != nil {
return nil, status.Error(codes.NotFound, fmt.Sprintf("Could not find volume with ID %v: %v", volumeID, err))
if gce.IsGCENotFoundError(err) {
klog.Warningf("DeleteVolume treating volume as deleted because cannot find volume %v: %v", volumeID, err)
return &csi.DeleteVolumeResponse{}, nil
}
return nil, status.Errorf(codes.Internal, "DeleteVolume error repairing underspecified volume key: %v", err)
}

if acquired := gceCS.volumeLocks.TryAcquire(volumeID); !acquired {
Expand Down Expand Up @@ -267,12 +272,15 @@ func (gceCS *GCEControllerServer) ControllerPublishVolume(ctx context.Context, r

volKey, err := common.VolumeIDToKey(volumeID)
if err != nil {
return nil, status.Error(codes.NotFound, fmt.Sprintf("Could not find volume with ID %v: %v", volumeID, err))
return nil, status.Error(codes.InvalidArgument, fmt.Sprintf("ControllerPublishVolume volume ID is invalid: %v", err))
}

volKey, err = gceCS.CloudProvider.RepairUnderspecifiedVolumeKey(ctx, volKey)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we want to check gce error codes here? What if we had temporary issues with the cloud provider, and not that the disk doesn't actually exist.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

there's no real way to distinguish between "disk not exists in zone because it may or may not exist in another zone but is temporarily not showing up" vs "disk does not exist in any zone because it just doesn't exist" since RepairUnderspecifiedVolumeKey queries all the zones in the region for the disk

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you could check if the zone check fails because of "not found" error code

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

right, but it may be NotFound is some subset of zones, and some other error in other zones. What do we do in that case?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we looped through all zones and found it, then return success. Otherwise, if all the zones returned notFound, then return notFound. Otherwise return error?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we can't be sure, but I believe this is consistent with behavior of in-tree plugin. This codepath is only hit for migration - any disks managed natively through the CSI Driver have zone/region information encoded in their volume ID

Copy link
Contributor

@msau42 msau42 Aug 7, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To be on the safe side, can we search all zones and if we have multiple matches then return error?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this issue kubernetes/kubernetes#65198, @verult mentioned that csi driver solve this issue, how it is solved?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When using the CSI Driver natively the region/zone information is encoded in the volume ID. This unspecified/repair case is only for CSI Migration, in which case we will continue to have the same issue as before.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

alright i've done the original repair suggestion in a seperate commit, PTAL

if err != nil {
return nil, status.Error(codes.NotFound, fmt.Sprintf("Could not find volume with ID %v: %v", volumeID, err))
if gce.IsGCENotFoundError(err) {
return nil, status.Errorf(codes.NotFound, "ControllerPublishVolume could not find volume with ID %v: %v", volumeID, err)
}
return nil, status.Errorf(codes.Internal, "ControllerPublishVolume error repairing underspecified volume key: %v", err)
}

// Acquires the lock for the volume on that node only, because we need to support the ability
Expand All @@ -294,7 +302,7 @@ func (gceCS *GCEControllerServer) ControllerPublishVolume(ctx context.Context, r

_, err = gceCS.CloudProvider.GetDisk(ctx, volKey)
if err != nil {
if gce.IsGCEError(err, "notFound") {
if gce.IsGCENotFoundError(err) {
return nil, status.Error(codes.NotFound, fmt.Sprintf("Could not find disk %v: %v", volKey.String(), err))
}
return nil, status.Error(codes.Internal, fmt.Sprintf("Unknown get disk error: %v", err))
Expand All @@ -305,7 +313,7 @@ func (gceCS *GCEControllerServer) ControllerPublishVolume(ctx context.Context, r
}
instance, err := gceCS.CloudProvider.GetInstanceOrError(ctx, instanceZone, instanceName)
if err != nil {
if gce.IsGCEError(err, "notFound") {
if gce.IsGCENotFoundError(err) {
return nil, status.Error(codes.NotFound, fmt.Sprintf("Could not find instance %v: %v", nodeID, err))
}
return nil, status.Error(codes.Internal, fmt.Sprintf("Unknown get instance error: %v", err))
Expand Down Expand Up @@ -365,7 +373,7 @@ func (gceCS *GCEControllerServer) ControllerUnpublishVolume(ctx context.Context,

volKey, err := common.VolumeIDToKey(volumeID)
if err != nil {
return nil, err
return nil, status.Error(codes.InvalidArgument, fmt.Sprintf("ControllerUnpublishVolume Volume ID is invalid: %v", err))
}

// Acquires the lock for the volume on that node only, because we need to support the ability
Expand All @@ -382,7 +390,12 @@ func (gceCS *GCEControllerServer) ControllerUnpublishVolume(ctx context.Context,
}
instance, err := gceCS.CloudProvider.GetInstanceOrError(ctx, instanceZone, instanceName)
if err != nil {
return nil, err
if gce.IsGCENotFoundError(err) {
// Node not existing on GCE means that disk has been detached
klog.Warningf("Treating volume %v as unpublished because node %v could not be found", volKey.String(), instanceName)
return &csi.ControllerUnpublishVolumeResponse{}, nil
}
return nil, status.Error(codes.Internal, fmt.Sprintf("error getting instance: %v", err))
}

deviceName, err := common.GetDeviceName(volKey)
Expand Down Expand Up @@ -420,7 +433,7 @@ func (gceCS *GCEControllerServer) ValidateVolumeCapabilities(ctx context.Context
}
volKey, err := common.VolumeIDToKey(volumeID)
if err != nil {
return nil, status.Error(codes.NotFound, fmt.Sprintf("Volume ID is of improper format, got %v", volumeID))
return nil, status.Error(codes.InvalidArgument, fmt.Sprintf("ValidateVolumeCapabilities Volume ID is invalid: %v", err))
}

if acquired := gceCS.volumeLocks.TryAcquire(volumeID); !acquired {
Expand All @@ -430,7 +443,7 @@ func (gceCS *GCEControllerServer) ValidateVolumeCapabilities(ctx context.Context

_, err = gceCS.CloudProvider.GetDisk(ctx, volKey)
if err != nil {
if gce.IsGCEError(err, "notFound") {
if gce.IsGCENotFoundError(err) {
return nil, status.Error(codes.NotFound, fmt.Sprintf("Could not find disk %v: %v", volKey.Name, err))
}
return nil, status.Error(codes.Internal, fmt.Sprintf("Unknown get disk error: %v", err))
Expand Down Expand Up @@ -532,14 +545,23 @@ func (gceCS *GCEControllerServer) CreateSnapshot(ctx context.Context, req *csi.C
}
volKey, err := common.VolumeIDToKey(volumeID)
if err != nil {
return nil, status.Error(codes.NotFound, fmt.Sprintf("Could not find volume with ID %v: %v", volumeID, err))
return nil, status.Error(codes.InvalidArgument, fmt.Sprintf("CreateSnapshot Volume ID is invalid: %v", err))
}

if acquired := gceCS.volumeLocks.TryAcquire(volumeID); !acquired {
return nil, status.Errorf(codes.Aborted, common.VolumeOperationAlreadyExistsFmt, volumeID)
}
defer gceCS.volumeLocks.Release(volumeID)

// Check if volume exists
_, err = gceCS.CloudProvider.GetDisk(ctx, volKey)
if err != nil {
if gce.IsGCENotFoundError(err) {
return nil, status.Error(codes.NotFound, fmt.Sprintf("CreateSnapshot could not find disk %v: %v", volKey.String(), err))
}
return nil, status.Error(codes.Internal, fmt.Sprintf("CreateSnapshot unknown get disk error: %v", err))
}

// Check if snapshot already exists
var snapshot *compute.Snapshot
snapshot, err = gceCS.CloudProvider.GetSnapshot(ctx, req.Name)
Expand Down Expand Up @@ -670,7 +692,7 @@ func (gceCS *GCEControllerServer) ControllerExpandVolume(ctx context.Context, re

volKey, err := common.VolumeIDToKey(volumeID)
if err != nil {
return nil, status.Error(codes.InvalidArgument, fmt.Sprintf("ControllerExpandVolume volume ID is invalid: %v", err))
return nil, status.Error(codes.InvalidArgument, fmt.Sprintf("ControllerExpandVolume Volume ID is invalid: %v", err))
}

resizedGb, err := gceCS.CloudProvider.ResizeDisk(ctx, volKey, reqBytes)
Expand Down
Loading