@@ -4,6 +4,7 @@
 package nomad
 
 import (
+    "context"
     "fmt"
     "net/http"
    "strings"
@@ -549,7 +550,9 @@ func (v *CSIVolume) controllerPublishVolume(req *structs.CSIVolumeClaimRequest,
     cReq.PluginID = plug.ID
     cResp := &cstructs.ClientCSIControllerAttachVolumeResponse{}
 
-    err = v.srv.RPC(method, cReq, cResp)
+    err = v.serializedControllerRPC(plug.ID, func() error {
+        return v.srv.RPC(method, cReq, cResp)
+    })
     if err != nil {
         if strings.Contains(err.Error(), "FailedPrecondition") {
             return fmt.Errorf("%v: %v", structs.ErrCSIClientRPCRetryable, err)
@@ -586,6 +589,57 @@ func (v *CSIVolume) volAndPluginLookup(namespace, volID string) (*structs.CSIPlu
     return plug, vol, nil
 }
 
+// serializedControllerRPC ensures we're only sending a single controller RPC to
+// a given plugin if the RPC can cause conflicting state changes.
+//
+// The CSI specification says that we SHOULD send no more than one in-flight
+// request per *volume* at a time, with an allowance for losing state
+// (ex. leadership transitions) which the plugins SHOULD handle gracefully.
+//
+// In practice many CSI plugins rely on k8s-specific sidecars for serializing
+// storage provider API calls globally (ex. concurrently attaching EBS volumes
+// to an EC2 instance results in a race for device names). So we have to be much
+// more conservative about concurrency in Nomad than the spec allows.
+func (v *CSIVolume) serializedControllerRPC(pluginID string, fn func() error) error {
+
+    for {
+        v.srv.volumeControllerLock.Lock()
+        future := v.srv.volumeControllerFutures[pluginID]
+        if future == nil {
+            future, futureDone := context.WithCancel(v.srv.shutdownCtx)
+            v.srv.volumeControllerFutures[pluginID] = future
+            v.srv.volumeControllerLock.Unlock()
+
+            err := fn()
+
+            // close the future while holding the lock and not in a defer so
+            // that we can ensure we've cleared it from the map before allowing
+            // anyone else to take the lock and write a new one
+            v.srv.volumeControllerLock.Lock()
+            futureDone()
+            delete(v.srv.volumeControllerFutures, pluginID)
+            v.srv.volumeControllerLock.Unlock()
+
+            return err
+        } else {
+            v.srv.volumeControllerLock.Unlock()
+
+            select {
+            case <-future.Done():
+                continue
+            case <-v.srv.shutdownCh:
+                // The csi_hook publish workflow on the client will retry if it
+                // gets this error. On unpublish, we don't want to block client
+                // shutdown so we give up on error. The new leader's
+                // volumewatcher will iterate all the claims at startup to
+                // detect this and mop up any claims in the NodeDetached state
+                // (volume GC will run periodically as well)
+                return structs.ErrNoLeader
+            }
+        }
+    }
+}
+
 // allowCSIMount is called on Job register to check mount permission
 func allowCSIMount(aclObj *acl.ACL, namespace string) bool {
     return aclObj.AllowPluginRead() &&
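Note: `serializedControllerRPC` relies on two server fields that are not defined in this file, presumably something like a `volumeControllerFutures map[string]context.Context` guarded by a `volumeControllerLock sync.Mutex` on the Nomad `Server` struct, added elsewhere in this change. The sketch below is a minimal, standalone reproduction of the same per-plugin serialization pattern under that assumption; `keyedSerializer`, `Run`, and the plugin ID `"aws-ebs0"` are hypothetical names used for illustration, not Nomad APIs.

```go
// Illustrative sketch only: one in-flight fn per key, with waiters parked on
// the current holder's cancellable context instead of a per-key mutex.
package main

import (
    "context"
    "fmt"
    "sync"
    "time"
)

type keyedSerializer struct {
    lock    sync.Mutex
    futures map[string]context.Context
}

// Run executes fn, allowing at most one fn in flight per key. Later callers
// block until the current holder's "future" context is closed, then retry.
func (s *keyedSerializer) Run(ctx context.Context, key string, fn func() error) error {
    for {
        s.lock.Lock()
        future := s.futures[key]
        if future == nil {
            // no one holds the slot: install our future and do the work
            future, futureDone := context.WithCancel(ctx)
            s.futures[key] = future
            s.lock.Unlock()

            err := fn()

            // clear the map entry before signalling waiters, so the next
            // caller finds the slot empty and can install its own future
            s.lock.Lock()
            futureDone()
            delete(s.futures, key)
            s.lock.Unlock()
            return err
        }
        s.lock.Unlock()

        select {
        case <-future.Done():
            continue // previous holder finished; race for the slot again
        case <-ctx.Done():
            return ctx.Err() // give up rather than block on shutdown
        }
    }
}

func main() {
    s := &keyedSerializer{futures: map[string]context.Context{}}
    var wg sync.WaitGroup
    for i := 0; i < 3; i++ {
        wg.Add(1)
        go func(i int) {
            defer wg.Done()
            _ = s.Run(context.Background(), "aws-ebs0", func() error {
                fmt.Printf("worker %d holds the aws-ebs0 controller slot\n", i)
                time.Sleep(10 * time.Millisecond)
                return nil
            })
        }(i)
    }
    wg.Wait()
}
```

Using a cancellable context as the per-plugin "future" rather than a plain mutex is what lets waiting callers also select on server shutdown, which is how `serializedControllerRPC` can return `structs.ErrNoLeader` instead of blocking a leadership transition.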
@@ -863,8 +917,11 @@ func (v *CSIVolume) controllerUnpublishVolume(vol *structs.CSIVolume, claim *str
         Secrets: vol.Secrets,
     }
     req.PluginID = vol.PluginID
-    err = v.srv.RPC("ClientCSI.ControllerDetachVolume", req,
-        &cstructs.ClientCSIControllerDetachVolumeResponse{})
+
+    err = v.serializedControllerRPC(vol.PluginID, func() error {
+        return v.srv.RPC("ClientCSI.ControllerDetachVolume", req,
+            &cstructs.ClientCSIControllerDetachVolumeResponse{})
+    })
     if err != nil {
         return fmt.Errorf("could not detach from controller: %v", err)
     }
@@ -1139,7 +1196,9 @@ func (v *CSIVolume) deleteVolume(vol *structs.CSIVolume, plugin *structs.CSIPlug
     cReq.PluginID = plugin.ID
     cResp := &cstructs.ClientCSIControllerDeleteVolumeResponse{}
 
-    return v.srv.RPC(method, cReq, cResp)
+    return v.serializedControllerRPC(plugin.ID, func() error {
+        return v.srv.RPC(method, cReq, cResp)
+    })
 }
 
 func (v *CSIVolume) ListExternal(args *structs.CSIVolumeExternalListRequest, reply *structs.CSIVolumeExternalListResponse) error {
@@ -1286,7 +1345,9 @@ func (v *CSIVolume) CreateSnapshot(args *structs.CSISnapshotCreateRequest, reply
         }
         cReq.PluginID = pluginID
         cResp := &cstructs.ClientCSIControllerCreateSnapshotResponse{}
-        err = v.srv.RPC(method, cReq, cResp)
+        err = v.serializedControllerRPC(pluginID, func() error {
+            return v.srv.RPC(method, cReq, cResp)
+        })
         if err != nil {
             multierror.Append(&mErr, fmt.Errorf("could not create snapshot: %v", err))
             continue
@@ -1360,7 +1421,9 @@ func (v *CSIVolume) DeleteSnapshot(args *structs.CSISnapshotDeleteRequest, reply
         cReq := &cstructs.ClientCSIControllerDeleteSnapshotRequest{ID: snap.ID}
         cReq.PluginID = plugin.ID
         cResp := &cstructs.ClientCSIControllerDeleteSnapshotResponse{}
-        err = v.srv.RPC(method, cReq, cResp)
+        err = v.serializedControllerRPC(plugin.ID, func() error {
+            return v.srv.RPC(method, cReq, cResp)
+        })
         if err != nil {
             multierror.Append(&mErr, fmt.Errorf("could not delete %q: %v", snap.ID, err))
         }