Skip to content

cleanup: refactor volume cloning #1521

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions pkg/blob/blob.go
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,10 @@ var (
supportedProtocolList = []string{Fuse, Fuse2, NFS, AZNFS}
retriableErrors = []string{accountNotProvisioned, tooManyRequests, statusCodeNotFound, containerBeingDeletedDataplaneAPIError, containerBeingDeletedManagementAPIError, clientThrottled}
supportedFSGroupChangePolicyList = []string{FSGroupChangeNone, string(v1.FSGroupChangeAlways), string(v1.FSGroupChangeOnRootMismatch)}

// azcopyCloneVolumeOptions used in volume cloning between different storage account and --check-length to false because volume data may be in changing state, copy volume is not same as current source volume,
// set --s2s-preserve-access-tier=false to avoid BlobAccessTierNotSupportedForAccountType error in azcopy
azcopyCloneVolumeOptions = []string{"--recursive", "--check-length=false", "--s2s-preserve-access-tier=false"}
)

// DriverOptions defines driver parameters specified in driver deployment
Expand Down
103 changes: 59 additions & 44 deletions pkg/blob/controllerserver.go
Original file line number Diff line number Diff line change
Expand Up @@ -366,36 +366,12 @@ func (d *Driver) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest)
defer d.volumeLocks.Release(volName)

requestName := "controller_create_volume"

var srcAzcopyAuthEnv []string
var srcSubscriptionID, srcResourceGroupName, srcAccountName, srcContainerName, srcPath, srcAccountSASToken string
if volContentSource != nil {
switch volContentSource.Type.(type) {
case *csi.VolumeContentSource_Snapshot:
return nil, status.Errorf(codes.InvalidArgument, "VolumeContentSource Snapshot is not yet implemented")
case *csi.VolumeContentSource_Volume:
requestName = "controller_create_volume_from_volume"
var srcVolumeID string
if volContentSource.GetVolume() != nil {
srcVolumeID = volContentSource.GetVolume().GetVolumeId()
}
srcResourceGroupName, srcAccountName, srcContainerName, _, srcSubscriptionID, err = GetContainerInfo(srcVolumeID)
if err != nil {
return nil, status.Error(codes.NotFound, err.Error())
}
srcAccountOptions := &azure.AccountOptions{
Name: srcAccountName,
SubscriptionID: srcSubscriptionID,
ResourceGroup: srcResourceGroupName,
GetLatestAccountKey: getLatestAccountKey,
}
srcAccountSASToken, srcAzcopyAuthEnv, err = d.getAzcopyAuth(ctx, srcAccountName, "", storageEndpointSuffix, srcAccountOptions, secrets, secretName, secretNamespace)
if err != nil {
return nil, status.Errorf(codes.Internal, "failed to getAzcopyAuth on account(%s) rg(%s), error: %v", accountOptions.Name, accountOptions.ResourceGroup, err)
}
srcPath = fmt.Sprintf("https://%s.blob.%s/%s", srcAccountName, storageEndpointSuffix, srcContainerName)
default:
return nil, status.Errorf(codes.InvalidArgument, "VolumeContentSource is not recognized: %v", volContentSource)
}
}

Expand Down Expand Up @@ -466,16 +442,11 @@ func (d *Driver) CreateVolume(ctx context.Context, req *csi.CreateVolumeRequest)
return nil, status.Errorf(codes.Internal, "failed to create container(%s) on account(%s) type(%s) rg(%s) location(%s) size(%d), error: %v", validContainerName, accountName, storageAccountType, resourceGroup, location, requestGiB, err)
}
if volContentSource != nil {
dstAzcopyAuthEnv := srcAzcopyAuthEnv
dstAccountSASToken := srcAccountSASToken
if srcAccountName != accountName {
if dstAccountSASToken, dstAzcopyAuthEnv, err = d.getAzcopyAuth(ctx, accountName, accountKey, storageEndpointSuffix, accountOptions, secrets, secretName, secretNamespace); err != nil {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this might have been causing your failures.

getAzcopyAuth() has a few issues:

  • only tries to use MSI auth when len(secrets) == 0 && len(secretName) == 0 .
  • tries to get storage access key and sas token as a fallback even if disabled on the account
  • uses the cluster identity to get the fallback key / sas which requires even more permissions

All of this account key / sas handling seems like hacks around the root issue that the default permissions for the control plane identity neglected to include Storage Blob Data Contributor along with Contributor.

Changing secrets, secretName, secretNamespace to nil, "", "" would be a quick fix, and preferably getAzcopyAuth would always try the identity first.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we could address this in other PR, this is refactor not bug fix

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I also tried again and Reader can be worked after it failed before. But I think the reason is not here since we have use MSI in that test since len(secrets) == 0 && len(secretName) == 0 , I will still work on this issue to find the reason.

return nil, status.Errorf(codes.Internal, "failed to getAzcopyAuth on account(%s) rg(%s), error: %v", accountOptions.Name, accountOptions.ResourceGroup, err)
}
accountSASToken, authAzcopyEnv, err := d.getAzcopyAuth(ctx, accountName, accountKey, storageEndpointSuffix, accountOptions, secrets, secretName, secretNamespace)
if err != nil {
return nil, status.Errorf(codes.Internal, "failed to getAzcopyAuth on account(%s) rg(%s), error: %v", accountOptions.Name, accountOptions.ResourceGroup, err)
}

dstPath := fmt.Sprintf("https://%s.blob.%s/%s", accountName, storageEndpointSuffix, validContainerName)
if err := d.copyBlobContainer(dstAzcopyAuthEnv, srcPath, srcAccountSASToken, dstPath, dstAccountSASToken, validContainerName); err != nil {
if err := d.copyVolume(ctx, req, accountName, accountSASToken, authAzcopyEnv, validContainerName, secretNamespace, accountOptions, storageEndpointSuffix); err != nil {
return nil, err
}
}
Expand Down Expand Up @@ -786,11 +757,36 @@ func (d *Driver) DeleteBlobContainer(ctx context.Context, subsID, resourceGroupN
}

// copyBlobContainer copies source volume content into a destination volume
func (d *Driver) copyBlobContainer(authAzcopyEnv []string, srcPath string, srcAccountSASToken string, dstPath string, dstAccountSASToken string, dstContainerName string) error {
func (d *Driver) copyBlobContainer(ctx context.Context, req *csi.CreateVolumeRequest, dstAccountName string, dstAccountSasToken string, authAzcopyEnv []string, dstContainerName string, secretNamespace string, accountOptions *azure.AccountOptions, storageEndpointSuffix string) error {
var sourceVolumeID string
if req.GetVolumeContentSource() != nil && req.GetVolumeContentSource().GetVolume() != nil {
sourceVolumeID = req.GetVolumeContentSource().GetVolume().GetVolumeId()

if srcPath == "" || dstPath == "" || dstContainerName == "" {
return fmt.Errorf("srcPath(%s) or dstPath(%s) or dstContainerName(%s) is empty", srcPath, dstPath, dstContainerName)
}
srcResourceGroupName, srcAccountName, srcContainerName, _, srcSubscriptionID, err := GetContainerInfo(sourceVolumeID) //nolint:dogsled
if err != nil {
return status.Error(codes.NotFound, err.Error())
}
if dstAccountName == "" {
dstAccountName = srcAccountName
}
if srcAccountName == "" || srcContainerName == "" || dstContainerName == "" {
return fmt.Errorf("srcAccountName(%s) or srcContainerName(%s) or dstContainerName(%s) is empty", srcAccountName, srcContainerName, dstContainerName)
}
srcAccountSasToken := dstAccountSasToken
if srcAccountName != dstAccountName && dstAccountSasToken != "" {
srcAccountOptions := &azure.AccountOptions{
Name: srcAccountName,
ResourceGroup: srcResourceGroupName,
SubscriptionID: srcSubscriptionID,
GetLatestAccountKey: accountOptions.GetLatestAccountKey,
}
if srcAccountSasToken, _, err = d.getAzcopyAuth(ctx, srcAccountName, "", storageEndpointSuffix, srcAccountOptions, nil, "", secretNamespace); err != nil {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does getAzcopyAuth work on a source volume for which the controller identity only has Storage Blob Data Reader.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it may failed with AuthorizationPermissionMismatch error, as azcopy team mentioned, it needs Storage Blob Data Contributor role

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we don't know here, from intuition, Storage Blob Data Reader should work on source volume, need to verify, and if it failed, then fallback to use sas token

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tested it to only give Storage Blob Data Reader in source blob, and it also return AuthorizationPermissionMismatch error from azcopy.

2024/08/01 08:46:05 INFO: [P#0-T#0] Starting transfer: Source "https://fuse256643cb981cf47f286.blob.core.windows.net/pvc-454d10fc-47c0-4d75-847c-48b4093c6261/outfile" Destination "https://fuse29375444a7b114aaf91.blob.core.windows.net/pvc-88f8b354-4d77-4097-a65b-0ecb93607e87/outfile". Specified chunk size 8388608
2024/08/01 08:46:05 ==> REQUEST/RESPONSE (Try=1/9.557829ms, OpTime=30.325026ms) -- RESPONSE STATUS CODE ERROR
   PUT https://fuse29375444a7b114aaf91.blob.core.windows.net/pvc-88f8b354-4d77-4097-a65b-0ecb93607e87/outfile
   Accept: application/xml
   Authorization: REDACTED
   Content-Length: 0
   User-Agent: AzCopy/10.25.1 azsdk-go-azblob/v1.3.1 (go1.22.4; linux)
   X-Ms-Client-Request-Id: 79584760-5d17-4747-4048-df4f78b38efe
   x-ms-blob-content-md5: tTzxIJei9Hch9GOVLgQ+WA==
   x-ms-blob-content-type: application/octet-stream
   x-ms-blob-type: BlockBlob
   x-ms-copy-source: https://fuse256643cb981cf47f286.blob.core.windows.net/pvc-454d10fc-47c0-4d75-847c-48b4093c6261/outfile
   x-ms-copy-source-authorization: REDACTED
   x-ms-version: 2023-08-03
   --------------------------------------------------------------------------------
   RESPONSE Status: 403 This request is not authorized to perform this operation using this permission.
   Content-Length: 279
   Content-Type: application/xml
   Date: Thu, 01 Aug 2024 08:46:05 GMT
   Server: Windows-Azure-Blob/1.0 Microsoft-HTTPAPI/2.0
   X-Ms-Client-Request-Id: 79584760-5d17-4747-4048-df4f78b38efe
   X-Ms-Error-Code: AuthorizationPermissionMismatch
   X-Ms-Request-Id: aa17dd77-001e-0022-5def-e3ef22000000
   X-Ms-Version: 2023-08-03
Response Details: <Code>AuthorizationPermissionMismatch</Code><Message>This request is not authorized to perform this operation using this permission. </Message>

2024/08/01 08:46:05 ERR: [P#0-T#0] COPYFAILED: https://fuse256643cb981cf47f286.blob.core.windows.net/pvc-454d10fc-47c0-4d75-847c-48b4093c6261/outfile : 403 : 403 This request is not authorized to perform this operation using this permission.. When Put Blob from URL. X-Ms-Request-Id: aa17dd77-001e-0022-5def-e3ef22000000

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems strange and doesn't match the public docs for AzCopy. I asked the team and they indicated that:

This request appears to be a response from the destination account. It is not a source validation failure, which indicates that it's the actual request to the destination, not the source that's failing. Furthermore, it's AuthorizationPermissionMismatch, meaning that authorization has succeeded, but permissions are lacking.

In short-- sounds like perms are lacking on the destination.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here's a bicep POC that shows azcopy itself does work with only Blob Data Reader on the source and Blob Data Contributor on the destination.

The blob-csi-driver should support it as well for at least MSI scenarios.

param stem string = uniqueString(resourceGroup().name)
param location string = resourceGroup().location
param azCliVersion string = '2.52.0'
param forceUpdateTag string = utcNow()

var blobOwnerRoleId = subscriptionResourceId('Microsoft.Authorization/roleDefinitions', 'b7e6dc6d-f1e8-4753-8033-0f276bb0955b')
var blobReaderRoleId = subscriptionResourceId('Microsoft.Authorization/roleDefinitions', '2a2b9908-6ea1-4ae2-8e65-a410df84e7d1')
var blobContributorRoleId = subscriptionResourceId('Microsoft.Authorization/roleDefinitions', 'ba92f5b4-2d11-453d-a403-e96b0029c9fe')

resource srcAccount 'Microsoft.Storage/storageAccounts@2023-01-01' = {
  name: '${stem}src'
  location: location
  sku: {
    name: 'Standard_LRS'
  }
  kind: 'StorageV2'
  properties: {
    accessTier: 'Hot'
    allowSharedKeyAccess: false
    defaultToOAuthAuthentication: true
    isLocalUserEnabled: false
  }
}

resource dstAccount 'Microsoft.Storage/storageAccounts@2023-01-01' = {
  name: '${stem}dst'
  location: location
  sku: {
    name: 'Standard_LRS'
  }
  kind: 'StorageV2'
  properties: {
    accessTier: 'Hot'
    allowSharedKeyAccess: false
    defaultToOAuthAuthentication: true
    isLocalUserEnabled: false
  }
}

resource makeIdentity 'Microsoft.ManagedIdentity/userAssignedIdentities@2023-01-31' = {
  name: '${stem}-make-id'

  location: location
}

resource copyIdentity 'Microsoft.ManagedIdentity/userAssignedIdentities@2023-01-31' = {
  name: '${stem}-copy-id'
  location: location
}

resource makeSourceBlobOwner 'Microsoft.Authorization/roleAssignments@2022-04-01' = {
  name: guid(makeIdentity.id, blobOwnerRoleId, srcAccount.id)
  properties: {
    roleDefinitionId: blobOwnerRoleId
    principalId: makeIdentity.properties.principalId
  }
  scope: srcAccount
}

resource copySourceBlobReader 'Microsoft.Authorization/roleAssignments@2022-04-01' = {
  name: guid(copyIdentity.id, blobReaderRoleId, srcAccount.id)
  properties: {
    roleDefinitionId: blobReaderRoleId
    principalId: copyIdentity.properties.principalId
  }
  scope: srcAccount
}


resource copyDestinationBlobContributor 'Microsoft.Authorization/roleAssignments@2022-04-01' = {
  name: guid(copyIdentity.id, blobContributorRoleId, dstAccount.id)
  properties: {
    roleDefinitionId: blobContributorRoleId
    principalId: copyIdentity.properties.principalId
  }
  scope: dstAccount
}

resource makeScript 'Microsoft.Resources/deploymentScripts@2023-08-01' = {
  name: '${stem}-make-script'
  location: location
  kind: 'AzureCLI'
  identity: {
    type: 'UserAssigned'
    userAssignedIdentities: {
      '${makeIdentity.id}': {}
    }
  }
  properties: {
    azCliVersion: azCliVersion
    forceUpdateTag: forceUpdateTag
    environmentVariables: [
      {
        name: 'AZCOPY_AUTO_LOGIN_TYPE'
        value: 'MSI'
      }
      {
        name: 'SRCACCOUNT'
        value: srcAccount.properties.primaryEndpoints.blob
      }
      {
        name: 'SRCCONTAINER'
        value: 'data'
      }
    ]
    scriptContent: '${installScriptContent}\n${makeScriptContent}'
    retentionInterval: 'PT1H'
  }
  dependsOn: [
    makeSourceBlobOwner
  ]
}

resource copyScript 'Microsoft.Resources/deploymentScripts@2023-08-01' = {
  name: '${stem}-copy-script'
  location: location
  kind: 'AzureCLI'
  identity: {
    type: 'UserAssigned'
    userAssignedIdentities: {
      '${copyIdentity.id}': {}
    }
  }
  properties: {
    azCliVersion: azCliVersion
    forceUpdateTag: forceUpdateTag
    environmentVariables: [
      {
        name: 'AZCOPY_AUTO_LOGIN_TYPE'
        value: 'MSI'
      }
      {
        name: 'SRCACCOUNT'
        value: srcAccount.properties.primaryEndpoints.blob
      }
      {
        name: 'SRCCONTAINER'
        value: 'data'
      }
      {
        name: 'DSTACCOUNT'
        value: dstAccount.properties.primaryEndpoints.blob
      }
      {
        name: 'DSTCONTAINER'
        value: 'data'
      }
    ]
    scriptContent: '${installScriptContent}\n${copyScriptContent}'
    retentionInterval: 'PT1H'
  }
  dependsOn: [
    copySourceBlobReader
    copyDestinationBlobContributor
    makeScript
  ]
}

var installScriptContent = '''
cd /tmp
mkdir bin
apk add -q --no-interactive gcompat
wget -q https://aka.ms/downloadazcopy-v10-linux
tar -xf downloadazcopy-v10-linux
cp -vf ./azcopy_linux_amd64_*/azcopy bin/
rm -rf ./azcopy_linux_amd64_* downloadazcopy-v10-linux
'''

var makeScriptContent = '''
mkdir data
date > data/0.txt
date > data/1.txt
date > data/3.txt
date > data/4.txt
./bin/azcopy jobs list
./bin/azcopy list ${SRCACCOUNT}
./bin/azcopy make ${SRCACCOUNT}${SRCCONTAINER}
./bin/azcopy copy data ${SRCACCOUNT}${SRCCONTAINER} --recursive
 '''

var copyScriptContent = '''
./bin/azcopy jobs list
./bin/azcopy list ${SRCACCOUNT}
./bin/azcopy list ${DSTACCOUNT}
./bin/azcopy copy ${SRCACCOUNT}${SRCCONTAINER} ${DSTACCOUNT}${DSTCONTAINER} --recursive --check-length=false --s2s-preserve-access-tier=false
'''

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks for the verification, we could address this in the doc or other PR

return err
}
}
srcPath := fmt.Sprintf("https://%s.blob.%s/%s%s", srcAccountName, storageEndpointSuffix, srcContainerName, srcAccountSasToken)
dstPath := fmt.Sprintf("https://%s.blob.%s/%s%s", dstAccountName, storageEndpointSuffix, dstContainerName, dstAccountSasToken)

jobState, percent, err := d.azcopy.GetAzcopyJob(dstContainerName, authAzcopyEnv)
klog.V(2).Infof("azcopy job status: %s, copy percent: %s%%, error: %v", jobState, percent, err)
Expand All @@ -800,13 +796,9 @@ func (d *Driver) copyBlobContainer(authAzcopyEnv []string, srcPath string, srcAc
case util.AzcopyJobRunning:
return fmt.Errorf("wait for the existing AzCopy job to complete, current copy percentage is %s%%", percent)
case util.AzcopyJobNotFound:
klog.V(2).Infof("copy blob container %s to %s", srcPath, dstContainerName)
klog.V(2).Infof("copy blob container %s:%s to %s:%s", srcAccountName, srcContainerName, dstAccountName, dstContainerName)
execFunc := func() error {
cmd := exec.Command("azcopy", "copy", srcPath+srcAccountSASToken, dstPath+dstAccountSASToken, "--recursive", "--check-length=false", "--s2s-preserve-access-tier=false")
if len(authAzcopyEnv) > 0 {
cmd.Env = append(os.Environ(), authAzcopyEnv...)
}
if out, err := cmd.CombinedOutput(); err != nil {
if out, err := d.execAzcopyCopy(srcPath, dstPath, azcopyCloneVolumeOptions, authAzcopyEnv); err != nil {
return fmt.Errorf("exec error: %v, output: %v", err, string(out))
}
return nil
Expand All @@ -817,15 +809,38 @@ func (d *Driver) copyBlobContainer(authAzcopyEnv []string, srcPath string, srcAc
}
copyErr := util.WaitUntilTimeout(time.Duration(d.waitForAzCopyTimeoutMinutes)*time.Minute, execFunc, timeoutFunc)
if copyErr != nil {
klog.Warningf("CopyBlobContainer(%s, %s, %s) failed with error: %v", srcPath, dstPath, dstContainerName, copyErr)
klog.Warningf("CopyBlobContainer(%s, %s, %s) failed with error: %v", accountOptions.ResourceGroup, dstAccountName, dstContainerName, copyErr)
} else {
klog.V(2).Infof("copied blob container %s to %s successfully", srcPath, dstContainerName)
klog.V(2).Infof("copied blob container %s to %s successfully", srcContainerName, dstContainerName)
}
return copyErr
}
return err
}

// copyVolume copies a volume form volume or snapshot, snapshot is not supported now
func (d *Driver) copyVolume(ctx context.Context, req *csi.CreateVolumeRequest, accountName string, accountSASToken string, authAzcopyEnv []string, dstContainerName, secretNamespace string, accountOptions *azure.AccountOptions, storageEndpointSuffix string) error {
vs := req.VolumeContentSource
switch vs.Type.(type) {
case *csi.VolumeContentSource_Snapshot:
return status.Errorf(codes.InvalidArgument, "VolumeContentSource Snapshot is not yet implemented")
case *csi.VolumeContentSource_Volume:
return d.copyBlobContainer(ctx, req, accountName, accountSASToken, authAzcopyEnv, dstContainerName, secretNamespace, accountOptions, storageEndpointSuffix)
default:
return status.Errorf(codes.InvalidArgument, "%v is not a proper volume source", vs)
}
}

// execAzcopyCopy exec azcopy copy command
func (d *Driver) execAzcopyCopy(srcPath, dstPath string, azcopyCopyOptions, authAzcopyEnv []string) ([]byte, error) {
cmd := exec.Command("azcopy", "copy", srcPath, dstPath)
cmd.Args = append(cmd.Args, azcopyCopyOptions...)
if len(authAzcopyEnv) > 0 {
cmd.Env = append(os.Environ(), authAzcopyEnv...)
}
return cmd.CombinedOutput()
}

// authorizeAzcopyWithIdentity returns auth env for azcopy using cluster identity
func (d *Driver) authorizeAzcopyWithIdentity() ([]string, error) {
azureAuthConfig := d.cloud.Config.AzureAuthConfig
Expand Down
Loading
Loading