Skip to content

Commit 6dfd0cb

Browse files
[azeventhubs] Fixing checkpoint store race condition (#20727)
The checkpoint store wasn't guarding against multiple owners claiming ownership for the first time - fixing this by using IfNoneMatch. Fixes #20717
1 parent 745d967 commit 6dfd0cb

File tree

4 files changed

+183
-26
lines changed

4 files changed

+183
-26
lines changed

sdk/messaging/azeventhubs/CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
- Potential leaks for $cbs and $management when there was a partial failure. (PR#20564)
1515
- Latest go-amqp changes have been merged in with fixes for robustness.
1616
- Sending a message to an entity that is full will no longer retry. (PR#20722)
17+
- Checkpoint store handles multiple initial owners properly, allowing only one through. (PR#20727)
1718

1819
## 0.6.0 (2023-03-07)
1920

sdk/messaging/azeventhubs/checkpoints/blob_store.go

+54-26
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,9 @@ func NewBlobStore(containerClient *container.Client, options *BlobStoreOptions)
4444

4545
// ClaimOwnership attempts to claim ownership of the partitions in partitionOwnership and returns
4646
// the actual partitions that were claimed.
47+
//
48+
// If we fail to claim ownership of a partition because of another update then that partition will be omitted from the
49+
// returned slice of [Ownership] values. It is not considered an error.
4750
func (b *BlobStore) ClaimOwnership(ctx context.Context, partitionOwnership []azeventhubs.Ownership, options *azeventhubs.ClaimOwnershipOptions) ([]azeventhubs.Ownership, error) {
4851
var ownerships []azeventhubs.Ownership
4952

@@ -54,13 +57,12 @@ func (b *BlobStore) ClaimOwnership(ctx context.Context, partitionOwnership []aze
5457
if err != nil {
5558
return nil, err
5659
}
57-
lastModified, etag, err := b.setMetadata(ctx, blobName, newOwnershipBlobMetadata(po), po.ETag)
60+
lastModified, etag, err := b.setOwnershipMetadata(ctx, blobName, po)
5861

5962
if err != nil {
60-
if bloberror.HasCode(err, bloberror.ConditionNotMet) {
61-
// we can fail to claim ownership and that's okay - it's expected that clients will
62-
// attempt to claim with whatever state they hold locally. If they fail it just means
63-
// someone else claimed ownership before them.
63+
if bloberror.HasCode(err,
64+
bloberror.ConditionNotMet, // updated before we could update it
65+
bloberror.BlobAlreadyExists) { // created before we could create it
6466
continue
6567
}
6668

@@ -179,25 +181,28 @@ func (b *BlobStore) ListOwnership(ctx context.Context, fullyQualifiedNamespace s
179181
}
180182

181183
// UpdateCheckpoint updates a specific checkpoint with a sequence and offset.
184+
//
185+
// NOTE: This function doesn't attempt to prevent simultaneous checkpoint updates - ownership is assumed.
182186
func (b *BlobStore) UpdateCheckpoint(ctx context.Context, checkpoint azeventhubs.Checkpoint, options *azeventhubs.UpdateCheckpointOptions) error {
183187
blobName, err := nameForCheckpointBlob(checkpoint)
184188

185189
if err != nil {
186190
return err
187191
}
188192

189-
_, _, err = b.setMetadata(ctx, blobName, newCheckpointBlobMetadata(checkpoint), nil)
193+
_, _, err = b.setCheckpointMetadata(ctx, blobName, checkpoint)
190194
return err
191195
}
192196

193-
func (b *BlobStore) setMetadata(ctx context.Context, blobName string, blobMetadata map[string]*string, etag *azcore.ETag) (*time.Time, azcore.ETag, error) {
197+
func (b *BlobStore) setOwnershipMetadata(ctx context.Context, blobName string, ownership azeventhubs.Ownership) (*time.Time, azcore.ETag, error) {
198+
blobMetadata := newOwnershipBlobMetadata(ownership)
194199
blobClient := b.cc.NewBlockBlobClient(blobName)
195200

196-
if etag != nil {
201+
if ownership.ETag != nil {
197202
setMetadataResp, err := blobClient.SetMetadata(ctx, blobMetadata, &blob.SetMetadataOptions{
198203
AccessConditions: &blob.AccessConditions{
199204
ModifiedAccessConditions: &blob.ModifiedAccessConditions{
200-
IfMatch: etag,
205+
IfMatch: ownership.ETag,
201206
},
202207
},
203208
})
@@ -207,29 +212,52 @@ func (b *BlobStore) setMetadata(ctx context.Context, blobName string, blobMetada
207212
}
208213

209214
return setMetadataResp.LastModified, *setMetadataResp.ETag, nil
210-
} else {
211-
setMetadataResp, err := blobClient.SetMetadata(ctx, blobMetadata, nil)
215+
}
212216

213-
if err == nil {
214-
return setMetadataResp.LastModified, *setMetadataResp.ETag, nil
215-
}
217+
uploadResp, err := blobClient.Upload(ctx, streaming.NopCloser(bytes.NewReader([]byte{})), &blockblob.UploadOptions{
218+
Metadata: blobMetadata,
219+
AccessConditions: &blob.AccessConditions{
220+
ModifiedAccessConditions: &blob.ModifiedAccessConditions{
221+
IfNoneMatch: to.Ptr(azcore.ETag("*")),
222+
},
223+
},
224+
})
216225

217-
if !bloberror.HasCode(err, bloberror.BlobNotFound) {
218-
return nil, "", err
219-
}
226+
if err != nil {
227+
return nil, "", err
228+
}
220229

221-
// in JS they check to see if the error is BlobNotFound. If it is, then they
222-
// do a full upload of a blob instead.
223-
uploadResp, err := blobClient.Upload(ctx, streaming.NopCloser(bytes.NewReader([]byte{})), &blockblob.UploadOptions{
224-
Metadata: blobMetadata,
225-
})
230+
return uploadResp.LastModified, *uploadResp.ETag, nil
231+
}
226232

227-
if err != nil {
228-
return nil, "", err
229-
}
233+
// setCheckpointMetadata sets the metadata for a checkpoint, falling back to creating
234+
// the blob if it doesn't already exist.
235+
//
236+
// NOTE: unlike [setOwnershipMetadata] this function doesn't attempt to prevent simultaneous
237+
// checkpoint updates - ownership is assumed.
238+
func (b *BlobStore) setCheckpointMetadata(ctx context.Context, blobName string, checkpoint azeventhubs.Checkpoint) (*time.Time, azcore.ETag, error) {
239+
blobMetadata := newCheckpointBlobMetadata(checkpoint)
240+
blobClient := b.cc.NewBlockBlobClient(blobName)
241+
242+
setMetadataResp, err := blobClient.SetMetadata(ctx, blobMetadata, nil)
243+
244+
if err == nil {
245+
return setMetadataResp.LastModified, *setMetadataResp.ETag, nil
246+
}
230247

231-
return uploadResp.LastModified, *uploadResp.ETag, nil
248+
if !bloberror.HasCode(err, bloberror.BlobNotFound) {
249+
return nil, "", err
232250
}
251+
252+
uploadResp, err := blobClient.Upload(ctx, streaming.NopCloser(bytes.NewReader([]byte{})), &blockblob.UploadOptions{
253+
Metadata: blobMetadata,
254+
})
255+
256+
if err != nil {
257+
return nil, "", err
258+
}
259+
260+
return uploadResp.LastModified, *uploadResp.ETag, nil
233261
}
234262

235263
func nameForCheckpointBlob(a azeventhubs.Checkpoint) (string, error) {

sdk/messaging/azeventhubs/checkpoints/blob_store_test.go

+121
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ package checkpoints_test
44

55
import (
66
"context"
7+
"fmt"
78
"os"
89
"strconv"
910
"testing"
@@ -216,6 +217,126 @@ func TestBlobStore_ListAndClaim(t *testing.T) {
216217
require.Empty(t, claimedOwnerships)
217218
}
218219

220+
func TestBlobStore_OnlyOneOwnershipClaimSucceeds(t *testing.T) {
221+
testData := getContainerClient(t)
222+
defer testData.Cleanup()
223+
224+
cc, err := container.NewClientFromConnectionString(testData.ConnectionString, testData.ContainerName, nil)
225+
require.NoError(t, err)
226+
227+
store, err := checkpoints.NewBlobStore(cc, nil)
228+
require.NoError(t, err)
229+
230+
// we're going to make multiple calls to the blob store but only _one_ should succeed
231+
// since it's "first one in wins"
232+
claimsCh := make(chan []azeventhubs.Ownership, 20)
233+
234+
t.Logf("Starting %d goroutines to claim ownership without an etag", cap(claimsCh))
235+
236+
// attempt to claim the same partition from multiple goroutines. Only _one_ of the
237+
// goroutines should walk away thinking it claimed the partition.
238+
for i := 0; i < cap(claimsCh); i++ {
239+
go func() {
240+
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
241+
defer cancel()
242+
243+
ownerships, err := store.ClaimOwnership(ctx, []azeventhubs.Ownership{
244+
{ConsumerGroup: azeventhubs.DefaultConsumerGroup, EventHubName: "name", FullyQualifiedNamespace: "ns", PartitionID: "0", OwnerID: "ownerID"},
245+
}, nil)
246+
247+
if err != nil {
248+
claimsCh <- nil
249+
require.NoError(t, err)
250+
} else {
251+
claimsCh <- ownerships
252+
}
253+
}()
254+
}
255+
256+
claimed := map[string]bool{}
257+
numFailedClaims := 0
258+
259+
for i := 0; i < cap(claimsCh); i++ {
260+
claims := <-claimsCh
261+
262+
if claims == nil {
263+
numFailedClaims++
264+
continue
265+
}
266+
267+
for _, claim := range claims {
268+
require.False(t, claimed[claim.PartitionID], fmt.Sprintf("Partition ID %s was claimed more than once", claim.PartitionID))
269+
require.NotNil(t, claim.ETag)
270+
claimed[claim.PartitionID] = true
271+
}
272+
}
273+
274+
require.Equal(t, cap(claimsCh)-1, numFailedClaims, fmt.Sprintf("One of the 1/%d wins and the rest all fail to claim", cap(claimsCh)))
275+
}
276+
277+
func TestBlobStore_OnlyOneOwnershipUpdateSucceeds(t *testing.T) {
278+
testData := getContainerClient(t)
279+
defer testData.Cleanup()
280+
281+
cc, err := container.NewClientFromConnectionString(testData.ConnectionString, testData.ContainerName, nil)
282+
require.NoError(t, err)
283+
284+
store, err := checkpoints.NewBlobStore(cc, nil)
285+
require.NoError(t, err)
286+
287+
// we're going to make multiple calls to the blob store but only _one_ should succeed
288+
// since it's "first one in wins"
289+
claimsCh := make(chan []azeventhubs.Ownership, 20)
290+
291+
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
292+
defer cancel()
293+
294+
ownerships, err := store.ClaimOwnership(ctx, []azeventhubs.Ownership{
295+
{ConsumerGroup: azeventhubs.DefaultConsumerGroup, EventHubName: "name", FullyQualifiedNamespace: "ns", PartitionID: "0", OwnerID: "ownerID"},
296+
}, nil)
297+
require.NoError(t, err)
298+
require.Equal(t, "0", ownerships[0].PartitionID)
299+
require.NotNil(t, ownerships[0].ETag)
300+
301+
t.Logf("Starting %d goroutines to claim ownership without an etag", cap(claimsCh))
302+
303+
// attempt to claim the same partition from multiple goroutines. Only _one_ of the
304+
// goroutines should walk away thinking it claimed the partition.
305+
for i := 0; i < cap(claimsCh); i++ {
306+
go func() {
307+
308+
ownerships, err := store.ClaimOwnership(ctx, ownerships, nil)
309+
310+
if err != nil {
311+
claimsCh <- nil
312+
require.NoError(t, err)
313+
} else {
314+
claimsCh <- ownerships
315+
}
316+
}()
317+
}
318+
319+
claimed := map[string]bool{}
320+
numFailedClaims := 0
321+
322+
for i := 0; i < cap(claimsCh); i++ {
323+
claims := <-claimsCh
324+
325+
if claims == nil {
326+
numFailedClaims++
327+
continue
328+
}
329+
330+
for _, claim := range claims {
331+
require.False(t, claimed[claim.PartitionID], fmt.Sprintf("Partition ID %s was claimed more than once", claim.PartitionID))
332+
require.NotNil(t, claim.ETag)
333+
claimed[claim.PartitionID] = true
334+
}
335+
}
336+
337+
require.Equal(t, cap(claimsCh)-1, numFailedClaims, fmt.Sprintf("One of the 1/%d wins and the rest all fail to claim", cap(claimsCh)))
338+
}
339+
219340
func getContainerClient(t *testing.T) struct {
220341
ConnectionString string
221342
ContainerName string

sdk/messaging/azeventhubs/internal/test/test_helpers.go

+7
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,13 @@ func CaptureLogsForTest() func() []string {
4040

4141
func CaptureLogsForTestWithChannel(messagesCh chan string) func() []string {
4242
setAzLogListener(func(e azlog.Event, s string) {
43+
defer func() {
44+
if err := recover(); err != nil {
45+
fmt.Printf("FAILED SENDING MESSAGE (%s), message was: [%s] %s\n", err, e, s)
46+
panic(err)
47+
}
48+
}()
49+
4350
messagesCh <- fmt.Sprintf("[%s] %s", e, s)
4451
})
4552

0 commit comments

Comments
 (0)