Skip to content

Commit 4626a6a

Browse files
committed
🐛: ec2/byoip: fix EIP leak when creating machine
The instance creation flow is creating by default EIP to instances even if the BYO IP flow is set. BYO IPv4 creates and associates the EIP to instance after it is created, preventing paying for additional EIP (amazon-provided) when creating the instance when the BYO IPv4 Pool is defined to be used by the machine. Furthermore, the fix provides additional checks to prevent duplicated EIP in the BYO IP reconciliation loop. The extra checks include running the EIP association many times, while the EIP is already associated, and failures in the log when running the EIP association prematurely - when the instance isn't ready, Eg ec2 in pending state.
1 parent 686a3f5 commit 4626a6a

File tree

8 files changed

+123
-57
lines changed

8 files changed

+123
-57
lines changed

controllers/awsmachine_controller.go

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -533,17 +533,20 @@ func (r *AWSMachineReconciler) reconcileNormal(_ context.Context, machineScope *
533533
}
534534

535535
// BYO Public IPv4 Pool feature: allocates and associates an EIP to machine when PublicIP and
536-
// cluster-wide Public IPv4 Pool configuration are set. The EIP must be associated after the instance
537-
// is created and transictioned from Pending state.
538-
// In the regular flow, if the instance have already a public IPv4 address (EIP) associated it will
539-
// be released when a new is assigned, the createInstance() prevents that behavior by enforcing
540-
// to not launch an instance with EIP, allowing ReconcileElasticIPFromPublicPool assigning
541-
// a BYOIP without duplication.
536+
// cluster-wide config Public IPv4 Pool configuration are set. The custom EIP is associated
537+
// after the instance is created and transictioned to Running state.
538+
// The CreateInstance() is enforcing to not assign public IP address when PublicIP is set with
539+
// BYOIpv4 Pool, preventing a duplicated EIP creation.
542540
if pool := machineScope.GetElasticIPPool(); pool != nil {
543-
if err := ec2svc.ReconcileElasticIPFromPublicPool(pool, instance); err != nil {
544-
machineScope.Error(err, "failed to associate elastic IP address")
541+
requeue, err := ec2svc.ReconcileElasticIPFromPublicPool(pool, instance)
542+
if err != nil {
543+
machineScope.Error(err, "Failed to reconcile BYO Public IPv4")
545544
return ctrl.Result{}, err
546545
}
546+
if requeue {
547+
machineScope.Debug("Found instance in pending state while reconciling publicIpv4Pool, requeue", "instance", instance.ID)
548+
return ctrl.Result{RequeueAfter: DefaultReconcilerRequeue}, nil
549+
}
547550
}
548551

549552
if feature.Gates.Enabled(feature.EventBridgeInstanceState) {

controllers/awsmachine_controller_unit_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1023,7 +1023,7 @@ func TestAWSMachineReconciler(t *testing.T) {
10231023
PublicIpv4Pool: aws.String("ipv4pool-ec2-0123456789abcdef0"),
10241024
PublicIpv4PoolFallBackOrder: ptr.To(infrav1.PublicIpv4PoolFallbackOrderAmazonPool),
10251025
}
1026-
ec2Svc.EXPECT().ReconcileElasticIPFromPublicPool(gomock.Any(), gomock.Any()).Return(nil).AnyTimes()
1026+
ec2Svc.EXPECT().ReconcileElasticIPFromPublicPool(gomock.Any(), gomock.Any()).Return(false, nil).AnyTimes()
10271027

10281028
_, err := reconciler.reconcileNormal(context.Background(), ms, cs, cs, cs, cs)
10291029
g.Expect(err).To(BeNil())

pkg/cloud/services/ec2/eip.go

Lines changed: 42 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66

77
"github.com/aws/aws-sdk-go/aws"
88
"github.com/aws/aws-sdk-go/service/ec2"
9+
"k8s.io/utils/ptr"
910

1011
infrav1 "sigs.k8s.io/cluster-api-provider-aws/v2/api/v1beta2"
1112
"sigs.k8s.io/cluster-api-provider-aws/v2/pkg/record"
@@ -16,17 +17,51 @@ func getElasticIPRoleName(instanceID string) string {
1617
}
1718

1819
// ReconcileElasticIPFromPublicPool reconciles the elastic IP from a custom Public IPv4 Pool.
19-
func (s *Service) ReconcileElasticIPFromPublicPool(pool *infrav1.ElasticIPPool, instance *infrav1.Instance) error {
20-
// TODO: check if the instance is in the state allowing EIP association.
21-
// Expected instance states: pending or running
22-
// https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-instance-lifecycle.html
20+
func (s *Service) ReconcileElasticIPFromPublicPool(pool *infrav1.ElasticIPPool, instance *infrav1.Instance) (bool, error) {
21+
shouldRequeue := true
22+
// Should not happen
23+
if pool == nil {
24+
return shouldRequeue, fmt.Errorf("unexpected behavior, pool must be set when reconcile ElasticIPPool")
25+
}
26+
iip := ptr.Deref(instance.PublicIP, "")
27+
s.scope.Debug("Reconciling machine with custom Public IPv4 Pool", "instance-id", instance.ID, "instance-state", instance.State, "instance-public-ip", iip, "publicIpv4PoolID", pool.PublicIpv4Pool)
28+
29+
// Requeue when the instance is not ready to be associated.
30+
if instance.State != infrav1.InstanceStateRunning {
31+
s.scope.Debug("Unable to reconcile Elastic IP Pool for instance", "instance-id", instance.ID, "instance-state", instance.State)
32+
return shouldRequeue, nil
33+
}
34+
35+
// All done, must reconcile only when the instance is in running state.
36+
shouldRequeue = false
37+
38+
// Prevent running association every reconciliation when it is already done.
39+
addrs, err := s.netService.GetAddresses(getElasticIPRoleName(instance.ID))
40+
if err != nil {
41+
s.scope.Error(err, "error checking if addresses exists for Elastic IP Pool to machine", "eip-role", getElasticIPRoleName(instance.ID))
42+
return shouldRequeue, err
43+
}
44+
if len(addrs.Addresses) > 0 {
45+
if len(addrs.Addresses) != 1 {
46+
return shouldRequeue, fmt.Errorf("unexpected number of EIPs allocated to the role. expected 1, got %d", len(addrs.Addresses))
47+
}
48+
addr := addrs.Addresses[0]
49+
// address is already associated.
50+
if addr.AssociationId != nil && addr.InstanceId != nil && *addr.InstanceId == instance.ID {
51+
s.scope.Debug("Machine is already associated with an Elastic IP with custom Public IPv4 pool", "eip", addr.AllocationId, "eip-address", addr.PublicIp, "eip-associationID", addr.AssociationId, "eip-instance", addr.InstanceId)
52+
return shouldRequeue, nil
53+
}
54+
}
55+
56+
// Get existing, or allocate an EIP, then Associate to the machine.
57+
// Should requeue if any error is returned in the process.
2358
if err := s.getAndAssociateAddressesToInstance(pool, getElasticIPRoleName(instance.ID), instance.ID); err != nil {
24-
return fmt.Errorf("failed to reconcile Elastic IP: %w", err)
59+
return true, fmt.Errorf("failed to reconcile Elastic IP: %w", err)
2560
}
26-
return nil
61+
return shouldRequeue, nil
2762
}
2863

29-
// ReleaseElasticIP releases a specific elastic IP based on the instance role.
64+
// ReleaseElasticIP releases a specific Elastic IP based on the instance role.
3065
func (s *Service) ReleaseElasticIP(instanceID string) error {
3166
return s.netService.ReleaseAddressByRole(getElasticIPRoleName(instanceID))
3267
}

pkg/cloud/services/ec2/instances.go

Lines changed: 7 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -572,21 +572,13 @@ func (s *Service) runInstance(role string, i *infrav1.Instance) (*infrav1.Instan
572572

573573
input.NetworkInterfaces = netInterfaces
574574
} else {
575-
if ptr.Deref(i.PublicIPOnLaunch, false) {
576-
input.NetworkInterfaces = []*ec2.InstanceNetworkInterfaceSpecification{
577-
{
578-
DeviceIndex: aws.Int64(0),
579-
SubnetId: aws.String(i.SubnetID),
580-
Groups: aws.StringSlice(i.SecurityGroupIDs),
581-
AssociatePublicIpAddress: i.PublicIPOnLaunch,
582-
},
583-
}
584-
} else {
585-
input.SubnetId = aws.String(i.SubnetID)
586-
587-
if len(i.SecurityGroupIDs) > 0 {
588-
input.SecurityGroupIds = aws.StringSlice(i.SecurityGroupIDs)
589-
}
575+
input.NetworkInterfaces = []*ec2.InstanceNetworkInterfaceSpecification{
576+
{
577+
DeviceIndex: aws.Int64(0),
578+
SubnetId: aws.String(i.SubnetID),
579+
Groups: aws.StringSlice(i.SecurityGroupIDs),
580+
AssociatePublicIpAddress: i.PublicIPOnLaunch,
581+
},
590582
}
591583
}
592584

pkg/cloud/services/ec2/instances_test.go

Lines changed: 48 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -672,11 +672,16 @@ func TestCreateInstance(t *testing.T) {
672672
}, nil)
673673
m.
674674
RunInstancesWithContext(context.TODO(), &ec2.RunInstancesInput{
675-
ImageId: aws.String("abc"),
676-
InstanceType: aws.String("m5.2xlarge"),
677-
KeyName: aws.String("default"),
678-
SecurityGroupIds: aws.StringSlice([]string{"2", "3"}),
679-
SubnetId: aws.String("subnet-3"),
675+
ImageId: aws.String("abc"),
676+
InstanceType: aws.String("m5.2xlarge"),
677+
KeyName: aws.String("default"),
678+
NetworkInterfaces: []*ec2.InstanceNetworkInterfaceSpecification{
679+
{
680+
DeviceIndex: aws.Int64(0),
681+
SubnetId: aws.String("subnet-3"),
682+
Groups: aws.StringSlice([]string{"2", "3"}),
683+
},
684+
},
680685
TagSpecifications: []*ec2.TagSpecification{
681686
{
682687
ResourceType: aws.String("instance"),
@@ -920,11 +925,16 @@ func TestCreateInstance(t *testing.T) {
920925
}, nil)
921926
m.
922927
RunInstancesWithContext(context.TODO(), &ec2.RunInstancesInput{
923-
ImageId: aws.String("abc"),
924-
InstanceType: aws.String("m5.2xlarge"),
925-
KeyName: aws.String("default"),
926-
SecurityGroupIds: aws.StringSlice([]string{"4", "3"}),
927-
SubnetId: aws.String("subnet-5"),
928+
ImageId: aws.String("abc"),
929+
InstanceType: aws.String("m5.2xlarge"),
930+
KeyName: aws.String("default"),
931+
NetworkInterfaces: []*ec2.InstanceNetworkInterfaceSpecification{
932+
{
933+
DeviceIndex: aws.Int64(0),
934+
SubnetId: aws.String("subnet-5"),
935+
Groups: aws.StringSlice([]string{"4", "3"}),
936+
},
937+
},
928938
TagSpecifications: []*ec2.TagSpecification{
929939
{
930940
ResourceType: aws.String("instance"),
@@ -3346,8 +3356,13 @@ func TestCreateInstance(t *testing.T) {
33463356
Placement: &ec2.Placement{
33473357
Tenancy: &tenancy,
33483358
},
3349-
SecurityGroupIds: []*string{aws.String("2"), aws.String("3")},
3350-
SubnetId: aws.String("subnet-1"),
3359+
NetworkInterfaces: []*ec2.InstanceNetworkInterfaceSpecification{
3360+
{
3361+
DeviceIndex: aws.Int64(0),
3362+
SubnetId: aws.String("subnet-1"),
3363+
Groups: []*string{aws.String("2"), aws.String("3")},
3364+
},
3365+
},
33513366
TagSpecifications: []*ec2.TagSpecification{
33523367
{
33533368
ResourceType: aws.String("instance"),
@@ -3555,8 +3570,13 @@ func TestCreateInstance(t *testing.T) {
35553570
Placement: &ec2.Placement{
35563571
GroupName: aws.String("placement-group1"),
35573572
},
3558-
SecurityGroupIds: []*string{aws.String("2"), aws.String("3")},
3559-
SubnetId: aws.String("subnet-1"),
3573+
NetworkInterfaces: []*ec2.InstanceNetworkInterfaceSpecification{
3574+
{
3575+
DeviceIndex: aws.Int64(0),
3576+
SubnetId: aws.String("subnet-1"),
3577+
Groups: []*string{aws.String("2"), aws.String("3")},
3578+
},
3579+
},
35603580
TagSpecifications: []*ec2.TagSpecification{
35613581
{
35623582
ResourceType: aws.String("instance"),
@@ -3785,8 +3805,13 @@ func TestCreateInstance(t *testing.T) {
37853805
Tenancy: &tenancy,
37863806
GroupName: aws.String("placement-group1"),
37873807
},
3788-
SecurityGroupIds: []*string{aws.String("2"), aws.String("3")},
3789-
SubnetId: aws.String("subnet-1"),
3808+
NetworkInterfaces: []*ec2.InstanceNetworkInterfaceSpecification{
3809+
{
3810+
DeviceIndex: aws.Int64(0),
3811+
SubnetId: aws.String("subnet-1"),
3812+
Groups: []*string{aws.String("2"), aws.String("3")},
3813+
},
3814+
},
37903815
TagSpecifications: []*ec2.TagSpecification{
37913816
{
37923817
ResourceType: aws.String("instance"),
@@ -3976,8 +4001,13 @@ func TestCreateInstance(t *testing.T) {
39764001
GroupName: aws.String("placement-group1"),
39774002
PartitionNumber: aws.Int64(2),
39784003
},
3979-
SecurityGroupIds: []*string{aws.String("2"), aws.String("3")},
3980-
SubnetId: aws.String("subnet-1"),
4004+
NetworkInterfaces: []*ec2.InstanceNetworkInterfaceSpecification{
4005+
{
4006+
DeviceIndex: aws.Int64(0),
4007+
SubnetId: aws.String("subnet-1"),
4008+
Groups: aws.StringSlice([]string{"2", "3"}),
4009+
},
4010+
},
39814011
TagSpecifications: []*ec2.TagSpecification{
39824012
{
39834013
ResourceType: aws.String("instance"),

pkg/cloud/services/interfaces.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ type EC2Interface interface {
8282
DeleteBastion() error
8383
ReconcileBastion() error
8484
// ReconcileElasticIPFromPublicPool reconciles the elastic IP from a custom Public IPv4 Pool.
85-
ReconcileElasticIPFromPublicPool(pool *infrav1.ElasticIPPool, instance *infrav1.Instance) error
85+
ReconcileElasticIPFromPublicPool(pool *infrav1.ElasticIPPool, instance *infrav1.Instance) (bool, error)
8686

8787
// ReleaseElasticIP reconciles the elastic IP from a custom Public IPv4 Pool.
8888
ReleaseElasticIP(instanceID string) error

pkg/cloud/services/mock_services/ec2_interface_mock.go

Lines changed: 4 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/cloud/services/network/eips.go

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -191,12 +191,17 @@ func (s *Service) GetOrAllocateAddresses(pool *infrav1.ElasticIPPool, num int, r
191191
return s.getOrAllocateAddresses(num, role, pool)
192192
}
193193

194+
// GetAddresses returns the address associated to a given role.
195+
func (s *Service) GetAddresses(role string) (*ec2.DescribeAddressesOutput, error) {
196+
return s.describeAddresses(role)
197+
}
198+
194199
// ReleaseAddressByRole releases EIP addresses filtering by tag CAPA provider role.
195200
func (s *Service) ReleaseAddressByRole(role string) error {
196-
clusterFilter := []*ec2.Filter{filter.EC2.Cluster(s.scope.Name())}
197-
clusterFilter = append(clusterFilter, filter.EC2.ProviderRole(role))
198-
199-
return s.releaseAddressesWithFilter(clusterFilter)
201+
return s.releaseAddressesWithFilter([]*ec2.Filter{
202+
filter.EC2.ClusterOwned(s.scope.Name()),
203+
filter.EC2.ProviderRole(role),
204+
})
200205
}
201206

202207
// setByoPublicIpv4 check if the config has Public IPv4 Pool defined, then

0 commit comments

Comments
 (0)