github.com/openshift/installer@v1.4.17/pkg/infrastructure/aws/clusterapi/aws.go (about) 1 package clusterapi 2 3 import ( 4 "context" 5 "errors" 6 "fmt" 7 "strings" 8 "time" 9 10 "github.com/aws/aws-sdk-go/aws" 11 "github.com/aws/aws-sdk-go/aws/awserr" 12 "github.com/aws/aws-sdk-go/aws/session" 13 "github.com/aws/aws-sdk-go/service/ec2" 14 "github.com/aws/aws-sdk-go/service/elbv2" 15 "github.com/aws/aws-sdk-go/service/s3" 16 "github.com/aws/aws-sdk-go/service/s3/s3manager" 17 "github.com/sirupsen/logrus" 18 k8serrors "k8s.io/apimachinery/pkg/api/errors" 19 "k8s.io/apimachinery/pkg/util/wait" 20 "k8s.io/utils/ptr" 21 capa "sigs.k8s.io/cluster-api-provider-aws/v2/api/v1beta2" 22 k8sClient "sigs.k8s.io/controller-runtime/pkg/client" 23 24 awsconfig "github.com/openshift/installer/pkg/asset/installconfig/aws" 25 awsmanifest "github.com/openshift/installer/pkg/asset/manifests/aws" 26 "github.com/openshift/installer/pkg/asset/manifests/capiutils" 27 "github.com/openshift/installer/pkg/infrastructure/clusterapi" 28 awstypes "github.com/openshift/installer/pkg/types/aws" 29 ) 30 31 var ( 32 _ clusterapi.Provider = (*Provider)(nil) 33 _ clusterapi.PreProvider = (*Provider)(nil) 34 _ clusterapi.InfraReadyProvider = (*Provider)(nil) 35 _ clusterapi.BootstrapDestroyer = (*Provider)(nil) 36 _ clusterapi.PostDestroyer = (*Provider)(nil) 37 38 errNotFound = errors.New("not found") 39 ) 40 41 // Provider implements AWS CAPI installation. 42 type Provider struct { 43 bestEffortDeleteIgnition bool 44 } 45 46 // Name gives the name of the provider, AWS. 47 func (*Provider) Name() string { return awstypes.Name } 48 49 // PublicGatherEndpoint indicates that machine ready checks should wait for an ExternalIP 50 // in the status and use that when gathering bootstrap log bundles. 51 func (*Provider) PublicGatherEndpoint() clusterapi.GatherEndpoint { return clusterapi.ExternalIP } 52 53 // PreProvision creates the IAM roles used by all nodes in the cluster. 54 func (*Provider) PreProvision(ctx context.Context, in clusterapi.PreProvisionInput) error { 55 if err := createIAMRoles(ctx, in.InfraID, in.InstallConfig); err != nil { 56 return fmt.Errorf("failed to create IAM roles: %w", err) 57 } 58 59 // The AWSMachine manifests might already have the AMI ID set from the machine pool which takes into account the 60 // ways in which the AMI can be specified: the default rhcos if already in the target region, a custom AMI ID set in 61 // platform.aws.amiID, and a custom AMI ID specified in the controlPlane stanza. So we just get the value from the 62 // first awsmachine manifest we find, instead of duplicating all the inheriting logic here. 63 for i := range in.MachineManifests { 64 if awsMachine, ok := in.MachineManifests[i].(*capa.AWSMachine); ok { 65 // Default/custom AMI already in target region, nothing else to do 66 if ptr.Deref(awsMachine.Spec.AMI.ID, "") != "" { 67 return nil 68 } 69 } 70 } 71 72 // Notice that we have to use the default RHCOS value because we set the AMI.ID to empty if the default RHCOS is not 73 // in the target region and it needs to be copied over. See pkg/asset/machines/clusterapi.go 74 amiID, err := copyAMIToRegion(ctx, in.InstallConfig, in.InfraID, in.RhcosImage) 75 if err != nil { 76 return fmt.Errorf("failed to copy AMI: %w", err) 77 } 78 // Update manifests with the new ID 79 for i := range in.MachineManifests { 80 if awsMachine, ok := in.MachineManifests[i].(*capa.AWSMachine); ok { 81 awsMachine.Spec.AMI.ID = ptr.To(amiID) 82 } 83 } 84 return nil 85 } 86 87 // InfraReady creates private hosted zone and DNS records. 88 func (*Provider) InfraReady(ctx context.Context, in clusterapi.InfraReadyInput) error { 89 awsCluster := &capa.AWSCluster{} 90 key := k8sClient.ObjectKey{ 91 Name: in.InfraID, 92 Namespace: capiutils.Namespace, 93 } 94 if err := in.Client.Get(ctx, key, awsCluster); err != nil { 95 return fmt.Errorf("failed to get AWSCluster: %w", err) 96 } 97 98 awsSession, err := in.InstallConfig.AWS.Session(ctx) 99 if err != nil { 100 return fmt.Errorf("failed to get aws session: %w", err) 101 } 102 103 subnetIDs := make([]string, 0, len(awsCluster.Spec.NetworkSpec.Subnets)) 104 for _, s := range awsCluster.Spec.NetworkSpec.Subnets { 105 subnetIDs = append(subnetIDs, s.ResourceID) 106 } 107 108 vpcID := awsCluster.Spec.NetworkSpec.VPC.ID 109 if len(subnetIDs) > 0 && len(vpcID) == 0 { 110 // All subnets belong to the same VPC, so we only need one 111 vpcID, err = getVPCFromSubnets(ctx, awsSession, awsCluster.Spec.Region, subnetIDs[:1]) 112 if err != nil { 113 return err 114 } 115 } 116 117 tags := map[string]string{ 118 fmt.Sprintf("kubernetes.io/cluster/%s", in.InfraID): "owned", 119 } 120 for k, v := range awsCluster.Spec.AdditionalTags { 121 tags[k] = v 122 } 123 124 client := awsconfig.NewClient(awsSession) 125 126 phzID := in.InstallConfig.Config.AWS.HostedZone 127 if len(phzID) == 0 { 128 logrus.Infoln("Creating private Hosted Zone") 129 res, err := client.CreateHostedZone(ctx, &awsconfig.HostedZoneInput{ 130 InfraID: in.InfraID, 131 VpcID: vpcID, 132 Region: awsCluster.Spec.Region, 133 Name: in.InstallConfig.Config.ClusterDomain(), 134 Role: in.InstallConfig.Config.AWS.HostedZoneRole, 135 UserTags: tags, 136 }) 137 if err != nil { 138 return fmt.Errorf("failed to create private hosted zone: %w", err) 139 } 140 phzID = aws.StringValue(res.Id) 141 } 142 143 logrus.Infoln("Creating Route53 records for control plane load balancer") 144 aliasZoneID, err := getHostedZoneIDForNLB(ctx, awsSession, awsCluster.Spec.Region, awsCluster.Status.Network.APIServerELB.Name) 145 if err != nil { 146 return fmt.Errorf("failed to find HostedZone ID for NLB: %w", err) 147 } 148 apiHost := awsCluster.Status.Network.SecondaryAPIServerELB.DNSName 149 if awsCluster.Status.Network.APIServerELB.Scheme == capa.ELBSchemeInternetFacing { 150 apiHost = awsCluster.Status.Network.APIServerELB.DNSName 151 } 152 apiIntHost := awsCluster.Spec.ControlPlaneEndpoint.Host 153 err = client.CreateOrUpdateRecord(ctx, in.InstallConfig.Config, apiHost, apiIntHost, phzID, aliasZoneID) 154 if err != nil { 155 return fmt.Errorf("failed to create route53 records: %w", err) 156 } 157 158 return nil 159 } 160 161 func getVPCFromSubnets(ctx context.Context, awsSession *session.Session, region string, subnetIDs []string) (string, error) { 162 var vpcID string 163 var lastError error 164 client := ec2.New(awsSession, aws.NewConfig().WithRegion(region)) 165 err := client.DescribeSubnetsPagesWithContext( 166 ctx, 167 &ec2.DescribeSubnetsInput{SubnetIds: aws.StringSlice(subnetIDs)}, 168 func(results *ec2.DescribeSubnetsOutput, lastPage bool) bool { 169 for _, subnet := range results.Subnets { 170 if subnet.SubnetId == nil { 171 continue 172 } 173 if subnet.SubnetArn == nil { 174 lastError = fmt.Errorf("%s has no ARN", *subnet.SubnetId) 175 return false 176 } 177 if subnet.VpcId == nil { 178 lastError = fmt.Errorf("%s has no VPC", *subnet.SubnetId) 179 return false 180 } 181 if subnet.AvailabilityZone == nil { 182 lastError = fmt.Errorf("%s has no availability zone", *subnet.SubnetId) 183 return false 184 } 185 // All subnets belong to the same VPC 186 vpcID = aws.StringValue(subnet.VpcId) 187 lastError = nil 188 return true 189 } 190 return !lastPage 191 }, 192 ) 193 if err == nil { 194 err = lastError 195 } 196 if err != nil { 197 return "", fmt.Errorf("failed to get VPC from subnets: %w", err) 198 } 199 200 return vpcID, nil 201 } 202 203 // getHostedZoneIDForNLB returns the HostedZone ID for a region from a known table or queries it from the LB instead. 204 func getHostedZoneIDForNLB(ctx context.Context, awsSession *session.Session, region string, lbName string) (string, error) { 205 if hzID, ok := awsconfig.HostedZoneIDPerRegionNLBMap[region]; ok { 206 return hzID, nil 207 } 208 // If the HostedZoneID is not known, query from the LoadBalancer 209 input := elbv2.DescribeLoadBalancersInput{ 210 Names: aws.StringSlice([]string{lbName}), 211 } 212 res, err := elbv2.New(awsSession).DescribeLoadBalancersWithContext(ctx, &input) 213 if err != nil { 214 var awsErr awserr.Error 215 if errors.As(err, &awsErr) && awsErr.Code() == elbv2.ErrCodeLoadBalancerNotFoundException { 216 return "", errNotFound 217 } 218 return "", fmt.Errorf("failed to list load balancers: %w", err) 219 } 220 for _, lb := range res.LoadBalancers { 221 return *lb.CanonicalHostedZoneId, nil 222 } 223 224 return "", errNotFound 225 } 226 227 // DestroyBootstrap removes aws bootstrap resources not handled 228 // by the deletion of the bootstrap machine by the capi controllers. 229 func (p *Provider) DestroyBootstrap(ctx context.Context, in clusterapi.BootstrapDestroyInput) error { 230 awsCluster := &capa.AWSCluster{} 231 key := k8sClient.ObjectKey{ 232 Name: in.Metadata.InfraID, 233 Namespace: capiutils.Namespace, 234 } 235 if err := in.Client.Get(ctx, key, awsCluster); err != nil { 236 return fmt.Errorf("failed to get AWSCluster: %w", err) 237 } 238 239 // Save this value for use in the post-destroy hook since we don't have capi running anymore by that point. 240 p.bestEffortDeleteIgnition = ptr.Deref(awsCluster.Spec.S3Bucket.BestEffortDeleteObjects, false) 241 242 var sgID string 243 if sg, ok := awsCluster.Status.Network.SecurityGroups[capa.SecurityGroupControlPlane]; ok && len(sg.ID) > 0 { 244 sgID = sg.ID 245 } else if ok { 246 return fmt.Errorf("control plane security group id is not populated in awscluster status") 247 } else { 248 keys := make([]capa.SecurityGroupRole, 0, len(awsCluster.Status.Network.SecurityGroups)) 249 for sgr := range awsCluster.Status.Network.SecurityGroups { 250 keys = append(keys, sgr) 251 } 252 return fmt.Errorf("controlplane not found in cluster security groups: %v", keys) 253 } 254 255 region := in.Metadata.ClusterPlatformMetadata.AWS.Region 256 session, err := awsconfig.GetSessionWithOptions( 257 awsconfig.WithRegion(region), 258 awsconfig.WithServiceEndpoints(region, in.Metadata.ClusterPlatformMetadata.AWS.ServiceEndpoints), 259 ) 260 if err != nil { 261 return fmt.Errorf("failed to create aws session: %w", err) 262 } 263 264 timeout := 15 * time.Minute 265 startTime := time.Now() 266 untilTime := startTime.Add(timeout) 267 timezone, _ := untilTime.Zone() 268 logrus.Debugf("Waiting up to %v (until %v %s) for bootstrap SSH rule to be destroyed...", timeout, untilTime.Format(time.Kitchen), timezone) 269 if err := wait.PollUntilContextTimeout(ctx, 15*time.Second, timeout, true, 270 func(ctx context.Context) (bool, error) { 271 if err := removeSSHRule(ctx, in.Client, in.Metadata.InfraID); err != nil { 272 // If the cluster object has been modified between Get and Update, k8s client will refuse to update it. 273 // In that case, we need to retry. 274 if k8serrors.IsConflict(err) { 275 logrus.Debugf("AWSCluster update conflict during SSH rule removal: %v", err) 276 return false, nil 277 } 278 return true, fmt.Errorf("failed to remove bootstrap SSH rule: %w", err) 279 } 280 return isSSHRuleGone(ctx, session, region, sgID) 281 }, 282 ); err != nil { 283 if wait.Interrupted(err) { 284 return fmt.Errorf("bootstrap ssh rule was not removed within %v: %w", timeout, err) 285 } 286 return fmt.Errorf("unable to remove bootstrap ssh rule: %w", err) 287 } 288 logrus.Debugf("Completed removing bootstrap SSH rule after %v", time.Since(startTime)) 289 290 return nil 291 } 292 293 // removeSSHRule removes the SSH rule for accessing the bootstrap node 294 // by removing the rule from the cluster spec and updating the object. 295 func removeSSHRule(ctx context.Context, cl k8sClient.Client, infraID string) error { 296 awsCluster := &capa.AWSCluster{} 297 key := k8sClient.ObjectKey{ 298 Name: infraID, 299 Namespace: capiutils.Namespace, 300 } 301 if err := cl.Get(ctx, key, awsCluster); err != nil { 302 return fmt.Errorf("failed to get AWSCluster: %w", err) 303 } 304 305 postBootstrapRules := []capa.IngressRule{} 306 for _, rule := range awsCluster.Spec.NetworkSpec.AdditionalControlPlaneIngressRules { 307 if strings.EqualFold(rule.Description, awsmanifest.BootstrapSSHDescription) { 308 continue 309 } 310 postBootstrapRules = append(postBootstrapRules, rule) 311 } 312 313 // The spec has not been updated yet 314 if len(postBootstrapRules) < len(awsCluster.Spec.NetworkSpec.AdditionalControlPlaneIngressRules) { 315 awsCluster.Spec.NetworkSpec.AdditionalControlPlaneIngressRules = postBootstrapRules 316 317 if err := cl.Update(ctx, awsCluster); err != nil { 318 return fmt.Errorf("failed to update AWSCluster during bootstrap destroy: %w", err) 319 } 320 logrus.Debug("Updated AWSCluster to remove bootstrap SSH rule") 321 } 322 323 return nil 324 } 325 326 // isSSHRuleGone checks that the Public SSH rule has been removed from the security group. 327 func isSSHRuleGone(ctx context.Context, session *session.Session, region, sgID string) (bool, error) { 328 sgs, err := awsconfig.DescribeSecurityGroups(ctx, session, []string{sgID}, region) 329 if err != nil { 330 return false, fmt.Errorf("error getting security group: %w", err) 331 } 332 333 if len(sgs) != 1 { 334 ids := []string{} 335 for _, sg := range sgs { 336 ids = append(ids, *sg.GroupId) 337 } 338 return false, fmt.Errorf("expected exactly one security group with id %s, but got %v", sgID, ids) 339 } 340 341 sg := sgs[0] 342 for _, rule := range sg.IpPermissions { 343 if ptr.Deref(rule.ToPort, 0) != 22 { 344 continue 345 } 346 for _, source := range rule.IpRanges { 347 if source.CidrIp != nil && *source.CidrIp == "0.0.0.0/0" { 348 ruleDesc := ptr.Deref(source.Description, "[no description]") 349 logrus.Debugf("Found ingress rule %s with source cidr %s. Still waiting for deletion...", ruleDesc, *source.CidrIp) 350 return false, nil 351 } 352 } 353 } 354 355 return true, nil 356 } 357 358 // PostDestroy deletes the ignition bucket after capi stopped running, so it won't try to reconcile the bucket. 359 func (p *Provider) PostDestroy(ctx context.Context, in clusterapi.PostDestroyerInput) error { 360 region := in.Metadata.AWS.Region 361 session, err := awsconfig.GetSessionWithOptions( 362 awsconfig.WithRegion(region), 363 awsconfig.WithServiceEndpoints(region, in.Metadata.AWS.ServiceEndpoints), 364 ) 365 if err != nil { 366 return fmt.Errorf("failed to create aws session: %w", err) 367 } 368 369 bucketName := awsmanifest.GetIgnitionBucketName(in.Metadata.InfraID) 370 if err := removeS3Bucket(ctx, session, bucketName); err != nil { 371 if p.bestEffortDeleteIgnition { 372 logrus.Warnf("failed to delete ignition bucket %s: %v", bucketName, err) 373 return nil 374 } 375 return fmt.Errorf("failed to delete ignition bucket %s: %w", bucketName, err) 376 } 377 378 return nil 379 } 380 381 // removeS3Bucket deletes an s3 bucket given its name. 382 func removeS3Bucket(ctx context.Context, session *session.Session, bucketName string) error { 383 client := s3.New(session) 384 385 iter := s3manager.NewDeleteListIterator(client, &s3.ListObjectsInput{ 386 Bucket: aws.String(bucketName), 387 }) 388 err := s3manager.NewBatchDeleteWithClient(client).Delete(ctx, iter) 389 if err != nil && !isBucketNotFound(err) { 390 return err 391 } 392 logrus.Debugf("bucket %q emptied", bucketName) 393 394 if _, err := client.DeleteBucketWithContext(ctx, &s3.DeleteBucketInput{Bucket: aws.String(bucketName)}); err != nil { 395 if isBucketNotFound(err) { 396 logrus.Debugf("bucket %q already deleted", bucketName) 397 return nil 398 } 399 return err 400 } 401 return nil 402 } 403 404 func isBucketNotFound(err interface{}) bool { 405 switch s3Err := err.(type) { 406 case awserr.Error: 407 if s3Err.Code() == s3.ErrCodeNoSuchBucket { 408 return true 409 } 410 origErr := s3Err.OrigErr() 411 if origErr != nil { 412 return isBucketNotFound(origErr) 413 } 414 case s3manager.Error: 415 if s3Err.OrigErr != nil { 416 return isBucketNotFound(s3Err.OrigErr) 417 } 418 case s3manager.Errors: 419 if len(s3Err) == 1 { 420 return isBucketNotFound(s3Err[0]) 421 } 422 } 423 return false 424 }