github.com/openshift/installer@v1.4.17/pkg/infrastructure/aws/clusterapi/aws.go (about)

     1  package clusterapi
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"fmt"
     7  	"strings"
     8  	"time"
     9  
    10  	"github.com/aws/aws-sdk-go/aws"
    11  	"github.com/aws/aws-sdk-go/aws/awserr"
    12  	"github.com/aws/aws-sdk-go/aws/session"
    13  	"github.com/aws/aws-sdk-go/service/ec2"
    14  	"github.com/aws/aws-sdk-go/service/elbv2"
    15  	"github.com/aws/aws-sdk-go/service/s3"
    16  	"github.com/aws/aws-sdk-go/service/s3/s3manager"
    17  	"github.com/sirupsen/logrus"
    18  	k8serrors "k8s.io/apimachinery/pkg/api/errors"
    19  	"k8s.io/apimachinery/pkg/util/wait"
    20  	"k8s.io/utils/ptr"
    21  	capa "sigs.k8s.io/cluster-api-provider-aws/v2/api/v1beta2"
    22  	k8sClient "sigs.k8s.io/controller-runtime/pkg/client"
    23  
    24  	awsconfig "github.com/openshift/installer/pkg/asset/installconfig/aws"
    25  	awsmanifest "github.com/openshift/installer/pkg/asset/manifests/aws"
    26  	"github.com/openshift/installer/pkg/asset/manifests/capiutils"
    27  	"github.com/openshift/installer/pkg/infrastructure/clusterapi"
    28  	awstypes "github.com/openshift/installer/pkg/types/aws"
    29  )
    30  
    31  var (
    32  	_ clusterapi.Provider           = (*Provider)(nil)
    33  	_ clusterapi.PreProvider        = (*Provider)(nil)
    34  	_ clusterapi.InfraReadyProvider = (*Provider)(nil)
    35  	_ clusterapi.BootstrapDestroyer = (*Provider)(nil)
    36  	_ clusterapi.PostDestroyer      = (*Provider)(nil)
    37  
    38  	errNotFound = errors.New("not found")
    39  )
    40  
// Provider implements AWS CAPI installation.
type Provider struct {
	// bestEffortDeleteIgnition is recorded by DestroyBootstrap from the
	// AWSCluster's S3 bucket settings and read by PostDestroy: when true, a
	// failure to delete the ignition bucket is logged as a warning instead of
	// being returned as an error.
	bestEffortDeleteIgnition bool
}
    45  
    46  // Name gives the name of the provider, AWS.
    47  func (*Provider) Name() string { return awstypes.Name }
    48  
    49  // PublicGatherEndpoint indicates that machine ready checks should wait for an ExternalIP
    50  // in the status and use that when gathering bootstrap log bundles.
    51  func (*Provider) PublicGatherEndpoint() clusterapi.GatherEndpoint { return clusterapi.ExternalIP }
    52  
    53  // PreProvision creates the IAM roles used by all nodes in the cluster.
    54  func (*Provider) PreProvision(ctx context.Context, in clusterapi.PreProvisionInput) error {
    55  	if err := createIAMRoles(ctx, in.InfraID, in.InstallConfig); err != nil {
    56  		return fmt.Errorf("failed to create IAM roles: %w", err)
    57  	}
    58  
    59  	// The AWSMachine manifests might already have the AMI ID set from the machine pool which takes into account the
    60  	// ways in which the AMI can be specified: the default rhcos if already in the target region, a custom AMI ID set in
    61  	// platform.aws.amiID, and a custom AMI ID specified in the controlPlane stanza. So we just get the value from the
    62  	// first awsmachine manifest we find, instead of duplicating all the inheriting logic here.
    63  	for i := range in.MachineManifests {
    64  		if awsMachine, ok := in.MachineManifests[i].(*capa.AWSMachine); ok {
    65  			// Default/custom AMI already in target region, nothing else to do
    66  			if ptr.Deref(awsMachine.Spec.AMI.ID, "") != "" {
    67  				return nil
    68  			}
    69  		}
    70  	}
    71  
    72  	// Notice that we have to use the default RHCOS value because we set the AMI.ID to empty if the default RHCOS is not
    73  	// in the target region and it needs to be copied over. See pkg/asset/machines/clusterapi.go
    74  	amiID, err := copyAMIToRegion(ctx, in.InstallConfig, in.InfraID, in.RhcosImage)
    75  	if err != nil {
    76  		return fmt.Errorf("failed to copy AMI: %w", err)
    77  	}
    78  	// Update manifests with the new ID
    79  	for i := range in.MachineManifests {
    80  		if awsMachine, ok := in.MachineManifests[i].(*capa.AWSMachine); ok {
    81  			awsMachine.Spec.AMI.ID = ptr.To(amiID)
    82  		}
    83  	}
    84  	return nil
    85  }
    86  
    87  // InfraReady creates private hosted zone and DNS records.
    88  func (*Provider) InfraReady(ctx context.Context, in clusterapi.InfraReadyInput) error {
    89  	awsCluster := &capa.AWSCluster{}
    90  	key := k8sClient.ObjectKey{
    91  		Name:      in.InfraID,
    92  		Namespace: capiutils.Namespace,
    93  	}
    94  	if err := in.Client.Get(ctx, key, awsCluster); err != nil {
    95  		return fmt.Errorf("failed to get AWSCluster: %w", err)
    96  	}
    97  
    98  	awsSession, err := in.InstallConfig.AWS.Session(ctx)
    99  	if err != nil {
   100  		return fmt.Errorf("failed to get aws session: %w", err)
   101  	}
   102  
   103  	subnetIDs := make([]string, 0, len(awsCluster.Spec.NetworkSpec.Subnets))
   104  	for _, s := range awsCluster.Spec.NetworkSpec.Subnets {
   105  		subnetIDs = append(subnetIDs, s.ResourceID)
   106  	}
   107  
   108  	vpcID := awsCluster.Spec.NetworkSpec.VPC.ID
   109  	if len(subnetIDs) > 0 && len(vpcID) == 0 {
   110  		// All subnets belong to the same VPC, so we only need one
   111  		vpcID, err = getVPCFromSubnets(ctx, awsSession, awsCluster.Spec.Region, subnetIDs[:1])
   112  		if err != nil {
   113  			return err
   114  		}
   115  	}
   116  
   117  	tags := map[string]string{
   118  		fmt.Sprintf("kubernetes.io/cluster/%s", in.InfraID): "owned",
   119  	}
   120  	for k, v := range awsCluster.Spec.AdditionalTags {
   121  		tags[k] = v
   122  	}
   123  
   124  	client := awsconfig.NewClient(awsSession)
   125  
   126  	phzID := in.InstallConfig.Config.AWS.HostedZone
   127  	if len(phzID) == 0 {
   128  		logrus.Infoln("Creating private Hosted Zone")
   129  		res, err := client.CreateHostedZone(ctx, &awsconfig.HostedZoneInput{
   130  			InfraID:  in.InfraID,
   131  			VpcID:    vpcID,
   132  			Region:   awsCluster.Spec.Region,
   133  			Name:     in.InstallConfig.Config.ClusterDomain(),
   134  			Role:     in.InstallConfig.Config.AWS.HostedZoneRole,
   135  			UserTags: tags,
   136  		})
   137  		if err != nil {
   138  			return fmt.Errorf("failed to create private hosted zone: %w", err)
   139  		}
   140  		phzID = aws.StringValue(res.Id)
   141  	}
   142  
   143  	logrus.Infoln("Creating Route53 records for control plane load balancer")
   144  	aliasZoneID, err := getHostedZoneIDForNLB(ctx, awsSession, awsCluster.Spec.Region, awsCluster.Status.Network.APIServerELB.Name)
   145  	if err != nil {
   146  		return fmt.Errorf("failed to find HostedZone ID for NLB: %w", err)
   147  	}
   148  	apiHost := awsCluster.Status.Network.SecondaryAPIServerELB.DNSName
   149  	if awsCluster.Status.Network.APIServerELB.Scheme == capa.ELBSchemeInternetFacing {
   150  		apiHost = awsCluster.Status.Network.APIServerELB.DNSName
   151  	}
   152  	apiIntHost := awsCluster.Spec.ControlPlaneEndpoint.Host
   153  	err = client.CreateOrUpdateRecord(ctx, in.InstallConfig.Config, apiHost, apiIntHost, phzID, aliasZoneID)
   154  	if err != nil {
   155  		return fmt.Errorf("failed to create route53 records: %w", err)
   156  	}
   157  
   158  	return nil
   159  }
   160  
   161  func getVPCFromSubnets(ctx context.Context, awsSession *session.Session, region string, subnetIDs []string) (string, error) {
   162  	var vpcID string
   163  	var lastError error
   164  	client := ec2.New(awsSession, aws.NewConfig().WithRegion(region))
   165  	err := client.DescribeSubnetsPagesWithContext(
   166  		ctx,
   167  		&ec2.DescribeSubnetsInput{SubnetIds: aws.StringSlice(subnetIDs)},
   168  		func(results *ec2.DescribeSubnetsOutput, lastPage bool) bool {
   169  			for _, subnet := range results.Subnets {
   170  				if subnet.SubnetId == nil {
   171  					continue
   172  				}
   173  				if subnet.SubnetArn == nil {
   174  					lastError = fmt.Errorf("%s has no ARN", *subnet.SubnetId)
   175  					return false
   176  				}
   177  				if subnet.VpcId == nil {
   178  					lastError = fmt.Errorf("%s has no VPC", *subnet.SubnetId)
   179  					return false
   180  				}
   181  				if subnet.AvailabilityZone == nil {
   182  					lastError = fmt.Errorf("%s has no availability zone", *subnet.SubnetId)
   183  					return false
   184  				}
   185  				// All subnets belong to the same VPC
   186  				vpcID = aws.StringValue(subnet.VpcId)
   187  				lastError = nil
   188  				return true
   189  			}
   190  			return !lastPage
   191  		},
   192  	)
   193  	if err == nil {
   194  		err = lastError
   195  	}
   196  	if err != nil {
   197  		return "", fmt.Errorf("failed to get VPC from subnets: %w", err)
   198  	}
   199  
   200  	return vpcID, nil
   201  }
   202  
   203  // getHostedZoneIDForNLB returns the HostedZone ID for a region from a known table or queries it from the LB instead.
   204  func getHostedZoneIDForNLB(ctx context.Context, awsSession *session.Session, region string, lbName string) (string, error) {
   205  	if hzID, ok := awsconfig.HostedZoneIDPerRegionNLBMap[region]; ok {
   206  		return hzID, nil
   207  	}
   208  	// If the HostedZoneID is not known, query from the LoadBalancer
   209  	input := elbv2.DescribeLoadBalancersInput{
   210  		Names: aws.StringSlice([]string{lbName}),
   211  	}
   212  	res, err := elbv2.New(awsSession).DescribeLoadBalancersWithContext(ctx, &input)
   213  	if err != nil {
   214  		var awsErr awserr.Error
   215  		if errors.As(err, &awsErr) && awsErr.Code() == elbv2.ErrCodeLoadBalancerNotFoundException {
   216  			return "", errNotFound
   217  		}
   218  		return "", fmt.Errorf("failed to list load balancers: %w", err)
   219  	}
   220  	for _, lb := range res.LoadBalancers {
   221  		return *lb.CanonicalHostedZoneId, nil
   222  	}
   223  
   224  	return "", errNotFound
   225  }
   226  
   227  // DestroyBootstrap removes aws bootstrap resources not handled
   228  // by the deletion of the bootstrap machine by the capi controllers.
   229  func (p *Provider) DestroyBootstrap(ctx context.Context, in clusterapi.BootstrapDestroyInput) error {
   230  	awsCluster := &capa.AWSCluster{}
   231  	key := k8sClient.ObjectKey{
   232  		Name:      in.Metadata.InfraID,
   233  		Namespace: capiutils.Namespace,
   234  	}
   235  	if err := in.Client.Get(ctx, key, awsCluster); err != nil {
   236  		return fmt.Errorf("failed to get AWSCluster: %w", err)
   237  	}
   238  
   239  	// Save this value for use in the post-destroy hook since we don't have capi running anymore by that point.
   240  	p.bestEffortDeleteIgnition = ptr.Deref(awsCluster.Spec.S3Bucket.BestEffortDeleteObjects, false)
   241  
   242  	var sgID string
   243  	if sg, ok := awsCluster.Status.Network.SecurityGroups[capa.SecurityGroupControlPlane]; ok && len(sg.ID) > 0 {
   244  		sgID = sg.ID
   245  	} else if ok {
   246  		return fmt.Errorf("control plane security group id is not populated in awscluster status")
   247  	} else {
   248  		keys := make([]capa.SecurityGroupRole, 0, len(awsCluster.Status.Network.SecurityGroups))
   249  		for sgr := range awsCluster.Status.Network.SecurityGroups {
   250  			keys = append(keys, sgr)
   251  		}
   252  		return fmt.Errorf("controlplane not found in cluster security groups: %v", keys)
   253  	}
   254  
   255  	region := in.Metadata.ClusterPlatformMetadata.AWS.Region
   256  	session, err := awsconfig.GetSessionWithOptions(
   257  		awsconfig.WithRegion(region),
   258  		awsconfig.WithServiceEndpoints(region, in.Metadata.ClusterPlatformMetadata.AWS.ServiceEndpoints),
   259  	)
   260  	if err != nil {
   261  		return fmt.Errorf("failed to create aws session: %w", err)
   262  	}
   263  
   264  	timeout := 15 * time.Minute
   265  	startTime := time.Now()
   266  	untilTime := startTime.Add(timeout)
   267  	timezone, _ := untilTime.Zone()
   268  	logrus.Debugf("Waiting up to %v (until %v %s) for bootstrap SSH rule to be destroyed...", timeout, untilTime.Format(time.Kitchen), timezone)
   269  	if err := wait.PollUntilContextTimeout(ctx, 15*time.Second, timeout, true,
   270  		func(ctx context.Context) (bool, error) {
   271  			if err := removeSSHRule(ctx, in.Client, in.Metadata.InfraID); err != nil {
   272  				// If the cluster object has been modified between Get and Update, k8s client will refuse to update it.
   273  				// In that case, we need to retry.
   274  				if k8serrors.IsConflict(err) {
   275  					logrus.Debugf("AWSCluster update conflict during SSH rule removal: %v", err)
   276  					return false, nil
   277  				}
   278  				return true, fmt.Errorf("failed to remove bootstrap SSH rule: %w", err)
   279  			}
   280  			return isSSHRuleGone(ctx, session, region, sgID)
   281  		},
   282  	); err != nil {
   283  		if wait.Interrupted(err) {
   284  			return fmt.Errorf("bootstrap ssh rule was not removed within %v: %w", timeout, err)
   285  		}
   286  		return fmt.Errorf("unable to remove bootstrap ssh rule: %w", err)
   287  	}
   288  	logrus.Debugf("Completed removing bootstrap SSH rule after %v", time.Since(startTime))
   289  
   290  	return nil
   291  }
   292  
   293  // removeSSHRule removes the SSH rule for accessing the bootstrap node
   294  // by removing the rule from the cluster spec and updating the object.
   295  func removeSSHRule(ctx context.Context, cl k8sClient.Client, infraID string) error {
   296  	awsCluster := &capa.AWSCluster{}
   297  	key := k8sClient.ObjectKey{
   298  		Name:      infraID,
   299  		Namespace: capiutils.Namespace,
   300  	}
   301  	if err := cl.Get(ctx, key, awsCluster); err != nil {
   302  		return fmt.Errorf("failed to get AWSCluster: %w", err)
   303  	}
   304  
   305  	postBootstrapRules := []capa.IngressRule{}
   306  	for _, rule := range awsCluster.Spec.NetworkSpec.AdditionalControlPlaneIngressRules {
   307  		if strings.EqualFold(rule.Description, awsmanifest.BootstrapSSHDescription) {
   308  			continue
   309  		}
   310  		postBootstrapRules = append(postBootstrapRules, rule)
   311  	}
   312  
   313  	// The spec has not been updated yet
   314  	if len(postBootstrapRules) < len(awsCluster.Spec.NetworkSpec.AdditionalControlPlaneIngressRules) {
   315  		awsCluster.Spec.NetworkSpec.AdditionalControlPlaneIngressRules = postBootstrapRules
   316  
   317  		if err := cl.Update(ctx, awsCluster); err != nil {
   318  			return fmt.Errorf("failed to update AWSCluster during bootstrap destroy: %w", err)
   319  		}
   320  		logrus.Debug("Updated AWSCluster to remove bootstrap SSH rule")
   321  	}
   322  
   323  	return nil
   324  }
   325  
   326  // isSSHRuleGone checks that the Public SSH rule has been removed from the security group.
   327  func isSSHRuleGone(ctx context.Context, session *session.Session, region, sgID string) (bool, error) {
   328  	sgs, err := awsconfig.DescribeSecurityGroups(ctx, session, []string{sgID}, region)
   329  	if err != nil {
   330  		return false, fmt.Errorf("error getting security group: %w", err)
   331  	}
   332  
   333  	if len(sgs) != 1 {
   334  		ids := []string{}
   335  		for _, sg := range sgs {
   336  			ids = append(ids, *sg.GroupId)
   337  		}
   338  		return false, fmt.Errorf("expected exactly one security group with id %s, but got %v", sgID, ids)
   339  	}
   340  
   341  	sg := sgs[0]
   342  	for _, rule := range sg.IpPermissions {
   343  		if ptr.Deref(rule.ToPort, 0) != 22 {
   344  			continue
   345  		}
   346  		for _, source := range rule.IpRanges {
   347  			if source.CidrIp != nil && *source.CidrIp == "0.0.0.0/0" {
   348  				ruleDesc := ptr.Deref(source.Description, "[no description]")
   349  				logrus.Debugf("Found ingress rule %s with source cidr %s. Still waiting for deletion...", ruleDesc, *source.CidrIp)
   350  				return false, nil
   351  			}
   352  		}
   353  	}
   354  
   355  	return true, nil
   356  }
   357  
   358  // PostDestroy deletes the ignition bucket after capi stopped running, so it won't try to reconcile the bucket.
   359  func (p *Provider) PostDestroy(ctx context.Context, in clusterapi.PostDestroyerInput) error {
   360  	region := in.Metadata.AWS.Region
   361  	session, err := awsconfig.GetSessionWithOptions(
   362  		awsconfig.WithRegion(region),
   363  		awsconfig.WithServiceEndpoints(region, in.Metadata.AWS.ServiceEndpoints),
   364  	)
   365  	if err != nil {
   366  		return fmt.Errorf("failed to create aws session: %w", err)
   367  	}
   368  
   369  	bucketName := awsmanifest.GetIgnitionBucketName(in.Metadata.InfraID)
   370  	if err := removeS3Bucket(ctx, session, bucketName); err != nil {
   371  		if p.bestEffortDeleteIgnition {
   372  			logrus.Warnf("failed to delete ignition bucket %s: %v", bucketName, err)
   373  			return nil
   374  		}
   375  		return fmt.Errorf("failed to delete ignition bucket %s: %w", bucketName, err)
   376  	}
   377  
   378  	return nil
   379  }
   380  
   381  // removeS3Bucket deletes an s3 bucket given its name.
   382  func removeS3Bucket(ctx context.Context, session *session.Session, bucketName string) error {
   383  	client := s3.New(session)
   384  
   385  	iter := s3manager.NewDeleteListIterator(client, &s3.ListObjectsInput{
   386  		Bucket: aws.String(bucketName),
   387  	})
   388  	err := s3manager.NewBatchDeleteWithClient(client).Delete(ctx, iter)
   389  	if err != nil && !isBucketNotFound(err) {
   390  		return err
   391  	}
   392  	logrus.Debugf("bucket %q emptied", bucketName)
   393  
   394  	if _, err := client.DeleteBucketWithContext(ctx, &s3.DeleteBucketInput{Bucket: aws.String(bucketName)}); err != nil {
   395  		if isBucketNotFound(err) {
   396  			logrus.Debugf("bucket %q already deleted", bucketName)
   397  			return nil
   398  		}
   399  		return err
   400  	}
   401  	return nil
   402  }
   403  
   404  func isBucketNotFound(err interface{}) bool {
   405  	switch s3Err := err.(type) {
   406  	case awserr.Error:
   407  		if s3Err.Code() == s3.ErrCodeNoSuchBucket {
   408  			return true
   409  		}
   410  		origErr := s3Err.OrigErr()
   411  		if origErr != nil {
   412  			return isBucketNotFound(origErr)
   413  		}
   414  	case s3manager.Error:
   415  		if s3Err.OrigErr != nil {
   416  			return isBucketNotFound(s3Err.OrigErr)
   417  		}
   418  	case s3manager.Errors:
   419  		if len(s3Err) == 1 {
   420  			return isBucketNotFound(s3Err[0])
   421  		}
   422  	}
   423  	return false
   424  }