sigs.k8s.io/cluster-api-provider-azure@v1.14.3/test/e2e/helpers.go (about)

     1  //go:build e2e
     2  // +build e2e
     3  
     4  /*
     5  Copyright 2020 The Kubernetes Authors.
     6  
     7  Licensed under the Apache License, Version 2.0 (the "License");
     8  you may not use this file except in compliance with the License.
     9  You may obtain a copy of the License at
    10  
    11      http://www.apache.org/licenses/LICENSE-2.0
    12  
    13  Unless required by applicable law or agreed to in writing, software
    14  distributed under the License is distributed on an "AS IS" BASIS,
    15  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    16  See the License for the specific language governing permissions and
    17  limitations under the License.
    18  */
    19  
    20  package e2e
    21  
    22  import (
    23  	"bytes"
    24  	"context"
    25  	"encoding/json"
    26  	"fmt"
    27  	"io"
    28  	"net"
    29  	"net/http"
    30  	"os"
    31  	"path/filepath"
    32  	"regexp"
    33  	"strconv"
    34  	"strings"
    35  	"text/tabwriter"
    36  	"time"
    37  
    38  	"github.com/Azure/azure-sdk-for-go/sdk/azidentity"
    39  	"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v5"
    40  	"github.com/blang/semver"
    41  	. "github.com/onsi/ginkgo/v2"
    42  	. "github.com/onsi/gomega"
    43  	"github.com/pkg/errors"
    44  	"github.com/pkg/sftp"
    45  	"golang.org/x/crypto/ssh"
    46  	appsv1 "k8s.io/api/apps/v1"
    47  	batchv1 "k8s.io/api/batch/v1"
    48  	corev1 "k8s.io/api/core/v1"
    49  	apierrors "k8s.io/apimachinery/pkg/api/errors"
    50  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    51  	"k8s.io/client-go/kubernetes"
    52  	typedappsv1 "k8s.io/client-go/kubernetes/typed/apps/v1"
    53  	typedbatchv1 "k8s.io/client-go/kubernetes/typed/batch/v1"
    54  	typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1"
    55  	infrav1 "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1"
    56  	"sigs.k8s.io/cluster-api-provider-azure/azure"
    57  	infrav1exp "sigs.k8s.io/cluster-api-provider-azure/exp/api/v1beta1"
    58  	capi_e2e "sigs.k8s.io/cluster-api/test/e2e"
    59  	"sigs.k8s.io/cluster-api/test/framework"
    60  	"sigs.k8s.io/cluster-api/test/framework/clusterctl"
    61  	"sigs.k8s.io/cluster-api/test/framework/kubernetesversions"
    62  	"sigs.k8s.io/controller-runtime/pkg/client"
    63  )
    64  
// Shared constants for the e2e helpers in this file.
//
// sshConnectionTimeout is used by newSSHConfig as the ssh.ClientConfig.Timeout.
// The other values are not referenced in this part of the file; judging by
// their names they are the SSH port (as a string) and the timeouts/poll
// intervals for delete, retryable, and Helm install operations — confirm
// against their call sites before changing them.
const (
	sshPort                               = "22"
	deleteOperationTimeout                = 20 * time.Minute
	retryableOperationTimeout             = 30 * time.Second
	retryableDeleteOperationTimeout       = 3 * time.Minute
	retryableOperationSleepBetweenRetries = 3 * time.Second
	helmInstallTimeout                    = 3 * time.Minute
	sshConnectionTimeout                  = 30 * time.Second
)
    74  
    75  // deploymentsClientAdapter adapts a Deployment to work with WaitForDeploymentsAvailable.
    76  type deploymentsClientAdapter struct {
    77  	client typedappsv1.DeploymentInterface
    78  }
    79  
    80  // Get fetches the deployment named by the key and updates the provided object.
    81  func (c deploymentsClientAdapter) Get(ctx context.Context, key client.ObjectKey, obj client.Object, opts ...client.GetOption) error {
    82  	deployment, err := c.client.Get(ctx, key.Name, metav1.GetOptions{})
    83  	if deployObj, ok := obj.(*appsv1.Deployment); ok {
    84  		deployment.DeepCopyInto(deployObj)
    85  	}
    86  	return err
    87  }
    88  
// WaitForDeploymentsAvailableInput is the input for WaitForDeploymentsAvailable.
//
// Getter is called on every poll to refresh Deployment. Deployment supplies the
// namespace/name to wait for and receives the refreshed object. Clientset is
// handed to describeEvents when a failure description is built; describeEvents
// tolerates a nil Clientset.
type WaitForDeploymentsAvailableInput struct {
	Getter     framework.Getter
	Deployment *appsv1.Deployment
	Clientset  *kubernetes.Clientset
}
    95  
    96  // WaitForDeploymentsAvailable waits until the Deployment has status.Available = True, that signals that
    97  // all the desired replicas are in place.
    98  // This can be used to check if Cluster API controllers installed in the management cluster are working.
    99  func WaitForDeploymentsAvailable(ctx context.Context, input WaitForDeploymentsAvailableInput, intervals ...interface{}) {
   100  	start := time.Now()
   101  	namespace, name := input.Deployment.GetNamespace(), input.Deployment.GetName()
   102  	Byf("waiting for deployment %s/%s to be available", namespace, name)
   103  	Log("starting to wait for deployment to become available")
   104  	Eventually(func() bool {
   105  		key := client.ObjectKey{Namespace: namespace, Name: name}
   106  		if err := input.Getter.Get(ctx, key, input.Deployment); err == nil {
   107  			for _, c := range input.Deployment.Status.Conditions {
   108  				if c.Type == appsv1.DeploymentAvailable && c.Status == corev1.ConditionTrue {
   109  					return true
   110  				}
   111  			}
   112  		}
   113  		return false
   114  	}, intervals...).Should(BeTrue(), func() string { return DescribeFailedDeployment(ctx, input) })
   115  	Logf("Deployment %s/%s is now available, took %v", namespace, name, time.Since(start))
   116  }
   117  
   118  // GetWaitForDeploymentsAvailableInput is a convenience func to compose a WaitForDeploymentsAvailableInput
   119  func GetWaitForDeploymentsAvailableInput(ctx context.Context, clusterProxy framework.ClusterProxy, name, namespace string, specName string) WaitForDeploymentsAvailableInput {
   120  	Expect(clusterProxy).NotTo(BeNil())
   121  	cl := clusterProxy.GetClient()
   122  	var d = &appsv1.Deployment{}
   123  	Eventually(func() error {
   124  		return cl.Get(ctx, client.ObjectKey{Name: name, Namespace: namespace}, d)
   125  	}, e2eConfig.GetIntervals(specName, "wait-deployment")...).Should(Succeed())
   126  	clientset := clusterProxy.GetClientSet()
   127  	return WaitForDeploymentsAvailableInput{
   128  		Deployment: d,
   129  		Clientset:  clientset,
   130  		Getter:     cl,
   131  	}
   132  }
   133  
   134  // DescribeFailedDeployment returns detailed output to help debug a deployment failure in e2e.
   135  func DescribeFailedDeployment(ctx context.Context, input WaitForDeploymentsAvailableInput) string {
   136  	namespace, name := input.Deployment.GetNamespace(), input.Deployment.GetName()
   137  	b := strings.Builder{}
   138  	b.WriteString(fmt.Sprintf("Deployment %s/%s failed",
   139  		namespace, name))
   140  	b.WriteString(fmt.Sprintf("\nDeployment:\n%s\n", prettyPrint(input.Deployment)))
   141  	b.WriteString(describeEvents(ctx, input.Clientset, namespace, name))
   142  	return b.String()
   143  }
   144  
   145  // jobsClientAdapter adapts a Job to work with WaitForJobAvailable.
   146  type jobsClientAdapter struct {
   147  	client typedbatchv1.JobInterface
   148  }
   149  
   150  // Get fetches the job named by the key and updates the provided object.
   151  func (c jobsClientAdapter) Get(ctx context.Context, key client.ObjectKey, obj client.Object, opts ...client.GetOption) error {
   152  	job, err := c.client.Get(ctx, key.Name, metav1.GetOptions{})
   153  	if jobObj, ok := obj.(*batchv1.Job); ok {
   154  		job.DeepCopyInto(jobObj)
   155  	}
   156  	return err
   157  }
   158  
// WaitForJobCompleteInput is the input for WaitForJobComplete.
//
// Getter is called on every poll to refresh Job. Job supplies the
// namespace/name to wait for and receives the refreshed object. Clientset is
// used when a failure description is built: it is passed to describeEvents and
// used by getJobPodLogs to list the job's pods.
type WaitForJobCompleteInput struct {
	Getter    framework.Getter
	Job       *batchv1.Job
	Clientset *kubernetes.Clientset
}
   165  
   166  // WaitForJobComplete waits until the Job completes with at least one success.
   167  func WaitForJobComplete(ctx context.Context, input WaitForJobCompleteInput, intervals ...interface{}) {
   168  	start := time.Now()
   169  	namespace, name := input.Job.GetNamespace(), input.Job.GetName()
   170  	Byf("waiting for job %s/%s to be complete", namespace, name)
   171  	Logf("waiting for job %s/%s to be complete", namespace, name)
   172  	Eventually(func() bool {
   173  		key := client.ObjectKey{Namespace: namespace, Name: name}
   174  		if err := input.Getter.Get(ctx, key, input.Job); err == nil {
   175  			for _, c := range input.Job.Status.Conditions {
   176  				if c.Type == batchv1.JobComplete && c.Status == corev1.ConditionTrue {
   177  					return input.Job.Status.Succeeded > 0
   178  				}
   179  			}
   180  		}
   181  		return false
   182  	}, intervals...).Should(BeTrue(), func() string { return DescribeFailedJob(ctx, input) })
   183  	Logf("job %s/%s is complete, took %v", namespace, name, time.Since(start))
   184  }
   185  
   186  // DescribeFailedJob returns a string with information to help debug a failed job.
   187  func DescribeFailedJob(ctx context.Context, input WaitForJobCompleteInput) string {
   188  	namespace, name := input.Job.GetNamespace(), input.Job.GetName()
   189  	b := strings.Builder{}
   190  	b.WriteString(fmt.Sprintf("Job %s/%s failed",
   191  		namespace, name))
   192  	b.WriteString(fmt.Sprintf("\nJob:\n%s\n", prettyPrint(input.Job)))
   193  	b.WriteString(describeEvents(ctx, input.Clientset, namespace, name))
   194  	b.WriteString(getJobPodLogs(ctx, input))
   195  	return b.String()
   196  }
   197  
   198  func getJobPodLogs(ctx context.Context, input WaitForJobCompleteInput) string {
   199  	podsClient := input.Clientset.CoreV1().Pods(input.Job.GetNamespace())
   200  	pods, err := podsClient.List(ctx, metav1.ListOptions{LabelSelector: fmt.Sprintf("job-name=%s", input.Job.GetName())})
   201  	if err != nil {
   202  		return err.Error()
   203  	}
   204  	logs := make(map[string]string, len(pods.Items))
   205  	for _, pod := range pods.Items {
   206  		logs[pod.Name] = getPodLogs(ctx, input.Clientset, pod)
   207  	}
   208  	b := strings.Builder{}
   209  	var lastLog string
   210  	for podName, log := range logs {
   211  		b.WriteString(fmt.Sprintf("\nLogs for pod %s:\n", podName))
   212  		if logsAreSimilar(lastLog, log) {
   213  			b.WriteString("(Omitted because of similarity to previous pod's logs.)")
   214  		} else {
   215  			b.WriteString(log)
   216  		}
   217  		lastLog = log
   218  	}
   219  	return b.String()
   220  }
   221  
   222  // logsAreSimilar compares two multi-line strings and returns true if at least 90% of the lines match.
   223  func logsAreSimilar(a, b string) bool {
   224  	if a == "" {
   225  		return false
   226  	}
   227  	a1 := strings.Split(a, "\n")
   228  	b1 := strings.Split(b, "\n")
   229  	for i := len(a1) - 1; i >= 0; i-- {
   230  		for _, v := range b1 {
   231  			if a1[i] == v {
   232  				a1 = append(a1[:i], a1[i+1:]...)
   233  				break
   234  			}
   235  		}
   236  	}
   237  	return float32(len(a1))/float32(len(b1)) < 0.1
   238  }
   239  
   240  // servicesClientAdapter adapts a Service to work with WaitForServicesAvailable.
   241  type servicesClientAdapter struct {
   242  	client typedcorev1.ServiceInterface
   243  }
   244  
   245  // Get fetches the service named by the key and updates the provided object.
   246  func (c servicesClientAdapter) Get(ctx context.Context, key client.ObjectKey, obj client.Object, opts ...client.GetOption) error {
   247  	service, err := c.client.Get(ctx, key.Name, metav1.GetOptions{})
   248  	if serviceObj, ok := obj.(*corev1.Service); ok {
   249  		service.DeepCopyInto(serviceObj)
   250  	}
   251  	return err
   252  }
   253  
// WaitForDaemonsetInput is the input for WaitForDaemonset.
//
// Getter is called on every poll to refresh DaemonSet. DaemonSet supplies the
// namespace/name to wait for and receives the refreshed object. Clientset is
// handed to describeEvents when a failure description is built; describeEvents
// tolerates a nil Clientset.
type WaitForDaemonsetInput struct {
	Getter    framework.Getter
	DaemonSet *appsv1.DaemonSet
	Clientset *kubernetes.Clientset
}
   260  
   261  // WaitForDaemonset retries during E2E until a daemonset's pods are all Running.
   262  func WaitForDaemonset(ctx context.Context, input WaitForDaemonsetInput, intervals ...interface{}) {
   263  	start := time.Now()
   264  	namespace, name := input.DaemonSet.GetNamespace(), input.DaemonSet.GetName()
   265  	Eventually(func() bool {
   266  		key := client.ObjectKey{Namespace: namespace, Name: name}
   267  		if err := input.Getter.Get(ctx, key, input.DaemonSet); err == nil {
   268  			if input.DaemonSet.Status.DesiredNumberScheduled > 0 {
   269  				Byf("waiting for %d daemonset %s/%s pods to be Running", input.DaemonSet.Status.DesiredNumberScheduled, namespace, name)
   270  				if input.DaemonSet.Status.DesiredNumberScheduled == input.DaemonSet.Status.NumberReady {
   271  					Logf("%d daemonset %s/%s pods are running, took %v", input.DaemonSet.Status.NumberReady, namespace, name, time.Since(start))
   272  					return true
   273  				}
   274  			} else {
   275  				Byf("daemonset %s/%s has no schedulable nodes, will skip", namespace, name)
   276  				return true
   277  			}
   278  		}
   279  		return false
   280  	}, intervals...).Should(BeTrue(), func() string { return DescribeFailedDaemonset(ctx, input) })
   281  }
   282  
   283  // WaitForDaemonsets retries during E2E until all daemonsets pods are all Running.
   284  func WaitForDaemonsets(ctx context.Context, clusterProxy framework.ClusterProxy, specName string, intervals ...interface{}) {
   285  	Expect(clusterProxy).NotTo(BeNil())
   286  	cl := clusterProxy.GetClient()
   287  	var dsList = &appsv1.DaemonSetList{}
   288  	Eventually(func() error {
   289  		return cl.List(ctx, dsList)
   290  	}, intervals...).Should(Succeed())
   291  	for i := range dsList.Items {
   292  		waitForDaemonsetInput := WaitForDaemonsetInput{
   293  			DaemonSet: &dsList.Items[i],
   294  			Clientset: clusterProxy.GetClientSet(),
   295  			Getter:    cl,
   296  		}
   297  		WaitForDaemonset(ctx, waitForDaemonsetInput, intervals...)
   298  	}
   299  }
   300  
   301  // DescribeFailedDaemonset returns detailed output to help debug a daemonset failure in e2e.
   302  func DescribeFailedDaemonset(ctx context.Context, input WaitForDaemonsetInput) string {
   303  	namespace, name := input.DaemonSet.GetNamespace(), input.DaemonSet.GetName()
   304  	b := strings.Builder{}
   305  	b.WriteString(fmt.Sprintf("Service %s/%s failed",
   306  		namespace, name))
   307  	b.WriteString(fmt.Sprintf("\nService:\n%s\n", prettyPrint(input.DaemonSet)))
   308  	b.WriteString(describeEvents(ctx, input.Clientset, namespace, name))
   309  	return b.String()
   310  }
   311  
// WaitForServiceAvailableInput is the input for WaitForServiceAvailable.
//
// Getter is called on every poll to refresh Service. Service supplies the
// namespace/name to wait for and receives the refreshed object. Clientset is
// handed to describeEvents when a failure description is built; describeEvents
// tolerates a nil Clientset.
type WaitForServiceAvailableInput struct {
	Getter    framework.Getter
	Service   *corev1.Service
	Clientset *kubernetes.Clientset
}
   318  
   319  // WaitForServiceAvailable waits until the Service has an IP address available on each Ingress.
   320  func WaitForServiceAvailable(ctx context.Context, input WaitForServiceAvailableInput, intervals ...interface{}) {
   321  	start := time.Now()
   322  	namespace, name := input.Service.GetNamespace(), input.Service.GetName()
   323  	Byf("waiting for service %s/%s to be available", namespace, name)
   324  	Logf("waiting for service %s/%s to be available", namespace, name)
   325  	Eventually(func() bool {
   326  		key := client.ObjectKey{Namespace: namespace, Name: name}
   327  		if err := input.Getter.Get(ctx, key, input.Service); err == nil {
   328  			ingress := input.Service.Status.LoadBalancer.Ingress
   329  			if len(ingress) > 0 {
   330  				for _, i := range ingress {
   331  					if net.ParseIP(i.IP) == nil {
   332  						return false
   333  					}
   334  				}
   335  				return true
   336  			}
   337  		}
   338  		return false
   339  	}, intervals...).Should(BeTrue(), func() string { return DescribeFailedService(ctx, input) })
   340  	Logf("service %s/%s is available, took %v", namespace, name, time.Since(start))
   341  }
   342  
   343  // DescribeFailedService returns a string with information to help debug a failed service.
   344  func DescribeFailedService(ctx context.Context, input WaitForServiceAvailableInput) string {
   345  	namespace, name := input.Service.GetNamespace(), input.Service.GetName()
   346  	b := strings.Builder{}
   347  	b.WriteString(fmt.Sprintf("Service %s/%s failed",
   348  		namespace, name))
   349  	b.WriteString(fmt.Sprintf("\nService:\n%s\n", prettyPrint(input.Service)))
   350  	b.WriteString(describeEvents(ctx, input.Clientset, namespace, name))
   351  	return b.String()
   352  }
   353  
   354  // describeEvents returns a string summarizing recent events involving the named object(s).
   355  func describeEvents(ctx context.Context, clientset *kubernetes.Clientset, namespace, name string) string {
   356  	b := strings.Builder{}
   357  	if clientset == nil {
   358  		b.WriteString("clientset is nil, so skipping output of relevant events")
   359  	} else {
   360  		opts := metav1.ListOptions{
   361  			FieldSelector: fmt.Sprintf("involvedObject.name=%s", name),
   362  			Limit:         20,
   363  		}
   364  		evts, err := clientset.CoreV1().Events(namespace).List(ctx, opts)
   365  		if err != nil {
   366  			b.WriteString(err.Error())
   367  		} else {
   368  			w := tabwriter.NewWriter(&b, 0, 4, 2, ' ', tabwriter.FilterHTML)
   369  			fmt.Fprintln(w, "LAST SEEN\tTYPE\tREASON\tOBJECT\tMESSAGE")
   370  			for _, e := range evts.Items {
   371  				fmt.Fprintf(w, "%s\t%s\t%s\t%s/%s\t%s\n", e.LastTimestamp, e.Type, e.Reason,
   372  					strings.ToLower(e.InvolvedObject.Kind), e.InvolvedObject.Name, e.Message)
   373  			}
   374  			w.Flush()
   375  		}
   376  	}
   377  	return b.String()
   378  }
   379  
   380  // prettyPrint returns a formatted JSON version of the object given.
   381  func prettyPrint(v interface{}) string {
   382  	b, err := json.MarshalIndent(v, "", "  ")
   383  	if err != nil {
   384  		return err.Error()
   385  	}
   386  	return string(b)
   387  }
   388  
   389  // getAvailabilityZonesForRegion uses zone information in availableZonesPerLocation.json
   390  // and returns the number of availability zones per region that would support the VM type used for e2e tests.
   391  // will return an error if the region isn't recognized
   392  // availableZonesPerLocation.json was generated by
   393  // az vm list-skus -r "virtualMachines"  -z | jq 'map({(.locationInfo[0].location + "_" + .name): .locationInfo[0].zones}) | add' > availableZonesPerLocation.json
   394  func getAvailabilityZonesForRegion(location string, size string) ([]string, error) {
   395  	wd, err := os.Getwd()
   396  	if err != nil {
   397  		return nil, err
   398  	}
   399  	file, err := os.ReadFile(filepath.Join(wd, "data", "availableZonesPerLocation.json"))
   400  	if err != nil {
   401  		return nil, err
   402  	}
   403  	var data map[string][]string
   404  
   405  	if err := json.Unmarshal(file, &data); err != nil {
   406  		return nil, err
   407  	}
   408  	key := fmt.Sprintf("%s_%s", location, size)
   409  
   410  	return data[key], nil
   411  }
   412  
   413  // logCheckpoint prints a message indicating the start or end of the current test spec,
   414  // including which Ginkgo node it's running on.
   415  //
   416  // Example output:
   417  //
   418  //	INFO: "With 1 worker node" started at Tue, 22 Sep 2020 13:19:08 PDT on Ginkgo node 2 of 3
   419  //	INFO: "With 1 worker node" ran for 18m34s on Ginkgo node 2 of 3
   420  func logCheckpoint(specTimes map[string]time.Time) {
   421  	text := CurrentSpecReport().LeafNodeText
   422  	start, started := specTimes[text]
   423  	suiteConfig, reporterConfig := GinkgoConfiguration()
   424  	if !started {
   425  		start = time.Now()
   426  		specTimes[text] = start
   427  		fmt.Fprintf(GinkgoWriter, "INFO: \"%s\" started at %s on Ginkgo node %d of %d and junit test report to file %s\n", text,
   428  			start.Format(time.RFC1123), GinkgoParallelProcess(), suiteConfig.ParallelTotal, reporterConfig.JUnitReport)
   429  	} else {
   430  		elapsed := time.Since(start)
   431  		fmt.Fprintf(GinkgoWriter, "INFO: \"%s\" ran for %s on Ginkgo node %d of %d and reported junit test to file %s\n", text,
   432  			elapsed.Round(time.Second), GinkgoParallelProcess(), suiteConfig.ParallelTotal, reporterConfig.JUnitReport)
   433  	}
   434  }
   435  
   436  // getClusterName gets the cluster name for the test cluster
   437  // and sets the environment variables that depend on it.
   438  func getClusterName(prefix, specName string) string {
   439  	clusterName := os.Getenv("CLUSTER_NAME")
   440  	if clusterName == "" {
   441  		clusterName = fmt.Sprintf("%s-%s", prefix, specName)
   442  	}
   443  	fmt.Fprintf(GinkgoWriter, "INFO: Cluster name is %s\n", clusterName)
   444  
   445  	Expect(os.Setenv(AzureResourceGroup, clusterName)).To(Succeed())
   446  	Expect(os.Setenv(AzureVNetName, fmt.Sprintf("%s-vnet", clusterName))).To(Succeed())
   447  	return clusterName
   448  }
   449  
// isAzureMachineWindows reports whether the AzureMachine's OS disk is declared
// with the Windows OS type.
func isAzureMachineWindows(am *infrav1.AzureMachine) bool {
	return am.Spec.OSDisk.OSType == azure.WindowsOS
}
   453  
// isAzureMachinePoolWindows reports whether the AzureMachinePool's template OS
// disk is declared with the Windows OS type.
func isAzureMachinePoolWindows(amp *infrav1exp.AzureMachinePool) bool {
	return amp.Spec.Template.OSDisk.OSType == azure.WindowsOS
}
   457  
   458  // getProxiedSSHClient creates a SSH client object that connects to a target node
   459  // proxied through a control plane node.
   460  func getProxiedSSHClient(controlPlaneEndpoint, hostname, port string, ioTimeout time.Duration) (*ssh.Client, error) {
   461  	config, err := newSSHConfig()
   462  	if err != nil {
   463  		return nil, err
   464  	}
   465  
   466  	// Init a client connection to a control plane node via the public load balancer
   467  	c, err := net.DialTimeout("tcp", fmt.Sprintf("%s:%s", controlPlaneEndpoint, port), config.Timeout)
   468  	if err != nil {
   469  		return nil, errors.Wrapf(err, "dialing public load balancer at %s", controlPlaneEndpoint)
   470  	}
   471  	err = c.SetDeadline(time.Now().Add(ioTimeout))
   472  	if err != nil {
   473  		return nil, errors.Wrapf(err, "setting timeout for connection to public load balancer at %s", controlPlaneEndpoint)
   474  	}
   475  	conn, chans, reqs, err := ssh.NewClientConn(c, fmt.Sprintf("%s:%s", controlPlaneEndpoint, port), config)
   476  	if err != nil {
   477  		return nil, errors.Wrapf(err, "connecting to public load balancer at %s", controlPlaneEndpoint)
   478  	}
   479  	lbClient := ssh.NewClient(conn, chans, reqs)
   480  
   481  	// Init a connection from the control plane to the target node
   482  	c, err = lbClient.Dial("tcp", fmt.Sprintf("%s:%s", hostname, port))
   483  	if err != nil {
   484  		return nil, errors.Wrapf(err, "dialing from control plane to target node at %s", hostname)
   485  	}
   486  
   487  	// Establish an authenticated SSH conn over the client -> control plane -> target transport
   488  	conn, chans, reqs, err = ssh.NewClientConn(c, hostname, config)
   489  	if err != nil {
   490  		return nil, errors.Wrap(err, "getting a new SSH client connection")
   491  	}
   492  	client := ssh.NewClient(conn, chans, reqs)
   493  	return client, nil
   494  }
   495  
   496  // execOnHost runs the specified command directly on a node's host, using a SSH connection
   497  // proxied through a control plane host and copies the output to a file.
   498  func execOnHost(controlPlaneEndpoint, hostname, port string, ioTimeout time.Duration, f io.StringWriter, command string,
   499  	args ...string) error {
   500  	client, err := getProxiedSSHClient(controlPlaneEndpoint, hostname, port, ioTimeout)
   501  	if err != nil {
   502  		return err
   503  	}
   504  
   505  	session, err := client.NewSession()
   506  	if err != nil {
   507  		return errors.Wrap(err, "opening SSH session")
   508  	}
   509  	defer session.Close()
   510  
   511  	// Run the command and write the captured stdout to the file
   512  	var stdoutBuf bytes.Buffer
   513  	session.Stdout = &stdoutBuf
   514  	if len(args) > 0 {
   515  		command += " " + strings.Join(args, " ")
   516  	}
   517  	if err = session.Run(command); err != nil {
   518  		return errors.Wrapf(err, "running command \"%s\"", command)
   519  	}
   520  	if _, err = f.WriteString(stdoutBuf.String()); err != nil {
   521  		return errors.Wrap(err, "writing output to file")
   522  	}
   523  
   524  	return nil
   525  }
   526  
   527  // sftpCopyFile copies a file from a node to the specified destination, using a SSH connection
   528  // proxied through a control plane node.
   529  func sftpCopyFile(controlPlaneEndpoint, hostname, port string, ioTimeout time.Duration, sourcePath, destPath string) error {
   530  	Logf("Attempting to copy file %s on node %s to %s", sourcePath, hostname, destPath)
   531  
   532  	client, err := getProxiedSSHClient(controlPlaneEndpoint, hostname, port, ioTimeout)
   533  	if err != nil {
   534  		return err
   535  	}
   536  
   537  	sftp, err := sftp.NewClient(client)
   538  	if err != nil {
   539  		return errors.Wrapf(err, "getting a new sftp client connection")
   540  	}
   541  	defer sftp.Close()
   542  
   543  	// copy file
   544  	sourceFile, err := sftp.Open(sourcePath)
   545  	if err != nil {
   546  		return errors.Wrapf(err, "opening file %s on node %s", sourcePath, hostname)
   547  	}
   548  	defer sourceFile.Close()
   549  
   550  	destFile, err := os.Create(destPath)
   551  	if err != nil {
   552  		return errors.Wrapf(err, "creating file %s on locally", sourcePath)
   553  	}
   554  	defer destFile.Close()
   555  
   556  	_, err = sourceFile.WriteTo(destFile)
   557  	if err != nil {
   558  		return errors.Wrapf(err, "writing to %s", destPath)
   559  	}
   560  
   561  	return nil
   562  }
   563  
   564  // fileOnHost creates the specified path, including parent directories if needed.
   565  func fileOnHost(path string) (*os.File, error) {
   566  	if err := os.MkdirAll(filepath.Dir(path), os.ModePerm); err != nil {
   567  		return nil, err
   568  	}
   569  	return os.Create(path)
   570  }
   571  
   572  // newSSHConfig returns an SSH config for a workload cluster in the current e2e test run.
   573  func newSSHConfig() (*ssh.ClientConfig, error) {
   574  	// find private key file used for e2e workload cluster
   575  	keyfile := os.Getenv("AZURE_SSH_PUBLIC_KEY_FILE")
   576  	if len(keyfile) > 4 && strings.HasSuffix(keyfile, "pub") {
   577  		keyfile = keyfile[:(len(keyfile) - 4)]
   578  	}
   579  	if keyfile == "" {
   580  		keyfile = ".sshkey"
   581  	}
   582  	if _, err := os.Stat(keyfile); os.IsNotExist(err) {
   583  		if !filepath.IsAbs(keyfile) {
   584  			// current working directory may be test/e2e, so look in the project root
   585  			keyfile = filepath.Join("..", "..", keyfile)
   586  		}
   587  	}
   588  
   589  	pubkey, err := publicKeyFile(keyfile)
   590  	if err != nil {
   591  		return nil, err
   592  	}
   593  	sshConfig := ssh.ClientConfig{
   594  		HostKeyCallback: ssh.InsecureIgnoreHostKey(), //nolint:gosec // Non-production code
   595  		User:            azure.DefaultUserName,
   596  		Auth:            []ssh.AuthMethod{pubkey},
   597  		Timeout:         sshConnectionTimeout,
   598  	}
   599  	return &sshConfig, nil
   600  }
   601  
   602  // publicKeyFile parses and returns the public key from the specified private key file.
   603  func publicKeyFile(file string) (ssh.AuthMethod, error) {
   604  	buffer, err := os.ReadFile(file)
   605  	if err != nil {
   606  		return nil, err
   607  	}
   608  	signer, err := ssh.ParsePrivateKey(buffer)
   609  	if err != nil {
   610  		return nil, err
   611  	}
   612  	return ssh.PublicKeys(signer), nil
   613  }
   614  
   615  // validateStableReleaseString validates the string format that declares "get be the latest stable release for this <Major>.<Minor>"
   616  // it should be called wherever we process a stable version string expression like "stable-1.22"
   617  func validateStableReleaseString(stableVersion string) (isStable bool, matches []string) {
   618  	stableReleaseFormat := regexp.MustCompile(`^stable-(0|[1-9]\d*)\.(0|[1-9]\d*)$`)
   619  	matches = stableReleaseFormat.FindStringSubmatch(stableVersion)
   620  	return len(matches) > 0, matches
   621  }
   622  
   623  // resolveCIVersion resolves kubernetes version labels (e.g. latest, latest-1.xx) to the corresponding CI version numbers.
   624  // Go implementation of https://github.com/kubernetes-sigs/cluster-api/blob/d1dc87d5df3ab12a15ae5b63e50541a191b7fec4/scripts/ci-e2e-lib.sh#L75-L95.
   625  func resolveCIVersion(label string) (string, error) {
   626  	if ciVersion, ok := os.LookupEnv("CI_VERSION"); ok {
   627  		return ciVersion, nil
   628  	}
   629  	if strings.HasPrefix(label, "latest") {
   630  		if kubernetesVersion, err := latestCIVersion(label); err == nil {
   631  			return kubernetesVersion, nil
   632  		}
   633  	}
   634  
   635  	// default to https://dl.k8s.io/ci/latest.txt if the label can't be resolved
   636  	return kubernetesversions.LatestCIRelease()
   637  }
   638  
   639  // latestCIVersion returns the latest CI version of a given label in the form of latest-1.xx.
   640  func latestCIVersion(label string) (string, error) {
   641  	ciVersionURL := fmt.Sprintf("https://dl.k8s.io/ci/%s.txt", label)
   642  	req, err := http.NewRequestWithContext(context.TODO(), http.MethodGet, ciVersionURL, http.NoBody)
   643  	if err != nil {
   644  		return "", err
   645  	}
   646  	resp, err := http.DefaultClient.Do(req)
   647  	if err != nil {
   648  		return "", err
   649  	}
   650  	defer resp.Body.Close()
   651  	b, err := io.ReadAll(resp.Body)
   652  	if err != nil {
   653  		return "", err
   654  	}
   655  
   656  	return strings.TrimSpace(string(b)), nil
   657  }
   658  
   659  // resolveKubetestRepoListPath will set the correct repo list for Windows:
   660  // - if WIN_REPO_URL is set use the custom file downloaded via makefile
   661  // - if CI version is "latest" do not set repo list since they are not needed K8s v1.24+
   662  // - if CI version is  "latest-1.xx" will compare values and use correct repoList
   663  // - if standard version will compare values and use correct repoList
   664  // - if unable to determine version falls back to using latest
   665  func resolveKubetestRepoListPath(version string, path string) (string, error) {
   666  	if _, ok := os.LookupEnv("WIN_REPO_URL"); ok {
   667  		return filepath.Join(path, "custom-repo-list.yaml"), nil
   668  	}
   669  
   670  	if version == "latest" {
   671  		return "", nil
   672  	}
   673  
   674  	version = strings.TrimPrefix(version, "latest-")
   675  	currentVersion, err := semver.ParseTolerant(version)
   676  	if err != nil {
   677  		return "", err
   678  	}
   679  
   680  	v124, err := semver.Make("1.24.0-alpha.0.0")
   681  	if err != nil {
   682  		return "", err
   683  	}
   684  
   685  	if currentVersion.GT(v124) {
   686  		return "", nil
   687  	}
   688  
   689  	// - prior to K8s v1.21 repo-list-k8sprow.yaml should be used
   690  	//   since all test images need to come from k8sprow.azurecr.io
   691  	// - starting with K8s v1.24 repo lists repo list is not needed
   692  	// - use repo-list.yaml for everything in between which has only
   693  	//   some images in k8sprow.azurecr.io
   694  
   695  	return filepath.Join(path, "repo-list.yaml"), nil
   696  }
   697  
   698  // resolveKubernetesVersions looks at Kubernetes versions set as variables in the e2e config and sets them to a valid k8s version
   699  // that has an existing capi offer image available. For example, if the version is "stable-1.22", the function will set it to the latest 1.22 version that has a published reference image.
   700  func resolveKubernetesVersions(config *clusterctl.E2EConfig) {
   701  	ubuntuVersions := getVersionsInOffer(context.TODO(), os.Getenv(AzureLocation), capiImagePublisher, capiOfferName)
   702  	windowsVersions := getVersionsInOffer(context.TODO(), os.Getenv(AzureLocation), capiImagePublisher, capiWindowsOfferName)
   703  	flatcarK8sVersions := getFlatcarK8sVersions(context.TODO(), os.Getenv(AzureLocation), flatcarCAPICommunityGallery)
   704  
   705  	// find the intersection of ubuntu and windows versions available, since we need an image for both.
   706  	var versions semver.Versions
   707  	for k, v := range ubuntuVersions {
   708  		if _, ok := windowsVersions[k]; ok {
   709  			versions = append(versions, v)
   710  		}
   711  	}
   712  
   713  	if config.HasVariable(capi_e2e.KubernetesVersion) {
   714  		resolveKubernetesVersion(config, versions, capi_e2e.KubernetesVersion)
   715  	}
   716  	if config.HasVariable(capi_e2e.KubernetesVersionUpgradeFrom) {
   717  		resolveKubernetesVersion(config, versions, capi_e2e.KubernetesVersionUpgradeFrom)
   718  	}
   719  	if config.HasVariable(capi_e2e.KubernetesVersionUpgradeTo) {
   720  		resolveKubernetesVersion(config, versions, capi_e2e.KubernetesVersionUpgradeTo)
   721  	}
   722  	if config.HasVariable(FlatcarKubernetesVersion) && config.HasVariable(FlatcarVersion) {
   723  		resolveFlatcarKubernetesVersion(config, flatcarK8sVersions, FlatcarKubernetesVersion)
   724  		flatcarVersions := getFlatcarVersions(context.TODO(), os.Getenv(AzureLocation), flatcarCAPICommunityGallery, config.GetVariable(FlatcarKubernetesVersion))
   725  		resolveFlatcarVersion(config, flatcarVersions, FlatcarVersion)
   726  	}
   727  }
   728  
   729  func resolveKubernetesVersion(config *clusterctl.E2EConfig, versions semver.Versions, varName string) {
   730  	resolveVariable(config, varName, getLatestVersionForMinor(config.GetVariable(varName), versions, "capi offer"))
   731  }
   732  
   733  func resolveVariable(config *clusterctl.E2EConfig, varName, v string) {
   734  	oldVersion := config.GetVariable(varName)
   735  	if _, ok := os.LookupEnv(varName); ok {
   736  		Expect(os.Setenv(varName, v)).To(Succeed())
   737  	}
   738  	config.Variables[varName] = v
   739  	Logf("Resolved %s (set to %s) to %s", varName, oldVersion, v)
   740  }
   741  
   742  func resolveFlatcarKubernetesVersion(config *clusterctl.E2EConfig, versions semver.Versions, varName string) {
   743  	resolveVariable(config, varName, getLatestVersionForMinor(config.GetVariable(varName), versions, "Flatcar Community Gallery"))
   744  }
   745  
   746  func resolveFlatcarVersion(config *clusterctl.E2EConfig, versions semver.Versions, varName string) {
   747  	version := config.GetVariable(varName)
   748  	if version != "latest" {
   749  		Expect(versions).To(ContainElement(semver.MustParse(version)), fmt.Sprintf("Provided Flatcar version %q does not have a corresponding VM image in the Flatcar Community Gallery", version))
   750  	}
   751  
   752  	if version == "latest" {
   753  		semver.Sort(versions)
   754  		version = versions[len(versions)-1].String()
   755  	}
   756  
   757  	resolveVariable(config, varName, version)
   758  }
   759  
   760  // newImagesClient returns a new VM images client using environmental settings for auth.
   761  func newImagesClient() *armcompute.VirtualMachineImagesClient {
   762  	cred, err := azidentity.NewDefaultAzureCredential(nil)
   763  	Expect(err).NotTo(HaveOccurred())
   764  	imagesClient, err := armcompute.NewVirtualMachineImagesClient(getSubscriptionID(Default), cred, nil)
   765  	Expect(err).NotTo(HaveOccurred())
   766  
   767  	return imagesClient
   768  }
   769  
   770  func newCommunityGalleryImagesClient() *armcompute.CommunityGalleryImagesClient {
   771  	cred, err := azidentity.NewDefaultAzureCredential(nil)
   772  	Expect(err).NotTo(HaveOccurred())
   773  	communityGalleryImagesClient, err := armcompute.NewCommunityGalleryImagesClient(getSubscriptionID(Default), cred, nil)
   774  	Expect(err).NotTo(HaveOccurred())
   775  
   776  	return communityGalleryImagesClient
   777  }
   778  
   779  func newCommunityGalleryImageVersionsClient() *armcompute.CommunityGalleryImageVersionsClient {
   780  	cred, err := azidentity.NewDefaultAzureCredential(nil)
   781  	Expect(err).NotTo(HaveOccurred())
   782  	communityGalleryImageVersionsClient, err := armcompute.NewCommunityGalleryImageVersionsClient(getSubscriptionID(Default), cred, nil)
   783  	Expect(err).NotTo(HaveOccurred())
   784  
   785  	return communityGalleryImageVersionsClient
   786  }
   787  
   788  // getVersionsInOffer returns a map of Kubernetes versions as strings to semver.Versions.
   789  func getVersionsInOffer(ctx context.Context, location, publisher, offer string) map[string]semver.Version {
   790  	Logf("Finding image skus and versions for offer %s/%s in %s", publisher, offer, location)
   791  	var versions map[string]semver.Version
   792  	capiSku := regexp.MustCompile(`^[\w-]+-gen[12]$`)
   793  	capiVersion := regexp.MustCompile(`^(\d)(\d{1,2})\.(\d{1,2})\.\d{8}$`)
   794  	oldCapiSku := regexp.MustCompile(`^k8s-(0|[1-9][0-9]*)dot(0|[1-9][0-9]*)dot(0|[1-9][0-9]*)-[a-z]*.*$`)
   795  	imagesClient := newImagesClient()
   796  	resp, err := imagesClient.ListSKUs(ctx, location, publisher, offer, nil)
   797  	Expect(err).NotTo(HaveOccurred())
   798  
   799  	skus := resp.VirtualMachineImageResourceArray
   800  
   801  	versions = make(map[string]semver.Version, len(skus))
   802  	for _, sku := range skus {
   803  		res, err := imagesClient.List(ctx, location, publisher, offer, *sku.Name, nil)
   804  		Expect(err).NotTo(HaveOccurred())
   805  		// Don't use SKUs without existing images. See https://github.com/Azure/azure-cli/issues/20115.
   806  		if len(res.VirtualMachineImageResourceArray) > 0 {
   807  			// New SKUs don't contain the Kubernetes version and are named like "ubuntu-2004-gen1".
   808  			if match := capiSku.FindStringSubmatch(*sku.Name); len(match) > 0 {
   809  				for _, vmImage := range res.VirtualMachineImageResourceArray {
   810  					// Versions are named like "121.13.20220601", for Kubernetes v1.21.13 published on June 1, 2022.
   811  					match = capiVersion.FindStringSubmatch(*vmImage.Name)
   812  					stringVer := fmt.Sprintf("%s.%s.%s", match[1], match[2], match[3])
   813  					versions[stringVer] = semver.MustParse(stringVer)
   814  				}
   815  				continue
   816  			}
   817  			// Old SKUs before 1.21.12, 1.22.9, or 1.23.6 are named like "k8s-1dot21dot2-ubuntu-2004".
   818  			if match := oldCapiSku.FindStringSubmatch(*sku.Name); len(match) > 0 {
   819  				stringVer := fmt.Sprintf("%s.%s.%s", match[1], match[2], match[3])
   820  				versions[stringVer] = semver.MustParse(stringVer)
   821  			}
   822  		}
   823  	}
   824  
   825  	return versions
   826  }
   827  
   828  // getLatestVersionForMinor gets the latest available patch version in the provided list of sku versions that corresponds to the provided k8s version.
   829  func getLatestVersionForMinor(version string, versions semver.Versions, imagesSource string) string {
   830  	isStable, match := validateStableReleaseString(version)
   831  	if isStable {
   832  		// if the version is in the format "stable-1.21", we find the latest 1.21.x version.
   833  		major, err := strconv.ParseUint(match[1], 10, 64)
   834  		Expect(err).NotTo(HaveOccurred())
   835  		minor, err := strconv.ParseUint(match[2], 10, 64)
   836  		Expect(err).NotTo(HaveOccurred())
   837  		semver.Sort(versions)
   838  		for i := len(versions) - 1; i >= 0; i-- {
   839  			if versions[i].Major == major && versions[i].Minor == minor {
   840  				version = "v" + versions[i].String()
   841  				break
   842  			}
   843  		}
   844  	} else if v, err := semver.ParseTolerant(version); err == nil {
   845  		if len(v.Pre) == 0 {
   846  			// if the version is in the format "v1.21.2", we make sure we have an existing image for it.
   847  			Expect(versions).To(ContainElement(v), fmt.Sprintf("Provided Kubernetes version %s does not have a corresponding VM image in the %q", version, imagesSource))
   848  		}
   849  	}
   850  	// otherwise, we just return the version as-is. This allows for versions in other formats, such as "latest" or "latest-1.21".
   851  	return version
   852  }
   853  
   854  func getFlatcarVersions(ctx context.Context, location, galleryName, k8sVersion string) semver.Versions {
   855  	image := fmt.Sprintf("flatcar-stable-amd64-capi-%s", k8sVersion)
   856  
   857  	Logf("Finding Flatcar versions in community gallery %q in location %q for image %q", galleryName, location, image)
   858  	var versions semver.Versions
   859  	communityGalleryImageVersionsClient := newCommunityGalleryImageVersionsClient()
   860  	var imageVersions []*armcompute.CommunityGalleryImageVersion
   861  	pager := communityGalleryImageVersionsClient.NewListPager(location, galleryName, image, nil)
   862  	for pager.More() {
   863  		nextResult, err := pager.NextPage(ctx)
   864  		Expect(err).NotTo(HaveOccurred())
   865  		imageVersions = append(imageVersions, nextResult.Value...)
   866  	}
   867  
   868  	for _, imageVersion := range imageVersions {
   869  		versions = append(versions, semver.MustParse(*imageVersion.Name))
   870  	}
   871  
   872  	return versions
   873  }
   874  
   875  func getFlatcarK8sVersions(ctx context.Context, location, communityGalleryName string) semver.Versions {
   876  	Logf("Finding Flatcar images and versions in community gallery %q in location %q", communityGalleryName, location)
   877  	var versions semver.Versions
   878  	k8sVersion := regexp.MustCompile(`flatcar-stable-amd64-capi-v(\d+)\.(\d+).(\d+)`)
   879  	communityGalleryImagesClient := newCommunityGalleryImagesClient()
   880  	communityGalleryImageVersionsClient := newCommunityGalleryImageVersionsClient()
   881  	var images []*armcompute.CommunityGalleryImage
   882  	pager := communityGalleryImagesClient.NewListPager(location, communityGalleryName, nil)
   883  	for pager.More() {
   884  		nextResult, err := pager.NextPage(ctx)
   885  		Expect(err).NotTo(HaveOccurred())
   886  		images = append(images, nextResult.Value...)
   887  	}
   888  
   889  	for _, image := range images {
   890  		var imageVersions []*armcompute.CommunityGalleryImageVersion
   891  		pager := communityGalleryImageVersionsClient.NewListPager(location, communityGalleryName, *image.Name, nil)
   892  		for pager.More() {
   893  			nextResult, err := pager.NextPage(ctx)
   894  			Expect(err).NotTo(HaveOccurred())
   895  			imageVersions = append(imageVersions, nextResult.Value...)
   896  		}
   897  
   898  		if len(imageVersions) == 0 {
   899  			continue
   900  		}
   901  
   902  		match := k8sVersion.FindStringSubmatch(*image.Name)
   903  		stringVer := fmt.Sprintf("%s.%s.%s", match[1], match[2], match[3])
   904  		versions = append(versions, semver.MustParse(stringVer))
   905  	}
   906  
   907  	return versions
   908  }
   909  
   910  // getPodLogs returns the logs of a pod, or an error in string format.
   911  func getPodLogs(ctx context.Context, clientset *kubernetes.Clientset, pod corev1.Pod) string {
   912  	req := clientset.CoreV1().Pods(pod.Namespace).GetLogs(pod.Name, &corev1.PodLogOptions{})
   913  	logs, err := req.Stream(ctx)
   914  	if err != nil {
   915  		return fmt.Sprintf("error streaming logs for pod %s: %v", pod.Name, err)
   916  	}
   917  	defer logs.Close()
   918  
   919  	b := new(bytes.Buffer)
   920  	if _, err = io.Copy(b, logs); err != nil {
   921  		return fmt.Sprintf("error copying logs for pod %s: %v", pod.Name, err)
   922  	}
   923  	return b.String()
   924  }
   925  
   926  func CopyConfigMap(ctx context.Context, input clusterctl.ApplyCustomClusterTemplateAndWaitInput, cl client.Client, cmName, fromNamespace, toNamespace string) {
   927  	cm := &corev1.ConfigMap{}
   928  	Eventually(func(g Gomega) {
   929  		g.Expect(cl.Get(ctx, client.ObjectKey{Name: cmName, Namespace: fromNamespace}, cm)).To(Succeed())
   930  		cm.SetNamespace(toNamespace)
   931  		cm.SetResourceVersion("")
   932  		framework.EnsureNamespace(ctx, cl, toNamespace)
   933  		err := cl.Create(ctx, cm.DeepCopy())
   934  		if !apierrors.IsAlreadyExists(err) {
   935  			g.Expect(err).To(Succeed())
   936  		}
   937  	}, input.WaitForControlPlaneIntervals...).Should(Succeed())
   938  }
   939  
   940  func getSubscriptionID(g Gomega) string {
   941  	subscriptionID := os.Getenv("AZURE_SUBSCRIPTION_ID")
   942  	g.Expect(subscriptionID).NotTo(BeEmpty())
   943  	return subscriptionID
   944  }