github.com/verrazzano/verrazzano@v1.7.1/tests/e2e/ha/inplaceupgrade/in_place_upgrade_test.go

// Copyright (c) 2022, 2023, Oracle and/or its affiliates.
// Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl.

package inplaceupgrade

import (
	"context"
	"errors"
	"fmt"
	"os"
	"os/exec"
	"strconv"
	"time"

	. "github.com/onsi/ginkgo/v2"
	. "github.com/onsi/gomega"
	"github.com/oracle/oci-go-sdk/v53/common"
	"github.com/oracle/oci-go-sdk/v53/common/auth"
	ocice "github.com/oracle/oci-go-sdk/v53/containerengine"
	ocicore "github.com/oracle/oci-go-sdk/v53/core"
	"github.com/verrazzano/verrazzano/pkg/k8sutil"
	hacommon "github.com/verrazzano/verrazzano/tests/e2e/pkg/ha"
	"github.com/verrazzano/verrazzano/tests/e2e/pkg/test/framework"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

const (
	// names of the environment variables that configure the test
	clusterIDEnvVar   = "OKE_CLUSTER_ID"
	ociRegionEnvVar   = "OCI_CLI_REGION"
	skipUpgradeEnvVar = "SKIP_KUBERNETES_UPGRADE"

	// timeouts for waiting on OKE work requests and node replacement
	waitTimeout             = 20 * time.Minute
	waitTimeoutControlPlane = 2 * time.Hour

	pollingInterval = 30 * time.Second

	// waitForDeleteTimeout is passed to kubectl drain via --skip-wait-for-delete-timeout
	waitForDeleteTimeout = 600 * time.Second
)

var clientset = k8sutil.GetKubernetesClientsetOrDie()
var t = framework.NewTestFramework("in_place_upgrade")

var (
	failed                    bool
	region                    string
	clusterID                 string
	skipClusterVersionUpgrade bool

	okeClient     ocice.ContainerEngineClient
	computeClient ocicore.ComputeClient
)

var _ = t.AfterEach(func() {
	failed = failed || CurrentSpecReport().Failed()
})

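// The environment variables below drive this suite; the values shown are hypothetical examples:
//
//	OKE_CLUSTER_ID=ocid1.cluster.oc1..example   (required) OCID of the OKE cluster to upgrade
//	OCI_CLI_REGION=us-ashburn-1                 (optional) use instance principal auth for this region
//	SKIP_KUBERNETES_UPGRADE=true                (optional) skip the control plane and node pool version upgrades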
var beforeSuite = t.BeforeSuiteFunc(func() {
	clusterID = os.Getenv(clusterIDEnvVar)
	region = os.Getenv(ociRegionEnvVar)

	if skipUpgradeVal, set := os.LookupEnv(skipUpgradeEnvVar); set {
		var parseErr error
		skipClusterVersionUpgrade, parseErr = strconv.ParseBool(skipUpgradeVal)
		Expect(parseErr).ShouldNot(HaveOccurred(), fmt.Sprintf("Invalid value for %s: %s", skipUpgradeEnvVar, skipUpgradeVal))
	}

	Expect(clusterID).ToNot(BeEmpty(), fmt.Sprintf("%s env var must be set", clusterIDEnvVar))
	// region is optional, so no Expect check

	provider, err := getOCISDKProvider(region)
	Expect(err).ShouldNot(HaveOccurred(), "Error configuring OCI SDK provider")

	okeClient, err = ocice.NewContainerEngineClientWithConfigurationProvider(provider)
	Expect(err).ShouldNot(HaveOccurred(), "Error configuring OCI SDK container engine client")

	computeClient, err = ocicore.NewComputeClientWithConfigurationProvider(provider)
	Expect(err).ShouldNot(HaveOccurred(), "Error configuring OCI SDK compute client")
})

var _ = BeforeSuite(beforeSuite)

var afterSuite = t.AfterSuiteFunc(func() {
	// signal that the upgrade is done so the tests know to stop
	hacommon.EventuallyCreateShutdownSignal(clientset, t.Logs)
})

var _ = AfterSuite(afterSuite)

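// OKE in-place upgrade flow: upgrade the control plane Kubernetes version, upgrade the node pool
// version, replace each worker node so it picks up the new version, then verify the kubelet and
// kube-proxy versions on every node.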
var _ = t.Describe("OKE In-Place Upgrade", Label("f:platform-lcm:ha"), func() {
	var clusterResponse ocice.GetClusterResponse
	var upgradeVersion string

	t.It("upgrades the control plane Kubernetes version", func() {
		if skipClusterVersionUpgrade {
			t.Logs.Infof("%s=%v, skipping cluster control plane upgrade", skipUpgradeEnvVar, skipClusterVersionUpgrade)
			return
		}

		// first get the cluster details and find the available upgrade versions
		var err error
		clusterResponse, err = okeClient.GetCluster(context.Background(), ocice.GetClusterRequest{ClusterId: &clusterID})
		Expect(err).ShouldNot(HaveOccurred())
		t.Logs.Debugf("Cluster response: %+v", clusterResponse)
		Expect(clusterResponse.AvailableKubernetesUpgrades).ToNot(BeEmpty(), "No available upgrade versions")

		// upgrade the control plane to the first available upgrade version
		upgradeVersion = clusterResponse.AvailableKubernetesUpgrades[0]
		t.Logs.Infof("Upgrading the OKE cluster control plane to version: %s", upgradeVersion)
		details := ocice.UpdateClusterDetails{KubernetesVersion: &upgradeVersion}
		updateResponse, err := okeClient.UpdateCluster(context.Background(), ocice.UpdateClusterRequest{ClusterId: &clusterID, UpdateClusterDetails: details})
		Expect(err).ShouldNot(HaveOccurred())
		Expect(updateResponse.OpcWorkRequestId).ShouldNot(BeNil())

		// wait for the work request to complete; this can take roughly 5-15 minutes
		waitForWorkRequest(*updateResponse.OpcWorkRequestId, waitTimeoutControlPlane)
	})

	t.It("upgrades the node pool Kubernetes version", func() {
		if skipClusterVersionUpgrade {
			t.Logs.Infof("%s=%v, skipping node pool upgrade", skipUpgradeEnvVar, skipClusterVersionUpgrade)
			return
		}
		// first get the node pool - the cluster response struct does not include node pools, so list the
		// node pools in the compartment and filter by the cluster ID
		nodePoolsResponse, err := okeClient.ListNodePools(context.Background(), ocice.ListNodePoolsRequest{CompartmentId: clusterResponse.CompartmentId, ClusterId: clusterResponse.Id})
		Expect(err).ShouldNot(HaveOccurred())
		Expect(nodePoolsResponse.Items).To(HaveLen(1), "Expected exactly one node pool for the cluster")

		// upgrade the node pool to the same Kubernetes version as the control plane
		t.Logs.Infof("Upgrading the OKE cluster node pool to version: %s", upgradeVersion)
		details := ocice.UpdateNodePoolDetails{KubernetesVersion: &upgradeVersion}
		updateResponse, err := okeClient.UpdateNodePool(context.Background(), ocice.UpdateNodePoolRequest{NodePoolId: nodePoolsResponse.Items[0].Id, UpdateNodePoolDetails: details})
		Expect(err).ShouldNot(HaveOccurred())
		Expect(updateResponse.OpcWorkRequestId).ShouldNot(BeNil())

		// wait for the work request to complete
		waitForWorkRequest(*updateResponse.OpcWorkRequestId, waitTimeout)
	})

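	// For each worker node: drain it, terminate its backing compute instance so OKE provisions a
	// replacement running the upgraded Kubernetes version, wait for the replacement to join and become
	// Ready, then wait for all pods to be ready before moving on to the next node.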
	t.It("replaces each worker node in the node pool", func() {
		// get the nodes
		nodes := hacommon.EventuallyGetNodes(clientset, t.Logs)
		latestNodes := nodes
		for _, node := range nodes.Items {
			if !hacommon.IsControlPlaneNode(node) {
				// cordon and drain the node - drain is implemented in kubectl itself and is not
				// available through the Kubernetes client API
				t.Logs.Infof("Draining node: %s", node.Name)
				kubectlArgs := []string{
					"drain",
					"--ignore-daemonsets",
					"--delete-emptydir-data",
					"--force",
					fmt.Sprintf("--skip-wait-for-delete-timeout=%v", int(waitForDeleteTimeout.Seconds())),
					"--timeout=15m",
					node.Name,
				}
				out, err := exec.Command("kubectl", kubectlArgs...).CombinedOutput() //nolint:gosec //#nosec G204
				t.Logs.Infof("Combined output from kubectl drain command: %s", out)
				if err != nil {
					t.Logs.Infof("Error occurred running kubectl drain command: %s", err.Error())
				}
				Expect(err).ShouldNot(HaveOccurred())

				// terminate the compute instance that the node is on; OKE will replace it with a new node
				// running the upgraded Kubernetes version
				t.Logs.Infof("Terminating compute instance: %s", node.Spec.ProviderID)
				err = terminateComputeInstance(node.Spec.ProviderID)
				Expect(err).ShouldNot(HaveOccurred())

				// wait for the replacement node to join the cluster and become ready
				latestNodes, err = waitForReplacementNode(latestNodes)
				Expect(err).ShouldNot(HaveOccurred())

				// wait for all pods to be ready before continuing to the next node
				t.Logs.Infof("Waiting for all pods to be ready")
				hacommon.EventuallyPodsReady(t.Logs, clientset)
			}
		}
	})

	t.It("validates the k8s version of each worker node in the node pool", func() {
		if skipClusterVersionUpgrade {
			t.Logs.Infof("%s=%v, skipping node pool verification", skipUpgradeEnvVar, skipClusterVersionUpgrade)
			return
		}
		// get the nodes and check that both the kube-proxy and kubelet versions match the upgrade version
		nodes := hacommon.EventuallyGetNodes(clientset, t.Logs)
		for _, node := range nodes.Items {
			Expect(node.Status.NodeInfo.KubeProxyVersion).To(Equal(upgradeVersion), "kube-proxy version is incorrect")
			Expect(node.Status.NodeInfo.KubeletVersion).To(Equal(upgradeVersion), "kubelet version is incorrect")
		}
	})
})

// waitForWorkRequest waits for the work request to transition to success
func waitForWorkRequest(workRequestID string, timeout time.Duration) {
	Eventually(func() (ocice.WorkRequestStatusEnum, error) {
		t.Logs.Infof("Waiting for work request with id %s to complete", workRequestID)
		workRequestResponse, err := okeClient.GetWorkRequest(context.Background(), ocice.GetWorkRequestRequest{WorkRequestId: &workRequestID})
		if err != nil {
			return "", err
		}
		t.Logs.Debugf("Work request response: %+v", workRequestResponse)
		return workRequestResponse.Status, nil
	}).WithTimeout(timeout).WithPolling(pollingInterval).Should(Equal(ocice.WorkRequestStatusSucceeded))
}

// terminateComputeInstance terminates the compute instance with the given OCID
func terminateComputeInstance(instanceID string) error {
	_, err := computeClient.TerminateInstance(context.Background(), ocicore.TerminateInstanceRequest{InstanceId: &instanceID})
	return err
}

// waitForReplacementNode waits for a replacement node to be ready. It returns the new list of nodes that includes
// the replacement node.
func waitForReplacementNode(existingNodes *corev1.NodeList) (*corev1.NodeList, error) {
	var replacement string
	var latestNodes *corev1.NodeList

	// wait for a worker node that is not in the existing node list to appear
	Eventually(func() string {
		t.Logs.Infof("Waiting for replacement worker node")
		latestNodes = hacommon.EventuallyGetNodes(clientset, t.Logs)
		for _, node := range latestNodes.Items {
			if !hacommon.IsControlPlaneNode(node) {
				if !isExistingNode(node, existingNodes) {
					replacement = node.Name
					break
				}
			}
		}
		return replacement
	}).WithTimeout(waitTimeout).WithPolling(pollingInterval).ShouldNot(BeEmpty())

	if len(replacement) == 0 {
		return nil, errors.New("timed out waiting for new worker to be added to node pool")
	}

	// wait for the replacement node to report a Ready condition
	Eventually(func() (bool, error) {
		t.Logs.Infof("Waiting for new worker node %s to be ready", replacement)
		return isNodeReady(replacement)
	}).WithTimeout(waitTimeout).WithPolling(pollingInterval).Should(BeTrue())

	return latestNodes, nil
}

// isExistingNode returns true if the specified node is in the list of existing nodes
func isExistingNode(node corev1.Node, existingNodes *corev1.NodeList) bool {
	for _, existingNode := range existingNodes.Items {
		if node.Name == existingNode.Name {
			return true
		}
	}
	return false
}

// isNodeReady returns true if the NodeReady condition is true
func isNodeReady(name string) (bool, error) {
	node, err := clientset.CoreV1().Nodes().Get(context.TODO(), name, metav1.GetOptions{})
	if err != nil {
		return false, err
	}

	for _, condition := range node.Status.Conditions {
		if condition.Type == corev1.NodeReady && condition.Status == corev1.ConditionTrue {
			return true, nil
		}
	}

	return false, nil
}

// getOCISDKProvider returns an OCI SDK configuration provider. If a region is specified then
// use an instance principal auth provider, otherwise use the default provider (auth config comes from
// an OCI config file or environment variables).
func getOCISDKProvider(region string) (common.ConfigurationProvider, error) {
	var provider common.ConfigurationProvider
	var err error

	if region != "" {
		t.Logs.Infof("Using OCI SDK instance principal provider with region: %s", region)
		provider, err = auth.InstancePrincipalConfigurationProviderForRegion(common.StringToRegion(region))
	} else {
		t.Logs.Info("Using OCI SDK default provider")
		provider = common.DefaultConfigProvider()
	}

	if err != nil {
		return nil, err
	}

	// use the SDK default retry policy for all OCI API calls made by this test
	defaultRetryPolicy := common.DefaultRetryPolicy()
	common.GlobalRetry = &defaultRetryPolicy
	return provider, nil
}