github.com/verrazzano/verrazzano@v1.7.1/tests/e2e/ha/inplaceupgrade/in_place_upgrade_test.go

// Copyright (c) 2022, 2023, Oracle and/or its affiliates.
// Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl.

package inplaceupgrade

import (
	"context"
	"errors"
	"fmt"
	"os"
	"os/exec"
	"strconv"
	"time"

	. "github.com/onsi/ginkgo/v2"
	. "github.com/onsi/gomega"
	"github.com/oracle/oci-go-sdk/v53/common"
	"github.com/oracle/oci-go-sdk/v53/common/auth"
	ocice "github.com/oracle/oci-go-sdk/v53/containerengine"
	ocicore "github.com/oracle/oci-go-sdk/v53/core"
	"github.com/verrazzano/verrazzano/pkg/k8sutil"
	hacommon "github.com/verrazzano/verrazzano/tests/e2e/pkg/ha"
	"github.com/verrazzano/verrazzano/tests/e2e/pkg/test/framework"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

const (
	clusterIDEnvVar   = "OKE_CLUSTER_ID"
	ociRegionEnvVar   = "OCI_CLI_REGION"
	skipUpgradeEnvVar = "SKIP_KUBERNETES_UPGRADE"

	waitTimeout             = 20 * time.Minute
	waitTimeoutControlPlane = 2 * time.Hour

	pollingInterval = 30 * time.Second

	waitForDeleteTimeout = 600 * time.Second
)

var clientset = k8sutil.GetKubernetesClientsetOrDie()
var t = framework.NewTestFramework("in_place_upgrade")

var (
	failed                    bool
	region                    string
	clusterID                 string
	skipClusterVersionUpgrade bool

	okeClient     ocice.ContainerEngineClient
	computeClient ocicore.ComputeClient
)

var _ = t.AfterEach(func() {
	failed = failed || CurrentSpecReport().Failed()
})

var beforeSuite = t.BeforeSuiteFunc(func() {
	clusterID = os.Getenv(clusterIDEnvVar)
	region = os.Getenv(ociRegionEnvVar)

	if skipUpgradeVal, set := os.LookupEnv(skipUpgradeEnvVar); set {
		var parseErr error
		skipClusterVersionUpgrade, parseErr = strconv.ParseBool(skipUpgradeVal)
		Expect(parseErr).ShouldNot(HaveOccurred(), fmt.Sprintf("Invalid value for %s: %s", skipUpgradeEnvVar, skipUpgradeVal))
	}

	Expect(clusterID).ToNot(BeEmpty(), fmt.Sprintf("%s env var must be set", clusterIDEnvVar))
	// region is optional so don't Expect

	var provider common.ConfigurationProvider
	var err error
	provider, err = getOCISDKProvider(region)
	Expect(err).ShouldNot(HaveOccurred(), "Error configuring OCI SDK provider")

	okeClient, err = ocice.NewContainerEngineClientWithConfigurationProvider(provider)
	Expect(err).ShouldNot(HaveOccurred(), "Error configuring OCI SDK container engine client")

	computeClient, err = ocicore.NewComputeClientWithConfigurationProvider(provider)
	Expect(err).ShouldNot(HaveOccurred(), "Error configuring OCI SDK compute client")
})

var _ = BeforeSuite(beforeSuite)

var afterSuite = t.AfterSuiteFunc(func() {
	// signal that the upgrade is done so the tests know to stop
	hacommon.EventuallyCreateShutdownSignal(clientset, t.Logs)
})

var _ = AfterSuite(afterSuite)
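// The suite is configured entirely through the environment variables declared
// above. One possible invocation (illustrative only; it assumes the standard
// ginkgo CLI and placeholder OCID/region values):
//
//	OKE_CLUSTER_ID=ocid1.cluster.oc1... OCI_CLI_REGION=us-ashburn-1 \
//	  ginkgo -v ./tests/e2e/ha/inplaceupgrade/...
//
// Setting SKIP_KUBERNETES_UPGRADE=true skips the control plane and node pool
// version upgrades and only exercises the worker node replacement steps.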
var _ = t.Describe("OKE In-Place Upgrade", Label("f:platform-lcm:ha"), func() {
	var clusterResponse ocice.GetClusterResponse
	var upgradeVersion string

	t.It("upgrades the control plane Kubernetes version", func() {
		if skipClusterVersionUpgrade {
			t.Logs.Infof("%s=%v, skipping cluster Control Plane upgrade", skipUpgradeEnvVar, skipClusterVersionUpgrade)
			return
		}

		// first get the cluster details and find the available upgrade versions
		var err error
		clusterResponse, err = okeClient.GetCluster(context.Background(), ocice.GetClusterRequest{ClusterId: &clusterID})
		Expect(err).ShouldNot(HaveOccurred())
		t.Logs.Debugf("Cluster response: %+v", clusterResponse)
		Expect(clusterResponse.AvailableKubernetesUpgrades).ToNot(BeEmpty(), "No available upgrade versions")

		// upgrade the control plane to the first available upgrade version
		upgradeVersion = clusterResponse.AvailableKubernetesUpgrades[0]
		t.Logs.Infof("Upgrading the OKE cluster control plane to version: %s", upgradeVersion)
		details := ocice.UpdateClusterDetails{KubernetesVersion: &upgradeVersion}
		updateResponse, err := okeClient.UpdateCluster(context.Background(), ocice.UpdateClusterRequest{ClusterId: &clusterID, UpdateClusterDetails: details})
		Expect(err).ShouldNot(HaveOccurred())
		Expect(updateResponse.OpcWorkRequestId).ShouldNot(BeNil())

		// wait for the work request to complete; this can take roughly 5 to 15 minutes
		waitForWorkRequest(*updateResponse.OpcWorkRequestId, waitTimeoutControlPlane)
	})

	t.It("upgrades the node pool Kubernetes version", func() {
		if skipClusterVersionUpgrade {
			t.Logs.Infof("%s=%v, skipping node pool upgrade", skipUpgradeEnvVar, skipClusterVersionUpgrade)
			return
		}
		// first get the node pool; the cluster response struct does not include node pools,
		// so list the node pools in the compartment and filter by the cluster ID
		nodePoolsResponse, err := okeClient.ListNodePools(context.Background(), ocice.ListNodePoolsRequest{CompartmentId: clusterResponse.CompartmentId, ClusterId: clusterResponse.Id})
		Expect(err).ShouldNot(HaveOccurred())
		Expect(len(nodePoolsResponse.Items)).To(Equal(1))

		// upgrade the node pool to the same Kubernetes version as the control plane
		t.Logs.Infof("Upgrading the OKE cluster node pool to version: %s", upgradeVersion)
		details := ocice.UpdateNodePoolDetails{KubernetesVersion: &upgradeVersion}
		updateResponse, err := okeClient.UpdateNodePool(context.Background(), ocice.UpdateNodePoolRequest{NodePoolId: nodePoolsResponse.Items[0].Id, UpdateNodePoolDetails: details})
		Expect(err).ShouldNot(HaveOccurred())
		Expect(updateResponse.OpcWorkRequestId).ShouldNot(BeNil())

		// wait for the work request to complete
		waitForWorkRequest(*updateResponse.OpcWorkRequestId, waitTimeout)
	})

	t.It("replaces each worker node in the node pool", func() {
		// get the nodes
		nodes := hacommon.EventuallyGetNodes(clientset, t.Logs)
		latestNodes := nodes
		for _, node := range nodes.Items {
			if !hacommon.IsControlPlaneNode(node) {
				// cordon and drain the node - drain is implemented in kubectl itself and is not
				// available through a k8s client (see the cordonNode sketch below)
				t.Logs.Infof("Draining node: %s", node.Name)
				kubectlArgs := []string{
					"drain",
					"--ignore-daemonsets",
					"--delete-emptydir-data",
					"--force",
					fmt.Sprintf("--skip-wait-for-delete-timeout=%v", int(waitForDeleteTimeout.Seconds())),
					"--timeout=15m",
					node.Name,
				}
				out, err := exec.Command("kubectl", kubectlArgs...).CombinedOutput() //nolint:gosec //#nosec G204
				t.Logs.Infof("Combined output from kubectl drain command: %s", out)
				if err != nil {
					t.Logs.Infof("Error occurred running kubectl drain command: %s", err.Error())
				}
				Expect(err).ShouldNot(HaveOccurred())

				// terminate the compute instance that the node is on, OKE will replace it with a new node
				// running the upgraded Kubernetes version
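				// (node.Spec.ProviderID is handed straight to TerminateInstance below; on these OKE
				// workers the provider ID carries the compute instance OCID)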
t.Logs.Infof("Terminating compute instance: %s", node.Spec.ProviderID) 171 err = terminateComputeInstance(node.Spec.ProviderID) 172 Expect(err).ShouldNot(HaveOccurred()) 173 174 latestNodes, err = waitForReplacementNode(latestNodes) 175 Expect(err).ShouldNot(HaveOccurred()) 176 177 // wait for all pods to be ready before continuing to the next node 178 t.Logs.Infof("Waiting for all pods to be ready") 179 hacommon.EventuallyPodsReady(t.Logs, clientset) 180 } 181 } 182 }) 183 184 t.It("validates the k8s version of each worker node in the node pool", func() { 185 if skipClusterVersionUpgrade { 186 t.Logs.Infof("%s=%v, skipping node pool verification", skipUpgradeEnvVar, skipClusterVersionUpgrade) 187 return 188 } 189 // get the nodes and check both the kube proxy and kubelet versions 190 nodes := hacommon.EventuallyGetNodes(clientset, t.Logs) 191 for _, node := range nodes.Items { 192 Expect(node.Status.NodeInfo.KubeProxyVersion).To(Equal(upgradeVersion), "kube proxy version is incorrect") 193 Expect(node.Status.NodeInfo.KubeletVersion).To(Equal(upgradeVersion), "kubelet version is incorrect") 194 } 195 }) 196 }) 197 198 // waitForWorkRequest waits for the work request to transition to success 199 func waitForWorkRequest(workRequestID string, timeout time.Duration) { 200 Eventually(func() (ocice.WorkRequestStatusEnum, error) { 201 t.Logs.Infof("Waiting for work request with id %s to complete", workRequestID) 202 workRequestResponse, err := okeClient.GetWorkRequest(context.Background(), ocice.GetWorkRequestRequest{WorkRequestId: &workRequestID}) 203 if err != nil { 204 return "", err 205 } 206 t.Logs.Debugf("Work request response: %+v", workRequestResponse) 207 return workRequestResponse.Status, nil 208 }).WithTimeout(timeout).WithPolling(pollingInterval).Should(Equal(ocice.WorkRequestStatusSucceeded)) 209 } 210 211 // terminateComputeInstance terminates a compute instance 212 func terminateComputeInstance(instanceID string) error { 213 _, err := computeClient.TerminateInstance(context.Background(), ocicore.TerminateInstanceRequest{InstanceId: &instanceID}) 214 if err != nil { 215 return err 216 } 217 return nil 218 } 219 220 // waitForReplacementNode waits for a replacement node to be ready. It returns the new list of nodes that includes 221 // the replacement node. 
// waitForReplacementNode waits for a replacement node to be ready. It returns the new list of nodes that includes
// the replacement node.
func waitForReplacementNode(existingNodes *corev1.NodeList) (*corev1.NodeList, error) {
	var replacement string
	var latestNodes *corev1.NodeList

	Eventually(func() string {
		t.Logs.Infof("Waiting for replacement worker node")
		latestNodes = hacommon.EventuallyGetNodes(clientset, t.Logs)
		for _, node := range latestNodes.Items {
			if !hacommon.IsControlPlaneNode(node) {
				if !isExistingNode(node, existingNodes) {
					replacement = node.Name
					break
				}
			}
		}
		return replacement
	}).WithTimeout(waitTimeout).WithPolling(pollingInterval).ShouldNot(BeEmpty())

	if len(replacement) == 0 {
		return nil, errors.New("timed out waiting for new worker to be added to node pool")
	}

	Eventually(func() (bool, error) {
		t.Logs.Infof("Waiting for new worker node %s to be ready", replacement)
		return isNodeReady(replacement)
	}).WithTimeout(waitTimeout).WithPolling(pollingInterval).Should(BeTrue())

	return latestNodes, nil
}

// isExistingNode returns true if the specified node is in the list of existing nodes
func isExistingNode(node corev1.Node, existingNodes *corev1.NodeList) bool {
	for _, existingNode := range existingNodes.Items {
		if node.Name == existingNode.Name {
			return true
		}
	}
	return false
}

// isNodeReady returns true if the NodeReady condition is true
func isNodeReady(name string) (bool, error) {
	node, err := clientset.CoreV1().Nodes().Get(context.TODO(), name, metav1.GetOptions{})
	if err != nil {
		return false, err
	}

	for _, condition := range node.Status.Conditions {
		if condition.Type == corev1.NodeReady && condition.Status == corev1.ConditionTrue {
			return true, nil
		}
	}

	return false, nil
}

// getOCISDKProvider returns an OCI SDK configuration provider. If a region is specified then
// use an instance principal auth provider, otherwise use the default provider (auth config comes from
// an OCI config file or environment variables).
func getOCISDKProvider(region string) (common.ConfigurationProvider, error) {
	var provider common.ConfigurationProvider
	var err error

	if region != "" {
		t.Logs.Infof("Using OCI SDK instance principal provider with region: %s", region)
		provider, err = auth.InstancePrincipalConfigurationProviderForRegion(common.StringToRegion(region))
	} else {
		t.Logs.Info("Using OCI SDK default provider")
		provider = common.DefaultConfigProvider()
	}

	if err != nil {
		return nil, err
	}

	defaultRetryPolicy := common.DefaultRetryPolicy()
	common.GlobalRetry = &defaultRetryPolicy
	return provider, nil
}
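// waitForWorkRequestOrFail is an illustrative variant (not used by this suite) of
// waitForWorkRequest that polls without gomega and returns an error as soon as the
// work request reports a terminal failure instead of waiting out the full timeout.
// It assumes the SDK defines WorkRequestStatusFailed alongside the
// WorkRequestStatusSucceeded constant used above.
func waitForWorkRequestOrFail(workRequestID string, timeout time.Duration) error {
	deadline := time.Now().Add(timeout)
	for {
		// reuse the same GetWorkRequest call the suite polls with
		resp, err := okeClient.GetWorkRequest(context.Background(), ocice.GetWorkRequestRequest{WorkRequestId: &workRequestID})
		if err != nil {
			return err
		}
		switch resp.Status {
		case ocice.WorkRequestStatusSucceeded:
			return nil
		case ocice.WorkRequestStatusFailed:
			return fmt.Errorf("work request %s failed", workRequestID)
		}
		if time.Now().After(deadline) {
			return fmt.Errorf("timed out waiting for work request %s", workRequestID)
		}
		time.Sleep(pollingInterval)
	}
}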