k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/test/e2e_node/checkpoint_container.go (about) 1 /* 2 Copyright 2022 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package e2enode 18 19 import ( 20 "archive/tar" 21 "context" 22 "encoding/json" 23 "fmt" 24 "io" 25 "net/http" 26 "os" 27 "strings" 28 "time" 29 30 "github.com/onsi/ginkgo/v2" 31 v1 "k8s.io/api/core/v1" 32 apierrors "k8s.io/apimachinery/pkg/api/errors" 33 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 34 clientset "k8s.io/client-go/kubernetes" 35 restclient "k8s.io/client-go/rest" 36 "k8s.io/kubernetes/test/e2e/framework" 37 e2emetrics "k8s.io/kubernetes/test/e2e/framework/metrics" 38 e2epod "k8s.io/kubernetes/test/e2e/framework/pod" 39 "k8s.io/kubernetes/test/e2e/nodefeature" 40 testutils "k8s.io/kubernetes/test/utils" 41 imageutils "k8s.io/kubernetes/test/utils/image" 42 admissionapi "k8s.io/pod-security-admission/api" 43 44 "github.com/onsi/gomega" 45 ) 46 47 const ( 48 // timeout for proxy requests. 49 proxyTimeout = 2 * time.Minute 50 ) 51 52 type checkpointResult struct { 53 Items []string `json:"items"` 54 } 55 56 // proxyPostRequest performs a post on a node proxy endpoint given the nodename and rest client. 57 func proxyPostRequest(ctx context.Context, c clientset.Interface, node, endpoint string, port int) (restclient.Result, error) { 58 // proxy tends to hang in some cases when Node is not ready. Add an artificial timeout for this call. #22165 59 var result restclient.Result 60 finished := make(chan struct{}, 1) 61 go func() { 62 result = c.CoreV1().RESTClient().Post(). 63 Resource("nodes"). 64 SubResource("proxy"). 65 Name(fmt.Sprintf("%v:%v", node, port)). 66 Suffix(endpoint). 67 Do(ctx) 68 69 finished <- struct{}{} 70 }() 71 select { 72 case <-finished: 73 return result, nil 74 case <-ctx.Done(): 75 return restclient.Result{}, nil 76 case <-time.After(proxyTimeout): 77 return restclient.Result{}, nil 78 } 79 } 80 81 func getCheckpointContainerMetric(ctx context.Context, f *framework.Framework, pod *v1.Pod) (int, error) { 82 framework.Logf("Getting 'checkpoint_container' metrics from %q", pod.Spec.NodeName) 83 ms, err := e2emetrics.GetKubeletMetrics( 84 ctx, 85 f.ClientSet, 86 pod.Spec.NodeName, 87 ) 88 if err != nil { 89 return 0, err 90 } 91 92 runtimeOperationsTotal, ok := ms["runtime_operations_total"] 93 if !ok { 94 // If the metric was not found it was probably not written to, yet. 95 return 0, nil 96 } 97 98 for _, item := range runtimeOperationsTotal { 99 if item.Metric["__name__"] == "kubelet_runtime_operations_total" && item.Metric["operation_type"] == "checkpoint_container" { 100 return int(item.Value), nil 101 } 102 } 103 // If the metric was not found it was probably not written to, yet. 104 return 0, nil 105 } 106 107 func getCheckpointContainerErrorMetric(ctx context.Context, f *framework.Framework, pod *v1.Pod) (int, error) { 108 framework.Logf("Getting 'checkpoint_container' error metrics from %q", pod.Spec.NodeName) 109 ms, err := e2emetrics.GetKubeletMetrics( 110 ctx, 111 f.ClientSet, 112 pod.Spec.NodeName, 113 ) 114 if err != nil { 115 return 0, err 116 } 117 118 runtimeOperationsErrorsTotal, ok := ms["runtime_operations_errors_total"] 119 if !ok { 120 // If the metric was not found it was probably not written to, yet. 121 return 0, nil 122 } 123 124 for _, item := range runtimeOperationsErrorsTotal { 125 if item.Metric["__name__"] == "kubelet_runtime_operations_errors_total" && item.Metric["operation_type"] == "checkpoint_container" { 126 return int(item.Value), nil 127 } 128 } 129 // If the metric was not found it was probably not written to, yet. 130 return 0, nil 131 } 132 133 var _ = SIGDescribe("Checkpoint Container", nodefeature.CheckpointContainer, func() { 134 f := framework.NewDefaultFramework("checkpoint-container-test") 135 f.NamespacePodSecurityLevel = admissionapi.LevelBaseline 136 ginkgo.It("will checkpoint a container out of a pod", func(ctx context.Context) { 137 ginkgo.By("creating a target pod") 138 podClient := e2epod.NewPodClient(f) 139 pod := podClient.CreateSync(ctx, &v1.Pod{ 140 ObjectMeta: metav1.ObjectMeta{ 141 Name: "checkpoint-container-pod", 142 Namespace: f.Namespace.Name, 143 }, 144 Spec: v1.PodSpec{ 145 Containers: []v1.Container{ 146 { 147 Name: "test-container-1", 148 Image: imageutils.GetE2EImage(imageutils.BusyBox), 149 Command: []string{"/bin/sleep"}, 150 Args: []string{"10000"}, 151 }, 152 }, 153 }, 154 }) 155 156 p, err := podClient.Get( 157 ctx, 158 pod.Name, 159 metav1.GetOptions{}, 160 ) 161 162 framework.ExpectNoError(err) 163 isReady, err := testutils.PodRunningReady(p) 164 framework.ExpectNoError(err) 165 if !isReady { 166 framework.Failf("pod %q should be ready", p.Name) 167 } 168 169 // No checkpoint operation should have been logged 170 checkpointContainerMetric, err := getCheckpointContainerMetric(ctx, f, pod) 171 framework.ExpectNoError(err) 172 gomega.Expect(checkpointContainerMetric).To(gomega.Equal(0)) 173 // No error should have been logged 174 checkpointContainerErrorMetric, err := getCheckpointContainerErrorMetric(ctx, f, pod) 175 framework.ExpectNoError(err) 176 gomega.Expect(checkpointContainerErrorMetric).To(gomega.Equal(0)) 177 178 framework.Logf( 179 "About to checkpoint container %q on %q", 180 pod.Spec.Containers[0].Name, 181 pod.Spec.NodeName, 182 ) 183 result, err := proxyPostRequest( 184 ctx, 185 f.ClientSet, 186 pod.Spec.NodeName, 187 fmt.Sprintf( 188 "checkpoint/%s/%s/%s", 189 f.Namespace.Name, 190 pod.Name, 191 pod.Spec.Containers[0].Name, 192 ), 193 framework.KubeletPort, 194 ) 195 196 framework.ExpectNoError(err) 197 198 err = result.Error() 199 if err != nil { 200 statusError, ok := err.(*apierrors.StatusError) 201 if !ok { 202 framework.Failf("got error %#v, expected StatusError", err) 203 } 204 // If we are testing against a kubelet with ContainerCheckpoint == false 205 // we should get a 404. So a 404 is (also) a good sign. 206 if (int(statusError.ErrStatus.Code)) == http.StatusNotFound { 207 ginkgo.Skip("Feature 'ContainerCheckpoint' is not enabled and not available") 208 return 209 } 210 211 // If the container engine has not implemented the Checkpoint CRI API 212 // we will get 500 and a message with 213 // '(rpc error: code = Unimplemented desc = unknown method CheckpointContainer' 214 // or 215 // '(rpc error: code = Unimplemented desc = method CheckpointContainer not implemented)' 216 // if the container engine returns that it explicitly has disabled support for it. 217 // or 218 // '(rpc error: code = Unknown desc = checkpoint/restore support not available)' 219 // if the container engine explicitly disabled the checkpoint/restore support 220 // or 221 // '(rpc error: code = Unknown desc = CRIU binary not found or too old (<31600). Failed to checkpoint container' 222 // if the CRIU binary was not found if it is too old 223 if (int(statusError.ErrStatus.Code)) == http.StatusInternalServerError { 224 if strings.Contains( 225 statusError.ErrStatus.Message, 226 "(rpc error: code = Unimplemented desc = unknown method CheckpointContainer", 227 ) { 228 ginkgo.Skip("Container engine does not implement 'CheckpointContainer'") 229 return 230 } 231 if strings.Contains( 232 statusError.ErrStatus.Message, 233 "(rpc error: code = Unimplemented desc = method CheckpointContainer not implemented)", 234 ) { 235 ginkgo.Skip("Container engine does not implement 'CheckpointContainer'") 236 return 237 } 238 if strings.Contains( 239 statusError.ErrStatus.Message, 240 "(rpc error: code = Unknown desc = checkpoint/restore support not available)", 241 ) { 242 ginkgo.Skip("Container engine does not implement 'CheckpointContainer'") 243 return 244 } 245 if strings.Contains( 246 statusError.ErrStatus.Message, 247 "(rpc error: code = Unknown desc = CRIU binary not found or too old (<31600). Failed to checkpoint container", 248 ) { 249 ginkgo.Skip("Container engine reports missing or too old CRIU binary") 250 return 251 } 252 } 253 framework.Failf( 254 "Unexpected status code (%d) during 'CheckpointContainer': %q", 255 statusError.ErrStatus.Code, 256 statusError.ErrStatus.Message, 257 ) 258 } 259 260 framework.ExpectNoError(err) 261 262 // Checkpointing actually worked. Verify that the checkpoint exists and that 263 // it is a checkpoint. 264 265 raw, err := result.Raw() 266 framework.ExpectNoError(err) 267 answer := checkpointResult{} 268 err = json.Unmarshal(raw, &answer) 269 framework.ExpectNoError(err) 270 271 for _, item := range answer.Items { 272 // Check that the file exists 273 _, err := os.Stat(item) 274 framework.ExpectNoError(err) 275 // Check the content of the tar file 276 // At least looking for the following files 277 // * spec.dump 278 // * config.dump 279 // * checkpoint/inventory.img 280 // If these files exist in the checkpoint archive it is 281 // probably a complete checkpoint. 282 checkForFiles := map[string]bool{ 283 "spec.dump": false, 284 "config.dump": false, 285 "checkpoint/inventory.img": false, 286 } 287 fileReader, err := os.Open(item) 288 framework.ExpectNoError(err) 289 tr := tar.NewReader(fileReader) 290 for { 291 hdr, err := tr.Next() 292 if err == io.EOF { 293 // End of archive 294 break 295 } 296 framework.ExpectNoError(err) 297 if _, key := checkForFiles[hdr.Name]; key { 298 checkForFiles[hdr.Name] = true 299 } 300 } 301 for fileName := range checkForFiles { 302 if !checkForFiles[fileName] { 303 framework.Failf("File %q not found in checkpoint archive %q", fileName, item) 304 } 305 } 306 // cleanup checkpoint archive 307 os.RemoveAll(item) 308 } 309 // Exactly one checkpoint operation should have happened 310 checkpointContainerMetric, err = getCheckpointContainerMetric(ctx, f, pod) 311 framework.ExpectNoError(err) 312 gomega.Expect(checkpointContainerMetric).To(gomega.Equal(1)) 313 // No error should have been logged 314 checkpointContainerErrorMetric, err = getCheckpointContainerErrorMetric(ctx, f, pod) 315 framework.ExpectNoError(err) 316 gomega.Expect(checkpointContainerErrorMetric).To(gomega.Equal(0)) 317 }) 318 })