// Copyright 2023 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package container

import (
	"context"
	"fmt"
	"io/ioutil"
	"os"
	"path/filepath"
	"strconv"
	"strings"
	"testing"
	"time"

	"github.com/google/go-cmp/cmp"
	specs "github.com/opencontainers/runtime-spec/specs-go"
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/cleanup"
	"gvisor.dev/gvisor/pkg/test/testutil"
	"gvisor.dev/gvisor/runsc/config"
	"gvisor.dev/gvisor/test/metricclient"
)

const (
	// podAnnotation contains the name of the pod that a sandbox represents when running in
	// Kubernetes.
	podAnnotation = "io.kubernetes.cri.sandbox-name"
	// namespaceAnnotation contains the name of the namespace that a sandbox is in when running in
	// Kubernetes.
	namespaceAnnotation = "io.kubernetes.cri.sandbox-namespace"
)

// metricsTest is returned by setupMetrics. It bundles everything a test needs
// to create containers that report to a metric server and to query that server.
type metricsTest struct {
	// testCtx is the context tests pass to metric client calls. setupMetrics
	// creates it with a 3-minute timeout.
	testCtx context.Context
	// rootDir is the root directory returned by testutil.SetupContainer.
	rootDir string
	// bundleDir is the bundle directory returned by testutil.SetupContainer.
	bundleDir string
	// sleepSpec is the container spec obtained from sleepSpecConf.
	sleepSpec *specs.Spec
	// sleepConf is the config obtained from sleepSpecConf, with MetricServer
	// set to udsPath.
	sleepConf *config.Config
	// udsPath is the path of the Unix domain socket used by the metric server.
	udsPath string
	// client is the metric client used to spawn, query and shut down the
	// metric server.
	client *metricclient.MetricClient
	// serverExtraArgs are the extra arguments passed when spawning the metric
	// server; tests reuse them when re-spawning it.
	serverExtraArgs []string
}

// applyConf applies metric-server-related configuration options to the given config.
// Returns the passed-in config itself.
60 func (mt *metricsTest) applyConf(conf *config.Config) *config.Config { 61 conf.MetricServer = mt.sleepConf.MetricServer 62 conf.RootDir = mt.rootDir 63 return conf 64 } 65 66 // setupMetrics sets up a container configuration with metrics enabled, and returns it all. 67 // Also returns a cleanup function. 68 func setupMetrics(t *testing.T, forceTempUDS bool) (*metricsTest, func()) { 69 // Start the child reaper. 70 childReaper := &testutil.Reaper{} 71 childReaper.Start() 72 cu := cleanup.Make(childReaper.Stop) 73 74 cleanupCtx, cleanupCancel := context.WithTimeout(context.Background(), 3*time.Minute+30*time.Second) 75 cu.Add(cleanupCancel) 76 testCtx, testCancel := context.WithTimeout(cleanupCtx, 3*time.Minute) 77 cu.Add(testCancel) 78 79 spec, conf := sleepSpecConf(t) 80 conf.MetricServer = "%RUNTIME_ROOT%/metrics.sock" 81 serverExtraArgs := []string{"--exporter-prefix=testmetric_"} 82 rootDir, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) 83 if err != nil { 84 t.Fatalf("error setting up container: %v", err) 85 } 86 cu.Add(cleanup) 87 tmpDir, err := os.MkdirTemp("/tmp", "metrics-") 88 if err != nil { 89 t.Fatalf("Cannot create temporary directory in /tmp: %v", err) 90 } 91 cu.Add(func() { os.RemoveAll(tmpDir) }) 92 udsPath := filepath.Join(rootDir, "metrics.sock") 93 if forceTempUDS || len(udsPath) >= 100 { 94 udsPath = filepath.Join(tmpDir, "metrics.sock") 95 } 96 if len(udsPath) >= 100 { 97 t.Fatalf("Cannot come up with a UDS path shorter than the maximum length allowed by Linux (tried to use %q)", udsPath) 98 } 99 conf.MetricServer = udsPath 100 // The UDS should be deleted by the metrics server itself, but we clean it up here anyway just in case: 101 cu.Add(func() { os.Remove(udsPath) }) 102 103 metricClient := metricclient.NewMetricClient(udsPath, rootDir) 104 if err := metricClient.SpawnServer(testCtx, conf, serverExtraArgs...); err != nil { 105 t.Fatalf("Cannot start metric server: %v", err) 106 } 107 cu.Add(func() { 
metricClient.ShutdownServer(cleanupCtx) }) 108 109 return &metricsTest{ 110 testCtx: testCtx, 111 rootDir: rootDir, 112 bundleDir: bundleDir, 113 sleepSpec: spec, 114 sleepConf: conf, 115 udsPath: udsPath, 116 client: metricClient, 117 serverExtraArgs: serverExtraArgs, 118 }, cu.Clean 119 } 120 121 // TestContainerMetrics verifies basic functionality of the metric server works. 122 func TestContainerMetrics(t *testing.T) { 123 targetOpens := 200 124 125 te, cleanup := setupMetrics(t /* forceTempUDS= */, false) 126 defer cleanup() 127 128 if _, err := te.client.GetMetrics(te.testCtx, nil); err != nil { 129 t.Fatal("GetMetrics failed prior to container start") 130 } 131 if te.sleepSpec.Annotations == nil { 132 te.sleepSpec.Annotations = make(map[string]string) 133 } 134 te.sleepSpec.Annotations[podAnnotation] = "foopod" 135 te.sleepSpec.Annotations[namespaceAnnotation] = "foons" 136 args := Args{ 137 ID: testutil.RandomContainerID(), 138 Spec: te.sleepSpec, 139 BundleDir: te.bundleDir, 140 } 141 cont, err := New(te.sleepConf, args) 142 if err != nil { 143 t.Fatalf("error creating container: %v", err) 144 } 145 defer cont.Destroy() 146 udsStat, udsStatErr := os.Stat(te.udsPath) 147 if udsStatErr != nil { 148 t.Fatalf("Stat(%s) failed after creating container: %v", te.udsPath, udsStatErr) 149 } 150 if udsStat.Mode()&os.ModeSocket == 0 { 151 t.Errorf("Stat(%s): Got mode %x, expected socket (mode %x)", te.udsPath, udsStat.Mode(), os.ModeSocket) 152 } 153 initialData, err := te.client.GetMetrics(te.testCtx, nil) 154 if err != nil { 155 t.Errorf("Cannot get metrics after creating container: %v", err) 156 } 157 gotSandboxMetadata, err := initialData.GetSandboxMetadataMetric(metricclient.WantMetric{ 158 Metric: "testmetric_meta_sandbox_metadata", 159 Sandbox: args.ID, 160 Pod: "foopod", 161 Namespace: "foons", 162 }) 163 if err != nil { 164 t.Errorf("Cannot get sandbox metadata: %v", err) 165 } 166 if gotSandboxMetadata["platform"] == "" || gotSandboxMetadata["platform"] != 
te.sleepConf.Platform { 167 t.Errorf("Invalid platform: Metric metadata says %v, config says %v", gotSandboxMetadata["platform"], te.sleepConf.Platform) 168 } 169 gotSpecMetadata, err := initialData.GetSandboxMetadataMetric(metricclient.WantMetric{ 170 Metric: "testmetric_meta_spec_metadata", 171 Sandbox: args.ID, 172 Pod: "foopod", 173 Namespace: "foons", 174 }) 175 if err != nil { 176 t.Errorf("Cannot get spec metadata: %v", err) 177 } 178 if gotSpecMetadata["hasuid0"] == "" || (gotSpecMetadata["hasuid0"] != "true" && gotSpecMetadata["hasuid0"] != "false") { 179 t.Errorf("Invalid or absent hasuid0 key from spec metadata: %v", gotSpecMetadata["hasuid0"]) 180 } 181 t.Logf("Metrics prior to container start:\n\n%s\n\n", initialData) 182 if err := cont.Start(te.sleepConf); err != nil { 183 t.Fatalf("Cannot start container: %v", err) 184 } 185 postStartData, err := te.client.GetMetrics(te.testCtx, nil) 186 if err != nil { 187 t.Fatalf("Cannot get metrics after starting container: %v", err) 188 } 189 postStartOpens, postStartTimestamp, err := postStartData.GetPrometheusContainerInteger(metricclient.WantMetric{ 190 Metric: "testmetric_fs_opens", 191 Sandbox: args.ID, 192 Pod: "foopod", 193 Namespace: "foons", 194 }) 195 if err != nil { 196 t.Errorf("Cannot get testmetric_fs_opens from following data (err: %v):\n\n%s\n\n", err, postStartData) 197 } 198 t.Logf("After container start, fs_opens=%d (snapshotted at %v)", postStartOpens, postStartTimestamp) 199 // The touch operation may fail from permission errors, but the metric should still be incremented. 
200 shOutput, err := executeCombinedOutput(te.sleepConf, cont, nil, "/bin/bash", "-c", fmt.Sprintf("for i in $(seq 1 %d); do touch /tmp/$i || true; done", targetOpens)) 201 if err != nil { 202 t.Fatalf("Exec failed: %v; output: %v", err, shOutput) 203 } 204 postExecData, err := te.client.GetMetrics(te.testCtx, nil) 205 if err != nil { 206 t.Fatalf("Cannot get metrics after a bunch of open() calls: %v", err) 207 } 208 postExecOpens, postExecTimestamp, err := postExecData.GetPrometheusContainerInteger(metricclient.WantMetric{ 209 Metric: "testmetric_fs_opens", 210 Sandbox: args.ID, 211 Pod: "foopod", 212 Namespace: "foons", 213 }) 214 if err != nil { 215 t.Errorf("Cannot get testmetric_fs_opens from following data (err: %v):\n\n%s\n\n", err, postExecData) 216 } 217 t.Logf("After exec'ing %d open()s, fs_opens=%d (snapshotted at %v)", targetOpens, postExecOpens, postExecTimestamp) 218 diffOpens := postExecOpens - postStartOpens 219 if diffOpens < int64(targetOpens) { 220 t.Errorf("testmetric_fs_opens went from %d to %d (diff: %d), expected the difference to be at least %d", postStartOpens, postExecOpens, diffOpens, targetOpens) 221 } 222 } 223 224 // TestContainerMetricsIterationID verifies that two successive containers with the same ID 225 // do not have the same iteration ID. 
226 func TestContainerMetricsIterationID(t *testing.T) { 227 te, cleanup := setupMetrics(t /* forceTempUDS= */, false) 228 defer cleanup() 229 230 args := Args{ 231 ID: testutil.RandomContainerID(), 232 Spec: te.sleepSpec, 233 BundleDir: te.bundleDir, 234 } 235 cont1, err := New(te.sleepConf, args) 236 if err != nil { 237 t.Fatalf("error creating container 1: %v", err) 238 } 239 defer cont1.Destroy() 240 data1, err := te.client.GetMetrics(te.testCtx, nil) 241 if err != nil { 242 t.Errorf("Cannot get metrics after creating container 1: %v", err) 243 } 244 metadata1, err := data1.GetSandboxMetadataMetric(metricclient.WantMetric{ 245 Metric: "testmetric_meta_sandbox_metadata", 246 Sandbox: args.ID, 247 }) 248 if err != nil { 249 t.Errorf("Cannot get sandbox 1 metadata: %v", err) 250 } 251 t.Logf("Container 1 metadata: %v", metadata1) 252 iterationID1 := metadata1["iteration"] 253 if iterationID1 == "" { 254 t.Fatalf("Cannot find iteration ID in metadata 1: %v", metadata1) 255 } 256 if err := cont1.Destroy(); err != nil && !strings.Contains(err.Error(), "no child process") { 257 t.Fatalf("Cannot destroy container 1: %v", err) 258 } 259 cont2, err := New(te.sleepConf, args) 260 if err != nil { 261 t.Fatalf("error creating container 2: %v", err) 262 } 263 defer cont2.Destroy() 264 data2, err := te.client.GetMetrics(te.testCtx, nil) 265 if err != nil { 266 t.Errorf("Cannot get metrics after creating container 2: %v", err) 267 } 268 metadata2, err := data2.GetSandboxMetadataMetric(metricclient.WantMetric{ 269 Metric: "testmetric_meta_sandbox_metadata", 270 Sandbox: args.ID, 271 }) 272 if err != nil { 273 t.Errorf("Cannot get sandbox 2 metadata: %v", err) 274 } 275 t.Logf("Container 2 metadata: %v", metadata2) 276 iterationID2 := metadata2["iteration"] 277 if iterationID2 == "" { 278 t.Fatalf("Cannot find iteration ID in metadata 2: %v", metadata2) 279 } 280 if iterationID1 == iterationID2 { 281 t.Errorf("Iteration IDs of successive instances with the same ID unexpectedly 
matched: %v", iterationID1) 282 } 283 } 284 285 // TestContainerMetricsRobustAgainstRestarts that exporting metrics is robust against metric server 286 // unavailability or restarts. 287 func TestContainerMetricsRobustAgainstRestarts(t *testing.T) { 288 targetOpens := 200 289 te, cleanup := setupMetrics(t /* forceTempUDS= */, false) 290 defer cleanup() 291 292 // First, start a container which will kick off the metric server as normal. 293 args := Args{ 294 ID: testutil.RandomContainerID(), 295 Spec: te.sleepSpec, 296 BundleDir: te.bundleDir, 297 } 298 cont, err := New(te.sleepConf, args) 299 if err != nil { 300 t.Fatalf("error creating container: %v", err) 301 } 302 defer cont.Destroy() 303 if err := cont.Start(te.sleepConf); err != nil { 304 t.Fatalf("Cannot start container: %v", err) 305 } 306 shOutput, err := executeCombinedOutput(te.sleepConf, cont, nil, "/bin/bash", "-c", fmt.Sprintf("for i in $(seq 1 %d); do touch /tmp/$i || true; done", targetOpens)) 307 if err != nil { 308 t.Fatalf("Exec failed: %v; output: %v", err, shOutput) 309 } 310 preRestartData, err := te.client.GetMetrics(te.testCtx, nil) 311 if err != nil { 312 t.Fatalf("Cannot get metrics after a bunch of open() calls: %v", err) 313 } 314 315 // Retain the value of fs_opens for the first container. We'll use it when comparing to the data 316 // from the restarted metric server. 
317 preRestartOpens, postExecTimestamp, err := preRestartData.GetPrometheusContainerInteger(metricclient.WantMetric{ 318 Metric: "testmetric_fs_opens", 319 Sandbox: args.ID, 320 }) 321 if err != nil { 322 t.Errorf("Cannot get testmetric_fs_opens from following data (err: %v):\n\n%s\n\n", err, preRestartData) 323 } 324 preRestartMetadata, err := preRestartData.GetSandboxMetadataMetric(metricclient.WantMetric{ 325 Metric: "testmetric_meta_sandbox_metadata", 326 Sandbox: args.ID, 327 }) 328 if err != nil { 329 t.Errorf("Cannot get sandbox metadata: %v", err) 330 } 331 t.Logf("After exec'ing %d open()s, fs_opens=%d (snapshotted at %v)", targetOpens, preRestartOpens, postExecTimestamp) 332 333 // Now shut down the metric server and verify we can no longer fetch metrics. 334 if err := te.client.ShutdownServer(te.testCtx); err != nil { 335 t.Fatalf("Cannot shutdown server: %v", err) 336 } 337 if rawData, err := te.client.GetMetrics(te.testCtx, nil); err == nil { 338 t.Fatalf("Unexpectedly was able to get metric data despite shutting down server:\n\n%s\n\n", rawData) 339 } 340 341 // Do a bunch of touches again. The metric server is down during this time. 342 // This verifies that metric value modifications does not depend on the metric server being up. 343 shOutput, err = executeCombinedOutput(te.sleepConf, cont, nil, "/bin/bash", "-c", fmt.Sprintf("for i in $(seq 1 %d); do touch /tmp/$i || true; done", targetOpens)) 344 if err != nil { 345 t.Fatalf("Exec failed: %v; output: %v", err, shOutput) 346 } 347 348 // Start a second container. 349 // This container should be picked up by a metric server we will start afterwards. 350 // This verifies that a metric server being down does not cause sandbox creation to fail. 
351 args2 := Args{ 352 ID: testutil.RandomContainerID(), 353 Spec: te.sleepSpec, 354 BundleDir: te.bundleDir, 355 } 356 cont2, err := New(te.sleepConf, args2) 357 if err != nil { 358 t.Fatalf("error creating second container: %v", err) 359 } 360 defer cont2.Destroy() 361 if rawData, err := te.client.GetMetrics(te.testCtx, nil); err == nil { 362 t.Fatalf("Unexpectedly was able to get metric data after creating second container:\n\n%s\n\n", rawData) 363 } 364 if err := cont2.Start(te.sleepConf); err != nil { 365 t.Fatalf("Cannot start second container: %v", err) 366 } 367 if rawData, err := te.client.GetMetrics(te.testCtx, nil); err == nil { 368 t.Fatalf("Unexpectedly was able to get metric data after starting second container:\n\n%s\n\n", rawData) 369 } 370 371 // Start the metric server. 372 if err := te.client.SpawnServer(te.testCtx, te.sleepConf, te.serverExtraArgs...); err != nil { 373 t.Fatalf("Cannot re-spawn server: %v", err) 374 } 375 376 // Now start a third container. 377 // This should be picked up by the server we just started. 378 args3 := Args{ 379 ID: testutil.RandomContainerID(), 380 Spec: te.sleepSpec, 381 BundleDir: te.bundleDir, 382 } 383 cont3, err := New(te.sleepConf, args3) 384 if err != nil { 385 t.Fatalf("error creating second container: %v", err) 386 } 387 defer cont3.Destroy() 388 if err := cont3.Start(te.sleepConf); err != nil { 389 t.Fatalf("Cannot start third container: %v", err) 390 } 391 392 // Verify that the metric server was restarted and that we can indeed get all the data we expect 393 // from all the containers this test has started. 
394 postRestartData, err := te.client.GetMetrics(te.testCtx, nil) 395 if err != nil { 396 t.Fatalf("Cannot get metrics after restarting server: %v", err) 397 } 398 postRestartOpens, _, err := postRestartData.GetPrometheusContainerInteger(metricclient.WantMetric{ 399 Metric: "testmetric_fs_opens", 400 Sandbox: args.ID, 401 }) 402 if err != nil { 403 t.Fatalf("Cannot get testmetric_fs_opens for first container (%s) from following data (err: %v):\n\n%s\n\n", args.ID, err, postRestartData) 404 } 405 if diff := postRestartOpens - preRestartOpens; diff < int64(targetOpens) { 406 t.Errorf("testmetric_fs_opens for first container did not increase by at least %d after metric server restart: went from %d to %d (diff: %d)", targetOpens, preRestartOpens, postRestartOpens, diff) 407 } 408 postRestartMetadata, err := postRestartData.GetSandboxMetadataMetric(metricclient.WantMetric{ 409 Metric: "testmetric_meta_sandbox_metadata", 410 Sandbox: args.ID, 411 }) 412 if err != nil { 413 t.Fatalf("Cannot get post-restart sandbox metadata: %v", err) 414 } 415 if diff := cmp.Diff(preRestartMetadata, postRestartMetadata); diff != "" { 416 t.Errorf("Sandbox metadata changed after restart:\nBefore: %v\nAfter: %v\nDiff: %v", preRestartMetadata, postRestartMetadata, diff) 417 } 418 _, _, err = postRestartData.GetPrometheusContainerInteger(metricclient.WantMetric{ 419 Metric: "testmetric_fs_opens", 420 Sandbox: args2.ID, 421 }) 422 if err != nil { 423 t.Fatalf("Cannot get testmetric_fs_opens for second container (%s) from following data (err: %v):\n\n%s\n\n", args2.ID, err, postRestartData) 424 } 425 _, _, err = postRestartData.GetPrometheusContainerInteger(metricclient.WantMetric{ 426 Metric: "testmetric_fs_opens", 427 Sandbox: args3.ID, 428 }) 429 if err != nil { 430 t.Fatalf("Cannot get testmetric_fs_opens for third container (%s) from following data (err: %v):\n\n%s\n\n", args3.ID, err, postRestartData) 431 } 432 } 433 434 // TestContainerMetricsMultiple verifies that the metric server 
spawned for one container 435 // serves metrics for all containers, and survives past its initial container's lifetime. 436 func TestContainerMetricsMultiple(t *testing.T) { 437 numConcurrentContainers := 5 438 439 te, cleanup := setupMetrics(t /* forceTempUDS= */, false) 440 defer cleanup() 441 var containers []*Container 442 needCleanup := map[*Container]struct{}{} 443 toDestroy := map[*Container]struct{}{} 444 defer func() { 445 for container := range needCleanup { 446 container.Destroy() 447 } 448 }() 449 450 // Start a bunch of containers with metrics. 451 for i := 0; i < numConcurrentContainers; i++ { 452 cont, err := New(te.sleepConf, Args{ 453 ID: testutil.RandomContainerID(), 454 Spec: te.sleepSpec, 455 BundleDir: te.bundleDir, 456 }) 457 if err != nil { 458 t.Fatalf("error creating container: %v", err) 459 } 460 containers = append(containers, cont) 461 needCleanup[cont] = struct{}{} 462 // Note that this includes the first container, which will be the one that 463 // starts the metrics server. 464 if i%2 == 0 { 465 toDestroy[cont] = struct{}{} 466 } 467 if err := cont.Start(te.sleepConf); err != nil { 468 t.Fatalf("Cannot start container: %v", err) 469 } 470 } 471 472 // Start one container with metrics turned off. 473 sleepConfNoMetrics := *te.sleepConf 474 sleepConfNoMetrics.MetricServer = "" 475 noMetricsCont, err := New(&sleepConfNoMetrics, Args{ 476 ID: testutil.RandomContainerID(), 477 Spec: te.sleepSpec, 478 BundleDir: te.bundleDir, 479 }) 480 if err != nil { 481 t.Fatalf("error creating no-metrics container: %v", err) 482 } 483 defer noMetricsCont.Destroy() 484 485 // Verify that the metrics server says what we expect. 
486 gotData, err := te.client.GetMetrics(te.testCtx, nil) 487 if err != nil { 488 t.Fatalf("Cannot get metrics after starting containers: %v", err) 489 } 490 t.Logf("Metrics after starting all containers:\n\n%s\n\n", gotData) 491 for _, container := range containers { 492 if _, _, err := gotData.GetPrometheusContainerInteger(metricclient.WantMetric{ 493 Metric: "testmetric_fs_opens", 494 Sandbox: container.ID, 495 }); err != nil { 496 t.Errorf("Cannot get testmetric_fs_opens for container %s: %v", container.ID, err) 497 } 498 } 499 if val, _, err := gotData.GetPrometheusContainerInteger(metricclient.WantMetric{ 500 Metric: "testmetric_fs_opens", 501 Sandbox: noMetricsCont.ID, 502 }); err == nil { 503 t.Errorf("Unexpectedly found testmetric_fs_opens metric data for no-metrics container %s: %v", noMetricsCont.ID, val) 504 } 505 506 // Stop every other container. 507 for container := range toDestroy { 508 if err := container.Destroy(); err != nil { 509 t.Logf("Warning: cannot destroy container %s: %v", container.ID, err) 510 continue 511 } 512 delete(needCleanup, container) 513 } 514 515 // Verify that now we only have half the containers. 
516 gotData, err = te.client.GetMetrics(te.testCtx, nil) 517 if err != nil { 518 t.Fatalf("Cannot get metrics after stopping half the containers: %v", err) 519 } 520 t.Logf("Metrics after stopping half the containers:\n\n%s\n\n", gotData) 521 for _, container := range containers { 522 val, _, err := gotData.GetPrometheusContainerInteger(metricclient.WantMetric{ 523 Metric: "testmetric_fs_opens", 524 Sandbox: container.ID, 525 }) 526 _, wantErr := toDestroy[container] 527 if gotErr := err != nil; gotErr && !wantErr { 528 t.Errorf("Wanted to find data for container %s but didn't: %v", container.ID, err) 529 } else if !gotErr && wantErr { 530 t.Errorf("Wanted to find no data for container %s but found this value instead: %v", container.ID, val) 531 } 532 } 533 if val, _, err := gotData.GetPrometheusContainerInteger(metricclient.WantMetric{ 534 Metric: "testmetric_fs_opens", 535 Sandbox: noMetricsCont.ID, 536 }); err == nil { 537 t.Errorf("Unexpectedly found testmetric_fs_opens metric data for no-metrics container %s: %v", noMetricsCont.ID, val) 538 } 539 } 540 541 // TestContainerMetricsFilter verifies the ability to filter metrics in /metrics requests. 542 func TestContainerMetricsFilter(t *testing.T) { 543 te, cleanup := setupMetrics(t, false /* forceTempUDS */) 544 defer cleanup() 545 546 args := Args{ 547 ID: testutil.RandomContainerID(), 548 Spec: te.sleepSpec, 549 BundleDir: te.bundleDir, 550 } 551 cont, err := New(te.sleepConf, args) 552 if err != nil { 553 t.Fatalf("error creating container: %v", err) 554 } 555 defer cont.Destroy() 556 if err := cont.Start(te.sleepConf); err != nil { 557 t.Fatalf("Cannot start container: %v", err) 558 } 559 560 // First pass: Unfiltered data. 
561 unfilteredData, err := te.client.GetMetrics(te.testCtx, nil) 562 if err != nil { 563 t.Fatalf("Cannot get metrics: %v", err) 564 } 565 _, _, err = unfilteredData.GetPrometheusContainerInteger(metricclient.WantMetric{ 566 Metric: "testmetric_fs_opens", 567 Sandbox: args.ID, 568 }) 569 if err != nil { 570 t.Errorf("Cannot get testmetric_fs_opens: %v", err) 571 } 572 _, err = unfilteredData.GetSandboxMetadataMetric(metricclient.WantMetric{ 573 Metric: "testmetric_meta_sandbox_metadata", 574 Sandbox: args.ID, 575 }) 576 if err != nil { 577 t.Errorf("Cannot get sandbox metadata: %v", err) 578 } 579 580 // Second pass: Filter such that fs_opens does not match. 581 filteredData, err := te.client.GetMetrics(te.testCtx, map[string]string{ 582 "runsc-sandbox-metrics-filter": "^$", // Matches nothing. 583 }) 584 if err != nil { 585 t.Fatalf("Cannot get metrics: %v", err) 586 } 587 _, _, err = filteredData.GetPrometheusContainerInteger(metricclient.WantMetric{ 588 Metric: "testmetric_fs_opens", 589 Sandbox: args.ID, 590 }) 591 if err == nil { 592 t.Errorf("Was unexpectedly able to get fs_opens data from filtered data:\n\n%v\n\n", filteredData) 593 } 594 _, err = filteredData.GetSandboxMetadataMetric(metricclient.WantMetric{ 595 Metric: "testmetric_meta_sandbox_metadata", 596 Sandbox: args.ID, 597 }) 598 if err != nil { 599 t.Errorf("Cannot get sandbox metadata from filtered data: %v", err) 600 } 601 602 // Third pass: Filter such that fs_opens does match. 
603 filteredData2, err := te.client.GetMetrics(te.testCtx, map[string]string{ 604 "runsc-sandbox-metrics-filter": "^fs_.*$", 605 }) 606 if err != nil { 607 t.Fatalf("Cannot get metrics: %v", err) 608 } 609 _, _, err = filteredData2.GetPrometheusContainerInteger(metricclient.WantMetric{ 610 Metric: "testmetric_fs_opens", 611 Sandbox: args.ID, 612 }) 613 if err != nil { 614 t.Errorf("Cannot get testmetric_fs_opens from filtered data: %v", err) 615 } 616 _, err = filteredData2.GetSandboxMetadataMetric(metricclient.WantMetric{ 617 Metric: "testmetric_meta_sandbox_metadata", 618 Sandbox: args.ID, 619 }) 620 if err != nil { 621 t.Errorf("Cannot get sandbox metadata from filtered data: %v", err) 622 } 623 624 // Fourth pass: Filter such that fs_opens does not match, then request with no filtering, 625 // to ensure that the filter regex caching is correctly applied. 626 _, err = te.client.GetMetrics(te.testCtx, map[string]string{ 627 "runsc-sandbox-metrics-filter": "^$", 628 }) 629 if err != nil { 630 t.Fatalf("Cannot get metrics: %v", err) 631 } 632 unfilteredData2, err := te.client.GetMetrics(te.testCtx, nil) 633 if err != nil { 634 t.Fatalf("Cannot get metrics: %v", err) 635 } 636 _, _, err = unfilteredData2.GetPrometheusContainerInteger(metricclient.WantMetric{ 637 Metric: "testmetric_fs_opens", 638 Sandbox: args.ID, 639 }) 640 if err != nil { 641 t.Errorf("Cannot get testmetric_fs_opens from unfiltered data: %v", err) 642 } 643 _, err = unfilteredData2.GetSandboxMetadataMetric(metricclient.WantMetric{ 644 Metric: "testmetric_meta_sandbox_metadata", 645 Sandbox: args.ID, 646 }) 647 if err != nil { 648 t.Errorf("Cannot get sandbox metadata from unfiltered data: %v", err) 649 } 650 651 // Fifth pass: Use alternate URL encoding to mimic Prometheus's URL-encoding 652 // behavior. 653 alternatePathData, err := te.client.GetMetrics(te.testCtx, map[string]string{ 654 // Encoded version of "/metrics?runsc-sandbox-metrics-filter=^$", this should match nothing. 
655 "": "/metrics%3Frunsc-sandbox-metrics-filter=%5E%24", 656 }) 657 if err != nil { 658 t.Fatalf("Cannot get metrics: %v", err) 659 } 660 _, err = alternatePathData.GetSandboxMetadataMetric(metricclient.WantMetric{ 661 Metric: "testmetric_meta_sandbox_metadata", 662 Sandbox: args.ID, 663 }) 664 if err != nil { 665 t.Errorf("Cannot get sandbox metadata from data obtained from alternate path: %v\n\nData:\n\n%v\n\n", err, alternatePathData) 666 } 667 _, _, err = alternatePathData.GetPrometheusContainerInteger(metricclient.WantMetric{ 668 Metric: "testmetric_fs_opens", 669 Sandbox: args.ID, 670 }) 671 if err == nil { 672 t.Errorf("Was unexpectedly able to get testmetric_fs_opens from data obtained from alternate path which was supposed to filter it out:\n\n%v\n\n", alternatePathData) 673 } 674 } 675 676 // TestContainerCapabilityFilter verifies the ability to filter capabilities in /metrics requests. 677 func TestContainerCapabilityFilter(t *testing.T) { 678 te, cleanup := setupMetrics(t, false /* forceTempUDS */) 679 defer cleanup() 680 te.sleepSpec.Process.Capabilities.Bounding = append( 681 te.sleepSpec.Process.Capabilities.Bounding, 682 linux.CAP_SYS_NICE.String(), 683 linux.CAP_NET_RAW.String()) 684 685 args := Args{ 686 ID: testutil.RandomContainerID(), 687 Spec: te.sleepSpec, 688 BundleDir: te.bundleDir, 689 } 690 cont, err := New(te.sleepConf, args) 691 if err != nil { 692 t.Fatalf("error creating container: %v", err) 693 } 694 defer cont.Destroy() 695 if err := cont.Start(te.sleepConf); err != nil { 696 t.Fatalf("Cannot start container: %v", err) 697 } 698 699 for _, test := range []struct { 700 name string 701 filter string 702 want map[linux.Capability]bool 703 }{ 704 { 705 name: "unfiltered", 706 filter: "", 707 want: map[linux.Capability]bool{linux.CAP_SYS_NICE: true, linux.CAP_NET_RAW: true}, 708 }, 709 { 710 name: "all filtered out", 711 filter: "^$", 712 want: map[linux.Capability]bool{linux.CAP_SYS_NICE: false, linux.CAP_NET_RAW: false}, 713 }, 714 { 
715 name: "simple filter with prefix", 716 filter: fmt.Sprintf("^%s$", linux.CAP_SYS_NICE.String()), 717 want: map[linux.Capability]bool{linux.CAP_SYS_NICE: true, linux.CAP_NET_RAW: false}, 718 }, 719 { 720 name: "simple filter without prefix", 721 filter: fmt.Sprintf("^%s$", linux.CAP_SYS_NICE.TrimmedString()), 722 want: map[linux.Capability]bool{linux.CAP_SYS_NICE: true, linux.CAP_NET_RAW: false}, 723 }, 724 { 725 name: "unfiltered again to test regexp caching", 726 filter: "", 727 want: map[linux.Capability]bool{linux.CAP_SYS_NICE: true, linux.CAP_NET_RAW: true}, 728 }, 729 } { 730 t.Run(test.name, func(t *testing.T) { 731 var params map[string]string 732 if test.filter != "" { 733 params = map[string]string{ 734 "runsc-capability-filter": test.filter, 735 } 736 } 737 data, err := te.client.GetMetrics(te.testCtx, params) 738 if err != nil { 739 t.Fatalf("Cannot get metrics: %v", err) 740 } 741 for cap, want := range test.want { 742 got, _, err := data.GetPrometheusContainerInteger(metricclient.WantMetric{ 743 Metric: "testmetric_meta_sandbox_capabilities", 744 Sandbox: args.ID, 745 ExtraLabels: map[string]string{"capability": cap.TrimmedString()}, 746 }) 747 if err != nil && want { 748 t.Errorf("Cannot get testmetric_meta_sandbox_capabilities[capability=%q]: %v", cap.TrimmedString(), err) 749 } else if err == nil && !want { 750 t.Errorf("Unexpectedly able to get testmetric_meta_sandbox_capabilities[capability=%q]: %v", cap.TrimmedString(), got) 751 } 752 } 753 if t.Failed() { 754 t.Logf("Metric data:\n\n%s\n\n", data) 755 } 756 }) 757 } 758 } 759 760 func TestMetricServerChecksRootDirectoryAccess(t *testing.T) { 761 te, cleanup := setupMetrics(t /* forceTempUDS= */, false) 762 defer cleanup() 763 if err := te.client.ShutdownServer(te.testCtx); err != nil { 764 t.Fatalf("Cannot stop metric server: %v", err) 765 } 766 prevStat, err := os.Lstat(te.sleepConf.RootDir) 767 if err != nil { 768 t.Fatalf("cannot stat %q: %v", te.sleepConf.RootDir, err) 769 } 770 if err 
:= os.Chmod(te.sleepConf.RootDir, 0); err != nil { 771 t.Fatalf("cannot chmod %q as 000: %v", te.sleepConf.RootDir, err) 772 } 773 defer os.Chmod(te.sleepConf.RootDir, prevStat.Mode()) 774 if _, err := ioutil.ReadDir(te.sleepConf.RootDir); err == nil { 775 t.Logf("Can still read directory %v despite chmodding it to 0. Maybe we are running as root? Skipping test.", te.sleepConf.RootDir) 776 return 777 } 778 shorterCtx, shorterCtxCancel := context.WithTimeout(te.testCtx, time.Second) 779 defer shorterCtxCancel() 780 if err := te.client.SpawnServer(shorterCtx, te.sleepConf, te.serverExtraArgs...); err == nil { 781 t.Error("Metric server was successfully able to be spawned despite not having access to the root directory") 782 } 783 } 784 785 func TestMetricServerToleratesNoRootDirectory(t *testing.T) { 786 te, cleanup := setupMetrics(t /* forceTempUDS= */, true) 787 defer cleanup() 788 if err := te.client.ShutdownServer(te.testCtx); err != nil { 789 t.Fatalf("Cannot stop metric server: %v", err) 790 } 791 if err := os.RemoveAll(te.sleepConf.RootDir); err != nil { 792 t.Fatalf("cannot remove root directory %q: %v", te.sleepConf.RootDir, err) 793 } 794 shortCtx, shortCtxCancel := context.WithTimeout(te.testCtx, time.Second) 795 defer shortCtxCancel() 796 if err := te.client.SpawnServer(shortCtx, te.sleepConf, append([]string{"--allow-unknown-root=false"}, te.serverExtraArgs...)...); err == nil { 797 t.Fatalf("Metric server was successfully able to be spawned despite a non-existent root directory") 798 } 799 if err := te.client.SpawnServer(te.testCtx, te.sleepConf, append([]string{"--allow-unknown-root=true"}, te.serverExtraArgs...)...); err != nil { 800 t.Errorf("Metric server was not able to be spawned despite being configured to tolerate a non-existent root directory: %v", err) 801 } 802 } 803 804 func TestMetricServerDoesNotExportZeroValueCounters(t *testing.T) { 805 te, cleanup := setupMetrics(t, false /* forceTempUDS */) 806 defer cleanup() 807 app, err := 
testutil.FindFile("test/cmd/test_app/test_app") 808 if err != nil { 809 t.Fatalf("error finding test_app: %v", err) 810 } 811 unimpl1Spec := testutil.NewSpecWithArgs("sh", "-c", fmt.Sprintf("%s syscall --syscall=1337; sleep 1h", app)) 812 unimpl1Conf := te.applyConf(testutil.TestConfig(t)) 813 unimpl1Bundle, cleanup, err := testutil.SetupBundleDir(unimpl1Spec) 814 if err != nil { 815 t.Fatalf("error setting up container: %v", err) 816 } 817 defer cleanup() 818 unimpl2Spec := testutil.NewSpecWithArgs("sh", "-c", fmt.Sprintf("%s syscall --syscall=1338; sleep 1h", app)) 819 unimpl2Conf := te.applyConf(testutil.TestConfig(t)) 820 unimpl2Bundle, cleanup, err := testutil.SetupBundleDir(unimpl2Spec) 821 if err != nil { 822 t.Fatalf("error setting up container: %v", err) 823 } 824 defer cleanup() 825 unimpl1, err := New(unimpl1Conf, Args{ 826 ID: testutil.RandomContainerID(), 827 Spec: unimpl1Spec, 828 BundleDir: unimpl1Bundle, 829 }) 830 if err != nil { 831 t.Fatalf("error creating first container: %v", err) 832 } 833 defer unimpl1.Destroy() 834 if err := unimpl1.Start(unimpl1Conf); err != nil { 835 t.Fatalf("Cannot start first container: %v", err) 836 } 837 unimpl2, err := New(unimpl2Conf, Args{ 838 ID: testutil.RandomContainerID(), 839 Spec: unimpl2Spec, 840 BundleDir: unimpl2Bundle, 841 }) 842 if err != nil { 843 t.Fatalf("error creating second container: %v", err) 844 } 845 defer unimpl2.Destroy() 846 if err := unimpl2.Start(unimpl2Conf); err != nil { 847 t.Fatalf("Cannot start second container: %v", err) 848 } 849 metricData, err := te.client.GetMetrics(te.testCtx, nil) 850 if err != nil { 851 t.Fatalf("Cannot get metrics: %v", err) 852 } 853 metricDataPtr := &metricData 854 855 // For this test to work, it must wait for long enough such that the containers have 856 // actually tried to call the unimplemented syscall so that it shows up in metrics. 
857 waitCtx, waitCtxCancel := context.WithTimeout(te.testCtx, 50*time.Second) 858 defer waitCtxCancel() 859 860 for _, test := range []struct { 861 cont *Container 862 sysno uintptr 863 wantExistence bool 864 }{ 865 {unimpl1, 1337, true}, 866 {unimpl1, 1338, false}, 867 {unimpl2, 1337, false}, 868 {unimpl2, 1338, true}, 869 } { 870 t.Run(fmt.Sprintf("container %s syscall %d", test.cont.ID, test.sysno), func(t *testing.T) { 871 check := func() error { 872 got, _, err := metricDataPtr.GetPrometheusContainerInteger(metricclient.WantMetric{ 873 Metric: "testmetric_unimplemented_syscalls", 874 Sandbox: test.cont.sandboxID(), 875 ExtraLabels: map[string]string{"sysno": strconv.Itoa(int(test.sysno))}, 876 }) 877 if test.wantExistence { 878 if err != nil { 879 return fmt.Errorf("cannot get unimplemented syscall metric for sysno=%d even though we expected its presence: %v", test.sysno, err) 880 } 881 if got != 1 { 882 return fmt.Errorf("expected counter value for unimplemented syscall %d be exactly 1, got %d", test.sysno, got) 883 } 884 } else /* !test.wantExistence */ { 885 if err == nil { 886 return fmt.Errorf("unimplemented syscall metric for sysno=%d was unexpectedly present (value: %d)", test.sysno, got) 887 } 888 } 889 return nil 890 } 891 for waitCtx.Err() == nil { 892 if check() == nil { 893 break 894 } 895 select { 896 case <-time.After(20 * time.Millisecond): 897 newMetricData, err := te.client.GetMetrics(te.testCtx, nil) 898 if err != nil { 899 t.Fatalf("Cannot get metrics: %v", err) 900 } 901 *metricDataPtr = newMetricData 902 case <-waitCtx.Done(): 903 } 904 } 905 if err := check(); err != nil { 906 t.Error(err.Error()) 907 } 908 }) 909 } 910 if t.Failed() { 911 t.Logf("Last metric data:\n\n%s\n\n", metricData) 912 } 913 }