gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/runsc/container/metric_server_test.go (about)

     1  // Copyright 2023 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package container
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"io/ioutil"
    21  	"os"
    22  	"path/filepath"
    23  	"strconv"
    24  	"strings"
    25  	"testing"
    26  	"time"
    27  
    28  	"github.com/google/go-cmp/cmp"
    29  	specs "github.com/opencontainers/runtime-spec/specs-go"
    30  	"gvisor.dev/gvisor/pkg/abi/linux"
    31  	"gvisor.dev/gvisor/pkg/cleanup"
    32  	"gvisor.dev/gvisor/pkg/test/testutil"
    33  	"gvisor.dev/gvisor/runsc/config"
    34  	"gvisor.dev/gvisor/test/metricclient"
    35  )
    36  
const (
	// podAnnotation contains the name of the pod that a sandbox represents when running in
	// Kubernetes. The key matches the CRI-style annotation set by container runtimes such
	// as containerd.
	podAnnotation = "io.kubernetes.cri.sandbox-name"
	// namespaceAnnotation contains the name of the namespace that a sandbox is in when running in
	// Kubernetes. The key matches the CRI-style annotation set by container runtimes such
	// as containerd.
	namespaceAnnotation = "io.kubernetes.cri.sandbox-namespace"
)
    45  
// metricsTest is returned by setupMetrics.
//
// Fields:
//   - testCtx: context under which test operations (metric requests, server
//     spawning) run; it expires before the cleanup context used by deferred
//     shutdown does, so cleanup still has time to run after a test timeout.
//   - rootDir: runsc root directory shared by this test's containers.
//   - bundleDir: OCI bundle directory for the sleep container spec.
//   - sleepSpec: OCI spec for a long-running ("sleep") test container.
//   - sleepConf: runsc configuration matching sleepSpec, with MetricServer set
//     to udsPath.
//   - udsPath: path of the Unix domain socket the metric server listens on.
//   - client: client used to talk to the metric server over udsPath.
//   - serverExtraArgs: extra command-line flags to pass when (re-)spawning the
//     metric server (e.g. the exporter prefix).
type metricsTest struct {
	testCtx         context.Context
	rootDir         string
	bundleDir       string
	sleepSpec       *specs.Spec
	sleepConf       *config.Config
	udsPath         string
	client          *metricclient.MetricClient
	serverExtraArgs []string
}
    57  
    58  // applyConf applies metric-server-related configuration options to the given config.
    59  // Returns the passed-in config itself.
    60  func (mt *metricsTest) applyConf(conf *config.Config) *config.Config {
    61  	conf.MetricServer = mt.sleepConf.MetricServer
    62  	conf.RootDir = mt.rootDir
    63  	return conf
    64  }
    65  
// setupMetrics sets up a container configuration with metrics enabled, and returns it all.
// Also returns a cleanup function.
// If forceTempUDS is true, the metric server's socket is placed in a temporary
// directory under /tmp instead of the runsc root directory.
func setupMetrics(t *testing.T, forceTempUDS bool) (*metricsTest, func()) {
	// Start the child reaper.
	childReaper := &testutil.Reaper{}
	childReaper.Start()
	cu := cleanup.Make(childReaper.Stop)

	// The cleanup context deliberately outlives the test context by 30 seconds,
	// so that deferred shutdown work can still complete after a test timeout.
	cleanupCtx, cleanupCancel := context.WithTimeout(context.Background(), 3*time.Minute+30*time.Second)
	cu.Add(cleanupCancel)
	testCtx, testCancel := context.WithTimeout(cleanupCtx, 3*time.Minute)
	cu.Add(testCancel)

	spec, conf := sleepSpecConf(t)
	// Placeholder value; replaced with the concrete UDS path below once the
	// root directory is known.
	conf.MetricServer = "%RUNTIME_ROOT%/metrics.sock"
	serverExtraArgs := []string{"--exporter-prefix=testmetric_"}
	rootDir, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
	if err != nil {
		t.Fatalf("error setting up container: %v", err)
	}
	cu.Add(cleanup)
	tmpDir, err := os.MkdirTemp("/tmp", "metrics-")
	if err != nil {
		t.Fatalf("Cannot create temporary directory in /tmp: %v", err)
	}
	cu.Add(func() { os.RemoveAll(tmpDir) })
	udsPath := filepath.Join(rootDir, "metrics.sock")
	// Linux caps the length of a UDS path (sun_path is ~108 bytes); 100 leaves
	// some margin. Fall back to the short /tmp directory when the rootDir-based
	// path would be too long.
	if forceTempUDS || len(udsPath) >= 100 {
		udsPath = filepath.Join(tmpDir, "metrics.sock")
	}
	if len(udsPath) >= 100 {
		t.Fatalf("Cannot come up with a UDS path shorter than the maximum length allowed by Linux (tried to use %q)", udsPath)
	}
	conf.MetricServer = udsPath
	// The UDS should be deleted by the metrics server itself, but we clean it up here anyway just in case:
	cu.Add(func() { os.Remove(udsPath) })

	metricClient := metricclient.NewMetricClient(udsPath, rootDir)
	if err := metricClient.SpawnServer(testCtx, conf, serverExtraArgs...); err != nil {
		t.Fatalf("Cannot start metric server: %v", err)
	}
	// Use the longer-lived cleanup context for shutdown so it can proceed even
	// if testCtx has already expired.
	cu.Add(func() { metricClient.ShutdownServer(cleanupCtx) })

	return &metricsTest{
		testCtx:         testCtx,
		rootDir:         rootDir,
		bundleDir:       bundleDir,
		sleepSpec:       spec,
		sleepConf:       conf,
		udsPath:         udsPath,
		client:          metricClient,
		serverExtraArgs: serverExtraArgs,
	}, cu.Clean
}
   120  
   121  // TestContainerMetrics verifies basic functionality of the metric server works.
   122  func TestContainerMetrics(t *testing.T) {
   123  	targetOpens := 200
   124  
   125  	te, cleanup := setupMetrics(t /* forceTempUDS= */, false)
   126  	defer cleanup()
   127  
   128  	if _, err := te.client.GetMetrics(te.testCtx, nil); err != nil {
   129  		t.Fatal("GetMetrics failed prior to container start")
   130  	}
   131  	if te.sleepSpec.Annotations == nil {
   132  		te.sleepSpec.Annotations = make(map[string]string)
   133  	}
   134  	te.sleepSpec.Annotations[podAnnotation] = "foopod"
   135  	te.sleepSpec.Annotations[namespaceAnnotation] = "foons"
   136  	args := Args{
   137  		ID:        testutil.RandomContainerID(),
   138  		Spec:      te.sleepSpec,
   139  		BundleDir: te.bundleDir,
   140  	}
   141  	cont, err := New(te.sleepConf, args)
   142  	if err != nil {
   143  		t.Fatalf("error creating container: %v", err)
   144  	}
   145  	defer cont.Destroy()
   146  	udsStat, udsStatErr := os.Stat(te.udsPath)
   147  	if udsStatErr != nil {
   148  		t.Fatalf("Stat(%s) failed after creating container: %v", te.udsPath, udsStatErr)
   149  	}
   150  	if udsStat.Mode()&os.ModeSocket == 0 {
   151  		t.Errorf("Stat(%s): Got mode %x, expected socket (mode %x)", te.udsPath, udsStat.Mode(), os.ModeSocket)
   152  	}
   153  	initialData, err := te.client.GetMetrics(te.testCtx, nil)
   154  	if err != nil {
   155  		t.Errorf("Cannot get metrics after creating container: %v", err)
   156  	}
   157  	gotSandboxMetadata, err := initialData.GetSandboxMetadataMetric(metricclient.WantMetric{
   158  		Metric:    "testmetric_meta_sandbox_metadata",
   159  		Sandbox:   args.ID,
   160  		Pod:       "foopod",
   161  		Namespace: "foons",
   162  	})
   163  	if err != nil {
   164  		t.Errorf("Cannot get sandbox metadata: %v", err)
   165  	}
   166  	if gotSandboxMetadata["platform"] == "" || gotSandboxMetadata["platform"] != te.sleepConf.Platform {
   167  		t.Errorf("Invalid platform: Metric metadata says %v, config says %v", gotSandboxMetadata["platform"], te.sleepConf.Platform)
   168  	}
   169  	gotSpecMetadata, err := initialData.GetSandboxMetadataMetric(metricclient.WantMetric{
   170  		Metric:    "testmetric_meta_spec_metadata",
   171  		Sandbox:   args.ID,
   172  		Pod:       "foopod",
   173  		Namespace: "foons",
   174  	})
   175  	if err != nil {
   176  		t.Errorf("Cannot get spec metadata: %v", err)
   177  	}
   178  	if gotSpecMetadata["hasuid0"] == "" || (gotSpecMetadata["hasuid0"] != "true" && gotSpecMetadata["hasuid0"] != "false") {
   179  		t.Errorf("Invalid or absent hasuid0 key from spec metadata: %v", gotSpecMetadata["hasuid0"])
   180  	}
   181  	t.Logf("Metrics prior to container start:\n\n%s\n\n", initialData)
   182  	if err := cont.Start(te.sleepConf); err != nil {
   183  		t.Fatalf("Cannot start container: %v", err)
   184  	}
   185  	postStartData, err := te.client.GetMetrics(te.testCtx, nil)
   186  	if err != nil {
   187  		t.Fatalf("Cannot get metrics after starting container: %v", err)
   188  	}
   189  	postStartOpens, postStartTimestamp, err := postStartData.GetPrometheusContainerInteger(metricclient.WantMetric{
   190  		Metric:    "testmetric_fs_opens",
   191  		Sandbox:   args.ID,
   192  		Pod:       "foopod",
   193  		Namespace: "foons",
   194  	})
   195  	if err != nil {
   196  		t.Errorf("Cannot get testmetric_fs_opens from following data (err: %v):\n\n%s\n\n", err, postStartData)
   197  	}
   198  	t.Logf("After container start, fs_opens=%d (snapshotted at %v)", postStartOpens, postStartTimestamp)
   199  	// The touch operation may fail from permission errors, but the metric should still be incremented.
   200  	shOutput, err := executeCombinedOutput(te.sleepConf, cont, nil, "/bin/bash", "-c", fmt.Sprintf("for i in $(seq 1 %d); do touch /tmp/$i || true; done", targetOpens))
   201  	if err != nil {
   202  		t.Fatalf("Exec failed: %v; output: %v", err, shOutput)
   203  	}
   204  	postExecData, err := te.client.GetMetrics(te.testCtx, nil)
   205  	if err != nil {
   206  		t.Fatalf("Cannot get metrics after a bunch of open() calls: %v", err)
   207  	}
   208  	postExecOpens, postExecTimestamp, err := postExecData.GetPrometheusContainerInteger(metricclient.WantMetric{
   209  		Metric:    "testmetric_fs_opens",
   210  		Sandbox:   args.ID,
   211  		Pod:       "foopod",
   212  		Namespace: "foons",
   213  	})
   214  	if err != nil {
   215  		t.Errorf("Cannot get testmetric_fs_opens from following data (err: %v):\n\n%s\n\n", err, postExecData)
   216  	}
   217  	t.Logf("After exec'ing %d open()s, fs_opens=%d (snapshotted at %v)", targetOpens, postExecOpens, postExecTimestamp)
   218  	diffOpens := postExecOpens - postStartOpens
   219  	if diffOpens < int64(targetOpens) {
   220  		t.Errorf("testmetric_fs_opens went from %d to %d (diff: %d), expected the difference to be at least %d", postStartOpens, postExecOpens, diffOpens, targetOpens)
   221  	}
   222  }
   223  
   224  // TestContainerMetricsIterationID verifies that two successive containers with the same ID
   225  // do not have the same iteration ID.
   226  func TestContainerMetricsIterationID(t *testing.T) {
   227  	te, cleanup := setupMetrics(t /* forceTempUDS= */, false)
   228  	defer cleanup()
   229  
   230  	args := Args{
   231  		ID:        testutil.RandomContainerID(),
   232  		Spec:      te.sleepSpec,
   233  		BundleDir: te.bundleDir,
   234  	}
   235  	cont1, err := New(te.sleepConf, args)
   236  	if err != nil {
   237  		t.Fatalf("error creating container 1: %v", err)
   238  	}
   239  	defer cont1.Destroy()
   240  	data1, err := te.client.GetMetrics(te.testCtx, nil)
   241  	if err != nil {
   242  		t.Errorf("Cannot get metrics after creating container 1: %v", err)
   243  	}
   244  	metadata1, err := data1.GetSandboxMetadataMetric(metricclient.WantMetric{
   245  		Metric:  "testmetric_meta_sandbox_metadata",
   246  		Sandbox: args.ID,
   247  	})
   248  	if err != nil {
   249  		t.Errorf("Cannot get sandbox 1 metadata: %v", err)
   250  	}
   251  	t.Logf("Container 1 metadata: %v", metadata1)
   252  	iterationID1 := metadata1["iteration"]
   253  	if iterationID1 == "" {
   254  		t.Fatalf("Cannot find iteration ID in metadata 1: %v", metadata1)
   255  	}
   256  	if err := cont1.Destroy(); err != nil && !strings.Contains(err.Error(), "no child process") {
   257  		t.Fatalf("Cannot destroy container 1: %v", err)
   258  	}
   259  	cont2, err := New(te.sleepConf, args)
   260  	if err != nil {
   261  		t.Fatalf("error creating container 2: %v", err)
   262  	}
   263  	defer cont2.Destroy()
   264  	data2, err := te.client.GetMetrics(te.testCtx, nil)
   265  	if err != nil {
   266  		t.Errorf("Cannot get metrics after creating container 2: %v", err)
   267  	}
   268  	metadata2, err := data2.GetSandboxMetadataMetric(metricclient.WantMetric{
   269  		Metric:  "testmetric_meta_sandbox_metadata",
   270  		Sandbox: args.ID,
   271  	})
   272  	if err != nil {
   273  		t.Errorf("Cannot get sandbox 2 metadata: %v", err)
   274  	}
   275  	t.Logf("Container 2 metadata: %v", metadata2)
   276  	iterationID2 := metadata2["iteration"]
   277  	if iterationID2 == "" {
   278  		t.Fatalf("Cannot find iteration ID in metadata 2: %v", metadata2)
   279  	}
   280  	if iterationID1 == iterationID2 {
   281  		t.Errorf("Iteration IDs of successive instances with the same ID unexpectedly matched: %v", iterationID1)
   282  	}
   283  }
   284  
   285  // TestContainerMetricsRobustAgainstRestarts that exporting metrics is robust against metric server
   286  // unavailability or restarts.
   287  func TestContainerMetricsRobustAgainstRestarts(t *testing.T) {
   288  	targetOpens := 200
   289  	te, cleanup := setupMetrics(t /* forceTempUDS= */, false)
   290  	defer cleanup()
   291  
   292  	// First, start a container which will kick off the metric server as normal.
   293  	args := Args{
   294  		ID:        testutil.RandomContainerID(),
   295  		Spec:      te.sleepSpec,
   296  		BundleDir: te.bundleDir,
   297  	}
   298  	cont, err := New(te.sleepConf, args)
   299  	if err != nil {
   300  		t.Fatalf("error creating container: %v", err)
   301  	}
   302  	defer cont.Destroy()
   303  	if err := cont.Start(te.sleepConf); err != nil {
   304  		t.Fatalf("Cannot start container: %v", err)
   305  	}
   306  	shOutput, err := executeCombinedOutput(te.sleepConf, cont, nil, "/bin/bash", "-c", fmt.Sprintf("for i in $(seq 1 %d); do touch /tmp/$i || true; done", targetOpens))
   307  	if err != nil {
   308  		t.Fatalf("Exec failed: %v; output: %v", err, shOutput)
   309  	}
   310  	preRestartData, err := te.client.GetMetrics(te.testCtx, nil)
   311  	if err != nil {
   312  		t.Fatalf("Cannot get metrics after a bunch of open() calls: %v", err)
   313  	}
   314  
   315  	// Retain the value of fs_opens for the first container. We'll use it when comparing to the data
   316  	// from the restarted metric server.
   317  	preRestartOpens, postExecTimestamp, err := preRestartData.GetPrometheusContainerInteger(metricclient.WantMetric{
   318  		Metric:  "testmetric_fs_opens",
   319  		Sandbox: args.ID,
   320  	})
   321  	if err != nil {
   322  		t.Errorf("Cannot get testmetric_fs_opens from following data (err: %v):\n\n%s\n\n", err, preRestartData)
   323  	}
   324  	preRestartMetadata, err := preRestartData.GetSandboxMetadataMetric(metricclient.WantMetric{
   325  		Metric:  "testmetric_meta_sandbox_metadata",
   326  		Sandbox: args.ID,
   327  	})
   328  	if err != nil {
   329  		t.Errorf("Cannot get sandbox metadata: %v", err)
   330  	}
   331  	t.Logf("After exec'ing %d open()s, fs_opens=%d (snapshotted at %v)", targetOpens, preRestartOpens, postExecTimestamp)
   332  
   333  	// Now shut down the metric server and verify we can no longer fetch metrics.
   334  	if err := te.client.ShutdownServer(te.testCtx); err != nil {
   335  		t.Fatalf("Cannot shutdown server: %v", err)
   336  	}
   337  	if rawData, err := te.client.GetMetrics(te.testCtx, nil); err == nil {
   338  		t.Fatalf("Unexpectedly was able to get metric data despite shutting down server:\n\n%s\n\n", rawData)
   339  	}
   340  
   341  	// Do a bunch of touches again. The metric server is down during this time.
   342  	// This verifies that metric value modifications does not depend on the metric server being up.
   343  	shOutput, err = executeCombinedOutput(te.sleepConf, cont, nil, "/bin/bash", "-c", fmt.Sprintf("for i in $(seq 1 %d); do touch /tmp/$i || true; done", targetOpens))
   344  	if err != nil {
   345  		t.Fatalf("Exec failed: %v; output: %v", err, shOutput)
   346  	}
   347  
   348  	// Start a second container.
   349  	// This container should be picked up by a metric server we will start afterwards.
   350  	// This verifies that a metric server being down does not cause sandbox creation to fail.
   351  	args2 := Args{
   352  		ID:        testutil.RandomContainerID(),
   353  		Spec:      te.sleepSpec,
   354  		BundleDir: te.bundleDir,
   355  	}
   356  	cont2, err := New(te.sleepConf, args2)
   357  	if err != nil {
   358  		t.Fatalf("error creating second container: %v", err)
   359  	}
   360  	defer cont2.Destroy()
   361  	if rawData, err := te.client.GetMetrics(te.testCtx, nil); err == nil {
   362  		t.Fatalf("Unexpectedly was able to get metric data after creating second container:\n\n%s\n\n", rawData)
   363  	}
   364  	if err := cont2.Start(te.sleepConf); err != nil {
   365  		t.Fatalf("Cannot start second container: %v", err)
   366  	}
   367  	if rawData, err := te.client.GetMetrics(te.testCtx, nil); err == nil {
   368  		t.Fatalf("Unexpectedly was able to get metric data after starting second container:\n\n%s\n\n", rawData)
   369  	}
   370  
   371  	// Start the metric server.
   372  	if err := te.client.SpawnServer(te.testCtx, te.sleepConf, te.serverExtraArgs...); err != nil {
   373  		t.Fatalf("Cannot re-spawn server: %v", err)
   374  	}
   375  
   376  	// Now start a third container.
   377  	// This should be picked up by the server we just started.
   378  	args3 := Args{
   379  		ID:        testutil.RandomContainerID(),
   380  		Spec:      te.sleepSpec,
   381  		BundleDir: te.bundleDir,
   382  	}
   383  	cont3, err := New(te.sleepConf, args3)
   384  	if err != nil {
   385  		t.Fatalf("error creating second container: %v", err)
   386  	}
   387  	defer cont3.Destroy()
   388  	if err := cont3.Start(te.sleepConf); err != nil {
   389  		t.Fatalf("Cannot start third container: %v", err)
   390  	}
   391  
   392  	// Verify that the metric server was restarted and that we can indeed get all the data we expect
   393  	// from all the containers this test has started.
   394  	postRestartData, err := te.client.GetMetrics(te.testCtx, nil)
   395  	if err != nil {
   396  		t.Fatalf("Cannot get metrics after restarting server: %v", err)
   397  	}
   398  	postRestartOpens, _, err := postRestartData.GetPrometheusContainerInteger(metricclient.WantMetric{
   399  		Metric:  "testmetric_fs_opens",
   400  		Sandbox: args.ID,
   401  	})
   402  	if err != nil {
   403  		t.Fatalf("Cannot get testmetric_fs_opens for first container (%s) from following data (err: %v):\n\n%s\n\n", args.ID, err, postRestartData)
   404  	}
   405  	if diff := postRestartOpens - preRestartOpens; diff < int64(targetOpens) {
   406  		t.Errorf("testmetric_fs_opens for first container did not increase by at least %d after metric server restart: went from %d to %d (diff: %d)", targetOpens, preRestartOpens, postRestartOpens, diff)
   407  	}
   408  	postRestartMetadata, err := postRestartData.GetSandboxMetadataMetric(metricclient.WantMetric{
   409  		Metric:  "testmetric_meta_sandbox_metadata",
   410  		Sandbox: args.ID,
   411  	})
   412  	if err != nil {
   413  		t.Fatalf("Cannot get post-restart sandbox metadata: %v", err)
   414  	}
   415  	if diff := cmp.Diff(preRestartMetadata, postRestartMetadata); diff != "" {
   416  		t.Errorf("Sandbox metadata changed after restart:\nBefore: %v\nAfter: %v\nDiff: %v", preRestartMetadata, postRestartMetadata, diff)
   417  	}
   418  	_, _, err = postRestartData.GetPrometheusContainerInteger(metricclient.WantMetric{
   419  		Metric:  "testmetric_fs_opens",
   420  		Sandbox: args2.ID,
   421  	})
   422  	if err != nil {
   423  		t.Fatalf("Cannot get testmetric_fs_opens for second container (%s) from following data (err: %v):\n\n%s\n\n", args2.ID, err, postRestartData)
   424  	}
   425  	_, _, err = postRestartData.GetPrometheusContainerInteger(metricclient.WantMetric{
   426  		Metric:  "testmetric_fs_opens",
   427  		Sandbox: args3.ID,
   428  	})
   429  	if err != nil {
   430  		t.Fatalf("Cannot get testmetric_fs_opens for third container (%s) from following data (err: %v):\n\n%s\n\n", args3.ID, err, postRestartData)
   431  	}
   432  }
   433  
   434  // TestContainerMetricsMultiple verifies that the metric server spawned for one container
   435  // serves metrics for all containers, and survives past its initial container's lifetime.
   436  func TestContainerMetricsMultiple(t *testing.T) {
   437  	numConcurrentContainers := 5
   438  
   439  	te, cleanup := setupMetrics(t /* forceTempUDS= */, false)
   440  	defer cleanup()
   441  	var containers []*Container
   442  	needCleanup := map[*Container]struct{}{}
   443  	toDestroy := map[*Container]struct{}{}
   444  	defer func() {
   445  		for container := range needCleanup {
   446  			container.Destroy()
   447  		}
   448  	}()
   449  
   450  	// Start a bunch of containers with metrics.
   451  	for i := 0; i < numConcurrentContainers; i++ {
   452  		cont, err := New(te.sleepConf, Args{
   453  			ID:        testutil.RandomContainerID(),
   454  			Spec:      te.sleepSpec,
   455  			BundleDir: te.bundleDir,
   456  		})
   457  		if err != nil {
   458  			t.Fatalf("error creating container: %v", err)
   459  		}
   460  		containers = append(containers, cont)
   461  		needCleanup[cont] = struct{}{}
   462  		// Note that this includes the first container, which will be the one that
   463  		// starts the metrics server.
   464  		if i%2 == 0 {
   465  			toDestroy[cont] = struct{}{}
   466  		}
   467  		if err := cont.Start(te.sleepConf); err != nil {
   468  			t.Fatalf("Cannot start container: %v", err)
   469  		}
   470  	}
   471  
   472  	// Start one container with metrics turned off.
   473  	sleepConfNoMetrics := *te.sleepConf
   474  	sleepConfNoMetrics.MetricServer = ""
   475  	noMetricsCont, err := New(&sleepConfNoMetrics, Args{
   476  		ID:        testutil.RandomContainerID(),
   477  		Spec:      te.sleepSpec,
   478  		BundleDir: te.bundleDir,
   479  	})
   480  	if err != nil {
   481  		t.Fatalf("error creating no-metrics container: %v", err)
   482  	}
   483  	defer noMetricsCont.Destroy()
   484  
   485  	// Verify that the metrics server says what we expect.
   486  	gotData, err := te.client.GetMetrics(te.testCtx, nil)
   487  	if err != nil {
   488  		t.Fatalf("Cannot get metrics after starting containers: %v", err)
   489  	}
   490  	t.Logf("Metrics after starting all containers:\n\n%s\n\n", gotData)
   491  	for _, container := range containers {
   492  		if _, _, err := gotData.GetPrometheusContainerInteger(metricclient.WantMetric{
   493  			Metric:  "testmetric_fs_opens",
   494  			Sandbox: container.ID,
   495  		}); err != nil {
   496  			t.Errorf("Cannot get testmetric_fs_opens for container %s: %v", container.ID, err)
   497  		}
   498  	}
   499  	if val, _, err := gotData.GetPrometheusContainerInteger(metricclient.WantMetric{
   500  		Metric:  "testmetric_fs_opens",
   501  		Sandbox: noMetricsCont.ID,
   502  	}); err == nil {
   503  		t.Errorf("Unexpectedly found testmetric_fs_opens metric data for no-metrics container %s: %v", noMetricsCont.ID, val)
   504  	}
   505  
   506  	// Stop every other container.
   507  	for container := range toDestroy {
   508  		if err := container.Destroy(); err != nil {
   509  			t.Logf("Warning: cannot destroy container %s: %v", container.ID, err)
   510  			continue
   511  		}
   512  		delete(needCleanup, container)
   513  	}
   514  
   515  	// Verify that now we only have half the containers.
   516  	gotData, err = te.client.GetMetrics(te.testCtx, nil)
   517  	if err != nil {
   518  		t.Fatalf("Cannot get metrics after stopping half the containers: %v", err)
   519  	}
   520  	t.Logf("Metrics after stopping half the containers:\n\n%s\n\n", gotData)
   521  	for _, container := range containers {
   522  		val, _, err := gotData.GetPrometheusContainerInteger(metricclient.WantMetric{
   523  			Metric:  "testmetric_fs_opens",
   524  			Sandbox: container.ID,
   525  		})
   526  		_, wantErr := toDestroy[container]
   527  		if gotErr := err != nil; gotErr && !wantErr {
   528  			t.Errorf("Wanted to find data for container %s but didn't: %v", container.ID, err)
   529  		} else if !gotErr && wantErr {
   530  			t.Errorf("Wanted to find no data for container %s but found this value instead: %v", container.ID, val)
   531  		}
   532  	}
   533  	if val, _, err := gotData.GetPrometheusContainerInteger(metricclient.WantMetric{
   534  		Metric:  "testmetric_fs_opens",
   535  		Sandbox: noMetricsCont.ID,
   536  	}); err == nil {
   537  		t.Errorf("Unexpectedly found testmetric_fs_opens metric data for no-metrics container %s: %v", noMetricsCont.ID, val)
   538  	}
   539  }
   540  
// TestContainerMetricsFilter verifies the ability to filter metrics in /metrics requests.
// The passes below run in a deliberate order: the fourth pass checks that a
// previously-requested filter is not erroneously cached across requests.
func TestContainerMetricsFilter(t *testing.T) {
	te, cleanup := setupMetrics(t, false /* forceTempUDS */)
	defer cleanup()

	args := Args{
		ID:        testutil.RandomContainerID(),
		Spec:      te.sleepSpec,
		BundleDir: te.bundleDir,
	}
	cont, err := New(te.sleepConf, args)
	if err != nil {
		t.Fatalf("error creating container: %v", err)
	}
	defer cont.Destroy()
	if err := cont.Start(te.sleepConf); err != nil {
		t.Fatalf("Cannot start container: %v", err)
	}

	// First pass: Unfiltered data.
	// Both the fs_opens counter and the sandbox metadata should be present.
	unfilteredData, err := te.client.GetMetrics(te.testCtx, nil)
	if err != nil {
		t.Fatalf("Cannot get metrics: %v", err)
	}
	_, _, err = unfilteredData.GetPrometheusContainerInteger(metricclient.WantMetric{
		Metric:  "testmetric_fs_opens",
		Sandbox: args.ID,
	})
	if err != nil {
		t.Errorf("Cannot get testmetric_fs_opens: %v", err)
	}
	_, err = unfilteredData.GetSandboxMetadataMetric(metricclient.WantMetric{
		Metric:  "testmetric_meta_sandbox_metadata",
		Sandbox: args.ID,
	})
	if err != nil {
		t.Errorf("Cannot get sandbox metadata: %v", err)
	}

	// Second pass: Filter such that fs_opens does not match.
	// Metadata metrics are expected to remain visible despite the filter.
	filteredData, err := te.client.GetMetrics(te.testCtx, map[string]string{
		"runsc-sandbox-metrics-filter": "^$", // Matches nothing.
	})
	if err != nil {
		t.Fatalf("Cannot get metrics: %v", err)
	}
	_, _, err = filteredData.GetPrometheusContainerInteger(metricclient.WantMetric{
		Metric:  "testmetric_fs_opens",
		Sandbox: args.ID,
	})
	if err == nil {
		t.Errorf("Was unexpectedly able to get fs_opens data from filtered data:\n\n%v\n\n", filteredData)
	}
	_, err = filteredData.GetSandboxMetadataMetric(metricclient.WantMetric{
		Metric:  "testmetric_meta_sandbox_metadata",
		Sandbox: args.ID,
	})
	if err != nil {
		t.Errorf("Cannot get sandbox metadata from filtered data: %v", err)
	}

	// Third pass: Filter such that fs_opens does match.
	// Note: the filter matches the unprefixed metric name ("fs_opens"), not the
	// exporter-prefixed one.
	filteredData2, err := te.client.GetMetrics(te.testCtx, map[string]string{
		"runsc-sandbox-metrics-filter": "^fs_.*$",
	})
	if err != nil {
		t.Fatalf("Cannot get metrics: %v", err)
	}
	_, _, err = filteredData2.GetPrometheusContainerInteger(metricclient.WantMetric{
		Metric:  "testmetric_fs_opens",
		Sandbox: args.ID,
	})
	if err != nil {
		t.Errorf("Cannot get testmetric_fs_opens from filtered data: %v", err)
	}
	_, err = filteredData2.GetSandboxMetadataMetric(metricclient.WantMetric{
		Metric:  "testmetric_meta_sandbox_metadata",
		Sandbox: args.ID,
	})
	if err != nil {
		t.Errorf("Cannot get sandbox metadata from filtered data: %v", err)
	}

	// Fourth pass: Filter such that fs_opens does not match, then request with no filtering,
	// to ensure that the filter regex caching is correctly applied.
	_, err = te.client.GetMetrics(te.testCtx, map[string]string{
		"runsc-sandbox-metrics-filter": "^$",
	})
	if err != nil {
		t.Fatalf("Cannot get metrics: %v", err)
	}
	unfilteredData2, err := te.client.GetMetrics(te.testCtx, nil)
	if err != nil {
		t.Fatalf("Cannot get metrics: %v", err)
	}
	_, _, err = unfilteredData2.GetPrometheusContainerInteger(metricclient.WantMetric{
		Metric:  "testmetric_fs_opens",
		Sandbox: args.ID,
	})
	if err != nil {
		t.Errorf("Cannot get testmetric_fs_opens from unfiltered data: %v", err)
	}
	_, err = unfilteredData2.GetSandboxMetadataMetric(metricclient.WantMetric{
		Metric:  "testmetric_meta_sandbox_metadata",
		Sandbox: args.ID,
	})
	if err != nil {
		t.Errorf("Cannot get sandbox metadata from unfiltered data: %v", err)
	}

	// Fifth pass: Use alternate URL encoding to mimic Prometheus's URL-encoding
	// behavior.
	// NOTE(review): an empty map key appears to make the metric client use the
	// value as a raw pre-encoded request path — confirm against metricclient.
	alternatePathData, err := te.client.GetMetrics(te.testCtx, map[string]string{
		// Encoded version of "/metrics?runsc-sandbox-metrics-filter=^$", this should match nothing.
		"": "/metrics%3Frunsc-sandbox-metrics-filter=%5E%24",
	})
	if err != nil {
		t.Fatalf("Cannot get metrics: %v", err)
	}
	_, err = alternatePathData.GetSandboxMetadataMetric(metricclient.WantMetric{
		Metric:  "testmetric_meta_sandbox_metadata",
		Sandbox: args.ID,
	})
	if err != nil {
		t.Errorf("Cannot get sandbox metadata from data obtained from alternate path: %v\n\nData:\n\n%v\n\n", err, alternatePathData)
	}
	_, _, err = alternatePathData.GetPrometheusContainerInteger(metricclient.WantMetric{
		Metric:  "testmetric_fs_opens",
		Sandbox: args.ID,
	})
	if err == nil {
		t.Errorf("Was unexpectedly able to get testmetric_fs_opens from data obtained from alternate path which was supposed to filter it out:\n\n%v\n\n", alternatePathData)
	}
}
   675  
   676  // TestContainerCapabilityFilter verifies the ability to filter capabilities in /metrics requests.
   677  func TestContainerCapabilityFilter(t *testing.T) {
   678  	te, cleanup := setupMetrics(t, false /* forceTempUDS */)
   679  	defer cleanup()
   680  	te.sleepSpec.Process.Capabilities.Bounding = append(
   681  		te.sleepSpec.Process.Capabilities.Bounding,
   682  		linux.CAP_SYS_NICE.String(),
   683  		linux.CAP_NET_RAW.String())
   684  
   685  	args := Args{
   686  		ID:        testutil.RandomContainerID(),
   687  		Spec:      te.sleepSpec,
   688  		BundleDir: te.bundleDir,
   689  	}
   690  	cont, err := New(te.sleepConf, args)
   691  	if err != nil {
   692  		t.Fatalf("error creating container: %v", err)
   693  	}
   694  	defer cont.Destroy()
   695  	if err := cont.Start(te.sleepConf); err != nil {
   696  		t.Fatalf("Cannot start container: %v", err)
   697  	}
   698  
   699  	for _, test := range []struct {
   700  		name   string
   701  		filter string
   702  		want   map[linux.Capability]bool
   703  	}{
   704  		{
   705  			name:   "unfiltered",
   706  			filter: "",
   707  			want:   map[linux.Capability]bool{linux.CAP_SYS_NICE: true, linux.CAP_NET_RAW: true},
   708  		},
   709  		{
   710  			name:   "all filtered out",
   711  			filter: "^$",
   712  			want:   map[linux.Capability]bool{linux.CAP_SYS_NICE: false, linux.CAP_NET_RAW: false},
   713  		},
   714  		{
   715  			name:   "simple filter with prefix",
   716  			filter: fmt.Sprintf("^%s$", linux.CAP_SYS_NICE.String()),
   717  			want:   map[linux.Capability]bool{linux.CAP_SYS_NICE: true, linux.CAP_NET_RAW: false},
   718  		},
   719  		{
   720  			name:   "simple filter without prefix",
   721  			filter: fmt.Sprintf("^%s$", linux.CAP_SYS_NICE.TrimmedString()),
   722  			want:   map[linux.Capability]bool{linux.CAP_SYS_NICE: true, linux.CAP_NET_RAW: false},
   723  		},
   724  		{
   725  			name:   "unfiltered again to test regexp caching",
   726  			filter: "",
   727  			want:   map[linux.Capability]bool{linux.CAP_SYS_NICE: true, linux.CAP_NET_RAW: true},
   728  		},
   729  	} {
   730  		t.Run(test.name, func(t *testing.T) {
   731  			var params map[string]string
   732  			if test.filter != "" {
   733  				params = map[string]string{
   734  					"runsc-capability-filter": test.filter,
   735  				}
   736  			}
   737  			data, err := te.client.GetMetrics(te.testCtx, params)
   738  			if err != nil {
   739  				t.Fatalf("Cannot get metrics: %v", err)
   740  			}
   741  			for cap, want := range test.want {
   742  				got, _, err := data.GetPrometheusContainerInteger(metricclient.WantMetric{
   743  					Metric:      "testmetric_meta_sandbox_capabilities",
   744  					Sandbox:     args.ID,
   745  					ExtraLabels: map[string]string{"capability": cap.TrimmedString()},
   746  				})
   747  				if err != nil && want {
   748  					t.Errorf("Cannot get testmetric_meta_sandbox_capabilities[capability=%q]: %v", cap.TrimmedString(), err)
   749  				} else if err == nil && !want {
   750  					t.Errorf("Unexpectedly able to get testmetric_meta_sandbox_capabilities[capability=%q]: %v", cap.TrimmedString(), got)
   751  				}
   752  			}
   753  			if t.Failed() {
   754  				t.Logf("Metric data:\n\n%s\n\n", data)
   755  			}
   756  		})
   757  	}
   758  }
   759  
   760  func TestMetricServerChecksRootDirectoryAccess(t *testing.T) {
   761  	te, cleanup := setupMetrics(t /* forceTempUDS= */, false)
   762  	defer cleanup()
   763  	if err := te.client.ShutdownServer(te.testCtx); err != nil {
   764  		t.Fatalf("Cannot stop metric server: %v", err)
   765  	}
   766  	prevStat, err := os.Lstat(te.sleepConf.RootDir)
   767  	if err != nil {
   768  		t.Fatalf("cannot stat %q: %v", te.sleepConf.RootDir, err)
   769  	}
   770  	if err := os.Chmod(te.sleepConf.RootDir, 0); err != nil {
   771  		t.Fatalf("cannot chmod %q as 000: %v", te.sleepConf.RootDir, err)
   772  	}
   773  	defer os.Chmod(te.sleepConf.RootDir, prevStat.Mode())
   774  	if _, err := ioutil.ReadDir(te.sleepConf.RootDir); err == nil {
   775  		t.Logf("Can still read directory %v despite chmodding it to 0. Maybe we are running as root? Skipping test.", te.sleepConf.RootDir)
   776  		return
   777  	}
   778  	shorterCtx, shorterCtxCancel := context.WithTimeout(te.testCtx, time.Second)
   779  	defer shorterCtxCancel()
   780  	if err := te.client.SpawnServer(shorterCtx, te.sleepConf, te.serverExtraArgs...); err == nil {
   781  		t.Error("Metric server was successfully able to be spawned despite not having access to the root directory")
   782  	}
   783  }
   784  
   785  func TestMetricServerToleratesNoRootDirectory(t *testing.T) {
   786  	te, cleanup := setupMetrics(t /* forceTempUDS= */, true)
   787  	defer cleanup()
   788  	if err := te.client.ShutdownServer(te.testCtx); err != nil {
   789  		t.Fatalf("Cannot stop metric server: %v", err)
   790  	}
   791  	if err := os.RemoveAll(te.sleepConf.RootDir); err != nil {
   792  		t.Fatalf("cannot remove root directory %q: %v", te.sleepConf.RootDir, err)
   793  	}
   794  	shortCtx, shortCtxCancel := context.WithTimeout(te.testCtx, time.Second)
   795  	defer shortCtxCancel()
   796  	if err := te.client.SpawnServer(shortCtx, te.sleepConf, append([]string{"--allow-unknown-root=false"}, te.serverExtraArgs...)...); err == nil {
   797  		t.Fatalf("Metric server was successfully able to be spawned despite a non-existent root directory")
   798  	}
   799  	if err := te.client.SpawnServer(te.testCtx, te.sleepConf, append([]string{"--allow-unknown-root=true"}, te.serverExtraArgs...)...); err != nil {
   800  		t.Errorf("Metric server was not able to be spawned despite being configured to tolerate a non-existent root directory: %v", err)
   801  	}
   802  }
   803  
   804  func TestMetricServerDoesNotExportZeroValueCounters(t *testing.T) {
   805  	te, cleanup := setupMetrics(t, false /* forceTempUDS */)
   806  	defer cleanup()
   807  	app, err := testutil.FindFile("test/cmd/test_app/test_app")
   808  	if err != nil {
   809  		t.Fatalf("error finding test_app: %v", err)
   810  	}
   811  	unimpl1Spec := testutil.NewSpecWithArgs("sh", "-c", fmt.Sprintf("%s syscall --syscall=1337; sleep 1h", app))
   812  	unimpl1Conf := te.applyConf(testutil.TestConfig(t))
   813  	unimpl1Bundle, cleanup, err := testutil.SetupBundleDir(unimpl1Spec)
   814  	if err != nil {
   815  		t.Fatalf("error setting up container: %v", err)
   816  	}
   817  	defer cleanup()
   818  	unimpl2Spec := testutil.NewSpecWithArgs("sh", "-c", fmt.Sprintf("%s syscall --syscall=1338; sleep 1h", app))
   819  	unimpl2Conf := te.applyConf(testutil.TestConfig(t))
   820  	unimpl2Bundle, cleanup, err := testutil.SetupBundleDir(unimpl2Spec)
   821  	if err != nil {
   822  		t.Fatalf("error setting up container: %v", err)
   823  	}
   824  	defer cleanup()
   825  	unimpl1, err := New(unimpl1Conf, Args{
   826  		ID:        testutil.RandomContainerID(),
   827  		Spec:      unimpl1Spec,
   828  		BundleDir: unimpl1Bundle,
   829  	})
   830  	if err != nil {
   831  		t.Fatalf("error creating first container: %v", err)
   832  	}
   833  	defer unimpl1.Destroy()
   834  	if err := unimpl1.Start(unimpl1Conf); err != nil {
   835  		t.Fatalf("Cannot start first container: %v", err)
   836  	}
   837  	unimpl2, err := New(unimpl2Conf, Args{
   838  		ID:        testutil.RandomContainerID(),
   839  		Spec:      unimpl2Spec,
   840  		BundleDir: unimpl2Bundle,
   841  	})
   842  	if err != nil {
   843  		t.Fatalf("error creating second container: %v", err)
   844  	}
   845  	defer unimpl2.Destroy()
   846  	if err := unimpl2.Start(unimpl2Conf); err != nil {
   847  		t.Fatalf("Cannot start second container: %v", err)
   848  	}
   849  	metricData, err := te.client.GetMetrics(te.testCtx, nil)
   850  	if err != nil {
   851  		t.Fatalf("Cannot get metrics: %v", err)
   852  	}
   853  	metricDataPtr := &metricData
   854  
   855  	// For this test to work, it must wait for long enough such that the containers have
   856  	// actually tried to call the unimplemented syscall so that it shows up in metrics.
   857  	waitCtx, waitCtxCancel := context.WithTimeout(te.testCtx, 50*time.Second)
   858  	defer waitCtxCancel()
   859  
   860  	for _, test := range []struct {
   861  		cont          *Container
   862  		sysno         uintptr
   863  		wantExistence bool
   864  	}{
   865  		{unimpl1, 1337, true},
   866  		{unimpl1, 1338, false},
   867  		{unimpl2, 1337, false},
   868  		{unimpl2, 1338, true},
   869  	} {
   870  		t.Run(fmt.Sprintf("container %s syscall %d", test.cont.ID, test.sysno), func(t *testing.T) {
   871  			check := func() error {
   872  				got, _, err := metricDataPtr.GetPrometheusContainerInteger(metricclient.WantMetric{
   873  					Metric:      "testmetric_unimplemented_syscalls",
   874  					Sandbox:     test.cont.sandboxID(),
   875  					ExtraLabels: map[string]string{"sysno": strconv.Itoa(int(test.sysno))},
   876  				})
   877  				if test.wantExistence {
   878  					if err != nil {
   879  						return fmt.Errorf("cannot get unimplemented syscall metric for sysno=%d even though we expected its presence: %v", test.sysno, err)
   880  					}
   881  					if got != 1 {
   882  						return fmt.Errorf("expected counter value for unimplemented syscall %d be exactly 1, got %d", test.sysno, got)
   883  					}
   884  				} else /* !test.wantExistence */ {
   885  					if err == nil {
   886  						return fmt.Errorf("unimplemented syscall metric for sysno=%d was unexpectedly present (value: %d)", test.sysno, got)
   887  					}
   888  				}
   889  				return nil
   890  			}
   891  			for waitCtx.Err() == nil {
   892  				if check() == nil {
   893  					break
   894  				}
   895  				select {
   896  				case <-time.After(20 * time.Millisecond):
   897  					newMetricData, err := te.client.GetMetrics(te.testCtx, nil)
   898  					if err != nil {
   899  						t.Fatalf("Cannot get metrics: %v", err)
   900  					}
   901  					*metricDataPtr = newMetricData
   902  				case <-waitCtx.Done():
   903  				}
   904  			}
   905  			if err := check(); err != nil {
   906  				t.Error(err.Error())
   907  			}
   908  		})
   909  	}
   910  	if t.Failed() {
   911  		t.Logf("Last metric data:\n\n%s\n\n", metricData)
   912  	}
   913  }