gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/runsc/metricserver/metricserver.go

     1  // Copyright 2023 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package metricserver implements a Prometheus metric server for runsc data.
    16  package metricserver
    17  
    18  import (
    19  	"context"
    20  	"errors"
    21  	"fmt"
    22  	"io/ioutil"
    23  	"math/rand"
    24  	"net"
    25  	"net/http"
    26  	"os"
    27  	"os/signal"
    28  	"regexp"
    29  	"runtime"
    30  	"runtime/debug"
    31  	"strconv"
    32  	"strings"
    33  	"syscall"
    34  	"time"
    35  
    36  	"gvisor.dev/gvisor/pkg/abi/linux"
    37  	"gvisor.dev/gvisor/pkg/atomicbitops"
    38  	"gvisor.dev/gvisor/pkg/log"
    39  	"gvisor.dev/gvisor/pkg/prometheus"
    40  	"gvisor.dev/gvisor/pkg/sentry/control"
    41  	"gvisor.dev/gvisor/pkg/state"
    42  	"gvisor.dev/gvisor/pkg/sync"
    43  	"gvisor.dev/gvisor/runsc/config"
    44  	"gvisor.dev/gvisor/runsc/container"
    45  	"gvisor.dev/gvisor/runsc/metricserver/containermetrics"
    46  	"gvisor.dev/gvisor/runsc/sandbox"
    47  )
    48  
    49  const (
    50  	// metricsExportTimeout is the maximum amount of time that the metrics export process should take.
    51  	metricsExportTimeout = 30 * time.Second
    52  
    53  	// metricsExportPerSandboxTimeout is the maximum amount of time that we wait on any individual
    54  	// sandbox when exporting its metrics.
    55  	metricsExportPerSandboxTimeout = 8 * time.Second
    56  
    57  	// exportParallelGoroutines is the maximum number of goroutines spawned during metrics export.
    58  	exportParallelGoroutines = 8
    59  )
    60  
    61  // servedSandbox is a sandbox that we serve metrics from.
    62  // A single metrics server will export data about multiple sandboxes.
    63  type servedSandbox struct {
    64  	rootContainerID container.FullID
    65  	server          *metricServer
    66  	extraLabels     map[string]string
    67  
    68  	// mu protects the fields below.
    69  	mu sync.Mutex
    70  
    71  	// sandbox is the sandbox being monitored.
    72  	// Once set, it is immutable.
    73  	sandbox *sandbox.Sandbox
    74  
    75  	// createdAt stores the time the sandbox was created.
    76  	// It is loaded from the container state file.
    77  	// Once set, it is immutable.
    78  	createdAt time.Time
    79  
    80  	// capabilities is the union of the capability set of the containers within `sandbox`.
    81  	// It is used to export a per-sandbox metric representing which capabilities are in use.
    82  	// For monitoring purposes, a capability added in a container means it is considered
    83  	// added for the whole sandbox.
    84  	capabilities []linux.Capability
    85  
    86  	// specMetadataLabels is the set of labels exported as part of the
    87  	// `spec_metadata` metric.
    88  	specMetadataLabels map[string]string
    89  
    90  	// verifier allows verifying the data integrity of the metrics we get from this sandbox.
    91  	// It is not always initialized when the sandbox is discovered, but rather upon first metrics
    92  	// access to the sandbox. Metric registration data is loaded from the root container's
    93  	// state file.
    94  	// The server needs to load this registration data before any data from this sandbox is
    95  	// served to HTTP clients. If there is no metric registration data within the Container
    96  	// data, then metrics were not requested for this sandbox, and this servedSandbox should
    97  	// be deleted from the server.
    98  	// Once set, it is immutable.
    99  	verifier *prometheus.Verifier
   100  
   101  	// cleanupVerifier holds a reference to the cleanup function of the verifier.
   102  	cleanupVerifier func()
   103  
   104  	// extra contains additional per-sandbox data.
   105  	extra sandboxData
   106  }
   107  
   108  // load loads the sandbox being monitored and initializes its metric verifier.
   109  // If it returns an error other than container.ErrStateFileLocked, the sandbox is either
   110  // non-existent, or has not requested instrumentation to be enabled, or does not have
   111  // valid metric registration data. In any of these cases, the sandbox should be removed
   112  // from this metrics server.
   113  func (s *servedSandbox) load() (*sandbox.Sandbox, *prometheus.Verifier, error) {
   114  	s.mu.Lock()
   115  	defer s.mu.Unlock()
   116  	if s.sandbox == nil {
   117  		allContainers, err := container.LoadSandbox(s.server.rootDir, s.rootContainerID.SandboxID, container.LoadOpts{
   118  			TryLock: container.TryAcquire,
   119  		})
   120  		if err != nil {
   121  			return nil, nil, fmt.Errorf("cannot load sandbox %q: %v", s.rootContainerID.SandboxID, err)
   122  		}
   123  		var rootContainer *container.Container
   124  		for _, cont := range allContainers {
   125  			if cont.IsSandboxRoot() {
   126  				if rootContainer != nil {
   127  					return nil, nil, fmt.Errorf("multiple root containers found for sandbox ID %q: %v and %v", s.rootContainerID.SandboxID, cont, rootContainer)
   128  				}
   129  				rootContainer = cont
   130  			}
   131  		}
   132  		if rootContainer == nil {
   133  			return nil, nil, fmt.Errorf("no root container found for sandbox ID %q", s.rootContainerID.SandboxID)
   134  		}
   135  		sandboxMetricAddr := strings.ReplaceAll(rootContainer.Sandbox.MetricServerAddress, "%RUNTIME_ROOT%", s.server.rootDir)
   136  		if sandboxMetricAddr == "" {
   137  			return nil, nil, errors.New("sandbox did not request instrumentation")
   138  		}
   139  		if sandboxMetricAddr != s.server.address {
   140  			return nil, nil, fmt.Errorf("sandbox requested instrumentation by a metric server running at a different address (sandbox wants %q, this metric server serves %q)", sandboxMetricAddr, s.server.address)
   141  		}
   142  		// Update label data as read from the state file.
   143  		// Do not store empty labels.
   144  		authoritativeLabels, err := containermetrics.SandboxPrometheusLabels(rootContainer)
   145  		if err != nil {
   146  			return nil, nil, fmt.Errorf("cannot compute Prometheus labels of sandbox: %v", err)
   147  		}
   148  		s.extraLabels = make(map[string]string, len(authoritativeLabels))
   149  		for _, label := range []string{
   150  			prometheus.SandboxIDLabel,
   151  			prometheus.IterationIDLabel,
   152  			prometheus.PodNameLabel,
   153  			prometheus.NamespaceLabel,
   154  		} {
   155  			s.extraLabels[label] = authoritativeLabels[label]
   156  			if s.extraLabels[label] == "" {
   157  				delete(s.extraLabels, label)
   158  			}
   159  		}
   160  
   161  		// Compute capability set.
   162  		allCaps := linux.AllCapabilities()
   163  		capSet := make([]linux.Capability, 0, len(allCaps))
   164  		for _, cap := range allCaps {
   165  			for _, cont := range allContainers {
   166  				if cont.HasCapabilityInAnySet(cap) {
   167  					capSet = append(capSet, cap)
   168  					break
   169  				}
   170  			}
   171  		}
   172  		if len(capSet) > 0 {
   173  			// Reallocate a slice with minimum size, since it will be long-lived.
   174  			s.capabilities = make([]linux.Capability, len(capSet))
   175  			for i, c := range capSet {
   176  				s.capabilities[i] = c
   177  			}
   178  		}
   179  
   180  		// Compute spec metadata.
   181  		s.specMetadataLabels = containermetrics.ComputeSpecMetadata(allContainers)
   182  
   183  		s.sandbox = rootContainer.Sandbox
   184  		s.createdAt = rootContainer.CreatedAt
   185  	}
   186  	if s.verifier == nil {
   187  		registeredMetrics, err := s.sandbox.GetRegisteredMetrics()
   188  		if err != nil {
   189  			return nil, nil, err
   190  		}
   191  		verifier, cleanup, err := prometheus.NewVerifier(registeredMetrics)
   192  		if err != nil {
   193  			return nil, nil, err
   194  		}
   195  		s.verifier = verifier
   196  		s.cleanupVerifier = cleanup
   197  	}
   198  	if err := s.extra.load(s); err != nil {
   199  		return nil, nil, err
   200  	}
   201  	return s.sandbox, s.verifier, nil
   202  }
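        // Illustrative sketch, not part of the original file: how callers are expected to treat
        // errors from load(), mirroring refreshSandboxesLocked further below. container.ErrStateFileLocked
        // is transient (the sandbox is likely still being set up), while any other error means the
        // sandbox should be removed from the server.
        //
        //	if _, _, err := served.load(); err != nil && err != container.ErrStateFileLocked {
        //		served.cleanup()
        //		delete(m.sandboxes, id) // id is a placeholder for the sandbox's container.FullID.
        //	}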
   203  
   204  func (s *servedSandbox) cleanup() {
   205  	s.mu.Lock()
   206  	defer s.mu.Unlock()
   207  	if s.cleanupVerifier != nil {
   208  		s.cleanupVerifier()
   209  	}
   210  }
   211  
   212  // querySandboxMetrics queries the sandbox for metrics data.
   213  func querySandboxMetrics(ctx context.Context, sand *sandbox.Sandbox, verifier *prometheus.Verifier, metricsFilter string) (*prometheus.Snapshot, error) {
   214  	ch := make(chan struct {
   215  		snapshot *prometheus.Snapshot
   216  		err      error
   217  	}, 1)
   218  	canceled := make(chan struct{}, 1)
   219  	defer close(canceled)
   220  	go func() {
   221  		snapshot, err := sand.ExportMetrics(control.MetricsExportOpts{
   222  			OnlyMetrics: metricsFilter,
   223  		})
   224  		select {
   225  		case <-canceled:
   226  		case ch <- struct {
   227  			snapshot *prometheus.Snapshot
   228  			err      error
   229  		}{snapshot, err}:
   230  			close(ch)
   231  		}
   232  	}()
   233  	select {
   234  	case <-ctx.Done():
   235  		canceled <- struct{}{}
   236  		return nil, ctx.Err()
   237  	case ret := <-ch:
   238  		if ret.err != nil {
   239  			return nil, ret.err
   240  		}
   241  		if err := verifier.Verify(ret.snapshot); err != nil {
   242  			return nil, err
   243  		}
   244  		return ret.snapshot, nil
   245  	}
   246  }
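        // Illustrative sketch, not part of the original file: querySandboxMetrics is meant to be
        // called with a context that carries a deadline, such as the per-sandbox timeout used by
        // queryMultiSandboxMetrics below. An empty filter string exports all metrics.
        //
        //	queryCtx, cancel := context.WithTimeout(ctx, metricsExportPerSandboxTimeout)
        //	defer cancel()
        //	snapshot, err := querySandboxMetrics(queryCtx, sand, verifier, "")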
   247  
   248  // metricServer implements the metric server.
   249  type metricServer struct {
   250  	rootDir                string
   251  	pid                    int
   252  	pidFile                string
   253  	allowUnknownRoot       bool
   254  	exposeProfileEndpoints bool
   255  	address                string
   256  	exporterPrefix         string
   257  	startTime              time.Time
   258  	srv                    http.Server
   259  
   260  	// Size of the map of written metrics during the last /metrics export. Initially zero.
   261  	// Used to efficiently reallocate a map of the right size during the next export.
   262  	lastMetricsWrittenSize atomicbitops.Uint32
   263  
   264  	// Pool of `prometheus.ReusableWriter`s. Used to avoid large buffer allocations for
   265  	// successive snapshots.
   266  	promWriterPool sync.Pool
   267  
   268  	// mu protects the fields below.
   269  	mu sync.Mutex
   270  
   271  	// udsPath is a path to a Unix Domain Socket file on which the server is bound and which it owns.
   272  	// This socket file will be deleted on server shutdown.
   273  	// This field is not set if binding to a network port, or when the UDS already existed prior to
   274  	// being bound by us (i.e. we don't own it), in which case it is not deleted on shutdown.
   275  	// The field is unset once the file is successfully removed.
   276  	udsPath string
   277  
   278  	// sandboxes is the list of sandboxes we serve metrics for.
   279  	sandboxes map[container.FullID]*servedSandbox
   280  
   281  	// lastStateFileStat maps container full IDs to the last observed stat() of their state file.
   282  	// This is used to monitor for sandboxes in the background. If a sandbox's state file matches this
   283  	// info, we can assume that the last background scan already looked at it.
   284  	lastStateFileStat map[container.FullID]os.FileInfo
   285  
   286  	// lastValidMetricFilter stores the last value of the "runsc-sandbox-metrics-filter" parameter for
   287  	// /metrics requests.
   288  	// It represents the last-known compilable regular expression that was passed to /metrics.
   289  	// It is used to avoid re-verifying this parameter in the common case where a single scraper
   290  	// is consistently passing in the same value for this parameter in each successive request.
   291  	lastValidMetricFilter string
   292  
   293  	// lastValidCapabilityFilterStr stores the last value of the "runsc-capability-filter" parameter
   294  	// for /metrics requests.
   295  	// It represents the last-known compilable regular expression that was passed to /metrics.
   296  	// It is used to avoid re-verifying this parameter in the common case where a single scraper
   297  	// is consistently passing in the same value for this parameter in each successive request.
   298  	lastValidCapabilityFilterStr string
   299  
   300  	// lastValidCapabilityFilterReg is the compiled regular expression corresponding to
   301  	// lastValidCapabilityFilterStr.
   302  	lastValidCapabilityFilterReg *regexp.Regexp
   303  
   304  	// numSandboxes counts the number of sandboxes that have ever been registered on this server.
   305  	// Used to distinguish between the case where this metrics server has sat there doing nothing
   306  	// because no sandbox ever registered against it (which is unexpected), vs the case where it has
   307  	// done a good job serving sandbox metrics and it's time for it to gracefully die as there are no
   308  	// more sandboxes to serve.
   309  	// Also exported as a metric of total number of sandboxes started.
   310  	numSandboxes int64
   311  
   312  	// shuttingDown is flipped to true when the server shutdown process has started.
   313  	// Used to deal with race conditions where a sandbox is trying to register after the server has
   314  	// already started to go to sleep.
   315  	shuttingDown bool
   316  
   317  	// shutdownCh is written to when receiving the signal to shut down gracefully.
   318  	shutdownCh chan os.Signal
   319  
   320  	// extraData contains additional server-wide data.
   321  	extra serverData
   322  }
   323  
   324  // sufficientlyEqualStats returns whether the given FileInfo's are sufficiently
   325  // equal to assume the file they represent has not changed between the time
   326  // each FileInfo was obtained.
   327  func sufficientlyEqualStats(s1, s2 os.FileInfo) bool {
   328  	if !s1.ModTime().Equal(s2.ModTime()) {
   329  		return false
   330  	}
   331  	if s1.Size() != s2.Size() {
   332  		return false
   333  	}
   334  	statT1, ok1 := s1.Sys().(*syscall.Stat_t)
   335  	statT2, ok2 := s2.Sys().(*syscall.Stat_t)
   336  	if ok1 != ok2 {
   337  		return false
   338  	}
   339  	if ok1 && ok2 {
   340  		if statT1.Dev != statT2.Dev {
   341  			return false
   342  		}
   343  		if statT1.Ino != statT2.Ino {
   344  			return false
   345  		}
   346  	}
   347  	return true
   348  }
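        // Illustrative sketch, not part of the original file: the server stat()s each sandbox's
        // state file on every background scan and skips re-reading it when nothing changed.
        // The path below is hypothetical.
        //
        //	previous, _ := os.Stat("/var/run/runsc/mysandbox.state") // From the previous scan.
        //	current, _ := os.Stat("/var/run/runsc/mysandbox.state")  // From the current scan.
        //	if sufficientlyEqualStats(previous, current) {
        //		// Same mtime, size, device and inode: assume the file content is unchanged.
        //	}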
   349  
   350  // refreshSandboxesLocked removes sandboxes that are no longer running from m.sandboxes, and
   351  // adds sandboxes found in the root directory that do request instrumentation.
   352  // Preconditions: m.mu is locked.
   353  func (m *metricServer) refreshSandboxesLocked() {
   354  	if m.shuttingDown {
   355  		// Do nothing to avoid log spam.
   356  		return
   357  	}
   358  	sandboxIDs, err := container.ListSandboxes(m.rootDir)
   359  	if err != nil {
   360  		if !m.allowUnknownRoot {
   361  			log.Warningf("Cannot list containers in root directory %s, it has likely gone away: %v.", m.rootDir, err)
   362  		}
   363  		return
   364  	}
   365  	for sandboxID, sandbox := range m.sandboxes {
   366  		found := false
   367  		for _, sid := range sandboxIDs {
   368  			if sid == sandboxID {
   369  				found = true
   370  				break
   371  			}
   372  		}
   373  		if !found {
   374  			log.Warningf("Sandbox %s no longer exists but did not explicitly unregister. Removing it.", sandboxID)
   375  			sandbox.cleanup()
   376  			delete(m.sandboxes, sandboxID)
   377  			continue
   378  		}
   379  		if _, _, err := sandbox.load(); err != nil && err != container.ErrStateFileLocked {
   380  			log.Warningf("Sandbox %s cannot be loaded, deleting it: %v", sandboxID, err)
   381  			sandbox.cleanup()
   382  			delete(m.sandboxes, sandboxID)
   383  			continue
   384  		}
   385  		if !sandbox.sandbox.IsRunning() {
   386  			log.Infof("Sandbox %s is no longer running, deleting it.", sandboxID)
   387  			sandbox.cleanup()
   388  			delete(m.sandboxes, sandboxID)
   389  			continue
   390  		}
   391  	}
   392  	newSandboxIDs := make(map[container.FullID]bool, len(sandboxIDs))
   393  	for _, sid := range sandboxIDs {
   394  		if _, found := m.sandboxes[sid]; found {
   395  			continue
   396  		}
   397  		newSandboxIDs[sid] = true
   398  	}
   399  	for sid := range m.lastStateFileStat {
   400  		if _, found := newSandboxIDs[sid]; !found {
   401  			delete(m.lastStateFileStat, sid)
   402  		}
   403  	}
   404  	for sid := range newSandboxIDs {
   405  		stateFile := container.StateFile{
   406  			RootDir: m.rootDir,
   407  			ID:      sid,
   408  		}
   409  		stat, err := stateFile.Stat()
   410  		if err != nil {
   411  			log.Warningf("Failed to stat() container state file for sandbox %q: %v", sid, err)
   412  			continue
   413  		}
   414  		if existing, found := m.lastStateFileStat[sid]; found {
   415  			// We already looked at this sandbox's state file and decided not to pick it up.
   416  			// Check whether the state file has changed since then. If it hasn't, we don't
   417  			// want to try again.
   418  			if sufficientlyEqualStats(existing, stat) {
   419  				continue
   420  			}
   421  			log.Infof("State file for sandbox %q has changed since we last looked at it; will try to reload it.", sid)
   422  			delete(m.lastStateFileStat, sid)
   423  		}
   424  		// If we get here, we either haven't seen this sandbox before, or we saw it
   425  		// and it has disappeared (which means it is new in this iteration), or we
   426  		// saw it before but its state file changed. Either way, we want to try
   427  		// loading it and see if it wants instrumentation.
   428  		cont, err := container.Load(m.rootDir, sid, container.LoadOpts{
   429  			Exact:         true,
   430  			SkipCheck:     true,
   431  			TryLock:       container.TryAcquire,
   432  			RootContainer: true,
   433  		})
   434  		if err != nil {
   435  			if err == container.ErrStateFileLocked {
   436  				// This error is OK and shouldn't generate log spam. The sandbox is probably in the middle
   437  				// of being created.
   438  				continue
   439  			}
   440  			log.Warningf("Cannot load state file for sandbox %q: %v", sid, err)
   441  			continue
   442  		}
   443  
   444  		// This is redundant with one of the checks performed below in servedSandbox.load, but this
   445  		// avoids log spam for the non-error case of sandboxes that didn't request instrumentation.
   446  		sandboxMetricAddr := strings.ReplaceAll(cont.Sandbox.MetricServerAddress, "%RUNTIME_ROOT%", m.rootDir)
   447  		if sandboxMetricAddr != m.address {
   448  			m.lastStateFileStat[sid] = stat
   449  			continue
   450  		}
   451  
   452  		// This case can be hit when there is a leftover state file for a sandbox that was `kill -9`'d
   453  		// without an opportunity for it to clean up its state file. This results in a valid state file
   454  		// but the sandbox PID is gone. We don't want to continuously load this sandbox's state file.
   455  		if cont.Status == container.Running && !cont.Sandbox.IsRunning() {
   456  			log.Warningf("Sandbox %q has state file in state Running, yet it isn't actually running. Ignoring it.", sid)
   457  			m.lastStateFileStat[sid] = stat
   458  			continue
   459  		}
   460  
   461  		m.numSandboxes++
   462  		served := &servedSandbox{
   463  			rootContainerID: sid,
   464  			server:          m,
   465  			extraLabels: map[string]string{
   466  				prometheus.SandboxIDLabel: sid.SandboxID,
   467  			},
   468  		}
   469  		// Best-effort attempt to load the state file instantly.
   470  		// This may legitimately fail if it is locked, e.g. during sandbox startup.
   471  		// If it fails for any other reason, then the sandbox went away between the time we listed the
   472  		// sandboxes and now, so just delete it.
   473  		if _, _, err := served.load(); err != nil && err != container.ErrStateFileLocked {
   474  			log.Warningf("Sandbox %q cannot be loaded, ignoring it: %v", sid, err)
   475  			m.lastStateFileStat[sid] = stat
   476  			served.cleanup()
   477  			continue
   478  		}
   479  		m.sandboxes[sid] = served
   480  		log.Infof("Registered new sandbox found in root directory: %q", sid)
   481  	}
   482  }
   483  
   484  // sandboxLoadResult contains the outcome of calling `load` on a `servedSandbox`.
   485  // It is used as an intermediary type that contains all that we know about a
   486  // sandbox after attempting to load its state file, but does not contain any
   487  // metric data from the sandbox.
   488  type sandboxLoadResult struct {
   489  	served   *servedSandbox
   490  	sandbox  *sandbox.Sandbox
   491  	verifier *prometheus.Verifier
   492  	err      error
   493  }
   494  
   495  // loadSandboxesLocked loads the state file data from all known sandboxes.
   496  // It does so in parallel, and avoids reloading sandboxes for which we have
   497  // already loaded data.
   498  func (m *metricServer) loadSandboxesLocked(ctx context.Context) []sandboxLoadResult {
   499  	m.refreshSandboxesLocked()
   500  
   501  	numGoroutines := exportParallelGoroutines
   502  	numSandboxes := len(m.sandboxes)
   503  	if numSandboxes < numGoroutines {
   504  		numGoroutines = numSandboxes
   505  	}
   506  
   507  	// First, load all the sandboxes in parallel. We need to do this while m.mu is held.
   508  	loadSandboxCh := make(chan *servedSandbox, numSandboxes)
   509  	loadedSandboxesCh := make(chan sandboxLoadResult, numSandboxes)
   510  	loadedSandboxes := make([]sandboxLoadResult, 0, numSandboxes)
   511  	for i := 0; i < numGoroutines; i++ {
   512  		go func() {
   513  			for served := range loadSandboxCh {
   514  				sand, verifier, err := served.load()
   515  				loadedSandboxesCh <- sandboxLoadResult{served, sand, verifier, err}
   516  			}
   517  		}()
   518  	}
   519  	for _, sandbox := range m.sandboxes {
   520  		loadSandboxCh <- sandbox
   521  	}
   522  	close(loadSandboxCh)
   523  	for i := 0; i < numSandboxes; i++ {
   524  		loadedSandboxes = append(loadedSandboxes, <-loadedSandboxesCh)
   525  	}
   526  	close(loadedSandboxesCh)
   527  	return loadedSandboxes
   528  }
   529  
   530  // sandboxMetricsResult is the result of calling querySandboxMetrics on a
   531  // single sandbox. It contains all of `sandboxLoadResult` but also has current
   532  // metric data (if querying metrics from the sandbox process succeeded).
   533  type sandboxMetricsResult struct {
   534  	sandboxLoadResult
   535  	isRunning bool
   536  	snapshot  *prometheus.Snapshot
   537  	err       error
   538  }
   539  
   540  // queryMultiSandboxMetrics queries metric data from multiple loaded sandboxes.
   541  // It does so in parallel and with random permutation ordering.
   542  // Only metrics matching the `metricsFilter` regular expression are queried.
   543  // For each sandbox, whether we were successful in querying its metrics or
   544  // not, the `processSandbox` function is called. This may be done in parallel,
   545  // so `processSandbox` should do its own locking so that multiple parallel
   546  // instances of itself behave appropriately.
   547  func queryMultiSandboxMetrics(ctx context.Context, loadedSandboxes []sandboxLoadResult, metricsFilter string, processSandbox func(sandboxMetricsResult)) {
   548  	numSandboxes := len(loadedSandboxes)
   549  	ctxDeadline, ok := ctx.Deadline()
   550  	if !ok {
   551  		panic("context had no deadline, this should never happen as it was created with a timeout")
   552  	}
   553  	exportStartTime := time.Now()
   554  	requestTimeLeft := ctxDeadline.Sub(exportStartTime)
   555  	perSandboxTime := requestTimeLeft
   556  	if numSandboxes != 0 {
   557  		perSandboxTime = requestTimeLeft / time.Duration(numSandboxes)
   558  	}
   559  	if perSandboxTime < metricsExportPerSandboxTimeout {
   560  		perSandboxTime = metricsExportPerSandboxTimeout
   561  	}
   562  	loadedSandboxCh := make(chan sandboxLoadResult, numSandboxes)
   563  	var wg sync.WaitGroup
   564  	numGoroutines := exportParallelGoroutines
   565  	if numSandboxes < numGoroutines {
   566  		numGoroutines = numSandboxes
   567  	}
   568  	wg.Add(numGoroutines)
   569  	for i := 0; i < numGoroutines; i++ {
   570  		go func() {
   571  			defer wg.Done()
   572  			for s := range loadedSandboxCh {
   573  				isRunning := false
   574  				var snapshot *prometheus.Snapshot
   575  				err := s.err
   576  				if err == nil {
   577  					queryCtx, queryCtxCancel := context.WithTimeout(ctx, perSandboxTime)
   578  					snapshot, err = querySandboxMetrics(queryCtx, s.sandbox, s.verifier, metricsFilter)
   579  					queryCtxCancel()
   580  					isRunning = s.sandbox.IsRunning()
   581  				}
   582  				processSandbox(sandboxMetricsResult{
   583  					sandboxLoadResult: s,
   584  					isRunning:         isRunning,
   585  					snapshot:          snapshot,
   586  					err:               err,
   587  				})
   588  			}
   589  		}()
   590  	}
   591  	// Iterate over all sandboxes.
   592  	// Important: This must be done in random order.
   593  	// A malicious/compromised sandbox may decide to stall when being asked for metrics.
   594  	// If at least `numGoroutines` sandboxes do this, this will starve other sandboxes
   595  	// from having their metrics exported, because all the goroutines will be stuck on
   596  	// the stalled sandboxes.
   597  	// One way to completely avoid this would be to spawn one goroutine per
   598  	// sandbox, but this can amount to ~hundreds of goroutines, which is not desirable
   599  	// for the metrics server.
   600  	// Another way would be to have a very strict timeout on each sandbox's export
   601  	// process, but in some cases a busy sandbox will take more than a decisecond
   602  	// or so to export its data, so this would miss some data from legitimate (but
   603  	// slow) sandboxes.
   604  	// Instead, we take a middle-of-the-road approach: we use a timeout that's not
   605  	// too strict but still ensures we make forward progress away from stalled
   606  	// sandboxes, and we also iterate across sandboxes in a different random order at
   607  	// each export. This ensures that all sandboxes eventually get a fair chance of
   608  	// being part of the "first `numGoroutines` sandboxes in line" to get their
   609  	// metric data loaded, such that a client repeatedly scraping metrics will
   610  	// eventually get data from each sandbox.
   611  	for _, sandboxIndex := range rand.Perm(len(loadedSandboxes)) {
   612  		loadedSandboxCh <- loadedSandboxes[sandboxIndex]
   613  	}
   614  	close(loadedSandboxCh)
   615  	wg.Wait()
   616  }
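        // Illustrative sketch, not part of the original file: the bounded worker pool with
        // randomized submission order used above, reduced to its skeleton. The names items,
        // workItem and process are placeholders.
        //
        //	workCh := make(chan workItem, len(items))
        //	var wg sync.WaitGroup
        //	for i := 0; i < numGoroutines; i++ {
        //		wg.Add(1)
        //		go func() {
        //			defer wg.Done()
        //			for item := range workCh {
        //				process(item)
        //			}
        //		}()
        //	}
        //	for _, i := range rand.Perm(len(items)) {
        //		workCh <- items[i]
        //	}
        //	close(workCh)
        //	wg.Wait()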
   617  
   618  // serveMetrics serves metrics requests.
   619  func (m *metricServer) serveMetrics(w *httpResponseWriter, req *http.Request) httpResult {
   620  	ctx, ctxCancel := context.WithTimeout(req.Context(), metricsExportTimeout)
   621  	defer ctxCancel()
   622  
   623  	metricsFilter := req.URL.Query().Get("runsc-sandbox-metrics-filter")
   624  	var capabilityFilterReg *regexp.Regexp
   625  	capabilityFilterStr := req.URL.Query().Get("runsc-capability-filter")
   626  
   627  	m.mu.Lock()
   628  
   629  	if metricsFilter != "" && metricsFilter != m.lastValidMetricFilter {
   630  		_, err := regexp.Compile(metricsFilter)
   631  		if err != nil {
   632  			m.mu.Unlock()
   633  			return httpResult{http.StatusBadRequest, errors.New("provided metric filter is not a valid regular expression")}
   634  		}
   635  		m.lastValidMetricFilter = metricsFilter
   636  	}
   637  	if capabilityFilterStr != "" {
   638  		if capabilityFilterStr != m.lastValidCapabilityFilterStr {
   639  			reg, err := regexp.Compile(capabilityFilterStr)
   640  			if err != nil {
   641  				m.mu.Unlock()
   642  				return httpResult{http.StatusBadRequest, errors.New("provided capability filter is not a valid regular expression")}
   643  			}
   644  			m.lastValidCapabilityFilterStr = capabilityFilterStr
   645  			m.lastValidCapabilityFilterReg = reg
   646  			capabilityFilterReg = reg
   647  		} else {
   648  			capabilityFilterReg = m.lastValidCapabilityFilterReg
   649  		}
   650  	}
   651  
   652  	loadedSandboxes := m.loadSandboxesLocked(ctx)
   653  	numSandboxes := len(loadedSandboxes)
   654  	numSandboxesTotal := m.numSandboxes
   655  	m.mu.Unlock()
   656  
   657  	// metricsMu serializes access to the shared variables below.
   658  	var metricsMu sync.Mutex
   659  
   660  	// Meta-metrics keep track of metrics to export about the metrics server itself.
   661  	type metaMetrics struct {
   662  		numRunningSandboxes      int64
   663  		numCannotExportSandboxes int64
   664  	}
   665  	meta := metaMetrics{}                   // Protected by metricsMu.
   666  	selfMetrics := prometheus.NewSnapshot() // Protected by metricsMu.
   667  
   668  	type snapshotAndOptions struct {
   669  		snapshot *prometheus.Snapshot
   670  		options  prometheus.SnapshotExportOptions
   671  	}
   672  	snapshotCh := make(chan snapshotAndOptions, numSandboxes)
   673  
   674  	queryMultiSandboxMetrics(ctx, loadedSandboxes, metricsFilter, func(r sandboxMetricsResult) {
   675  		metricsMu.Lock()
   676  		defer metricsMu.Unlock()
   677  		selfMetrics.Add(prometheus.LabeledIntData(&SandboxPresenceMetric, nil, 1).SetExternalLabels(r.served.extraLabels))
   678  		sandboxRunning := int64(0)
   679  		if r.isRunning {
   680  			sandboxRunning = 1
   681  			meta.numRunningSandboxes++
   682  		}
   683  		selfMetrics.Add(prometheus.LabeledIntData(&SandboxRunningMetric, nil, sandboxRunning).SetExternalLabels(r.served.extraLabels))
   684  		if r.err == nil {
   685  			selfMetrics.Add(prometheus.LabeledIntData(&SandboxMetadataMetric, r.sandbox.MetricMetadata, 1).SetExternalLabels(r.served.extraLabels))
   686  			for _, cap := range r.served.capabilities {
   687  				if capabilityFilterReg != nil && !capabilityFilterReg.MatchString(cap.String()) && !capabilityFilterReg.MatchString(cap.TrimmedString()) {
   688  					continue
   689  				}
   690  				selfMetrics.Add(prometheus.LabeledIntData(&SandboxCapabilitiesMetric, map[string]string{
   691  					SandboxCapabilitiesMetricLabel: cap.TrimmedString(),
   692  				}, 1).SetExternalLabels(r.served.extraLabels))
   693  			}
   694  			selfMetrics.Add(prometheus.LabeledIntData(&SpecMetadataMetric, r.served.specMetadataLabels, 1).SetExternalLabels(r.served.extraLabels))
   695  			createdAt := float64(r.served.createdAt.Unix()) + (float64(r.served.createdAt.Nanosecond()) / 1e9)
   696  			selfMetrics.Add(prometheus.LabeledFloatData(&SandboxCreationMetric, nil, createdAt).SetExternalLabels(r.served.extraLabels))
   697  		} else {
   698  			// If the sandbox isn't running, it is normal that metrics are not exported for it, so
   699  			// do not report this case as an error.
   700  			if r.isRunning {
   701  				meta.numCannotExportSandboxes++
   702  				log.Warningf("Could not export metrics from sandbox %s: %v", r.served.rootContainerID.SandboxID, r.err)
   703  			}
   704  			return
   705  		}
   706  		snapshotCh <- snapshotAndOptions{
   707  			snapshot: r.snapshot,
   708  			options: prometheus.SnapshotExportOptions{
   709  				ExporterPrefix: m.exporterPrefix,
   710  				ExtraLabels:    r.served.extraLabels,
   711  			},
   712  		}
   713  	})
   714  
   715  	// Build the map of all snapshots we will be rendering.
   716  	snapshotsToOptions := make(map[*prometheus.Snapshot]prometheus.SnapshotExportOptions, numSandboxes+2)
   717  	snapshotsToOptions[selfMetrics] = prometheus.SnapshotExportOptions{
   718  		ExporterPrefix: fmt.Sprintf("%s%s", m.exporterPrefix, prometheus.MetaMetricPrefix),
   719  	}
   720  	processMetrics := prometheus.NewSnapshot()
   721  	processMetrics.Add(prometheus.NewFloatData(&prometheus.ProcessStartTimeSeconds, float64(m.startTime.Unix())+(float64(m.startTime.Nanosecond())/1e9)))
   722  	snapshotsToOptions[processMetrics] = prometheus.SnapshotExportOptions{
   723  		// These metrics must be written without any prefix.
   724  	}
   725  
   726  	// Aggregate all the snapshots from the sandboxes.
   727  	close(snapshotCh)
   728  	for snapshotAndOptions := range snapshotCh {
   729  		snapshotsToOptions[snapshotAndOptions.snapshot] = snapshotAndOptions.options
   730  	}
   731  
   732  	// Add our own metrics.
   733  	selfMetrics.Add(prometheus.NewIntData(&NumRunningSandboxesMetric, meta.numRunningSandboxes))
   734  	selfMetrics.Add(prometheus.NewIntData(&NumCannotExportSandboxesMetric, meta.numCannotExportSandboxes))
   735  	selfMetrics.Add(prometheus.NewIntData(&NumTotalSandboxesMetric, numSandboxesTotal))
   736  
   737  	// Write out all data.
   738  	lastMetricsWrittenSize := int(m.lastMetricsWrittenSize.Load())
   739  	metricsWritten := make(map[string]bool, lastMetricsWrittenSize)
   740  	commentHeader := fmt.Sprintf("Data for runsc metric server exporting data for sandboxes in root directory %s", m.rootDir)
   741  	if metricsFilter != "" {
   742  		commentHeader = fmt.Sprintf("%s (filtered using regular expression: %q)", commentHeader, metricsFilter)
   743  	}
   744  	promWriter := m.promWriterPool.Get().(*prometheus.ReusableWriter[*httpResponseWriter])
   745  	written, err := promWriter.Write(w, prometheus.ExportOptions{
   746  		CommentHeader:  commentHeader,
   747  		MetricsWritten: metricsWritten,
   748  	}, snapshotsToOptions)
   749  	m.promWriterPool.Put(promWriter)
   750  	if err != nil {
   751  		if written == 0 {
   752  			return httpResult{http.StatusServiceUnavailable, err}
   753  		}
   754  		// Note that we cannot return an HTTP error here because we have already started writing a
   755  		// response, which means we've already responded with a 200 OK status code.
   756  		// This probably means the client closed the connection before we could finish writing.
   757  		return httpOK
   758  	}
   759  	if lastMetricsWrittenSize < len(metricsWritten) {
   760  		m.lastMetricsWrittenSize.CompareAndSwap(uint32(lastMetricsWrittenSize), uint32(len(metricsWritten)))
   761  	}
   762  	return httpOK
   763  }
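        // Illustrative sketch, not part of the original file: a scrape request using the two
        // optional query parameters handled above. The host, port and regular expressions are
        // hypothetical.
        //
        //	resp, err := http.Get("http://localhost:1337/metrics?runsc-sandbox-metrics-filter=fs_.*&runsc-capability-filter=NET_.*")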
   764  
   765  // serveHealthCheck serves the healthcheck endpoint.
   766  // Returns a response prefixed by "runsc-metrics:OK" on success.
   767  // Clients can use this to assert that they are talking to the metrics server, as opposed to some
   768  // other random HTTP server.
   769  func (m *metricServer) serveHealthCheck(w *httpResponseWriter, req *http.Request) httpResult {
   770  	m.mu.Lock()
   771  	defer m.mu.Unlock()
   772  	if m.shuttingDown {
   773  		return httpResult{http.StatusServiceUnavailable, errors.New("server is shutting down")}
   774  	}
   775  	if err := req.ParseForm(); err != nil {
   776  		return httpResult{http.StatusBadRequest, err}
   777  	}
   778  	rootDir := req.Form.Get("root")
   779  	if rootDir != m.rootDir {
   780  		return httpResult{http.StatusBadRequest, fmt.Errorf("this metric server is configured to serve root directory: %s", m.rootDir)}
   781  	}
   782  	w.WriteHeader(http.StatusOK)
   783  	w.WriteString("runsc-metrics:OK")
   784  	return httpOK
   785  }
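        // Illustrative sketch, not part of the original file: a healthcheck request must pass the
        // root directory that the client expects this server to be serving. The host, port and
        // root directory are hypothetical. On success, the body starts with "runsc-metrics:OK".
        //
        //	resp, err := http.Get("http://localhost:1337/runsc-metrics/healthcheck?root=/var/run/runsc")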
   786  
   787  // servePID serves the PID of the metric server process.
   788  func (m *metricServer) servePID(w *httpResponseWriter, req *http.Request) httpResult {
   789  	m.mu.Lock()
   790  	defer m.mu.Unlock()
   791  	if m.shuttingDown {
   792  		return httpResult{http.StatusServiceUnavailable, errors.New("server is shutting down")}
   793  	}
   794  	w.WriteString(strconv.Itoa(m.pid))
   795  	return httpOK
   796  }
   797  
   798  // Server is the set of options to run a metric server.
   799  // Initialize this struct and then call Run on it to run the metric server.
   800  type Server struct {
   801  	// Config is the main runsc configuration.
   802  	Config *config.Config
   803  
   804  	// ExporterPrefix is used as the prefix for all metric names, following the Prometheus exporter convention.
   805  	ExporterPrefix string
   806  
   807  	// PIDFile, if set, will cause the metric server to write its own PID to this file after binding
   808  	// to the requested address. The parent directory of this file must already exist.
   809  	PIDFile string
   810  
   811  	// ExposeProfileEndpoints, if true, exposes /runsc-metrics/profile-cpu and
   812  	// /runsc-metrics/profile-heap to get profiling data about the metric server.
   813  	ExposeProfileEndpoints bool
   814  
   815  	// AllowUnknownRoot causes the metric server to keep running regardless of the existence of the
   816  	// Config's root directory or the metric server's ability to access it.
   817  	AllowUnknownRoot bool
   818  }
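        // Illustrative sketch, not part of the original file: starting the metric server from Go
        // code outside this package. The configuration values are hypothetical; Config.MetricServer
        // and Config.RootDir must be populated.
        //
        //	srv := &metricserver.Server{
        //		Config:         conf,
        //		ExporterPrefix: "runsc_",
        //	}
        //	if err := srv.Run(context.Background()); err != nil {
        //		log.Warningf("Metric server exited with error: %v", err)
        //	}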
   819  
   820  // Run runs the metric server.
   821  // It blocks until the server is instructed to exit, e.g. via signal.
   822  func (s *Server) Run(ctx context.Context) error {
   823  	ctx, ctxCancel := context.WithCancel(ctx)
   824  	defer ctxCancel()
   825  
   826  	m := &metricServer{
   827  		exporterPrefix:         s.ExporterPrefix,
   828  		pidFile:                s.PIDFile,
   829  		exposeProfileEndpoints: s.ExposeProfileEndpoints,
   830  		allowUnknownRoot:       s.AllowUnknownRoot,
   831  		promWriterPool: sync.Pool{
   832  			New: func() any {
   833  				return &prometheus.ReusableWriter[*httpResponseWriter]{}
   834  			},
   835  		},
   836  	}
   837  	conf := s.Config
   838  	if conf.MetricServer == "" {
   839  		return errors.New("config does not specify the metric server address (--metric-server)")
   840  	}
   841  	if strings.Contains(conf.MetricServer, "%ID%") {
   842  		return fmt.Errorf("metric server address contains '%%ID%%': %v; this should have been replaced by the parent process", conf.MetricServer)
   843  	}
   844  	if _, err := container.ListSandboxes(conf.RootDir); err != nil {
   845  		if !m.allowUnknownRoot {
   846  			return fmt.Errorf("invalid root directory %q: tried to list sandboxes within it and got: %w", conf.RootDir, err)
   847  		}
   848  		log.Infof("Root directory %q: tried to list sandboxes within it and got: %v. Continuing anyway, as this is expected with --allow-unknown-root.", conf.RootDir, err)
   849  	}
   850  	// container.ListSandboxes uses a glob pattern, which doesn't error out on
   851  	// permission errors. Double-check by actually listing the directory.
   852  	if _, err := ioutil.ReadDir(conf.RootDir); err != nil {
   853  		if !m.allowUnknownRoot {
   854  			return fmt.Errorf("invalid root directory %q: tried to list all entries within it and got: %w", conf.RootDir, err)
   855  		}
   856  		log.Infof("Root directory %q: tried to list all entries within it and got: %v. Continuing anyway, as this is expected with --allow-unknown-root.", conf.RootDir, err)
   857  	}
   858  	m.startTime = time.Now()
   859  	m.rootDir = conf.RootDir
   860  	if strings.Contains(conf.MetricServer, "%RUNTIME_ROOT%") {
   861  		newAddr := strings.ReplaceAll(conf.MetricServer, "%RUNTIME_ROOT%", m.rootDir)
   862  		log.Infof("Metric server address replaced %%RUNTIME_ROOT%%: %q -> %q", conf.MetricServer, newAddr)
   863  		conf.MetricServer = newAddr
   864  	}
   865  	m.address = conf.MetricServer
   866  	m.sandboxes = make(map[container.FullID]*servedSandbox)
   867  	m.lastStateFileStat = make(map[container.FullID]os.FileInfo)
   868  	m.pid = os.Getpid()
   869  	m.shutdownCh = make(chan os.Signal, 1)
   870  	signal.Notify(m.shutdownCh, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)
   871  
   872  	var listener net.Listener
   873  	var listenErr error
   874  	if strings.HasPrefix(conf.MetricServer, fmt.Sprintf("%c", os.PathSeparator)) {
   875  		beforeBindSt, beforeBindErr := os.Stat(conf.MetricServer)
   876  		if listener, listenErr = (&net.ListenConfig{}).Listen(ctx, "unix", conf.MetricServer); listenErr != nil {
   877  			return fmt.Errorf("cannot listen on unix domain socket %q: %w", conf.MetricServer, listenErr)
   878  		}
   879  		afterBindSt, afterBindErr := os.Stat(conf.MetricServer)
   880  		if afterBindErr != nil {
   881  			return fmt.Errorf("cannot stat our own unix domain socket %q: %w", conf.MetricServer, afterBindErr)
   882  		}
   883  		ownUDS := true
   884  		if beforeBindErr == nil && beforeBindSt.Mode() == afterBindSt.Mode() {
   885  			// Socket file existed and was a socket prior to us binding to it.
   886  			if beforeBindSt.Sys() != nil && afterBindSt.Sys() != nil {
   887  				beforeSt, beforeStOk := beforeBindSt.Sys().(*syscall.Stat_t)
   888  				afterSt, afterStOk := afterBindSt.Sys().(*syscall.Stat_t)
   889  				if beforeStOk && afterStOk && beforeSt.Dev == afterSt.Dev && beforeSt.Ino == afterSt.Ino {
   890  					// Socket file is the same before and after binding, so we should not consider ourselves
   891  					// the owner of it.
   892  					ownUDS = false
   893  				}
   894  			}
   895  		}
   896  		if ownUDS {
   897  			log.Infof("Bound on socket file %s which we own. As such, this socket file will be deleted on server shutdown.", conf.MetricServer)
   898  			m.udsPath = conf.MetricServer
   899  			defer os.Remove(m.udsPath)
   900  			os.Chmod(m.udsPath, 0777)
   901  		} else {
   902  			log.Infof("Bound on socket file %s which existed prior to this server's existence. As such, it will not be deleted on server shutdown.", conf.MetricServer)
   903  		}
   904  	} else {
   905  		if strings.HasPrefix(conf.MetricServer, ":") {
   906  			log.Warningf("Binding on all interfaces. This will allow anyone to list all containers on your machine!")
   907  		}
   908  		if listener, listenErr = (&net.ListenConfig{}).Listen(ctx, "tcp", conf.MetricServer); listenErr != nil {
   909  			return fmt.Errorf("cannot listen on TCP address %q: %w", conf.MetricServer, listenErr)
   910  		}
   911  	}
   912  
   913  	mux := http.NewServeMux()
   914  	mux.HandleFunc("/runsc-metrics/healthcheck", logRequest(m.serveHealthCheck))
   915  	mux.HandleFunc("/runsc-metrics/pid", logRequest(m.servePID))
   916  	if m.exposeProfileEndpoints {
   917  		log.Warningf("Profiling HTTP endpoints are exposed; this should only be used for development!")
   918  		mux.HandleFunc("/runsc-metrics/profile-cpu", logRequest(m.profileCPU))
   919  		mux.HandleFunc("/runsc-metrics/profile-heap", logRequest(m.profileHeap))
   920  	} else {
   921  		// Disable memory profiling, since we don't expose it.
   922  		runtime.MemProfileRate = 0
   923  	}
   924  	mux.HandleFunc("/metrics", logRequest(m.serveMetrics))
   925  	mux.HandleFunc("/", logRequest(m.serveIndex))
   926  	m.srv.Handler = mux
   927  	m.srv.ReadTimeout = httpTimeout
   928  	m.srv.WriteTimeout = httpTimeout
   929  	if err := m.startVerifyLoop(ctx); err != nil {
   930  		return fmt.Errorf("cannot start background loop: %w", err)
   931  	}
   932  	if m.pidFile != "" {
   933  		if err := ioutil.WriteFile(m.pidFile, []byte(fmt.Sprintf("%d", m.pid)), 0644); err != nil {
   934  			return fmt.Errorf("cannot write PID to file %q: %w", m.pidFile, err)
   935  		}
   936  		defer os.Remove(m.pidFile)
   937  		log.Infof("Wrote PID %d to file %v.", m.pid, m.pidFile)
   938  	}
   939  
   940  	// If the user has not overridden it via the GOGC environment variable, set the Go GC percentage lower than the default.
   941  	if _, hasEnv := os.LookupEnv("GOGC"); !hasEnv {
   942  		debug.SetGCPercent(40)
   943  	}
   944  
   945  	// Run GC immediately to get rid of all the initialization-related memory bloat and start from
   946  	// a clean slate.
   947  	state.Release()
   948  	runtime.GC()
   949  
   950  	// Initialization complete.
   951  	log.Infof("Server serving on %s for root directory %s.", conf.MetricServer, conf.RootDir)
   952  	serveErr := m.srv.Serve(listener)
   953  	log.Infof("Server has stopped accepting requests.")
   954  	m.mu.Lock()
   955  	defer m.mu.Unlock()
   956  	if serveErr != nil {
   957  		if serveErr == http.ErrServerClosed {
   958  			return nil
   959  		}
   960  		return fmt.Errorf("cannot serve on address %s: %w", conf.MetricServer, serveErr)
   961  	}
   962  	// Per documentation, http.Server.Serve can never return a nil error, so this is not a success.
   963  	return fmt.Errorf("HTTP server Serve() did not return expected error")
   964  }