github.com/ttpreport/gvisor-ligolo@v0.0.0-20240123134145-a858404967ba/runsc/metricserver/metricserver.go

     1  // Copyright 2023 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package metricserver implements a Prometheus metric server for runsc data.
    16  package metricserver
    17  
    18  import (
    19  	"context"
    20  	"errors"
    21  	"fmt"
    22  	"io"
    23  	"io/ioutil"
    24  	"math/rand"
    25  	"net"
    26  	"net/http"
    27  	"os"
    28  	"os/signal"
    29  	"regexp"
    30  	"runtime"
    31  	"runtime/debug"
    32  	"strconv"
    33  	"strings"
    34  	"syscall"
    35  	"time"
    36  
    37  	"github.com/ttpreport/gvisor-ligolo/pkg/abi/linux"
    38  	"github.com/ttpreport/gvisor-ligolo/pkg/atomicbitops"
    39  	"github.com/ttpreport/gvisor-ligolo/pkg/log"
    40  	"github.com/ttpreport/gvisor-ligolo/pkg/prometheus"
    41  	"github.com/ttpreport/gvisor-ligolo/pkg/sentry/control"
    42  	"github.com/ttpreport/gvisor-ligolo/pkg/state"
    43  	"github.com/ttpreport/gvisor-ligolo/pkg/sync"
    44  	"github.com/ttpreport/gvisor-ligolo/runsc/config"
    45  	"github.com/ttpreport/gvisor-ligolo/runsc/container"
    46  	"github.com/ttpreport/gvisor-ligolo/runsc/sandbox"
    47  )
    48  
    49  const (
    50  	// metricsExportTimeout is the maximum amount of time that the metrics export process should take.
    51  	metricsExportTimeout = 30 * time.Second
    52  
    53  	// metricsExportPerSandboxTimeout is the maximum amount of time that we wait on any individual
    54  	// sandbox when exporting its metrics.
    55  	metricsExportPerSandboxTimeout = 8 * time.Second
    56  
    57  	// exportParallelGoroutines is the maximum number of goroutines spawned during metrics export.
    58  	exportParallelGoroutines = 8
    59  )
    60  
    61  // servedSandbox is a sandbox that we serve metrics from.
    62  // A single metrics server will export data about multiple sandboxes.
    63  type servedSandbox struct {
    64  	rootContainerID container.FullID
    65  	server          *metricServer
    66  	extraLabels     map[string]string
    67  
    68  	// mu protects the fields below.
    69  	mu sync.Mutex
    70  
    71  	// sandbox is the sandbox being monitored.
    72  	// Once set, it is immutable.
    73  	sandbox *sandbox.Sandbox
    74  
    75  	// createdAt stores the time the sandbox was created.
    76  	// It is loaded from the container state file.
    77  	// Once set, it is immutable.
    78  	createdAt time.Time
    79  
    80  	// capabilities is the union of the capability set of the containers within `sandbox`.
    81  	// It is used to export a per-sandbox metric representing which capabilities are in use.
    82  	// For monitoring purposes, a capability added in a container means it is considered
    83  	// added for the whole sandbox.
    84  	capabilities []linux.Capability
    85  
    86  	// specMetadataLabels is the set of labels exported as part of the
    87  	// `spec_metadata` metric.
    88  	specMetadataLabels map[string]string
    89  
    90  	// verifier allows verifying the data integrity of the metrics we get from this sandbox.
    91  	// It is not always initialized when the sandbox is discovered, but rather upon first metrics
    92  	// access to the sandbox. Metric registration data is loaded from the root container's
    93  	// state file.
    94  	// The server needs to load this registration data before any data from this sandbox is
    95  	// served to HTTP clients. If there is no metric registration data within the Container
    96  	// data, then metrics were not requested for this sandbox, and this servedSandbox should
    97  	// be deleted from the server.
    98  	// Once set, it is immutable.
    99  	verifier *prometheus.Verifier
   100  
   101  	// cleanupVerifier holds a reference to the cleanup function of the verifier.
   102  	cleanupVerifier func()
   103  
   104  	// extra contains additional per-sandbox data.
   105  	extra sandboxData
   106  }
   107  
   108  // load loads the sandbox being monitored and initializes its metric verifier.
   109  // If it returns an error other than container.ErrStateFileLocked, the sandbox is either
   110  // non-existent, or has not requested instrumentation to be enabled, or does not have
   111  // valid metric registration data. In any of these cases, the sandbox should be removed
   112  // from this metrics server.
   113  func (s *servedSandbox) load() (*sandbox.Sandbox, *prometheus.Verifier, error) {
   114  	s.mu.Lock()
   115  	defer s.mu.Unlock()
   116  	if s.sandbox == nil {
   117  		allContainers, err := container.LoadSandbox(s.server.rootDir, s.rootContainerID.SandboxID, container.LoadOpts{
   118  			TryLock: container.TryAcquire,
   119  		})
   120  		if err != nil {
   121  			return nil, nil, fmt.Errorf("cannot load sandbox %q: %v", s.rootContainerID.SandboxID, err)
   122  		}
   123  		var rootContainer *container.Container
   124  		for _, cont := range allContainers {
   125  			if cont.IsSandboxRoot() {
   126  				if rootContainer != nil {
   127  					return nil, nil, fmt.Errorf("multiple root containers found for sandbox ID %q: %v and %v", s.rootContainerID.SandboxID, cont, rootContainer)
   128  				}
   129  				rootContainer = cont
   130  			}
   131  		}
   132  		if rootContainer == nil {
   133  			return nil, nil, fmt.Errorf("no root container found for sandbox ID %q", s.rootContainerID.SandboxID)
   134  		}
   135  		sandboxMetricAddr := strings.ReplaceAll(rootContainer.Sandbox.MetricServerAddress, "%RUNTIME_ROOT%", s.server.rootDir)
   136  		if sandboxMetricAddr == "" {
   137  			return nil, nil, errors.New("sandbox did not request instrumentation")
   138  		}
   139  		if sandboxMetricAddr != s.server.address {
   140  			return nil, nil, fmt.Errorf("sandbox requested instrumentation by a metric server running at a different address (sandbox wants %q, this metric server serves %q)", sandboxMetricAddr, s.server.address)
   141  		}
   142  		// Update label data as read from the state file.
   143  		// Do not store empty labels.
   144  		authoritativeLabels, err := SandboxPrometheusLabels(rootContainer)
   145  		if err != nil {
   146  			return nil, nil, fmt.Errorf("cannot compute Prometheus labels of sandbox: %v", err)
   147  		}
   148  		s.extraLabels = make(map[string]string, len(authoritativeLabels))
   149  		for _, label := range []string{
   150  			prometheus.SandboxIDLabel,
   151  			prometheus.IterationIDLabel,
   152  			prometheus.PodNameLabel,
   153  			prometheus.NamespaceLabel,
   154  		} {
   155  			s.extraLabels[label] = authoritativeLabels[label]
   156  			if s.extraLabels[label] == "" {
   157  				delete(s.extraLabels, label)
   158  			}
   159  		}
   160  
   161  		// Compute capability set.
   162  		allCaps := linux.AllCapabilities()
   163  		capSet := make([]linux.Capability, 0, len(allCaps))
   164  		for _, cap := range allCaps {
   165  			for _, cont := range allContainers {
   166  				if cont.HasCapabilityInAnySet(cap) {
   167  					capSet = append(capSet, cap)
   168  					break
   169  				}
   170  			}
   171  		}
   172  		if len(capSet) > 0 {
   173  			// Reallocate a slice with minimum size, since it will be long-lived.
   174  			s.capabilities = make([]linux.Capability, len(capSet))
   175  			for i, c := range capSet {
   176  				s.capabilities[i] = c
   177  			}
   178  		}
   179  
   180  		// Compute spec metadata.
   181  		s.specMetadataLabels = ComputeSpecMetadata(allContainers)
   182  
   183  		s.sandbox = rootContainer.Sandbox
   184  		s.createdAt = rootContainer.CreatedAt
   185  	}
   186  	if s.verifier == nil {
   187  		registeredMetrics, err := s.sandbox.GetRegisteredMetrics()
   188  		if err != nil {
   189  			return nil, nil, err
   190  		}
   191  		verifier, cleanup, err := prometheus.NewVerifier(registeredMetrics)
   192  		if err != nil {
   193  			return nil, nil, err
   194  		}
   195  		s.verifier = verifier
   196  		s.cleanupVerifier = cleanup
   197  	}
   198  	if err := s.extra.load(s); err != nil {
   199  		return nil, nil, err
   200  	}
   201  	return s.sandbox, s.verifier, nil
   202  }
   203  
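        // cleanup releases the resources held by this servedSandbox, notably the metric verifier's
        // cleanup callback if load previously initialized one. It is safe to call even if load was
        // never called or returned an error.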
   204  func (s *servedSandbox) cleanup() {
   205  	s.mu.Lock()
   206  	defer s.mu.Unlock()
   207  	if s.cleanupVerifier != nil {
   208  		s.cleanupVerifier()
   209  	}
   210  }
   211  
   212  // querySandboxMetrics queries the sandbox for metrics data.
   213  func querySandboxMetrics(ctx context.Context, sand *sandbox.Sandbox, verifier *prometheus.Verifier, metricsFilter string) (*prometheus.Snapshot, error) {
   214  	ch := make(chan struct {
   215  		snapshot *prometheus.Snapshot
   216  		err      error
   217  	}, 1)
   218  	canceled := make(chan struct{}, 1)
   219  	defer close(canceled)
   220  	go func() {
   221  		snapshot, err := sand.ExportMetrics(control.MetricsExportOpts{
   222  			OnlyMetrics: metricsFilter,
   223  		})
   224  		select {
   225  		case <-canceled:
   226  		case ch <- struct {
   227  			snapshot *prometheus.Snapshot
   228  			err      error
   229  		}{snapshot, err}:
   230  			close(ch)
   231  		}
   232  	}()
   233  	select {
   234  	case <-ctx.Done():
   235  		canceled <- struct{}{}
   236  		return nil, ctx.Err()
   237  	case ret := <-ch:
   238  		if ret.err != nil {
   239  			return nil, ret.err
   240  		}
   241  		if err := verifier.Verify(ret.snapshot); err != nil {
   242  			return nil, err
   243  		}
   244  		return ret.snapshot, nil
   245  	}
   246  }
   247  
   248  // metricServer implements the metric server.
   249  type metricServer struct {
   250  	rootDir                string
   251  	pid                    int
   252  	pidFile                string
   253  	allowUnknownRoot       bool
   254  	exposeProfileEndpoints bool
   255  	address                string
   256  	exporterPrefix         string
   257  	startTime              time.Time
   258  	srv                    http.Server
   259  
   260  	// Size of the map of written metrics during the last /metrics export. Initially zero.
   261  	// Used to efficiently reallocate a map of the right size during the next export.
   262  	lastMetricsWrittenSize atomicbitops.Uint32
   263  
   264  	// mu protects the fields below.
   265  	mu sync.Mutex
   266  
   267  	// udsPath is a path to a Unix Domain Socket file on which the server is bound and which it owns.
   268  	// This socket file will be deleted on server shutdown.
   269  	// This field is not set when binding to a network port, nor when the UDS already existed prior
   270  	// to us binding to it (i.e. we do not own it), in which case it is not deleted on shutdown.
   271  	// The field is unset once the file is successfully removed.
   272  	udsPath string
   273  
   274  	// sandboxes is the list of sandboxes we serve metrics for.
   275  	sandboxes map[container.FullID]*servedSandbox
   276  
   277  	// lastStateFileStat maps container full IDs to the last observed stat() of their state file.
   278  	// This is used to monitor for sandboxes in the background. If a sandbox's state file matches this
   279  	// info, we can assume that the last background scan already looked at it.
   280  	lastStateFileStat map[container.FullID]os.FileInfo
   281  
   282  	// lastValidMetricFilter stores the last value of the "runsc-sandbox-metrics-filter" parameter for
   283  	// /metrics requests.
   284  	// It represents the last-known compilable regular expression that was passed to /metrics.
   285  	// It is used to avoid re-verifying this parameter in the common case where a single scraper
   286  	// is consistently passing in the same value for this parameter in each successive request.
   287  	lastValidMetricFilter string
   288  
   289  	// lastValidCapabilityFilterStr stores the last value of the "runsc-capability-filter" parameter
   290  	// for /metrics requests.
   291  	// It represents the last-known compilable regular expression that was passed to /metrics.
   292  	// It is used to avoid re-verifying this parameter in the common case where a single scraper
   293  	// is consistently passing in the same value for this parameter in each successive request.
   294  	lastValidCapabilityFilterStr string
   295  
   296  	// lastValidCapabilityFilterReg is the compiled regular expression corresponding to
   297  	// lastValidCapabilityFilterStr.
   298  	lastValidCapabilityFilterReg *regexp.Regexp
   299  
   300  	// numSandboxes counts the number of sandboxes that have ever been registered on this server.
   301  	// Used to distinguish between the case where this metrics server has sat there doing nothing
   302  	// because no sandbox ever registered against it (which is unexpected), vs the case where it has
   303  	// done a good job serving sandbox metrics and it's time for it to gracefully die as there are no
   304  	// more sandboxes to serve.
   305  	// Also exported as a metric of total number of sandboxes started.
   306  	numSandboxes int64
   307  
   308  	// shuttingDown is flipped to true when the server shutdown process has started.
   309  	// Used to deal with race conditions where a sandbox is trying to register after the server has
   310  	// already started to go to sleep.
   311  	shuttingDown bool
   312  
   313  	// shutdownCh is written to when receiving the signal to shut down gracefully.
   314  	shutdownCh chan os.Signal
   315  
   316  	// extraData contains additional server-wide data.
   317  	// extra contains additional server-wide data.
   318  }
   319  
   320  // sufficientlyEqualStats returns whether the given FileInfos are sufficiently
   321  // equal to assume the file they represent has not changed between the time
   322  // each FileInfo was obtained.
   323  func sufficientlyEqualStats(s1, s2 os.FileInfo) bool {
   324  	if !s1.ModTime().Equal(s2.ModTime()) {
   325  		return false
   326  	}
   327  	if s1.Size() != s2.Size() {
   328  		return false
   329  	}
   330  	statT1, ok1 := s1.Sys().(*syscall.Stat_t)
   331  	statT2, ok2 := s2.Sys().(*syscall.Stat_t)
   332  	if ok1 != ok2 {
   333  		return false
   334  	}
   335  	if ok1 && ok2 {
   336  		if statT1.Dev != statT2.Dev {
   337  			return false
   338  		}
   339  		if statT1.Ino != statT2.Ino {
   340  			return false
   341  		}
   342  	}
   343  	return true
   344  }
   345  
   346  // refreshSandboxesLocked removes sandboxes that are no longer running from m.sandboxes, and
   347  // adds sandboxes found in the root directory that do request instrumentation.
   348  // Preconditions: m.mu is locked.
   349  func (m *metricServer) refreshSandboxesLocked() {
   350  	if m.shuttingDown {
   351  		// Do nothing to avoid log spam.
   352  		return
   353  	}
   354  	sandboxIDs, err := container.ListSandboxes(m.rootDir)
   355  	if err != nil {
   356  		if !m.allowUnknownRoot {
   357  			log.Warningf("Cannot list containers in root directory %s; it has likely gone away: %v.", m.rootDir, err)
   358  		}
   359  		return
   360  	}
   361  	for sandboxID, sandbox := range m.sandboxes {
   362  		found := false
   363  		for _, sid := range sandboxIDs {
   364  			if sid == sandboxID {
   365  				found = true
   366  				break
   367  			}
   368  		}
   369  		if !found {
   370  			log.Warningf("Sandbox %s no longer exists but did not explicitly unregister. Removing it.", sandboxID)
   371  			sandbox.cleanup()
   372  			delete(m.sandboxes, sandboxID)
   373  			continue
   374  		}
   375  		if _, _, err := sandbox.load(); err != nil && err != container.ErrStateFileLocked {
   376  			log.Warningf("Sandbox %s cannot be loaded, deleting it: %v", sandboxID, err)
   377  			sandbox.cleanup()
   378  			delete(m.sandboxes, sandboxID)
   379  			continue
   380  		}
   381  		if !sandbox.sandbox.IsRunning() {
   382  			log.Infof("Sandbox %s is no longer running, deleting it.", sandboxID)
   383  			sandbox.cleanup()
   384  			delete(m.sandboxes, sandboxID)
   385  			continue
   386  		}
   387  	}
   388  	newSandboxIDs := make(map[container.FullID]bool, len(sandboxIDs))
   389  	for _, sid := range sandboxIDs {
   390  		if _, found := m.sandboxes[sid]; found {
   391  			continue
   392  		}
   393  		newSandboxIDs[sid] = true
   394  	}
   395  	for sid := range m.lastStateFileStat {
   396  		if _, found := newSandboxIDs[sid]; !found {
   397  			delete(m.lastStateFileStat, sid)
   398  		}
   399  	}
   400  	for sid := range newSandboxIDs {
   401  		stateFile := container.StateFile{
   402  			RootDir: m.rootDir,
   403  			ID:      sid,
   404  		}
   405  		stat, err := stateFile.Stat()
   406  		if err != nil {
   407  			log.Warningf("Failed to stat() container state file for sandbox %q: %v", sid, err)
   408  			continue
   409  		}
   410  		if existing, found := m.lastStateFileStat[sid]; found {
   411  			// We already tried to stat this sandbox but decided not to pick it up.
   412  			// Check if the state file changed since. If it didn't, we don't want to
   413  			// try again.
   414  			if sufficientlyEqualStats(existing, stat) {
   415  				continue
   416  			}
   417  			log.Infof("State file for sandbox %q has changed since we last looked at it; will try to reload it.", sid)
   418  			delete(m.lastStateFileStat, sid)
   419  		}
   420  		// If we get here, then either we have never seen this sandbox before, or we saw it
   421  		// but it has since dropped out of our records (which makes it new again in this
   422  		// iteration), or we saw it before but its state file has changed. Either way, we want
   423  		// to try loading it and see if it wants instrumentation.
   424  		cont, err := container.Load(m.rootDir, sid, container.LoadOpts{
   425  			Exact:         true,
   426  			SkipCheck:     true,
   427  			TryLock:       container.TryAcquire,
   428  			RootContainer: true,
   429  		})
   430  		if err != nil {
   431  			if err == container.ErrStateFileLocked {
   432  				// This error is OK and shouldn't generate log spam. The sandbox is probably in the middle
   433  				// of being created.
   434  				continue
   435  			}
   436  			log.Warningf("Cannot load state file for sandbox %q: %v", sid, err)
   437  			continue
   438  		}
   439  
   440  		// This is redundant with one of the checks performed below in servedSandbox.load, but this
   441  		// avoids log spam for the non-error case of sandboxes that didn't request instrumentation.
   442  		sandboxMetricAddr := strings.ReplaceAll(cont.Sandbox.MetricServerAddress, "%RUNTIME_ROOT%", m.rootDir)
   443  		if sandboxMetricAddr != m.address {
   444  			m.lastStateFileStat[sid] = stat
   445  			continue
   446  		}
   447  
   448  		// This case can be hit when there is a leftover state file for a sandbox that was `kill -9`'d
   449  		// without an opportunity for it to clean up its state file. This results in a valid state file
   450  		// but the sandbox PID is gone. We don't want to continuously load this sandbox's state file.
   451  		if cont.Status == container.Running && !cont.Sandbox.IsRunning() {
   452  			log.Warningf("Sandbox %q has state file in state Running, yet it isn't actually running. Ignoring it.", sid)
   453  			m.lastStateFileStat[sid] = stat
   454  			continue
   455  		}
   456  
   457  		m.numSandboxes++
   458  		served := &servedSandbox{
   459  			rootContainerID: sid,
   460  			server:          m,
   461  			extraLabels: map[string]string{
   462  				prometheus.SandboxIDLabel: sid.SandboxID,
   463  			},
   464  		}
   465  		// Best-effort attempt to load the state file instantly.
   466  		// This may legitimately fail if it is locked, e.g. during sandbox startup.
   467  		// If it fails for any other reason, then the sandbox went away between the time we listed the
   468  		// sandboxes and now, so just delete it.
   469  		if _, _, err := served.load(); err != nil && err != container.ErrStateFileLocked {
   470  			log.Warningf("Sandbox %q cannot be loaded, ignoring it: %v", sid, err)
   471  			m.lastStateFileStat[sid] = stat
   472  			served.cleanup()
   473  			continue
   474  		}
   475  		m.sandboxes[sid] = served
   476  		log.Infof("Registered new sandbox found in root directory: %q", sid)
   477  	}
   478  }
   479  
   480  // sandboxLoadResult contains the outcome of calling `load` on a `servedSandbox`.
   481  // It is used as an intermediary type that contains all that we know about a
   482  // sandbox after attempting to load its state file, but does not contain any
   483  // metric data from the sandbox.
   484  type sandboxLoadResult struct {
   485  	served   *servedSandbox
   486  	sandbox  *sandbox.Sandbox
   487  	verifier *prometheus.Verifier
   488  	err      error
   489  }
   490  
   491  // loadSandboxesLocked loads the state file data from all known sandboxes.
   492  // It does so in parallel, and avoids reloading sandboxes for which we have
   493  // already loaded data.
   494  func (m *metricServer) loadSandboxesLocked(ctx context.Context) []sandboxLoadResult {
   495  	m.refreshSandboxesLocked()
   496  
   497  	numGoroutines := exportParallelGoroutines
   498  	numSandboxes := len(m.sandboxes)
   499  	if numSandboxes < numGoroutines {
   500  		numGoroutines = numSandboxes
   501  	}
   502  
   503  	// First, load all the sandboxes in parallel. We need to do this while m.mu is held.
   504  	loadSandboxCh := make(chan *servedSandbox, numSandboxes)
   505  	loadedSandboxesCh := make(chan sandboxLoadResult, numSandboxes)
   506  	loadedSandboxes := make([]sandboxLoadResult, 0, numSandboxes)
   507  	for i := 0; i < numGoroutines; i++ {
   508  		go func() {
   509  			for served := range loadSandboxCh {
   510  				sand, verifier, err := served.load()
   511  				loadedSandboxesCh <- sandboxLoadResult{served, sand, verifier, err}
   512  			}
   513  		}()
   514  	}
   515  	for _, sandbox := range m.sandboxes {
   516  		loadSandboxCh <- sandbox
   517  	}
   518  	close(loadSandboxCh)
   519  	for i := 0; i < numSandboxes; i++ {
   520  		loadedSandboxes = append(loadedSandboxes, <-loadedSandboxesCh)
   521  	}
   522  	close(loadedSandboxesCh)
   523  	return loadedSandboxes
   524  }
   525  
   526  // sandboxMetricsResult is the result of calling querySandboxMetrics on a
   527  // single sandbox. It contains all of `sandboxLoadResult` but also has current
   528  // metric data (if querying metrics from the sandbox process succeeded).
   529  type sandboxMetricsResult struct {
   530  	sandboxLoadResult
   531  	isRunning bool
   532  	snapshot  *prometheus.Snapshot
   533  	err       error
   534  }
   535  
   536  // queryMultiSandboxMetrics queries metric data from multiple loaded sandboxes.
   537  // It does so in parallel and with random permutation ordering.
   538  // Only metrics matching the `metricsFilter` regular expression are queried.
   539  // For each sandbox, whether we were successful in querying its metrics or
   540  // not, the `processSandbox` function is called. This may be done in parallel,
   541  // so `processSandbox` should do its own locking so that multiple parallel
   542  // instances of itself behave appropriately.
   543  func queryMultiSandboxMetrics(ctx context.Context, loadedSandboxes []sandboxLoadResult, metricsFilter string, processSandbox func(sandboxMetricsResult)) {
   544  	numSandboxes := len(loadedSandboxes)
   545  	ctxDeadline, ok := ctx.Deadline()
   546  	if !ok {
   547  		panic("context had no deadline, this should never happen as it was created with a timeout")
   548  	}
   549  	exportStartTime := time.Now()
   550  	requestTimeLeft := ctxDeadline.Sub(exportStartTime)
   551  	perSandboxTime := requestTimeLeft
   552  	if numSandboxes != 0 {
   553  		perSandboxTime = requestTimeLeft / time.Duration(numSandboxes)
   554  	}
   555  	if perSandboxTime < metricsExportPerSandboxTimeout {
   556  		perSandboxTime = metricsExportPerSandboxTimeout
   557  	}
   558  	loadedSandboxCh := make(chan sandboxLoadResult, numSandboxes)
   559  	var wg sync.WaitGroup
   560  	numGoroutines := exportParallelGoroutines
   561  	if numSandboxes < numGoroutines {
   562  		numGoroutines = numSandboxes
   563  	}
   564  	wg.Add(numGoroutines)
   565  	for i := 0; i < numGoroutines; i++ {
   566  		go func() {
   567  			defer wg.Done()
   568  			for s := range loadedSandboxCh {
   569  				isRunning := false
   570  				var snapshot *prometheus.Snapshot
   571  				err := s.err
   572  				if err == nil {
   573  					queryCtx, queryCtxCancel := context.WithTimeout(ctx, perSandboxTime)
   574  					snapshot, err = querySandboxMetrics(queryCtx, s.sandbox, s.verifier, metricsFilter)
   575  					queryCtxCancel()
   576  					isRunning = s.sandbox.IsRunning()
   577  				}
   578  				processSandbox(sandboxMetricsResult{
   579  					sandboxLoadResult: s,
   580  					isRunning:         isRunning,
   581  					snapshot:          snapshot,
   582  					err:               err,
   583  				})
   584  			}
   585  		}()
   586  	}
   587  	// Iterate over all sandboxes.
   588  	// Important: This must be done in random order.
   589  	// A malicious/compromised sandbox may decide to stall when being asked for metrics.
   590  	// If at least `numGoroutines` sandboxes do this, this will starve other sandboxes
   591  	// from having their metrics exported, because all the goroutines will be stuck on
   592  	// the stalled sandboxes.
   593  	// One way to completely avoid this would be to spawn one goroutine per
   594  	// sandbox, but this can amount to ~hundreds of goroutines, which is not desirable
   595  	// for the metrics server.
   596  	// Another way would be to have a very strict timeout on each sandbox's export
   597  	// process, but in some cases a busy sandbox will take more than a decisecond
   598  	// or so to export its data, so this would miss some data from legitimate (but
   599  	// slow) sandboxes.
   600  	// Instead, we take a middle-of-the-road approach: we use a timeout that's not
   601  	// too strict but still ensures we make forward progress away from stalled
   602  	// sandboxes, and we also iterate across sandboxes in a different random order at
   603  	// each export. This ensures that all sandboxes eventually get a fair chance of
   604  	// being part of the "first `numGoroutines` sandboxes in line" to get their
   605  	// metric data loaded, such that a client repeatedly scraping metrics will
   606  	// eventually get data from each sandbox.
   607  	for _, sandboxIndex := range rand.Perm(len(loadedSandboxes)) {
   608  		loadedSandboxCh <- loadedSandboxes[sandboxIndex]
   609  	}
   610  	close(loadedSandboxCh)
   611  	wg.Wait()
   612  }
   613  
   614  // serveMetrics serves metrics requests.
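        // Two optional query parameters act as filters; both must be valid regular expressions, and an
        // invalid value yields a 400 response. "runsc-sandbox-metrics-filter" restricts which sandbox
        // metrics are exported, and "runsc-capability-filter" restricts which capability labels are
        // emitted. An illustrative scrape URL combining both:
        //
        //	/metrics?runsc-sandbox-metrics-filter=fs_.*&runsc-capability-filter=CAP_NET_.*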
   615  func (m *metricServer) serveMetrics(w http.ResponseWriter, req *http.Request) httpResult {
   616  	ctx, ctxCancel := context.WithTimeout(req.Context(), metricsExportTimeout)
   617  	defer ctxCancel()
   618  
   619  	metricsFilter := req.URL.Query().Get("runsc-sandbox-metrics-filter")
   620  	var capabilityFilterReg *regexp.Regexp
   621  	capabilityFilterStr := req.URL.Query().Get("runsc-capability-filter")
   622  
   623  	m.mu.Lock()
   624  
   625  	if metricsFilter != "" && metricsFilter != m.lastValidMetricFilter {
   626  		_, err := regexp.Compile(metricsFilter)
   627  		if err != nil {
   628  			m.mu.Unlock()
   629  			return httpResult{http.StatusBadRequest, errors.New("provided metric filter is not a valid regular expression")}
   630  		}
   631  		m.lastValidMetricFilter = metricsFilter
   632  	}
   633  	if capabilityFilterStr != "" {
   634  		if capabilityFilterStr != m.lastValidCapabilityFilterStr {
   635  			reg, err := regexp.Compile(capabilityFilterStr)
   636  			if err != nil {
   637  				m.mu.Unlock()
   638  				return httpResult{http.StatusBadRequest, errors.New("provided capability filter is not a valid regular expression")}
   639  			}
   640  			m.lastValidCapabilityFilterStr = capabilityFilterStr
   641  			m.lastValidCapabilityFilterReg = reg
   642  			capabilityFilterReg = reg
   643  		} else {
   644  			capabilityFilterReg = m.lastValidCapabilityFilterReg
   645  		}
   646  	}
   647  
   648  	loadedSandboxes := m.loadSandboxesLocked(ctx)
   649  	numSandboxes := len(loadedSandboxes)
   650  	numSandboxesTotal := m.numSandboxes
   651  	m.mu.Unlock()
   652  
   653  	// Used to serialize access to the shared variables below across goroutines.
   654  	var metricsMu sync.Mutex
   655  
   656  	// Meta-metrics keep track of metrics to export about the metrics server itself.
   657  	type metaMetrics struct {
   658  		numRunningSandboxes      int64
   659  		numCannotExportSandboxes int64
   660  	}
   661  	meta := metaMetrics{}                   // Protected by metricsMu.
   662  	selfMetrics := prometheus.NewSnapshot() // Protected by metricsMu.
   663  
   664  	type snapshotAndOptions struct {
   665  		snapshot *prometheus.Snapshot
   666  		options  prometheus.SnapshotExportOptions
   667  	}
   668  	snapshotCh := make(chan snapshotAndOptions, numSandboxes)
   669  
   670  	queryMultiSandboxMetrics(ctx, loadedSandboxes, metricsFilter, func(r sandboxMetricsResult) {
   671  		metricsMu.Lock()
   672  		defer metricsMu.Unlock()
   673  		selfMetrics.Add(prometheus.LabeledIntData(&SandboxPresenceMetric, nil, 1).SetExternalLabels(r.served.extraLabels))
   674  		sandboxRunning := int64(0)
   675  		if r.isRunning {
   676  			sandboxRunning = 1
   677  			meta.numRunningSandboxes++
   678  		}
   679  		selfMetrics.Add(prometheus.LabeledIntData(&SandboxRunningMetric, nil, sandboxRunning).SetExternalLabels(r.served.extraLabels))
   680  		if r.err == nil {
   681  			selfMetrics.Add(prometheus.LabeledIntData(&SandboxMetadataMetric, r.sandbox.MetricMetadata, 1).SetExternalLabels(r.served.extraLabels))
   682  			for _, cap := range r.served.capabilities {
   683  				if capabilityFilterReg != nil && !capabilityFilterReg.MatchString(cap.String()) && !capabilityFilterReg.MatchString(cap.TrimmedString()) {
   684  					continue
   685  				}
   686  				selfMetrics.Add(prometheus.LabeledIntData(&SandboxCapabilitiesMetric, map[string]string{
   687  					SandboxCapabilitiesMetricLabel: cap.TrimmedString(),
   688  				}, 1).SetExternalLabels(r.served.extraLabels))
   689  			}
   690  			selfMetrics.Add(prometheus.LabeledIntData(&SpecMetadataMetric, r.served.specMetadataLabels, 1).SetExternalLabels(r.served.extraLabels))
   691  			createdAt := float64(r.served.createdAt.Unix()) + (float64(r.served.createdAt.Nanosecond()) / 1e9)
   692  			selfMetrics.Add(prometheus.LabeledFloatData(&SandboxCreationMetric, nil, createdAt).SetExternalLabels(r.served.extraLabels))
   693  		} else {
   694  			// If the sandbox isn't running, it is normal that metrics are not exported for it, so
   695  			// do not report this case as an error.
   696  			if r.isRunning {
   697  				meta.numCannotExportSandboxes++
   698  				log.Warningf("Could not export metrics from sandbox %s: %v", r.served.rootContainerID.SandboxID, r.err)
   699  			}
   700  			return
   701  		}
   702  		snapshotCh <- snapshotAndOptions{
   703  			snapshot: r.snapshot,
   704  			options: prometheus.SnapshotExportOptions{
   705  				ExporterPrefix: m.exporterPrefix,
   706  				ExtraLabels:    r.served.extraLabels,
   707  			},
   708  		}
   709  	})
   710  
   711  	// Build the map of all snapshots we will be rendering.
   712  	snapshotsToOptions := make(map[*prometheus.Snapshot]prometheus.SnapshotExportOptions, numSandboxes+2)
   713  	snapshotsToOptions[selfMetrics] = prometheus.SnapshotExportOptions{
   714  		ExporterPrefix: fmt.Sprintf("%s%s", m.exporterPrefix, prometheus.MetaMetricPrefix),
   715  	}
   716  	processMetrics := prometheus.NewSnapshot()
   717  	processMetrics.Add(prometheus.NewFloatData(&prometheus.ProcessStartTimeSeconds, float64(m.startTime.Unix())+(float64(m.startTime.Nanosecond())/1e9)))
   718  	snapshotsToOptions[processMetrics] = prometheus.SnapshotExportOptions{
   719  		// These metrics must be written without any prefix.
   720  	}
   721  
   722  	// Aggregate all the snapshots from the sandboxes.
   723  	close(snapshotCh)
   724  	for snapshotAndOptions := range snapshotCh {
   725  		snapshotsToOptions[snapshotAndOptions.snapshot] = snapshotAndOptions.options
   726  	}
   727  
   728  	// Add our own metrics.
   729  	selfMetrics.Add(prometheus.NewIntData(&NumRunningSandboxesMetric, meta.numRunningSandboxes))
   730  	selfMetrics.Add(prometheus.NewIntData(&NumCannotExportSandboxesMetric, meta.numCannotExportSandboxes))
   731  	selfMetrics.Add(prometheus.NewIntData(&NumTotalSandboxesMetric, numSandboxesTotal))
   732  
   733  	// Write out all data.
   734  	lastMetricsWrittenSize := int(m.lastMetricsWrittenSize.Load())
   735  	metricsWritten := make(map[string]bool, lastMetricsWrittenSize)
   736  	commentHeader := fmt.Sprintf("Data for runsc metric server exporting data for sandboxes in root directory %s", m.rootDir)
   737  	if metricsFilter != "" {
   738  		commentHeader = fmt.Sprintf("%s (filtered using regular expression: %q)", commentHeader, metricsFilter)
   739  	}
   740  	written, err := prometheus.Write(w, prometheus.ExportOptions{
   741  		CommentHeader:  commentHeader,
   742  		MetricsWritten: metricsWritten,
   743  	}, snapshotsToOptions)
   744  	if err != nil {
   745  		if written == 0 {
   746  			return httpResult{http.StatusServiceUnavailable, err}
   747  		}
   748  		// Note that we cannot return an HTTP error here because we have already started writing a
   749  		// response, which means we've already responded with a 200 OK status code.
   750  		// This probably means the client closed the connection before we could finish writing.
   751  		return httpOK
   752  	}
   753  	if lastMetricsWrittenSize < len(metricsWritten) {
   754  		m.lastMetricsWrittenSize.CompareAndSwap(uint32(lastMetricsWrittenSize), uint32(len(metricsWritten)))
   755  	}
   756  	return httpOK
   757  }
   758  
   759  // serveHealthCheck serves the healthcheck endpoint.
   760  // Returns a response prefixed by "runsc-metrics:OK" on success.
   761  // Clients can use this to assert that they are talking to the metrics server, as opposed to some
   762  // other random HTTP server.
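        // The request must carry this server's root directory as the "root" form value, e.g.
        // (illustrative path only):
        //
        //	GET /runsc-metrics/healthcheck?root=/var/run/runsc
        //
        // A missing or mismatched root yields a 400 response.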
   763  func (m *metricServer) serveHealthCheck(w http.ResponseWriter, req *http.Request) httpResult {
   764  	m.mu.Lock()
   765  	defer m.mu.Unlock()
   766  	if m.shuttingDown {
   767  		return httpResult{http.StatusServiceUnavailable, errors.New("server is shutting down")}
   768  	}
   769  	if err := req.ParseForm(); err != nil {
   770  		return httpResult{http.StatusBadRequest, err}
   771  	}
   772  	rootDir := req.Form.Get("root")
   773  	if rootDir != m.rootDir {
   774  		return httpResult{http.StatusBadRequest, fmt.Errorf("this metric server is configured to serve root directory: %s", m.rootDir)}
   775  	}
   776  	w.WriteHeader(http.StatusOK)
   777  	io.WriteString(w, "runsc-metrics:OK")
   778  	return httpOK
   779  }
   780  
   781  // servePID serves the PID of the metric server process.
   782  func (m *metricServer) servePID(w http.ResponseWriter, req *http.Request) httpResult {
   783  	m.mu.Lock()
   784  	defer m.mu.Unlock()
   785  	if m.shuttingDown {
   786  		return httpResult{http.StatusServiceUnavailable, errors.New("server is shutting down")}
   787  	}
   788  	io.WriteString(w, strconv.Itoa(m.pid))
   789  	return httpOK
   790  }
   791  
   792  // Server is the set of options to run a metric server.
   793  // Initialize this struct and then call Run on it to run the metric server.
   794  type Server struct {
   795  	// Config is the main runsc configuration.
   796  	Config *config.Config
   797  
   798  	// ExporterPrefix is used as a prefix for all metric names, following the Prometheus exporter convention.
   799  	ExporterPrefix string
   800  
   801  	// PIDFile, if set, will cause the metric server to write its own PID to this file after binding
   802  	// to the requested address. The parent directory of this file must already exist.
   803  	PIDFile string
   804  
   805  	// ExposeProfileEndpoints, if true, exposes /runsc-metrics/profile-cpu and
   806  	// /runsc-metrics/profile-heap to get profiling data about the metric server.
   807  	ExposeProfileEndpoints bool
   808  
   809  	// AllowUnknownRoot causes the metric server to keep running regardless of the existence of the
   810  	// Config's root directory or the metric server's ability to access it.
   811  	AllowUnknownRoot bool
   812  }
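
        // A caller might wire this up roughly as follows (a minimal sketch; the configuration values
        // are illustrative and assumed to be populated elsewhere):
        //
        //	srv := &metricserver.Server{
        //		Config:         conf, // *config.Config with MetricServer and RootDir set
        //		ExporterPrefix: "runsc_",
        //	}
        //	if err := srv.Run(context.Background()); err != nil {
        //		log.Warningf("metric server exited: %v", err)
        //	}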
   813  
   814  // Run runs the metric server.
   815  // It blocks until the server is instructed to exit, e.g. via signal.
   816  func (s *Server) Run(ctx context.Context) error {
   817  	ctx, ctxCancel := context.WithCancel(ctx)
   818  	defer ctxCancel()
   819  
   820  	m := &metricServer{
   821  		exporterPrefix:         s.ExporterPrefix,
   822  		pidFile:                s.PIDFile,
   823  		exposeProfileEndpoints: s.ExposeProfileEndpoints,
   824  		allowUnknownRoot:       s.AllowUnknownRoot,
   825  	}
   826  	conf := s.Config
   827  	if conf.MetricServer == "" {
   828  		return errors.New("config does not specify the metric server address (--metric-server)")
   829  	}
   830  	if strings.Contains(conf.MetricServer, "%ID%") {
   831  		return fmt.Errorf("metric server address contains '%%ID%%': %v; this should have been replaced by the parent process", conf.MetricServer)
   832  	}
   833  	if _, err := container.ListSandboxes(conf.RootDir); err != nil {
   834  		if !m.allowUnknownRoot {
   835  			return fmt.Errorf("invalid root directory %q: tried to list sandboxes within it and got: %w", conf.RootDir, err)
   836  		}
   837  		log.Warningf("Invalid root directory %q: tried to list sandboxes within it and got: %v. Continuing anyway, as the server is configured to tolerate this.", conf.RootDir, err)
   838  	}
   839  	// container.ListSandboxes uses a glob pattern, which doesn't error out on
   840  	// permission errors. Double-check by actually listing the directory.
   841  	if _, err := ioutil.ReadDir(conf.RootDir); err != nil {
   842  		if !m.allowUnknownRoot {
   843  			return fmt.Errorf("invalid root directory %q: tried to list all entries within it and got: %w", conf.RootDir, err)
   844  		}
   845  		log.Warningf("Invalid root directory %q: tried to list all entries within it and got: %v. Continuing anyway, as the server is configured to tolerate this.", conf.RootDir, err)
   846  	}
   847  	m.startTime = time.Now()
   848  	m.rootDir = conf.RootDir
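        	// A literal "%RUNTIME_ROOT%" placeholder in the configured address resolves to the root
        	// directory; illustratively, "%RUNTIME_ROOT%/metrics.sock" becomes "<rootDir>/metrics.sock".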
   849  	if strings.Contains(conf.MetricServer, "%RUNTIME_ROOT%") {
   850  		newAddr := strings.ReplaceAll(conf.MetricServer, "%RUNTIME_ROOT%", m.rootDir)
   851  		log.Infof("Metric server address replaced %%RUNTIME_ROOT%%: %q -> %q", conf.MetricServer, newAddr)
   852  		conf.MetricServer = newAddr
   853  	}
   854  	m.address = conf.MetricServer
   855  	m.sandboxes = make(map[container.FullID]*servedSandbox)
   856  	m.lastStateFileStat = make(map[container.FullID]os.FileInfo)
   857  	m.pid = os.Getpid()
   858  	m.shutdownCh = make(chan os.Signal, 1)
   859  	signal.Notify(m.shutdownCh, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)
   860  
   861  	var listener net.Listener
   862  	var listenErr error
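        	// Addresses that start with a path separator are treated as Unix domain socket paths;
        	// anything else is treated as a TCP listen address.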
   863  	if strings.HasPrefix(conf.MetricServer, fmt.Sprintf("%c", os.PathSeparator)) {
   864  		beforeBindSt, beforeBindErr := os.Stat(conf.MetricServer)
   865  		if listener, listenErr = (&net.ListenConfig{}).Listen(ctx, "unix", conf.MetricServer); listenErr != nil {
   866  			return fmt.Errorf("cannot listen on unix domain socket %q: %w", conf.MetricServer, listenErr)
   867  		}
   868  		afterBindSt, afterBindErr := os.Stat(conf.MetricServer)
   869  		if afterBindErr != nil {
   870  			return fmt.Errorf("cannot stat our own unix domain socket %q: %w", conf.MetricServer, afterBindErr)
   871  		}
   872  		ownUDS := true
   873  		if beforeBindErr == nil && beforeBindSt.Mode() == afterBindSt.Mode() {
   874  			// Socket file existed and was a socket prior to us binding to it.
   875  			if beforeBindSt.Sys() != nil && afterBindSt.Sys() != nil {
   876  				beforeSt, beforeStOk := beforeBindSt.Sys().(*syscall.Stat_t)
   877  				afterSt, afterStOk := afterBindSt.Sys().(*syscall.Stat_t)
   878  				if beforeStOk && afterStOk && beforeSt.Dev == afterSt.Dev && beforeSt.Ino == afterSt.Ino {
   879  					// Socket file is the same before and after binding, so we should not consider ourselves
   880  					// the owner of it.
   881  					ownUDS = false
   882  				}
   883  			}
   884  		}
   885  		if ownUDS {
   886  			log.Infof("Bound on socket file %s which we own. As such, this socket file will be deleted on server shutdown.", conf.MetricServer)
   887  			m.udsPath = conf.MetricServer
   888  			defer os.Remove(m.udsPath)
   889  			os.Chmod(m.udsPath, 0777)
   890  		} else {
   891  			log.Infof("Bound on socket file %s which existed prior to this server's existence. As such, it will not be deleted on server shutdown.", conf.MetricServer)
   892  		}
   893  	} else {
   894  		if strings.HasPrefix(conf.MetricServer, ":") {
   895  			log.Warningf("Binding on all interfaces. This will allow anyone to list all containers on your machine!")
   896  		}
   897  		if listener, listenErr = (&net.ListenConfig{}).Listen(ctx, "tcp", conf.MetricServer); listenErr != nil {
   898  			return fmt.Errorf("cannot listen on TCP address %q: %w", conf.MetricServer, listenErr)
   899  		}
   900  	}
   901  
   902  	mux := http.NewServeMux()
   903  	mux.HandleFunc("/runsc-metrics/healthcheck", logRequest(m.serveHealthCheck))
   904  	mux.HandleFunc("/runsc-metrics/pid", logRequest(m.servePID))
   905  	if m.exposeProfileEndpoints {
   906  		log.Warningf("Profiling HTTP endpoints are exposed; this should only be used for development!")
   907  		mux.HandleFunc("/runsc-metrics/profile-cpu", logRequest(m.profileCPU))
   908  		mux.HandleFunc("/runsc-metrics/profile-heap", logRequest(m.profileHeap))
   909  	} else {
   910  		// Disable memory profiling, since we don't expose it.
   911  		runtime.MemProfileRate = 0
   912  	}
   913  	mux.HandleFunc("/metrics", logRequest(m.serveMetrics))
   914  	mux.HandleFunc("/", logRequest(m.serveIndex))
   915  	m.srv.Handler = mux
   916  	m.srv.ReadTimeout = httpTimeout
   917  	m.srv.WriteTimeout = httpTimeout
   918  	if err := m.startVerifyLoop(ctx); err != nil {
   919  		return fmt.Errorf("cannot start background loop: %w", err)
   920  	}
   921  	if m.pidFile != "" {
   922  		if err := ioutil.WriteFile(m.pidFile, []byte(fmt.Sprintf("%d", m.pid)), 0644); err != nil {
   923  			return fmt.Errorf("cannot write PID to file %q: %w", m.pidFile, err)
   924  		}
   925  		defer os.Remove(m.pidFile)
   926  		log.Infof("Wrote PID %d to file %v.", m.pid, m.pidFile)
   927  	}
   928  
   929  	// If not modified by the user from the environment, set the Go GC percentage lower than default.
   930  	if _, hasEnv := os.LookupEnv("GOGC"); !hasEnv {
   931  		debug.SetGCPercent(40)
   932  	}
   933  
   934  	// Run GC immediately to get rid of all the initialization-related memory bloat and start from
   935  	// a clean slate.
   936  	state.Release()
   937  	runtime.GC()
   938  
   939  	// Initialization complete.
   940  	log.Infof("Server serving on %s for root directory %s.", conf.MetricServer, conf.RootDir)
   941  	serveErr := m.srv.Serve(listener)
   942  	log.Infof("Server has stopped accepting requests.")
   943  	m.mu.Lock()
   944  	defer m.mu.Unlock()
   945  	if serveErr != nil {
   946  		if serveErr == http.ErrServerClosed {
   947  			return nil
   948  		}
   949  		return fmt.Errorf("cannot serve on address %s: %w", conf.MetricServer, serveErr)
   950  	}
   951  	// Per documentation, http.Server.Serve can never return a nil error, so this is not a success.
   952  	return fmt.Errorf("HTTP server Serve() did not return expected error")
   953  }