gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/test/metricclient/metricclient.go (about)

     1  // Copyright 2023 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package metricclient provides utility functions to start, stop, and talk to a metric server.
    16  package metricclient
    17  
    18  import (
    19  	"bytes"
    20  	"context"
    21  	"errors"
    22  	"fmt"
    23  	"io"
    24  	"math"
    25  	"net"
    26  	"net/http"
    27  	"net/url"
    28  	"os"
    29  	"os/exec"
    30  	"strings"
    31  	"syscall"
    32  	"time"
    33  
    34  	"github.com/cenkalti/backoff"
    35  	"github.com/prometheus/common/expfmt"
    36  	"golang.org/x/sys/unix"
    37  	"gvisor.dev/gvisor/pkg/cleanup"
    38  	"gvisor.dev/gvisor/pkg/prometheus"
    39  	"gvisor.dev/gvisor/pkg/sync"
    40  	"gvisor.dev/gvisor/pkg/test/testutil"
    41  	"gvisor.dev/gvisor/runsc/config"
    42  )
    43  
    44  // MetricClient implements an HTTP client that can spawn and connect to a running runsc metrics
    45  // server process and register/unregister sandbox metrics.
    46  type MetricClient struct {
    47  	addr    string
    48  	rootDir string
    49  	dialer  net.Dialer
    50  	client  http.Client
    51  	mu      sync.Mutex
    52  	server  *exec.Cmd
    53  }
    54  
    55  // NewMetricClient creates a new MetricClient that can talk to the metric server at address addr.
    56  func NewMetricClient(addr, rootDir string) *MetricClient {
    57  	c := &MetricClient{
    58  		addr:    strings.ReplaceAll(addr, "%RUNTIME_ROOT%", rootDir),
    59  		rootDir: rootDir,
    60  		dialer: net.Dialer{
    61  			Timeout:   30 * time.Second,
    62  			KeepAlive: 30 * time.Second,
    63  		},
    64  		client: http.Client{
    65  			Transport: &http.Transport{
    66  				// We only talk over the local network, so no need to spend CPU on compression.
    67  				DisableCompression:    true,
    68  				MaxIdleConns:          1,
    69  				IdleConnTimeout:       30 * time.Second,
    70  				ResponseHeaderTimeout: 30 * time.Second,
    71  				ExpectContinueTimeout: 30 * time.Second,
    72  			},
    73  			Timeout: 30 * time.Second,
    74  		},
    75  	}
    76  	// In order to support talking HTTP over Unix domain sockets, we use a custom dialer
    77  	// which knows how to dial the right address.
    78  	// The HTTP address passed as URL to the client is ignored.
    79  	c.client.Transport.(*http.Transport).DialContext = c.dialContext
    80  	return c
    81  }
    82  
    83  // dialContext dials the metric server. It ignores whatever address is given to it.
    84  func (c *MetricClient) dialContext(ctx context.Context, _, _ string) (net.Conn, error) {
    85  	network := "tcp"
    86  	if strings.HasPrefix(c.addr, fmt.Sprintf("%c", os.PathSeparator)) {
    87  		network = "unix"
    88  	}
    89  	return c.dialer.DialContext(ctx, network, c.addr)
    90  }
    91  
    92  // Close closes any idle HTTP connection.
    93  func (c *MetricClient) Close() {
    94  	c.client.CloseIdleConnections()
    95  }
    96  
    97  // req performs an HTTP request against the metrics server.
    98  // It returns an http.Response, and a function to close out the request that should be called when
    99  // the response is no longer necessary.
   100  func (c *MetricClient) req(ctx context.Context, timeout time.Duration, method, endpoint string, params map[string]string) (*http.Response, func(), error) {
   101  	cancelFunc := context.CancelFunc(func() {})
   102  	if timeout != 0 {
   103  		ctx, cancelFunc = context.WithTimeout(ctx, timeout)
   104  	}
   105  	var bodyBytes io.Reader
   106  	var getSuffix string
   107  	if len(params) != 0 {
   108  		switch method {
   109  		case http.MethodGet:
   110  			getParams := url.Values{}
   111  			for k, v := range params {
   112  				getParams.Add(k, v)
   113  			}
   114  			getSuffix = fmt.Sprintf("?%s", getParams.Encode())
   115  		case http.MethodPost:
   116  			values := url.Values{}
   117  			for k, v := range params {
   118  				values.Set(k, v)
   119  			}
   120  			bodyBytes = strings.NewReader(values.Encode())
   121  		default:
   122  			cancelFunc()
   123  			return nil, nil, fmt.Errorf("unsupported method: %v", method)
   124  		}
   125  	}
   126  	req, err := http.NewRequestWithContext(ctx, method, fmt.Sprintf("http://runsc-metrics%s%s", endpoint, getSuffix), bodyBytes)
   127  	if err != nil {
   128  		cancelFunc()
   129  		return nil, nil, fmt.Errorf("cannot create request object: %v", err)
   130  	}
   131  	if method == http.MethodPost {
   132  		req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
   133  	}
   134  	resp, err := c.client.Do(req)
   135  	if err != nil {
   136  		cancelFunc()
   137  		return nil, nil, err
   138  	}
   139  	return resp, func() {
   140  		resp.Body.Close()
   141  		cancelFunc()
   142  	}, err
   143  }
   144  
   145  // HealthCheck pokes the metrics server and checks that it is running.
   146  func (c *MetricClient) HealthCheck(ctx context.Context) error {
   147  	// There are multiple scenarios here:
   148  	//  - The server isn't running. We'll get a "connection failed" error.
   149  	//  - There is an HTTP server bound to the address, but it is not the metric server.
   150  	//    We'll fail the /runsc-metrics/healthcheck request with an HTTP error code.
   151  	//  - There is a server bound to the address, but it is not the metric server and doesn't speak
   152  	//    HTTP. We'll fail the request if that's the case.
   153  	//  - There is a server bound to the address, it is the metric server, but it is not serving the
   154  	//    same root directory. The server will reject the request if that's the case.
   155  	//  - The server is running, and the /runsc-metrics/healthcheck request succeeds.
   156  	//  - The server is running, but it is shutting down. The metrics server will fail the
   157  	//    /runsc-metrics/healthcheck request in this case.
   158  	resp, closeReq, err := c.req(ctx, 5*time.Second, http.MethodPost, "/runsc-metrics/healthcheck", map[string]string{
   159  		"root": c.rootDir,
   160  	})
   161  	if err != nil {
   162  		return err
   163  	}
   164  	defer closeReq()
   165  	var buf bytes.Buffer
   166  	if _, err := buf.ReadFrom(resp.Body); err != nil {
   167  		return err
   168  	}
   169  	if !strings.HasPrefix(buf.String(), "runsc-metrics:OK") {
   170  		return errors.New("server responded to request but not with the expected prefix")
   171  	}
   172  	return nil
   173  }
   174  
   175  // SpawnServer starts a metric server at the expected address.
   176  // It blocks until it responds to healthchecks, or the context expires.
   177  // Fails if the server fails to start or to bind within the context.
   178  // Callers should call ShutdownServer to stop the server.
   179  // A running server must be stopped before a new one can be successfully started.
   180  // baseConf is used for passing other flags to the server, e.g. debug log directory.
   181  func (c *MetricClient) SpawnServer(ctx context.Context, baseConf *config.Config, extraArgs ...string) error {
   182  	metricServerBinPath, err := testutil.FindFile("runsc/cmd/metricserver/metricserver_bin")
   183  	if err != nil {
   184  		return fmt.Errorf("cannot find metricserver_bin: %w", err)
   185  	}
   186  	c.mu.Lock()
   187  	defer c.mu.Unlock()
   188  	if c.server != nil {
   189  		return errors.New("this metric client already has a server associated with it")
   190  	}
   191  	bindCtx, bindCancel := context.WithTimeout(ctx, 20*time.Second)
   192  	defer bindCancel()
   193  	launchBackoff := backoff.WithContext(&backoff.ExponentialBackOff{
   194  		InitialInterval:     time.Millisecond,
   195  		Multiplier:          1.5,
   196  		MaxInterval:         250 * time.Millisecond,
   197  		RandomizationFactor: 0.1,
   198  		Clock:               backoff.SystemClock,
   199  	}, bindCtx)
   200  	// Overriden metric server address with the address this metric client is configured to use.
   201  	// This should be the same but may contain string replacements (e.g. "%ID%").
   202  	overriddenConf := *baseConf
   203  	overriddenConf.MetricServer = c.addr
   204  	overriddenConf.RootDir = c.rootDir
   205  	c.server = exec.Command(metricServerBinPath, overriddenConf.ToFlags()...)
   206  	cu := cleanup.Make(func() {
   207  		c.server = nil
   208  	})
   209  	defer cu.Clean()
   210  	c.server.SysProcAttr = &unix.SysProcAttr{
   211  		// Detach from this session, otherwise cmd will get SIGHUP and SIGCONT
   212  		// when re-parented.
   213  		Setsid: true,
   214  	}
   215  	devnull, err := os.OpenFile(os.DevNull, os.O_RDWR, 0755)
   216  	if err != nil {
   217  		return fmt.Errorf("cannot open devnull at %s: %w", os.DevNull, err)
   218  	}
   219  	defer devnull.Close() // Don't leak file descriptors.
   220  	c.server.Stdin = devnull
   221  	c.server.Stdout = devnull
   222  	c.server.Stderr = devnull
   223  	// Set Args[0] to make easier to spot the sandbox process. Otherwise it's
   224  	// shown as `exe`.
   225  	c.server.Args[0] = "runsc-metrics"
   226  	c.server.Args = append(c.server.Args, "metric-server")
   227  	c.server.Args = append(c.server.Args, extraArgs...)
   228  	if err := c.server.Start(); err != nil {
   229  		return fmt.Errorf("cannot start metrics server: %w", err)
   230  	}
   231  	launchBackoff.Reset()
   232  	for bindCtx.Err() == nil && c.HealthCheck(bindCtx) != nil {
   233  		nextBackoff := launchBackoff.NextBackOff()
   234  		if nextBackoff == backoff.Stop {
   235  			break
   236  		}
   237  		time.Sleep(nextBackoff)
   238  	}
   239  	if err := unix.Kill(c.server.Process.Pid, 0); err != nil {
   240  		return fmt.Errorf("metrics server crashed: %w", c.server.Wait())
   241  	}
   242  	if bindCtx.Err() != nil {
   243  		return fmt.Errorf("metrics server did not bind to %s in time: %w", c.addr, bindCtx.Err())
   244  	}
   245  	cu.Release()
   246  	return nil
   247  }
   248  
   249  // ShutdownServer asks the metrics server to shut itself down.
   250  // It blocks until the server process has exitted or the context expires.
   251  func (c *MetricClient) ShutdownServer(ctx context.Context) error {
   252  	c.mu.Lock()
   253  	defer c.mu.Unlock()
   254  	if c.server == nil {
   255  		return errors.New("server not started")
   256  	}
   257  	c.Close()
   258  	// The server will shut itself down ASAP after it gets SIGTERM.
   259  	if err := c.server.Process.Signal(syscall.SIGTERM); err != nil {
   260  		return fmt.Errorf("cannot send signal to metrics server: %w", err)
   261  	}
   262  	// Wait for the process to exit.
   263  	if err := c.server.Wait(); err != nil {
   264  		// When used in tests that use testutil.Reaper, it's possible that the metric server
   265  		// has already been reaped by it. In this case, do not treat this as an error.
   266  		if strings.Contains(err.Error(), "no child process") {
   267  			c.server = nil
   268  			return nil
   269  		}
   270  		return fmt.Errorf("failed to wait for metrics server to exit: %w", err)
   271  	}
   272  	c.server = nil
   273  	return nil
   274  }
   275  
   276  // MetricData is the raw contents returned by GetMetrics, with helper functions
   277  // to extract single values out of it.
   278  type MetricData string
   279  
   280  // GetMetrics returns the raw Prometheus-formatted metric data from the metric server.
   281  // `urlParams` may contain a special parameter with the empty string as the key.
   282  // If this is set, that string is used to override the request path from its default
   283  // value of `/metrics`.
   284  func (c *MetricClient) GetMetrics(ctx context.Context, urlParams map[string]string) (MetricData, error) {
   285  	path := "/metrics"
   286  	if overridePath, found := urlParams[""]; found {
   287  		path = overridePath
   288  		delete(urlParams, "")
   289  	}
   290  	resp, closeReq, err := c.req(ctx, 10*time.Second, http.MethodGet, path, urlParams)
   291  	if err != nil {
   292  		return "", fmt.Errorf("cannot get /metrics: %v", err)
   293  	}
   294  	defer closeReq()
   295  	var buf bytes.Buffer
   296  	if _, err := buf.ReadFrom(resp.Body); err != nil {
   297  		return "", fmt.Errorf("cannot read from response body: %v", err)
   298  	}
   299  	return MetricData(buf.String()), nil
   300  }
   301  
   302  // GetPrometheusInteger returns the integer value of a Prometheus metric with given name and labels.
   303  func (m MetricData) GetPrometheusInteger(metricName string, wantLabels map[string]string) (int64, time.Time, error) {
   304  	// Parse raw Prometheus-formatted data.
   305  	var buf bytes.Buffer
   306  	buf.WriteString(string(m))
   307  	parsed, err := (&expfmt.TextParser{}).TextToMetricFamilies(&buf)
   308  	if err != nil {
   309  		return 0, time.Time{}, err
   310  	}
   311  	// See if there is any data for the given metric name.
   312  	metricData, found := parsed[metricName]
   313  	if !found {
   314  		return 0, time.Time{}, fmt.Errorf("metric %q not found", metricName)
   315  	}
   316  	// See if we can find exactly one data point for which the labels match `wantLabels`.
   317  	// foundIndex is the index within `metricData.Metric` of the most-recently-found data point
   318  	// that matches `wantLabels`.
   319  	foundIndex := -1
   320  	for i, data := range metricData.GetMetric() {
   321  		// Convert data.Label (which is a list of key-value tuples) into a Go map.
   322  		dataLabels := make(map[string]string, len(data.GetLabel()))
   323  		for _, label := range data.GetLabel() {
   324  			dataLabels[label.GetName()] = label.GetValue()
   325  		}
   326  		// Check if `wantLabels` is a subset of `dataLabels`.
   327  		allMatching := true
   328  		for wantLabel, wantValue := range wantLabels {
   329  			if dataLabels[wantLabel] != wantValue {
   330  				allMatching = false
   331  				break
   332  			}
   333  		}
   334  		if !allMatching {
   335  			// This data point is for a different label combination than the one we want.
   336  			continue
   337  		}
   338  		// Record the index at which we found this data point within `metricData.Metric`.
   339  		// If this index isn't -1, this means we found multiple such indexes.
   340  		// This could happen if the metric has multiple data points with `wantLabels` + an
   341  		// additional label which isn't in `wantLabels` and which takes on multiple distinct
   342  		// values. This function doesn't support retrieving data for such cases.
   343  		if foundIndex != -1 {
   344  			return 0, time.Time{}, fmt.Errorf("found multiple metric data matching requested labels %v", wantLabels)
   345  		}
   346  		foundIndex = i
   347  	}
   348  	if foundIndex == -1 {
   349  		return 0, time.Time{}, fmt.Errorf("no metric data matching requested labels %v", wantLabels)
   350  	}
   351  	// We've found exactly one data point.
   352  	data := metricData.GetMetric()[foundIndex]
   353  	// Convert the value of this data point to an int regardless of its underlying Prometheus type.
   354  	var floatValue float64
   355  	if data.GetCounter() != nil && data.GetCounter().Value != nil {
   356  		floatValue = data.GetCounter().GetValue()
   357  	} else if data.GetGauge() != nil && data.GetGauge().Value != nil {
   358  		floatValue = data.GetGauge().GetValue()
   359  	} else {
   360  		return 0, time.Time{}, fmt.Errorf("metric is not numerical: %v", data)
   361  	}
   362  	if math.Floor(floatValue) != floatValue {
   363  		return 0, time.Time{}, fmt.Errorf("value %v cannot be rounded to an integer", floatValue)
   364  	}
   365  	return int64(math.Floor(floatValue)), time.UnixMilli(data.GetTimestampMs()), nil
   366  }
   367  
   368  // WantMetric designates the metadata required to select a single metric from a single sandbox.
   369  type WantMetric struct {
   370  	// Metric is the name of the metric to get.
   371  	Metric string
   372  	// Sandbox is the ID of the sandbox to look up the metric for.
   373  	Sandbox string
   374  	// Pod and Namespace are the pod and namespace labels associated with the sandbox.
   375  	// Leave empty if the sandbox metadata doesn't contain this information.
   376  	Pod, Namespace string
   377  	// ExtraLabels are additional key-value labels that must match.
   378  	ExtraLabels map[string]string
   379  }
   380  
   381  // GetPrometheusContainerInteger returns the integer value of a Prometheus metric from the
   382  // given WantMetric data.
   383  func (m MetricData) GetPrometheusContainerInteger(want WantMetric) (int64, time.Time, error) {
   384  	labels := map[string]string{
   385  		"sandbox": want.Sandbox,
   386  	}
   387  	if want.Pod != "" {
   388  		labels["pod_name"] = want.Pod
   389  	}
   390  	if want.Namespace != "" {
   391  		labels["namespace_name"] = want.Namespace
   392  	}
   393  	for k, v := range want.ExtraLabels {
   394  		labels[k] = v
   395  	}
   396  	return m.GetPrometheusInteger(want.Metric, labels)
   397  }
   398  
   399  // GetSandboxMetadataMetric returns the labels attached to the metadata metric for a given sandbox.
   400  func (m MetricData) GetSandboxMetadataMetric(want WantMetric) (map[string]string, error) {
   401  	var buf bytes.Buffer
   402  	buf.WriteString(string(m))
   403  	parsed, err := (&expfmt.TextParser{}).TextToMetricFamilies(&buf)
   404  	if err != nil {
   405  		return nil, err
   406  	}
   407  	metricData, found := parsed[want.Metric]
   408  	if !found {
   409  		return nil, fmt.Errorf("metric %q not found", want.Metric)
   410  	}
   411  	foundIndex := -1
   412  	for i, data := range metricData.GetMetric() {
   413  		dataLabels := make(map[string]string, len(data.GetLabel()))
   414  		for _, label := range data.GetLabel() {
   415  			dataLabels[label.GetName()] = label.GetValue()
   416  		}
   417  		allMatching := true
   418  		for wantLabel, wantValue := range map[string]string{
   419  			prometheus.SandboxIDLabel: want.Sandbox,
   420  			prometheus.NamespaceLabel: want.Namespace,
   421  			prometheus.PodNameLabel:   want.Pod,
   422  		} {
   423  			if dataLabels[wantLabel] != wantValue {
   424  				allMatching = false
   425  				break
   426  			}
   427  		}
   428  		if allMatching {
   429  			if foundIndex != -1 {
   430  				return nil, errors.New("found multiple metadata metrics matching requested labels")
   431  			}
   432  			foundIndex = i
   433  		}
   434  	}
   435  	if foundIndex == -1 {
   436  		return nil, errors.New("no metadata metric matching requested labels")
   437  	}
   438  	data := metricData.GetMetric()[foundIndex]
   439  	metadataLabels := make(map[string]string, len(data.GetLabel()))
   440  	for _, label := range data.GetLabel() {
   441  		if label.GetName() == prometheus.SandboxIDLabel || label.GetName() == prometheus.NamespaceLabel || label.GetName() == prometheus.PodNameLabel {
   442  			continue
   443  		}
   444  		metadataLabels[label.GetName()] = label.GetValue()
   445  	}
   446  	return metadataLabels, nil
   447  }