// Copyright 2023 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package metricclient provides utility functions to start, stop, and talk to a metric server.
package metricclient

import (
	"bytes"
	"context"
	"errors"
	"fmt"
	"io"
	"math"
	"net"
	"net/http"
	"net/url"
	"os"
	"os/exec"
	"strings"
	"syscall"
	"time"

	"github.com/cenkalti/backoff"
	"github.com/prometheus/common/expfmt"
	"golang.org/x/sys/unix"
	"gvisor.dev/gvisor/pkg/cleanup"
	"gvisor.dev/gvisor/pkg/prometheus"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/test/testutil"
	"gvisor.dev/gvisor/runsc/config"
)

// MetricClient implements an HTTP client that can spawn and connect to a running runsc metrics
// server process and register/unregister sandbox metrics.
type MetricClient struct {
	// addr is the address of the metric server. dialContext treats it as a Unix
	// domain socket path when it starts with the OS path separator, and as a TCP
	// address otherwise.
	addr string
	// rootDir is the runtime root directory. It is sent as the "root" parameter
	// of healthcheck requests and passed as RootDir to a server spawned by
	// SpawnServer.
	rootDir string
	// dialer is used by dialContext to open connections to addr.
	dialer net.Dialer
	// client is the HTTP client used for all requests. Its transport dials
	// through dialContext, so the host part of request URLs is ignored.
	client http.Client
	// mu is held by SpawnServer and ShutdownServer while they read or modify server.
	mu sync.Mutex
	// server is the metric server process started by SpawnServer, or nil if this
	// client has no server associated with it.
	server *exec.Cmd
}

// NewMetricClient creates a new MetricClient that can talk to the metric server at address addr.
56 func NewMetricClient(addr, rootDir string) *MetricClient { 57 c := &MetricClient{ 58 addr: strings.ReplaceAll(addr, "%RUNTIME_ROOT%", rootDir), 59 rootDir: rootDir, 60 dialer: net.Dialer{ 61 Timeout: 30 * time.Second, 62 KeepAlive: 30 * time.Second, 63 }, 64 client: http.Client{ 65 Transport: &http.Transport{ 66 // We only talk over the local network, so no need to spend CPU on compression. 67 DisableCompression: true, 68 MaxIdleConns: 1, 69 IdleConnTimeout: 30 * time.Second, 70 ResponseHeaderTimeout: 30 * time.Second, 71 ExpectContinueTimeout: 30 * time.Second, 72 }, 73 Timeout: 30 * time.Second, 74 }, 75 } 76 // In order to support talking HTTP over Unix domain sockets, we use a custom dialer 77 // which knows how to dial the right address. 78 // The HTTP address passed as URL to the client is ignored. 79 c.client.Transport.(*http.Transport).DialContext = c.dialContext 80 return c 81 } 82 83 // dialContext dials the metric server. It ignores whatever address is given to it. 84 func (c *MetricClient) dialContext(ctx context.Context, _, _ string) (net.Conn, error) { 85 network := "tcp" 86 if strings.HasPrefix(c.addr, fmt.Sprintf("%c", os.PathSeparator)) { 87 network = "unix" 88 } 89 return c.dialer.DialContext(ctx, network, c.addr) 90 } 91 92 // Close closes any idle HTTP connection. 93 func (c *MetricClient) Close() { 94 c.client.CloseIdleConnections() 95 } 96 97 // req performs an HTTP request against the metrics server. 98 // It returns an http.Response, and a function to close out the request that should be called when 99 // the response is no longer necessary. 
100 func (c *MetricClient) req(ctx context.Context, timeout time.Duration, method, endpoint string, params map[string]string) (*http.Response, func(), error) { 101 cancelFunc := context.CancelFunc(func() {}) 102 if timeout != 0 { 103 ctx, cancelFunc = context.WithTimeout(ctx, timeout) 104 } 105 var bodyBytes io.Reader 106 var getSuffix string 107 if len(params) != 0 { 108 switch method { 109 case http.MethodGet: 110 getParams := url.Values{} 111 for k, v := range params { 112 getParams.Add(k, v) 113 } 114 getSuffix = fmt.Sprintf("?%s", getParams.Encode()) 115 case http.MethodPost: 116 values := url.Values{} 117 for k, v := range params { 118 values.Set(k, v) 119 } 120 bodyBytes = strings.NewReader(values.Encode()) 121 default: 122 cancelFunc() 123 return nil, nil, fmt.Errorf("unsupported method: %v", method) 124 } 125 } 126 req, err := http.NewRequestWithContext(ctx, method, fmt.Sprintf("http://runsc-metrics%s%s", endpoint, getSuffix), bodyBytes) 127 if err != nil { 128 cancelFunc() 129 return nil, nil, fmt.Errorf("cannot create request object: %v", err) 130 } 131 if method == http.MethodPost { 132 req.Header.Set("Content-Type", "application/x-www-form-urlencoded") 133 } 134 resp, err := c.client.Do(req) 135 if err != nil { 136 cancelFunc() 137 return nil, nil, err 138 } 139 return resp, func() { 140 resp.Body.Close() 141 cancelFunc() 142 }, err 143 } 144 145 // HealthCheck pokes the metrics server and checks that it is running. 146 func (c *MetricClient) HealthCheck(ctx context.Context) error { 147 // There are multiple scenarios here: 148 // - The server isn't running. We'll get a "connection failed" error. 149 // - There is an HTTP server bound to the address, but it is not the metric server. 150 // We'll fail the /runsc-metrics/healthcheck request with an HTTP error code. 151 // - There is a server bound to the address, but it is not the metric server and doesn't speak 152 // HTTP. We'll fail the request if that's the case. 
153 // - There is a server bound to the address, it is the metric server, but it is not serving the 154 // same root directory. The server will reject the request if that's the case. 155 // - The server is running, and the /runsc-metrics/healthcheck request succeeds. 156 // - The server is running, but it is shutting down. The metrics server will fail the 157 // /runsc-metrics/healthcheck request in this case. 158 resp, closeReq, err := c.req(ctx, 5*time.Second, http.MethodPost, "/runsc-metrics/healthcheck", map[string]string{ 159 "root": c.rootDir, 160 }) 161 if err != nil { 162 return err 163 } 164 defer closeReq() 165 var buf bytes.Buffer 166 if _, err := buf.ReadFrom(resp.Body); err != nil { 167 return err 168 } 169 if !strings.HasPrefix(buf.String(), "runsc-metrics:OK") { 170 return errors.New("server responded to request but not with the expected prefix") 171 } 172 return nil 173 } 174 175 // SpawnServer starts a metric server at the expected address. 176 // It blocks until it responds to healthchecks, or the context expires. 177 // Fails if the server fails to start or to bind within the context. 178 // Callers should call ShutdownServer to stop the server. 179 // A running server must be stopped before a new one can be successfully started. 180 // baseConf is used for passing other flags to the server, e.g. debug log directory. 
181 func (c *MetricClient) SpawnServer(ctx context.Context, baseConf *config.Config, extraArgs ...string) error { 182 metricServerBinPath, err := testutil.FindFile("runsc/cmd/metricserver/metricserver_bin") 183 if err != nil { 184 return fmt.Errorf("cannot find metricserver_bin: %w", err) 185 } 186 c.mu.Lock() 187 defer c.mu.Unlock() 188 if c.server != nil { 189 return errors.New("this metric client already has a server associated with it") 190 } 191 bindCtx, bindCancel := context.WithTimeout(ctx, 20*time.Second) 192 defer bindCancel() 193 launchBackoff := backoff.WithContext(&backoff.ExponentialBackOff{ 194 InitialInterval: time.Millisecond, 195 Multiplier: 1.5, 196 MaxInterval: 250 * time.Millisecond, 197 RandomizationFactor: 0.1, 198 Clock: backoff.SystemClock, 199 }, bindCtx) 200 // Overriden metric server address with the address this metric client is configured to use. 201 // This should be the same but may contain string replacements (e.g. "%ID%"). 202 overriddenConf := *baseConf 203 overriddenConf.MetricServer = c.addr 204 overriddenConf.RootDir = c.rootDir 205 c.server = exec.Command(metricServerBinPath, overriddenConf.ToFlags()...) 206 cu := cleanup.Make(func() { 207 c.server = nil 208 }) 209 defer cu.Clean() 210 c.server.SysProcAttr = &unix.SysProcAttr{ 211 // Detach from this session, otherwise cmd will get SIGHUP and SIGCONT 212 // when re-parented. 213 Setsid: true, 214 } 215 devnull, err := os.OpenFile(os.DevNull, os.O_RDWR, 0755) 216 if err != nil { 217 return fmt.Errorf("cannot open devnull at %s: %w", os.DevNull, err) 218 } 219 defer devnull.Close() // Don't leak file descriptors. 220 c.server.Stdin = devnull 221 c.server.Stdout = devnull 222 c.server.Stderr = devnull 223 // Set Args[0] to make easier to spot the sandbox process. Otherwise it's 224 // shown as `exe`. 225 c.server.Args[0] = "runsc-metrics" 226 c.server.Args = append(c.server.Args, "metric-server") 227 c.server.Args = append(c.server.Args, extraArgs...) 
228 if err := c.server.Start(); err != nil { 229 return fmt.Errorf("cannot start metrics server: %w", err) 230 } 231 launchBackoff.Reset() 232 for bindCtx.Err() == nil && c.HealthCheck(bindCtx) != nil { 233 nextBackoff := launchBackoff.NextBackOff() 234 if nextBackoff == backoff.Stop { 235 break 236 } 237 time.Sleep(nextBackoff) 238 } 239 if err := unix.Kill(c.server.Process.Pid, 0); err != nil { 240 return fmt.Errorf("metrics server crashed: %w", c.server.Wait()) 241 } 242 if bindCtx.Err() != nil { 243 return fmt.Errorf("metrics server did not bind to %s in time: %w", c.addr, bindCtx.Err()) 244 } 245 cu.Release() 246 return nil 247 } 248 249 // ShutdownServer asks the metrics server to shut itself down. 250 // It blocks until the server process has exitted or the context expires. 251 func (c *MetricClient) ShutdownServer(ctx context.Context) error { 252 c.mu.Lock() 253 defer c.mu.Unlock() 254 if c.server == nil { 255 return errors.New("server not started") 256 } 257 c.Close() 258 // The server will shut itself down ASAP after it gets SIGTERM. 259 if err := c.server.Process.Signal(syscall.SIGTERM); err != nil { 260 return fmt.Errorf("cannot send signal to metrics server: %w", err) 261 } 262 // Wait for the process to exit. 263 if err := c.server.Wait(); err != nil { 264 // When used in tests that use testutil.Reaper, it's possible that the metric server 265 // has already been reaped by it. In this case, do not treat this as an error. 266 if strings.Contains(err.Error(), "no child process") { 267 c.server = nil 268 return nil 269 } 270 return fmt.Errorf("failed to wait for metrics server to exit: %w", err) 271 } 272 c.server = nil 273 return nil 274 } 275 276 // MetricData is the raw contents returned by GetMetrics, with helper functions 277 // to extract single values out of it. 278 type MetricData string 279 280 // GetMetrics returns the raw Prometheus-formatted metric data from the metric server. 
281 // `urlParams` may contain a special parameter with the empty string as the key. 282 // If this is set, that string is used to override the request path from its default 283 // value of `/metrics`. 284 func (c *MetricClient) GetMetrics(ctx context.Context, urlParams map[string]string) (MetricData, error) { 285 path := "/metrics" 286 if overridePath, found := urlParams[""]; found { 287 path = overridePath 288 delete(urlParams, "") 289 } 290 resp, closeReq, err := c.req(ctx, 10*time.Second, http.MethodGet, path, urlParams) 291 if err != nil { 292 return "", fmt.Errorf("cannot get /metrics: %v", err) 293 } 294 defer closeReq() 295 var buf bytes.Buffer 296 if _, err := buf.ReadFrom(resp.Body); err != nil { 297 return "", fmt.Errorf("cannot read from response body: %v", err) 298 } 299 return MetricData(buf.String()), nil 300 } 301 302 // GetPrometheusInteger returns the integer value of a Prometheus metric with given name and labels. 303 func (m MetricData) GetPrometheusInteger(metricName string, wantLabels map[string]string) (int64, time.Time, error) { 304 // Parse raw Prometheus-formatted data. 305 var buf bytes.Buffer 306 buf.WriteString(string(m)) 307 parsed, err := (&expfmt.TextParser{}).TextToMetricFamilies(&buf) 308 if err != nil { 309 return 0, time.Time{}, err 310 } 311 // See if there is any data for the given metric name. 312 metricData, found := parsed[metricName] 313 if !found { 314 return 0, time.Time{}, fmt.Errorf("metric %q not found", metricName) 315 } 316 // See if we can find exactly one data point for which the labels match `wantLabels`. 317 // foundIndex is the index within `metricData.Metric` of the most-recently-found data point 318 // that matches `wantLabels`. 319 foundIndex := -1 320 for i, data := range metricData.GetMetric() { 321 // Convert data.Label (which is a list of key-value tuples) into a Go map. 
322 dataLabels := make(map[string]string, len(data.GetLabel())) 323 for _, label := range data.GetLabel() { 324 dataLabels[label.GetName()] = label.GetValue() 325 } 326 // Check if `wantLabels` is a subset of `dataLabels`. 327 allMatching := true 328 for wantLabel, wantValue := range wantLabels { 329 if dataLabels[wantLabel] != wantValue { 330 allMatching = false 331 break 332 } 333 } 334 if !allMatching { 335 // This data point is for a different label combination than the one we want. 336 continue 337 } 338 // Record the index at which we found this data point within `metricData.Metric`. 339 // If this index isn't -1, this means we found multiple such indexes. 340 // This could happen if the metric has multiple data points with `wantLabels` + an 341 // additional label which isn't in `wantLabels` and which takes on multiple distinct 342 // values. This function doesn't support retrieving data for such cases. 343 if foundIndex != -1 { 344 return 0, time.Time{}, fmt.Errorf("found multiple metric data matching requested labels %v", wantLabels) 345 } 346 foundIndex = i 347 } 348 if foundIndex == -1 { 349 return 0, time.Time{}, fmt.Errorf("no metric data matching requested labels %v", wantLabels) 350 } 351 // We've found exactly one data point. 352 data := metricData.GetMetric()[foundIndex] 353 // Convert the value of this data point to an int regardless of its underlying Prometheus type. 
354 var floatValue float64 355 if data.GetCounter() != nil && data.GetCounter().Value != nil { 356 floatValue = data.GetCounter().GetValue() 357 } else if data.GetGauge() != nil && data.GetGauge().Value != nil { 358 floatValue = data.GetGauge().GetValue() 359 } else { 360 return 0, time.Time{}, fmt.Errorf("metric is not numerical: %v", data) 361 } 362 if math.Floor(floatValue) != floatValue { 363 return 0, time.Time{}, fmt.Errorf("value %v cannot be rounded to an integer", floatValue) 364 } 365 return int64(math.Floor(floatValue)), time.UnixMilli(data.GetTimestampMs()), nil 366 } 367 368 // WantMetric designates the metadata required to select a single metric from a single sandbox. 369 type WantMetric struct { 370 // Metric is the name of the metric to get. 371 Metric string 372 // Sandbox is the ID of the sandbox to look up the metric for. 373 Sandbox string 374 // Pod and Namespace are the pod and namespace labels associated with the sandbox. 375 // Leave empty if the sandbox metadata doesn't contain this information. 376 Pod, Namespace string 377 // ExtraLabels are additional key-value labels that must match. 378 ExtraLabels map[string]string 379 } 380 381 // GetPrometheusContainerInteger returns the integer value of a Prometheus metric from the 382 // given WantMetric data. 383 func (m MetricData) GetPrometheusContainerInteger(want WantMetric) (int64, time.Time, error) { 384 labels := map[string]string{ 385 "sandbox": want.Sandbox, 386 } 387 if want.Pod != "" { 388 labels["pod_name"] = want.Pod 389 } 390 if want.Namespace != "" { 391 labels["namespace_name"] = want.Namespace 392 } 393 for k, v := range want.ExtraLabels { 394 labels[k] = v 395 } 396 return m.GetPrometheusInteger(want.Metric, labels) 397 } 398 399 // GetSandboxMetadataMetric returns the labels attached to the metadata metric for a given sandbox. 
400 func (m MetricData) GetSandboxMetadataMetric(want WantMetric) (map[string]string, error) { 401 var buf bytes.Buffer 402 buf.WriteString(string(m)) 403 parsed, err := (&expfmt.TextParser{}).TextToMetricFamilies(&buf) 404 if err != nil { 405 return nil, err 406 } 407 metricData, found := parsed[want.Metric] 408 if !found { 409 return nil, fmt.Errorf("metric %q not found", want.Metric) 410 } 411 foundIndex := -1 412 for i, data := range metricData.GetMetric() { 413 dataLabels := make(map[string]string, len(data.GetLabel())) 414 for _, label := range data.GetLabel() { 415 dataLabels[label.GetName()] = label.GetValue() 416 } 417 allMatching := true 418 for wantLabel, wantValue := range map[string]string{ 419 prometheus.SandboxIDLabel: want.Sandbox, 420 prometheus.NamespaceLabel: want.Namespace, 421 prometheus.PodNameLabel: want.Pod, 422 } { 423 if dataLabels[wantLabel] != wantValue { 424 allMatching = false 425 break 426 } 427 } 428 if allMatching { 429 if foundIndex != -1 { 430 return nil, errors.New("found multiple metadata metrics matching requested labels") 431 } 432 foundIndex = i 433 } 434 } 435 if foundIndex == -1 { 436 return nil, errors.New("no metadata metric matching requested labels") 437 } 438 data := metricData.GetMetric()[foundIndex] 439 metadataLabels := make(map[string]string, len(data.GetLabel())) 440 for _, label := range data.GetLabel() { 441 if label.GetName() == prometheus.SandboxIDLabel || label.GetName() == prometheus.NamespaceLabel || label.GetName() == prometheus.PodNameLabel { 442 continue 443 } 444 metadataLabels[label.GetName()] = label.GetValue() 445 } 446 return metadataLabels, nil 447 }