// Copyright 2023 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package metricserver implements a Prometheus metric server for runsc data.
package metricserver

import (
	"context"
	"errors"
	"fmt"
	"io/ioutil"
	"math/rand"
	"net"
	"net/http"
	"os"
	"os/signal"
	"regexp"
	"runtime"
	"runtime/debug"
	"strconv"
	"strings"
	"syscall"
	"time"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/atomicbitops"
	"gvisor.dev/gvisor/pkg/log"
	"gvisor.dev/gvisor/pkg/prometheus"
	"gvisor.dev/gvisor/pkg/sentry/control"
	"gvisor.dev/gvisor/pkg/state"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/runsc/config"
	"gvisor.dev/gvisor/runsc/container"
	"gvisor.dev/gvisor/runsc/metricserver/containermetrics"
	"gvisor.dev/gvisor/runsc/sandbox"
)

const (
	// metricsExportTimeout is the maximum amount of time that the metrics export process should take.
	metricsExportTimeout = 30 * time.Second

	// metricsExportPerSandboxTimeout is the maximum amount of time that we wait on any individual
	// sandbox when exporting its metrics.
	metricsExportPerSandboxTimeout = 8 * time.Second

	// exportParallelGoroutines is the maximum number of goroutines spawned during metrics export.
	exportParallelGoroutines = 8
)

// servedSandbox is a sandbox that we serve metrics from.
// A single metrics server will export data about multiple sandboxes.
type servedSandbox struct {
	rootContainerID container.FullID
	server          *metricServer
	extraLabels     map[string]string

	// mu protects the fields below.
	mu sync.Mutex

	// sandbox is the sandbox being monitored.
	// Once set, it is immutable.
	sandbox *sandbox.Sandbox

	// createdAt stores the time the sandbox was created.
	// It is loaded from the container state file.
	// Once set, it is immutable.
	createdAt time.Time

	// capabilities is the union of the capability set of the containers within `sandbox`.
	// It is used to export a per-sandbox metric representing which capabilities are in use.
	// For monitoring purposes, a capability added in a container means it is considered
	// added for the whole sandbox.
	capabilities []linux.Capability

	// specMetadataLabels is the set of labels exported as part of the
	// `spec_metadata` metric.
	specMetadataLabels map[string]string

	// verifier allows verifying the data integrity of the metrics we get from this sandbox.
	// It is not always initialized when the sandbox is discovered, but rather upon first metrics
	// access to the sandbox. Metric registration data is loaded from the root container's
	// state file.
	// The server needs to load this registration data before any data from this sandbox is
	// served to HTTP clients. If there is no metric registration data within the Container
	// data, then metrics were not requested for this sandbox, and this servedSandbox should
	// be deleted from the server.
	// Once set, it is immutable.
	verifier *prometheus.Verifier

	// cleanupVerifier holds a reference to the cleanup function of the verifier.
	cleanupVerifier func()

	// extra contains additional per-sandbox data.
	extra sandboxData
}

// load loads the sandbox being monitored and initializes its metric verifier.
// If it returns an error other than container.ErrStateFileLocked, the sandbox is either
// non-existent, or has not requested instrumentation to be enabled, or does not have
// valid metric registration data. In any of these cases, the sandbox should be removed
// from this metrics server.
func (s *servedSandbox) load() (*sandbox.Sandbox, *prometheus.Verifier, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.sandbox == nil {
		allContainers, err := container.LoadSandbox(s.server.rootDir, s.rootContainerID.SandboxID, container.LoadOpts{
			TryLock: container.TryAcquire,
		})
		if err != nil {
			return nil, nil, fmt.Errorf("cannot load sandbox %q: %v", s.rootContainerID.SandboxID, err)
		}
		var rootContainer *container.Container
		for _, cont := range allContainers {
			if cont.IsSandboxRoot() {
				if rootContainer != nil {
					return nil, nil, fmt.Errorf("multiple root containers found for sandbox ID %q: %v and %v", s.rootContainerID.SandboxID, cont, rootContainer)
				}
				rootContainer = cont
			}
		}
		if rootContainer == nil {
			return nil, nil, fmt.Errorf("no root container found for sandbox ID %q", s.rootContainerID.SandboxID)
		}
		sandboxMetricAddr := strings.ReplaceAll(rootContainer.Sandbox.MetricServerAddress, "%RUNTIME_ROOT%", s.server.rootDir)
		if sandboxMetricAddr == "" {
			return nil, nil, errors.New("sandbox did not request instrumentation")
		}
		if sandboxMetricAddr != s.server.address {
			return nil, nil, fmt.Errorf("sandbox requested instrumentation by a metric server running at a different address (sandbox wants %q, this metric server serves %q)", sandboxMetricAddr, s.server.address)
		}
		// Update label data as read from the state file.
		// Do not store empty labels.
		authoritativeLabels, err := containermetrics.SandboxPrometheusLabels(rootContainer)
		if err != nil {
			return nil, nil, fmt.Errorf("cannot compute Prometheus labels of sandbox: %v", err)
		}
		s.extraLabels = make(map[string]string, len(authoritativeLabels))
		for _, label := range []string{
			prometheus.SandboxIDLabel,
			prometheus.IterationIDLabel,
			prometheus.PodNameLabel,
			prometheus.NamespaceLabel,
		} {
			s.extraLabels[label] = authoritativeLabels[label]
			if s.extraLabels[label] == "" {
				delete(s.extraLabels, label)
			}
		}

		// Compute capability set.
		allCaps := linux.AllCapabilities()
		capSet := make([]linux.Capability, 0, len(allCaps))
		for _, cap := range allCaps {
			for _, cont := range allContainers {
				if cont.HasCapabilityInAnySet(cap) {
					capSet = append(capSet, cap)
					break
				}
			}
		}
		if len(capSet) > 0 {
			// Reallocate a slice with minimum size, since it will be long-lived.
			s.capabilities = make([]linux.Capability, len(capSet))
			for i, capLabels := range capSet {
				s.capabilities[i] = capLabels
			}
		}

		// Compute spec metadata.
		s.specMetadataLabels = containermetrics.ComputeSpecMetadata(allContainers)

		s.sandbox = rootContainer.Sandbox
		s.createdAt = rootContainer.CreatedAt
	}
	if s.verifier == nil {
		registeredMetrics, err := s.sandbox.GetRegisteredMetrics()
		if err != nil {
			return nil, nil, err
		}
		verifier, cleanup, err := prometheus.NewVerifier(registeredMetrics)
		if err != nil {
			return nil, nil, err
		}
		s.verifier = verifier
		s.cleanupVerifier = cleanup
	}
	if err := s.extra.load(s); err != nil {
		return nil, nil, err
	}
	return s.sandbox, s.verifier, nil
}

func (s *servedSandbox) cleanup() {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.cleanupVerifier != nil {
		s.cleanupVerifier()
	}
}

// querySandboxMetrics queries the sandbox for metrics data.
func querySandboxMetrics(ctx context.Context, sand *sandbox.Sandbox, verifier *prometheus.Verifier, metricsFilter string) (*prometheus.Snapshot, error) {
	ch := make(chan struct {
		snapshot *prometheus.Snapshot
		err      error
	}, 1)
	canceled := make(chan struct{}, 1)
	defer close(canceled)
	go func() {
		snapshot, err := sand.ExportMetrics(control.MetricsExportOpts{
			OnlyMetrics: metricsFilter,
		})
		select {
		case <-canceled:
		case ch <- struct {
			snapshot *prometheus.Snapshot
			err      error
		}{snapshot, err}:
			close(ch)
		}
	}()
	select {
	case <-ctx.Done():
		canceled <- struct{}{}
		return nil, ctx.Err()
	case ret := <-ch:
		if ret.err != nil {
			return nil, ret.err
		}
		if err := verifier.Verify(ret.snapshot); err != nil {
			return nil, err
		}
		return ret.snapshot, nil
	}
}

// metricServer implements the metric server.
type metricServer struct {
	rootDir                string
	pid                    int
	pidFile                string
	allowUnknownRoot       bool
	exposeProfileEndpoints bool
	address                string
	exporterPrefix         string
	startTime              time.Time
	srv                    http.Server

	// Size of the map of written metrics during the last /metrics export. Initially zero.
	// Used to efficiently reallocate a map of the right size during the next export.
	lastMetricsWrittenSize atomicbitops.Uint32

	// Pool of `prometheus.ReusableWriter`s. Used to avoid large buffer allocations for
	// successive snapshots.
	promWriterPool sync.Pool

	// mu protects the fields below.
	mu sync.Mutex

	// udsPath is a path to a Unix Domain Socket file on which the server is bound and which it owns.
	// This socket file will be deleted on server shutdown.
	// This field is not set if binding to a network port, or when the UDS already existed prior to
	// being bound by us (i.e. its ownership isn't ours), such that it isn't deleted in this case.
	// The field is unset once the file is successfully removed.
	udsPath string

	// sandboxes is the list of sandboxes we serve metrics for.
	sandboxes map[container.FullID]*servedSandbox

	// lastStateFileStat maps container full IDs to the last observed stat() of their state file.
	// This is used to monitor for sandboxes in the background. If a sandbox's state file matches this
	// info, we can assume that the last background scan already looked at it.
	lastStateFileStat map[container.FullID]os.FileInfo

	// lastValidMetricFilter stores the last value of the "runsc-sandbox-metrics-filter" parameter for
	// /metrics requests.
	// It represents the last-known compilable regular expression that was passed to /metrics.
	// It is used to avoid re-verifying this parameter in the common case where a single scraper
	// is consistently passing in the same value for this parameter in each successive request.
	lastValidMetricFilter string

	// lastValidCapabilityFilterStr stores the last value of the "runsc-capability-filter" parameter
	// for /metrics requests.
	// It represents the last-known compilable regular expression that was passed to /metrics.
	// It is used to avoid re-verifying this parameter in the common case where a single scraper
	// is consistently passing in the same value for this parameter in each successive request.
	lastValidCapabilityFilterStr string

	// lastValidCapabilityFilterReg is the compiled regular expression corresponding to
	// lastValidCapabilityFilterStr.
	lastValidCapabilityFilterReg *regexp.Regexp

	// numSandboxes counts the number of sandboxes that have ever been registered on this server.
	// Used to distinguish between the case where this metric server has sat there doing nothing
	// because no sandbox ever registered against it (which is unexpected), vs the case where it has
	// done a good job serving sandbox metrics and it's time for it to gracefully die as there are no
	// more sandboxes to serve.
	// Also exported as a metric of total number of sandboxes started.
	numSandboxes int64

	// shuttingDown is flipped to true when the server shutdown process has started.
	// Used to deal with race conditions where a sandbox is trying to register after the server has
	// already started to go to sleep.
	shuttingDown bool

	// shutdownCh is written to when receiving the signal to shut down gracefully.
	shutdownCh chan os.Signal

	// extra contains additional server-wide data.
	extra serverData
}

// sufficientlyEqualStats returns whether the given FileInfo's are sufficiently
// equal to assume the file they represent has not changed between the time
// each FileInfo was obtained.
func sufficientlyEqualStats(s1, s2 os.FileInfo) bool {
	if !s1.ModTime().Equal(s2.ModTime()) {
		return false
	}
	if s1.Size() != s2.Size() {
		return false
	}
	statT1, ok1 := s1.Sys().(*syscall.Stat_t)
	statT2, ok2 := s2.Sys().(*syscall.Stat_t)
	if ok1 != ok2 {
		return false
	}
	if ok1 && ok2 {
		if statT1.Dev != statT2.Dev {
			return false
		}
		if statT1.Ino != statT2.Ino {
			return false
		}
	}
	return true
}

// refreshSandboxesLocked removes sandboxes that are no longer running from m.sandboxes, and
// adds sandboxes found in the root directory that do request instrumentation.
// Preconditions: m.mu is locked.
func (m *metricServer) refreshSandboxesLocked() {
	if m.shuttingDown {
		// Do nothing to avoid log spam.
		return
	}
	sandboxIDs, err := container.ListSandboxes(m.rootDir)
	if err != nil {
		if !m.allowUnknownRoot {
			log.Warningf("Cannot list containers in root directory %s, it has likely gone away: %v.", m.rootDir, err)
		}
		return
	}
	for sandboxID, sandbox := range m.sandboxes {
		found := false
		for _, sid := range sandboxIDs {
			if sid == sandboxID {
				found = true
				break
			}
		}
		if !found {
			log.Warningf("Sandbox %s no longer exists but did not explicitly unregister. Removing it.", sandboxID)
			sandbox.cleanup()
			delete(m.sandboxes, sandboxID)
			continue
		}
		if _, _, err := sandbox.load(); err != nil && err != container.ErrStateFileLocked {
			log.Warningf("Sandbox %s cannot be loaded, deleting it: %v", sandboxID, err)
			sandbox.cleanup()
			delete(m.sandboxes, sandboxID)
			continue
		}
		if !sandbox.sandbox.IsRunning() {
			log.Infof("Sandbox %s is no longer running, deleting it.", sandboxID)
			sandbox.cleanup()
			delete(m.sandboxes, sandboxID)
			continue
		}
	}
	newSandboxIDs := make(map[container.FullID]bool, len(sandboxIDs))
	for _, sid := range sandboxIDs {
		if _, found := m.sandboxes[sid]; found {
			continue
		}
		newSandboxIDs[sid] = true
	}
	for sid := range m.lastStateFileStat {
		if _, found := newSandboxIDs[sid]; !found {
			delete(m.lastStateFileStat, sid)
		}
	}
	for sid := range newSandboxIDs {
		stateFile := container.StateFile{
			RootDir: m.rootDir,
			ID:      sid,
		}
		stat, err := stateFile.Stat()
		if err != nil {
			log.Warningf("Failed to stat() container state file for sandbox %q: %v", sid, err)
			continue
		}
		if existing, found := m.lastStateFileStat[sid]; found {
			// We already tried to stat this sandbox but decided not to pick it up.
			// Check if the state file changed since. If it didn't, we don't want to
			// try again.
			if sufficientlyEqualStats(existing, stat) {
				continue
			}
			log.Infof("State file for sandbox %q has changed since we last looked at it; will try to reload it.", sid)
			delete(m.lastStateFileStat, sid)
		}
		// If we get here, we either haven't seen this sandbox before, or we saw it
		// and it has disappeared (which means it is new in this iteration), or we
		// saw it before but its state file changed. Either way, we want to try
		// loading it and see if it wants instrumentation.
		cont, err := container.Load(m.rootDir, sid, container.LoadOpts{
			Exact:         true,
			SkipCheck:     true,
			TryLock:       container.TryAcquire,
			RootContainer: true,
		})
		if err != nil {
			if err == container.ErrStateFileLocked {
				// This error is OK and shouldn't generate log spam. The sandbox is probably in the middle
				// of being created.
				continue
			}
			log.Warningf("Cannot load state file for sandbox %q: %v", sid, err)
			continue
		}

		// This is redundant with one of the checks performed below in servedSandbox.load, but this
		// avoids log spam for the non-error case of sandboxes that didn't request instrumentation.
		sandboxMetricAddr := strings.ReplaceAll(cont.Sandbox.MetricServerAddress, "%RUNTIME_ROOT%", m.rootDir)
		if sandboxMetricAddr != m.address {
			m.lastStateFileStat[sid] = stat
			continue
		}

		// This case can be hit when there is a leftover state file for a sandbox that was `kill -9`'d
		// without an opportunity for it to clean up its state file. This results in a valid state file
		// but the sandbox PID is gone. We don't want to continuously load this sandbox's state file.
		if cont.Status == container.Running && !cont.Sandbox.IsRunning() {
			log.Warningf("Sandbox %q has state file in state Running, yet it isn't actually running. Ignoring it.", sid)
			m.lastStateFileStat[sid] = stat
			continue
		}

		m.numSandboxes++
		served := &servedSandbox{
			rootContainerID: sid,
			server:          m,
			extraLabels: map[string]string{
				prometheus.SandboxIDLabel: sid.SandboxID,
			},
		}
		// Best-effort attempt to load the state file instantly.
		// This may legitimately fail if it is locked, e.g. during sandbox startup.
		// If it fails for any other reason, then the sandbox went away between the time we listed the
		// sandboxes and now, so just delete it.
		if _, _, err := served.load(); err != nil && err != container.ErrStateFileLocked {
			log.Warningf("Sandbox %q cannot be loaded, ignoring it: %v", sid, err)
			m.lastStateFileStat[sid] = stat
			served.cleanup()
			continue
		}
		m.sandboxes[sid] = served
		log.Infof("Registered new sandbox found in root directory: %q", sid)
	}
}

// sandboxLoadResult contains the outcome of calling `load` on a `servedSandbox`.
// It is used as an intermediary type that contains all that we know about a
// sandbox after attempting to load its state file, but does not contain any
// metric data from the sandbox.
type sandboxLoadResult struct {
	served   *servedSandbox
	sandbox  *sandbox.Sandbox
	verifier *prometheus.Verifier
	err      error
}

// loadSandboxesLocked loads the state file data from all known sandboxes.
// It does so in parallel, and avoids reloading sandboxes for which we have
// already loaded data.
func (m *metricServer) loadSandboxesLocked(ctx context.Context) []sandboxLoadResult {
	m.refreshSandboxesLocked()

	numGoroutines := exportParallelGoroutines
	numSandboxes := len(m.sandboxes)
	if numSandboxes < numGoroutines {
		numGoroutines = numSandboxes
	}

	// First, load all the sandboxes in parallel. We need to do this while m.mu is held.
	loadSandboxCh := make(chan *servedSandbox, numSandboxes)
	loadedSandboxesCh := make(chan sandboxLoadResult, numSandboxes)
	loadedSandboxes := make([]sandboxLoadResult, 0, numSandboxes)
	for i := 0; i < numGoroutines; i++ {
		go func() {
			for served := range loadSandboxCh {
				sand, verifier, err := served.load()
				loadedSandboxesCh <- sandboxLoadResult{served, sand, verifier, err}
			}
		}()
	}
	for _, sandbox := range m.sandboxes {
		loadSandboxCh <- sandbox
	}
	close(loadSandboxCh)
	for i := 0; i < numSandboxes; i++ {
		loadedSandboxes = append(loadedSandboxes, <-loadedSandboxesCh)
	}
	close(loadedSandboxesCh)
	return loadedSandboxes
}

// sandboxMetricsResult is the result of calling querySandboxMetrics on a
// single sandbox. It contains all of `sandboxLoadResult` but also has current
// metric data (if querying metrics from the sandbox process succeeded).
type sandboxMetricsResult struct {
	sandboxLoadResult
	isRunning bool
	snapshot  *prometheus.Snapshot
	err       error
}

// queryMultiSandboxMetrics queries metric data from multiple loaded sandboxes.
// It does so in parallel and with random permutation ordering.
// Only metrics matching the `metricsFilter` regular expression are queried.
// For each sandbox, whether we were successful in querying its metrics or
// not, the `processSandbox` function is called. This may be done in parallel,
// so `processSandbox` should do its own locking so that multiple parallel
// instances of itself behave appropriately.
func queryMultiSandboxMetrics(ctx context.Context, loadedSandboxes []sandboxLoadResult, metricsFilter string, processSandbox func(sandboxMetricsResult)) {
	numSandboxes := len(loadedSandboxes)
	ctxDeadline, ok := ctx.Deadline()
	if !ok {
		panic("context had no deadline, this should never happen as it was created with a timeout")
	}
	exportStartTime := time.Now()
	requestTimeLeft := ctxDeadline.Sub(exportStartTime)
	perSandboxTime := requestTimeLeft
	if numSandboxes != 0 {
		perSandboxTime = requestTimeLeft / time.Duration(numSandboxes)
	}
	if perSandboxTime < metricsExportPerSandboxTimeout {
		perSandboxTime = metricsExportPerSandboxTimeout
	}
	loadedSandboxCh := make(chan sandboxLoadResult, numSandboxes)
	var wg sync.WaitGroup
	numGoroutines := exportParallelGoroutines
	if numSandboxes < numGoroutines {
		numGoroutines = numSandboxes
	}
	wg.Add(numGoroutines)
	for i := 0; i < numGoroutines; i++ {
		go func() {
			defer wg.Done()
			for s := range loadedSandboxCh {
				isRunning := false
				var snapshot *prometheus.Snapshot
				err := s.err
				if err == nil {
					queryCtx, queryCtxCancel := context.WithTimeout(ctx, perSandboxTime)
					snapshot, err = querySandboxMetrics(queryCtx, s.sandbox, s.verifier, metricsFilter)
					queryCtxCancel()
					isRunning = s.sandbox.IsRunning()
				}
				processSandbox(sandboxMetricsResult{
					sandboxLoadResult: s,
					isRunning:         isRunning,
					snapshot:          snapshot,
					err:               err,
				})
			}
		}()
	}
	// Iterate over all sandboxes.
	// Important: This must be done in random order.
	// A malicious/compromised sandbox may decide to stall when being asked for metrics.
	// If at least `numGoroutines` sandboxes do this, this will starve other sandboxes
	// from having their metrics exported, because all the goroutines will be stuck on
	// the stalled sandboxes.
	// One way to completely avoid this would be to spawn one goroutine per
	// sandbox, but this can amount to ~hundreds of goroutines, which is not desirable
	// for the metrics server.
	// Another way would be to have a very strict timeout on each sandbox's export
	// process, but in some cases a busy sandbox will take more than a decisecond
	// or so to export its data, so this would miss some data from legitimate (but
	// slow) sandboxes.
	// Instead, we take a middle-of-the-road approach: we use a timeout that's not
	// too strict but still ensures we make forward progress away from stalled
	// sandboxes, and we also iterate across sandboxes in a different random order at
	// each export. This ensures that all sandboxes eventually get a fair chance of
	// being part of the "first `numGoroutines` sandboxes in line" to get their
	// metric data loaded, such that a client repeatedly scraping metrics will
	// eventually get data from each sandbox.
	for _, sandboxIndex := range rand.Perm(len(loadedSandboxes)) {
		loadedSandboxCh <- loadedSandboxes[sandboxIndex]
	}
	close(loadedSandboxCh)
	wg.Wait()
}

// serveMetrics serves metrics requests.
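// The optional "runsc-sandbox-metrics-filter" query parameter is a regular expression used to
// select which sandbox metrics are exported, and the optional "runsc-capability-filter" query
// parameter is a regular expression used to select which capabilities are reported by the
// per-sandbox capability metric.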
func (m *metricServer) serveMetrics(w *httpResponseWriter, req *http.Request) httpResult {
	ctx, ctxCancel := context.WithTimeout(req.Context(), metricsExportTimeout)
	defer ctxCancel()

	metricsFilter := req.URL.Query().Get("runsc-sandbox-metrics-filter")
	var capabilityFilterReg *regexp.Regexp
	capabilityFilterStr := req.URL.Query().Get("runsc-capability-filter")

	m.mu.Lock()

	if metricsFilter != "" && metricsFilter != m.lastValidMetricFilter {
		_, err := regexp.Compile(metricsFilter)
		if err != nil {
			m.mu.Unlock()
			return httpResult{http.StatusBadRequest, errors.New("provided metric filter is not a valid regular expression")}
		}
		m.lastValidMetricFilter = metricsFilter
	}
	if capabilityFilterStr != "" {
		if capabilityFilterStr != m.lastValidCapabilityFilterStr {
			reg, err := regexp.Compile(capabilityFilterStr)
			if err != nil {
				m.mu.Unlock()
				return httpResult{http.StatusBadRequest, errors.New("provided capability filter is not a valid regular expression")}
			}
			m.lastValidCapabilityFilterStr = capabilityFilterStr
			m.lastValidCapabilityFilterReg = reg
			capabilityFilterReg = reg
		} else {
			capabilityFilterReg = m.lastValidCapabilityFilterReg
		}
	}

	loadedSandboxes := m.loadSandboxesLocked(ctx)
	numSandboxes := len(loadedSandboxes)
	numSandboxesTotal := m.numSandboxes
	m.mu.Unlock()

	// Used to prevent goroutines from accessing the shared variables below.
	var metricsMu sync.Mutex

	// Meta-metrics keep track of metrics to export about the metrics server itself.
	type metaMetrics struct {
		numRunningSandboxes      int64
		numCannotExportSandboxes int64
	}
	meta := metaMetrics{}                   // Protected by metricsMu.
	selfMetrics := prometheus.NewSnapshot() // Protected by metricsMu.

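	// Each successfully-queried sandbox contributes one snapshot, sent over snapshotCh along
	// with the export options (exporter prefix and per-sandbox labels) that are used when
	// writing it out further below.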
	type snapshotAndOptions struct {
		snapshot *prometheus.Snapshot
		options  prometheus.SnapshotExportOptions
	}
	snapshotCh := make(chan snapshotAndOptions, numSandboxes)

	queryMultiSandboxMetrics(ctx, loadedSandboxes, metricsFilter, func(r sandboxMetricsResult) {
		metricsMu.Lock()
		defer metricsMu.Unlock()
		selfMetrics.Add(prometheus.LabeledIntData(&SandboxPresenceMetric, nil, 1).SetExternalLabels(r.served.extraLabels))
		sandboxRunning := int64(0)
		if r.isRunning {
			sandboxRunning = 1
			meta.numRunningSandboxes++
		}
		selfMetrics.Add(prometheus.LabeledIntData(&SandboxRunningMetric, nil, sandboxRunning).SetExternalLabels(r.served.extraLabels))
		if r.err == nil {
			selfMetrics.Add(prometheus.LabeledIntData(&SandboxMetadataMetric, r.sandbox.MetricMetadata, 1).SetExternalLabels(r.served.extraLabels))
			for _, cap := range r.served.capabilities {
				if capabilityFilterReg != nil && !capabilityFilterReg.MatchString(cap.String()) && !capabilityFilterReg.MatchString(cap.TrimmedString()) {
					continue
				}
				selfMetrics.Add(prometheus.LabeledIntData(&SandboxCapabilitiesMetric, map[string]string{
					SandboxCapabilitiesMetricLabel: cap.TrimmedString(),
				}, 1).SetExternalLabels(r.served.extraLabels))
			}
			selfMetrics.Add(prometheus.LabeledIntData(&SpecMetadataMetric, r.served.specMetadataLabels, 1).SetExternalLabels(r.served.extraLabels))
			createdAt := float64(r.served.createdAt.Unix()) + (float64(r.served.createdAt.Nanosecond()) / 1e9)
			selfMetrics.Add(prometheus.LabeledFloatData(&SandboxCreationMetric, nil, createdAt).SetExternalLabels(r.served.extraLabels))
		} else {
			// If the sandbox isn't running, it is normal that metrics are not exported for it, so
			// do not report this case as an error.
			if r.isRunning {
				meta.numCannotExportSandboxes++
				log.Warningf("Could not export metrics from sandbox %s: %v", r.served.rootContainerID.SandboxID, r.err)
			}
			return
		}
		snapshotCh <- snapshotAndOptions{
			snapshot: r.snapshot,
			options: prometheus.SnapshotExportOptions{
				ExporterPrefix: m.exporterPrefix,
				ExtraLabels:    r.served.extraLabels,
			},
		}
	})

	// Build the map of all snapshots we will be rendering.
	snapshotsToOptions := make(map[*prometheus.Snapshot]prometheus.SnapshotExportOptions, numSandboxes+2)
	snapshotsToOptions[selfMetrics] = prometheus.SnapshotExportOptions{
		ExporterPrefix: fmt.Sprintf("%s%s", m.exporterPrefix, prometheus.MetaMetricPrefix),
	}
	processMetrics := prometheus.NewSnapshot()
	processMetrics.Add(prometheus.NewFloatData(&prometheus.ProcessStartTimeSeconds, float64(m.startTime.Unix())+(float64(m.startTime.Nanosecond())/1e9)))
	snapshotsToOptions[processMetrics] = prometheus.SnapshotExportOptions{
		// These metrics must be written without any prefix.
	}

	// Aggregate all the snapshots from the sandboxes.
	close(snapshotCh)
	for snapshotAndOptions := range snapshotCh {
		snapshotsToOptions[snapshotAndOptions.snapshot] = snapshotAndOptions.options
	}

	// Add our own metrics.
	selfMetrics.Add(prometheus.NewIntData(&NumRunningSandboxesMetric, meta.numRunningSandboxes))
	selfMetrics.Add(prometheus.NewIntData(&NumCannotExportSandboxesMetric, meta.numCannotExportSandboxes))
	selfMetrics.Add(prometheus.NewIntData(&NumTotalSandboxesMetric, numSandboxesTotal))

	// Write out all data.
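	// metricsWritten records which metric names get written during this export. Its size from
	// the previous export (m.lastMetricsWrittenSize) is used to preallocate a map of roughly
	// the right size, as described on that field's declaration.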
	lastMetricsWrittenSize := int(m.lastMetricsWrittenSize.Load())
	metricsWritten := make(map[string]bool, lastMetricsWrittenSize)
	commentHeader := fmt.Sprintf("Data for runsc metric server exporting data for sandboxes in root directory %s", m.rootDir)
	if metricsFilter != "" {
		commentHeader = fmt.Sprintf("%s (filtered using regular expression: %q)", commentHeader, metricsFilter)
	}
	promWriter := m.promWriterPool.Get().(*prometheus.ReusableWriter[*httpResponseWriter])
	written, err := promWriter.Write(w, prometheus.ExportOptions{
		CommentHeader:  commentHeader,
		MetricsWritten: metricsWritten,
	}, snapshotsToOptions)
	m.promWriterPool.Put(promWriter)
	if err != nil {
		if written == 0 {
			return httpResult{http.StatusServiceUnavailable, err}
		}
		// Note that we cannot return an HTTP error here because we have already started writing a
		// response, which means we've already responded with a 200 OK status code.
		// This probably means the client closed the connection before we could finish writing.
		return httpOK
	}
	if lastMetricsWrittenSize < len(metricsWritten) {
		m.lastMetricsWrittenSize.CompareAndSwap(uint32(lastMetricsWrittenSize), uint32(len(metricsWritten)))
	}
	return httpOK
}

// serveHealthCheck serves the healthcheck endpoint.
// Returns a response prefixed by "runsc-metrics:OK" on success.
// Clients can use this to assert that they are talking to the metrics server, as opposed to some
// other random HTTP server.
func (m *metricServer) serveHealthCheck(w *httpResponseWriter, req *http.Request) httpResult {
	m.mu.Lock()
	defer m.mu.Unlock()
	if m.shuttingDown {
		return httpResult{http.StatusServiceUnavailable, errors.New("server is shutting down")}
	}
	if err := req.ParseForm(); err != nil {
		return httpResult{http.StatusBadRequest, err}
	}
	rootDir := req.Form.Get("root")
	if rootDir != m.rootDir {
		return httpResult{http.StatusBadRequest, fmt.Errorf("this metric server is configured to serve root directory: %s", m.rootDir)}
	}
	w.WriteHeader(http.StatusOK)
	w.WriteString("runsc-metrics:OK")
	return httpOK
}

// servePID serves the PID of the metric server process.
func (m *metricServer) servePID(w *httpResponseWriter, req *http.Request) httpResult {
	m.mu.Lock()
	defer m.mu.Unlock()
	if m.shuttingDown {
		return httpResult{http.StatusServiceUnavailable, errors.New("server is shutting down")}
	}
	w.WriteString(strconv.Itoa(m.pid))
	return httpOK
}

// Server is the set of options to run a metric server.
// Initialize this struct and then call Run on it to run the metric server.
type Server struct {
	// Config is the main runsc configuration.
	Config *config.Config

	// ExporterPrefix is used as prefix for all metric names following Prometheus exporter convention.
	ExporterPrefix string

	// PIDFile, if set, will cause the metric server to write its own PID to this file after binding
	// to the requested address. The parent directory of this file must already exist.
	PIDFile string

	// ExposeProfileEndpoints, if true, exposes /runsc-metrics/profile-cpu and
	// /runsc-metrics/profile-heap to get profiling data about the metric server.
	ExposeProfileEndpoints bool

	// AllowUnknownRoot causes the metric server to keep running regardless of the existence of the
	// Config's root directory or the metric server's ability to access it.
	AllowUnknownRoot bool
}

// Run runs the metric server.
// It blocks until the server is instructed to exit, e.g. via signal.
func (s *Server) Run(ctx context.Context) error {
	ctx, ctxCancel := context.WithCancel(ctx)
	defer ctxCancel()

	m := &metricServer{
		exporterPrefix:         s.ExporterPrefix,
		pidFile:                s.PIDFile,
		exposeProfileEndpoints: s.ExposeProfileEndpoints,
		allowUnknownRoot:       s.AllowUnknownRoot,
		promWriterPool: sync.Pool{
			New: func() any {
				return &prometheus.ReusableWriter[*httpResponseWriter]{}
			},
		},
	}
	conf := s.Config
	if conf.MetricServer == "" {
		return errors.New("config does not specify the metric server address (--metric-server)")
	}
	if strings.Contains(conf.MetricServer, "%ID%") {
		return fmt.Errorf("metric server address contains '%%ID%%': %v; this should have been replaced by the parent process", conf.MetricServer)
	}
	if _, err := container.ListSandboxes(conf.RootDir); err != nil {
		if !m.allowUnknownRoot {
			return fmt.Errorf("invalid root directory %q: tried to list sandboxes within it and got: %w", conf.RootDir, err)
		}
		log.Infof("Root directory %q: tried to list sandboxes within it and got: %v. Continuing anyway, as this is expected with --allow-unknown-root.", conf.RootDir, err)
	}
	// container.ListSandboxes uses a glob pattern, which doesn't error out on
	// permission errors. Double-check by actually listing the directory.
	if _, err := ioutil.ReadDir(conf.RootDir); err != nil {
		if !m.allowUnknownRoot {
			return fmt.Errorf("invalid root directory %q: tried to list all entries within it and got: %w", conf.RootDir, err)
		}
		log.Infof("Root directory %q: tried to list all entries within it and got: %v. Continuing anyway, as this is expected with --allow-unknown-root.", conf.RootDir, err)
	}
	m.startTime = time.Now()
	m.rootDir = conf.RootDir
	if strings.Contains(conf.MetricServer, "%RUNTIME_ROOT%") {
		newAddr := strings.ReplaceAll(conf.MetricServer, "%RUNTIME_ROOT%", m.rootDir)
		log.Infof("Metric server address replaced %%RUNTIME_ROOT%%: %q -> %q", conf.MetricServer, newAddr)
		conf.MetricServer = newAddr
	}
	m.address = conf.MetricServer
	m.sandboxes = make(map[container.FullID]*servedSandbox)
	m.lastStateFileStat = make(map[container.FullID]os.FileInfo)
	m.pid = os.Getpid()
	m.shutdownCh = make(chan os.Signal, 1)
	signal.Notify(m.shutdownCh, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)

	var listener net.Listener
	var listenErr error
	if strings.HasPrefix(conf.MetricServer, fmt.Sprintf("%c", os.PathSeparator)) {
		beforeBindSt, beforeBindErr := os.Stat(conf.MetricServer)
		if listener, listenErr = (&net.ListenConfig{}).Listen(ctx, "unix", conf.MetricServer); listenErr != nil {
			return fmt.Errorf("cannot listen on unix domain socket %q: %w", conf.MetricServer, listenErr)
		}
		afterBindSt, afterBindErr := os.Stat(conf.MetricServer)
		if afterBindErr != nil {
			return fmt.Errorf("cannot stat our own unix domain socket %q: %w", conf.MetricServer, afterBindErr)
		}
		ownUDS := true
		if beforeBindErr == nil && beforeBindSt.Mode() == afterBindSt.Mode() {
			// Socket file existed and was a socket prior to us binding to it.
			if beforeBindSt.Sys() != nil && afterBindSt.Sys() != nil {
				beforeSt, beforeStOk := beforeBindSt.Sys().(*syscall.Stat_t)
				afterSt, afterStOk := afterBindSt.Sys().(*syscall.Stat_t)
				if beforeStOk && afterStOk && beforeSt.Dev == afterSt.Dev && beforeSt.Ino == afterSt.Ino {
					// Socket file is the same before and after binding, so we should not consider ourselves
					// the owner of it.
					ownUDS = false
				}
			}
		}
		if ownUDS {
			log.Infof("Bound on socket file %s which we own. As such, this socket file will be deleted on server shutdown.", conf.MetricServer)
			m.udsPath = conf.MetricServer
			defer os.Remove(m.udsPath)
			os.Chmod(m.udsPath, 0777)
		} else {
			log.Infof("Bound on socket file %s which existed prior to this server's existence. As such, it will not be deleted on server shutdown.", conf.MetricServer)
		}
	} else {
		if strings.HasPrefix(conf.MetricServer, ":") {
			log.Warningf("Binding on all interfaces. This will allow anyone to list all containers on your machine!")
		}
		if listener, listenErr = (&net.ListenConfig{}).Listen(ctx, "tcp", conf.MetricServer); listenErr != nil {
			return fmt.Errorf("cannot listen on TCP address %q: %w", conf.MetricServer, listenErr)
		}
	}

	mux := http.NewServeMux()
	mux.HandleFunc("/runsc-metrics/healthcheck", logRequest(m.serveHealthCheck))
	mux.HandleFunc("/runsc-metrics/pid", logRequest(m.servePID))
	if m.exposeProfileEndpoints {
		log.Warningf("Profiling HTTP endpoints are exposed; this should only be used for development!")
		mux.HandleFunc("/runsc-metrics/profile-cpu", logRequest(m.profileCPU))
		mux.HandleFunc("/runsc-metrics/profile-heap", logRequest(m.profileHeap))
	} else {
		// Disable memory profiling, since we don't expose it.
		runtime.MemProfileRate = 0
	}
	mux.HandleFunc("/metrics", logRequest(m.serveMetrics))
	mux.HandleFunc("/", logRequest(m.serveIndex))
	m.srv.Handler = mux
	m.srv.ReadTimeout = httpTimeout
	m.srv.WriteTimeout = httpTimeout
	if err := m.startVerifyLoop(ctx); err != nil {
		return fmt.Errorf("cannot start background loop: %w", err)
	}
	if m.pidFile != "" {
		if err := ioutil.WriteFile(m.pidFile, []byte(fmt.Sprintf("%d", m.pid)), 0644); err != nil {
			return fmt.Errorf("cannot write PID to file %q: %w", m.pidFile, err)
		}
		defer os.Remove(m.pidFile)
		log.Infof("Wrote PID %d to file %v.", m.pid, m.pidFile)
	}

	// If not modified by the user from the environment, set the Go GC percentage lower than default.
	if _, hasEnv := os.LookupEnv("GOGC"); !hasEnv {
		debug.SetGCPercent(40)
	}

	// Run GC immediately to get rid of all the initialization-related memory bloat and start from
	// a clean slate.
	state.Release()
	runtime.GC()

	// Initialization complete.
	log.Infof("Server serving on %s for root directory %s.", conf.MetricServer, conf.RootDir)
	serveErr := m.srv.Serve(listener)
	log.Infof("Server has stopped accepting requests.")
	m.mu.Lock()
	defer m.mu.Unlock()
	if serveErr != nil {
		if serveErr == http.ErrServerClosed {
			return nil
		}
		return fmt.Errorf("cannot serve on address %s: %w", conf.MetricServer, serveErr)
	}
	// Per documentation, http.Server.Serve can never return a nil error, so this is not a success.
	return fmt.Errorf("HTTP server Serve() did not return expected error")
}