github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/runsc/metricserver/metricserver.go

// Copyright 2023 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package metricserver implements a Prometheus metric server for runsc data.
package metricserver

import (
	"context"
	"errors"
	"fmt"
	"io"
	"io/ioutil"
	"math/rand"
	"net"
	"net/http"
	"os"
	"os/signal"
	"regexp"
	"runtime"
	"runtime/debug"
	"strconv"
	"strings"
	"syscall"
	"time"

	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
	"github.com/nicocha30/gvisor-ligolo/pkg/atomicbitops"
	"github.com/nicocha30/gvisor-ligolo/pkg/log"
	"github.com/nicocha30/gvisor-ligolo/pkg/prometheus"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/control"
	"github.com/nicocha30/gvisor-ligolo/pkg/state"
	"github.com/nicocha30/gvisor-ligolo/pkg/sync"
	"github.com/nicocha30/gvisor-ligolo/runsc/config"
	"github.com/nicocha30/gvisor-ligolo/runsc/container"
	"github.com/nicocha30/gvisor-ligolo/runsc/sandbox"
)

const (
	// metricsExportTimeout is the maximum amount of time that the metrics export process should take.
	metricsExportTimeout = 30 * time.Second

	// metricsExportPerSandboxTimeout is the maximum amount of time that we wait on any individual
	// sandbox when exporting its metrics.
	metricsExportPerSandboxTimeout = 8 * time.Second

	// exportParallelGoroutines is the maximum number of goroutines spawned during metrics export.
	exportParallelGoroutines = 8
)

// servedSandbox is a sandbox that we serve metrics from.
// A single metrics server will export data about multiple sandboxes.
type servedSandbox struct {
	rootContainerID container.FullID
	server          *metricServer
	extraLabels     map[string]string

	// mu protects the fields below.
	mu sync.Mutex

	// sandbox is the sandbox being monitored.
	// Once set, it is immutable.
	sandbox *sandbox.Sandbox

	// createdAt stores the time the sandbox was created.
	// It is loaded from the container state file.
	// Once set, it is immutable.
	createdAt time.Time

	// capabilities is the union of the capability set of the containers within `sandbox`.
	// It is used to export a per-sandbox metric representing which capabilities are in use.
	// For monitoring purposes, a capability added in a container means it is considered
	// added for the whole sandbox.
	capabilities []linux.Capability

	// specMetadataLabels is the set of labels exported as part of the
	// `spec_metadata` metric.
	specMetadataLabels map[string]string

	// verifier allows verifying the data integrity of the metrics we get from this sandbox.
	// It is not always initialized when the sandbox is discovered, but rather upon first metrics
	// access to the sandbox. Metric registration data is loaded from the root container's
	// state file.
	// The server needs to load this registration data before any data from this sandbox is
	// served to HTTP clients. If there is no metric registration data within the Container
	// data, then metrics were not requested for this sandbox, and this servedSandbox should
	// be deleted from the server.
	// Once set, it is immutable.
	verifier *prometheus.Verifier

	// cleanupVerifier holds a reference to the cleanup function of the verifier.
	cleanupVerifier func()

	// extra contains additional per-sandbox data.
	extra sandboxData
}

// load loads the sandbox being monitored and initializes its metric verifier.
// If it returns an error other than container.ErrStateFileLocked, the sandbox is either
// non-existent, or has not requested instrumentation to be enabled, or does not have
// valid metric registration data. In any of these cases, the sandbox should be removed
// from this metrics server.
func (s *servedSandbox) load() (*sandbox.Sandbox, *prometheus.Verifier, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.sandbox == nil {
		allContainers, err := container.LoadSandbox(s.server.rootDir, s.rootContainerID.SandboxID, container.LoadOpts{
			TryLock: container.TryAcquire,
		})
		if err != nil {
			return nil, nil, fmt.Errorf("cannot load sandbox %q: %v", s.rootContainerID.SandboxID, err)
		}
		var rootContainer *container.Container
		for _, cont := range allContainers {
			if cont.IsSandboxRoot() {
				if rootContainer != nil {
					return nil, nil, fmt.Errorf("multiple root containers found for sandbox ID %q: %v and %v", s.rootContainerID.SandboxID, cont, rootContainer)
				}
				rootContainer = cont
			}
		}
		if rootContainer == nil {
			return nil, nil, fmt.Errorf("no root container found for sandbox ID %q", s.rootContainerID.SandboxID)
		}
		sandboxMetricAddr := strings.ReplaceAll(rootContainer.Sandbox.MetricServerAddress, "%RUNTIME_ROOT%", s.server.rootDir)
		if sandboxMetricAddr == "" {
			return nil, nil, errors.New("sandbox did not request instrumentation")
		}
		if sandboxMetricAddr != s.server.address {
			return nil, nil, fmt.Errorf("sandbox requested instrumentation by a metric server running at a different address (sandbox wants %q, this metric server serves %q)", sandboxMetricAddr, s.server.address)
		}
		// Update label data as read from the state file.
		// Do not store empty labels.
		authoritativeLabels, err := SandboxPrometheusLabels(rootContainer)
		if err != nil {
			return nil, nil, fmt.Errorf("cannot compute Prometheus labels of sandbox: %v", err)
		}
		s.extraLabels = make(map[string]string, len(authoritativeLabels))
		for _, label := range []string{
			prometheus.SandboxIDLabel,
			prometheus.IterationIDLabel,
			prometheus.PodNameLabel,
			prometheus.NamespaceLabel,
		} {
			s.extraLabels[label] = authoritativeLabels[label]
			if s.extraLabels[label] == "" {
				delete(s.extraLabels, label)
			}
		}

		// Compute capability set.
		allCaps := linux.AllCapabilities()
		capSet := make([]linux.Capability, 0, len(allCaps))
		for _, cap := range allCaps {
			for _, cont := range allContainers {
				if cont.HasCapabilityInAnySet(cap) {
					capSet = append(capSet, cap)
					break
				}
			}
		}
		if len(capSet) > 0 {
			// Reallocate a slice with minimum size, since it will be long-lived.
			s.capabilities = make([]linux.Capability, len(capSet))
			for i, capLabels := range capSet {
				s.capabilities[i] = capLabels
			}
		}

		// Compute spec metadata.
		s.specMetadataLabels = ComputeSpecMetadata(allContainers)

		s.sandbox = rootContainer.Sandbox
		s.createdAt = rootContainer.CreatedAt
	}
	if s.verifier == nil {
		registeredMetrics, err := s.sandbox.GetRegisteredMetrics()
		if err != nil {
			return nil, nil, err
		}
		verifier, cleanup, err := prometheus.NewVerifier(registeredMetrics)
		if err != nil {
			return nil, nil, err
		}
		s.verifier = verifier
		s.cleanupVerifier = cleanup
	}
	if err := s.extra.load(s); err != nil {
		return nil, nil, err
	}
	return s.sandbox, s.verifier, nil
}

func (s *servedSandbox) cleanup() {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.cleanupVerifier != nil {
		s.cleanupVerifier()
	}
}

// querySandboxMetrics queries the sandbox for metrics data.
func querySandboxMetrics(ctx context.Context, sand *sandbox.Sandbox, verifier *prometheus.Verifier, metricsFilter string) (*prometheus.Snapshot, error) {
	ch := make(chan struct {
		snapshot *prometheus.Snapshot
		err      error
	}, 1)
	canceled := make(chan struct{}, 1)
	defer close(canceled)
	go func() {
		snapshot, err := sand.ExportMetrics(control.MetricsExportOpts{
			OnlyMetrics: metricsFilter,
		})
		select {
		case <-canceled:
		case ch <- struct {
			snapshot *prometheus.Snapshot
			err      error
		}{snapshot, err}:
			close(ch)
		}
	}()
	select {
	case <-ctx.Done():
		canceled <- struct{}{}
		return nil, ctx.Err()
	case ret := <-ch:
		if ret.err != nil {
			return nil, ret.err
		}
		if err := verifier.Verify(ret.snapshot); err != nil {
			return nil, err
		}
		return ret.snapshot, nil
	}
}

// metricServer implements the metric server.
type metricServer struct {
	rootDir                string
	pid                    int
	pidFile                string
	allowUnknownRoot       bool
	exposeProfileEndpoints bool
	address                string
	exporterPrefix         string
	startTime              time.Time
	srv                    http.Server

	// Size of the map of written metrics during the last /metrics export. Initially zero.
	// Used to efficiently reallocate a map of the right size during the next export.
	lastMetricsWrittenSize atomicbitops.Uint32

	// mu protects the fields below.
	mu sync.Mutex

	// udsPath is a path to a Unix Domain Socket file on which the server is bound and which it owns.
	// This socket file will be deleted on server shutdown.
	// This field is not set if binding to a network port, or when the UDS already existed prior to
	// being bound by us (i.e. its ownership isn't ours), such that it isn't deleted in this case.
	// The field is unset once the file is successfully removed.
	udsPath string

	// sandboxes is the list of sandboxes we serve metrics for.
	sandboxes map[container.FullID]*servedSandbox

	// lastStateFileStat maps container full IDs to the last observed stat() of their state file.
	// This is used to monitor for sandboxes in the background. If a sandbox's state file matches this
	// info, we can assume that the last background scan already looked at it.
	lastStateFileStat map[container.FullID]os.FileInfo

	// lastValidMetricFilter stores the last value of the "runsc-sandbox-metrics-filter" parameter for
	// /metrics requests.
	// It represents the last-known compilable regular expression that was passed to /metrics.
	// It is used to avoid re-verifying this parameter in the common case where a single scraper
	// is consistently passing in the same value for this parameter in each successive request.
	lastValidMetricFilter string

	// lastValidCapabilityFilterStr stores the last value of the "runsc-capability-filter" parameter
	// for /metrics requests.
	// It represents the last-known compilable regular expression that was passed to /metrics.
	// It is used to avoid re-verifying this parameter in the common case where a single scraper
	// is consistently passing in the same value for this parameter in each successive request.
	lastValidCapabilityFilterStr string

	// lastValidCapabilityFilterReg is the compiled regular expression corresponding to
	// lastValidCapabilityFilterStr.
	lastValidCapabilityFilterReg *regexp.Regexp

	// numSandboxes counts the number of sandboxes that have ever been registered on this server.
	// Used to distinguish between the case where this metrics server has sat there doing nothing
	// because no sandbox ever registered against it (which is unexpected), vs the case where it has
	// done a good job serving sandbox metrics and it's time for it to gracefully die as there are no
	// more sandboxes to serve.
	// Also exported as a metric of total number of sandboxes started.
	numSandboxes int64

	// shuttingDown is flipped to true when the server shutdown process has started.
	// Used to deal with race conditions where a sandbox is trying to register after the server has
	// already started to go to sleep.
	shuttingDown bool

	// shutdownCh is written to when receiving the signal to shut down gracefully.
	shutdownCh chan os.Signal

	// extraData contains additional server-wide data.
	extra serverData
}

// sufficientlyEqualStats returns whether the given FileInfo's are sufficiently
// equal to assume the file they represent has not changed between the time
// each FileInfo was obtained.
func sufficientlyEqualStats(s1, s2 os.FileInfo) bool {
	if !s1.ModTime().Equal(s2.ModTime()) {
		return false
	}
	if s1.Size() != s2.Size() {
		return false
	}
	statT1, ok1 := s1.Sys().(*syscall.Stat_t)
	statT2, ok2 := s2.Sys().(*syscall.Stat_t)
	if ok1 != ok2 {
		return false
	}
	if ok1 && ok2 {
		if statT1.Dev != statT2.Dev {
			return false
		}
		if statT1.Ino != statT2.Ino {
			return false
		}
	}
	return true
}

// refreshSandboxesLocked removes sandboxes that are no longer running from m.sandboxes, and
// adds sandboxes found in the root directory that do request instrumentation.
// Preconditions: m.mu is locked.
func (m *metricServer) refreshSandboxesLocked() {
	if m.shuttingDown {
		// Do nothing to avoid log spam.
		return
	}
	sandboxIDs, err := container.ListSandboxes(m.rootDir)
	if err != nil {
		if !m.allowUnknownRoot {
			log.Warningf("Cannot list containers in root directory %s, it has likely gone away: %v.", m.rootDir, err)
		}
		return
	}
	for sandboxID, sandbox := range m.sandboxes {
		found := false
		for _, sid := range sandboxIDs {
			if sid == sandboxID {
				found = true
				break
			}
		}
		if !found {
			log.Warningf("Sandbox %s no longer exists but did not explicitly unregister. Removing it.", sandboxID)
			sandbox.cleanup()
			delete(m.sandboxes, sandboxID)
			continue
		}
		if _, _, err := sandbox.load(); err != nil && err != container.ErrStateFileLocked {
			log.Warningf("Sandbox %s cannot be loaded, deleting it: %v", sandboxID, err)
			sandbox.cleanup()
			delete(m.sandboxes, sandboxID)
			continue
		}
		if !sandbox.sandbox.IsRunning() {
			log.Infof("Sandbox %s is no longer running, deleting it.", sandboxID)
			sandbox.cleanup()
			delete(m.sandboxes, sandboxID)
			continue
		}
	}
	newSandboxIDs := make(map[container.FullID]bool, len(sandboxIDs))
	for _, sid := range sandboxIDs {
		if _, found := m.sandboxes[sid]; found {
			continue
		}
		newSandboxIDs[sid] = true
	}
	for sid := range m.lastStateFileStat {
		if _, found := newSandboxIDs[sid]; !found {
			delete(m.lastStateFileStat, sid)
		}
	}
	for sid := range newSandboxIDs {
		stateFile := container.StateFile{
			RootDir: m.rootDir,
			ID:      sid,
		}
		stat, err := stateFile.Stat()
		if err != nil {
			log.Warningf("Failed to stat() container state file for sandbox %q: %v", sid, err)
			continue
		}
		if existing, found := m.lastStateFileStat[sid]; found {
			// We already tried to stat this sandbox but decided not to pick it up.
			// Check if the state file changed since. If it didn't, we don't want to
			// try again.
			if sufficientlyEqualStats(existing, stat) {
				continue
			}
			log.Infof("State file for sandbox %q has changed since we last looked at it; will try to reload it.", sid)
			delete(m.lastStateFileStat, sid)
		}
		// If we get here, we either haven't seen this sandbox before, or we saw it
		// and it has disappeared (which means it is new in this iteration), or we
		// saw it before but its state file changed. Either way, we want to try
		// loading it and see if it wants instrumentation.
		cont, err := container.Load(m.rootDir, sid, container.LoadOpts{
			Exact:         true,
			SkipCheck:     true,
			TryLock:       container.TryAcquire,
			RootContainer: true,
		})
		if err != nil {
			if err == container.ErrStateFileLocked {
				// This error is OK and shouldn't generate log spam. The sandbox is probably in the middle
				// of being created.
				continue
			}
			log.Warningf("Cannot load state file for sandbox %q: %v", sid, err)
			continue
		}

		// This is redundant with one of the checks performed below in servedSandbox.load, but this
		// avoids log spam for the non-error case of sandboxes that didn't request instrumentation.
		sandboxMetricAddr := strings.ReplaceAll(cont.Sandbox.MetricServerAddress, "%RUNTIME_ROOT%", m.rootDir)
		if sandboxMetricAddr != m.address {
			m.lastStateFileStat[sid] = stat
			continue
		}

		// This case can be hit when there is a leftover state file for a sandbox that was `kill -9`'d
		// without an opportunity for it to clean up its state file. This results in a valid state file
		// but the sandbox PID is gone. We don't want to continuously load this sandbox's state file.
		if cont.Status == container.Running && !cont.Sandbox.IsRunning() {
			log.Warningf("Sandbox %q has state file in state Running, yet it isn't actually running. Ignoring it.", sid)
			m.lastStateFileStat[sid] = stat
			continue
		}

		m.numSandboxes++
		served := &servedSandbox{
			rootContainerID: sid,
			server:          m,
			extraLabels: map[string]string{
				prometheus.SandboxIDLabel: sid.SandboxID,
			},
		}
		// Best-effort attempt to load the state file instantly.
		// This may legitimately fail if it is locked, e.g. during sandbox startup.
		// If it fails for any other reason, then the sandbox went away between the time we listed the
		// sandboxes and now, so just delete it.
		if _, _, err := served.load(); err != nil && err != container.ErrStateFileLocked {
			log.Warningf("Sandbox %q cannot be loaded, ignoring it: %v", sid, err)
			m.lastStateFileStat[sid] = stat
			served.cleanup()
			continue
		}
		m.sandboxes[sid] = served
		log.Infof("Registered new sandbox found in root directory: %q", sid)
	}
}

// sandboxLoadResult contains the outcome of calling `load` on a `servedSandbox`.
// It is used as an intermediary type that contains all that we know about a
// sandbox after attempting to load its state file, but does not contain any
// metric data from the sandbox.
type sandboxLoadResult struct {
	served   *servedSandbox
	sandbox  *sandbox.Sandbox
	verifier *prometheus.Verifier
	err      error
}

// loadSandboxesLocked loads the state file data from all known sandboxes.
// It does so in parallel, and avoids reloading sandboxes for which we have
// already loaded data.
func (m *metricServer) loadSandboxesLocked(ctx context.Context) []sandboxLoadResult {
	m.refreshSandboxesLocked()

	numGoroutines := exportParallelGoroutines
	numSandboxes := len(m.sandboxes)
	if numSandboxes < numGoroutines {
		numGoroutines = numSandboxes
	}

	// First, load all the sandboxes in parallel. We need to do this while m.mu is held.
	loadSandboxCh := make(chan *servedSandbox, numSandboxes)
	loadedSandboxesCh := make(chan sandboxLoadResult, numSandboxes)
	loadedSandboxes := make([]sandboxLoadResult, 0, numSandboxes)
	for i := 0; i < numGoroutines; i++ {
		go func() {
			for served := range loadSandboxCh {
				sand, verifier, err := served.load()
				loadedSandboxesCh <- sandboxLoadResult{served, sand, verifier, err}
			}
		}()
	}
	for _, sandbox := range m.sandboxes {
		loadSandboxCh <- sandbox
	}
	close(loadSandboxCh)
	for i := 0; i < numSandboxes; i++ {
		loadedSandboxes = append(loadedSandboxes, <-loadedSandboxesCh)
	}
	close(loadedSandboxesCh)
	return loadedSandboxes
}

// sandboxMetricsResult is the result of calling querySandboxMetrics on a
// single sandbox. It contains all of `sandboxLoadResult` but also has current
// metric data (if querying metrics from the sandbox process succeeded).
type sandboxMetricsResult struct {
	sandboxLoadResult
	isRunning bool
	snapshot  *prometheus.Snapshot
	err       error
}

// queryMultiSandboxMetrics queries metric data from multiple loaded sandboxes.
// It does so in parallel and with random permutation ordering.
// Only metrics matching the `metricsFilter` regular expression are queried.
// For each sandbox, whether we were successful in querying its metrics or
// not, the `processSandbox` function is called. This may be done in parallel,
// so `processSandbox` should do its own locking so that multiple parallel
// instances of itself behave appropriately.
func queryMultiSandboxMetrics(ctx context.Context, loadedSandboxes []sandboxLoadResult, metricsFilter string, processSandbox func(sandboxMetricsResult)) {
	numSandboxes := len(loadedSandboxes)
	ctxDeadline, ok := ctx.Deadline()
	if !ok {
		panic("context had no deadline, this should never happen as it was created with a timeout")
	}
	exportStartTime := time.Now()
	requestTimeLeft := ctxDeadline.Sub(exportStartTime)
	perSandboxTime := requestTimeLeft
	if numSandboxes != 0 {
		perSandboxTime = requestTimeLeft / time.Duration(numSandboxes)
	}
	if perSandboxTime < metricsExportPerSandboxTimeout {
		perSandboxTime = metricsExportPerSandboxTimeout
	}
	loadedSandboxCh := make(chan sandboxLoadResult, numSandboxes)
	var wg sync.WaitGroup
	numGoroutines := exportParallelGoroutines
	if numSandboxes < numGoroutines {
		numGoroutines = numSandboxes
	}
	wg.Add(numGoroutines)
	for i := 0; i < numGoroutines; i++ {
		go func() {
			defer wg.Done()
			for s := range loadedSandboxCh {
				isRunning := false
				var snapshot *prometheus.Snapshot
				err := s.err
				if err == nil {
					queryCtx, queryCtxCancel := context.WithTimeout(ctx, perSandboxTime)
					snapshot, err = querySandboxMetrics(queryCtx, s.sandbox, s.verifier, metricsFilter)
					queryCtxCancel()
					isRunning = s.sandbox.IsRunning()
				}
				processSandbox(sandboxMetricsResult{
					sandboxLoadResult: s,
					isRunning:         isRunning,
					snapshot:          snapshot,
					err:               err,
				})
			}
		}()
	}
	// Iterate over all sandboxes.
	// Important: This must be done in random order.
	// A malicious/compromised sandbox may decide to stall when being asked for metrics.
	// If at least `numGoroutines` sandboxes do this, this will starve other sandboxes
	// from having their metrics exported, because all the goroutines will be stuck on
	// the stalled sandboxes.
	// One way to completely avoid this would be to spawn one goroutine per
	// sandbox, but this can amount to ~hundreds of goroutines, which is not desirable
	// for the metrics server.
	// Another way would be to have a very strict timeout on each sandbox's export
	// process, but in some cases a busy sandbox will take more than a decisecond
	// or so to export its data, so this would miss some data from legitimate (but
	// slow) sandboxes.
	// Instead, we take a middle-of-the-road approach: we use a timeout that's not
	// too strict but still ensures we make forward progress away from stalled
	// sandboxes, and we also iterate across sandboxes in a different random order at
	// each export. This ensures that all sandboxes eventually get a fair chance of
	// being part of the "first `numGoroutines` sandboxes in line" to get their
	// metric data loaded, such that a client repeatedly scraping metrics will
	// eventually get data from each sandbox.
	for _, sandboxIndex := range rand.Perm(len(loadedSandboxes)) {
		loadedSandboxCh <- loadedSandboxes[sandboxIndex]
	}
	close(loadedSandboxCh)
	wg.Wait()
}

// serveMetrics serves metrics requests.
func (m *metricServer) serveMetrics(w http.ResponseWriter, req *http.Request) httpResult {
	ctx, ctxCancel := context.WithTimeout(req.Context(), metricsExportTimeout)
	defer ctxCancel()

	metricsFilter := req.URL.Query().Get("runsc-sandbox-metrics-filter")
	var capabilityFilterReg *regexp.Regexp
	capabilityFilterStr := req.URL.Query().Get("runsc-capability-filter")

	m.mu.Lock()

	if metricsFilter != "" && metricsFilter != m.lastValidMetricFilter {
		_, err := regexp.Compile(metricsFilter)
		if err != nil {
			m.mu.Unlock()
			return httpResult{http.StatusBadRequest, errors.New("provided metric filter is not a valid regular expression")}
		}
		m.lastValidMetricFilter = metricsFilter
	}
	if capabilityFilterStr != "" {
		if capabilityFilterStr != m.lastValidCapabilityFilterStr {
			reg, err := regexp.Compile(capabilityFilterStr)
			if err != nil {
				m.mu.Unlock()
				return httpResult{http.StatusBadRequest, errors.New("provided capability filter is not a valid regular expression")}
			}
			m.lastValidCapabilityFilterStr = capabilityFilterStr
			m.lastValidCapabilityFilterReg = reg
			capabilityFilterReg = reg
		} else {
			capabilityFilterReg = m.lastValidCapabilityFilterReg
		}
	}

	loadedSandboxes := m.loadSandboxesLocked(ctx)
	numSandboxes := len(loadedSandboxes)
	numSandboxesTotal := m.numSandboxes
	m.mu.Unlock()

	// Used to prevent goroutines from accessing the shared variables below.
	var metricsMu sync.Mutex

	// Meta-metrics keep track of metrics to export about the metrics server itself.
	type metaMetrics struct {
		numRunningSandboxes      int64
		numCannotExportSandboxes int64
	}
	meta := metaMetrics{}                   // Protected by metricsMu.
	selfMetrics := prometheus.NewSnapshot() // Protected by metricsMu.

	type snapshotAndOptions struct {
		snapshot *prometheus.Snapshot
		options  prometheus.SnapshotExportOptions
	}
	snapshotCh := make(chan snapshotAndOptions, numSandboxes)

	queryMultiSandboxMetrics(ctx, loadedSandboxes, metricsFilter, func(r sandboxMetricsResult) {
		metricsMu.Lock()
		defer metricsMu.Unlock()
		selfMetrics.Add(prometheus.LabeledIntData(&SandboxPresenceMetric, nil, 1).SetExternalLabels(r.served.extraLabels))
		sandboxRunning := int64(0)
		if r.isRunning {
			sandboxRunning = 1
			meta.numRunningSandboxes++
		}
		selfMetrics.Add(prometheus.LabeledIntData(&SandboxRunningMetric, nil, sandboxRunning).SetExternalLabels(r.served.extraLabels))
		if r.err == nil {
			selfMetrics.Add(prometheus.LabeledIntData(&SandboxMetadataMetric, r.sandbox.MetricMetadata, 1).SetExternalLabels(r.served.extraLabels))
			for _, cap := range r.served.capabilities {
				if capabilityFilterReg != nil && !capabilityFilterReg.MatchString(cap.String()) && !capabilityFilterReg.MatchString(cap.TrimmedString()) {
					continue
				}
				selfMetrics.Add(prometheus.LabeledIntData(&SandboxCapabilitiesMetric, map[string]string{
					SandboxCapabilitiesMetricLabel: cap.TrimmedString(),
				}, 1).SetExternalLabels(r.served.extraLabels))
			}
			selfMetrics.Add(prometheus.LabeledIntData(&SpecMetadataMetric, r.served.specMetadataLabels, 1).SetExternalLabels(r.served.extraLabels))
			createdAt := float64(r.served.createdAt.Unix()) + (float64(r.served.createdAt.Nanosecond()) / 1e9)
			selfMetrics.Add(prometheus.LabeledFloatData(&SandboxCreationMetric, nil, createdAt).SetExternalLabels(r.served.extraLabels))
		} else {
			// If the sandbox isn't running, it is normal that metrics are not exported for it, so
			// do not report this case as an error.
			if r.isRunning {
				meta.numCannotExportSandboxes++
				log.Warningf("Could not export metrics from sandbox %s: %v", r.served.rootContainerID.SandboxID, r.err)
			}
			return
		}
		snapshotCh <- snapshotAndOptions{
			snapshot: r.snapshot,
			options: prometheus.SnapshotExportOptions{
				ExporterPrefix: m.exporterPrefix,
				ExtraLabels:    r.served.extraLabels,
			},
		}
	})

	// Build the map of all snapshots we will be rendering.
	snapshotsToOptions := make(map[*prometheus.Snapshot]prometheus.SnapshotExportOptions, numSandboxes+2)
	snapshotsToOptions[selfMetrics] = prometheus.SnapshotExportOptions{
		ExporterPrefix: fmt.Sprintf("%s%s", m.exporterPrefix, prometheus.MetaMetricPrefix),
	}
	processMetrics := prometheus.NewSnapshot()
	processMetrics.Add(prometheus.NewFloatData(&prometheus.ProcessStartTimeSeconds, float64(m.startTime.Unix())+(float64(m.startTime.Nanosecond())/1e9)))
	snapshotsToOptions[processMetrics] = prometheus.SnapshotExportOptions{
		// These metrics must be written without any prefix.
	}

	// Aggregate all the snapshots from the sandboxes.
	close(snapshotCh)
	for snapshotAndOptions := range snapshotCh {
		snapshotsToOptions[snapshotAndOptions.snapshot] = snapshotAndOptions.options
	}

	// Add our own metrics.
	selfMetrics.Add(prometheus.NewIntData(&NumRunningSandboxesMetric, meta.numRunningSandboxes))
	selfMetrics.Add(prometheus.NewIntData(&NumCannotExportSandboxesMetric, meta.numCannotExportSandboxes))
	selfMetrics.Add(prometheus.NewIntData(&NumTotalSandboxesMetric, numSandboxesTotal))

	// Write out all data.
	lastMetricsWrittenSize := int(m.lastMetricsWrittenSize.Load())
	metricsWritten := make(map[string]bool, lastMetricsWrittenSize)
	commentHeader := fmt.Sprintf("Data for runsc metric server exporting data for sandboxes in root directory %s", m.rootDir)
	if metricsFilter != "" {
		commentHeader = fmt.Sprintf("%s (filtered using regular expression: %q)", commentHeader, metricsFilter)
	}
	written, err := prometheus.Write(w, prometheus.ExportOptions{
		CommentHeader:  commentHeader,
		MetricsWritten: metricsWritten,
	}, snapshotsToOptions)
	if err != nil {
		if written == 0 {
			return httpResult{http.StatusServiceUnavailable, err}
		}
		// Note that we cannot return an HTTP error here because we have already started writing a
		// response, which means we've already responded with a 200 OK status code.
		// This probably means the client closed the connection before we could finish writing.
		return httpOK
	}
	if lastMetricsWrittenSize < len(metricsWritten) {
		m.lastMetricsWrittenSize.CompareAndSwap(uint32(lastMetricsWrittenSize), uint32(len(metricsWritten)))
	}
	return httpOK
}

// serveHealthCheck serves the healthcheck endpoint.
// Returns a response prefixed by "runsc-metrics:OK" on success.
// Clients can use this to assert that they are talking to the metrics server, as opposed to some
// other random HTTP server.
func (m *metricServer) serveHealthCheck(w http.ResponseWriter, req *http.Request) httpResult {
	m.mu.Lock()
	defer m.mu.Unlock()
	if m.shuttingDown {
		return httpResult{http.StatusServiceUnavailable, errors.New("server is shutting down")}
	}
	if err := req.ParseForm(); err != nil {
		return httpResult{http.StatusBadRequest, err}
	}
	rootDir := req.Form.Get("root")
	if rootDir != m.rootDir {
		return httpResult{http.StatusBadRequest, fmt.Errorf("this metric server is configured to serve root directory: %s", m.rootDir)}
	}
	w.WriteHeader(http.StatusOK)
	io.WriteString(w, "runsc-metrics:OK")
	return httpOK
}

// servePID serves the PID of the metric server process.
func (m *metricServer) servePID(w http.ResponseWriter, req *http.Request) httpResult {
	m.mu.Lock()
	defer m.mu.Unlock()
	if m.shuttingDown {
		return httpResult{http.StatusServiceUnavailable, errors.New("server is shutting down")}
	}
	io.WriteString(w, strconv.Itoa(m.pid))
	return httpOK
}

// Server is the set of options to run a metric server.
// Initialize this struct and then call Run on it to run the metric server.
type Server struct {
	// Config is the main runsc configuration.
	Config *config.Config

	// ExporterPrefix is used as prefix for all metric names following Prometheus exporter convention.
	ExporterPrefix string

	// PIDFile, if set, will cause the metric server to write its own PID to this file after binding
	// to the requested address. The parent directory of this file must already exist.
	PIDFile string

	// ExposeProfileEndpoints, if true, exposes /runsc-metrics/profile-cpu and
	// /runsc-metrics/profile-heap to get profiling data about the metric server.
	ExposeProfileEndpoints bool

	// AllowUnknownRoot causes the metric server to keep running regardless of the existence of the
	// Config's root directory or the metric server's ability to access it.
	AllowUnknownRoot bool
}

// Run runs the metric server.
// It blocks until the server is instructed to exit, e.g. via signal.
func (s *Server) Run(ctx context.Context) error {
	ctx, ctxCancel := context.WithCancel(ctx)
	defer ctxCancel()

	m := &metricServer{
		exporterPrefix:         s.ExporterPrefix,
		pidFile:                s.PIDFile,
		exposeProfileEndpoints: s.ExposeProfileEndpoints,
		allowUnknownRoot:       s.AllowUnknownRoot,
	}
	conf := s.Config
	if conf.MetricServer == "" {
		return errors.New("config does not specify the metric server address (--metric-server)")
	}
	if strings.Contains(conf.MetricServer, "%ID%") {
		return fmt.Errorf("metric server address contains '%%ID%%': %v; this should have been replaced by the parent process", conf.MetricServer)
	}
	if _, err := container.ListSandboxes(conf.RootDir); err != nil {
		if !m.allowUnknownRoot {
			return fmt.Errorf("invalid root directory %q: tried to list sandboxes within it and got: %w", conf.RootDir, err)
		}
		log.Warningf("Invalid root directory %q: tried to list sandboxes within it and got: %v. Continuing anyway, as the server is configured to tolerate this.", conf.RootDir, err)
	}
	// container.ListSandboxes uses a glob pattern, which doesn't error out on
	// permission errors. Double-check by actually listing the directory.
	if _, err := ioutil.ReadDir(conf.RootDir); err != nil {
		if !m.allowUnknownRoot {
			return fmt.Errorf("invalid root directory %q: tried to list all entries within it and got: %w", conf.RootDir, err)
		}
		log.Warningf("Invalid root directory %q: tried to list all entries within it and got: %v. Continuing anyway, as the server is configured to tolerate this.", conf.RootDir, err)
	}
	m.startTime = time.Now()
	m.rootDir = conf.RootDir
	if strings.Contains(conf.MetricServer, "%RUNTIME_ROOT%") {
		newAddr := strings.ReplaceAll(conf.MetricServer, "%RUNTIME_ROOT%", m.rootDir)
		log.Infof("Metric server address replaced %%RUNTIME_ROOT%%: %q -> %q", conf.MetricServer, newAddr)
		conf.MetricServer = newAddr
	}
	m.address = conf.MetricServer
	m.sandboxes = make(map[container.FullID]*servedSandbox)
	m.lastStateFileStat = make(map[container.FullID]os.FileInfo)
	m.pid = os.Getpid()
	m.shutdownCh = make(chan os.Signal, 1)
	signal.Notify(m.shutdownCh, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)

	var listener net.Listener
	var listenErr error
	if strings.HasPrefix(conf.MetricServer, fmt.Sprintf("%c", os.PathSeparator)) {
		beforeBindSt, beforeBindErr := os.Stat(conf.MetricServer)
		if listener, listenErr = (&net.ListenConfig{}).Listen(ctx, "unix", conf.MetricServer); listenErr != nil {
			return fmt.Errorf("cannot listen on unix domain socket %q: %w", conf.MetricServer, listenErr)
		}
		afterBindSt, afterBindErr := os.Stat(conf.MetricServer)
		if afterBindErr != nil {
			return fmt.Errorf("cannot stat our own unix domain socket %q: %w", conf.MetricServer, afterBindErr)
		}
		ownUDS := true
		if beforeBindErr == nil && beforeBindSt.Mode() == afterBindSt.Mode() {
			// Socket file existed and was a socket prior to us binding to it.
			if beforeBindSt.Sys() != nil && afterBindSt.Sys() != nil {
				beforeSt, beforeStOk := beforeBindSt.Sys().(*syscall.Stat_t)
				afterSt, afterStOk := afterBindSt.Sys().(*syscall.Stat_t)
				if beforeStOk && afterStOk && beforeSt.Dev == afterSt.Dev && beforeSt.Ino == afterSt.Ino {
					// Socket file is the same before and after binding, so we should not consider ourselves
					// the owner of it.
					ownUDS = false
				}
			}
		}
		if ownUDS {
			log.Infof("Bound on socket file %s which we own. As such, this socket file will be deleted on server shutdown.", conf.MetricServer)
			m.udsPath = conf.MetricServer
			defer os.Remove(m.udsPath)
			os.Chmod(m.udsPath, 0777)
		} else {
			log.Infof("Bound on socket file %s which existed prior to this server's existence. As such, it will not be deleted on server shutdown.", conf.MetricServer)
		}
	} else {
		if strings.HasPrefix(conf.MetricServer, ":") {
			log.Warningf("Binding on all interfaces. This will allow anyone to list all containers on your machine!")
		}
		if listener, listenErr = (&net.ListenConfig{}).Listen(ctx, "tcp", conf.MetricServer); listenErr != nil {
			return fmt.Errorf("cannot listen on TCP address %q: %w", conf.MetricServer, listenErr)
		}
	}

	mux := http.NewServeMux()
	mux.HandleFunc("/runsc-metrics/healthcheck", logRequest(m.serveHealthCheck))
	mux.HandleFunc("/runsc-metrics/pid", logRequest(m.servePID))
	if m.exposeProfileEndpoints {
		log.Warningf("Profiling HTTP endpoints are exposed; this should only be used for development!")
		mux.HandleFunc("/runsc-metrics/profile-cpu", logRequest(m.profileCPU))
		mux.HandleFunc("/runsc-metrics/profile-heap", logRequest(m.profileHeap))
	} else {
		// Disable memory profiling, since we don't expose it.
		runtime.MemProfileRate = 0
	}
	mux.HandleFunc("/metrics", logRequest(m.serveMetrics))
	mux.HandleFunc("/", logRequest(m.serveIndex))
	m.srv.Handler = mux
	m.srv.ReadTimeout = httpTimeout
	m.srv.WriteTimeout = httpTimeout
	if err := m.startVerifyLoop(ctx); err != nil {
		return fmt.Errorf("cannot start background loop: %w", err)
	}
	if m.pidFile != "" {
		if err := ioutil.WriteFile(m.pidFile, []byte(fmt.Sprintf("%d", m.pid)), 0644); err != nil {
			return fmt.Errorf("cannot write PID to file %q: %w", m.pidFile, err)
		}
		defer os.Remove(m.pidFile)
		log.Infof("Wrote PID %d to file %v.", m.pid, m.pidFile)
	}

	// If not modified by the user from the environment, set the Go GC percentage lower than default.
	if _, hasEnv := os.LookupEnv("GOGC"); !hasEnv {
		debug.SetGCPercent(40)
	}

	// Run GC immediately to get rid of all the initialization-related memory bloat and start from
	// a clean slate.
	state.Release()
	runtime.GC()

	// Initialization complete.
	log.Infof("Server serving on %s for root directory %s.", conf.MetricServer, conf.RootDir)
	serveErr := m.srv.Serve(listener)
	log.Infof("Server has stopped accepting requests.")
	m.mu.Lock()
	defer m.mu.Unlock()
	if serveErr != nil {
		if serveErr == http.ErrServerClosed {
			return nil
		}
		return fmt.Errorf("cannot serve on address %s: %w", conf.MetricServer, serveErr)
	}
	// Per documentation, http.Server.Serve can never return a nil error, so this is not a success.
	return fmt.Errorf("HTTP server Serve() did not return expected error")
}
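
// Example usage (editor's illustrative sketch, not part of the original file):
// a caller such as a `runsc metric-server` subcommand might wire an already
// parsed *config.Config into Server and block on Run until the process is
// signalled. Server, Run, ExporterPrefix and PIDFile come from this package;
// the surrounding helper, variable names, and prefix value are assumptions
// made purely for illustration.
//
//	func runMetricServer(conf *config.Config) error {
//		srv := metricserver.Server{
//			Config:         conf,     // previously-parsed runsc configuration (assumed to exist)
//			ExporterPrefix: "runsc_", // prepended to every exported metric name
//			PIDFile:        "",       // optional: path to write this process's PID to
//		}
//		// Run binds to conf.MetricServer (TCP address or Unix socket path) and
//		// blocks until the server exits, e.g. on SIGTERM.
//		return srv.Run(context.Background())
//	}
//
// Once running, Prometheus scrapes /metrics; the optional query parameters
// "runsc-sandbox-metrics-filter" and "runsc-capability-filter" (regular
// expressions, as handled in serveMetrics above) restrict which metrics and
// capabilities are exported.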