gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/metric/profiling_metric.go

// Copyright 2023 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package metric

import (
	"bytes"
	"errors"
	"fmt"
	"hash"
	"hash/adler32"
	"io"
	"os"
	"runtime"
	"strings"
	"time"

	"google.golang.org/protobuf/encoding/protojson"
	"gvisor.dev/gvisor/pkg/atomicbitops"
	"gvisor.dev/gvisor/pkg/log"
	"gvisor.dev/gvisor/pkg/prometheus"
)

const (
	// snapshotBufferSize is the number of snapshots within one item of the
	// ringbuffer. Increasing this number means less context-switching
	// overhead between collector and writer goroutines, but worse time
	// precision, as the precise time is only refreshed once per this many
	// snapshots.
	snapshotBufferSize = 1024
	// snapshotRingbufferSize is the number of items in the ringbuffer.
	// Increasing this number means the writer has more slack to catch up
	// if it falls behind, but it also means that the collector may need
	// to wait for longer intervals when the writer does fall behind,
	// adding more variance to the time gaps between collections.
	snapshotRingbufferSize = 128
	// MetricsPrefix is prepended before every metrics line.
	MetricsPrefix = "GVISOR_METRICS\t"
	// MetricsHashIndicator is prepended before the hash of the metrics
	// data at the end of the metrics stream.
	MetricsHashIndicator = "ADLER32\t"
	// TimeColumn is the column header for the time column.
	TimeColumn = "Time (ns)"
	// MetricsMetaIndicator is prepended before every metrics metadata line
	// after MetricsPrefix.
	MetricsMetaIndicator = "META\t"
	// MetricsStartTimeIndicator is prepended before the start time of the
	// metrics collection.
	MetricsStartTimeIndicator = "START_TIME\t"
)

var (
	// profilingMetricsStarted indicates whether StartProfilingMetrics has
	// been called.
	profilingMetricsStarted atomicbitops.Bool
	// stopProfilingMetrics is used to signal to the profiling metrics
	// goroutine to stop recording and writing metrics.
	stopProfilingMetrics atomicbitops.Bool
	// doneProfilingMetrics is used to signal that the profiling metrics
	// goroutines are finished.
	doneProfilingMetrics chan bool
	// definedProfilingMetrics is the set of metrics known to be created for
	// profiling (see condmetric_profiling.go).
	definedProfilingMetrics []string
)

// snapshots is used as temporary storage of metric data
// before it's written to the writer.
type snapshots struct {
	numMetrics int
	// startTime is the time at which collection started in nanoseconds.
	startTime int64
	// ringbuffer is used to store metric data.
	ringbuffer [][]uint64
	// curWriterIndex is the ringbuffer index currently being read by the
	// writer. It should not be used by the collector.
	curWriterIndex atomicbitops.Int32
}
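// Each ringbuffer item above is a flat []uint64: every snapshot occupies
// numMetrics+1 consecutive entries, the first holding the timestamp
// (nanoseconds since startTime) and the rest holding the metric values in
// column order. snapshotOffsets is a hypothetical helper, not part of the
// original file and unused by the collector or writer; it only makes that
// layout explicit.
func snapshotOffsets(numMetrics, snapshotIdx, metricIdx int) (timestampIdx, valueIdx int) {
	numEntries := numMetrics + 1 // One extra entry for the timestamp.
	base := snapshotIdx * numEntries
	return base, base + 1 + metricIdx
}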
// writeReq is the message sent from the collector to the writer.
type writeReq struct {
	ringbufferIdx int
	// numLines indicates how many data lines are filled in the buffer.
	numLines int
}

// ProfilingMetricsWriter is the interface for profiling metrics sinks.
type ProfilingMetricsWriter interface {
	// WriteString from the io.StringWriter interface.
	io.StringWriter

	// Close closes the writer.
	Close() error
}

// ProfilingMetricsOptions is the set of options used to profile metrics.
type ProfilingMetricsOptions[T ProfilingMetricsWriter] struct {
	// Sink is the sink to write the profiling metrics data to.
	Sink T

	// Lossy specifies whether the sink is lossy, i.e. data may be dropped
	// due to too large a logging volume. In this case, data integrity is
	// desirable at the expense of extra CPU cost at data-writing time.
	// The data will be prefixed with `MetricsPrefix` and the hash of the
	// data will be appended at the end.
	Lossy bool

	// Metrics is the comma-separated list of metrics to profile.
	Metrics string

	// Rate is the rate at which the metrics are collected.
	Rate time.Duration
}

// StartProfilingMetrics checks the ProfilingMetrics runsc flags and creates
// goroutines responsible for outputting the profiling metric data.
//
// Preconditions:
// - All metrics are registered.
// - Initialize/Disable has been called.
func StartProfilingMetrics[T ProfilingMetricsWriter](opts ProfilingMetricsOptions[T]) error {
	if !initialized.Load() {
		// Wait for initialization to complete to make sure that all
		// metrics are registered.
		return errors.New("metric initialization is not complete")
	}

	var values []func(fieldValues ...*FieldValue) uint64
	var headers []string
	var columnHeaders strings.Builder
	columnHeaders.WriteString(TimeColumn)
	numMetrics := 0

	if len(opts.Metrics) > 0 {
		metrics := strings.Split(opts.Metrics, ",")
		numMetrics = len(metrics)

		for _, name := range metrics {
			name := strings.TrimSpace(name)
			m, ok := allMetrics.uint64Metrics[name]
			if !ok {
				return fmt.Errorf("given profiling metric name '%s' does not correspond to a registered Uint64 metric", name)
			}
			if len(m.fields) > 0 {
				// TODO(b/240280155): Add support for field values.
				return fmt.Errorf("will not profile metric '%s' because it has metric fields which are not supported", name)
			}
			var metricMetadataHeader strings.Builder
			metricMetadataHeader.WriteString(MetricsMetaIndicator)
			metricMetadataHeader.WriteString(name)
			metricMetadataHeader.WriteRune('\t')
			metricMetadata, err := protojson.MarshalOptions{Multiline: false}.Marshal(m.metadata)
			if err != nil {
				return fmt.Errorf("failed to marshal metric schema for metric %q: %w", name, err)
			}
			metricMetadataHeader.Write(metricMetadata)
			headers = append(headers, metricMetadataHeader.String())
			columnHeaders.WriteRune('\t')
			columnHeaders.WriteString(name)
			values = append(values, m.value)
		}
		if opts.Lossy {
			columnHeaders.WriteString("\tChecksum")
		}
	} else {
		if len(definedProfilingMetrics) > 0 {
			return fmt.Errorf("a value for --profiling-metrics was not specified; consider using a subset of '--profiling-metrics=%s'", strings.Join(definedProfilingMetrics, ","))
		}
		return fmt.Errorf("a value for --profiling-metrics was not specified; also no conditionally compiled metrics found, consider compiling runsc with --go_tag=condmetric_profiling")
	}
	headers = append(
		headers,
		fmt.Sprintf("%s%d", MetricsStartTimeIndicator, time.Now().UnixNano()),
		columnHeaders.String(),
	)

	if !profilingMetricsStarted.CompareAndSwap(false, true) {
		return errors.New("profiling metrics have already been started")
	}
	s := snapshots{
		numMetrics: numMetrics,
		ringbuffer: make([][]uint64, snapshotRingbufferSize),
		// curWriterIndex is initialized to a valid index so that the
		// collector cannot use up all indices before the writer even has
		// a chance to start (as unlikely as that is).
		curWriterIndex: atomicbitops.FromInt32(snapshotRingbufferSize - 1),
	}
	for i := 0; i < snapshotRingbufferSize; i++ {
		s.ringbuffer[i] = make([]uint64, snapshotBufferSize*(numMetrics+1))
	}

	stopProfilingMetrics = atomicbitops.FromBool(false)
	doneProfilingMetrics = make(chan bool, 1)
	writeCh := make(chan writeReq, snapshotRingbufferSize)
	s.startTime = time.Now().UnixNano()
	cheapStartTime := CheapNowNano()
	go collectProfilingMetrics(&s, values, cheapStartTime, opts.Rate, writeCh)
	if opts.Lossy {
		lossySink := newLossyBufferedWriter(opts.Sink)
		go writeProfilingMetrics[*lossyBufferedWriter[T]](lossySink, &s, headers, writeCh)
	} else {
		bufferedSink := newBufferedWriter(opts.Sink)
		go writeProfilingMetrics[*bufferedWriter[T]](bufferedSink, &s, headers, writeCh)
	}
	log.Infof("Profiling metrics started.")

	return nil
}

// collectProfilingMetrics will send metrics to the writeCh until it receives a
// stop signal via the stopProfilingMetrics flag.
func collectProfilingMetrics(s *snapshots, values []func(fieldValues ...*FieldValue) uint64, cheapStartTime int64, profilingRate time.Duration, writeCh chan<- writeReq) {
	defer close(writeCh)

	numEntries := s.numMetrics + 1 // to account for the timestamp
	ringbufferIdx := 0
	curSnapshot := 0

	// If we collect faster than the writer can keep up, we back off.
	// The backoff factor starts small but increases exponentially
	// each time we find that we are still faster than the writer.
	const (
		// How much slower than the profiling rate we sleep for, as a
		// multiplier for the profiling rate.
		initialBackoffFactor = 1.0

		// The exponential factor by which the backoff factor increases.
		backoffFactorGrowth = 1.125

		// The maximum backoff factor, i.e. the maximum multiplier of
		// the profiling rate for which we sleep.
		backoffFactorMax = 256.0
	)
	backoffFactor := initialBackoffFactor

	// To keep track of time cheaply, we use `CheapNowNano`.
	// However, this can drift as it has poor precision.
	// To get something more precise, we periodically call `time.Now`
	// and `CheapNowNano` and use these two variables to track both.
	// This way, we can compute a more precise time by using
	// `CheapNowNano() - cheapTime + preciseTime`.
	preciseTime := s.startTime
	cheapTime := cheapStartTime

	stopCollecting := false
	for nextCollection := s.startTime; !stopCollecting; nextCollection += profilingRate.Nanoseconds() {

		// For small durations, just spin. Otherwise sleep.
		for {
			const (
				wakeUpNanos   = 10
				spinMaxNanos  = 250
				yieldMaxNanos = 1_000
			)
			now := CheapNowNano() - cheapTime + preciseTime
			nanosToNextCollection := nextCollection - now
			if nanosToNextCollection <= 0 {
				// Collect now.
				break
			}
			if nanosToNextCollection < spinMaxNanos {
				continue // Spin.
			}
			if nanosToNextCollection < yieldMaxNanos {
				// Yield then spin.
				runtime.Gosched()
				continue
			}
			// Sleep.
			time.Sleep(time.Duration(nanosToNextCollection-wakeUpNanos) * time.Nanosecond)
		}

		if stopProfilingMetrics.Load() {
			stopCollecting = true
			// Collect one last time before stopping.
		}

		collectStart := CheapNowNano() - cheapTime + preciseTime
		timestamp := time.Duration(collectStart - s.startTime)
		base := curSnapshot * numEntries
		ringBuf := s.ringbuffer[ringbufferIdx]
		ringBuf[base] = uint64(timestamp)
		for i := 1; i < numEntries; i++ {
			ringBuf[base+i] = values[i-1]()
		}
		curSnapshot++

		if curSnapshot == snapshotBufferSize {
			writeCh <- writeReq{ringbufferIdx: ringbufferIdx, numLines: curSnapshot}
			curSnapshot = 0
			// Block until the writer indicates that this part of the ringbuffer
			// is available for writing.
			for ringbufferIdx = (ringbufferIdx + 1) % snapshotRingbufferSize; ringbufferIdx == int(s.curWriterIndex.Load()); {
				// Going too fast, stop collecting for a bit.
				backoffSleep := profilingRate * time.Duration(backoffFactor)
				log.Warningf("Profiling metrics collector exhausted the entire ringbuffer... backing off for %v to let writer catch up.", backoffSleep)
				time.Sleep(backoffSleep)
				backoffFactor = min(backoffFactor*backoffFactorGrowth, backoffFactorMax)
			}
			// Refresh precise time.
			preciseTime = time.Now().UnixNano()
			cheapTime = CheapNowNano()
		}
	}
	if curSnapshot != 0 {
		writeCh <- writeReq{ringbufferIdx: ringbufferIdx, numLines: curSnapshot}
	}
}

// bufferedMetricsWriter is a ProfilingMetricsWriter that buffers data
// before writing it to some underlying writer.
type bufferedMetricsWriter interface {
	// We inherit from the ProfilingMetricsWriter interface.
	// Note however that strings passed to WriteString should *not* contain
	// any newline character; newlines are only produced via NewLine.
	ProfilingMetricsWriter

	// NewLine writes a newline character to the buffer.
	// The writer may decide to flush the buffer at this point.
	NewLine()

	// Flush flushes the buffer to the underlying writer.
	Flush()
}

const (
	// Buffer size reasonable to use for a single line of metric data.
	lineBufSize = 4 * 1024 // 4 KiB

	// Buffer size for a buffered write to an underlying sink.
	bufSize = 984 * 1024 // 984 KiB

	// Number of lines to buffer before flushing to the underlying sink
	// by a line-buffered writer.
	bufferedLines = bufSize / lineBufSize
)
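// writerSink is a hypothetical sketch, not part of the original file: it shows
// that any type with a WriteString and a Close method satisfies
// ProfilingMetricsWriter, so sinks other than *os.File (e.g. a pipe or an
// in-memory buffer used in tests) could be handed to StartProfilingMetrics.
// The name writerSink and the wrapped io.WriteCloser are assumptions made for
// illustration only.
type writerSink struct {
	w io.WriteCloser
}

// WriteString forwards the string to the wrapped writer.
func (s writerSink) WriteString(str string) (int, error) {
	return io.WriteString(s.w, str)
}

// Close closes the wrapped writer.
func (s writerSink) Close() error {
	return s.w.Close()
}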
// bufferedWriter is a buffered metrics writer that wraps an underlying
// ProfilingMetricsWriter.
// It implements `bufferedMetricsWriter`.
type bufferedWriter[T ProfilingMetricsWriter] struct {
	buf        bytes.Buffer
	underlying T
}

func newBufferedWriter[T ProfilingMetricsWriter](underlying T) *bufferedWriter[T] {
	w := &bufferedWriter[T]{underlying: underlying}
	w.buf.Grow(bufSize + lineBufSize)
	return w
}

// WriteString implements bufferedMetricsWriter.WriteString.
func (w *bufferedWriter[T]) WriteString(s string) (int, error) {
	return w.buf.WriteString(s)
}

// NewLine implements bufferedMetricsWriter.NewLine.
func (w *bufferedWriter[T]) NewLine() {
	w.buf.WriteString("\n")
	if w.buf.Len() >= bufSize {
		w.Flush()
	}
}

// Flush implements bufferedMetricsWriter.Flush.
func (w *bufferedWriter[T]) Flush() {
	w.underlying.WriteString(w.buf.String())
	w.buf.Reset()
}

// Close implements bufferedMetricsWriter.Close.
func (w *bufferedWriter[T]) Close() error {
	w.Flush()
	return w.underlying.Close()
}

// lossyBufferedWriter writes to an underlying ProfilingMetricsWriter
// and buffers data on a per-line basis. It adds a prefix to every line,
// and keeps track of the checksum of the data it has written (which is then
// also written to the underlying writer on `Close()`).
// The checksum covers all of the per-line data written after the line prefix,
// including the newline character of these lines, with the exception of
// the checksum data line itself.
// All lines are also checksummed individually, with the checksum covering
// the contents of the line after the line prefix but before the tab and
// line checksum itself at the end of the line.
// `lossyBufferedWriter` implements `bufferedMetricsWriter`.
type lossyBufferedWriter[T ProfilingMetricsWriter] struct {
	lineBuf       bytes.Buffer
	flushBuf      bytes.Buffer
	lineHasher    hash.Hash32
	overallHasher hash.Hash32
	lines         int
	longestLine   int
	underlying    T
}

// newLossyBufferedWriter creates a new lossyBufferedWriter.
func newLossyBufferedWriter[T ProfilingMetricsWriter](underlying T) *lossyBufferedWriter[T] {
	w := &lossyBufferedWriter[T]{
		underlying:    underlying,
		lineHasher:    adler32.New(),
		overallHasher: adler32.New(),
		longestLine:   lineBufSize,
	}
	w.lineBuf.Grow(lineBufSize)

	// `lineBufSize + 1` to account for the newline at the end of each line.
	// `+ 2` to account for the newlines at the beginning and end of each flush.
	w.flushBuf.Grow((lineBufSize+1)*bufferedLines + 2)

	w.flushBuf.WriteString("\n")
	return w
}

// WriteString implements bufferedMetricsWriter.WriteString.
func (w *lossyBufferedWriter[T]) WriteString(s string) (int, error) {
	return w.lineBuf.WriteString(s)
}

// Flush implements bufferedMetricsWriter.Flush.
func (w *lossyBufferedWriter[T]) Flush() {
	if w.lines > 0 {
		// Ensure that we write a complete line atomically, as this
		// may get parsed while being mixed with other logs that may not
		// have clean line endings at the time we print this.
		w.flushBuf.WriteString("\n")
		w.underlying.WriteString(w.flushBuf.String())
		if f, isFile := any(w.underlying).(*os.File); isFile {
			// If we're dealing with a file, also call `fsync(2)`.
			f.Sync()
		}
		w.flushBuf.Reset()
		w.flushBuf.WriteString("\n")
		w.lines = 0
	}
}

// NewLine implements bufferedMetricsWriter.NewLine.
func (w *lossyBufferedWriter[T]) NewLine() {
	if lineLen := w.lineBuf.Len(); lineLen > w.longestLine {
		wantTotalSize := (lineLen+1)*bufferedLines + 2
		if growBy := wantTotalSize - w.flushBuf.Len(); growBy > 0 {
			w.flushBuf.Grow(growBy)
		}
		w.longestLine = lineLen
	}
	line := w.lineBuf.String()
	w.lineHasher.Reset()
	w.lineHasher.Write([]byte(line))
	lineHash := w.lineHasher.Sum32()
	w.lineBuf.Reset()
	w.flushBuf.WriteString(MetricsPrefix)
	beforeLineIndex := w.flushBuf.Len()
	w.flushBuf.WriteString(line)
	w.flushBuf.WriteString("\t0x")
	prometheus.WriteHex(&w.flushBuf, uint64(lineHash))
	w.flushBuf.WriteString("\n")
	afterLineIndex := w.flushBuf.Len()
	// We ignore the effects that partial writes on the underlying writer
	// would have on the hash computation here.
	// This is OK because the goal of this writer is speed over correctness,
	// and correctness is enforced by the reader of this data checking the
	// hash at the end.
	w.overallHasher.Write(w.flushBuf.Bytes()[beforeLineIndex:afterLineIndex])
	w.lineBuf.Reset()
	w.lines++
	if w.lines >= bufferedLines || w.flushBuf.Len() >= bufSize {
		w.Flush()
	}
}

// Close implements bufferedMetricsWriter.Close.
// It writes the checksum of the data written to the underlying writer.
func (w *lossyBufferedWriter[T]) Close() error {
	w.Flush()
	w.flushBuf.WriteString(MetricsPrefix)
	w.flushBuf.WriteString(MetricsHashIndicator)
	w.flushBuf.WriteString("0x")
	prometheus.WriteHex(&w.flushBuf, uint64(w.overallHasher.Sum32()))
	w.flushBuf.WriteString("\n")
	w.underlying.WriteString(w.flushBuf.String())
	w.overallHasher.Reset()
	w.lineBuf.Reset()
	w.flushBuf.Reset()
	return w.underlying.Close()
}
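// verifyLossyOutput is a hypothetical sketch, not part of the original file,
// of how a consumer of Lossy output might validate it after the fact. It
// assumes `data` holds the full captured log and that prometheus.WriteHex
// emits plain hex digits. Lines without MetricsPrefix (e.g. interleaved,
// unrelated log output) are ignored; every prefixed line carries a trailing
// per-line Adler-32 checksum, and the final MetricsHashIndicator line carries
// the checksum of all prefixed lines (including their per-line checksums and
// newlines).
func verifyLossyOutput(data string) error {
	overallHasher := adler32.New()
	lineHasher := adler32.New()
	for _, rawLine := range strings.Split(data, "\n") {
		if !strings.HasPrefix(rawLine, MetricsPrefix) {
			continue // Not a metrics line.
		}
		payload := rawLine[len(MetricsPrefix):]
		if strings.HasPrefix(payload, MetricsHashIndicator) {
			// Trailer: compare the overall checksum accumulated so far.
			var want uint32
			if _, err := fmt.Sscanf(payload[len(MetricsHashIndicator):], "0x%x", &want); err != nil {
				return fmt.Errorf("malformed overall checksum line %q: %w", rawLine, err)
			}
			if got := overallHasher.Sum32(); got != want {
				return fmt.Errorf("overall checksum mismatch: got 0x%x, want 0x%x", got, want)
			}
			return nil
		}
		tab := strings.LastIndex(payload, "\t")
		if tab < 0 {
			return fmt.Errorf("malformed metrics line %q", rawLine)
		}
		var want uint32
		if _, err := fmt.Sscanf(payload[tab+1:], "0x%x", &want); err != nil {
			return fmt.Errorf("malformed line checksum in %q: %w", rawLine, err)
		}
		lineHasher.Reset()
		lineHasher.Write([]byte(payload[:tab]))
		if got := lineHasher.Sum32(); got != want {
			return fmt.Errorf("line checksum mismatch in %q", rawLine)
		}
		// The overall checksum covers the full payload (line, tab and line
		// checksum) plus the newline, but not the trailer line itself.
		overallHasher.Write([]byte(payload))
		overallHasher.Write([]byte("\n"))
	}
	return errors.New("no overall checksum (ADLER32) line found")
}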
// writeProfilingMetrics will write to the ProfilingMetricsWriter on every
// request via writeReqs, until writeReqs is closed.
func writeProfilingMetrics[T bufferedMetricsWriter](sink T, s *snapshots, headers []string, writeReqs <-chan writeReq) {
	numEntries := s.numMetrics + 1
	for _, header := range headers {
		sink.WriteString(header)
		sink.NewLine()
	}
	for req := range writeReqs {
		s.curWriterIndex.Store(int32(req.ringbufferIdx))
		ringBuf := s.ringbuffer[req.ringbufferIdx]
		for i := 0; i < req.numLines; i++ {
			base := i * numEntries
			// Write the timestamp first.
			prometheus.WriteInteger(sink, int64(ringBuf[base]))
			// Then everything else.
			for j := 1; j < numEntries; j++ {
				sink.WriteString("\t")
				prometheus.WriteInteger(sink, int64(ringBuf[base+j]))
			}
			sink.NewLine()
		}
	}
	sink.Close()

	doneProfilingMetrics <- true
	close(doneProfilingMetrics)
	profilingMetricsStarted.Store(false)
}

// StopProfilingMetrics stops the profiling metrics goroutines. Call it to make
// sure all metric data has been flushed.
// Note that calling this function prior to StartProfilingMetrics has no effect.
func StopProfilingMetrics() {
	if !profilingMetricsStarted.Load() {
		return
	}
	if stopProfilingMetrics.CompareAndSwap(false, true) {
		<-doneProfilingMetrics
	}
	// If the CAS fails, this means the signal was already sent,
	// so don't wait on doneProfilingMetrics.
}
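// exampleProfileToFile is a hypothetical usage sketch, not part of the
// original file: it shows how a caller might wire profiling metrics to a file
// sink and later stop collection. The file path and the metric names
// ("/example/foo", "/example/bar") are made-up placeholders; real callers pass
// names of registered Uint64 metrics without fields.
func exampleProfileToFile() error {
	f, err := os.Create("/tmp/profiling_metrics.tsv")
	if err != nil {
		return err
	}
	// StartProfilingMetrics takes ownership of the sink; the writer goroutine
	// closes it once profiling stops.
	if err := StartProfilingMetrics(ProfilingMetricsOptions[*os.File]{
		Sink:    f,
		Lossy:   false, // Plain TSV output; set to true for prefixed, checksummed lines.
		Metrics: "/example/foo,/example/bar",
		Rate:    time.Millisecond, // Collect a snapshot every millisecond.
	}); err != nil {
		f.Close()
		return err
	}
	// ... run the workload being profiled ...
	StopProfilingMetrics() // Blocks until all buffered data has been flushed.
	return nil
}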