gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/metric/profiling_metric.go

     1  // Copyright 2023 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package metric
    16  
    17  import (
    18  	"bytes"
    19  	"errors"
    20  	"fmt"
    21  	"hash"
    22  	"hash/adler32"
    23  	"io"
    24  	"os"
    25  	"runtime"
    26  	"strings"
    27  	"time"
    28  
    29  	"google.golang.org/protobuf/encoding/protojson"
    30  	"gvisor.dev/gvisor/pkg/atomicbitops"
    31  	"gvisor.dev/gvisor/pkg/log"
    32  	"gvisor.dev/gvisor/pkg/prometheus"
    33  )
    34  
    35  const (
    36  	// snapshotBufferSize is the number of snapshots within one item of the
    37  	// ringbuffer. Increasing this number means less context-switching
    38  	// overhead between collector and writer goroutines, but worse time
    39  	// precision, as the precise time is refreshed every this many snapshots.
    40  	snapshotBufferSize = 1024
    41  	// snapshotRingbufferSize is the number of items in the ringbuffer.
    42  	// Increasing this number means the writer has more slack to catch up
    43  	// if it falls behind, but it also means that the collector may need
    44  	// to wait for longer intervals when the writer does fall behind,
    45  	// adding more variance to the time gaps between collections.
    46  	snapshotRingbufferSize = 128
    47  	// MetricsPrefix is prepended before every metrics line.
    48  	MetricsPrefix = "GVISOR_METRICS\t"
    49  	// MetricsHashIndicator is prepended before the hash of the metrics
    50  	// data at the end of the metrics stream.
    51  	MetricsHashIndicator = "ADLER32\t"
    52  	// TimeColumn is the column header for the time column.
    53  	TimeColumn = "Time (ns)"
    54  	// MetricsMetaIndicator is prepended before every metrics metadata line
    55  	// after metricsPrefix.
    56  	MetricsMetaIndicator = "META\t"
    57  	// MetricsStartTimeIndicator is prepended before the start time of the
    58  	// metrics collection.
    59  	MetricsStartTimeIndicator = "START_TIME\t"
    60  )
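
// When profiling starts, the output stream begins with header lines: one
// "META\t<metric name>\t<metric metadata JSON>" line per profiled metric, a
// "START_TIME\t<unix nanoseconds>" line, and a tab-separated column header
// line. Data lines then follow, one per snapshot. A sketch of a data line,
// assuming a lossy sink (the "GVISOR_METRICS\t" prefix and the checksums are
// only added in that case):
//
//	GVISOR_METRICS\t<time ns>\t<metric 1>\t...\t<metric N>\t0x<adler32 of the preceding fields>
//
// and the lossy stream ends with an overall checksum line:
//
//	GVISOR_METRICS\tADLER32\t0x<adler32 over all per-line data after the prefix>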
    61  
    62  var (
    63  	// profilingMetricsStarted indicates whether StartProfilingMetrics has
    64  	// been called.
    65  	profilingMetricsStarted atomicbitops.Bool
    66  	// stopProfilingMetrics is used to signal to the profiling metrics
    67  	// goroutine to stop recording and writing metrics.
    68  	stopProfilingMetrics atomicbitops.Bool
    69  	// doneProfilingMetrics is used to signal that the profiling metrics
    70  	// goroutines are finished.
    71  	doneProfilingMetrics chan bool
    72  	// definedProfilingMetrics is the set of metrics known to be created for
    73  	// profiling (see condmetric_profiling.go).
    74  	definedProfilingMetrics []string
    75  )
    76  
    77  // snapshots is used as temporary storage of metric data
    78  // before it's written to the writer.
    79  type snapshots struct {
    80  	numMetrics int
    81  	// startTime is the time at which collection started in nanoseconds.
    82  	startTime int64
    83  	// ringbuffer is used to store metric data.
    84  	ringbuffer [][]uint64
    85  	// curWriterIndex is the ringbuffer index currently being read by the
    86  	// writer. It should not be used by the collector.
    87  	curWriterIndex atomicbitops.Int32
    88  }
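
// Each ringbuffer item is a flat []uint64 holding snapshotBufferSize
// snapshots, with each snapshot occupying (numMetrics + 1) consecutive
// entries. A sketch of the layout filled in by collectProfilingMetrics below:
//
//	base := snapshotIndex * (numMetrics + 1)
//	item[base]   // timestamp, in nanoseconds since startTime
//	item[base+i] // value of the i-th profiled metric, for 1 <= i <= numMetrics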
    89  
    90  // writeReq is the message sent from the collector to the writer.
    91  type writeReq struct {
    92  	ringbufferIdx int
    93  	// numLines indicates how many data lines are filled in the buffer.
    94  	numLines int
    95  }
    96  
    97  // ProfilingMetricsWriter is the interface for profiling metrics sinks.
    98  type ProfilingMetricsWriter interface {
    99  	// WriteString from the io.StringWriter interface.
   100  	io.StringWriter
   101  
   102  	// Close closes the writer.
   103  	Close() error
   104  }
   105  
   106  // ProfilingMetricsOptions is the set of options to profile metrics.
   107  type ProfilingMetricsOptions[T ProfilingMetricsWriter] struct {
   108  	// Sink is the sink to write the profiling metrics data to.
   109  	Sink T
   110  
   111  	// Lossy specifies whether the sink is lossy, i.e. data may be dropped due to
   112  	// an excessive logging volume. In this case, data integrity is desirable at the
   113  	// expense of extra CPU cost at data-writing time. The data will be prefixed
   114  	// with `MetricsPrefix` and the hash of the data will be appended at the end.
   115  	Lossy bool
   116  
   117  	// Metrics is the comma-separated list of metrics to profile.
   118  	Metrics string
   119  
   120  	// Rate is the rate at which the metrics are collected.
   121  	Rate time.Duration
   122  }
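
// A minimal usage sketch (hypothetical caller; the metric names below are
// placeholders). An *os.File can serve as the Sink, since it implements both
// WriteString and Close:
//
//	f, err := os.Create("/tmp/profiling_metrics.tsv")
//	if err != nil {
//		return err
//	}
//	if err := StartProfilingMetrics(ProfilingMetricsOptions[*os.File]{
//		Sink:    f,
//		Lossy:   false,
//		Metrics: "some_metric,another_metric", // hypothetical metric names
//		Rate:    time.Millisecond,
//	}); err != nil {
//		return err
//	}
//	defer StopProfilingMetrics() // flushes remaining data and closes the sink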
   123  
   124  // StartProfilingMetrics checks the ProfilingMetrics runsc flags and creates
   125  // goroutines responsible for outputting the profiling metric data.
   126  //
   127  // Preconditions:
   128  //   - All metrics are registered.
   129  //   - Initialize/Disable has been called.
   130  func StartProfilingMetrics[T ProfilingMetricsWriter](opts ProfilingMetricsOptions[T]) error {
   131  	if !initialized.Load() {
   132  		// Wait for initialization to complete to make sure that all
   133  		// metrics are registered.
   134  		return errors.New("metric initialization is not complete")
   135  	}
   136  
   137  	var values []func(fieldValues ...*FieldValue) uint64
   138  	var headers []string
   139  	var columnHeaders strings.Builder
   140  	columnHeaders.WriteString(TimeColumn)
   141  	numMetrics := 0
   142  
   143  	if len(opts.Metrics) > 0 {
   144  		metrics := strings.Split(opts.Metrics, ",")
   145  		numMetrics = len(metrics)
   146  
   147  		for _, name := range metrics {
   148  			name := strings.TrimSpace(name)
   149  			m, ok := allMetrics.uint64Metrics[name]
   150  			if !ok {
   151  				return fmt.Errorf("given profiling metric name '%s' does not correspond to a registered Uint64 metric", name)
   152  			}
   153  			if len(m.fields) > 0 {
   154  				// TODO(b/240280155): Add support for field values.
   155  				return fmt.Errorf("will not profile metric '%s' because it has metric fields which are not supported", name)
   156  			}
   157  			var metricMetadataHeader strings.Builder
   158  			metricMetadataHeader.WriteString(MetricsMetaIndicator)
   159  			metricMetadataHeader.WriteString(name)
   160  			metricMetadataHeader.WriteRune('\t')
   161  			metricMetadata, err := protojson.MarshalOptions{Multiline: false}.Marshal(m.metadata)
   162  			if err != nil {
   163  				return fmt.Errorf("failed to marshal metric schema for metric %q: %w", name, err)
   164  			}
   165  			metricMetadataHeader.Write(metricMetadata)
   166  			headers = append(headers, metricMetadataHeader.String())
   167  			columnHeaders.WriteRune('\t')
   168  			columnHeaders.WriteString(name)
   169  			values = append(values, m.value)
   170  		}
   171  		if opts.Lossy {
   172  			columnHeaders.WriteString("\tChecksum")
   173  		}
   174  	} else {
   175  		if len(definedProfilingMetrics) > 0 {
   176  			return fmt.Errorf("a value for --profiling-metrics was not specified; consider using a subset of '--profiling-metrics=%s'", strings.Join(definedProfilingMetrics, ","))
   177  		}
   178  		return fmt.Errorf("a value for --profiling-metrics was not specified; also no conditionally compiled metrics found, consider compiling runsc with --go_tag=condmetric_profiling")
   179  	}
   180  	headers = append(
   181  		headers,
   182  		fmt.Sprintf("%s%d", MetricsStartTimeIndicator, time.Now().UnixNano()),
   183  		columnHeaders.String(),
   184  	)
   185  
   186  	if !profilingMetricsStarted.CompareAndSwap(false, true) {
   187  		return errors.New("profiling metrics have already been started")
   188  	}
   189  	s := snapshots{
   190  		numMetrics: numMetrics,
   191  		ringbuffer: make([][]uint64, snapshotRingbufferSize),
   192  		// curWriterIndex is initialized to a valid index so that the
   193  		// collector cannot use up all indices before the writer even has
   194  		// a chance to start (as unlikely as that is).
   195  		curWriterIndex: atomicbitops.FromInt32(snapshotRingbufferSize - 1),
   196  	}
   197  	for i := 0; i < snapshotRingbufferSize; i++ {
   198  		s.ringbuffer[i] = make([]uint64, snapshotBufferSize*(numMetrics+1))
   199  	}
   200  
   201  	stopProfilingMetrics = atomicbitops.FromBool(false)
   202  	doneProfilingMetrics = make(chan bool, 1)
   203  	writeCh := make(chan writeReq, snapshotRingbufferSize)
   204  	s.startTime = time.Now().UnixNano()
   205  	cheapStartTime := CheapNowNano()
   206  	go collectProfilingMetrics(&s, values, cheapStartTime, opts.Rate, writeCh)
   207  	if opts.Lossy {
   208  		lossySink := newLossyBufferedWriter(opts.Sink)
   209  		go writeProfilingMetrics[*lossyBufferedWriter[T]](lossySink, &s, headers, writeCh)
   210  	} else {
   211  		bufferedSink := newBufferedWriter(opts.Sink)
   212  		go writeProfilingMetrics[*bufferedWriter[T]](bufferedSink, &s, headers, writeCh)
   213  	}
   214  	log.Infof("Profiling metrics started.")
   215  
   216  	return nil
   217  }
   218  
   219  // collectProfilingMetrics will send metrics to the writeCh until
   220  // stopProfilingMetrics is set to true.
   221  func collectProfilingMetrics(s *snapshots, values []func(fieldValues ...*FieldValue) uint64, cheapStartTime int64, profilingRate time.Duration, writeCh chan<- writeReq) {
   222  	defer close(writeCh)
   223  
   224  	numEntries := s.numMetrics + 1 // to account for the timestamp
   225  	ringbufferIdx := 0
   226  	curSnapshot := 0
   227  
   228  	// If we collect snapshots faster than the writer can keep up, we back off.
   229  	// The backoff factor starts small but increases exponentially
   230  	// each time we find that we are still faster than the writer.
   231  	const (
   232  		// How much slower than the profiling rate we sleep for, as a
   233  		// multiplier for the profiling rate.
   234  		initialBackoffFactor = 1.0
   235  
   236  		// The exponential factor by which the backoff factor increases.
   237  		backoffFactorGrowth = 1.125
   238  
   239  		// The maximum backoff factor, i.e. the maximum multiplier of
   240  		// the profiling rate for which we sleep.
   241  		backoffFactorMax = 256.0
   242  	)
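
	// With these values, the backoff sleep starts at 1x the profiling rate and
	// grows by a factor of 1.125 per consecutive backoff, saturating at 256x
	// after roughly ln(256)/ln(1.125) ≈ 47 consecutive backoffs.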
   243  	backoffFactor := initialBackoffFactor
   244  
   245  	// To keep track of time cheaply, we use `CheapNowNano`.
   246  	// However, this can drift as it has poor precision.
   247  	// To get something more precise, we periodically call `time.Now`
   248  	// and `CheapNowNano` and use these two variables to track both.
   249  	// This way, we can compute a more precise time by using
   250  	// `CheapNowNano() - cheapTime + preciseTime`.
   251  	preciseTime := s.startTime
   252  	cheapTime := cheapStartTime
   253  
   254  	stopCollecting := false
   255  	for nextCollection := s.startTime; !stopCollecting; nextCollection += profilingRate.Nanoseconds() {
   256  
   257  		// For small durations, just spin. Otherwise sleep.
   258  		for {
   259  			const (
   260  				wakeUpNanos   = 10
   261  				spinMaxNanos  = 250
   262  				yieldMaxNanos = 1_000
   263  			)
   264  			now := CheapNowNano() - cheapTime + preciseTime
   265  			nanosToNextCollection := nextCollection - now
   266  			if nanosToNextCollection <= 0 {
   267  				// Collect now.
   268  				break
   269  			}
   270  			if nanosToNextCollection < spinMaxNanos {
   271  				continue // Spin.
   272  			}
   273  			if nanosToNextCollection < yieldMaxNanos {
   274  				// Yield then spin.
   275  				runtime.Gosched()
   276  				continue
   277  			}
   278  			// Sleep.
   279  			time.Sleep(time.Duration(nanosToNextCollection-wakeUpNanos) * time.Nanosecond)
   280  		}
   281  
   282  		if stopProfilingMetrics.Load() {
   283  			stopCollecting = true
   284  			// Collect one last time before stopping.
   285  		}
   286  
   287  		collectStart := CheapNowNano() - cheapTime + preciseTime
   288  		timestamp := time.Duration(collectStart - s.startTime)
   289  		base := curSnapshot * numEntries
   290  		ringBuf := s.ringbuffer[ringbufferIdx]
   291  		ringBuf[base] = uint64(timestamp)
   292  		for i := 1; i < numEntries; i++ {
   293  			ringBuf[base+i] = values[i-1]()
   294  		}
   295  		curSnapshot++
   296  
   297  		if curSnapshot == snapshotBufferSize {
   298  			writeCh <- writeReq{ringbufferIdx: ringbufferIdx, numLines: curSnapshot}
   299  			curSnapshot = 0
   300  			// Block until the writer indicates that this part of the ringbuffer
   301  			// is available for writing.
   302  			for ringbufferIdx = (ringbufferIdx + 1) % snapshotRingbufferSize; ringbufferIdx == int(s.curWriterIndex.Load()); {
   303  				// Going too fast, stop collecting for a bit.
   304  				backoffSleep := profilingRate * time.Duration(backoffFactor)
   305  				log.Warningf("Profiling metrics collector exhausted the entire ringbuffer... backing off for %v to let writer catch up.", backoffSleep)
   306  				time.Sleep(backoffSleep)
   307  				backoffFactor = min(backoffFactor*backoffFactorGrowth, backoffFactorMax)
   308  			}
   309  			// Refresh precise time.
   310  			preciseTime = time.Now().UnixNano()
   311  			cheapTime = CheapNowNano()
   312  		}
   313  	}
   314  	if curSnapshot != 0 {
   315  		writeCh <- writeReq{ringbufferIdx: ringbufferIdx, numLines: curSnapshot}
   316  	}
   317  }
   318  
   319  // bufferedMetricsWriter is a ProfilingMetricsWriter that buffers data
   320  // before writing it to some underlying writer.
   321  type bufferedMetricsWriter interface {
   322  	// We inherit from the ProfilingMetricsWriter interface.
   323  	// Note however that strings passed to WriteString should *not* contain any
   324  	// newline characters; newlines must be written via NewLine.
   325  	ProfilingMetricsWriter
   326  
   327  	// NewLine writes a newline character to the buffer.
   328  	// The writer may decide to flush the buffer at this point.
   329  	NewLine()
   330  
   331  	// Flush flushes the buffer to the underlying writer.
   332  	Flush()
   333  }
   334  
   335  const (
   336  	// Buffer size reasonable to use for a single line of metric data.
   337  	lineBufSize = 4 * 1024 // 4 KiB
   338  
   339  	// Buffer size for a buffered write to an underlying sink.
   340  	bufSize = 984 * 1024 // 984 KiB
   341  
   342  	// Number of lines to buffer before flushing to the underlying sink
   343  	// by a line-buffered writer.
   344  	bufferedLines = bufSize / lineBufSize
   345  )
   346  
   347  // bufferedWriter is a buffered metrics writer that wraps an underlying
   348  // ProfilingMetricsWriter.
   349  // It implements `bufferedMetricsWriter`.
   350  type bufferedWriter[T ProfilingMetricsWriter] struct {
   351  	buf        bytes.Buffer
   352  	underlying T
   353  }
   354  
   355  func newBufferedWriter[T ProfilingMetricsWriter](underlying T) *bufferedWriter[T] {
   356  	w := &bufferedWriter[T]{underlying: underlying}
   357  	w.buf.Grow(bufSize + lineBufSize)
   358  	return w
   359  }
   360  
   361  // WriteString implements bufferedMetricsWriter.WriteString.
   362  func (w *bufferedWriter[T]) WriteString(s string) (int, error) {
   363  	return w.buf.WriteString(s)
   364  }
   365  
   366  // NewLine implements bufferedMetricsWriter.NewLine.
   367  func (w *bufferedWriter[T]) NewLine() {
   368  	w.buf.WriteString("\n")
   369  	if w.buf.Len() >= bufSize {
   370  		w.Flush()
   371  	}
   372  }
   373  
   374  // Flush implements bufferedMetricsWriter.Flush.
   375  func (w *bufferedWriter[T]) Flush() {
   376  	w.underlying.WriteString(w.buf.String())
   377  	w.buf.Reset()
   378  }
   379  
   380  // Close implements bufferedMetricsWriter.Close.
   381  func (w *bufferedWriter[T]) Close() error {
   382  	w.Flush()
   383  	return w.underlying.Close()
   384  }
   385  
   386  // lossyBufferedWriter writes to an underlying ProfilingMetricsWriter
   387  // and buffers data on a per-line basis. It adds a prefix to every line,
   388  // and keeps track of the checksum of the data it has written (which is then
   389  // also written to the underlying writer on `Close()`).
   390  // The checksum covers all of the per-line data written after the line prefix,
   391  // including the newline character of these lines, with the exception of
   392  // the checksum data line itself.
   393  // All lines are also checksummed individually, with the checksum covering
   394  // the contents of the line after the line prefix but before the tab and
   395  // line checksum itself at the end of the line.
   396  // `lossyBufferedWriter` implements `bufferedMetricsWriter`.
   397  type lossyBufferedWriter[T ProfilingMetricsWriter] struct {
   398  	lineBuf       bytes.Buffer
   399  	flushBuf      bytes.Buffer
   400  	lineHasher    hash.Hash32
   401  	overallHasher hash.Hash32
   402  	lines         int
   403  	longestLine   int
   404  	underlying    T
   405  }
   406  
   407  // newLossyBufferedWriter creates a new lossyBufferedWriter.
   408  func newLossyBufferedWriter[T ProfilingMetricsWriter](underlying T) *lossyBufferedWriter[T] {
   409  	w := &lossyBufferedWriter[T]{
   410  		underlying:    underlying,
   411  		lineHasher:    adler32.New(),
   412  		overallHasher: adler32.New(),
   413  		longestLine:   lineBufSize,
   414  	}
   415  	w.lineBuf.Grow(lineBufSize)
   416  
   417  	// `lineBufSize + 1` to account for the newline at the end of each line.
   418  	// `+ 2` to account for the newline at the beginning and end of each flush.
   419  	w.flushBuf.Grow((lineBufSize+1)*bufferedLines + 2)
   420  
   421  	w.flushBuf.WriteString("\n")
   422  	return w
   423  }
   424  
   425  // WriteString implements bufferedMetricsWriter.WriteString.
   426  func (w *lossyBufferedWriter[T]) WriteString(s string) (int, error) {
   427  	return w.lineBuf.WriteString(s)
   428  }
   429  
   430  // Flush implements bufferedMetricsWriter.Flush.
   431  func (w *lossyBufferedWriter[T]) Flush() {
   432  	if w.lines > 0 {
   433  		// Ensure that we write a complete line atomically, as this
   434  		// may get parsed while being mixed with other logs that may not
   435  		// have clean line endings at the time we print this.
   436  		w.flushBuf.WriteString("\n")
   437  		w.underlying.WriteString(w.flushBuf.String())
   438  		if f, isFile := any(w.underlying).(*os.File); isFile {
   439  			// If we're dealing with a file, also call `sync(2)`.
   440  			f.Sync()
   441  		}
   442  		w.flushBuf.Reset()
   443  		w.flushBuf.WriteString("\n")
   444  		w.lines = 0
   445  	}
   446  }
   447  
   448  // NewLine implements bufferedMetricsWriter.NewLine.
   449  func (w *lossyBufferedWriter[T]) NewLine() {
   450  	if lineLen := w.lineBuf.Len(); lineLen > w.longestLine {
   451  		wantTotalSize := (lineLen+1)*bufferedLines + 2
   452  		if growBy := wantTotalSize - w.flushBuf.Len(); growBy > 0 {
   453  			w.flushBuf.Grow(growBy)
   454  		}
   455  		w.longestLine = lineLen
   456  	}
   457  	line := w.lineBuf.String()
   458  	w.lineHasher.Reset()
   459  	w.lineHasher.Write([]byte(line))
   460  	lineHash := w.lineHasher.Sum32()
   461  	w.lineBuf.Reset()
   462  	w.flushBuf.WriteString(MetricsPrefix)
   463  	beforeLineIndex := w.flushBuf.Len()
   464  	w.flushBuf.WriteString(line)
   465  	w.flushBuf.WriteString("\t0x")
   466  	prometheus.WriteHex(&w.flushBuf, uint64(lineHash))
   467  	w.flushBuf.WriteString("\n")
   468  	afterLineIndex := w.flushBuf.Len()
   469  	// We ignore the effects that partial writes on the underlying writer
   470  	// would have on the hash computation here.
   471  	// This is OK because the goal of this writer is speed over correctness,
   472  	// and correctness is enforced by the reader of this data checking the
   473  	// hash at the end.
   474  	w.overallHasher.Write(w.flushBuf.Bytes()[beforeLineIndex:afterLineIndex])
   475  	w.lineBuf.Reset()
   476  	w.lines++
   477  	if w.lines >= bufferedLines || w.flushBuf.Len() >= bufSize {
   478  		w.Flush()
   479  	}
   480  }
   481  
   482  // Close implements bufferedMetricsWriter.Close.
   483  // It writes the checksum of the data written to the underlying writer.
   484  func (w *lossyBufferedWriter[T]) Close() error {
   485  	w.Flush()
   486  	w.flushBuf.WriteString(MetricsPrefix)
   487  	w.flushBuf.WriteString(MetricsHashIndicator)
   488  	w.flushBuf.WriteString("0x")
   489  	prometheus.WriteHex(&w.flushBuf, uint64(w.overallHasher.Sum32()))
   490  	w.flushBuf.WriteString("\n")
   491  	w.underlying.WriteString(w.flushBuf.String())
   492  	w.overallHasher.Reset()
   493  	w.lineBuf.Reset()
   494  	w.flushBuf.Reset()
   495  	return w.underlying.Close()
   496  }
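
// A minimal sketch of how a consumer of the lossy format could verify the
// overall checksum written above (hypothetical reader code, not part of this
// package; `lines` is assumed to hold the stream split on '\n'):
//
//	overall := adler32.New()
//	for _, line := range lines {
//		if !strings.HasPrefix(line, MetricsPrefix) {
//			continue // Blank separator or unrelated log line.
//		}
//		payload := strings.TrimPrefix(line, MetricsPrefix)
//		if strings.HasPrefix(payload, MetricsHashIndicator) {
//			hexStr := strings.TrimPrefix(strings.TrimPrefix(payload, MetricsHashIndicator), "0x")
//			want, _ := strconv.ParseUint(hexStr, 16, 64)
//			fmt.Println("checksum ok:", uint64(overall.Sum32()) == want)
//			break
//		}
//		// The overall hash covers everything after the prefix, including each
//		// line's own checksum suffix and its trailing newline.
//		overall.Write([]byte(payload + "\n"))
//	}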
   497  
   498  // writeProfilingMetrics will write to the ProfilingMetricsWriter on every
   499  // request via writeReqs, until writeReqs is closed.
   500  func writeProfilingMetrics[T bufferedMetricsWriter](sink T, s *snapshots, headers []string, writeReqs <-chan writeReq) {
   501  	numEntries := s.numMetrics + 1
   502  	for _, header := range headers {
   503  		sink.WriteString(header)
   504  		sink.NewLine()
   505  	}
   506  	for req := range writeReqs {
   507  		s.curWriterIndex.Store(int32(req.ringbufferIdx))
   508  		ringBuf := s.ringbuffer[req.ringbufferIdx]
   509  		for i := 0; i < req.numLines; i++ {
   510  			base := i * numEntries
   511  			// Write the time
   512  			prometheus.WriteInteger(sink, int64(ringBuf[base]))
   513  			// Then everything else
   514  			for j := 1; j < numEntries; j++ {
   515  				sink.WriteString("\t")
   516  				prometheus.WriteInteger(sink, int64(ringBuf[base+j]))
   517  			}
   518  			sink.NewLine()
   519  		}
   520  	}
   521  	sink.Close()
   522  
   523  	doneProfilingMetrics <- true
   524  	close(doneProfilingMetrics)
   525  	profilingMetricsStarted.Store(false)
   526  }
   527  
   528  // StopProfilingMetrics stops the profiling metrics goroutines. Call to make sure
   529  // all metric data has been flushed.
   530  // Note that calling this function prior to StartProfilingMetrics has no effect.
   531  func StopProfilingMetrics() {
   532  	if !profilingMetricsStarted.Load() {
   533  		return
   534  	}
   535  	if stopProfilingMetrics.CompareAndSwap(false, true) {
   536  		<-doneProfilingMetrics
   537  	}
   538  	// If the CAS fails, this means the signal was already sent,
   539  	// so don't wait on doneProfilingMetrics.
   540  }