github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/metric/profiling_metric.go (about)

     1  // Copyright 2023 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package metric
    16  
    17  import (
    18  	"bufio"
    19  	"errors"
    20  	"fmt"
    21  	"os"
    22  	"strings"
    23  	"time"
    24  
    25  	"github.com/MerlinKodo/gvisor/pkg/atomicbitops"
    26  	"github.com/MerlinKodo/gvisor/pkg/log"
    27  	"github.com/MerlinKodo/gvisor/pkg/prometheus"
    28  )
    29  
    30  const (
    31  	snapshotBufferSize     = 1000
    32  	snapshotRingbufferSize = 16
    33  )
    34  
    35  var (
    36  	// ProfilingMetricWriter is the output destination to which
    37  	// ProfilingMetrics will be written to in TSV format.
    38  	ProfilingMetricWriter *os.File
    39  	// profilingMetricsStarted indicates whether StartProfilingMetrics has
    40  	// been called.
    41  	profilingMetricsStarted atomicbitops.Bool
    42  	// stopProfilingMetrics is used to signal to the profiling metrics
    43  	// goroutine to stop recording and writing metrics.
    44  	stopProfilingMetrics chan bool
    45  	// doneProfilingMetrics is used to signal that the profiling metrics
    46  	// goroutines are finished.
    47  	doneProfilingMetrics chan bool
    48  	// definedProfilingMetrics is the set of metrics known to be created for
    49  	// profiling (see condmetric_profiling.go).
    50  	definedProfilingMetrics []string
    51  )
    52  
    53  // snapshots is used to as temporary storage of metric data
    54  // before it's written to the ProfilingMetricWriter.
    55  type snapshots struct {
    56  	numMetrics int
    57  	// ringbuffer is used to store metric data.
    58  	ringbuffer [][]uint64
    59  	// curWriterIndex is the ringbuffer index currently being read by the
    60  	// writer. It should not be used by the collector.
    61  	curWriterIndex atomicbitops.Int32
    62  }
    63  
    64  // writeReq is the message sent between from the collector to the writer.
    65  type writeReq struct {
    66  	ringbufferIdx int
    67  	// numLines indicates how many data lines are filled in the buffer.
    68  	numLines int
    69  }
    70  
    71  // StartProfilingMetrics checks the ProfilingMetrics runsc flags and creates
    72  // goroutines responsible for outputting the profiling metric data.
    73  //
    74  // Preconditions:
    75  //   - All metrics are registered.
    76  //   - Initialize/Disable has been called.
    77  func StartProfilingMetrics(profilingMetrics string, profilingRate time.Duration) error {
    78  	if !initialized.Load() {
    79  		// Wait for initialization to complete to make sure that all
    80  		// metrics are registered.
    81  		return errors.New("metric initialization is not complete")
    82  	}
    83  	if ProfilingMetricWriter == nil {
    84  		return errors.New("tried to initialize profiling metrics without log file")
    85  	}
    86  
    87  	var values []func(fieldValues ...*FieldValue) uint64
    88  	header := strings.Builder{}
    89  	header.WriteString("Time (ns)")
    90  	numMetrics := 0
    91  
    92  	if len(profilingMetrics) > 0 {
    93  		metrics := strings.Split(profilingMetrics, ",")
    94  		numMetrics = len(metrics)
    95  
    96  		for _, name := range metrics {
    97  			name := strings.TrimSpace(name)
    98  			m, ok := allMetrics.uint64Metrics[name]
    99  			if !ok {
   100  				return fmt.Errorf("given profiling metric name '%s' does not correspond to a registered Uint64 metric", name)
   101  			}
   102  			if len(m.fields) > 0 {
   103  				// TODO(b/240280155): Add support for field values.
   104  				return fmt.Errorf("will not profile metric '%s' because it has metric fields which are not supported", name)
   105  			}
   106  			header.WriteRune('\t')
   107  			header.WriteString(name)
   108  			values = append(values, m.value)
   109  		}
   110  
   111  		header.WriteRune('\n')
   112  	} else {
   113  		if len(definedProfilingMetrics) > 0 {
   114  			return fmt.Errorf("a value for --profiling-metrics was not specified; consider using a subset of '--profiling-metrics=%s'", strings.Join(definedProfilingMetrics, ","))
   115  		}
   116  		return fmt.Errorf("a value for --profiling-metrics was not specified; also no conditionally compiled metrics found, consider compiling runsc with --go_tag=condmetric_profiling")
   117  	}
   118  
   119  	if !profilingMetricsStarted.CompareAndSwap(0, 1) {
   120  		return errors.New("profiling metrics have already been started")
   121  	}
   122  	s := snapshots{
   123  		numMetrics: numMetrics,
   124  		ringbuffer: make([][]uint64, snapshotRingbufferSize),
   125  		// curWriterIndex is initialized to a valid index so that the
   126  		// collector cannot use up all indices before the writer even has
   127  		// a chance to start (as unlikely as that is).
   128  		curWriterIndex: atomicbitops.FromInt32(snapshotRingbufferSize - 1),
   129  	}
   130  	for i := 0; i < snapshotRingbufferSize; i++ {
   131  		s.ringbuffer[i] = make([]uint64, snapshotBufferSize*(numMetrics+1))
   132  	}
   133  
   134  	stopProfilingMetrics = make(chan bool, 1)
   135  	doneProfilingMetrics = make(chan bool, 1)
   136  	writeCh := make(chan writeReq, snapshotRingbufferSize)
   137  	go collectProfilingMetrics(&s, values, profilingRate, writeCh)
   138  	go writeProfilingMetrics(&s, header.String(), writeCh)
   139  
   140  	return nil
   141  }
   142  
   143  // collectProfilingMetrics will send metrics to the writeCh until it receives a
   144  // signal via the stopProfilingMetrics channel.
   145  func collectProfilingMetrics(s *snapshots, values []func(fieldValues ...*FieldValue) uint64, profilingRate time.Duration, writeCh chan<- writeReq) {
   146  	defer close(writeCh)
   147  
   148  	numEntries := s.numMetrics + 1 // to account for the timestamp
   149  	ringbufferIdx := 0
   150  	curSnapshot := 0
   151  	startTime := CheapNowNano()
   152  	// getNewRingbufferIdx will block until the writer indicates that some part
   153  	// of the ringbuffer is available for writing.
   154  	getNewRingbufferIdx := func() {
   155  		for {
   156  			nextIdx := (ringbufferIdx + 1) % snapshotRingbufferSize
   157  			if nextIdx != int(s.curWriterIndex.Load()) {
   158  				ringbufferIdx = nextIdx
   159  				break
   160  			}
   161  			// Going too fast, stop collecting for a bit.
   162  			log.Warningf("Profiling metrics collector exhausted the entire ringbuffer... backing off to let writer catch up.")
   163  			time.Sleep(profilingRate * 100)
   164  		}
   165  	}
   166  
   167  	stopCollecting := false
   168  	for nextCollection := CheapNowNano() + profilingRate.Nanoseconds(); !stopCollecting; nextCollection += profilingRate.Nanoseconds() {
   169  		now := CheapNowNano()
   170  		if now < nextCollection {
   171  			time.Sleep(time.Duration(nextCollection-now) * time.Nanosecond)
   172  		} else {
   173  			// Skip collection since we just did one anyway.
   174  			continue
   175  		}
   176  
   177  		select {
   178  		case <-stopProfilingMetrics:
   179  			stopCollecting = true
   180  			// Collect one last time before stopping.
   181  		default:
   182  		}
   183  
   184  		collectStart := CheapNowNano()
   185  		timestamp := time.Duration(collectStart - startTime)
   186  		base := curSnapshot * numEntries
   187  		s.ringbuffer[ringbufferIdx][base] = uint64(timestamp)
   188  		for i := 1; i < numEntries; i++ {
   189  			s.ringbuffer[ringbufferIdx][base+i] = values[i-1]()
   190  		}
   191  		curSnapshot++
   192  
   193  		if curSnapshot == snapshotBufferSize {
   194  			writeCh <- writeReq{ringbufferIdx: ringbufferIdx, numLines: curSnapshot}
   195  			curSnapshot = 0
   196  			getNewRingbufferIdx()
   197  		}
   198  	}
   199  	if curSnapshot != 0 {
   200  		writeCh <- writeReq{ringbufferIdx: ringbufferIdx, numLines: curSnapshot}
   201  	}
   202  }
   203  
   204  // writeProfilingMetrics will write to the ProfilingMetricsWriter on every
   205  // request via writeReqs, until writeReqs is closed.
   206  func writeProfilingMetrics(s *snapshots, header string, writeReqs <-chan writeReq) {
   207  	numEntries := s.numMetrics + 1
   208  
   209  	out := bufio.NewWriter(ProfilingMetricWriter)
   210  	out.WriteString(header)
   211  
   212  	for req := range writeReqs {
   213  		s.curWriterIndex.Store(int32(req.ringbufferIdx))
   214  
   215  		for i := 0; i < req.numLines; i++ {
   216  			base := i * numEntries
   217  			// Write the time
   218  			prometheus.WriteInteger(out, int64(s.ringbuffer[req.ringbufferIdx][base]))
   219  			// Then everything else
   220  			for j := 1; j < numEntries; j++ {
   221  				out.WriteRune('\t')
   222  				prometheus.WriteInteger(out, int64(s.ringbuffer[req.ringbufferIdx][base+j]))
   223  			}
   224  			out.WriteRune('\n')
   225  		}
   226  	}
   227  
   228  	out.Flush()
   229  	ProfilingMetricWriter.Close()
   230  	doneProfilingMetrics <- true
   231  	close(doneProfilingMetrics)
   232  
   233  	profilingMetricsStarted.Store(false)
   234  }
   235  
   236  // StopProfilingMetrics stops the profiling metrics goroutines. Call to make sure
   237  // all metric data has been flushed.
   238  // Note that calling this function prior to StartProfilingMetrics has no effect.
   239  func StopProfilingMetrics() {
   240  	if !profilingMetricsStarted.Load() {
   241  		return
   242  	}
   243  
   244  	select {
   245  	case stopProfilingMetrics <- true:
   246  		<-doneProfilingMetrics
   247  	default: // Stop signal was already sent
   248  	}
   249  }