// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package metric provides primitives for collecting metrics.
package metric

import (
	"errors"
	"fmt"
	"math"
	re "regexp"
	"sort"
	"strings"
	"time"

	"google.golang.org/protobuf/types/known/timestamppb"
	"github.com/nicocha30/gvisor-ligolo/pkg/atomicbitops"
	"github.com/nicocha30/gvisor-ligolo/pkg/eventchannel"
	"github.com/nicocha30/gvisor-ligolo/pkg/log"
	pb "github.com/nicocha30/gvisor-ligolo/pkg/metric/metric_go_proto"
	"github.com/nicocha30/gvisor-ligolo/pkg/prometheus"
	"github.com/nicocha30/gvisor-ligolo/pkg/sync"
)

var (
	// ErrNameInUse indicates that another metric is already defined for
	// the given name.
	ErrNameInUse = errors.New("metric name already in use")

	// ErrInitializationDone indicates that the caller tried to create a
	// new metric after initialization.
	ErrInitializationDone = errors.New("metric cannot be created after initialization is complete")

	// ErrFieldValueContainsIllegalChar indicates that the value of a metric
	// field had an invalid character in it.
	ErrFieldValueContainsIllegalChar = errors.New("metric field value contains illegal character")

	// ErrFieldHasNoAllowedValues indicates that the field needs to define some
	// allowed values to be a valid and useful field.
	ErrFieldHasNoAllowedValues = errors.New("metric field does not define any allowed values")

	// ErrTooManyFieldCombinations indicates that the number of unique
	// combinations of fields is too large to support.
	ErrTooManyFieldCombinations = errors.New("metric has too many combinations of allowed field values")
)

// Weirdness metric type constants.
var (
	WeirdnessTypeTimeFallback         = FieldValue{"time_fallback"}
	WeirdnessTypePartialResult        = FieldValue{"partial_result"}
	WeirdnessTypeVsyscallCount        = FieldValue{"vsyscall_count"}
	WeirdnessTypeWatchdogStuckStartup = FieldValue{"watchdog_stuck_startup"}
	WeirdnessTypeWatchdogStuckTasks   = FieldValue{"watchdog_stuck_tasks"}
)

// Suspicious operations metric type constants.
var (
	SuspiciousOperationsTypeOpenedWriteExecuteFile = FieldValue{"opened_write_execute_file"}
)

// List of global metrics that are used in multiple places.
var (
	// WeirdnessMetric is a metric with fields created to track the number
	// of weird occurrences such as time fallback, partial_result, vsyscall
	// count, watchdog startup timeouts and stuck tasks.
	WeirdnessMetric = MustCreateNewUint64Metric("/weirdness", true /* sync */, "Increment for weird occurrences of problems such as time fallback, partial result, vsyscalls invoked in the sandbox, watchdog startup timeouts and stuck tasks.",
		NewField("weirdness_type",
			&WeirdnessTypeTimeFallback,
			&WeirdnessTypePartialResult,
			&WeirdnessTypeVsyscallCount,
			&WeirdnessTypeWatchdogStuckStartup,
			&WeirdnessTypeWatchdogStuckTasks,
		))

	// SuspiciousOperationsMetric is a metric with fields created to detect
	// operations such as opening an executable file to write from a gofer.
	SuspiciousOperationsMetric = MustCreateNewUint64Metric("/suspicious_operations", true /* sync */, "Increment for suspicious operations such as opening an executable file to write from a gofer.",
		NewField("operation_type",
			&SuspiciousOperationsTypeOpenedWriteExecuteFile,
		))
)
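
// A minimal usage sketch (illustrative only, not additional functionality in
// this file): callers increment these global metrics by passing the address
// of the same package-level FieldValue that was used at registration time,
// e.g.:
//
//	// Somewhere in the sandbox, after a time fallback occurs:
//	WeirdnessMetric.Increment(&WeirdnessTypeTimeFallback)
//
// Passing a freshly constructed FieldValue with an equal string but a
// different address would panic in fieldMapper.lookupSingle, so the
// package-level vars above must be reused directly.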

// InitStage is the name of a Sentry initialization stage.
type InitStage string

// List of all Sentry initialization stages.
var (
	InitRestoreConfig InitStage = "restore_config"
	InitExecConfig    InitStage = "exec_config"
	InitRestore       InitStage = "restore"
	InitCreateProcess InitStage = "create_process"
	InitTaskStart     InitStage = "task_start"

	// allStages is the list of allowed stages.
	allStages = []InitStage{
		InitRestoreConfig,
		InitExecConfig,
		InitRestore,
		InitCreateProcess,
		InitTaskStart,
	}
)

// Uint64Metric encapsulates a uint64 that represents some kind of metric to be
// monitored.
//
// Metrics are not saved across save/restore and thus reset to zero on restore.
type Uint64Metric struct {
	name string

	// fields is the map of field-value combination index keys to Uint64 counters.
	fields []atomicbitops.Uint64

	// fieldMapper is used to generate index keys for the fields array (above)
	// based on field value combinations, and vice-versa.
	fieldMapper fieldMapper
}

var (
	// initialized indicates that all metrics are registered. allMetrics is
	// immutable once initialized is true.
	initialized atomicbitops.Bool

	// allMetrics are the registered metrics.
	allMetrics = makeMetricSet()
)

// Initialize sends a metric registration event over the event channel.
//
// Precondition:
//   - All metrics are registered.
//   - Initialize/Disable has not been called.
func Initialize() error {
	if initialized.Load() {
		return errors.New("metric.Initialize called after metric.Initialize or metric.Disable")
	}

	m := pb.MetricRegistration{}
	for _, v := range allMetrics.uint64Metrics {
		m.Metrics = append(m.Metrics, v.metadata)
	}
	for _, v := range allMetrics.distributionMetrics {
		m.Metrics = append(m.Metrics, v.metadata)
	}
	m.Stages = make([]string, 0, len(allStages))
	for _, s := range allStages {
		m.Stages = append(m.Stages, string(s))
	}
	allMetrics.registration = &m
	if err := eventchannel.Emit(&m); err != nil {
		return fmt.Errorf("unable to emit metric initialize event: %w", err)
	}

	if initialized.Swap(true) {
		return errors.New("raced with another call to metric.Initialize or metric.Disable")
	}
	return nil
}
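
// A rough sketch of the intended lifecycle from a client package, pieced
// together from the preconditions above; the metric name and call sites are
// illustrative, not defined by this file:
//
//	// At init time, declare metrics as package-level vars:
//	var opens = metric.MustCreateNewUint64Metric("/example/opens", true /* sync */, "Example counter.")
//
//	// Once all metrics are registered, exactly one of Initialize or Disable
//	// is called:
//	if err := metric.Initialize(); err != nil { /* handle error */ }
//
//	// Afterwards, EmitMetricUpdate may be called periodically; it only emits
//	// metrics that changed since the previous call.
//	metric.EmitMetricUpdate()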

// ErrNotYetInitialized is returned by GetMetricRegistration if metrics are not yet initialized.
var ErrNotYetInitialized = errors.New("metrics are not yet initialized")

// GetMetricRegistration returns the metric registration data for all registered metrics.
// Must be called after Initialize().
// Returns ErrNotYetInitialized if metrics are not yet initialized.
func GetMetricRegistration() (*pb.MetricRegistration, error) {
	if !initialized.Load() {
		return nil, ErrNotYetInitialized
	}
	if allMetrics.registration == nil {
		return nil, errors.New("metrics are disabled")
	}
	return allMetrics.registration, nil
}

// Disable sends an empty metric registration event over the event channel,
// disabling metric collection.
//
// Precondition:
//   - All metrics are registered.
//   - Initialize/Disable has not been called.
func Disable() error {
	if initialized.Load() {
		return errors.New("metric.Disable called after metric.Initialize or metric.Disable")
	}

	m := pb.MetricRegistration{}
	if err := eventchannel.Emit(&m); err != nil {
		return fmt.Errorf("unable to emit empty metric registration event (metrics disabled): %w", err)
	}

	if initialized.Swap(true) {
		return errors.New("raced with another call to metric.Initialize or metric.Disable")
	}
	return nil
}

type customUint64Metric struct {
	// metadata describes the metric. It is immutable.
	metadata *pb.MetricMetadata

	// prometheusMetric describes the metric in Prometheus format. It is immutable.
	prometheusMetric *prometheus.Metric

	// fields is the set of fields of the metric.
	fields []Field

	// value returns the current value of the metric for the given set of
	// fields. It takes a variadic number of field values as argument.
	value func(fieldValues ...*FieldValue) uint64

	// forEachNonZero calls the given function on each possible field value of
	// the metric where the metric's value is non-zero.
	// The passed-in function should not allocate new memory, and may not save
	// or modify `fields` directly, as the slice memory is reused across calls.
	// `forEachNonZero` does not guarantee that it will be called on a
	// consistent snapshot of this metric's values.
	// `forEachNonZero` may be nil.
	forEachNonZero func(f func(fields []*FieldValue, val uint64))
}

// FieldValue is a string that can be used as a value for a Field.
// It must be referred to by address when the Field is created and when its
// metric value is modified. This ensures that the same FieldValue reference
// is used, which in turn enables the metric code to use the address of a
// FieldValue as comparison operator, rather than doing string comparisons.
type FieldValue struct {
	Value string
}

// fieldMapperMapThreshold is the number of field values after which we switch
// to using map lookups when looking up field values.
// This value was determined using benchmarks to see which is fastest.
const fieldMapperMapThreshold = 48

// Field contains the field name and allowed values for the metric which is
// used in registration of the metric.
type Field struct {
	// name is the metric field name.
	name string

	// values is the list of values for the field.
	// `values` is always populated but not always used for lookup. It depends
	// on the number of allowed field values. `values` is used for lookups on
	// fields with small numbers of field values.
	values []*FieldValue

	// valuesPtrMap is a map version of `values`. For each item in `values`,
	// its pointer is mapped to its index within `values`.
	// `valuesPtrMap` is used for fields with large numbers of possible values.
	// For fields with small numbers of field values, it is nil.
	// This map allows doing faster string matching than a normal string map,
	// as it avoids the string hashing step that normal string maps need to do.
	valuesPtrMap map[*FieldValue]int
}

// toProto returns the proto definition of this field, for use in metric
// metadata.
func (f Field) toProto() *pb.MetricMetadata_Field {
	allowedValues := make([]string, len(f.values))
	for i, v := range f.values {
		allowedValues[i] = v.Value
	}
	return &pb.MetricMetadata_Field{
		FieldName:     f.name,
		AllowedValues: allowedValues,
	}
}

// NewField defines a new Field that can be used to break down a metric.
// The set of allowedValues must be unique strings wrapped with `FieldValue`.
// The *same* `FieldValue` pointers must be used during metric modifications.
// In practice, in most cases, this means you should declare these
// `FieldValue`s as package-level `var`s, and always use the address of these
// package-level `var`s during metric modifications.
func NewField(name string, allowedValues ...*FieldValue) Field {
	// Verify that all string values have a unique value.
	strMap := make(map[string]bool, len(allowedValues))
	for _, v := range allowedValues {
		if strMap[v.Value] {
			panic(fmt.Sprintf("found duplicate field value: %q", v))
		}
		strMap[v.Value] = true
	}

	if useMap := len(allowedValues) > fieldMapperMapThreshold; !useMap {
		return Field{
			name:   name,
			values: allowedValues,
		}
	}

	valuesPtrMap := make(map[*FieldValue]int, len(allowedValues))
	for i, v := range allowedValues {
		valuesPtrMap[v] = i
	}
	return Field{
		name:         name,
		values:       allowedValues,
		valuesPtrMap: valuesPtrMap,
	}
}
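
// A minimal sketch of defining a field, assuming a hypothetical "result"
// field for some metric (the names below are illustrative only):
//
//	var (
//		resultOK    = FieldValue{"ok"}
//		resultError = FieldValue{"error"}
//	)
//
//	var resultField = NewField("result", &resultOK, &resultError)
//
// Metric modifications must later pass &resultOK or &resultError themselves;
// constructing a new FieldValue{"ok"} at the call site has a different
// address and would not be recognized.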

// fieldMapper maps multi-dimensional field values to a single unique integer key.
type fieldMapper struct {
	// fields is a list of Field objects, which importantly include individual
	// Field names which are used to perform the keyToMultiField function; and
	// allowedValues for each field type which are used to perform the lookup
	// function.
	fields []Field

	// numFieldCombinations is the number of unique keys for all possible field
	// combinations.
	numFieldCombinations int
}

// newFieldMapper returns a new fieldMapper for the given set of fields.
func newFieldMapper(fields ...Field) (fieldMapper, error) {
	numFieldCombinations := 1
	for _, f := range fields {
		// Disallow fields with no possible values. We could also ignore them
		// instead, but passing in a no-allowed-values field is probably a mistake.
		if len(f.values) == 0 {
			return fieldMapper{nil, 0}, ErrFieldHasNoAllowedValues
		}
		numFieldCombinations *= len(f.values)

		// Sanity check, could be useful in case someone dynamically generates too
		// many fields accidentally.
		if numFieldCombinations > math.MaxUint32 || numFieldCombinations < 0 {
			return fieldMapper{nil, 0}, ErrTooManyFieldCombinations
		}
	}

	return fieldMapper{
		fields:               fields,
		numFieldCombinations: numFieldCombinations,
	}, nil
}

// lookupSingle looks up a single key for a single field within fieldMapper.
// It is used internally within lookupConcat.
// It returns the updated `idx` and `remainingCombinationBucket` values.
// +checkescape:all
//
//go:nosplit
func (m fieldMapper) lookupSingle(fieldIndex int, fieldValue *FieldValue, idx, remainingCombinationBucket int) (int, int) {
	field := m.fields[fieldIndex]
	numValues := len(field.values)

	// Are we doing a linear search?
	if field.valuesPtrMap == nil {
		// We scan by pointers only. This means the caller must pass the same
		// FieldValue pointer as the one used in `NewField`.
		for valIdx, allowedVal := range field.values {
			if fieldValue == allowedVal {
				remainingCombinationBucket /= numValues
				idx += remainingCombinationBucket * valIdx
				return idx, remainingCombinationBucket
			}
		}
		panic("invalid field value or did not reuse the same FieldValue pointer as passed in NewField")
	}

	// Use map lookup instead.

	// Match using FieldValue pointer.
	// This avoids the string hashing step that string maps otherwise do.
	valIdx, found := field.valuesPtrMap[fieldValue]
	if found {
		remainingCombinationBucket /= numValues
		idx += remainingCombinationBucket * valIdx
		return idx, remainingCombinationBucket
	}

	panic("invalid field value or did not reuse the same FieldValue pointer as passed in NewField")
}

// lookupConcat looks up a key within the fieldMapper where the fields are
// the concatenation of two lists of fields.
// The returned key is an index that can be used to access the map created by
// makeMap().
// This *must* be called with the correct number of fields, or it will panic.
// +checkescape:all
//
//go:nosplit
func (m fieldMapper) lookupConcat(fields1, fields2 []*FieldValue) int {
	if (len(fields1) + len(fields2)) != len(m.fields) {
		panic("invalid field lookup depth")
	}
	idx := 0
	remainingCombinationBucket := m.numFieldCombinations
	for i, val := range fields1 {
		idx, remainingCombinationBucket = m.lookupSingle(i, val, idx, remainingCombinationBucket)
	}

	numFields1 := len(fields1)
	for i, val := range fields2 {
		idx, remainingCombinationBucket = m.lookupSingle(i+numFields1, val, idx, remainingCombinationBucket)
	}

	return idx
}

// lookup looks up a key within the fieldMapper.
// The returned key is an index that can be used to access the map created by
// makeMap().
// This *must* be called with the correct number of fields, or it will panic.
// +checkescape:all
//
//go:nosplit
func (m fieldMapper) lookup(fields ...*FieldValue) int {
	return m.lookupConcat(fields, nil)
}
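
// To make the key layout concrete, here is a small worked example assuming
// two hypothetical fields (this mirrors the arithmetic in lookupSingle):
// with fields A = {a0, a1, a2} and B = {b0, b1}, numFieldCombinations is 6
// and the key for (a_i, b_j) is i*2 + j:
//
//	(a0, b0) -> 0    (a0, b1) -> 1
//	(a1, b0) -> 2    (a1, b1) -> 3
//	(a2, b0) -> 4    (a2, b1) -> 5
//
// keyToMultiField below walks the same mixed-radix digits in reverse.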

// numKeys returns the total number of key-to-field-combinations mappings
// defined by the fieldMapper.
//
//go:nosplit
func (m fieldMapper) numKeys() int {
	return m.numFieldCombinations
}

// makeDistributionSampleMap creates a two dimensional array, where:
//   - The first level corresponds to unique field value combinations and is
//     accessed using index "keys" made by fieldMapper.
//   - The second level corresponds to buckets within a metric. The number of
//     buckets is specified by numBuckets.
func (m fieldMapper) makeDistributionSampleMap(numBuckets int) [][]atomicbitops.Uint64 {
	samples := make([][]atomicbitops.Uint64, m.numKeys())
	for i := range samples {
		samples[i] = make([]atomicbitops.Uint64, numBuckets)
	}
	return samples
}

// keyToMultiField is the reverse of lookup/lookupConcat. The returned list of
// field values corresponds to the same order of fields that were passed in to
// newFieldMapper.
func (m fieldMapper) keyToMultiField(key int) []string {
	depth := len(m.fields)
	if depth == 0 && key == 0 {
		return nil
	}
	fieldValues := make([]string, depth)
	remainingCombinationBucket := m.numFieldCombinations
	for i := 0; i < depth; i++ {
		remainingCombinationBucket /= len(m.fields[i].values)
		fieldValues[i] = m.fields[i].values[key/remainingCombinationBucket].Value
		key = key % remainingCombinationBucket
	}
	return fieldValues
}

// keyToMultiFieldInPlace does the operation described in `keyToMultiField`
// but modifies `fieldValues` in-place. It must already be of size
// `len(m.fields)`.
//
//go:nosplit
func (m fieldMapper) keyToMultiFieldInPlace(key int, fieldValues []*FieldValue) {
	if len(m.fields) == 0 {
		return
	}
	depth := len(m.fields)
	remainingCombinationBucket := m.numFieldCombinations
	for i := 0; i < depth; i++ {
		remainingCombinationBucket /= len(m.fields[i].values)
		fieldValues[i] = m.fields[i].values[key/remainingCombinationBucket]
		key = key % remainingCombinationBucket
	}
}

// nameToPrometheusName transforms a path-style metric name (/foo/bar) into a Prometheus-style
// metric name (foo_bar).
func nameToPrometheusName(name string) string {
	return strings.ReplaceAll(strings.TrimPrefix(name, "/"), "/", "_")
}

var validMetricNameRegexp = re.MustCompile("^(?:/[_\\w]+)+$")

// verifyName verifies that the given metric name is a valid path-style metric
// name.
func verifyName(name string) error {
	if !strings.HasPrefix(name, "/") {
		return fmt.Errorf("metric name must start with a '/': %q", name)
	}
	if !validMetricNameRegexp.MatchString(name) {
		return fmt.Errorf("invalid metric name: %q", name)
	}
	return nil
}

// RegisterCustomUint64Metric registers a metric with the given name.
//
// Register must only be called at init and will return an error if called
// after initialization is complete.
//
// Preconditions:
//   - name must be globally unique.
//   - Initialize/Disable have not been called.
//   - value is expected to accept exactly len(fields) arguments.
func RegisterCustomUint64Metric(name string, cumulative, sync bool, units pb.MetricMetadata_Units, description string, value func(...*FieldValue) uint64, fields ...Field) error {
	if initialized.Load() {
		return ErrInitializationDone
	}

	if _, ok := allMetrics.uint64Metrics[name]; ok {
		return ErrNameInUse
	}
	if _, ok := allMetrics.distributionMetrics[name]; ok {
		return ErrNameInUse
	}

	promType := prometheus.TypeGauge
	if cumulative {
		promType = prometheus.TypeCounter
	}

	allMetrics.uint64Metrics[name] = customUint64Metric{
		metadata: &pb.MetricMetadata{
			Name:           name,
			PrometheusName: nameToPrometheusName(name),
			Description:    description,
			Cumulative:     cumulative,
			Sync:           sync,
			Type:           pb.MetricMetadata_TYPE_UINT64,
			Units:          units,
		},
		prometheusMetric: &prometheus.Metric{
			Name: nameToPrometheusName(name),
			Help: description,
			Type: promType,
		},
		fields: fields,
		value:  value,
	}

	// Metrics can exist without fields.
	if l := len(fields); l > 1 {
		return fmt.Errorf("%d fields provided, must be <= 1", l)
	}

	for _, field := range fields {
		allMetrics.uint64Metrics[name].metadata.Fields = append(allMetrics.uint64Metrics[name].metadata.Fields, field.toProto())
	}
	return nil
}
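
// A minimal sketch of registering a callback-backed metric from a client
// package (the metric name and the counter variable are made up for
// illustration):
//
//	var openFDs atomicbitops.Uint64
//
//	func init() {
//		metric.MustRegisterCustomUint64Metric("/example/open_fds",
//			false /* cumulative */, false /* sync */,
//			"Number of currently open file descriptors.",
//			func(...*metric.FieldValue) uint64 { return openFDs.Load() })
//	}
//
// The callback is invoked whenever a snapshot of metric values is taken (see
// metricSet.Values below), so it should be cheap and non-blocking.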

// MustRegisterCustomUint64Metric calls RegisterCustomUint64Metric for metrics
// without fields and panics if it returns an error.
func MustRegisterCustomUint64Metric(name string, cumulative, sync bool, description string, value func(...*FieldValue) uint64, fields ...Field) {
	if err := RegisterCustomUint64Metric(name, cumulative, sync, pb.MetricMetadata_UNITS_NONE, description, value, fields...); err != nil {
		panic(fmt.Sprintf("Unable to register metric %q: %s", name, err))
	}
}

// NewUint64Metric creates and registers a new cumulative metric with the given
// name.
//
// Metrics must be statically defined (i.e., at init).
func NewUint64Metric(name string, sync bool, units pb.MetricMetadata_Units, description string, fields ...Field) (*Uint64Metric, error) {
	if err := verifyName(name); err != nil {
		return nil, err
	}
	f, err := newFieldMapper(fields...)
	if err != nil {
		return nil, err
	}
	m := Uint64Metric{
		name:        name,
		fieldMapper: f,
		fields:      make([]atomicbitops.Uint64, f.numKeys()),
	}
	if err := RegisterCustomUint64Metric(name, true /* cumulative */, sync, units, description, m.Value, fields...); err != nil {
		return nil, err
	}
	cm := allMetrics.uint64Metrics[name]
	cm.forEachNonZero = m.forEachNonZero
	allMetrics.uint64Metrics[name] = cm
	return &m, nil
}

// MustCreateNewUint64Metric calls NewUint64Metric and panics if it returns
// an error.
func MustCreateNewUint64Metric(name string, sync bool, description string, fields ...Field) *Uint64Metric {
	m, err := NewUint64Metric(name, sync, pb.MetricMetadata_UNITS_NONE, description, fields...)
	if err != nil {
		panic(fmt.Sprintf("Unable to create metric %q: %s", name, err))
	}
	return m
}
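
// A small usage sketch for a plain (field-less) counter; the metric name is
// illustrative, not one that this package defines:
//
//	var bytesReceived = metric.MustCreateNewUint64Metric("/example/bytes_received",
//		false /* sync */, "Total bytes received.")
//
//	func onReceive(n int) {
//		bytesReceived.IncrementBy(uint64(n))
//	}
//
// Increment and IncrementBy below must be passed exactly as many FieldValue
// arguments as the metric has fields (zero here), otherwise they panic.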

// MustCreateNewUint64NanosecondsMetric calls NewUint64Metric and panics if it
// returns an error.
func MustCreateNewUint64NanosecondsMetric(name string, sync bool, description string) *Uint64Metric {
	m, err := NewUint64Metric(name, sync, pb.MetricMetadata_UNITS_NANOSECONDS, description)
	if err != nil {
		panic(fmt.Sprintf("Unable to create metric %q: %s", name, err))
	}
	return m
}

// Value returns the current value of the metric for the given set of fields.
// This must be called with the correct number of field values or it will panic.
//
//go:nosplit
func (m *Uint64Metric) Value(fieldValues ...*FieldValue) uint64 {
	key := m.fieldMapper.lookupConcat(fieldValues, nil)
	return m.fields[key].Load()
}

// forEachNonZero iterates over each field combination and calls the given
// function whenever this metric's value is not zero.
func (m *Uint64Metric) forEachNonZero(f func(fieldValues []*FieldValue, value uint64)) {
	numCombinations := m.fieldMapper.numKeys()
	if len(m.fieldMapper.fields) == 0 {
		// Special-case the "there are no fields" case for speed and to avoid
		// allocating a slice.
		if val := m.fields[0].Load(); val != 0 {
			f(nil, val)
		}
		return
	}
	var fieldValues []*FieldValue
	for k := 0; k < numCombinations; k++ {
		val := m.fields[k].Load()
		if val == 0 {
			continue
		}
		if fieldValues == nil {
			fieldValues = make([]*FieldValue, len(m.fieldMapper.fields))
		}
		m.fieldMapper.keyToMultiFieldInPlace(k, fieldValues)
		f(fieldValues, val)
	}
}

// Increment increments the metric field by 1.
// This must be called with the correct number of field values or it will panic.
//
//go:nosplit
func (m *Uint64Metric) Increment(fieldValues ...*FieldValue) {
	key := m.fieldMapper.lookupConcat(fieldValues, nil)
	m.fields[key].Add(1)
}

// IncrementBy increments the metric by v.
// This must be called with the correct number of field values or it will panic.
//
//go:nosplit
func (m *Uint64Metric) IncrementBy(v uint64, fieldValues ...*FieldValue) {
	key := m.fieldMapper.lookupConcat(fieldValues, nil)
	m.fields[key].Add(v)
}

// Bucketer is an interface to bucket values into finite, distinct buckets.
type Bucketer interface {
	// NumFiniteBuckets is the number of finite buckets in the distribution.
	// This is only called once and never expected to return a different value.
	NumFiniteBuckets() int

	// LowerBound takes the index of a bucket (within [0, NumBuckets()]) and
	// returns the inclusive lower bound of that bucket.
	// In other words, the lowest value of `x` for which `BucketIndex(x) == i`
	// should be `x = LowerBound(i)`.
	// The upper bound of a bucket is the lower bound of the next bucket.
	// The last bucket (with `bucketIndex == NumFiniteBuckets()`) is infinite,
	// i.e. it has no upper bound (but it still has a lower bound).
	LowerBound(bucketIndex int) int64

	// BucketIndex takes a sample and returns the index of the bucket that the
	// sample should fall into.
	// Must return either:
	//   - A value within [0, NumBuckets() - 1] if the sample falls within a
	//     finite bucket
	//   - NumBuckets() if the sample falls within the last (infinite) bucket
	//   - '-1' if the sample is lower than what any bucket can represent, i.e.
	//     the sample should be in the implicit "underflow" bucket.
	// This function must be go:nosplit-compatible and have no escapes.
	// +checkescape:all
	BucketIndex(sample int64) int
}

// ExponentialBucketer implements Bucketer, with the first bucket starting
// with 0 as lowest bound with `Width` width, and each subsequent bucket being
// wider by a scaled exponentially-growing series, until `NumFiniteBuckets`
// buckets exist.
type ExponentialBucketer struct {
	// numFiniteBuckets is the total number of finite buckets in the scheme.
	numFiniteBuckets int

	// width is the size of the first (0-th) finite bucket.
	width float64

	// scale is a factor applied uniformly to the exponential growth portion
	// of the bucket size.
	scale float64

	// growth is the exponential growth factor for finite buckets.
	// The n-th bucket is `growth` times wider than the (n-1)-th bucket.
	// Bucket sizes are floored, so `width` and `growth` must be large enough
	// such that the second bucket is actually wider than the first after
	// flooring (unless, of course, fixed-width buckets are what's desired).
	growth float64

	// growthLog is math.Log(growth).
	growthLog float64

	// maxSample is the max sample value which can be represented in a finite
	// bucket.
	maxSample int64

	// lowerBounds is a precomputed set of lower bounds of the buckets.
	// The "underflow" bucket has no lower bound, so it is not included here.
	// lowerBounds[0] is the lower bound of the first finite bucket, which is
	// also the upper bound of the underflow bucket.
	// lowerBounds[numFiniteBuckets] is the lower bound of the overflow bucket.
	lowerBounds []int64
}

// Minimum/maximum finite buckets for exponential bucketers.
const (
	exponentialMinBuckets = 1
	exponentialMaxBuckets = 100
)

// NewExponentialBucketer returns a new Bucketer with exponential buckets.
func NewExponentialBucketer(numFiniteBuckets int, width uint64, scale, growth float64) *ExponentialBucketer {
	if numFiniteBuckets < exponentialMinBuckets || numFiniteBuckets > exponentialMaxBuckets {
		panic(fmt.Sprintf("number of finite buckets must be in [%d, %d]", exponentialMinBuckets, exponentialMaxBuckets))
	}
	if scale < 0 || growth < 0 {
		panic(fmt.Sprintf("scale and growth for exponential buckets must be >0, got scale=%f and growth=%f", scale, growth))
	}
	b := &ExponentialBucketer{
		numFiniteBuckets: numFiniteBuckets,
		width:            float64(width),
		scale:            scale,
		growth:           growth,
		growthLog:        math.Log(growth),
		lowerBounds:      make([]int64, numFiniteBuckets+1),
	}
	b.lowerBounds[0] = 0
	for i := 1; i <= numFiniteBuckets; i++ {
		b.lowerBounds[i] = int64(b.width*float64(i) + b.scale*math.Pow(b.growth, float64(i-1)))
		if b.lowerBounds[i] < 0 {
			panic(fmt.Sprintf("encountered bucket width overflow at bucket %d", i))
		}
	}
	b.maxSample = b.lowerBounds[numFiniteBuckets] - 1
	return b
}
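
// As a concrete illustration of the formula above (numbers chosen arbitrarily
// for the example, not taken from any caller): NewExponentialBucketer(4, 10, 1, 2)
// precomputes lowerBounds[i] = 10*i + 1*2^(i-1) for i >= 1, i.e.
//
//	lowerBounds = [0, 11, 22, 34, 48]
//
// so the finite buckets are [0, 11), [11, 22), [22, 34), [34, 48); samples of
// 48 and above land in the overflow bucket, and negative samples land in the
// underflow bucket (BucketIndex returns -1).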

// NumFiniteBuckets implements Bucketer.NumFiniteBuckets.
func (b *ExponentialBucketer) NumFiniteBuckets() int {
	return int(b.numFiniteBuckets)
}

// LowerBound implements Bucketer.LowerBound.
func (b *ExponentialBucketer) LowerBound(bucketIndex int) int64 {
	return b.lowerBounds[bucketIndex]
}

// BucketIndex implements Bucketer.BucketIndex.
// +checkescape:all
//
//go:nosplit
func (b *ExponentialBucketer) BucketIndex(sample int64) int {
	if sample < 0 {
		return -1
	}
	if sample == 0 {
		return 0
	}
	if sample > b.maxSample {
		return b.numFiniteBuckets
	}
	// Do a binary search. For the number of buckets we expect to deal with in
	// this code (a few dozen at most), this may be faster than computing a
	// logarithm. We can't use recursion because this would violate go:nosplit.
	lowIndex := 0
	highIndex := b.numFiniteBuckets
	for {
		pivotIndex := (highIndex + lowIndex) >> 1
		lowerBound := b.lowerBounds[pivotIndex]
		if sample < lowerBound {
			highIndex = pivotIndex
			continue
		}
		upperBound := b.lowerBounds[pivotIndex+1]
		if sample >= upperBound {
			lowIndex = pivotIndex
			continue
		}
		return pivotIndex
	}
}

// Verify that ExponentialBucketer implements Bucketer.
var _ = (Bucketer)((*ExponentialBucketer)(nil))

// DistributionMetric represents a distribution of values in finite buckets.
// It also separately keeps track of min/max in order to ascertain whether the
// buckets can faithfully represent the range of values encountered in the
// distribution.
type DistributionMetric struct {
	// exponentialBucketer is the bucketing scheme used for this metric.
	// Because we need DistributionMetric.AddSample to be go:nosplit-compatible,
	// we cannot use an interface reference here, as we would not be able to call
	// it in AddSample. Instead, we need one field per Bucketer implementation,
	// and we call whichever one is in use in AddSample.
	exponentialBucketer *ExponentialBucketer

	// metadata is the metadata about this metric. It is immutable.
	metadata *pb.MetricMetadata

	// prometheusMetric describes the metric in Prometheus format. It is immutable.
	prometheusMetric *prometheus.Metric

	// fieldsToKey converts multi-dimensional field values to a single index key
	// used to access `samples`.
	fieldsToKey fieldMapper

	// samples is the number of samples that fell within each bucket.
	// It is mapped by the concatenation of the fields using `fieldsToKey`.
	// The value is a list of bucket sample counts, with the 0-th being the
	// "underflow bucket", i.e. the bucket of samples which cannot fall into
	// any bucket that the bucketer supports.
	// The i-th value is the number of samples that fell into the bucketer's
	// (i-1)-th finite bucket.
	// The last value is the number of samples that fell into the bucketer's
	// last (i.e. infinite) bucket.
	samples [][]atomicbitops.Uint64

	// statistics is a set of statistics about each distribution.
	// It is mapped by the concatenation of the fields using `fieldsToKey`.
	statistics []distributionStatistics
}

// NewDistributionMetric creates and registers a new distribution metric.
func NewDistributionMetric(name string, sync bool, bucketer Bucketer, unit pb.MetricMetadata_Units, description string, fields ...Field) (*DistributionMetric, error) {
	if err := verifyName(name); err != nil {
		return nil, err
	}
	if initialized.Load() {
		return nil, ErrInitializationDone
	}
	if _, ok := allMetrics.uint64Metrics[name]; ok {
		return nil, ErrNameInUse
	}
	if _, ok := allMetrics.distributionMetrics[name]; ok {
		return nil, ErrNameInUse
	}

	var exponentialBucketer *ExponentialBucketer
	if expBucketer, ok := bucketer.(*ExponentialBucketer); ok {
		exponentialBucketer = expBucketer
	} else {
		return nil, fmt.Errorf("unsupported bucketer implementation: %T", bucketer)
	}
	fieldsToKey, err := newFieldMapper(fields...)
	if err != nil {
		return nil, err
	}

	numFiniteBuckets := bucketer.NumFiniteBuckets()
	samples := fieldsToKey.makeDistributionSampleMap(numFiniteBuckets + 2)
	protoFields := make([]*pb.MetricMetadata_Field, len(fields))
	for i, f := range fields {
		protoFields[i] = f.toProto()
	}
	lowerBounds := make([]int64, numFiniteBuckets+1)
	for i := 0; i <= numFiniteBuckets; i++ {
		lowerBounds[i] = bucketer.LowerBound(i)
	}
	allMetrics.distributionMetrics[name] = &DistributionMetric{
		exponentialBucketer: exponentialBucketer,
		fieldsToKey:         fieldsToKey,
		samples:             samples,
		statistics:          make([]distributionStatistics, fieldsToKey.numKeys()),
		metadata: &pb.MetricMetadata{
			Name:                          name,
			PrometheusName:                nameToPrometheusName(name),
			Description:                   description,
			Cumulative:                    false,
			Sync:                          sync,
			Type:                          pb.MetricMetadata_TYPE_DISTRIBUTION,
			Units:                         unit,
			Fields:                        protoFields,
			DistributionBucketLowerBounds: lowerBounds,
		},
		prometheusMetric: &prometheus.Metric{
			Name: nameToPrometheusName(name),
			Type: prometheus.TypeHistogram,
			Help: description,
		},
	}
	return allMetrics.distributionMetrics[name], nil
}

// MustCreateNewDistributionMetric creates and registers a distribution metric.
// If an error occurs, it panics.
func MustCreateNewDistributionMetric(name string, sync bool, bucketer Bucketer, unit pb.MetricMetadata_Units, description string, fields ...Field) *DistributionMetric {
	distrib, err := NewDistributionMetric(name, sync, bucketer, unit, description, fields...)
	if err != nil {
		panic(err)
	}
	return distrib
}
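
// A brief usage sketch from a client package (the metric name and bucket
// parameters are invented for the example; `pb` refers to the metric proto
// package imported above):
//
//	var requestBytes = metric.MustCreateNewDistributionMetric("/example/request_bytes",
//		false /* sync */, metric.NewExponentialBucketer(16, 256, 1, 2),
//		pb.MetricMetadata_UNITS_NONE, "Distribution of request sizes in bytes.")
//
//	func onRequest(size int64) {
//		requestBytes.AddSample(size)
//	}
//
// AddSample (defined below) buckets the sample with BucketIndex and also
// updates the running distributionStatistics (count, sum, min, max, variance).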

// distributionStatistics is a set of useful statistics for a distribution.
// As metric update operations must be non-blocking, this uses a bunch of
// atomic numbers rather than a mutex.
type distributionStatistics struct {
	// sampleCount is the total number of samples.
	sampleCount atomicbitops.Uint64

	// sampleSum is the sum of samples.
	sampleSum atomicbitops.Int64

	// sumOfSquaredDeviations is the running sum of squared deviations from the
	// mean of each sample.
	// This quantity is useful as part of Welford's online algorithm:
	// https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm
	sumOfSquaredDeviations atomicbitops.Float64

	// min and max are the minimum and maximum samples ever recorded.
	min, max atomicbitops.Int64
}

// Update updates the distribution statistics with the given sample.
// This function must be non-blocking, i.e. no mutexes.
// As a result, it is not entirely accurate when it races with itself,
// though the imprecision should be fairly small and should not practically
// matter for distributions with more than a handful of records.
func (s *distributionStatistics) Update(sample int64) {
	newSampleCount := s.sampleCount.Add(1)
	newSampleSum := s.sampleSum.Add(sample)

	if newSampleCount > 1 {
		// Not the first sample of the distribution.
		floatSample := float64(sample)
		oldMean := float64(newSampleSum-sample) / float64(newSampleCount-1)
		newMean := float64(newSampleSum) / float64(newSampleCount)
		devSquared := (floatSample - oldMean) * (floatSample - newMean)
		s.sumOfSquaredDeviations.Add(devSquared)

		// Update min and max.
		// We optimistically load racily here in the hope that it passes the CaS
		// operation. If it doesn't, we'll load it atomically, so this is not a
		// race.
		sync.RaceDisable()
		for oldMin := s.min.RacyLoad(); sample < oldMin && !s.min.CompareAndSwap(oldMin, sample); oldMin = s.min.Load() {
		}
		for oldMax := s.max.RacyLoad(); sample > oldMax && !s.max.CompareAndSwap(oldMax, sample); oldMax = s.max.Load() {
		}
		sync.RaceEnable()
	} else {
		// We are the first sample, so set the min and max to the current sample.
		// See above for why disabling race detection is safe here as well.
		sync.RaceDisable()
		if !s.min.CompareAndSwap(0, sample) {
			for oldMin := s.min.RacyLoad(); sample < oldMin && !s.min.CompareAndSwap(oldMin, sample); oldMin = s.min.Load() {
			}
		}
		if !s.max.CompareAndSwap(0, sample) {
			for oldMax := s.max.RacyLoad(); sample > oldMax && !s.max.CompareAndSwap(oldMax, sample); oldMax = s.max.Load() {
			}
		}
		sync.RaceEnable()
	}
}
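
// For reference, the update above is the step of Welford's online algorithm
// that accumulates M2 (the sum of squared deviations); a consumer of a
// statistics snapshot can recover the sample variance as
//
//	variance = sumOfSquaredDeviations / (sampleCount - 1)
//
// for sampleCount > 1. This is a property of the algorithm linked in the
// comment above, not an API exposed by this package.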

// distributionStatisticsSnapshot is an atomically-loaded snapshot of
// distributionStatistics.
type distributionStatisticsSnapshot struct {
	// sampleCount is the total number of samples.
	sampleCount uint64

	// sampleSum is the sum of samples.
	sampleSum int64

	// sumOfSquaredDeviations is the running sum of squared deviations from the
	// mean of each sample.
	// This quantity is useful as part of Welford's online algorithm:
	// https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm
	sumOfSquaredDeviations float64

	// min and max are the minimum and maximum samples ever recorded.
	min, max int64
}

// Load generates a consistent snapshot of the distribution statistics.
func (s *distributionStatistics) Load() distributionStatisticsSnapshot {
	// We start out reading things racily, but will verify each of them
	// atomically later in this function, so this is OK. Disable the race
	// checker for this part of the function.
	sync.RaceDisable()
	snapshot := distributionStatisticsSnapshot{
		sampleCount:            s.sampleCount.RacyLoad(),
		sampleSum:              s.sampleSum.RacyLoad(),
		sumOfSquaredDeviations: s.sumOfSquaredDeviations.RacyLoad(),
		min:                    s.min.RacyLoad(),
		max:                    s.max.RacyLoad(),
	}
	sync.RaceEnable()

	// Now verify that we loaded an atomic snapshot of the statistics.
	// This relies on the fact that each update should at least change the
	// count statistic, so we should be able to tell if anything changed based
	// on whether we have an exact match with the currently-loaded values.
	// If not, we reload that value and try again until all is consistent.
retry:
	if sampleCount := s.sampleCount.Load(); sampleCount != snapshot.sampleCount {
		snapshot.sampleCount = sampleCount
		goto retry
	}
	if sampleSum := s.sampleSum.Load(); sampleSum != snapshot.sampleSum {
		snapshot.sampleSum = sampleSum
		goto retry
	}
	if ssd := s.sumOfSquaredDeviations.Load(); ssd != snapshot.sumOfSquaredDeviations {
		snapshot.sumOfSquaredDeviations = ssd
		goto retry
	}
	if min := s.min.Load(); min != snapshot.min {
		snapshot.min = min
		goto retry
	}
	if max := s.max.Load(); max != snapshot.max {
		snapshot.max = max
		goto retry
	}
	return snapshot
}

// AddSample adds a sample to the distribution.
// This *must* be called with the correct number of fields, or it will panic.
// +checkescape:all
//
//go:nosplit
func (d *DistributionMetric) AddSample(sample int64, fields ...*FieldValue) {
	d.addSampleByKey(sample, d.fieldsToKey.lookup(fields...))
}

// addSampleByKey works like AddSample, with the field key already known.
// +checkescape:all
//
//go:nosplit
func (d *DistributionMetric) addSampleByKey(sample int64, key int) {
	bucket := d.exponentialBucketer.BucketIndex(sample)
	d.samples[key][bucket+1].Add(1)
	d.statistics[key].Update(sample)
}

// Minimum number of buckets for NewDurationBucketer.
const durationMinBuckets = 3

// NewDurationBucketer returns a Bucketer well-suited for measuring durations in
// nanoseconds. Useful for NewTimerMetric.
// minDuration and maxDuration are conservative estimates of the minimum and
// maximum durations expected to be accurately measured by the Bucketer.
func NewDurationBucketer(numFiniteBuckets int, minDuration, maxDuration time.Duration) Bucketer {
	if numFiniteBuckets < durationMinBuckets {
		panic(fmt.Sprintf("duration bucketer must have at least %d buckets, got %d", durationMinBuckets, numFiniteBuckets))
	}
	minNs := minDuration.Nanoseconds()
	exponentCoversNs := float64(maxDuration.Nanoseconds()-int64(numFiniteBuckets-durationMinBuckets)*minNs) / float64(minNs)
	exponent := math.Log(exponentCoversNs) / math.Log(float64(numFiniteBuckets-durationMinBuckets))
	minNs = int64(float64(minNs) / exponent)
	return NewExponentialBucketer(numFiniteBuckets, uint64(minNs), float64(minNs), exponent)
}

// TimerMetric wraps a distribution metric with convenience functions for
// latency measurements, which is a popular specialization of distribution
// metrics.
type TimerMetric struct {
	DistributionMetric
}

// NewTimerMetric provides a convenient way to measure latencies.
// The arguments are the same as `NewDistributionMetric`, except:
//   - `nanoBucketer`: Same as `NewDistribution`'s `bucketer`, expected to hold
//     durations in nanoseconds. Adjust parameters accordingly.
//     NewDurationBucketer may be helpful here.
func NewTimerMetric(name string, nanoBucketer Bucketer, description string, fields ...Field) (*TimerMetric, error) {
	distrib, err := NewDistributionMetric(name, false, nanoBucketer, pb.MetricMetadata_UNITS_NANOSECONDS, description, fields...)
	if err != nil {
		return nil, err
	}
	return &TimerMetric{
		DistributionMetric: *distrib,
	}, nil
}

// MustCreateNewTimerMetric creates and registers a timer metric.
// If an error occurs, it panics.
func MustCreateNewTimerMetric(name string, nanoBucketer Bucketer, description string, fields ...Field) *TimerMetric {
	timer, err := NewTimerMetric(name, nanoBucketer, description, fields...)
	if err != nil {
		panic(err)
	}
	return timer
}

// TimedOperation is used by TimerMetric to keep track of the time elapsed
// between an operation starting and stopping.
type TimedOperation struct {
	// metric is a reference to the timer metric for the operation.
	metric *TimerMetric

	// partialFields is a prefix of the fields used in this operation.
	// The rest of the fields is provided in TimedOperation.Finish.
	partialFields []*FieldValue

	// startedNs is the number of nanoseconds measured in TimerMetric.Start().
	startedNs int64
}

// Start starts a timer measurement for the given combination of fields.
// It returns a TimedOperation which can be passed around as necessary to
// measure the duration of the operation.
// Once the operation is finished, call Finish on the TimedOperation.
// The fields passed to Start may be partially specified; if so, the remaining
// fields must be passed to TimedOperation.Finish. This is useful for cases
// where the path an operation took is only known after it happens. This
// path can be part of the fields passed to Finish.
// +checkescape:all
//
//go:nosplit
func (t *TimerMetric) Start(fields ...*FieldValue) TimedOperation {
	return TimedOperation{
		metric:        t,
		partialFields: fields,
		startedNs:     CheapNowNano(),
	}
}

// Finish marks an operation as finished and records its duration.
// `extraFields` is the rest of the fields appended to the fields passed to
// `TimerMetric.Start`. The concatenation of these two must be the exact
// number of fields that the underlying metric has.
// +checkescape:all
//
//go:nosplit
func (o TimedOperation) Finish(extraFields ...*FieldValue) {
	ended := CheapNowNano()
	fieldKey := o.metric.fieldsToKey.lookupConcat(o.partialFields, extraFields)
	o.metric.addSampleByKey(ended-o.startedNs, fieldKey)
}
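
// A short usage sketch with invented names (doRPC is hypothetical); the
// "result" field illustrates the partial-fields pattern described in Start
// and Finish above:
//
//	var (
//		resultOK  = metric.FieldValue{"ok"}
//		resultErr = metric.FieldValue{"error"}
//
//		rpcLatency = metric.MustCreateNewTimerMetric("/example/rpc_latency",
//			metric.NewDurationBucketer(16, time.Microsecond, time.Minute),
//			"RPC latency.", metric.NewField("result", &resultOK, &resultErr))
//	)
//
//	func handleRPC() {
//		op := rpcLatency.Start() // Result not known yet.
//		if err := doRPC(); err != nil {
//			op.Finish(&resultErr)
//		} else {
//			op.Finish(&resultOK)
//		}
//	}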

// stageTiming contains timing data for an initialization stage.
type stageTiming struct {
	stage   InitStage
	started time.Time
	// ended is the zero time when the stage has not ended yet.
	ended time.Time
}

// inProgress returns whether this stage hasn't ended yet.
func (s stageTiming) inProgress() bool {
	return !s.started.IsZero() && s.ended.IsZero()
}

// metricSet holds metric data.
type metricSet struct {
	// Metric registration data for all the metrics below.
	registration *pb.MetricRegistration

	// Map of uint64 metrics.
	uint64Metrics map[string]customUint64Metric

	// Map of distribution metrics.
	distributionMetrics map[string]*DistributionMetric

	// mu protects the fields below.
	mu sync.RWMutex

	// Information about the stages reached by the Sentry. Only appended to, so
	// reading a shallow copy of the slice header concurrently is safe.
	finished []stageTiming

	// The current stage in progress.
	currentStage stageTiming
}

// makeMetricSet returns a new metricSet.
func makeMetricSet() *metricSet {
	return &metricSet{
		uint64Metrics:       make(map[string]customUint64Metric),
		distributionMetrics: make(map[string]*DistributionMetric),
		finished:            make([]stageTiming, 0, len(allStages)),
	}
}

// Values returns a snapshot of all values in m.
func (m *metricSet) Values() metricValues {
	m.mu.Lock()
	stages := m.finished[:]
	m.mu.Unlock()

	vals := metricValues{
		uint64Metrics:            make(map[string]any, len(m.uint64Metrics)),
		distributionMetrics:      make(map[string][][]uint64, len(m.distributionMetrics)),
		distributionTotalSamples: make(map[string][]uint64, len(m.distributionMetrics)),
		distributionStatistics:   make(map[string][]distributionStatisticsSnapshot, len(m.distributionMetrics)),
		stages:                   stages,
	}
	for k, v := range m.uint64Metrics {
		fields := v.fields
		switch len(fields) {
		case 0:
			vals.uint64Metrics[k] = v.value()
		case 1:
			fieldsMap := make(map[*FieldValue]uint64)
			if v.forEachNonZero != nil {
				v.forEachNonZero(func(fieldValues []*FieldValue, val uint64) {
					fieldsMap[fieldValues[0]] = val
				})
			} else {
				for _, fieldValue := range fields[0].values {
					fieldsMap[fieldValue] = v.value(fieldValue)
				}
			}
			vals.uint64Metrics[k] = fieldsMap
		default:
			panic(fmt.Sprintf("Unsupported number of metric fields: %d", len(fields)))
		}
	}
	for name, metric := range m.distributionMetrics {
		fieldKeysToValues := make([][]uint64, len(metric.samples))
		fieldKeysToTotalSamples := make([]uint64, len(metric.samples))
		fieldKeysToStatistics := make([]distributionStatisticsSnapshot, len(metric.samples))
		for fieldKey, samples := range metric.samples {
			samplesSnapshot := snapshotDistribution(samples)
			totalSamples := uint64(0)
			for _, bucket := range samplesSnapshot {
				totalSamples += bucket
			}
			if totalSamples == 0 {
				// No samples recorded for this combination of field, so leave
				// the maps for this fieldKey as nil. This lessens the memory cost
				// of distributions with unused field combinations.
				fieldKeysToTotalSamples[fieldKey] = 0
				fieldKeysToStatistics[fieldKey] = distributionStatisticsSnapshot{}
				fieldKeysToValues[fieldKey] = nil
			} else {
				fieldKeysToTotalSamples[fieldKey] = totalSamples
				fieldKeysToStatistics[fieldKey] = metric.statistics[fieldKey].Load()
				fieldKeysToValues[fieldKey] = samplesSnapshot
			}
		}
		vals.distributionMetrics[name] = fieldKeysToValues
		vals.distributionTotalSamples[name] = fieldKeysToTotalSamples
		vals.distributionStatistics[name] = fieldKeysToStatistics
	}
	return vals
}

// metricValues contains a copy of the values of all metrics.
type metricValues struct {
	// uint64Metrics is a map of uint64 metrics,
	// with key as metric name. Value can be either uint64, or map[*FieldValue]uint64
	// to support metrics with one field.
	uint64Metrics map[string]any

	// distributionMetrics is a map of distribution metrics.
	// The first key level is the metric name.
	// The second key level is an index ID corresponding to the combination of
	// field values. The index is decoded to field strings using keyToMultiField.
	// The slice value is the number of samples in each bucket of the
	// distribution, with the first (0-th) element being the underflow bucket
	// and the last element being the "infinite" (overflow) bucket.
	// The slice value may also be nil for field combinations with no samples.
	// This saves memory by avoiding storing anything for unused field
	// combinations.
	distributionMetrics map[string][][]uint64

	// distributionTotalSamples is the total number of samples for each
	// distribution metric and field values.
	// It allows performing a quick diff between snapshots without having to
	// iterate over all the buckets individually, so that distributions with
	// no new samples are not retransmitted.
	distributionTotalSamples map[string][]uint64

	// distributionStatistics is a set of statistics about the samples.
	distributionStatistics map[string][]distributionStatisticsSnapshot

	// Information on when initialization stages were reached. Does not include
	// the currently-ongoing stage, if any.
	stages []stageTiming
}

var (
	// emitMu protects metricsAtLastEmit and ensures that all emitted
	// metrics are strongly ordered (older metrics are never emitted after
	// newer metrics).
	emitMu sync.Mutex

	// metricsAtLastEmit contains the state of the metrics at the last emit event.
	metricsAtLastEmit metricValues
)

// EmitMetricUpdate emits a MetricUpdate over the event channel.
//
// Only metrics that have changed since the last call are emitted.
//
// EmitMetricUpdate is thread-safe.
//
// Preconditions:
//   - Initialize has been called.
func EmitMetricUpdate() {
	emitMu.Lock()
	defer emitMu.Unlock()

	snapshot := allMetrics.Values()

	m := pb.MetricUpdate{}
	// On the first call metricsAtLastEmit will be empty. Include all
	// metrics then.
	for k, v := range snapshot.uint64Metrics {
		prev, ok := metricsAtLastEmit.uint64Metrics[k]
		switch t := v.(type) {
		case uint64:
			// Metric exists and value did not change.
			if ok && prev.(uint64) == t {
				continue
			}

			m.Metrics = append(m.Metrics, &pb.MetricValue{
				Name:  k,
				Value: &pb.MetricValue_Uint64Value{Uint64Value: t},
			})
		case map[*FieldValue]uint64:
			for fieldValue, metricValue := range t {
				// Emit data on the first call only if the field
				// value has been incremented. For all other
				// calls, emit data if the field value has been
				// changed from the previous emit.
				if (!ok && metricValue == 0) || (ok && prev.(map[*FieldValue]uint64)[fieldValue] == metricValue) {
					continue
				}

				m.Metrics = append(m.Metrics, &pb.MetricValue{
					Name:        k,
					FieldValues: []string{fieldValue.Value},
					Value:       &pb.MetricValue_Uint64Value{Uint64Value: metricValue},
				})
			}
		default:
			panic(fmt.Sprintf("unsupported type in uint64Metrics: %T (%v)", v, v))
		}
	}
	for name, dist := range snapshot.distributionTotalSamples {
		prev, ok := metricsAtLastEmit.distributionTotalSamples[name]
		for fieldKey, currentTotal := range dist {
			if currentTotal == 0 {
				continue
			}
			if ok {
				if prevTotal := prev[fieldKey]; prevTotal == currentTotal {
					continue
				}
			}
			oldSamples := metricsAtLastEmit.distributionMetrics[name]
			var newSamples []uint64
			if oldSamples != nil && oldSamples[fieldKey] != nil {
				currentSamples := snapshot.distributionMetrics[name][fieldKey]
				numBuckets := len(currentSamples)
				newSamples = make([]uint64, numBuckets)
				for i := 0; i < numBuckets; i++ {
					newSamples[i] = currentSamples[i] - oldSamples[fieldKey][i]
				}
			} else {
				// oldSamples == nil means that the previous snapshot has no samples.
				// This means the delta is the current number of samples, no need for
				// a copy.
				newSamples = snapshot.distributionMetrics[name][fieldKey]
			}
			m.Metrics = append(m.Metrics, &pb.MetricValue{
				Name:        name,
				FieldValues: allMetrics.distributionMetrics[name].fieldsToKey.keyToMultiField(fieldKey),
				Value: &pb.MetricValue_DistributionValue{
					DistributionValue: &pb.Samples{
						NewSamples: newSamples,
					},
				},
			})
		}
	}

	for s := len(metricsAtLastEmit.stages); s < len(snapshot.stages); s++ {
		newStage := snapshot.stages[s]
		m.StageTiming = append(m.StageTiming, &pb.StageTiming{
			Stage: string(newStage.stage),
			Started: &timestamppb.Timestamp{
				Seconds: newStage.started.Unix(),
				Nanos:   int32(newStage.started.Nanosecond()),
			},
			Ended: &timestamppb.Timestamp{
				Seconds: newStage.ended.Unix(),
				Nanos:   int32(newStage.ended.Nanosecond()),
			},
		})
	}

	metricsAtLastEmit = snapshot
	if len(m.Metrics) == 0 && len(m.StageTiming) == 0 {
		return
	}

	if log.IsLogging(log.Debug) {
		sort.Slice(m.Metrics, func(i, j int) bool {
			return m.Metrics[i].GetName() < m.Metrics[j].GetName()
		})
		log.Debugf("Emitting metrics:")
		for _, metric := range m.Metrics {
			var valueStr string
			switch metric.GetValue().(type) {
			case *pb.MetricValue_Uint64Value:
				valueStr = fmt.Sprintf("%d", metric.GetUint64Value())
			case *pb.MetricValue_DistributionValue:
				valueStr = fmt.Sprintf("new distribution samples: %+v", metric.GetDistributionValue())
			default:
				valueStr = "unsupported type"
			}
			if len(metric.GetFieldValues()) > 0 {
				var foundMetadata *pb.MetricMetadata
				if metricObj, found := allMetrics.uint64Metrics[metric.GetName()]; found {
					foundMetadata = metricObj.metadata
				} else if metricObj, found := allMetrics.distributionMetrics[metric.GetName()]; found {
					foundMetadata = metricObj.metadata
				}
				if foundMetadata == nil || len(foundMetadata.GetFields()) != len(metric.GetFieldValues()) {
					// This should never happen, but if it somehow does, we don't want to crash here, as
					// this is debug output that may already be printed in the context of panic.
					log.Debugf("%s%v (cannot find metric definition!): %s", metric.GetName(), metric.GetFieldValues(), valueStr)
					continue
				}
				var sb strings.Builder
				for i, fieldValue := range metric.GetFieldValues() {
					if i > 0 {
						sb.WriteRune(',')
					}
					sb.WriteString(foundMetadata.GetFields()[i].GetFieldName())
					sb.WriteRune('=')
					sb.WriteString(fieldValue)
				}
				log.Debugf(" Metric %s[%s]: %s", metric.GetName(), sb.String(), valueStr)
			} else {
				log.Debugf(" Metric %s: %s", metric.GetName(), valueStr)
			}
		}
		for _, stage := range m.StageTiming {
			duration := time.Duration(stage.Ended.Seconds-stage.Started.Seconds)*time.Second + time.Duration(stage.Ended.Nanos-stage.Started.Nanos)*time.Nanosecond
			log.Debugf("Stage %s took %v", stage.GetStage(), duration)
		}
	}

	if err := eventchannel.Emit(&m); err != nil {
		log.Warningf("Unable to emit metrics: %s", err)
	}
}

// SnapshotOptions controls how snapshots are exported in GetSnapshot.
type SnapshotOptions struct {
	// Filter, if set, should return true for metrics that should be written to
	// the snapshot. If unset, all metrics are written to the snapshot.
	Filter func(*prometheus.Metric) bool
}

// GetSnapshot returns a Prometheus snapshot of the metric data.
// Returns ErrNotYetInitialized if metrics have not yet been initialized.
func GetSnapshot(options SnapshotOptions) (*prometheus.Snapshot, error) {
	if !initialized.Load() {
		return nil, ErrNotYetInitialized
	}
	values := allMetrics.Values()
	snapshot := prometheus.NewSnapshot()
	for k, v := range values.uint64Metrics {
		m := allMetrics.uint64Metrics[k]
		if options.Filter != nil && !options.Filter(m.prometheusMetric) {
			continue
		}
		switch t := v.(type) {
		case uint64:
			if m.metadata.GetCumulative() && t == 0 {
				// Zero-valued counter, ignore.
				continue
			}
			snapshot.Add(prometheus.NewIntData(m.prometheusMetric, int64(t)))
		case map[*FieldValue]uint64:
			for fieldValue, metricValue := range t {
				if m.metadata.GetCumulative() && metricValue == 0 {
					// Zero-valued counter, ignore.
					continue
				}
				snapshot.Add(prometheus.LabeledIntData(m.prometheusMetric, map[string]string{
					// uint64 metrics currently only support at most one field name.
					m.metadata.Fields[0].GetFieldName(): fieldValue.Value,
				}, int64(metricValue)))
			}
		default:
			panic(fmt.Sprintf("unsupported type in uint64Metrics: %T (%v)", v, v))
		}
	}
	for k, dists := range values.distributionTotalSamples {
		m := allMetrics.distributionMetrics[k]
		if options.Filter != nil && !options.Filter(m.prometheusMetric) {
			continue
		}
		distributionSamples := values.distributionMetrics[k]
		numFiniteBuckets := m.exponentialBucketer.NumFiniteBuckets()
		statistics := values.distributionStatistics[k]
		for fieldKey := range dists {
			var labels map[string]string
			if numFields := m.fieldsToKey.numKeys(); numFields > 0 {
				labels = make(map[string]string, numFields)
				for fieldIndex, field := range m.fieldsToKey.keyToMultiField(fieldKey) {
					labels[m.metadata.Fields[fieldIndex].GetFieldName()] = field
				}
			}
			currentSamples := distributionSamples[fieldKey]
			buckets := make([]prometheus.Bucket, numFiniteBuckets+2)
			samplesForFieldKey := uint64(0)
			for b := 0; b < numFiniteBuckets+2; b++ {
				var upperBound prometheus.Number
				if b == numFiniteBuckets+1 {
					upperBound = prometheus.Number{Float: math.Inf(1)} // Overflow bucket.
				} else {
					upperBound = prometheus.Number{Int: m.exponentialBucketer.LowerBound(b)}
				}
				samples := uint64(0)
				if currentSamples != nil {
					samples = currentSamples[b]
					samplesForFieldKey += samples
				}
				buckets[b] = prometheus.Bucket{
					Samples:    samples,
					UpperBound: upperBound,
				}
			}
			if samplesForFieldKey == 0 {
				// Zero-valued distribution (no samples in any bucket for this field
				// combination). Ignore.
				continue
			}
			snapshot.Add(&prometheus.Data{
				Metric: m.prometheusMetric,
				Labels: labels,
				HistogramValue: &prometheus.Histogram{
					Total:                  prometheus.Number{Int: statistics[fieldKey].sampleSum},
					SumOfSquaredDeviations: prometheus.Number{Float: statistics[fieldKey].sumOfSquaredDeviations},
					Min:                    prometheus.Number{Int: statistics[fieldKey].min},
					Max:                    prometheus.Number{Int: statistics[fieldKey].max},
					Buckets:                buckets,
				},
			})
		}
	}
	return snapshot, nil
}
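
// A small sketch of exporting a filtered snapshot from a client package (the
// name-prefix filter is just one example of a predicate; any
// func(*prometheus.Metric) bool works):
//
//	snap, err := metric.GetSnapshot(metric.SnapshotOptions{
//		Filter: func(m *prometheus.Metric) bool {
//			return strings.HasPrefix(m.Name, "weirdness")
//		},
//	})
//	if err != nil {
//		// Metrics were not initialized (metric.ErrNotYetInitialized).
//	}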

// StartStage should be called when an initialization stage is started.
// It returns a function that must be called to indicate that the stage ended.
// Alternatively, future calls to StartStage will implicitly indicate that the
// previous stage ended.
// Stage information will be emitted in the next call to EmitMetricUpdate after
// a stage has ended.
//
// This function may (and is expected to) be called prior to final
// initialization of this metric library, as it has to capture early stages
// of Sentry initialization.
func StartStage(stage InitStage) func() {
	now := time.Now()
	allMetrics.mu.Lock()
	defer allMetrics.mu.Unlock()
	if allMetrics.currentStage.inProgress() {
		endStage(now)
	}
	allMetrics.currentStage.stage = stage
	allMetrics.currentStage.started = now
	return func() {
		now := time.Now()
		allMetrics.mu.Lock()
		defer allMetrics.mu.Unlock()
		// The current stage may have been ended by another call to StartStage, so
		// double-check prior to clearing the current stage.
		if allMetrics.currentStage.inProgress() && allMetrics.currentStage.stage == stage {
			endStage(now)
		}
	}
}
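
// Typical use is to bracket a stage with the returned closure (sketch only;
// the surrounding control flow is not part of this package):
//
//	endTaskStart := metric.StartStage(metric.InitTaskStart)
//	// ... start tasks ...
//	endTaskStart()
//
// Or, since a later StartStage implicitly ends the previous stage, callers
// may simply chain StartStage calls and only invoke the final closure.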

// endStage marks allMetrics.currentStage as ended, adding it to the list of
// finished stages. It assumes allMetrics.mu is locked.
func endStage(when time.Time) {
	allMetrics.currentStage.ended = when
	allMetrics.finished = append(allMetrics.finished, allMetrics.currentStage)
	allMetrics.currentStage = stageTiming{}
}