// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package metric provides primitives for collecting metrics.
package metric

import (
	"errors"
	"fmt"
	"math"
	re "regexp"
	"sort"
	"strings"
	"time"

	"google.golang.org/protobuf/types/known/timestamppb"
	"github.com/nicocha30/gvisor-ligolo/pkg/atomicbitops"
	"github.com/nicocha30/gvisor-ligolo/pkg/eventchannel"
	"github.com/nicocha30/gvisor-ligolo/pkg/log"
	pb "github.com/nicocha30/gvisor-ligolo/pkg/metric/metric_go_proto"
	"github.com/nicocha30/gvisor-ligolo/pkg/prometheus"
	"github.com/nicocha30/gvisor-ligolo/pkg/sync"
)

var (
	// ErrNameInUse indicates that another metric is already defined for
	// the given name.
	ErrNameInUse = errors.New("metric name already in use")

	// ErrInitializationDone indicates that the caller tried to create a
	// new metric after initialization.
	ErrInitializationDone = errors.New("metric cannot be created after initialization is complete")

	// ErrFieldValueContainsIllegalChar indicates that the value of a metric
	// field had an invalid character in it.
	ErrFieldValueContainsIllegalChar = errors.New("metric field value contains illegal character")

	// ErrFieldHasNoAllowedValues indicates that the field needs to define some
	// allowed values to be a valid and useful field.
	ErrFieldHasNoAllowedValues = errors.New("metric field does not define any allowed values")

	// ErrTooManyFieldCombinations indicates that the number of unique
	// combinations of fields is too large to support.
	ErrTooManyFieldCombinations = errors.New("metric has too many combinations of allowed field values")
)

// Weirdness metric type constants.
var (
	WeirdnessTypeTimeFallback         = FieldValue{"time_fallback"}
	WeirdnessTypePartialResult        = FieldValue{"partial_result"}
	WeirdnessTypeVsyscallCount        = FieldValue{"vsyscall_count"}
	WeirdnessTypeWatchdogStuckStartup = FieldValue{"watchdog_stuck_startup"}
	WeirdnessTypeWatchdogStuckTasks   = FieldValue{"watchdog_stuck_tasks"}
)

// Suspicious operations metric type constants.
var (
	SuspiciousOperationsTypeOpenedWriteExecuteFile = FieldValue{"opened_write_execute_file"}
)

// List of global metrics that are used in multiple places.
var (
	// WeirdnessMetric is a metric with fields created to track the number
	// of weird occurrences such as time fallback, partial_result, vsyscall
	// count, watchdog startup timeouts and stuck tasks.
	WeirdnessMetric = MustCreateNewUint64Metric("/weirdness", true /* sync */, "Increment for weird occurrences of problems such as time fallback, partial result, vsyscalls invoked in the sandbox, watchdog startup timeouts and stuck tasks.",
		NewField("weirdness_type",
			&WeirdnessTypeTimeFallback,
			&WeirdnessTypePartialResult,
			&WeirdnessTypeVsyscallCount,
			&WeirdnessTypeWatchdogStuckStartup,
			&WeirdnessTypeWatchdogStuckTasks,
		))

	// SuspiciousOperationsMetric is a metric with fields created to detect
	// operations such as opening an executable file to write from a gofer.
	SuspiciousOperationsMetric = MustCreateNewUint64Metric("/suspicious_operations", true /* sync */, "Increment for suspicious operations such as opening an executable file to write from a gofer.",
		NewField("operation_type",
			&SuspiciousOperationsTypeOpenedWriteExecuteFile,
		))
)
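
// A minimal usage sketch (illustrative only, not additional functionality in
// this file): callers increment these global metrics by passing the address
// of the same package-level FieldValue that was used at registration time,
// e.g.:
//
//	// Somewhere in the sandbox, after a time fallback occurs:
//	WeirdnessMetric.Increment(&WeirdnessTypeTimeFallback)
//
// Passing a freshly constructed FieldValue with an equal string but a
// different address would panic in fieldMapper.lookupSingle, so the
// package-level vars above must be reused directly.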

// InitStage is the name of a Sentry initialization stage.
type InitStage string

// List of all Sentry initialization stages.
var (
	InitRestoreConfig InitStage = "restore_config"
	InitExecConfig    InitStage = "exec_config"
	InitRestore       InitStage = "restore"
	InitCreateProcess InitStage = "create_process"
	InitTaskStart     InitStage = "task_start"

	// allStages is the list of allowed stages.
	allStages = []InitStage{
		InitRestoreConfig,
		InitExecConfig,
		InitRestore,
		InitCreateProcess,
		InitTaskStart,
	}
)

// Uint64Metric encapsulates a uint64 that represents some kind of metric to be
// monitored.
//
// Metrics are not saved across save/restore and thus reset to zero on restore.
type Uint64Metric struct {
	name string

	// fields is the map of field-value combination index keys to Uint64 counters.
	fields []atomicbitops.Uint64

	// fieldMapper is used to generate index keys for the fields array (above)
	// based on field value combinations, and vice-versa.
	fieldMapper fieldMapper
}

var (
	// initialized indicates that all metrics are registered. allMetrics is
	// immutable once initialized is true.
	initialized atomicbitops.Bool

	// allMetrics are the registered metrics.
	allMetrics = makeMetricSet()
)

// Initialize sends a metric registration event over the event channel.
//
// Precondition:
//   - All metrics are registered.
//   - Initialize/Disable has not been called.
func Initialize() error {
	if initialized.Load() {
		return errors.New("metric.Initialize called after metric.Initialize or metric.Disable")
	}

	m := pb.MetricRegistration{}
	for _, v := range allMetrics.uint64Metrics {
		m.Metrics = append(m.Metrics, v.metadata)
	}
	for _, v := range allMetrics.distributionMetrics {
		m.Metrics = append(m.Metrics, v.metadata)
	}
	m.Stages = make([]string, 0, len(allStages))
	for _, s := range allStages {
		m.Stages = append(m.Stages, string(s))
	}
	allMetrics.registration = &m
	if err := eventchannel.Emit(&m); err != nil {
		return fmt.Errorf("unable to emit metric initialize event: %w", err)
	}

	if initialized.Swap(true) {
		return errors.New("raced with another call to metric.Initialize or metric.Disable")
	}
	return nil
}
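
// A rough sketch of the intended lifecycle from a client package, pieced
// together from the preconditions above; the metric name and call sites are
// illustrative, not defined by this file:
//
//	// At init time, declare metrics as package-level vars:
//	var opens = metric.MustCreateNewUint64Metric("/example/opens", true /* sync */, "Example counter.")
//
//	// Once all metrics are registered, exactly one of Initialize or Disable
//	// is called:
//	if err := metric.Initialize(); err != nil { /* handle error */ }
//
//	// Afterwards, EmitMetricUpdate may be called periodically; it only emits
//	// metrics that changed since the previous call.
//	metric.EmitMetricUpdate()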

// ErrNotYetInitialized is returned by GetMetricRegistration if metrics are not yet initialized.
var ErrNotYetInitialized = errors.New("metrics are not yet initialized")

// GetMetricRegistration returns the metric registration data for all registered metrics.
// Must be called after Initialize().
// Returns ErrNotYetInitialized if metrics are not yet initialized.
func GetMetricRegistration() (*pb.MetricRegistration, error) {
	if !initialized.Load() {
		return nil, ErrNotYetInitialized
	}
	if allMetrics.registration == nil {
		return nil, errors.New("metrics are disabled")
	}
	return allMetrics.registration, nil
}

// Disable sends an empty metric registration event over the event channel,
// disabling metric collection.
//
// Precondition:
//   - All metrics are registered.
//   - Initialize/Disable has not been called.
func Disable() error {
	if initialized.Load() {
		return errors.New("metric.Disable called after metric.Initialize or metric.Disable")
	}

	m := pb.MetricRegistration{}
	if err := eventchannel.Emit(&m); err != nil {
		return fmt.Errorf("unable to emit empty metric registration event (metrics disabled): %w", err)
	}

	if initialized.Swap(true) {
		return errors.New("raced with another call to metric.Initialize or metric.Disable")
	}
	return nil
}

type customUint64Metric struct {
	// metadata describes the metric. It is immutable.
	metadata *pb.MetricMetadata

	// prometheusMetric describes the metric in Prometheus format. It is immutable.
	prometheusMetric *prometheus.Metric

	// fields is the set of fields of the metric.
	fields []Field

	// value returns the current value of the metric for the given set of
	// fields. It takes a variadic number of field values as argument.
	value func(fieldValues ...*FieldValue) uint64

	// forEachNonZero calls the given function on each possible field value of
	// the metric where the metric's value is non-zero.
	// The passed-in function should not allocate new memory, and may not save
	// or modify `fields` directly, as the slice memory is reused across calls.
	// `forEachNonZero` does not guarantee that it will be called on a
	// consistent snapshot of this metric's values.
	// `forEachNonZero` may be nil.
	forEachNonZero func(f func(fields []*FieldValue, val uint64))
}

// FieldValue is a string that can be used as a value for a Field.
// It must be referred to by address when the Field is created and when its
// metric value is modified. This ensures that the same FieldValue reference
// is used, which in turn enables the metric code to use the address of a
// FieldValue as comparison operator, rather than doing string comparisons.
type FieldValue struct {
	Value string
}

// fieldMapperMapThreshold is the number of field values after which we switch
// to using map lookups when looking up field values.
// This value was determined using benchmarks to see which is fastest.
const fieldMapperMapThreshold = 48

// Field contains the field name and allowed values for the metric which is
// used in registration of the metric.
type Field struct {
	// name is the metric field name.
	name string

	// values is the list of values for the field.
	// `values` is always populated but not always used for lookup. It depends
	// on the number of allowed field values. `values` is used for lookups on
	// fields with small numbers of field values.
	values []*FieldValue

	// valuesPtrMap is a map version of `values`. For each item in `values`,
	// its pointer is mapped to its index within `values`.
	// `valuesPtrMap` is used for fields with large numbers of possible values.
	// For fields with small numbers of field values, it is nil.
	// This map allows doing faster string matching than a normal string map,
	// as it avoids the string hashing step that normal string maps need to do.
	valuesPtrMap map[*FieldValue]int
}

// toProto returns the proto definition of this field, for use in metric
// metadata.
func (f Field) toProto() *pb.MetricMetadata_Field {
	allowedValues := make([]string, len(f.values))
	for i, v := range f.values {
		allowedValues[i] = v.Value
	}
	return &pb.MetricMetadata_Field{
		FieldName:     f.name,
		AllowedValues: allowedValues,
	}
}

// NewField defines a new Field that can be used to break down a metric.
// The set of allowedValues must be unique strings wrapped with `FieldValue`.
// The *same* `FieldValue` pointers must be used during metric modifications.
// In practice, in most cases, this means you should declare these
// `FieldValue`s as package-level `var`s, and always use the address of these
// package-level `var`s during metric modifications.
func NewField(name string, allowedValues ...*FieldValue) Field {
	// Verify that all string values have a unique value.
	strMap := make(map[string]bool, len(allowedValues))
	for _, v := range allowedValues {
		if strMap[v.Value] {
			panic(fmt.Sprintf("found duplicate field value: %q", v))
		}
		strMap[v.Value] = true
	}

	if useMap := len(allowedValues) > fieldMapperMapThreshold; !useMap {
		return Field{
			name:   name,
			values: allowedValues,
		}
	}

	valuesPtrMap := make(map[*FieldValue]int, len(allowedValues))
	for i, v := range allowedValues {
		valuesPtrMap[v] = i
	}
	return Field{
		name:         name,
		values:       allowedValues,
		valuesPtrMap: valuesPtrMap,
	}
}
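
// A minimal sketch of defining a field, assuming a hypothetical "result"
// field for some metric (the names below are illustrative only):
//
//	var (
//		resultOK    = FieldValue{"ok"}
//		resultError = FieldValue{"error"}
//	)
//
//	var resultField = NewField("result", &resultOK, &resultError)
//
// Metric modifications must later pass &resultOK or &resultError themselves;
// constructing a new FieldValue{"ok"} at the call site has a different
// address and would not be recognized.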

// fieldMapper maps multi-dimensional field values to a single unique integer key.
type fieldMapper struct {
	// fields is a list of Field objects, which importantly include individual
	// Field names which are used to perform the keyToMultiField function; and
	// allowedValues for each field type which are used to perform the lookup
	// function.
	fields []Field

	// numFieldCombinations is the number of unique keys for all possible field
	// combinations.
	numFieldCombinations int
}

// newFieldMapper returns a new fieldMapper for the given set of fields.
func newFieldMapper(fields ...Field) (fieldMapper, error) {
	numFieldCombinations := 1
	for _, f := range fields {
		// Disallow fields with no possible values. We could also ignore them
		// instead, but passing in a no-allowed-values field is probably a mistake.
		if len(f.values) == 0 {
			return fieldMapper{nil, 0}, ErrFieldHasNoAllowedValues
		}
		numFieldCombinations *= len(f.values)

		// Sanity check, could be useful in case someone dynamically generates too
		// many fields accidentally.
		if numFieldCombinations > math.MaxUint32 || numFieldCombinations < 0 {
			return fieldMapper{nil, 0}, ErrTooManyFieldCombinations
		}
	}

	return fieldMapper{
		fields:               fields,
		numFieldCombinations: numFieldCombinations,
	}, nil
}

// lookupSingle looks up a single key for a single field within fieldMapper.
// It is used internally within lookupConcat.
// It returns the updated `idx` and `remainingCombinationBucket` values.
// +checkescape:all
//
//go:nosplit
func (m fieldMapper) lookupSingle(fieldIndex int, fieldValue *FieldValue, idx, remainingCombinationBucket int) (int, int) {
	field := m.fields[fieldIndex]
	numValues := len(field.values)

	// Are we doing a linear search?
	if field.valuesPtrMap == nil {
		// We scan by pointers only. This means the caller must pass the same
		// FieldValue pointer as the one used in `NewField`.
		for valIdx, allowedVal := range field.values {
			if fieldValue == allowedVal {
				remainingCombinationBucket /= numValues
				idx += remainingCombinationBucket * valIdx
				return idx, remainingCombinationBucket
			}
		}
		panic("invalid field value or did not reuse the same FieldValue pointer as passed in NewField")
	}

	// Use map lookup instead.

	// Match using FieldValue pointer.
	// This avoids the string hashing step that string maps otherwise do.
	valIdx, found := field.valuesPtrMap[fieldValue]
	if found {
		remainingCombinationBucket /= numValues
		idx += remainingCombinationBucket * valIdx
		return idx, remainingCombinationBucket
	}

	panic("invalid field value or did not reuse the same FieldValue pointer as passed in NewField")
}

// lookupConcat looks up a key within the fieldMapper where the fields are
// the concatenation of two lists of fields.
// The returned key is an index that can be used to access the map created by
// makeMap().
// This *must* be called with the correct number of fields, or it will panic.
// +checkescape:all
//
//go:nosplit
func (m fieldMapper) lookupConcat(fields1, fields2 []*FieldValue) int {
	if (len(fields1) + len(fields2)) != len(m.fields) {
		panic("invalid field lookup depth")
	}
	idx := 0
	remainingCombinationBucket := m.numFieldCombinations
	for i, val := range fields1 {
		idx, remainingCombinationBucket = m.lookupSingle(i, val, idx, remainingCombinationBucket)
	}

	numFields1 := len(fields1)
	for i, val := range fields2 {
		idx, remainingCombinationBucket = m.lookupSingle(i+numFields1, val, idx, remainingCombinationBucket)
	}

	return idx
}

// lookup looks up a key within the fieldMapper.
// The returned key is an index that can be used to access the map created by
// makeMap().
// This *must* be called with the correct number of fields, or it will panic.
// +checkescape:all
//
//go:nosplit
func (m fieldMapper) lookup(fields ...*FieldValue) int {
	return m.lookupConcat(fields, nil)
}
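
// To make the key layout concrete, here is a small worked example assuming
// two hypothetical fields (this mirrors the arithmetic in lookupSingle):
// with fields A = {a0, a1, a2} and B = {b0, b1}, numFieldCombinations is 6
// and the key for (a_i, b_j) is i*2 + j:
//
//	(a0, b0) -> 0    (a0, b1) -> 1
//	(a1, b0) -> 2    (a1, b1) -> 3
//	(a2, b0) -> 4    (a2, b1) -> 5
//
// keyToMultiField below walks the same mixed-radix digits in reverse.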

// numKeys returns the total number of key-to-field-combinations mappings
// defined by the fieldMapper.
//
//go:nosplit
func (m fieldMapper) numKeys() int {
	return m.numFieldCombinations
}

// makeDistributionSampleMap creates a two dimensional array, where:
//   - The first level corresponds to unique field value combinations and is
//     accessed using index "keys" made by fieldMapper.
//   - The second level corresponds to buckets within a metric. The number of
//     buckets is specified by numBuckets.
func (m fieldMapper) makeDistributionSampleMap(numBuckets int) [][]atomicbitops.Uint64 {
	samples := make([][]atomicbitops.Uint64, m.numKeys())
	for i := range samples {
		samples[i] = make([]atomicbitops.Uint64, numBuckets)
	}
	return samples
}

// keyToMultiField is the reverse of lookup/lookupConcat. The returned list of
// field values corresponds to the same order of fields that were passed in to
// newFieldMapper.
func (m fieldMapper) keyToMultiField(key int) []string {
	depth := len(m.fields)
	if depth == 0 && key == 0 {
		return nil
	}
	fieldValues := make([]string, depth)
	remainingCombinationBucket := m.numFieldCombinations
	for i := 0; i < depth; i++ {
		remainingCombinationBucket /= len(m.fields[i].values)
		fieldValues[i] = m.fields[i].values[key/remainingCombinationBucket].Value
		key = key % remainingCombinationBucket
	}
	return fieldValues
}

// keyToMultiFieldInPlace does the operation described in `keyToMultiField`
// but modifies `fieldValues` in-place. It must already be of size
// `len(m.fields)`.
//
//go:nosplit
func (m fieldMapper) keyToMultiFieldInPlace(key int, fieldValues []*FieldValue) {
	if len(m.fields) == 0 {
		return
	}
	depth := len(m.fields)
	remainingCombinationBucket := m.numFieldCombinations
	for i := 0; i < depth; i++ {
		remainingCombinationBucket /= len(m.fields[i].values)
		fieldValues[i] = m.fields[i].values[key/remainingCombinationBucket]
		key = key % remainingCombinationBucket
	}
}

// nameToPrometheusName transforms a path-style metric name (/foo/bar) into a Prometheus-style
// metric name (foo_bar).
func nameToPrometheusName(name string) string {
	return strings.ReplaceAll(strings.TrimPrefix(name, "/"), "/", "_")
}

var validMetricNameRegexp = re.MustCompile("^(?:/[_\\w]+)+$")

// verifyName verifies that the given metric name is a valid path-style metric
// name.
func verifyName(name string) error {
	if !strings.HasPrefix(name, "/") {
		return fmt.Errorf("metric name must start with a '/': %q", name)
	}
	if !validMetricNameRegexp.MatchString(name) {
		return fmt.Errorf("invalid metric name: %q", name)
	}
	return nil
}

// RegisterCustomUint64Metric registers a metric with the given name.
//
// Register must only be called at init and will return an error if called
// after initialization is complete.
//
// Preconditions:
//   - name must be globally unique.
//   - Initialize/Disable have not been called.
//   - value is expected to accept exactly len(fields) arguments.
func RegisterCustomUint64Metric(name string, cumulative, sync bool, units pb.MetricMetadata_Units, description string, value func(...*FieldValue) uint64, fields ...Field) error {
	if initialized.Load() {
		return ErrInitializationDone
	}

	if _, ok := allMetrics.uint64Metrics[name]; ok {
		return ErrNameInUse
	}
	if _, ok := allMetrics.distributionMetrics[name]; ok {
		return ErrNameInUse
	}

	promType := prometheus.TypeGauge
	if cumulative {
		promType = prometheus.TypeCounter
	}

	allMetrics.uint64Metrics[name] = customUint64Metric{
		metadata: &pb.MetricMetadata{
			Name:           name,
			PrometheusName: nameToPrometheusName(name),
			Description:    description,
			Cumulative:     cumulative,
			Sync:           sync,
			Type:           pb.MetricMetadata_TYPE_UINT64,
			Units:          units,
		},
		prometheusMetric: &prometheus.Metric{
			Name: nameToPrometheusName(name),
			Help: description,
			Type: promType,
		},
		fields: fields,
		value:  value,
	}

	// Metrics can exist without fields.
	if l := len(fields); l > 1 {
		return fmt.Errorf("%d fields provided, must be <= 1", l)
	}

	for _, field := range fields {
		allMetrics.uint64Metrics[name].metadata.Fields = append(allMetrics.uint64Metrics[name].metadata.Fields, field.toProto())
	}
	return nil
}
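
// A minimal sketch of registering a callback-backed metric from a client
// package (the metric name and the counter variable are made up for
// illustration):
//
//	var openFDs atomicbitops.Uint64
//
//	func init() {
//		metric.MustRegisterCustomUint64Metric("/example/open_fds",
//			false /* cumulative */, false /* sync */,
//			"Number of currently open file descriptors.",
//			func(...*metric.FieldValue) uint64 { return openFDs.Load() })
//	}
//
// The callback is invoked whenever a snapshot of metric values is taken (see
// metricSet.Values below), so it should be cheap and non-blocking.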

// MustRegisterCustomUint64Metric calls RegisterCustomUint64Metric for metrics
// without fields and panics if it returns an error.
func MustRegisterCustomUint64Metric(name string, cumulative, sync bool, description string, value func(...*FieldValue) uint64, fields ...Field) {
	if err := RegisterCustomUint64Metric(name, cumulative, sync, pb.MetricMetadata_UNITS_NONE, description, value, fields...); err != nil {
		panic(fmt.Sprintf("Unable to register metric %q: %s", name, err))
	}
}

// NewUint64Metric creates and registers a new cumulative metric with the given
// name.
//
// Metrics must be statically defined (i.e., at init).
func NewUint64Metric(name string, sync bool, units pb.MetricMetadata_Units, description string, fields ...Field) (*Uint64Metric, error) {
	if err := verifyName(name); err != nil {
		return nil, err
	}
	f, err := newFieldMapper(fields...)
	if err != nil {
		return nil, err
	}
	m := Uint64Metric{
		name:        name,
		fieldMapper: f,
		fields:      make([]atomicbitops.Uint64, f.numKeys()),
	}
	if err := RegisterCustomUint64Metric(name, true /* cumulative */, sync, units, description, m.Value, fields...); err != nil {
		return nil, err
	}
	cm := allMetrics.uint64Metrics[name]
	cm.forEachNonZero = m.forEachNonZero
	allMetrics.uint64Metrics[name] = cm
	return &m, nil
}

// MustCreateNewUint64Metric calls NewUint64Metric and panics if it returns
// an error.
func MustCreateNewUint64Metric(name string, sync bool, description string, fields ...Field) *Uint64Metric {
	m, err := NewUint64Metric(name, sync, pb.MetricMetadata_UNITS_NONE, description, fields...)
	if err != nil {
		panic(fmt.Sprintf("Unable to create metric %q: %s", name, err))
	}
	return m
}
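
// A small usage sketch for a plain (field-less) counter; the metric name is
// illustrative, not one that this package defines:
//
//	var bytesReceived = metric.MustCreateNewUint64Metric("/example/bytes_received",
//		false /* sync */, "Total bytes received.")
//
//	func onReceive(n int) {
//		bytesReceived.IncrementBy(uint64(n))
//	}
//
// Increment and IncrementBy below must be passed exactly as many FieldValue
// arguments as the metric has fields (zero here), otherwise they panic.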

// MustCreateNewUint64NanosecondsMetric calls NewUint64Metric and panics if it
// returns an error.
func MustCreateNewUint64NanosecondsMetric(name string, sync bool, description string) *Uint64Metric {
	m, err := NewUint64Metric(name, sync, pb.MetricMetadata_UNITS_NANOSECONDS, description)
	if err != nil {
		panic(fmt.Sprintf("Unable to create metric %q: %s", name, err))
	}
	return m
}

// Value returns the current value of the metric for the given set of fields.
// This must be called with the correct number of field values or it will panic.
//
//go:nosplit
func (m *Uint64Metric) Value(fieldValues ...*FieldValue) uint64 {
	key := m.fieldMapper.lookupConcat(fieldValues, nil)
	return m.fields[key].Load()
}

// forEachNonZero iterates over each field combination and calls the given
// function whenever this metric's value is not zero.
func (m *Uint64Metric) forEachNonZero(f func(fieldValues []*FieldValue, value uint64)) {
	numCombinations := m.fieldMapper.numKeys()
	if len(m.fieldMapper.fields) == 0 {
		// Special-case the "there are no fields" case for speed and to avoid
		// allocating a slice.
		if val := m.fields[0].Load(); val != 0 {
			f(nil, val)
		}
		return
	}
	var fieldValues []*FieldValue
	for k := 0; k < numCombinations; k++ {
		val := m.fields[k].Load()
		if val == 0 {
			continue
		}
		if fieldValues == nil {
			fieldValues = make([]*FieldValue, len(m.fieldMapper.fields))
		}
		m.fieldMapper.keyToMultiFieldInPlace(k, fieldValues)
		f(fieldValues, val)
	}
}

// Increment increments the metric field by 1.
// This must be called with the correct number of field values or it will panic.
//
//go:nosplit
func (m *Uint64Metric) Increment(fieldValues ...*FieldValue) {
	key := m.fieldMapper.lookupConcat(fieldValues, nil)
	m.fields[key].Add(1)
}

// IncrementBy increments the metric by v.
// This must be called with the correct number of field values or it will panic.
//
//go:nosplit
func (m *Uint64Metric) IncrementBy(v uint64, fieldValues ...*FieldValue) {
	key := m.fieldMapper.lookupConcat(fieldValues, nil)
	m.fields[key].Add(v)
}

// Bucketer is an interface to bucket values into finite, distinct buckets.
type Bucketer interface {
	// NumFiniteBuckets is the number of finite buckets in the distribution.
	// This is only called once and never expected to return a different value.
	NumFiniteBuckets() int

	// LowerBound takes the index of a bucket (within [0, NumBuckets()]) and
	// returns the inclusive lower bound of that bucket.
	// In other words, the lowest value of `x` for which `BucketIndex(x) == i`
	// should be `x = LowerBound(i)`.
	// The upper bound of a bucket is the lower bound of the next bucket.
	// The last bucket (with `bucketIndex == NumFiniteBuckets()`) is infinite,
	// i.e. it has no upper bound (but it still has a lower bound).
	LowerBound(bucketIndex int) int64

	// BucketIndex takes a sample and returns the index of the bucket that the
	// sample should fall into.
	// Must return either:
	//   - A value within [0, NumBuckets() - 1] if the sample falls within a
	//     finite bucket
	//   - NumBuckets() if the sample falls within the last (infinite) bucket
	//   - '-1' if the sample is lower than what any bucket can represent, i.e.
	//     the sample should be in the implicit "underflow" bucket.
	// This function must be go:nosplit-compatible and have no escapes.
	// +checkescape:all
	BucketIndex(sample int64) int
}

// ExponentialBucketer implements Bucketer, with the first bucket starting
// with 0 as lowest bound with `Width` width, and each subsequent bucket being
// wider by a scaled exponentially-growing series, until `NumFiniteBuckets`
// buckets exist.
type ExponentialBucketer struct {
	// numFiniteBuckets is the total number of finite buckets in the scheme.
	numFiniteBuckets int

	// width is the size of the first (0-th) finite bucket.
	width float64

	// scale is a factor applied uniformly to the exponential growth portion
	// of the bucket size.
	scale float64

	// growth is the exponential growth factor for finite buckets.
	// The n-th bucket is `growth` times wider than the (n-1)-th bucket.
	// Bucket sizes are floored, so `width` and `growth` must be large enough
	// such that the second bucket is actually wider than the first after
	// flooring (unless, of course, fixed-width buckets are what's desired).
	growth float64

	// growthLog is math.Log(growth).
	growthLog float64

	// maxSample is the max sample value which can be represented in a finite
	// bucket.
	maxSample int64

	// lowerBounds is a precomputed set of lower bounds of the buckets.
	// The "underflow" bucket has no lower bound, so it is not included here.
	// lowerBounds[0] is the lower bound of the first finite bucket, which is
	// also the upper bound of the underflow bucket.
	// lowerBounds[numFiniteBuckets] is the lower bound of the overflow bucket.
	lowerBounds []int64
}

// Minimum/maximum finite buckets for exponential bucketers.
const (
	exponentialMinBuckets = 1
	exponentialMaxBuckets = 100
)

// NewExponentialBucketer returns a new Bucketer with exponential buckets.
func NewExponentialBucketer(numFiniteBuckets int, width uint64, scale, growth float64) *ExponentialBucketer {
	if numFiniteBuckets < exponentialMinBuckets || numFiniteBuckets > exponentialMaxBuckets {
		panic(fmt.Sprintf("number of finite buckets must be in [%d, %d]", exponentialMinBuckets, exponentialMaxBuckets))
	}
	if scale < 0 || growth < 0 {
		panic(fmt.Sprintf("scale and growth for exponential buckets must be >0, got scale=%f and growth=%f", scale, growth))
	}
	b := &ExponentialBucketer{
		numFiniteBuckets: numFiniteBuckets,
		width:            float64(width),
		scale:            scale,
		growth:           growth,
		growthLog:        math.Log(growth),
		lowerBounds:      make([]int64, numFiniteBuckets+1),
	}
	b.lowerBounds[0] = 0
	for i := 1; i <= numFiniteBuckets; i++ {
		b.lowerBounds[i] = int64(b.width*float64(i) + b.scale*math.Pow(b.growth, float64(i-1)))
		if b.lowerBounds[i] < 0 {
			panic(fmt.Sprintf("encountered bucket width overflow at bucket %d", i))
		}
	}
	b.maxSample = b.lowerBounds[numFiniteBuckets] - 1
	return b
}
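
// As a concrete illustration of the formula above (numbers chosen arbitrarily
// for the example, not taken from any caller): NewExponentialBucketer(4, 10, 1, 2)
// precomputes lowerBounds[i] = 10*i + 1*2^(i-1) for i >= 1, i.e.
//
//	lowerBounds = [0, 11, 22, 34, 48]
//
// so the finite buckets are [0, 11), [11, 22), [22, 34), [34, 48); samples of
// 48 and above land in the overflow bucket, and negative samples land in the
// underflow bucket (BucketIndex returns -1).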

// NumFiniteBuckets implements Bucketer.NumFiniteBuckets.
func (b *ExponentialBucketer) NumFiniteBuckets() int {
	return int(b.numFiniteBuckets)
}

// LowerBound implements Bucketer.LowerBound.
func (b *ExponentialBucketer) LowerBound(bucketIndex int) int64 {
	return b.lowerBounds[bucketIndex]
}

// BucketIndex implements Bucketer.BucketIndex.
// +checkescape:all
//
//go:nosplit
func (b *ExponentialBucketer) BucketIndex(sample int64) int {
	if sample < 0 {
		return -1
	}
	if sample == 0 {
		return 0
	}
	if sample > b.maxSample {
		return b.numFiniteBuckets
	}
	// Do a binary search. For the number of buckets we expect to deal with in
	// this code (a few dozen at most), this may be faster than computing a
	// logarithm. We can't use recursion because this would violate go:nosplit.
	lowIndex := 0
	highIndex := b.numFiniteBuckets
	for {
		pivotIndex := (highIndex + lowIndex) >> 1
		lowerBound := b.lowerBounds[pivotIndex]
		if sample < lowerBound {
			highIndex = pivotIndex
			continue
		}
		upperBound := b.lowerBounds[pivotIndex+1]
		if sample >= upperBound {
			lowIndex = pivotIndex
			continue
		}
		return pivotIndex
	}
}

// Verify that ExponentialBucketer implements Bucketer.
var _ = (Bucketer)((*ExponentialBucketer)(nil))

// DistributionMetric represents a distribution of values in finite buckets.
// It also separately keeps track of min/max in order to ascertain whether the
// buckets can faithfully represent the range of values encountered in the
// distribution.
type DistributionMetric struct {
	// exponentialBucketer is the bucketing scheme used for this metric.
	// Because we need DistributionMetric.AddSample to be go:nosplit-compatible,
	// we cannot use an interface reference here, as we would not be able to call
	// it in AddSample. Instead, we need one field per Bucketer implementation,
	// and we call whichever one is in use in AddSample.
	exponentialBucketer *ExponentialBucketer

	// metadata is the metadata about this metric. It is immutable.
	metadata *pb.MetricMetadata

	// prometheusMetric describes the metric in Prometheus format. It is immutable.
	prometheusMetric *prometheus.Metric

	// fieldsToKey converts multi-dimensional field values to a single index key
	// used to access `samples`.
	fieldsToKey fieldMapper

	// samples is the number of samples that fell within each bucket.
	// It is mapped by the concatenation of the fields using `fieldsToKey`.
	// The value is a list of bucket sample counts, with the 0-th being the
	// "underflow bucket", i.e. the bucket of samples which cannot fall into
	// any bucket that the bucketer supports.
	// The i-th value is the number of samples that fell into the bucketer's
	// (i-1)-th finite bucket.
	// The last value is the number of samples that fell into the bucketer's
	// last (i.e. infinite) bucket.
	samples [][]atomicbitops.Uint64

	// statistics is a set of statistics about each distribution.
	// It is mapped by the concatenation of the fields using `fieldsToKey`.
	statistics []distributionStatistics
}

// NewDistributionMetric creates and registers a new distribution metric.
func NewDistributionMetric(name string, sync bool, bucketer Bucketer, unit pb.MetricMetadata_Units, description string, fields ...Field) (*DistributionMetric, error) {
	if err := verifyName(name); err != nil {
		return nil, err
	}
	if initialized.Load() {
		return nil, ErrInitializationDone
	}
	if _, ok := allMetrics.uint64Metrics[name]; ok {
		return nil, ErrNameInUse
	}
	if _, ok := allMetrics.distributionMetrics[name]; ok {
		return nil, ErrNameInUse
	}

	var exponentialBucketer *ExponentialBucketer
	if expBucketer, ok := bucketer.(*ExponentialBucketer); ok {
		exponentialBucketer = expBucketer
	} else {
		return nil, fmt.Errorf("unsupported bucketer implementation: %T", bucketer)
	}
	fieldsToKey, err := newFieldMapper(fields...)
	if err != nil {
		return nil, err
	}

	numFiniteBuckets := bucketer.NumFiniteBuckets()
	samples := fieldsToKey.makeDistributionSampleMap(numFiniteBuckets + 2)
	protoFields := make([]*pb.MetricMetadata_Field, len(fields))
	for i, f := range fields {
		protoFields[i] = f.toProto()
	}
	lowerBounds := make([]int64, numFiniteBuckets+1)
	for i := 0; i <= numFiniteBuckets; i++ {
		lowerBounds[i] = bucketer.LowerBound(i)
	}
	allMetrics.distributionMetrics[name] = &DistributionMetric{
		exponentialBucketer: exponentialBucketer,
		fieldsToKey:         fieldsToKey,
		samples:             samples,
		statistics:          make([]distributionStatistics, fieldsToKey.numKeys()),
		metadata: &pb.MetricMetadata{
			Name:                          name,
			PrometheusName:                nameToPrometheusName(name),
			Description:                   description,
			Cumulative:                    false,
			Sync:                          sync,
			Type:                          pb.MetricMetadata_TYPE_DISTRIBUTION,
			Units:                         unit,
			Fields:                        protoFields,
			DistributionBucketLowerBounds: lowerBounds,
		},
		prometheusMetric: &prometheus.Metric{
			Name: nameToPrometheusName(name),
			Type: prometheus.TypeHistogram,
			Help: description,
		},
	}
	return allMetrics.distributionMetrics[name], nil
}

// MustCreateNewDistributionMetric creates and registers a distribution metric.
// If an error occurs, it panics.
func MustCreateNewDistributionMetric(name string, sync bool, bucketer Bucketer, unit pb.MetricMetadata_Units, description string, fields ...Field) *DistributionMetric {
	distrib, err := NewDistributionMetric(name, sync, bucketer, unit, description, fields...)
	if err != nil {
		panic(err)
	}
	return distrib
}
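
// A brief usage sketch from a client package (the metric name and bucket
// parameters are invented for the example; `pb` refers to the metric proto
// package imported above):
//
//	var requestBytes = metric.MustCreateNewDistributionMetric("/example/request_bytes",
//		false /* sync */, metric.NewExponentialBucketer(16, 256, 1, 2),
//		pb.MetricMetadata_UNITS_NONE, "Distribution of request sizes in bytes.")
//
//	func onRequest(size int64) {
//		requestBytes.AddSample(size)
//	}
//
// AddSample (defined below) buckets the sample with BucketIndex and also
// updates the running distributionStatistics (count, sum, min, max, variance).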

// distributionStatistics is a set of useful statistics for a distribution.
// As metric update operations must be non-blocking, this uses a bunch of
// atomic numbers rather than a mutex.
type distributionStatistics struct {
	// sampleCount is the total number of samples.
	sampleCount atomicbitops.Uint64

	// sampleSum is the sum of samples.
	sampleSum atomicbitops.Int64

	// sumOfSquaredDeviations is the running sum of squared deviations from the
	// mean of each sample.
	// This quantity is useful as part of Welford's online algorithm:
	// https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm
	sumOfSquaredDeviations atomicbitops.Float64

	// min and max are the minimum and maximum samples ever recorded.
	min, max atomicbitops.Int64
}

// Update updates the distribution statistics with the given sample.
// This function must be non-blocking, i.e. no mutexes.
// As a result, it is not entirely accurate when it races with itself,
// though the imprecision should be fairly small and should not practically
// matter for distributions with more than a handful of records.
func (s *distributionStatistics) Update(sample int64) {
	newSampleCount := s.sampleCount.Add(1)
	newSampleSum := s.sampleSum.Add(sample)

	if newSampleCount > 1 {
		// Not the first sample of the distribution.
		floatSample := float64(sample)
		oldMean := float64(newSampleSum-sample) / float64(newSampleCount-1)
		newMean := float64(newSampleSum) / float64(newSampleCount)
		devSquared := (floatSample - oldMean) * (floatSample - newMean)
		s.sumOfSquaredDeviations.Add(devSquared)

		// Update min and max.
		// We optimistically load racily here in the hope that it passes the CaS
		// operation. If it doesn't, we'll load it atomically, so this is not a
		// race.
		sync.RaceDisable()
		for oldMin := s.min.RacyLoad(); sample < oldMin && !s.min.CompareAndSwap(oldMin, sample); oldMin = s.min.Load() {
		}
		for oldMax := s.max.RacyLoad(); sample > oldMax && !s.max.CompareAndSwap(oldMax, sample); oldMax = s.max.Load() {
		}
		sync.RaceEnable()
	} else {
		// We are the first sample, so set the min and max to the current sample.
		// See above for why disabling race detection is safe here as well.
		sync.RaceDisable()
		if !s.min.CompareAndSwap(0, sample) {
			for oldMin := s.min.RacyLoad(); sample < oldMin && !s.min.CompareAndSwap(oldMin, sample); oldMin = s.min.Load() {
			}
		}
		if !s.max.CompareAndSwap(0, sample) {
			for oldMax := s.max.RacyLoad(); sample > oldMax && !s.max.CompareAndSwap(oldMax, sample); oldMax = s.max.Load() {
			}
		}
		sync.RaceEnable()
	}
}
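
// For reference, the update above is the step of Welford's online algorithm
// that accumulates M2 (the sum of squared deviations); a consumer of a
// statistics snapshot can recover the sample variance as
//
//	variance = sumOfSquaredDeviations / (sampleCount - 1)
//
// for sampleCount > 1. This is a property of the algorithm linked in the
// comment above, not an API exposed by this package.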

// distributionStatisticsSnapshot is an atomically-loaded snapshot of
// distributionStatistics.
type distributionStatisticsSnapshot struct {
	// sampleCount is the total number of samples.
	sampleCount uint64

	// sampleSum is the sum of samples.
	sampleSum int64

	// sumOfSquaredDeviations is the running sum of squared deviations from the
	// mean of each sample.
	// This quantity is useful as part of Welford's online algorithm:
	// https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm
	sumOfSquaredDeviations float64

	// min and max are the minimum and maximum samples ever recorded.
	min, max int64
}

// Load generates a consistent snapshot of the distribution statistics.
func (s *distributionStatistics) Load() distributionStatisticsSnapshot {
	// We start out reading things racily, but will verify each of them
	// atomically later in this function, so this is OK. Disable the race
	// checker for this part of the function.
	sync.RaceDisable()
	snapshot := distributionStatisticsSnapshot{
		sampleCount:            s.sampleCount.RacyLoad(),
		sampleSum:              s.sampleSum.RacyLoad(),
		sumOfSquaredDeviations: s.sumOfSquaredDeviations.RacyLoad(),
		min:                    s.min.RacyLoad(),
		max:                    s.max.RacyLoad(),
	}
	sync.RaceEnable()

	// Now verify that we loaded an atomic snapshot of the statistics.
	// This relies on the fact that each update should at least change the
	// count statistic, so we should be able to tell if anything changed based
	// on whether we have an exact match with the currently-loaded values.
	// If not, we reload that value and try again until all is consistent.
retry:
	if sampleCount := s.sampleCount.Load(); sampleCount != snapshot.sampleCount {
		snapshot.sampleCount = sampleCount
		goto retry
	}
	if sampleSum := s.sampleSum.Load(); sampleSum != snapshot.sampleSum {
		snapshot.sampleSum = sampleSum
		goto retry
	}
	if ssd := s.sumOfSquaredDeviations.Load(); ssd != snapshot.sumOfSquaredDeviations {
		snapshot.sumOfSquaredDeviations = ssd
		goto retry
	}
	if min := s.min.Load(); min != snapshot.min {
		snapshot.min = min
		goto retry
	}
	if max := s.max.Load(); max != snapshot.max {
		snapshot.max = max
		goto retry
	}
	return snapshot
}

// AddSample adds a sample to the distribution.
// This *must* be called with the correct number of fields, or it will panic.
// +checkescape:all
//
//go:nosplit
func (d *DistributionMetric) AddSample(sample int64, fields ...*FieldValue) {
	d.addSampleByKey(sample, d.fieldsToKey.lookup(fields...))
}

// addSampleByKey works like AddSample, with the field key already known.
// +checkescape:all
//
//go:nosplit
func (d *DistributionMetric) addSampleByKey(sample int64, key int) {
	bucket := d.exponentialBucketer.BucketIndex(sample)
	d.samples[key][bucket+1].Add(1)
	d.statistics[key].Update(sample)
}

// Minimum number of buckets for NewDurationBucketer.
const durationMinBuckets = 3

// NewDurationBucketer returns a Bucketer well-suited for measuring durations in
// nanoseconds. Useful for NewTimerMetric.
// minDuration and maxDuration are conservative estimates of the minimum and
// maximum durations expected to be accurately measured by the Bucketer.
func NewDurationBucketer(numFiniteBuckets int, minDuration, maxDuration time.Duration) Bucketer {
	if numFiniteBuckets < durationMinBuckets {
		panic(fmt.Sprintf("duration bucketer must have at least %d buckets, got %d", durationMinBuckets, numFiniteBuckets))
	}
	minNs := minDuration.Nanoseconds()
	exponentCoversNs := float64(maxDuration.Nanoseconds()-int64(numFiniteBuckets-durationMinBuckets)*minNs) / float64(minNs)
	exponent := math.Log(exponentCoversNs) / math.Log(float64(numFiniteBuckets-durationMinBuckets))
	minNs = int64(float64(minNs) / exponent)
	return NewExponentialBucketer(numFiniteBuckets, uint64(minNs), float64(minNs), exponent)
}

// TimerMetric wraps a distribution metric with convenience functions for
// latency measurements, which is a popular specialization of distribution
// metrics.
type TimerMetric struct {
	DistributionMetric
}

// NewTimerMetric provides a convenient way to measure latencies.
// The arguments are the same as `NewDistributionMetric`, except:
//   - `nanoBucketer`: Same as `NewDistribution`'s `bucketer`, expected to hold
//     durations in nanoseconds. Adjust parameters accordingly.
//     NewDurationBucketer may be helpful here.
func NewTimerMetric(name string, nanoBucketer Bucketer, description string, fields ...Field) (*TimerMetric, error) {
	distrib, err := NewDistributionMetric(name, false, nanoBucketer, pb.MetricMetadata_UNITS_NANOSECONDS, description, fields...)
	if err != nil {
		return nil, err
	}
	return &TimerMetric{
		DistributionMetric: *distrib,
	}, nil
}

// MustCreateNewTimerMetric creates and registers a timer metric.
// If an error occurs, it panics.
func MustCreateNewTimerMetric(name string, nanoBucketer Bucketer, description string, fields ...Field) *TimerMetric {
	timer, err := NewTimerMetric(name, nanoBucketer, description, fields...)
	if err != nil {
		panic(err)
	}
	return timer
}

// TimedOperation is used by TimerMetric to keep track of the time elapsed
// between an operation starting and stopping.
type TimedOperation struct {
	// metric is a reference to the timer metric for the operation.
	metric *TimerMetric

	// partialFields is a prefix of the fields used in this operation.
	// The rest of the fields is provided in TimedOperation.Finish.
	partialFields []*FieldValue

	// startedNs is the number of nanoseconds measured in TimerMetric.Start().
	startedNs int64
}

// Start starts a timer measurement for the given combination of fields.
// It returns a TimedOperation which can be passed around as necessary to
// measure the duration of the operation.
// Once the operation is finished, call Finish on the TimedOperation.
// The fields passed to Start may be partially specified; if so, the remaining
// fields must be passed to TimedOperation.Finish. This is useful for cases
// where the path an operation took is only known after it happens. This
// path can be part of the fields passed to Finish.
// +checkescape:all
//
//go:nosplit
func (t *TimerMetric) Start(fields ...*FieldValue) TimedOperation {
	return TimedOperation{
		metric:        t,
		partialFields: fields,
		startedNs:     CheapNowNano(),
	}
}

// Finish marks an operation as finished and records its duration.
// `extraFields` is the rest of the fields appended to the fields passed to
// `TimerMetric.Start`. The concatenation of these two must be the exact
// number of fields that the underlying metric has.
// +checkescape:all
//
//go:nosplit
func (o TimedOperation) Finish(extraFields ...*FieldValue) {
	ended := CheapNowNano()
	fieldKey := o.metric.fieldsToKey.lookupConcat(o.partialFields, extraFields)
	o.metric.addSampleByKey(ended-o.startedNs, fieldKey)
}
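
// A short usage sketch with invented names (doRPC is hypothetical); the
// "result" field illustrates the partial-fields pattern described in Start
// and Finish above:
//
//	var (
//		resultOK  = metric.FieldValue{"ok"}
//		resultErr = metric.FieldValue{"error"}
//
//		rpcLatency = metric.MustCreateNewTimerMetric("/example/rpc_latency",
//			metric.NewDurationBucketer(16, time.Microsecond, time.Minute),
//			"RPC latency.", metric.NewField("result", &resultOK, &resultErr))
//	)
//
//	func handleRPC() {
//		op := rpcLatency.Start() // Result not known yet.
//		if err := doRPC(); err != nil {
//			op.Finish(&resultErr)
//		} else {
//			op.Finish(&resultOK)
//		}
//	}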

// stageTiming contains timing data for an initialization stage.
type stageTiming struct {
	stage   InitStage
	started time.Time
	// ended is the zero time when the stage has not ended yet.
	ended time.Time
}

// inProgress returns whether this stage hasn't ended yet.
func (s stageTiming) inProgress() bool {
	return !s.started.IsZero() && s.ended.IsZero()
}

// metricSet holds metric data.
type metricSet struct {
	// Metric registration data for all the metrics below.
	registration *pb.MetricRegistration

	// Map of uint64 metrics.
	uint64Metrics map[string]customUint64Metric

	// Map of distribution metrics.
	distributionMetrics map[string]*DistributionMetric

	// mu protects the fields below.
	mu sync.RWMutex

	// Information about the stages reached by the Sentry. Only appended to, so
	// reading a shallow copy of the slice header concurrently is safe.
	finished []stageTiming

	// The current stage in progress.
	currentStage stageTiming
}

// makeMetricSet returns a new metricSet.
func makeMetricSet() *metricSet {
	return &metricSet{
		uint64Metrics:       make(map[string]customUint64Metric),
		distributionMetrics: make(map[string]*DistributionMetric),
		finished:            make([]stageTiming, 0, len(allStages)),
	}
}

// Values returns a snapshot of all values in m.
func (m *metricSet) Values() metricValues {
	m.mu.Lock()
	stages := m.finished[:]
	m.mu.Unlock()

	vals := metricValues{
		uint64Metrics:            make(map[string]any, len(m.uint64Metrics)),
		distributionMetrics:      make(map[string][][]uint64, len(m.distributionMetrics)),
		distributionTotalSamples: make(map[string][]uint64, len(m.distributionMetrics)),
		distributionStatistics:   make(map[string][]distributionStatisticsSnapshot, len(m.distributionMetrics)),
		stages:                   stages,
	}
	for k, v := range m.uint64Metrics {
		fields := v.fields
		switch len(fields) {
		case 0:
			vals.uint64Metrics[k] = v.value()
		case 1:
			fieldsMap := make(map[*FieldValue]uint64)
			if v.forEachNonZero != nil {
				v.forEachNonZero(func(fieldValues []*FieldValue, val uint64) {
					fieldsMap[fieldValues[0]] = val
				})
			} else {
				for _, fieldValue := range fields[0].values {
					fieldsMap[fieldValue] = v.value(fieldValue)
				}
			}
			vals.uint64Metrics[k] = fieldsMap
		default:
			panic(fmt.Sprintf("Unsupported number of metric fields: %d", len(fields)))
		}
	}
	for name, metric := range m.distributionMetrics {
		fieldKeysToValues := make([][]uint64, len(metric.samples))
		fieldKeysToTotalSamples := make([]uint64, len(metric.samples))
		fieldKeysToStatistics := make([]distributionStatisticsSnapshot, len(metric.samples))
		for fieldKey, samples := range metric.samples {
			samplesSnapshot := snapshotDistribution(samples)
			totalSamples := uint64(0)
			for _, bucket := range samplesSnapshot {
				totalSamples += bucket
			}
			if totalSamples == 0 {
				// No samples recorded for this combination of field, so leave
				// the maps for this fieldKey as nil. This lessens the memory cost
				// of distributions with unused field combinations.
				fieldKeysToTotalSamples[fieldKey] = 0
				fieldKeysToStatistics[fieldKey] = distributionStatisticsSnapshot{}
				fieldKeysToValues[fieldKey] = nil
			} else {
				fieldKeysToTotalSamples[fieldKey] = totalSamples
				fieldKeysToStatistics[fieldKey] = metric.statistics[fieldKey].Load()
				fieldKeysToValues[fieldKey] = samplesSnapshot
			}
		}
		vals.distributionMetrics[name] = fieldKeysToValues
		vals.distributionTotalSamples[name] = fieldKeysToTotalSamples
		vals.distributionStatistics[name] = fieldKeysToStatistics
	}
	return vals
}

// metricValues contains a copy of the values of all metrics.
type metricValues struct {
	// uint64Metrics is a map of uint64 metrics,
	// with key as metric name. Value can be either uint64, or map[*FieldValue]uint64
	// to support metrics with one field.
	uint64Metrics map[string]any

	// distributionMetrics is a map of distribution metrics.
	// The first key level is the metric name.
	// The second key level is an index ID corresponding to the combination of
	// field values. The index is decoded to field strings using keyToMultiField.
	// The slice value is the number of samples in each bucket of the
	// distribution, with the first (0-th) element being the underflow bucket
	// and the last element being the "infinite" (overflow) bucket.
	// The slice value may also be nil for field combinations with no samples.
	// This saves memory by avoiding storing anything for unused field
	// combinations.
	distributionMetrics map[string][][]uint64

	// distributionTotalSamples is the total number of samples for each
	// distribution metric and field values.
	// It allows performing a quick diff between snapshots without having to
	// iterate over all the buckets individually, so that distributions with
	// no new samples are not retransmitted.
	distributionTotalSamples map[string][]uint64

	// distributionStatistics is a set of statistics about the samples.
	distributionStatistics map[string][]distributionStatisticsSnapshot

	// Information on when initialization stages were reached. Does not include
	// the currently-ongoing stage, if any.
	stages []stageTiming
}

var (
	// emitMu protects metricsAtLastEmit and ensures that all emitted
	// metrics are strongly ordered (older metrics are never emitted after
	// newer metrics).
	emitMu sync.Mutex

	// metricsAtLastEmit contains the state of the metrics at the last emit event.
	metricsAtLastEmit metricValues
)

// EmitMetricUpdate emits a MetricUpdate over the event channel.
//
// Only metrics that have changed since the last call are emitted.
//
// EmitMetricUpdate is thread-safe.
//
// Preconditions:
//   - Initialize has been called.
func EmitMetricUpdate() {
	emitMu.Lock()
	defer emitMu.Unlock()

	snapshot := allMetrics.Values()

	m := pb.MetricUpdate{}
	// On the first call metricsAtLastEmit will be empty. Include all
	// metrics then.
	for k, v := range snapshot.uint64Metrics {
		prev, ok := metricsAtLastEmit.uint64Metrics[k]
		switch t := v.(type) {
		case uint64:
			// Metric exists and value did not change.
			if ok && prev.(uint64) == t {
				continue
			}

			m.Metrics = append(m.Metrics, &pb.MetricValue{
				Name:  k,
				Value: &pb.MetricValue_Uint64Value{Uint64Value: t},
			})
		case map[*FieldValue]uint64:
			for fieldValue, metricValue := range t {
				// Emit data on the first call only if the field
				// value has been incremented. For all other
				// calls, emit data if the field value has been
				// changed from the previous emit.
				if (!ok && metricValue == 0) || (ok && prev.(map[*FieldValue]uint64)[fieldValue] == metricValue) {
					continue
				}

				m.Metrics = append(m.Metrics, &pb.MetricValue{
					Name:        k,
					FieldValues: []string{fieldValue.Value},
					Value:       &pb.MetricValue_Uint64Value{Uint64Value: metricValue},
				})
			}
		default:
			panic(fmt.Sprintf("unsupported type in uint64Metrics: %T (%v)", v, v))
		}
	}
	for name, dist := range snapshot.distributionTotalSamples {
		prev, ok := metricsAtLastEmit.distributionTotalSamples[name]
		for fieldKey, currentTotal := range dist {
			if currentTotal == 0 {
				continue
			}
			if ok {
				if prevTotal := prev[fieldKey]; prevTotal == currentTotal {
					continue
				}
			}
			oldSamples := metricsAtLastEmit.distributionMetrics[name]
			var newSamples []uint64
			if oldSamples != nil && oldSamples[fieldKey] != nil {
				currentSamples := snapshot.distributionMetrics[name][fieldKey]
				numBuckets := len(currentSamples)
				newSamples = make([]uint64, numBuckets)
				for i := 0; i < numBuckets; i++ {
					newSamples[i] = currentSamples[i] - oldSamples[fieldKey][i]
				}
			} else {
				// oldSamples == nil means that the previous snapshot has no samples.
				// This means the delta is the current number of samples, no need for
				// a copy.
				newSamples = snapshot.distributionMetrics[name][fieldKey]
			}
			m.Metrics = append(m.Metrics, &pb.MetricValue{
				Name:        name,
				FieldValues: allMetrics.distributionMetrics[name].fieldsToKey.keyToMultiField(fieldKey),
				Value: &pb.MetricValue_DistributionValue{
					DistributionValue: &pb.Samples{
						NewSamples: newSamples,
					},
				},
			})
		}
	}

	for s := len(metricsAtLastEmit.stages); s < len(snapshot.stages); s++ {
		newStage := snapshot.stages[s]
		m.StageTiming = append(m.StageTiming, &pb.StageTiming{
			Stage: string(newStage.stage),
			Started: &timestamppb.Timestamp{
				Seconds: newStage.started.Unix(),
				Nanos:   int32(newStage.started.Nanosecond()),
			},
			Ended: &timestamppb.Timestamp{
				Seconds: newStage.ended.Unix(),
				Nanos:   int32(newStage.ended.Nanosecond()),
			},
		})
	}

	metricsAtLastEmit = snapshot
	if len(m.Metrics) == 0 && len(m.StageTiming) == 0 {
		return
	}

	if log.IsLogging(log.Debug) {
		sort.Slice(m.Metrics, func(i, j int) bool {
			return m.Metrics[i].GetName() < m.Metrics[j].GetName()
		})
		log.Debugf("Emitting metrics:")
		for _, metric := range m.Metrics {
			var valueStr string
			switch metric.GetValue().(type) {
			case *pb.MetricValue_Uint64Value:
				valueStr = fmt.Sprintf("%d", metric.GetUint64Value())
			case *pb.MetricValue_DistributionValue:
				valueStr = fmt.Sprintf("new distribution samples: %+v", metric.GetDistributionValue())
			default:
				valueStr = "unsupported type"
			}
			if len(metric.GetFieldValues()) > 0 {
				var foundMetadata *pb.MetricMetadata
				if metricObj, found := allMetrics.uint64Metrics[metric.GetName()]; found {
					foundMetadata = metricObj.metadata
				} else if metricObj, found := allMetrics.distributionMetrics[metric.GetName()]; found {
					foundMetadata = metricObj.metadata
				}
				if foundMetadata == nil || len(foundMetadata.GetFields()) != len(metric.GetFieldValues()) {
					// This should never happen, but if it somehow does, we don't want to crash here, as
					// this is debug output that may already be printed in the context of panic.
					log.Debugf("%s%v (cannot find metric definition!): %s", metric.GetName(), metric.GetFieldValues(), valueStr)
					continue
				}
				var sb strings.Builder
				for i, fieldValue := range metric.GetFieldValues() {
					if i > 0 {
						sb.WriteRune(',')
					}
					sb.WriteString(foundMetadata.GetFields()[i].GetFieldName())
					sb.WriteRune('=')
					sb.WriteString(fieldValue)
				}
				log.Debugf(" Metric %s[%s]: %s", metric.GetName(), sb.String(), valueStr)
			} else {
				log.Debugf(" Metric %s: %s", metric.GetName(), valueStr)
			}
		}
		for _, stage := range m.StageTiming {
			duration := time.Duration(stage.Ended.Seconds-stage.Started.Seconds)*time.Second + time.Duration(stage.Ended.Nanos-stage.Started.Nanos)*time.Nanosecond
			log.Debugf("Stage %s took %v", stage.GetStage(), duration)
		}
	}

	if err := eventchannel.Emit(&m); err != nil {
		log.Warningf("Unable to emit metrics: %s", err)
	}
}

// SnapshotOptions controls how snapshots are exported in GetSnapshot.
type SnapshotOptions struct {
	// Filter, if set, should return true for metrics that should be written to
	// the snapshot. If unset, all metrics are written to the snapshot.
	Filter func(*prometheus.Metric) bool
}

// GetSnapshot returns a Prometheus snapshot of the metric data.
// Returns ErrNotYetInitialized if metrics have not yet been initialized.
func GetSnapshot(options SnapshotOptions) (*prometheus.Snapshot, error) {
	if !initialized.Load() {
		return nil, ErrNotYetInitialized
	}
	values := allMetrics.Values()
	snapshot := prometheus.NewSnapshot()
	for k, v := range values.uint64Metrics {
		m := allMetrics.uint64Metrics[k]
		if options.Filter != nil && !options.Filter(m.prometheusMetric) {
			continue
		}
		switch t := v.(type) {
		case uint64:
			if m.metadata.GetCumulative() && t == 0 {
				// Zero-valued counter, ignore.
				continue
			}
			snapshot.Add(prometheus.NewIntData(m.prometheusMetric, int64(t)))
		case map[*FieldValue]uint64:
			for fieldValue, metricValue := range t {
				if m.metadata.GetCumulative() && metricValue == 0 {
					// Zero-valued counter, ignore.
					continue
				}
				snapshot.Add(prometheus.LabeledIntData(m.prometheusMetric, map[string]string{
					// uint64 metrics currently only support at most one field name.
					m.metadata.Fields[0].GetFieldName(): fieldValue.Value,
				}, int64(metricValue)))
			}
		default:
			panic(fmt.Sprintf("unsupported type in uint64Metrics: %T (%v)", v, v))
		}
	}
	for k, dists := range values.distributionTotalSamples {
		m := allMetrics.distributionMetrics[k]
		if options.Filter != nil && !options.Filter(m.prometheusMetric) {
			continue
		}
		distributionSamples := values.distributionMetrics[k]
		numFiniteBuckets := m.exponentialBucketer.NumFiniteBuckets()
		statistics := values.distributionStatistics[k]
		for fieldKey := range dists {
			var labels map[string]string
			if numFields := m.fieldsToKey.numKeys(); numFields > 0 {
				labels = make(map[string]string, numFields)
				for fieldIndex, field := range m.fieldsToKey.keyToMultiField(fieldKey) {
					labels[m.metadata.Fields[fieldIndex].GetFieldName()] = field
				}
			}
			currentSamples := distributionSamples[fieldKey]
			buckets := make([]prometheus.Bucket, numFiniteBuckets+2)
			samplesForFieldKey := uint64(0)
			for b := 0; b < numFiniteBuckets+2; b++ {
				var upperBound prometheus.Number
				if b == numFiniteBuckets+1 {
					upperBound = prometheus.Number{Float: math.Inf(1)} // Overflow bucket.
				} else {
					upperBound = prometheus.Number{Int: m.exponentialBucketer.LowerBound(b)}
				}
				samples := uint64(0)
				if currentSamples != nil {
					samples = currentSamples[b]
					samplesForFieldKey += samples
				}
				buckets[b] = prometheus.Bucket{
					Samples:    samples,
					UpperBound: upperBound,
				}
			}
			if samplesForFieldKey == 0 {
				// Zero-valued distribution (no samples in any bucket for this field
				// combination). Ignore.
				continue
			}
			snapshot.Add(&prometheus.Data{
				Metric: m.prometheusMetric,
				Labels: labels,
				HistogramValue: &prometheus.Histogram{
					Total:                  prometheus.Number{Int: statistics[fieldKey].sampleSum},
					SumOfSquaredDeviations: prometheus.Number{Float: statistics[fieldKey].sumOfSquaredDeviations},
					Min:                    prometheus.Number{Int: statistics[fieldKey].min},
					Max:                    prometheus.Number{Int: statistics[fieldKey].max},
					Buckets:                buckets,
				},
			})
		}
	}
	return snapshot, nil
}
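
// A small sketch of exporting a filtered snapshot from a client package (the
// name-prefix filter is just one example of a predicate; any
// func(*prometheus.Metric) bool works):
//
//	snap, err := metric.GetSnapshot(metric.SnapshotOptions{
//		Filter: func(m *prometheus.Metric) bool {
//			return strings.HasPrefix(m.Name, "weirdness")
//		},
//	})
//	if err != nil {
//		// Metrics were not initialized (metric.ErrNotYetInitialized).
//	}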

// StartStage should be called when an initialization stage is started.
// It returns a function that must be called to indicate that the stage ended.
// Alternatively, future calls to StartStage will implicitly indicate that the
// previous stage ended.
// Stage information will be emitted in the next call to EmitMetricUpdate after
// a stage has ended.
//
// This function may (and is expected to) be called prior to final
// initialization of this metric library, as it has to capture early stages
// of Sentry initialization.
func StartStage(stage InitStage) func() {
	now := time.Now()
	allMetrics.mu.Lock()
	defer allMetrics.mu.Unlock()
	if allMetrics.currentStage.inProgress() {
		endStage(now)
	}
	allMetrics.currentStage.stage = stage
	allMetrics.currentStage.started = now
	return func() {
		now := time.Now()
		allMetrics.mu.Lock()
		defer allMetrics.mu.Unlock()
		// The current stage may have been ended by another call to StartStage, so
		// double-check prior to clearing the current stage.
		if allMetrics.currentStage.inProgress() && allMetrics.currentStage.stage == stage {
			endStage(now)
		}
	}
}
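
// Typical use is to bracket a stage with the returned closure (sketch only;
// the surrounding control flow is not part of this package):
//
//	endTaskStart := metric.StartStage(metric.InitTaskStart)
//	// ... start tasks ...
//	endTaskStart()
//
// Or, since a later StartStage implicitly ends the previous stage, callers
// may simply chain StartStage calls and only invoke the final closure.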

// endStage marks allMetrics.currentStage as ended, adding it to the list of
// finished stages. It assumes allMetrics.mu is locked.
func endStage(when time.Time) {
	allMetrics.currentStage.ended = when
	allMetrics.finished = append(allMetrics.finished, allMetrics.currentStage)
	allMetrics.currentStage = stageTiming{}
}