go.temporal.io/server@v1.23.0/common/metrics/config.go (about) 1 // The MIT License 2 // 3 // Copyright (c) 2020 Temporal Technologies Inc. All rights reserved. 4 // 5 // Copyright (c) 2020 Uber Technologies, Inc. 6 // 7 // Permission is hereby granted, free of charge, to any person obtaining a copy 8 // of this software and associated documentation files (the "Software"), to deal 9 // in the Software without restriction, including without limitation the rights 10 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 // copies of the Software, and to permit persons to whom the Software is 12 // furnished to do so, subject to the following conditions: 13 // 14 // The above copyright notice and this permission notice shall be included in 15 // all copies or substantial portions of the Software. 16 // 17 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 23 // THE SOFTWARE. 
package metrics

import (
	"errors"
	"fmt"
	"time"

	"github.com/cactus/go-statsd-client/v5/statsd"
	prom "github.com/prometheus/client_golang/prometheus"
	"github.com/uber-go/tally/v4"
	"github.com/uber-go/tally/v4/m3"
	"github.com/uber-go/tally/v4/prometheus"
	"golang.org/x/exp/maps"

	"go.temporal.io/server/common/log"
	"go.temporal.io/server/common/log/tag"
	statsdreporter "go.temporal.io/server/common/metrics/tally/statsd"
)

type (
	// Config contains the config items for the metrics subsystem.
	// It embeds ClientConfig inline and carries one optional section per
	// supported reporter.
	Config struct {
		ClientConfig `yaml:"clientConfig,inline"`

		// M3 is the configuration for the m3 metrics reporter.
		// NOTE(review): NewScope in this file only consults Statsd and
		// Prometheus — confirm whether M3 is still honored elsewhere.
		M3 *m3.Configuration `yaml:"m3"`
		// Statsd is the configuration for the statsd reporter.
		Statsd *StatsdConfig `yaml:"statsd"`
		// Prometheus is the configuration for the prometheus reporter.
		Prometheus *PrometheusConfig `yaml:"prometheus"`
	}

	// ClientConfig holds the reporter-independent metrics settings: global
	// tags, tag exclusions, metric name prefix and histogram boundaries.
	ClientConfig struct {
		// Tags is the set of key-value pairs to be reported as part of every metric.
		Tags map[string]string `yaml:"tags"`
		// ExcludeTags is a map from tag name to a list of tag values.
		// For every tag name present as a key, the tag value is replaced with
		// "_tag_excluded_", except for the values in the associated list,
		// which are white-listed and reported as usual.
		ExcludeTags map[string][]string `yaml:"excludeTags"`
		// Prefix sets the prefix to all outgoing metrics.
		Prefix string `yaml:"prefix"`

		// PerUnitHistogramBoundaries configures the histogram bucket
		// boundaries for a given metric unit.
		//
		// Supported keys (see the UnitName* constants):
		// - "dimensionless"
		// - "milliseconds"
		// - "bytes"
		//
		// setDefaultPerUnitHistogramBoundaries replaces this map at startup
		// with one keyed by the internal unit constants, filling in built-in
		// defaults for units that were not configured.
		PerUnitHistogramBoundaries map[string][]float64 `yaml:"perUnitHistogramBoundaries"`
	}

	// StatsdConfig contains the config items for the statsd metrics reporter.
	StatsdConfig struct {
		// The host and port of the statsd server.
		HostPort string `yaml:"hostPort" validate:"nonzero"`
		// The prefix to use in reporting to statsd.
		Prefix string `yaml:"prefix" validate:"nonzero"`
		// FlushInterval is the maximum interval for sending packets.
		// If it is not specified, it defaults to 1 second.
		FlushInterval time.Duration `yaml:"flushInterval"`
		// FlushBytes specifies the maximum udp packet size you wish to send.
		// If FlushBytes is unspecified, it defaults to 1432 bytes, which is
		// considered safe for local traffic.
		FlushBytes int `yaml:"flushBytes"`
		// Reporter allows additional configuration of the stats reporter, e.g. with custom tagging options.
		Reporter StatsdReporterConfig `yaml:"reporter"`
	}

	// StatsdReporterConfig carries the options forwarded to the custom
	// statsd reporter (statsdreporter.Options).
	StatsdReporterConfig struct {
		// TagSeparator allows tags to be appended with a separator. If not specified tag keys and values
		// are embedded to the stat name directly.
		TagSeparator string `yaml:"tagSeparator"`
	}

	// PrometheusConfig is a new format for config for prometheus metrics.
	PrometheusConfig struct {
		// Framework selects the metrics framework: FrameworkTally or
		// FrameworkOpentelemetry. Any value other than FrameworkOpentelemetry
		// results in the tally handler being used.
		Framework string `yaml:"framework"`
		// Address for prometheus to serve metrics from.
		ListenAddress string `yaml:"listenAddress"`

		// HandlerPath if specified will be used instead of using the default
		// HTTP handler path "/metrics".
		HandlerPath string `yaml:"handlerPath"`

		// Configs below are kept for backwards compatibility with previously exposed tally prometheus.Configuration.

		// ListenNetwork if specified will be used instead of using tcp network.
		// Supported networks: tcp, tcp4, tcp6 and unix.
		//
		// Deprecated: kept for backwards compatibility only.
		ListenNetwork string `yaml:"listenNetwork"`

		// TimerType is the default Prometheus type to use for Tally timers.
		// The configured value is ignored: the timer type is always histogram.
		//
		// Deprecated: kept for backwards compatibility only.
		TimerType string `yaml:"timerType"`

		// DefaultHistogramBoundaries defines the default histogram bucket boundaries for tally timer metrics.
		//
		// Deprecated: please use PerUnitHistogramBoundaries in ClientConfig.
		DefaultHistogramBoundaries []float64 `yaml:"defaultHistogramBoundaries"`

		// DefaultHistogramBuckets if specified will set the default histogram
		// buckets to be used by the reporter for tally timer metrics.
		// The unit for value specified is Second.
		// If specified, will override DefaultSummaryObjectives and PerUnitHistogramBoundaries["milliseconds"].
		//
		// Deprecated: please use PerUnitHistogramBoundaries in ClientConfig.
		DefaultHistogramBuckets []HistogramObjective `yaml:"defaultHistogramBuckets"`

		// DefaultSummaryObjectives if specified will set the default summary
		// objectives to be used by the reporter.
		// The unit for value specified is Second.
		// If specified, will override PerUnitHistogramBoundaries["milliseconds"].
		//
		// Deprecated: kept for backwards compatibility only.
		DefaultSummaryObjectives []SummaryObjective `yaml:"defaultSummaryObjectives"`

		// OnError specifies what to do when an error either with listening
		// on the specified listen address or registering a metric with the
		// Prometheus. By default the registerer will panic.
		//
		// Deprecated: kept for backwards compatibility only.
		OnError string `yaml:"onError"`

		// SanitizeOptions is an optional field that enables a user to
		// specify which characters are valid and/or should be replaced before metrics
		// are emitted. When nil, the package default sanitize options are used.
		//
		// Deprecated: kept for backwards compatibility only.
		SanitizeOptions *SanitizeOptions `yaml:"sanitizeOptions"`
	}
)

// HistogramObjective is a Prometheus histogram bucket.
//
// Deprecated: added for backwards compatibility.
type HistogramObjective struct {
	Upper float64 `yaml:"upper"`
}

// SummaryObjective is a Prometheus summary objective.
//
// Deprecated: added for backwards compatibility.
type SummaryObjective struct {
	Percentile   float64 `yaml:"percentile"`
	AllowedError float64 `yaml:"allowedError"`
}

// SanitizeRange is the YAML form of an inclusive character range.
// StartRange and EndRange must each contain exactly one rune; anything else
// is rejected when the range is converted to its tally equivalent.
type SanitizeRange struct {
	StartRange string `yaml:"startRange"`
	EndRange   string `yaml:"endRange"`
}

// ValidCharacters is the YAML form of a set of permitted characters: a list
// of ranges plus a string whose individual runes are all permitted.
type ValidCharacters struct {
	Ranges         []SanitizeRange `yaml:"ranges"`
	SafeCharacters string          `yaml:"safeChars"`
}

// SanitizeOptions is the YAML form of tally.SanitizeOptions. It lists the
// characters permitted in metric names, tag keys and tag values, and the
// single character used to replace everything else. ReplacementCharacter must
// contain exactly one rune.
// NOTE(review): the three *ValidCharacters fields are dereferenced during
// conversion, so presumably all of them must be set — confirm.
type SanitizeOptions struct {
	NameCharacters       *ValidCharacters `yaml:"nameChars"`
	KeyCharacters        *ValidCharacters `yaml:"keyChars"`
	ValueCharacters      *ValidCharacters `yaml:"valueChars"`
	ReplacementCharacter string           `yaml:"replacementChar"`
}

// Supported framework types
const (
	// FrameworkTally tally framework id
	FrameworkTally = "tally"
	// FrameworkOpentelemetry OpenTelemetry framework id
	FrameworkOpentelemetry = "opentelemetry"
)

// Valid unit names for the PerUnitHistogramBoundaries config field.
const (
	UnitNameDimensionless = "dimensionless"
	UnitNameMilliseconds  = "milliseconds"
	UnitNameBytes         = "bytes"
)

// tally sanitizer options that satisfy both Prometheus and M3 restrictions.
// This will rename metrics at the tally emission level, so the metric names we
// use may be different from what gets emitted. In the current implementation
// it will replace - and . with _
// We should still ensure that the base metrics are prometheus compatible,
// but this is necessary as the same prom client initialization is used by
// our system workflows.
var (
	// safeCharacters lists the characters accepted in addition to the
	// alphanumeric ranges; it is shared by names, tag keys and tag values.
	safeCharacters = []rune{'_'}

	// defaultTallySanitizeOptions is used for the prometheus scope whenever
	// the config does not provide its own SanitizeOptions. Names, keys and
	// values all accept tally.AlphanumericRange plus safeCharacters.
	defaultTallySanitizeOptions = tally.SanitizeOptions{
		NameCharacters: tally.ValidCharacters{
			Ranges:     tally.AlphanumericRange,
			Characters: safeCharacters,
		},
		KeyCharacters: tally.ValidCharacters{
			Ranges:     tally.AlphanumericRange,
			Characters: safeCharacters,
		},
		ValueCharacters: tally.ValidCharacters{
			Ranges:     tally.AlphanumericRange,
			Characters: safeCharacters,
		},
		ReplacementCharacter: tally.DefaultReplacementCharacter,
	}

	// defaultPerUnitHistogramBoundaries holds the built-in histogram bucket
	// boundaries, keyed by the package's unit constants (Dimensionless,
	// Milliseconds, Bytes) rather than by the UnitName* config keys.
	defaultPerUnitHistogramBoundaries = map[string][]float64{
		// 1 .. 100_000 in a 1-2-5 progression.
		Dimensionless: {
			1,
			2,
			5,
			10,
			20,
			50,
			100,
			200,
			500,
			1_000,
			2_000,
			5_000,
			10_000,
			20_000,
			50_000,
			100_000,
		},
		// 1ms .. 1_000_000ms in a 1-2-5 progression.
		Milliseconds: {
			1,
			2,
			5,
			10,
			20,
			50,
			100,
			200,
			500,
			1_000, // 1s
			2_000,
			5_000,
			10_000, // 10s
			20_000,
			50_000,
			100_000, // 100s = 1m40s
			200_000,
			500_000,
			1_000_000, // 1000s = 16m40s
		},
		// Powers of two from 1024 (1 KiB) to 16777216 (16 MiB).
		Bytes: {
			1024,
			2048,
			4096,
			8192,
			16384,
			32768,
			65536,
			131072,
			262144,
			524288,
			1048576,
			2097152,
			4194304,
			8388608,
			16777216,
		},
	}
)

// NewScope builds a new tally scope for this metrics configuration
//
// If the underlying configuration is valid for multiple reporter types,
// only one of them will be used for reporting.
286 // 287 // Current priority order is: 288 // statsd > prometheus 289 func NewScope(logger log.Logger, c *Config) tally.Scope { 290 if c.Statsd != nil { 291 return newStatsdScope(logger, c) 292 } 293 if c.Prometheus != nil { 294 sanitizeOptions, err := convertSanitizeOptionsToTally(c.Prometheus) 295 if err != nil { 296 logger.Fatal("invalid sanitize options input on prometheus config", tag.Error(err)) 297 return nil 298 } 299 300 return newPrometheusScope( 301 logger, 302 convertPrometheusConfigToTally(&c.ClientConfig, c.Prometheus), 303 sanitizeOptions, 304 &c.ClientConfig, 305 ) 306 } 307 return tally.NoopScope 308 } 309 310 func convertSanitizeOptionsToTally(config *PrometheusConfig) (tally.SanitizeOptions, error) { 311 if config.SanitizeOptions == nil { 312 return defaultTallySanitizeOptions, nil 313 } 314 315 return config.SanitizeOptions.toTally() 316 } 317 318 func convertPrometheusConfigToTally( 319 clientConfig *ClientConfig, 320 config *PrometheusConfig, 321 ) *prometheus.Configuration { 322 defaultObjectives := make([]prometheus.SummaryObjective, len(config.DefaultSummaryObjectives)) 323 for i, item := range config.DefaultSummaryObjectives { 324 defaultObjectives[i].AllowedError = item.AllowedError 325 defaultObjectives[i].Percentile = item.Percentile 326 } 327 328 return &prometheus.Configuration{ 329 HandlerPath: config.HandlerPath, 330 ListenNetwork: config.ListenNetwork, 331 ListenAddress: config.ListenAddress, 332 TimerType: "histogram", 333 DefaultHistogramBuckets: buildTallyTimerHistogramBuckets(clientConfig, config), 334 DefaultSummaryObjectives: defaultObjectives, 335 OnError: config.OnError, 336 } 337 } 338 339 func buildTallyTimerHistogramBuckets( 340 clientConfig *ClientConfig, 341 config *PrometheusConfig, 342 ) []prometheus.HistogramObjective { 343 if len(config.DefaultHistogramBuckets) > 0 { 344 result := make([]prometheus.HistogramObjective, len(config.DefaultHistogramBuckets)) 345 for i, item := range config.DefaultHistogramBuckets { 
346 result[i].Upper = item.Upper 347 } 348 return result 349 } 350 351 if len(config.DefaultHistogramBoundaries) > 0 { 352 result := make([]prometheus.HistogramObjective, 0, len(config.DefaultHistogramBoundaries)) 353 for _, value := range config.DefaultHistogramBoundaries { 354 result = append(result, prometheus.HistogramObjective{ 355 Upper: value, 356 }) 357 } 358 return result 359 } 360 361 boundaries := clientConfig.PerUnitHistogramBoundaries[Milliseconds] 362 result := make([]prometheus.HistogramObjective, 0, len(boundaries)) 363 for _, boundary := range boundaries { 364 result = append(result, prometheus.HistogramObjective{ 365 Upper: boundary / float64(time.Second/time.Millisecond), // convert milliseconds to seconds 366 }) 367 } 368 return result 369 } 370 371 func setDefaultPerUnitHistogramBoundaries(clientConfig *ClientConfig) { 372 buckets := maps.Clone(defaultPerUnitHistogramBoundaries) 373 374 // In config, when overwrite default buckets, we use [dimensionless / miliseconds / bytes] as keys. 375 // But in code, we use [1 / ms / By] as key (to align with otel unit definition). So we do conversion here. 
376 if bucket, ok := clientConfig.PerUnitHistogramBoundaries[UnitNameDimensionless]; ok { 377 buckets[Dimensionless] = bucket 378 } 379 if bucket, ok := clientConfig.PerUnitHistogramBoundaries[UnitNameMilliseconds]; ok { 380 buckets[Milliseconds] = bucket 381 } 382 if bucket, ok := clientConfig.PerUnitHistogramBoundaries[UnitNameBytes]; ok { 383 buckets[Bytes] = bucket 384 } 385 386 clientConfig.PerUnitHistogramBoundaries = buckets 387 } 388 389 // newStatsdScope returns a new statsd scope with 390 // a default reporting interval of a second 391 func newStatsdScope(logger log.Logger, c *Config) tally.Scope { 392 config := c.Statsd 393 if len(config.HostPort) == 0 { 394 return tally.NoopScope 395 } 396 statter, err := statsd.NewClientWithConfig(&statsd.ClientConfig{ 397 Address: config.HostPort, 398 Prefix: config.Prefix, 399 FlushInterval: config.FlushInterval, 400 FlushBytes: config.FlushBytes, 401 }) 402 if err != nil { 403 logger.Fatal("error creating statsd client", tag.Error(err)) 404 } 405 // NOTE: according to (https://github.com/uber-go/tally) Tally's statsd implementation doesn't support tagging. 
406 // Therefore, we implement Tally interface to have a statsd reporter that can support tagging 407 opts := statsdreporter.Options{ 408 TagSeparator: c.Statsd.Reporter.TagSeparator, 409 } 410 reporter := statsdreporter.NewReporter(statter, opts) 411 scopeOpts := tally.ScopeOptions{ 412 Tags: c.Tags, 413 Reporter: reporter, 414 Prefix: c.Prefix, 415 } 416 scope, _ := tally.NewRootScope(scopeOpts, time.Second) 417 return scope 418 } 419 420 // newPrometheusScope returns a new prometheus scope with 421 // a default reporting interval of a second 422 func newPrometheusScope( 423 logger log.Logger, 424 config *prometheus.Configuration, 425 sanitizeOptions tally.SanitizeOptions, 426 clientConfig *ClientConfig, 427 ) tally.Scope { 428 reporter, err := config.NewReporter( 429 prometheus.ConfigurationOptions{ 430 Registry: prom.NewRegistry(), 431 OnError: func(err error) { 432 logger.Warn("error in prometheus reporter", tag.Error(err)) 433 }, 434 }, 435 ) 436 if err != nil { 437 logger.Fatal("error creating prometheus reporter", tag.Error(err)) 438 } 439 scopeOpts := tally.ScopeOptions{ 440 Tags: clientConfig.Tags, 441 CachedReporter: reporter, 442 Separator: prometheus.DefaultSeparator, 443 SanitizeOptions: &sanitizeOptions, 444 Prefix: clientConfig.Prefix, 445 } 446 scope, _ := tally.NewRootScope(scopeOpts, time.Second) 447 return scope 448 } 449 450 // MetricsHandlerFromConfig is used at startup to construct a MetricsHandler 451 func MetricsHandlerFromConfig(logger log.Logger, c *Config) (Handler, error) { 452 if c == nil { 453 return NoopMetricsHandler, nil 454 } 455 456 setDefaultPerUnitHistogramBoundaries(&c.ClientConfig) 457 458 if c.Prometheus != nil && c.Prometheus.Framework == FrameworkOpentelemetry { 459 otelProvider, err := NewOpenTelemetryProvider(logger, c.Prometheus, &c.ClientConfig) 460 if err != nil { 461 logger.Fatal(err.Error()) 462 } 463 464 return NewOtelMetricsHandler(logger, otelProvider, c.ClientConfig) 465 } 466 467 return NewTallyMetricsHandler( 
468 c.ClientConfig, 469 NewScope(logger, c), 470 ), nil 471 } 472 473 func configExcludeTags(cfg ClientConfig) map[string]map[string]struct{} { 474 tagsToFilter := make(map[string]map[string]struct{}) 475 for key, val := range cfg.ExcludeTags { 476 exclusions := make(map[string]struct{}) 477 for _, val := range val { 478 exclusions[val] = struct{}{} 479 } 480 tagsToFilter[key] = exclusions 481 } 482 return tagsToFilter 483 } 484 485 func (s SanitizeRange) toTally() (tally.SanitizeRange, error) { 486 startRangeRunes := []rune(s.StartRange) 487 if len(startRangeRunes) != 1 { 488 return tally.SanitizeRange{}, fmt.Errorf("start range '%+v' must be a single rune", startRangeRunes) 489 } 490 491 endRangeRunes := []rune(s.EndRange) 492 if len(endRangeRunes) != 1 { 493 return tally.SanitizeRange{}, fmt.Errorf("end range '%+v' must be a single rune", endRangeRunes) 494 } 495 496 return tally.SanitizeRange([2]rune{startRangeRunes[0], endRangeRunes[0]}), nil 497 } 498 499 func (v ValidCharacters) toTally() (tally.ValidCharacters, error) { 500 var ranges []tally.SanitizeRange 501 502 for _, r := range v.Ranges { 503 tallyRange, err := r.toTally() 504 if err != nil { 505 return tally.ValidCharacters{}, err 506 } 507 508 ranges = append(ranges, tallyRange) 509 } 510 511 return tally.ValidCharacters{ 512 Ranges: ranges, 513 Characters: []rune(v.SafeCharacters), 514 }, nil 515 } 516 517 func (s SanitizeOptions) toTally() (tally.SanitizeOptions, error) { 518 tallyNameChars, err := s.NameCharacters.toTally() 519 if err != nil { 520 return tally.SanitizeOptions{}, fmt.Errorf("invalid nameChars: %v", err) 521 } 522 523 tallyKeyChars, err := s.KeyCharacters.toTally() 524 if err != nil { 525 return tally.SanitizeOptions{}, fmt.Errorf("invalid keyChars: %v", err) 526 } 527 528 tallyValueChars, err := s.ValueCharacters.toTally() 529 if err != nil { 530 return tally.SanitizeOptions{}, fmt.Errorf("invalid valueChars: %v", err) 531 } 532 533 replacementChars := []rune(s.ReplacementCharacter) 
534 if len(replacementChars) != 1 { 535 return tally.SanitizeOptions{}, errors.New("can only specify a single replacement character") 536 } 537 538 return tally.SanitizeOptions{ 539 NameCharacters: tallyNameChars, 540 KeyCharacters: tallyKeyChars, 541 ValueCharacters: tallyValueChars, 542 ReplacementCharacter: replacementChars[0], 543 }, nil 544 }