go.temporal.io/server@v1.23.0/common/metrics/config.go (about)

     1  // The MIT License
     2  //
     3  // Copyright (c) 2020 Temporal Technologies Inc.  All rights reserved.
     4  //
     5  // Copyright (c) 2020 Uber Technologies, Inc.
     6  //
     7  // Permission is hereby granted, free of charge, to any person obtaining a copy
     8  // of this software and associated documentation files (the "Software"), to deal
     9  // in the Software without restriction, including without limitation the rights
    10  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    11  // copies of the Software, and to permit persons to whom the Software is
    12  // furnished to do so, subject to the following conditions:
    13  //
    14  // The above copyright notice and this permission notice shall be included in
    15  // all copies or substantial portions of the Software.
    16  //
    17  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    18  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    19  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    20  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    21  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    22  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    23  // THE SOFTWARE.
    24  
    25  package metrics
    26  
    27  import (
    28  	"errors"
    29  	"fmt"
    30  	"time"
    31  
    32  	"github.com/cactus/go-statsd-client/v5/statsd"
    33  	prom "github.com/prometheus/client_golang/prometheus"
    34  	"github.com/uber-go/tally/v4"
    35  	"github.com/uber-go/tally/v4/m3"
    36  	"github.com/uber-go/tally/v4/prometheus"
    37  	"golang.org/x/exp/maps"
    38  
    39  	"go.temporal.io/server/common/log"
    40  	"go.temporal.io/server/common/log/tag"
    41  	statsdreporter "go.temporal.io/server/common/metrics/tally/statsd"
    42  )
    43  
type (
	// Config contains the config items for the metrics subsystem.
	Config struct {
		// ClientConfig holds the reporter-independent settings. It is embedded,
		// so its fields (Tags, Prefix, ...) are promoted onto Config.
		ClientConfig `yaml:"clientConfig,inline"`

		// M3 is the configuration for the m3 metrics reporter.
		// NOTE(review): NewScope in this file only inspects Statsd and
		// Prometheus; confirm where (or whether) this field is consumed.
		M3 *m3.Configuration `yaml:"m3"`
		// Statsd is the configuration for the statsd reporter. When set it
		// takes priority over Prometheus in NewScope.
		Statsd *StatsdConfig `yaml:"statsd"`
		// Prometheus is the configuration for the prometheus reporter.
		Prometheus *PrometheusConfig `yaml:"prometheus"`
	}

	// ClientConfig contains the settings shared by all metrics reporters.
	ClientConfig struct {
		// Tags is the set of key-value pairs to be reported as part of every metric.
		Tags map[string]string `yaml:"tags"`
		// ExcludeTags is a map from tag name to a list of tag values.
		// Every tag name present as a key has its tag value replaced with "_tag_excluded_".
		// Tag values present in the corresponding list are white-listed and reported as usual.
		ExcludeTags map[string][]string `yaml:"excludeTags"`
		// Prefix sets the prefix to all outgoing metrics.
		Prefix string `yaml:"prefix"`

		// PerUnitHistogramBoundaries configures the histogram bucket boundaries
		// used for each metric unit. Units that are not listed keep the
		// built-in defaults (see setDefaultPerUnitHistogramBoundaries).
		//
		// Supported keys:
		// - "dimensionless"
		// - "milliseconds"
		// - "bytes"
		PerUnitHistogramBoundaries map[string][]float64 `yaml:"perUnitHistogramBoundaries"`
	}

	// StatsdConfig contains the config items for the statsd metrics reporter.
	StatsdConfig struct {
		// HostPort is the host and port of the statsd server. When it is empty
		// newStatsdScope returns tally.NoopScope.
		HostPort string `yaml:"hostPort" validate:"nonzero"`
		// Prefix is the prefix to use in reporting to statsd.
		Prefix string `yaml:"prefix" validate:"nonzero"`
		// FlushInterval is the maximum interval for sending packets.
		// If it is not specified, it defaults to 1 second.
		FlushInterval time.Duration `yaml:"flushInterval"`
		// FlushBytes specifies the maximum udp packet size you wish to send.
		// If FlushBytes is unspecified, it defaults to 1432 bytes, which is
		// considered safe for local traffic.
		FlushBytes int `yaml:"flushBytes"`
		// Reporter allows additional configuration of the stats reporter, e.g. with custom tagging options.
		Reporter StatsdReporterConfig `yaml:"reporter"`
	}

	// StatsdReporterConfig contains the tagging options of the statsd reporter.
	StatsdReporterConfig struct {
		// TagSeparator allows tags to be appended with a separator. If not specified tag keys and values
		// are embedded to the stat name directly.
		TagSeparator string `yaml:"tagSeparator"`
	}

	// PrometheusConfig is a new format for config for prometheus metrics.
	PrometheusConfig struct {
		// Framework selects the metric framework: "tally" or "opentelemetry"
		// (see FrameworkTally and FrameworkOpentelemetry). MetricsHandlerFromConfig
		// uses OpenTelemetry only when this equals FrameworkOpentelemetry.
		Framework string `yaml:"framework"`
		// ListenAddress is the address for prometheus to serve metrics from.
		ListenAddress string `yaml:"listenAddress"`

		// HandlerPath if specified will be used instead of using the default
		// HTTP handler path "/metrics".
		HandlerPath string `yaml:"handlerPath"`

		// Configs below are kept for backwards compatibility with previously exposed tally prometheus.Configuration.

		// ListenNetwork if specified will be used instead of using tcp network.
		// Supported networks: tcp, tcp4, tcp6 and unix.
		//
		// Deprecated: kept for backwards compatibility only.
		ListenNetwork string `yaml:"listenNetwork"`

		// TimerType is the default Prometheus type to use for Tally timers.
		//
		// Deprecated: the value is ignored; the tally configuration built from
		// this struct always uses "histogram".
		TimerType string `yaml:"timerType"`

		// DefaultHistogramBoundaries defines the default histogram bucket boundaries for tally timer metrics.
		//
		// Deprecated: please use PerUnitHistogramBoundaries in ClientConfig.
		DefaultHistogramBoundaries []float64 `yaml:"defaultHistogramBoundaries"`

		// DefaultHistogramBuckets if specified will set the default histogram
		// buckets to be used by the reporter for tally timer metrics.
		// The unit for value specified is Second.
		// If specified, will override DefaultSummaryObjectives and PerUnitHistogramBoundaries["milliseconds"];
		// it also takes precedence over DefaultHistogramBoundaries.
		//
		// Deprecated: please use PerUnitHistogramBoundaries in ClientConfig.
		DefaultHistogramBuckets []HistogramObjective `yaml:"defaultHistogramBuckets"`

		// DefaultSummaryObjectives if specified will set the default summary
		// objectives to be used by the reporter.
		// The unit for value specified is Second.
		// If specified, will override PerUnitHistogramBoundaries["milliseconds"].
		//
		// Deprecated: kept for backwards compatibility only.
		DefaultSummaryObjectives []SummaryObjective `yaml:"defaultSummaryObjectives"`

		// OnError specifies what to do when an error either with listening
		// on the specified listen address or registering a metric with the
		// Prometheus. By default the registerer will panic.
		//
		// Deprecated: kept for backwards compatibility only.
		OnError string `yaml:"onError"`

		// SanitizeOptions is an optional field that enables a user to
		// specify which characters are valid and/or should be replaced before metrics
		// are emitted. When nil, the package default sanitize options are used.
		//
		// Deprecated: kept for backwards compatibility only.
		SanitizeOptions *SanitizeOptions `yaml:"sanitizeOptions"`
	}
)
   150  
// HistogramObjective is a Prometheus histogram bucket.
//
// Deprecated: retained only for backwards compatibility.
type HistogramObjective struct {
	// Upper is copied into the Upper field of the tally Prometheus
	// histogram objective built from this config.
	Upper float64 `yaml:"upper"`
}
   156  
// SummaryObjective is a Prometheus summary objective.
//
// Deprecated: retained only for backwards compatibility.
type SummaryObjective struct {
	// Percentile and AllowedError are copied unchanged into the matching
	// fields of the tally Prometheus summary objective.
	Percentile   float64 `yaml:"percentile"`
	AllowedError float64 `yaml:"allowedError"`
}
   163  
// SanitizeRange is the YAML representation of a range of valid characters.
// StartRange and EndRange must each contain exactly one rune, otherwise the
// conversion to the tally type fails.
type SanitizeRange struct {
	StartRange string `yaml:"startRange"`
	EndRange   string `yaml:"endRange"`
}
   168  
// ValidCharacters is the YAML representation of the set of characters that
// the sanitizer leaves untouched.
type ValidCharacters struct {
	// Ranges lists the character ranges considered valid.
	Ranges []SanitizeRange `yaml:"ranges"`
	// SafeCharacters holds additional valid characters; every rune of the
	// string becomes one entry of the tally Characters list.
	SafeCharacters string `yaml:"safeChars"`
}
   173  
// SanitizeOptions is the YAML representation of tally sanitize options for
// metric names, tag keys and tag values.
type SanitizeOptions struct {
	NameCharacters  *ValidCharacters `yaml:"nameChars"`
	KeyCharacters   *ValidCharacters `yaml:"keyChars"`
	ValueCharacters *ValidCharacters `yaml:"valueChars"`
	// ReplacementCharacter must contain exactly one rune; the conversion to
	// the tally type rejects anything else.
	ReplacementCharacter string `yaml:"replacementChar"`
}
   180  
// Supported metric framework identifiers, as accepted by
// PrometheusConfig.Framework.
const (
	// FrameworkTally is the tally framework id.
	FrameworkTally = "tally"
	// FrameworkOpentelemetry is the OpenTelemetry framework id.
	FrameworkOpentelemetry = "opentelemetry"
)
   188  
// Valid unit names, i.e. the keys accepted by the
// ClientConfig.PerUnitHistogramBoundaries config field.
const (
	UnitNameDimensionless = "dimensionless"
	UnitNameMilliseconds  = "milliseconds"
	UnitNameBytes         = "bytes"
)
   195  
   196  // tally sanitizer options that satisfy both Prometheus and M3 restrictions.
   197  // This will rename metrics at the tally emission level, so metrics name we
   198  // use maybe different from what gets emitted. In the current implementation
   199  // it will replace - and . with _
   200  // We should still ensure that the base metrics are prometheus compatible,
   201  // but this is necessary as the same prom client initialization is used by
   202  // our system workflows.
   203  var (
   204  	safeCharacters = []rune{'_'}
   205  
   206  	defaultTallySanitizeOptions = tally.SanitizeOptions{
   207  		NameCharacters: tally.ValidCharacters{
   208  			Ranges:     tally.AlphanumericRange,
   209  			Characters: safeCharacters,
   210  		},
   211  		KeyCharacters: tally.ValidCharacters{
   212  			Ranges:     tally.AlphanumericRange,
   213  			Characters: safeCharacters,
   214  		},
   215  		ValueCharacters: tally.ValidCharacters{
   216  			Ranges:     tally.AlphanumericRange,
   217  			Characters: safeCharacters,
   218  		},
   219  		ReplacementCharacter: tally.DefaultReplacementCharacter,
   220  	}
   221  
   222  	defaultPerUnitHistogramBoundaries = map[string][]float64{
   223  		Dimensionless: {
   224  			1,
   225  			2,
   226  			5,
   227  			10,
   228  			20,
   229  			50,
   230  			100,
   231  			200,
   232  			500,
   233  			1_000,
   234  			2_000,
   235  			5_000,
   236  			10_000,
   237  			20_000,
   238  			50_000,
   239  			100_000,
   240  		},
   241  		Milliseconds: {
   242  			1,
   243  			2,
   244  			5,
   245  			10,
   246  			20,
   247  			50,
   248  			100,
   249  			200,
   250  			500,
   251  			1_000, // 1s
   252  			2_000,
   253  			5_000,
   254  			10_000, // 10s
   255  			20_000,
   256  			50_000,
   257  			100_000, // 100s = 1m40s
   258  			200_000,
   259  			500_000,
   260  			1_000_000, // 1000s = 16m40s
   261  		},
   262  		Bytes: {
   263  			1024,
   264  			2048,
   265  			4096,
   266  			8192,
   267  			16384,
   268  			32768,
   269  			65536,
   270  			131072,
   271  			262144,
   272  			524288,
   273  			1048576,
   274  			2097152,
   275  			4194304,
   276  			8388608,
   277  			16777216,
   278  		},
   279  	}
   280  )
   281  
   282  // NewScope builds a new tally scope for this metrics configuration
   283  //
   284  // If the underlying configuration is valid for multiple reporter types,
   285  // only one of them will be used for reporting.
   286  //
   287  // Current priority order is:
   288  // statsd > prometheus
   289  func NewScope(logger log.Logger, c *Config) tally.Scope {
   290  	if c.Statsd != nil {
   291  		return newStatsdScope(logger, c)
   292  	}
   293  	if c.Prometheus != nil {
   294  		sanitizeOptions, err := convertSanitizeOptionsToTally(c.Prometheus)
   295  		if err != nil {
   296  			logger.Fatal("invalid sanitize options input on prometheus config", tag.Error(err))
   297  			return nil
   298  		}
   299  
   300  		return newPrometheusScope(
   301  			logger,
   302  			convertPrometheusConfigToTally(&c.ClientConfig, c.Prometheus),
   303  			sanitizeOptions,
   304  			&c.ClientConfig,
   305  		)
   306  	}
   307  	return tally.NoopScope
   308  }
   309  
   310  func convertSanitizeOptionsToTally(config *PrometheusConfig) (tally.SanitizeOptions, error) {
   311  	if config.SanitizeOptions == nil {
   312  		return defaultTallySanitizeOptions, nil
   313  	}
   314  
   315  	return config.SanitizeOptions.toTally()
   316  }
   317  
   318  func convertPrometheusConfigToTally(
   319  	clientConfig *ClientConfig,
   320  	config *PrometheusConfig,
   321  ) *prometheus.Configuration {
   322  	defaultObjectives := make([]prometheus.SummaryObjective, len(config.DefaultSummaryObjectives))
   323  	for i, item := range config.DefaultSummaryObjectives {
   324  		defaultObjectives[i].AllowedError = item.AllowedError
   325  		defaultObjectives[i].Percentile = item.Percentile
   326  	}
   327  
   328  	return &prometheus.Configuration{
   329  		HandlerPath:              config.HandlerPath,
   330  		ListenNetwork:            config.ListenNetwork,
   331  		ListenAddress:            config.ListenAddress,
   332  		TimerType:                "histogram",
   333  		DefaultHistogramBuckets:  buildTallyTimerHistogramBuckets(clientConfig, config),
   334  		DefaultSummaryObjectives: defaultObjectives,
   335  		OnError:                  config.OnError,
   336  	}
   337  }
   338  
   339  func buildTallyTimerHistogramBuckets(
   340  	clientConfig *ClientConfig,
   341  	config *PrometheusConfig,
   342  ) []prometheus.HistogramObjective {
   343  	if len(config.DefaultHistogramBuckets) > 0 {
   344  		result := make([]prometheus.HistogramObjective, len(config.DefaultHistogramBuckets))
   345  		for i, item := range config.DefaultHistogramBuckets {
   346  			result[i].Upper = item.Upper
   347  		}
   348  		return result
   349  	}
   350  
   351  	if len(config.DefaultHistogramBoundaries) > 0 {
   352  		result := make([]prometheus.HistogramObjective, 0, len(config.DefaultHistogramBoundaries))
   353  		for _, value := range config.DefaultHistogramBoundaries {
   354  			result = append(result, prometheus.HistogramObjective{
   355  				Upper: value,
   356  			})
   357  		}
   358  		return result
   359  	}
   360  
   361  	boundaries := clientConfig.PerUnitHistogramBoundaries[Milliseconds]
   362  	result := make([]prometheus.HistogramObjective, 0, len(boundaries))
   363  	for _, boundary := range boundaries {
   364  		result = append(result, prometheus.HistogramObjective{
   365  			Upper: boundary / float64(time.Second/time.Millisecond), // convert milliseconds to seconds
   366  		})
   367  	}
   368  	return result
   369  }
   370  
   371  func setDefaultPerUnitHistogramBoundaries(clientConfig *ClientConfig) {
   372  	buckets := maps.Clone(defaultPerUnitHistogramBoundaries)
   373  
   374  	// In config, when overwrite default buckets, we use [dimensionless / miliseconds / bytes] as keys.
   375  	// But in code, we use [1 / ms / By] as key (to align with otel unit definition). So we do conversion here.
   376  	if bucket, ok := clientConfig.PerUnitHistogramBoundaries[UnitNameDimensionless]; ok {
   377  		buckets[Dimensionless] = bucket
   378  	}
   379  	if bucket, ok := clientConfig.PerUnitHistogramBoundaries[UnitNameMilliseconds]; ok {
   380  		buckets[Milliseconds] = bucket
   381  	}
   382  	if bucket, ok := clientConfig.PerUnitHistogramBoundaries[UnitNameBytes]; ok {
   383  		buckets[Bytes] = bucket
   384  	}
   385  
   386  	clientConfig.PerUnitHistogramBoundaries = buckets
   387  }
   388  
   389  // newStatsdScope returns a new statsd scope with
   390  // a default reporting interval of a second
   391  func newStatsdScope(logger log.Logger, c *Config) tally.Scope {
   392  	config := c.Statsd
   393  	if len(config.HostPort) == 0 {
   394  		return tally.NoopScope
   395  	}
   396  	statter, err := statsd.NewClientWithConfig(&statsd.ClientConfig{
   397  		Address:       config.HostPort,
   398  		Prefix:        config.Prefix,
   399  		FlushInterval: config.FlushInterval,
   400  		FlushBytes:    config.FlushBytes,
   401  	})
   402  	if err != nil {
   403  		logger.Fatal("error creating statsd client", tag.Error(err))
   404  	}
   405  	// NOTE: according to (https://github.com/uber-go/tally) Tally's statsd implementation doesn't support tagging.
   406  	// Therefore, we implement Tally interface to have a statsd reporter that can support tagging
   407  	opts := statsdreporter.Options{
   408  		TagSeparator: c.Statsd.Reporter.TagSeparator,
   409  	}
   410  	reporter := statsdreporter.NewReporter(statter, opts)
   411  	scopeOpts := tally.ScopeOptions{
   412  		Tags:     c.Tags,
   413  		Reporter: reporter,
   414  		Prefix:   c.Prefix,
   415  	}
   416  	scope, _ := tally.NewRootScope(scopeOpts, time.Second)
   417  	return scope
   418  }
   419  
   420  // newPrometheusScope returns a new prometheus scope with
   421  // a default reporting interval of a second
   422  func newPrometheusScope(
   423  	logger log.Logger,
   424  	config *prometheus.Configuration,
   425  	sanitizeOptions tally.SanitizeOptions,
   426  	clientConfig *ClientConfig,
   427  ) tally.Scope {
   428  	reporter, err := config.NewReporter(
   429  		prometheus.ConfigurationOptions{
   430  			Registry: prom.NewRegistry(),
   431  			OnError: func(err error) {
   432  				logger.Warn("error in prometheus reporter", tag.Error(err))
   433  			},
   434  		},
   435  	)
   436  	if err != nil {
   437  		logger.Fatal("error creating prometheus reporter", tag.Error(err))
   438  	}
   439  	scopeOpts := tally.ScopeOptions{
   440  		Tags:            clientConfig.Tags,
   441  		CachedReporter:  reporter,
   442  		Separator:       prometheus.DefaultSeparator,
   443  		SanitizeOptions: &sanitizeOptions,
   444  		Prefix:          clientConfig.Prefix,
   445  	}
   446  	scope, _ := tally.NewRootScope(scopeOpts, time.Second)
   447  	return scope
   448  }
   449  
   450  // MetricsHandlerFromConfig is used at startup to construct a MetricsHandler
   451  func MetricsHandlerFromConfig(logger log.Logger, c *Config) (Handler, error) {
   452  	if c == nil {
   453  		return NoopMetricsHandler, nil
   454  	}
   455  
   456  	setDefaultPerUnitHistogramBoundaries(&c.ClientConfig)
   457  
   458  	if c.Prometheus != nil && c.Prometheus.Framework == FrameworkOpentelemetry {
   459  		otelProvider, err := NewOpenTelemetryProvider(logger, c.Prometheus, &c.ClientConfig)
   460  		if err != nil {
   461  			logger.Fatal(err.Error())
   462  		}
   463  
   464  		return NewOtelMetricsHandler(logger, otelProvider, c.ClientConfig)
   465  	}
   466  
   467  	return NewTallyMetricsHandler(
   468  		c.ClientConfig,
   469  		NewScope(logger, c),
   470  	), nil
   471  }
   472  
   473  func configExcludeTags(cfg ClientConfig) map[string]map[string]struct{} {
   474  	tagsToFilter := make(map[string]map[string]struct{})
   475  	for key, val := range cfg.ExcludeTags {
   476  		exclusions := make(map[string]struct{})
   477  		for _, val := range val {
   478  			exclusions[val] = struct{}{}
   479  		}
   480  		tagsToFilter[key] = exclusions
   481  	}
   482  	return tagsToFilter
   483  }
   484  
   485  func (s SanitizeRange) toTally() (tally.SanitizeRange, error) {
   486  	startRangeRunes := []rune(s.StartRange)
   487  	if len(startRangeRunes) != 1 {
   488  		return tally.SanitizeRange{}, fmt.Errorf("start range '%+v' must be a single rune", startRangeRunes)
   489  	}
   490  
   491  	endRangeRunes := []rune(s.EndRange)
   492  	if len(endRangeRunes) != 1 {
   493  		return tally.SanitizeRange{}, fmt.Errorf("end range '%+v' must be a single rune", endRangeRunes)
   494  	}
   495  
   496  	return tally.SanitizeRange([2]rune{startRangeRunes[0], endRangeRunes[0]}), nil
   497  }
   498  
   499  func (v ValidCharacters) toTally() (tally.ValidCharacters, error) {
   500  	var ranges []tally.SanitizeRange
   501  
   502  	for _, r := range v.Ranges {
   503  		tallyRange, err := r.toTally()
   504  		if err != nil {
   505  			return tally.ValidCharacters{}, err
   506  		}
   507  
   508  		ranges = append(ranges, tallyRange)
   509  	}
   510  
   511  	return tally.ValidCharacters{
   512  		Ranges:     ranges,
   513  		Characters: []rune(v.SafeCharacters),
   514  	}, nil
   515  }
   516  
   517  func (s SanitizeOptions) toTally() (tally.SanitizeOptions, error) {
   518  	tallyNameChars, err := s.NameCharacters.toTally()
   519  	if err != nil {
   520  		return tally.SanitizeOptions{}, fmt.Errorf("invalid nameChars: %v", err)
   521  	}
   522  
   523  	tallyKeyChars, err := s.KeyCharacters.toTally()
   524  	if err != nil {
   525  		return tally.SanitizeOptions{}, fmt.Errorf("invalid keyChars: %v", err)
   526  	}
   527  
   528  	tallyValueChars, err := s.ValueCharacters.toTally()
   529  	if err != nil {
   530  		return tally.SanitizeOptions{}, fmt.Errorf("invalid valueChars: %v", err)
   531  	}
   532  
   533  	replacementChars := []rune(s.ReplacementCharacter)
   534  	if len(replacementChars) != 1 {
   535  		return tally.SanitizeOptions{}, errors.New("can only specify a single replacement character")
   536  	}
   537  
   538  	return tally.SanitizeOptions{
   539  		NameCharacters:       tallyNameChars,
   540  		KeyCharacters:        tallyKeyChars,
   541  		ValueCharacters:      tallyValueChars,
   542  		ReplacementCharacter: replacementChars[0],
   543  	}, nil
   544  }