go.uber.org/yarpc@v1.72.1/internal/observability/graph.go

     1  // Copyright (c) 2022 Uber Technologies, Inc.
     2  //
     3  // Permission is hereby granted, free of charge, to any person obtaining a copy
     4  // of this software and associated documentation files (the "Software"), to deal
     5  // in the Software without restriction, including without limitation the rights
     6  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     7  // copies of the Software, and to permit persons to whom the Software is
     8  // furnished to do so, subject to the following conditions:
     9  //
    10  // The above copyright notice and this permission notice shall be included in
    11  // all copies or substantial portions of the Software.
    12  //
    13  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    14  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    15  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    16  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    17  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    18  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    19  // THE SOFTWARE.
    20  
    21  package observability
    22  
    23  import (
    24  	"context"
    25  	"sync"
    26  	"time"
    27  
    28  	"go.uber.org/net/metrics"
    29  	"go.uber.org/net/metrics/bucket"
    30  	"go.uber.org/yarpc/api/transport"
    31  	"go.uber.org/yarpc/internal/digester"
    32  	"go.uber.org/zap"
    33  	"go.uber.org/zap/zapcore"
    34  )
    35  
    36  var (
    37  	_timeNow          = time.Now // for tests
    38  	_defaultGraphSize = 128
    39  	// Latency buckets for histograms. At some point, we may want to make these
    40  	// configurable.
    41  	_bucketsMs = bucket.NewRPCLatency()
    42  	// Byte buckets for payload size histograms, containing exponential buckets
    43  	// in the range 0B, 1B, 2B, ..., 256MB.
    44  	_bucketsBytes = append([]int64{0}, bucket.NewExponential(1, 2, 29)...)
    45  )
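
// The byte buckets above expand to 0 followed by 29 powers of two, 1B through
// 2^28 bytes (256MB). A hand-rolled, illustrative sketch of the same values
// (not part of the upstream file), assuming bucket.NewExponential(1, 2, 29)
// returns 29 exponentially growing values starting at 1 with factor 2:
func exampleByteBuckets() []int64 {
	buckets := []int64{0}
	for v := int64(1); v <= 1<<28; v *= 2 {
		buckets = append(buckets, v)
	}
	return buckets // 0, 1, 2, 4, ..., 268435456
}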
    46  
    47  type directionName string
    48  
    49  const (
    50  	_directionOutbound directionName = "outbound"
    51  	_directionInbound  directionName = "inbound"
    52  
    53  	// _droppedTagValue represents the value of a metric tag when the tag
    54  	// is being blocked.
    55  	_droppedTagValue = "__dropped__"
    56  
    57  	_source          = "source"
    58  	_dest            = "dest"
    59  	_transport       = "transport"
    60  	_procedure       = "procedure"
    61  	_encoding        = "encoding"
    62  	_routingKey      = "routing_key"
    63  	_routingDelegate = "routing_delegate"
    64  	_direction       = "direction"
    65  	_rpcType         = "rpc_type"
    66  )
    67  
    68  // A graph represents a collection of services: each service is a node, and we
    69  // collect stats for each caller-callee-transport-encoding-procedure-routing key-routing delegate edge.
    70  type graph struct {
    71  	meter            *metrics.Scope
    72  	logger           *zap.Logger
    73  	extract          ContextExtractor
    74  	ignoreMetricsTag *metricsTagIgnore
    75  
    76  	edgesMu sync.RWMutex
    77  	edges   map[string]*edge
    78  
    79  	inboundLevels, outboundLevels levels
    80  }
    81  
    82  // If a field is set to true, the corresponding metrics tag is replaced with _droppedTagValue.
    83  // A struct of bools is more performant than a map (field lookup versus map lookup).
    84  type metricsTagIgnore struct {
    85  	source          bool
    86  	dest            bool
    87  	transport       bool
    88  	procedure       bool
    89  	encoding        bool
    90  	routingKey      bool
    91  	routingDelegate bool
    92  	direction       bool
    93  	rpcType         bool
    94  }
    95  
    96  func newMetricsTagIgnore(metricTagsIgnore []string) *metricsTagIgnore {
    97  	r := new(metricsTagIgnore)
    98  	for _, m := range metricTagsIgnore {
    99  		switch m {
   100  		case _source:
   101  			r.source = true
   102  		case _dest:
   103  			r.dest = true
   104  		case _transport:
   105  			r.transport = true
   106  		case _procedure:
   107  			r.procedure = true
   108  		case _encoding:
   109  			r.encoding = true
   110  		case _routingKey:
   111  			r.routingKey = true
   112  		case _routingDelegate:
   113  			r.routingDelegate = true
   114  		case _direction:
   115  			r.direction = true
   116  		case _rpcType:
   117  			r.rpcType = true
   118  		}
   119  	}
   120  	return r
   121  }
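
// exampleIgnoreRoutingTags is an illustrative sketch (not part of the upstream
// file) of how a configured ignore list maps onto the struct fields: only the
// named tags are flipped to true, everything else stays false.
func exampleIgnoreRoutingTags() *metricsTagIgnore {
	ignore := newMetricsTagIgnore([]string{_routingKey, _routingDelegate})
	// ignore.routingKey and ignore.routingDelegate are now true; all other
	// fields remain false, so their tags are emitted with real values.
	return ignore
}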
   122  
   123  func (m *metricsTagIgnore) tags(req *transport.Request, direction string, rpcType transport.Type) metrics.Tags {
   124  	tags := metrics.Tags{
   125  		_source:          req.Caller,
   126  		_dest:            req.Service,
   127  		_transport:       unknownIfEmpty(req.Transport),
   128  		_procedure:       req.Procedure,
   129  		_encoding:        string(req.Encoding),
   130  		_routingKey:      req.RoutingKey,
   131  		_routingDelegate: req.RoutingDelegate,
   132  		_direction:       direction,
   133  		_rpcType:         rpcType.String(),
   134  	}
   135  
   136  	if m.source {
   137  		tags[_source] = _droppedTagValue
   138  	}
   139  	if m.dest {
   140  		tags[_dest] = _droppedTagValue
   141  	}
   142  	if m.transport {
   143  		tags[_transport] = _droppedTagValue
   144  	}
   145  	if m.encoding {
   146  		tags[_encoding] = _droppedTagValue
   147  	}
   148  	if m.procedure {
   149  		tags[_procedure] = _droppedTagValue
   150  	}
   151  	if m.routingKey {
   152  		tags[_routingKey] = _droppedTagValue
   153  	}
   154  	if m.routingDelegate {
   155  		tags[_routingDelegate] = _droppedTagValue
   156  	}
   157  	if m.direction {
   158  		tags[_direction] = _droppedTagValue
   159  	}
   160  	if m.rpcType {
   161  		tags[_rpcType] = _droppedTagValue
   162  	}
   163  
   164  	return tags
   165  }
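
// exampleDroppedTags is an illustrative sketch (not part of the upstream file)
// showing what tags returns when a tag is ignored: the key is still present,
// but its value is replaced with _droppedTagValue instead of being omitted, so
// the tag cardinality on the metric stays fixed.
func exampleDroppedTags(req *transport.Request) metrics.Tags {
	ignore := newMetricsTagIgnore([]string{_procedure})
	tags := ignore.tags(req, string(_directionInbound), transport.Unary)
	// tags[_procedure] == _droppedTagValue; every other tag carries the
	// request's real value.
	return tags
}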
   166  
   167  func newGraph(meter *metrics.Scope, logger *zap.Logger, extract ContextExtractor, metricTagsIgnore []string) graph {
   168  	return graph{
   169  		edges:            make(map[string]*edge, _defaultGraphSize),
   170  		meter:            meter,
   171  		logger:           logger,
   172  		extract:          extract,
   173  		ignoreMetricsTag: newMetricsTagIgnore(metricTagsIgnore),
   174  		inboundLevels: levels{
   175  			success:          zapcore.DebugLevel,
   176  			failure:          zapcore.ErrorLevel,
   177  			applicationError: zapcore.ErrorLevel,
   178  			serverError:      zapcore.ErrorLevel,
   179  			clientError:      zapcore.ErrorLevel,
   180  		},
   181  		outboundLevels: levels{
   182  			success:          zapcore.DebugLevel,
   183  			failure:          zapcore.ErrorLevel,
   184  			applicationError: zapcore.ErrorLevel,
   185  			serverError:      zapcore.ErrorLevel,
   186  			clientError:      zapcore.ErrorLevel,
   187  		},
   188  	}
   189  }
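
// exampleNewGraph is an illustrative sketch (not part of the upstream file) of
// wiring a graph together; the ContextExtractor is taken as a parameter here
// since its constructors live elsewhere in this package.
func exampleNewGraph(meter *metrics.Scope, logger *zap.Logger, extract ContextExtractor) graph {
	// Drop the routing tags so that edges (and their metrics) do not fan out
	// by routing key or routing delegate.
	return newGraph(meter, logger, extract, []string{_routingKey, _routingDelegate})
}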
   190  
   191  // begin starts a call along an edge.
   192  func (g *graph) begin(ctx context.Context, rpcType transport.Type, direction directionName, req *transport.Request) call {
   193  	now := _timeNow()
   194  
   195  	d := digester.New()
   196  	if !g.ignoreMetricsTag.source {
   197  		d.Add(req.Caller)
   198  	}
   199  	if !g.ignoreMetricsTag.dest {
   200  		d.Add(req.Service)
   201  	}
   202  	if !g.ignoreMetricsTag.transport {
   203  		d.Add(req.Transport)
   204  	}
   205  	if !g.ignoreMetricsTag.encoding {
   206  		d.Add(string(req.Encoding))
   207  	}
   208  	if !g.ignoreMetricsTag.procedure {
   209  		d.Add(req.Procedure)
   210  	}
   211  	if !g.ignoreMetricsTag.routingKey {
   212  		d.Add(req.RoutingKey)
   213  	}
   214  	if !g.ignoreMetricsTag.routingDelegate {
   215  		d.Add(req.RoutingDelegate)
   216  	}
   217  	if !g.ignoreMetricsTag.direction {
   218  		d.Add(string(direction))
   219  	}
   220  	if !g.ignoreMetricsTag.rpcType {
   221  		d.Add(rpcType.String())
   222  	}
   223  	e := g.getOrCreateEdge(d.Digest(), req, string(direction), rpcType)
   224  	d.Free()
   225  
   226  	levels := &g.inboundLevels
   227  	if direction != _directionInbound {
   228  		levels = &g.outboundLevels
   229  	}
   230  
   231  	return call{
   232  		edge:      e,
   233  		extract:   g.extract,
   234  		started:   now,
   235  		ctx:       ctx,
   236  		req:       req,
   237  		rpcType:   rpcType,
   238  		direction: direction,
   239  		levels:    levels,
   240  	}
   241  }
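
// exampleSharedEdgeKey is an illustrative sketch (not part of the upstream
// file): because begin folds only the non-ignored values into the digest, two
// requests that differ solely in an ignored dimension hash to the same key and
// share a single edge, mirroring the _droppedTagValue placeholder in the tags.
func exampleSharedEdgeKey(ctx context.Context, g *graph, a, b *transport.Request) bool {
	callA := g.begin(ctx, transport.Unary, _directionInbound, a)
	callB := g.begin(ctx, transport.Unary, _directionInbound, b)
	return callA.edge == callB.edge // true when a and b differ only in ignored tags
}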
   242  
   243  func (g *graph) getOrCreateEdge(key []byte, req *transport.Request, direction string, rpcType transport.Type) *edge {
   244  	if e := g.getEdge(key); e != nil {
   245  		return e
   246  	}
   247  	return g.createEdge(key, req, direction, rpcType)
   248  }
   249  
   250  func (g *graph) getEdge(key []byte) *edge {
   251  	g.edgesMu.RLock()
   252  	e := g.edges[string(key)]
   253  	g.edgesMu.RUnlock()
   254  	return e
   255  }
   256  
   257  func (g *graph) createEdge(key []byte, req *transport.Request, direction string, rpcType transport.Type) *edge {
   258  	g.edgesMu.Lock()
   259  	// Since we'll rarely hit this code path, the overhead of defer is acceptable.
   260  	defer g.edgesMu.Unlock()
   261  
   262  	if e, ok := g.edges[string(key)]; ok {
   263  		// Someone beat us to the punch.
   264  		return e
   265  	}
   266  
   267  	e := newEdge(g.logger, g.meter, g.ignoreMetricsTag, req, direction, rpcType)
   268  	g.edges[string(key)] = e
   269  	return e
   270  }
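
// getOrCreateEdge above is the classic read-mostly pattern: check under an
// RLock, and only on a miss take the write lock and re-check before creating.
// A stripped-down, illustrative sketch of the same idea (not part of the
// upstream file):
func exampleGetOrCreate(mu *sync.RWMutex, m map[string]int, key string, build func() int) int {
	mu.RLock()
	v, ok := m[key]
	mu.RUnlock()
	if ok {
		return v
	}
	mu.Lock()
	defer mu.Unlock()
	if existing, ok := m[key]; ok {
		// Another goroutine created the entry while we were waiting.
		return existing
	}
	v = build()
	m[key] = v
	return v
}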
   271  
   272  // An edge is a collection of RPC stats for a particular
   273  // caller-callee-transport-encoding-procedure-routing key-routing delegate edge in the service graph.
   274  type edge struct {
   275  	logger *zap.Logger
   276  
   277  	calls          *metrics.Counter
   278  	successes      *metrics.Counter
   279  	panics         *metrics.Counter
   280  	callerFailures *metrics.CounterVector
   281  	serverFailures *metrics.CounterVector
   282  
   283  	latencies            *metrics.Histogram
   284  	callerErrLatencies   *metrics.Histogram
   285  	serverErrLatencies   *metrics.Histogram
   286  	ttls                 *metrics.Histogram
   287  	timeoutTtls          *metrics.Histogram
   288  	requestPayloadSizes  *metrics.Histogram
   289  	responsePayloadSizes *metrics.Histogram
   290  	streaming            *streamEdge
   291  }
   292  
   293  // streamEdge metrics should only be used for streaming requests.
   294  type streamEdge struct {
   295  	sends         *metrics.Counter
   296  	sendSuccesses *metrics.Counter
   297  	sendFailures  *metrics.CounterVector
   298  
   299  	receives         *metrics.Counter
   300  	receiveSuccesses *metrics.Counter
   301  	receiveFailures  *metrics.CounterVector
   302  
   303  	streamDurations            *metrics.Histogram
   304  	streamRequestPayloadSizes  *metrics.Histogram
   305  	streamResponsePayloadSizes *metrics.Histogram
   306  
   307  	streamsActive *metrics.Gauge
   308  }
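
// exampleTrackStream is an illustrative sketch (not part of the upstream file)
// of how the active-streams gauge is meant to bracket a stream's lifetime,
// assuming the net/metrics Gauge exposes Inc and Dec as used elsewhere in this
// package.
func exampleTrackStream(s *streamEdge, runStream func() error) error {
	s.streamsActive.Inc()
	defer s.streamsActive.Dec()
	return runStream()
}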
   309  
   310  // newEdge constructs a new edge. Since Registries enforce metric uniqueness,
   311  // edges should be cached and re-used for each RPC.
   312  func newEdge(logger *zap.Logger, meter *metrics.Scope, tagToIgnore *metricsTagIgnore, req *transport.Request, direction string, rpcType transport.Type) *edge {
   313  	tags := tagToIgnore.tags(req, direction, rpcType)
   314  
   315  	// metrics for all RPCs
   316  	calls, err := meter.Counter(metrics.Spec{
   317  		Name:      "calls",
   318  		Help:      "Total number of RPCs.",
   319  		ConstTags: tags,
   320  	})
   321  	if err != nil {
   322  		logger.Error("Failed to create calls counter.", zap.Error(err))
   323  	}
   324  	successes, err := meter.Counter(metrics.Spec{
   325  		Name:      "successes",
   326  		Help:      "Number of successful RPCs.",
   327  		ConstTags: tags,
   328  	})
   329  	if err != nil {
   330  		logger.Error("Failed to create successes counter.", zap.Error(err))
   331  	}
   332  	panics, err := meter.Counter(metrics.Spec{
   333  		Name:      "panics",
   334  		Help:      "Number of RPCs failed because of panic.",
   335  		ConstTags: tags,
   336  	})
   337  	if err != nil {
   338  		logger.Error("Failed to create panics counter.", zap.Error(err))
   339  	}
   340  	callerFailures, err := meter.CounterVector(metrics.Spec{
   341  		Name:      "caller_failures",
   342  		Help:      "Number of RPCs failed because of caller error.",
   343  		ConstTags: tags,
   344  		VarTags:   []string{_error, _errorNameMetricsKey},
   345  	})
   346  	if err != nil {
   347  		logger.Error("Failed to create caller failures vector.", zap.Error(err))
   348  	}
   349  	serverFailures, err := meter.CounterVector(metrics.Spec{
   350  		Name:      "server_failures",
   351  		Help:      "Number of RPCs failed because of server error.",
   352  		ConstTags: tags,
   353  		VarTags:   []string{_error, _errorNameMetricsKey},
   354  	})
   355  	if err != nil {
   356  		logger.Error("Failed to create server failures vector.", zap.Error(err))
   357  	}
   358  
   359  	// metrics for only unary and oneway
   360  	var latencies, callerErrLatencies, serverErrLatencies, ttls, timeoutTtls,
   361  		requestPayloadSizes, responsePayloadSizes *metrics.Histogram
   362  	if rpcType == transport.Unary || rpcType == transport.Oneway {
   363  		latencies, err = meter.Histogram(metrics.HistogramSpec{
   364  			Spec: metrics.Spec{
   365  				Name:      "success_latency_ms",
   366  				Help:      "Latency distribution of successful RPCs.",
   367  				ConstTags: tags,
   368  			},
   369  			Unit:    time.Millisecond,
   370  			Buckets: _bucketsMs,
   371  		})
   372  		if err != nil {
   373  			logger.Error("Failed to create success latency distribution.", zap.Error(err))
   374  		}
   375  		callerErrLatencies, err = meter.Histogram(metrics.HistogramSpec{
   376  			Spec: metrics.Spec{
   377  				Name:      "caller_failure_latency_ms",
   378  				Help:      "Latency distribution of RPCs failed because of caller error.",
   379  				ConstTags: tags,
   380  			},
   381  			Unit:    time.Millisecond,
   382  			Buckets: _bucketsMs,
   383  		})
   384  		if err != nil {
   385  			logger.Error("Failed to create caller failure latency distribution.", zap.Error(err))
   386  		}
   387  		serverErrLatencies, err = meter.Histogram(metrics.HistogramSpec{
   388  			Spec: metrics.Spec{
   389  				Name:      "server_failure_latency_ms",
   390  				Help:      "Latency distribution of RPCs failed because of server error.",
   391  				ConstTags: tags,
   392  			},
   393  			Unit:    time.Millisecond,
   394  			Buckets: _bucketsMs,
   395  		})
   396  		if err != nil {
   397  			logger.Error("Failed to create server failure latency distribution.", zap.Error(err))
   398  		}
   399  		ttls, err = meter.Histogram(metrics.HistogramSpec{
   400  			Spec: metrics.Spec{
   401  				Name:      "ttl_ms",
   402  				Help:      "TTL distribution of the RPCs passed by the caller.",
   403  				ConstTags: tags,
   404  			},
   405  			Unit:    time.Millisecond,
   406  			Buckets: _bucketsMs,
   407  		})
   408  		if err != nil {
   409  			logger.Error("Failed to create ttl distribution.", zap.Error(err))
   410  		}
   411  		timeoutTtls, err = meter.Histogram(metrics.HistogramSpec{
   412  			Spec: metrics.Spec{
   413  				Name:      "timeout_ttl_ms",
   414  				Help:      "TTL distribution of the RPCs passed by the caller that failed due to timeout.",
   415  				ConstTags: tags,
   416  			},
   417  			Unit:    time.Millisecond,
   418  			Buckets: _bucketsMs,
   419  		})
   420  		if err != nil {
   421  			logger.Error("Failed to create timeout ttl distribution.", zap.Error(err))
   422  		}
   423  		requestPayloadSizes, err = meter.Histogram(metrics.HistogramSpec{
   424  			Spec: metrics.Spec{
   425  				Name:      "request_payload_size_bytes",
   426  				Help:      "Request payload size distribution of the RPCs in bytes",
   427  				ConstTags: tags,
   428  			},
   429  			Unit:    time.Millisecond, // Unit is not relevant for this histogram
   430  			Buckets: _bucketsBytes,
   431  		})
   432  		if err != nil {
   433  			logger.Error("Failed to create request payload size histogram.", zap.Error(err))
   434  		}
   435  		responsePayloadSizes, err = meter.Histogram(metrics.HistogramSpec{
   436  			Spec: metrics.Spec{
   437  				Name:      "response_payload_size_bytes",
   438  				Help:      "Response payload size distribution of the RPCs in bytes",
   439  				ConstTags: tags,
   440  			},
   441  			Unit:    time.Millisecond, // Unit is not relevant for this histogram
   442  			Buckets: _bucketsBytes,
   443  		})
   444  		if err != nil {
   445  			logger.Error("Failed to create response payload size histogram.", zap.Error(err))
   446  		}
   447  	}
   448  
   449  	// metrics for only streams
   450  	var streaming *streamEdge
   451  	if rpcType == transport.Streaming {
   452  		// sends
   453  		sends, err := meter.Counter(metrics.Spec{
   454  			Name:      "stream_sends",
   455  			Help:      "Total number of streaming messages sent.",
   456  			ConstTags: tags,
   457  		})
   458  		if err != nil {
   459  			logger.DPanic("Failed to create streaming sends counter.", zap.Error(err))
   460  		}
   461  		sendSuccesses, err := meter.Counter(metrics.Spec{
   462  			Name:      "stream_send_successes",
   463  			Help:      "Number of successful streaming messages sent.",
   464  			ConstTags: tags,
   465  		})
   466  		if err != nil {
   467  			logger.DPanic("Failed to create streaming send successes counter.", zap.Error(err))
   468  		}
   469  		sendFailures, err := meter.CounterVector(metrics.Spec{
   470  			Name:      "stream_send_failures",
   471  			Help:      "Number of streaming messages that failed to send.",
   472  			ConstTags: tags,
   473  			VarTags:   []string{_error},
   474  		})
   475  		if err != nil {
   476  			logger.DPanic("Failed to create streaming send failures counter.", zap.Error(err))
   477  		}
   478  
   479  		// receives
   480  		receives, err := meter.Counter(metrics.Spec{
   481  			Name:      "stream_receives",
   482  			Help:      "Total number of streaming messages received.",
   483  			ConstTags: tags,
   484  		})
   485  		if err != nil {
   486  			logger.DPanic("Failed to create streaming receives counter.", zap.Error(err))
   487  		}
   488  		receiveSuccesses, err := meter.Counter(metrics.Spec{
   489  			Name:      "stream_receive_successes",
   490  			Help:      "Number of successful streaming messages received.",
   491  			ConstTags: tags,
   492  		})
   493  		if err != nil {
   494  			logger.DPanic("Failed to create streaming receive successes counter.", zap.Error(err))
   495  		}
   496  		receiveFailures, err := meter.CounterVector(metrics.Spec{
   497  			Name:      "stream_receive_failures",
   498  			Help:      "Number of streaming messages that failed to be received.",
   499  			ConstTags: tags,
   500  			VarTags:   []string{_error},
   501  		})
   502  		if err != nil {
   503  			logger.DPanic("Failed to create streaming receive failures counter.", zap.Error(err))
   504  		}
   505  
   506  		// entire stream
   507  		streamDurations, err := meter.Histogram(metrics.HistogramSpec{
   508  			Spec: metrics.Spec{
   509  				Name:      "stream_duration_ms",
   510  				Help:      "Latency distribution of total stream duration.",
   511  				ConstTags: tags,
   512  			},
   513  			Unit:    time.Millisecond,
   514  			Buckets: _bucketsMs,
   515  		})
   516  		if err != nil {
   517  			logger.DPanic("Failed to create stream duration histogram.", zap.Error(err))
   518  		}
   519  
   520  		streamRequestPayloadSizes, err := meter.Histogram(metrics.HistogramSpec{
   521  			Spec: metrics.Spec{
   522  				Name:      "stream_request_payload_size_bytes",
   523  				Help:      "Stream request payload size distribution",
   524  				ConstTags: tags,
   525  			},
   526  			Unit:    time.Millisecond,
   527  			Buckets: _bucketsBytes,
   528  		})
   529  		if err != nil {
   530  			logger.DPanic("Failed to create stream request payload size histogram.", zap.Error(err))
   531  		}
   532  
   533  		streamResponsePayloadSizes, err := meter.Histogram(metrics.HistogramSpec{
   534  			Spec: metrics.Spec{
   535  				Name:      "stream_response_payload_size_bytes",
   536  				Help:      "Stream response payload size distribution",
   537  				ConstTags: tags,
   538  			},
   539  			Unit:    time.Millisecond,
   540  			Buckets: _bucketsBytes,
   541  		})
   542  		if err != nil {
   543  			logger.DPanic("Failed to create stream response payload size histogram.", zap.Error(err))
   544  		}
   545  
   546  		streamsActive, err := meter.Gauge(metrics.Spec{
   547  			Name:      "streams_active",
   548  			Help:      "Number of active streams.",
   549  			ConstTags: tags,
   550  		})
   551  		if err != nil {
   552  			logger.DPanic("Failed to create active stream gauge.", zap.Error(err))
   553  		}
   554  
   555  		streaming = &streamEdge{
   556  			sends:            sends,
   557  			sendSuccesses:    sendSuccesses,
   558  			sendFailures:     sendFailures,
   559  			receives:         receives,
   560  			receiveSuccesses: receiveSuccesses,
   561  			receiveFailures:  receiveFailures,
   562  
   563  			streamDurations:            streamDurations,
   564  			streamRequestPayloadSizes:  streamRequestPayloadSizes,
   565  			streamResponsePayloadSizes: streamResponsePayloadSizes,
   566  
   567  			streamsActive: streamsActive,
   568  		}
   569  	}
   570  
   571  	logger = logger.With(
   572  		zap.String("source", req.Caller),
   573  		zap.String("dest", req.Service),
   574  		zap.String("transport", unknownIfEmpty(req.Transport)),
   575  		zap.String("procedure", req.Procedure),
   576  		zap.String("encoding", string(req.Encoding)),
   577  		zap.String("routingKey", req.RoutingKey),
   578  		zap.String("routingDelegate", req.RoutingDelegate),
   579  		zap.String("direction", direction),
   580  	)
   581  	return &edge{
   582  		logger:               logger,
   583  		calls:                calls,
   584  		successes:            successes,
   585  		panics:               panics,
   586  		callerFailures:       callerFailures,
   587  		serverFailures:       serverFailures,
   588  		requestPayloadSizes:  requestPayloadSizes,
   589  		responsePayloadSizes: responsePayloadSizes,
   590  		latencies:            latencies,
   591  		callerErrLatencies:   callerErrLatencies,
   592  		serverErrLatencies:   serverErrLatencies,
   593  		ttls:                 ttls,
   594  		timeoutTtls:          timeoutTtls,
   595  		streaming:            streaming,
   596  	}
   597  }
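
// exampleRecordCall is an illustrative sketch (not part of the upstream file)
// of how an edge's counters are driven once per RPC elsewhere in this package,
// assuming the net/metrics Counter exposes Inc as used by the call type.
func exampleRecordCall(e *edge, succeeded bool) {
	e.calls.Inc()
	if succeeded {
		e.successes.Inc()
	}
}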
   598  
   599  // unknownIfEmpty works around the hard-coded default value of "default" in go.uber.org/net/metrics.
   600  func unknownIfEmpty(t string) string {
   601  	if t == "" {
   602  		t = "unknown"
   603  	}
   604  	return t
   605  }