go.uber.org/yarpc@v1.72.1/internal/observability/call.go (about)

     1  // Copyright (c) 2022 Uber Technologies, Inc.
     2  //
     3  // Permission is hereby granted, free of charge, to any person obtaining a copy
     4  // of this software and associated documentation files (the "Software"), to deal
     5  // in the Software without restriction, including without limitation the rights
     6  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     7  // copies of the Software, and to permit persons to whom the Software is
     8  // furnished to do so, subject to the following conditions:
     9  //
    10  // The above copyright notice and this permission notice shall be included in
    11  // all copies or substantial portions of the Software.
    12  //
    13  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    14  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    15  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    16  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    17  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    18  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    19  // THE SOFTWARE.
    20  
    21  package observability
    22  
    23  import (
    24  	"context"
    25  	"fmt"
    26  	"time"
    27  
    28  	"go.uber.org/yarpc/api/transport"
    29  	"go.uber.org/yarpc/yarpcerrors"
    30  	"go.uber.org/zap"
    31  	"go.uber.org/zap/zapcore"
    32  )
    33  
    34  const (
    35  	_error = "error"
    36  
    37  	_errorNameMetricsKey = "error_name"
    38  	_notSet              = "__not_set__"
    39  
    40  	_errorNameLogKey    = "errorName"
    41  	_errorCodeLogKey    = "errorCode"
    42  	_errorDetailsLogKey = "errorDetails"
    43  
    44  	_successfulInbound  = "Handled inbound request."
    45  	_successfulOutbound = "Made outbound call."
    46  	_errorInbound       = "Error handling inbound request."
    47  	_errorOutbound      = "Error making outbound call."
    48  
    49  	_successStreamOpen  = "Successfully created stream"
    50  	_successStreamClose = "Successfully closed stream"
    51  	_errorStreamOpen    = "Error creating stream"
    52  	_errorStreamClose   = "Error closing stream"
    53  
    54  	_dropped           = "dropped"
    55  	_droppedAppErrLog  = "dropped application error due to context timeout or cancelation"
    56  	_droppedErrLogFmt  = "dropped error due to context timeout or cancelation: %v"
    57  	_droppedSuccessLog = "dropped handler success due to context timeout or cancelation"
    58  )
    59  
    60  // A call represents a single RPC along an edge.
    61  //
    62  // To prevent allocating on the heap on the request path, it's a value instead
    63  // of a pointer.
    64  type call struct {
    65  	edge    *edge
    66  	extract ContextExtractor
    67  
    68  	started   time.Time
    69  	ctx       context.Context
    70  	req       *transport.Request
    71  	rpcType   transport.Type
    72  	direction directionName
    73  
    74  	levels *levels
    75  }
    76  
    77  type callResult struct {
    78  	err            error
    79  	ctxOverrideErr error
    80  
    81  	isApplicationError   bool
    82  	applicationErrorMeta *transport.ApplicationErrorMeta
    83  
    84  	requestSize  int
    85  	responseSize int
    86  }
    87  
    88  type levels struct {
    89  	success, failure, applicationError, serverError, clientError zapcore.Level
    90  
    91  	// useApplicationErrorFailureLevels is used to know which levels should be
    92  	// used between applicationError/failure and clientError/serverError.
    93  	// by default serverError and clientError will be used.
    94  	// useApplicationErrorFailureLevels should be set to true if applicationError
    95  	// or failure are set.
    96  	useApplicationErrorFailureLevels bool
    97  }
    98  
    99  func (c call) End(res callResult) {
   100  	c.endWithAppError(res)
   101  }
   102  
   103  func (c call) EndCallWithAppError(res callResult) {
   104  	c.endWithAppError(res)
   105  }
   106  
   107  func (c call) EndHandleWithAppError(res callResult) {
   108  	if res.ctxOverrideErr == nil {
   109  		c.endWithAppError(res)
   110  		return
   111  	}
   112  
   113  	// We'll override the user's response with the appropriate context error. Also, log
   114  	// the dropped response.
   115  	var droppedField zap.Field
   116  	if res.isApplicationError && res.err == nil { // Thrift exceptions
   117  		droppedField = zap.String(_dropped, _droppedAppErrLog)
   118  	} else if res.err != nil { // other errors
   119  		droppedField = zap.String(_dropped, fmt.Sprintf(_droppedErrLogFmt, res.err))
   120  	} else {
   121  		droppedField = zap.String(_dropped, _droppedSuccessLog)
   122  	}
   123  
   124  	c.endWithAppError(
   125  		callResult{
   126  			err:          res.ctxOverrideErr,
   127  			requestSize:  res.requestSize,
   128  			responseSize: res.responseSize,
   129  		},
   130  		droppedField,
   131  	)
   132  }
   133  
   134  func (c call) endWithAppError(
   135  	res callResult,
   136  	extraLogFields ...zap.Field) {
   137  	elapsed := _timeNow().Sub(c.started)
   138  	c.endLogs(elapsed, res.err, res.isApplicationError, res.applicationErrorMeta, extraLogFields...)
   139  	c.endStats(elapsed, res)
   140  }
   141  
   142  // EndWithPanic ends the call with additional panic metrics
   143  func (c call) EndWithPanic(err error) {
   144  	c.edge.panics.Inc()
   145  	c.endWithAppError(callResult{err: err})
   146  }
   147  
   148  func (c call) endLogs(
   149  	elapsed time.Duration,
   150  	err error,
   151  	isApplicationError bool,
   152  	applicationErrorMeta *transport.ApplicationErrorMeta,
   153  	extraLogFields ...zap.Field) {
   154  	appErrBitWithNoError := isApplicationError && err == nil // ie Thrift exception
   155  
   156  	var ce *zapcore.CheckedEntry
   157  	if err == nil && !isApplicationError {
   158  		msg := _successfulInbound
   159  		if c.direction != _directionInbound {
   160  			msg = _successfulOutbound
   161  		}
   162  		ce = c.edge.logger.Check(c.levels.success, msg)
   163  	} else {
   164  		msg := _errorInbound
   165  		if c.direction != _directionInbound {
   166  			msg = _errorOutbound
   167  		}
   168  
   169  		var lvl zapcore.Level
   170  		// use applicationError and failure logging levels
   171  		// this is deprecated and will only be used by yarpc service
   172  		// which have added configuration for loggers.
   173  		if c.levels.useApplicationErrorFailureLevels {
   174  			lvl = c.levels.failure
   175  
   176  			// For logging purposes, application errors are
   177  			//  - Thrift exceptions (appErrBitWithNoError == true)
   178  			//  - `yarpcerror`s with error details (ie created with `encoding/protobuf.NewError`)
   179  			//
   180  			// This will be the least surprising behavior for users migrating from
   181  			// Thrift exceptions to Protobuf error details.
   182  			//
   183  			// Unfortunately, all errors returned from a Protobuf handler are marked as
   184  			// an application error on the 'transport.ResponseWriter'. Therefore, we
   185  			// distinguish an application error from a regular error by inspecting if an
   186  			// error detail was set.
   187  			//
   188  			// https://github.com/yarpc/yarpc-go/pull/1912
   189  			hasErrDetails := len(yarpcerrors.FromError(err).Details()) > 0
   190  			if appErrBitWithNoError || (isApplicationError && hasErrDetails) {
   191  				lvl = c.levels.applicationError
   192  			}
   193  		} else {
   194  			var code *yarpcerrors.Code
   195  			lvl = c.levels.serverError
   196  
   197  			if appErrBitWithNoError { // thrift exception
   198  				if applicationErrorMeta != nil && applicationErrorMeta.Code != nil { // thrift exception with rpc.code
   199  					code = applicationErrorMeta.Code
   200  				} else {
   201  					lvl = c.levels.clientError
   202  				}
   203  			}
   204  
   205  			if err != nil { // tchannel/HTTP/gRPC errors
   206  				c := yarpcerrors.FromError(err).Code()
   207  				code = &c
   208  			}
   209  
   210  			if code != nil {
   211  				if fault := yarpcerrors.GetFaultTypeFromCode(*code); fault == yarpcerrors.ClientFault {
   212  					lvl = c.levels.clientError
   213  				}
   214  			}
   215  		}
   216  
   217  		ce = c.edge.logger.Check(lvl, msg)
   218  	}
   219  
   220  	if ce == nil {
   221  		return
   222  	}
   223  
   224  	fields := make([]zapcore.Field, 0, 9+len(extraLogFields))
   225  	fields = append(fields, zap.String("rpcType", c.rpcType.String()))
   226  	fields = append(fields, zap.Duration("latency", elapsed))
   227  	fields = append(fields, zap.Bool("successful", err == nil && !isApplicationError))
   228  	fields = append(fields, c.extract(c.ctx))
   229  	if deadlineTime, ok := c.ctx.Deadline(); ok {
   230  		fields = append(fields, zap.Duration("timeout", deadlineTime.Sub(c.started)))
   231  	}
   232  
   233  	if appErrBitWithNoError { // Thrift exception
   234  		fields = append(fields, zap.String(_error, "application_error"))
   235  		if applicationErrorMeta != nil {
   236  			if applicationErrorMeta.Code != nil {
   237  				fields = append(fields, zap.String(_errorCodeLogKey, applicationErrorMeta.Code.String()))
   238  			}
   239  			if applicationErrorMeta.Name != "" {
   240  				fields = append(fields, zap.String(_errorNameLogKey, applicationErrorMeta.Name))
   241  			}
   242  			if applicationErrorMeta.Details != "" {
   243  				fields = append(fields, zap.String(_errorDetailsLogKey, applicationErrorMeta.Details))
   244  			}
   245  		}
   246  
   247  	} else if isApplicationError { // Protobuf error
   248  		fields = append(fields, zap.Error(err))
   249  		fields = append(fields, zap.String(_errorCodeLogKey, yarpcerrors.FromError(err).Code().String()))
   250  		if applicationErrorMeta != nil {
   251  			// ignore transport.ApplicationErrorMeta#Code, since we should get this
   252  			// directly from the error
   253  			if applicationErrorMeta.Name != "" {
   254  				fields = append(fields, zap.String(_errorNameLogKey, applicationErrorMeta.Name))
   255  			}
   256  			if applicationErrorMeta.Details != "" {
   257  				fields = append(fields, zap.String(_errorDetailsLogKey, applicationErrorMeta.Details))
   258  			}
   259  		}
   260  
   261  	} else if err != nil { // unknown error
   262  		fields = append(fields, zap.Error(err))
   263  		fields = append(fields, zap.String(_errorCodeLogKey, yarpcerrors.FromError(err).Code().String()))
   264  	}
   265  
   266  	fields = append(fields, extraLogFields...)
   267  	ce.Write(fields...)
   268  }
   269  
   270  func (c call) endStats(
   271  	elapsed time.Duration,
   272  	res callResult,
   273  ) {
   274  	c.edge.calls.Inc()
   275  
   276  	if deadlineTime, ok := c.ctx.Deadline(); ok {
   277  		c.edge.ttls.Observe(deadlineTime.Sub(c.started))
   278  	}
   279  
   280  	c.edge.requestPayloadSizes.IncBucket(int64(res.requestSize))
   281  
   282  	if res.err == nil && !res.isApplicationError {
   283  		c.edge.successes.Inc()
   284  		c.edge.latencies.Observe(elapsed)
   285  
   286  		if c.rpcType == transport.Unary {
   287  			c.edge.responsePayloadSizes.IncBucket(int64(res.responseSize))
   288  		}
   289  		return
   290  	}
   291  
   292  	appErrorName := _notSet
   293  	if res.applicationErrorMeta != nil && res.applicationErrorMeta.Name != "" {
   294  		appErrorName = res.applicationErrorMeta.Name
   295  	}
   296  
   297  	if yarpcerrors.IsStatus(res.err) {
   298  		status := yarpcerrors.FromError(res.err)
   299  		errCode := status.Code()
   300  		c.endStatsFromFault(elapsed, errCode, appErrorName)
   301  		return
   302  	}
   303  
   304  	if res.isApplicationError {
   305  		if res.applicationErrorMeta != nil && res.applicationErrorMeta.Code != nil {
   306  			c.endStatsFromFault(elapsed, *res.applicationErrorMeta.Code, appErrorName)
   307  			return
   308  		}
   309  
   310  		// It is an application error but not a Status and no YARPC Code is found.
   311  		// Assume it's a caller's fault and emit generic error data.
   312  		c.edge.callerErrLatencies.Observe(elapsed)
   313  
   314  		if counter, err := c.edge.callerFailures.Get(
   315  			_error, "application_error",
   316  			_errorNameMetricsKey, appErrorName,
   317  		); err == nil {
   318  			counter.Inc()
   319  		}
   320  		return
   321  	}
   322  
   323  	c.edge.serverErrLatencies.Observe(elapsed)
   324  	if counter, err := c.edge.serverFailures.Get(
   325  		_error, "unknown_internal_yarpc",
   326  		_errorNameMetricsKey, _notSet,
   327  	); err == nil {
   328  		counter.Inc()
   329  	}
   330  }
   331  
   332  // Emits stats based on a caller or server failure, inferred by a YARPC Code.
   333  func (c call) endStatsFromFault(elapsed time.Duration, code yarpcerrors.Code, applicationErrorName string) {
   334  	switch yarpcerrors.GetFaultTypeFromCode(code) {
   335  	case yarpcerrors.ClientFault:
   336  		c.edge.callerErrLatencies.Observe(elapsed)
   337  		if counter, err := c.edge.callerFailures.Get(
   338  			_error, code.String(),
   339  			_errorNameMetricsKey, applicationErrorName,
   340  		); err == nil {
   341  			counter.Inc()
   342  		}
   343  
   344  	case yarpcerrors.ServerFault:
   345  		c.edge.serverErrLatencies.Observe(elapsed)
   346  		if counter, err := c.edge.serverFailures.Get(
   347  			_error, code.String(),
   348  			_errorNameMetricsKey, applicationErrorName,
   349  		); err == nil {
   350  			counter.Inc()
   351  		}
   352  		if code == yarpcerrors.CodeDeadlineExceeded {
   353  			if deadlineTime, ok := c.ctx.Deadline(); ok {
   354  				c.edge.timeoutTtls.Observe(deadlineTime.Sub(c.started))
   355  			}
   356  		}
   357  
   358  	default:
   359  		// If this code is executed we've hit an error code outside the usual error
   360  		// code range, so we'll just log the string representation of that code.
   361  		c.edge.serverErrLatencies.Observe(elapsed)
   362  		if counter, err := c.edge.serverFailures.Get(
   363  			_error, code.String(),
   364  			_errorNameMetricsKey, applicationErrorName,
   365  		); err == nil {
   366  			counter.Inc()
   367  		}
   368  	}
   369  }
   370  
   371  // EndStreamHandshake should be invoked immediately after successfully creating
   372  // a stream.
   373  func (c call) EndStreamHandshake() {
   374  	c.EndStreamHandshakeWithError(nil)
   375  }
   376  
   377  // EndStreamHandshakeWithError should be invoked immediately after attempting to
   378  // create a stream.
   379  func (c call) EndStreamHandshakeWithError(err error) {
   380  	c.logStreamEvent(err, err == nil, _successStreamOpen, _errorStreamOpen)
   381  
   382  	c.edge.calls.Inc()
   383  	if err == nil {
   384  		c.edge.successes.Inc()
   385  		c.edge.streaming.streamsActive.Inc()
   386  		return
   387  	}
   388  
   389  	c.emitStreamError(err)
   390  }
   391  
   392  // EndStream should be invoked immediately after a stream closes.
   393  func (c call) EndStream(err error) {
   394  	elapsed := _timeNow().Sub(c.started)
   395  	c.logStreamEvent(err, err == nil, _successStreamClose, _errorStreamClose, zap.Duration("duration", elapsed))
   396  
   397  	c.edge.streaming.streamsActive.Dec()
   398  	c.edge.streaming.streamDurations.Observe(elapsed)
   399  	c.emitStreamError(err)
   400  }
   401  
   402  // EndStreamWithPanic ends the stream call with additional panic metrics
   403  func (c call) EndStreamWithPanic(err error) {
   404  	c.edge.panics.Inc()
   405  	c.EndStream(err)
   406  }
   407  
   408  // This function resembles EndStats for unary calls. However, we do not special
   409  // case application errors and it does not measure failure latencies as those
   410  // measurements are irrelevant for streams.
   411  func (c call) emitStreamError(err error) {
   412  	if err == nil {
   413  		return
   414  	}
   415  
   416  	if !yarpcerrors.IsStatus(err) {
   417  		if counter, err := c.edge.serverFailures.Get(
   418  			_error, "unknown_internal_yarpc",
   419  			_errorNameMetricsKey, _notSet,
   420  		); err == nil {
   421  			counter.Inc()
   422  		}
   423  		return
   424  	}
   425  
   426  	// Emit finer grained metrics since the error is a yarpcerrors.Status.
   427  	errCode := yarpcerrors.FromError(err).Code()
   428  
   429  	switch yarpcerrors.GetFaultTypeFromCode(yarpcerrors.FromError(err).Code()) {
   430  	case yarpcerrors.ClientFault:
   431  		if counter, err2 := c.edge.callerFailures.Get(
   432  			_error, errCode.String(),
   433  			_errorNameMetricsKey, _notSet,
   434  		); err2 != nil {
   435  			c.edge.logger.DPanic("could not retrieve caller failures counter", zap.Error(err2))
   436  		} else {
   437  			counter.Inc()
   438  		}
   439  
   440  	case yarpcerrors.ServerFault:
   441  		if counter, err2 := c.edge.serverFailures.Get(
   442  			_error, errCode.String(),
   443  			_errorNameMetricsKey, _notSet,
   444  		); err2 != nil {
   445  			c.edge.logger.DPanic("could not retrieve server failures counter", zap.Error(err2))
   446  		} else {
   447  			counter.Inc()
   448  		}
   449  
   450  	default:
   451  		// If this code is executed we've hit an error code outside the usual error
   452  		// code range, so we'll just log the string representation of that code.
   453  		if counter, err2 := c.edge.serverFailures.Get(
   454  			_error, errCode.String(),
   455  			_errorNameMetricsKey, _notSet,
   456  		); err2 != nil {
   457  			c.edge.logger.DPanic("could not retrieve server failures counter", zap.Error(err2))
   458  		} else {
   459  			counter.Inc()
   460  		}
   461  	}
   462  }
   463  
   464  // logStreamEvent is a generic logging function useful for logging stream
   465  // events.
   466  func (c call) logStreamEvent(err error, success bool, succMsg, errMsg string, extraFields ...zap.Field) {
   467  	var ce *zapcore.CheckedEntry
   468  	// Success is usually only when the error is nil.
   469  	// The only exception is when emitting an error from ReceiveMessage, which
   470  	// returns EOF when the stream closes normally.
   471  	if success {
   472  		ce = c.edge.logger.Check(c.levels.success, succMsg)
   473  	} else {
   474  		var lvl zapcore.Level
   475  		if c.levels.useApplicationErrorFailureLevels {
   476  			lvl = c.levels.failure
   477  		} else {
   478  			lvl = c.levels.serverError
   479  			code := yarpcerrors.FromError(err).Code()
   480  			if fault := yarpcerrors.GetFaultTypeFromCode(code); fault == yarpcerrors.ClientFault {
   481  				lvl = c.levels.clientError
   482  			}
   483  		}
   484  
   485  		ce = c.edge.logger.Check(lvl, errMsg)
   486  	}
   487  
   488  	fields := []zap.Field{
   489  		zap.String("rpcType", c.rpcType.String()),
   490  		zap.Bool("successful", success),
   491  		c.extract(c.ctx),
   492  		zap.Error(err), // no-op if err == nil
   493  	}
   494  	fields = append(fields, extraFields...)
   495  
   496  	ce.Write(fields...)
   497  }
   498  
   499  // inteded for metric tags, this returns the yarpcerrors.Status error code name
   500  // or "unknown_internal_yarpc"
   501  func errToMetricString(err error) string {
   502  	if yarpcerrors.IsStatus(err) {
   503  		return yarpcerrors.FromError(err).Code().String()
   504  	}
   505  	return "unknown_internal_yarpc"
   506  }