go.uber.org/yarpc@v1.72.1/internal/observability/call.go (about) 1 // Copyright (c) 2022 Uber Technologies, Inc. 2 // 3 // Permission is hereby granted, free of charge, to any person obtaining a copy 4 // of this software and associated documentation files (the "Software"), to deal 5 // in the Software without restriction, including without limitation the rights 6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 // copies of the Software, and to permit persons to whom the Software is 8 // furnished to do so, subject to the following conditions: 9 // 10 // The above copyright notice and this permission notice shall be included in 11 // all copies or substantial portions of the Software. 12 // 13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 // THE SOFTWARE. 20 21 package observability 22 23 import ( 24 "context" 25 "fmt" 26 "time" 27 28 "go.uber.org/yarpc/api/transport" 29 "go.uber.org/yarpc/yarpcerrors" 30 "go.uber.org/zap" 31 "go.uber.org/zap/zapcore" 32 ) 33 34 const ( 35 _error = "error" 36 37 _errorNameMetricsKey = "error_name" 38 _notSet = "__not_set__" 39 40 _errorNameLogKey = "errorName" 41 _errorCodeLogKey = "errorCode" 42 _errorDetailsLogKey = "errorDetails" 43 44 _successfulInbound = "Handled inbound request." 45 _successfulOutbound = "Made outbound call." 46 _errorInbound = "Error handling inbound request." 47 _errorOutbound = "Error making outbound call." 48 49 _successStreamOpen = "Successfully created stream" 50 _successStreamClose = "Successfully closed stream" 51 _errorStreamOpen = "Error creating stream" 52 _errorStreamClose = "Error closing stream" 53 54 _dropped = "dropped" 55 _droppedAppErrLog = "dropped application error due to context timeout or cancelation" 56 _droppedErrLogFmt = "dropped error due to context timeout or cancelation: %v" 57 _droppedSuccessLog = "dropped handler success due to context timeout or cancelation" 58 ) 59 60 // A call represents a single RPC along an edge. 61 // 62 // To prevent allocating on the heap on the request path, it's a value instead 63 // of a pointer. 64 type call struct { 65 edge *edge 66 extract ContextExtractor 67 68 started time.Time 69 ctx context.Context 70 req *transport.Request 71 rpcType transport.Type 72 direction directionName 73 74 levels *levels 75 } 76 77 type callResult struct { 78 err error 79 ctxOverrideErr error 80 81 isApplicationError bool 82 applicationErrorMeta *transport.ApplicationErrorMeta 83 84 requestSize int 85 responseSize int 86 } 87 88 type levels struct { 89 success, failure, applicationError, serverError, clientError zapcore.Level 90 91 // useApplicationErrorFailureLevels is used to know which levels should be 92 // used between applicationError/failure and clientError/serverError. 93 // by default serverError and clientError will be used. 94 // useApplicationErrorFailureLevels should be set to true if applicationError 95 // or failure are set. 96 useApplicationErrorFailureLevels bool 97 } 98 99 func (c call) End(res callResult) { 100 c.endWithAppError(res) 101 } 102 103 func (c call) EndCallWithAppError(res callResult) { 104 c.endWithAppError(res) 105 } 106 107 func (c call) EndHandleWithAppError(res callResult) { 108 if res.ctxOverrideErr == nil { 109 c.endWithAppError(res) 110 return 111 } 112 113 // We'll override the user's response with the appropriate context error. Also, log 114 // the dropped response. 115 var droppedField zap.Field 116 if res.isApplicationError && res.err == nil { // Thrift exceptions 117 droppedField = zap.String(_dropped, _droppedAppErrLog) 118 } else if res.err != nil { // other errors 119 droppedField = zap.String(_dropped, fmt.Sprintf(_droppedErrLogFmt, res.err)) 120 } else { 121 droppedField = zap.String(_dropped, _droppedSuccessLog) 122 } 123 124 c.endWithAppError( 125 callResult{ 126 err: res.ctxOverrideErr, 127 requestSize: res.requestSize, 128 responseSize: res.responseSize, 129 }, 130 droppedField, 131 ) 132 } 133 134 func (c call) endWithAppError( 135 res callResult, 136 extraLogFields ...zap.Field) { 137 elapsed := _timeNow().Sub(c.started) 138 c.endLogs(elapsed, res.err, res.isApplicationError, res.applicationErrorMeta, extraLogFields...) 139 c.endStats(elapsed, res) 140 } 141 142 // EndWithPanic ends the call with additional panic metrics 143 func (c call) EndWithPanic(err error) { 144 c.edge.panics.Inc() 145 c.endWithAppError(callResult{err: err}) 146 } 147 148 func (c call) endLogs( 149 elapsed time.Duration, 150 err error, 151 isApplicationError bool, 152 applicationErrorMeta *transport.ApplicationErrorMeta, 153 extraLogFields ...zap.Field) { 154 appErrBitWithNoError := isApplicationError && err == nil // ie Thrift exception 155 156 var ce *zapcore.CheckedEntry 157 if err == nil && !isApplicationError { 158 msg := _successfulInbound 159 if c.direction != _directionInbound { 160 msg = _successfulOutbound 161 } 162 ce = c.edge.logger.Check(c.levels.success, msg) 163 } else { 164 msg := _errorInbound 165 if c.direction != _directionInbound { 166 msg = _errorOutbound 167 } 168 169 var lvl zapcore.Level 170 // use applicationError and failure logging levels 171 // this is deprecated and will only be used by yarpc service 172 // which have added configuration for loggers. 173 if c.levels.useApplicationErrorFailureLevels { 174 lvl = c.levels.failure 175 176 // For logging purposes, application errors are 177 // - Thrift exceptions (appErrBitWithNoError == true) 178 // - `yarpcerror`s with error details (ie created with `encoding/protobuf.NewError`) 179 // 180 // This will be the least surprising behavior for users migrating from 181 // Thrift exceptions to Protobuf error details. 182 // 183 // Unfortunately, all errors returned from a Protobuf handler are marked as 184 // an application error on the 'transport.ResponseWriter'. Therefore, we 185 // distinguish an application error from a regular error by inspecting if an 186 // error detail was set. 187 // 188 // https://github.com/yarpc/yarpc-go/pull/1912 189 hasErrDetails := len(yarpcerrors.FromError(err).Details()) > 0 190 if appErrBitWithNoError || (isApplicationError && hasErrDetails) { 191 lvl = c.levels.applicationError 192 } 193 } else { 194 var code *yarpcerrors.Code 195 lvl = c.levels.serverError 196 197 if appErrBitWithNoError { // thrift exception 198 if applicationErrorMeta != nil && applicationErrorMeta.Code != nil { // thrift exception with rpc.code 199 code = applicationErrorMeta.Code 200 } else { 201 lvl = c.levels.clientError 202 } 203 } 204 205 if err != nil { // tchannel/HTTP/gRPC errors 206 c := yarpcerrors.FromError(err).Code() 207 code = &c 208 } 209 210 if code != nil { 211 if fault := yarpcerrors.GetFaultTypeFromCode(*code); fault == yarpcerrors.ClientFault { 212 lvl = c.levels.clientError 213 } 214 } 215 } 216 217 ce = c.edge.logger.Check(lvl, msg) 218 } 219 220 if ce == nil { 221 return 222 } 223 224 fields := make([]zapcore.Field, 0, 9+len(extraLogFields)) 225 fields = append(fields, zap.String("rpcType", c.rpcType.String())) 226 fields = append(fields, zap.Duration("latency", elapsed)) 227 fields = append(fields, zap.Bool("successful", err == nil && !isApplicationError)) 228 fields = append(fields, c.extract(c.ctx)) 229 if deadlineTime, ok := c.ctx.Deadline(); ok { 230 fields = append(fields, zap.Duration("timeout", deadlineTime.Sub(c.started))) 231 } 232 233 if appErrBitWithNoError { // Thrift exception 234 fields = append(fields, zap.String(_error, "application_error")) 235 if applicationErrorMeta != nil { 236 if applicationErrorMeta.Code != nil { 237 fields = append(fields, zap.String(_errorCodeLogKey, applicationErrorMeta.Code.String())) 238 } 239 if applicationErrorMeta.Name != "" { 240 fields = append(fields, zap.String(_errorNameLogKey, applicationErrorMeta.Name)) 241 } 242 if applicationErrorMeta.Details != "" { 243 fields = append(fields, zap.String(_errorDetailsLogKey, applicationErrorMeta.Details)) 244 } 245 } 246 247 } else if isApplicationError { // Protobuf error 248 fields = append(fields, zap.Error(err)) 249 fields = append(fields, zap.String(_errorCodeLogKey, yarpcerrors.FromError(err).Code().String())) 250 if applicationErrorMeta != nil { 251 // ignore transport.ApplicationErrorMeta#Code, since we should get this 252 // directly from the error 253 if applicationErrorMeta.Name != "" { 254 fields = append(fields, zap.String(_errorNameLogKey, applicationErrorMeta.Name)) 255 } 256 if applicationErrorMeta.Details != "" { 257 fields = append(fields, zap.String(_errorDetailsLogKey, applicationErrorMeta.Details)) 258 } 259 } 260 261 } else if err != nil { // unknown error 262 fields = append(fields, zap.Error(err)) 263 fields = append(fields, zap.String(_errorCodeLogKey, yarpcerrors.FromError(err).Code().String())) 264 } 265 266 fields = append(fields, extraLogFields...) 267 ce.Write(fields...) 268 } 269 270 func (c call) endStats( 271 elapsed time.Duration, 272 res callResult, 273 ) { 274 c.edge.calls.Inc() 275 276 if deadlineTime, ok := c.ctx.Deadline(); ok { 277 c.edge.ttls.Observe(deadlineTime.Sub(c.started)) 278 } 279 280 c.edge.requestPayloadSizes.IncBucket(int64(res.requestSize)) 281 282 if res.err == nil && !res.isApplicationError { 283 c.edge.successes.Inc() 284 c.edge.latencies.Observe(elapsed) 285 286 if c.rpcType == transport.Unary { 287 c.edge.responsePayloadSizes.IncBucket(int64(res.responseSize)) 288 } 289 return 290 } 291 292 appErrorName := _notSet 293 if res.applicationErrorMeta != nil && res.applicationErrorMeta.Name != "" { 294 appErrorName = res.applicationErrorMeta.Name 295 } 296 297 if yarpcerrors.IsStatus(res.err) { 298 status := yarpcerrors.FromError(res.err) 299 errCode := status.Code() 300 c.endStatsFromFault(elapsed, errCode, appErrorName) 301 return 302 } 303 304 if res.isApplicationError { 305 if res.applicationErrorMeta != nil && res.applicationErrorMeta.Code != nil { 306 c.endStatsFromFault(elapsed, *res.applicationErrorMeta.Code, appErrorName) 307 return 308 } 309 310 // It is an application error but not a Status and no YARPC Code is found. 311 // Assume it's a caller's fault and emit generic error data. 312 c.edge.callerErrLatencies.Observe(elapsed) 313 314 if counter, err := c.edge.callerFailures.Get( 315 _error, "application_error", 316 _errorNameMetricsKey, appErrorName, 317 ); err == nil { 318 counter.Inc() 319 } 320 return 321 } 322 323 c.edge.serverErrLatencies.Observe(elapsed) 324 if counter, err := c.edge.serverFailures.Get( 325 _error, "unknown_internal_yarpc", 326 _errorNameMetricsKey, _notSet, 327 ); err == nil { 328 counter.Inc() 329 } 330 } 331 332 // Emits stats based on a caller or server failure, inferred by a YARPC Code. 333 func (c call) endStatsFromFault(elapsed time.Duration, code yarpcerrors.Code, applicationErrorName string) { 334 switch yarpcerrors.GetFaultTypeFromCode(code) { 335 case yarpcerrors.ClientFault: 336 c.edge.callerErrLatencies.Observe(elapsed) 337 if counter, err := c.edge.callerFailures.Get( 338 _error, code.String(), 339 _errorNameMetricsKey, applicationErrorName, 340 ); err == nil { 341 counter.Inc() 342 } 343 344 case yarpcerrors.ServerFault: 345 c.edge.serverErrLatencies.Observe(elapsed) 346 if counter, err := c.edge.serverFailures.Get( 347 _error, code.String(), 348 _errorNameMetricsKey, applicationErrorName, 349 ); err == nil { 350 counter.Inc() 351 } 352 if code == yarpcerrors.CodeDeadlineExceeded { 353 if deadlineTime, ok := c.ctx.Deadline(); ok { 354 c.edge.timeoutTtls.Observe(deadlineTime.Sub(c.started)) 355 } 356 } 357 358 default: 359 // If this code is executed we've hit an error code outside the usual error 360 // code range, so we'll just log the string representation of that code. 361 c.edge.serverErrLatencies.Observe(elapsed) 362 if counter, err := c.edge.serverFailures.Get( 363 _error, code.String(), 364 _errorNameMetricsKey, applicationErrorName, 365 ); err == nil { 366 counter.Inc() 367 } 368 } 369 } 370 371 // EndStreamHandshake should be invoked immediately after successfully creating 372 // a stream. 373 func (c call) EndStreamHandshake() { 374 c.EndStreamHandshakeWithError(nil) 375 } 376 377 // EndStreamHandshakeWithError should be invoked immediately after attempting to 378 // create a stream. 379 func (c call) EndStreamHandshakeWithError(err error) { 380 c.logStreamEvent(err, err == nil, _successStreamOpen, _errorStreamOpen) 381 382 c.edge.calls.Inc() 383 if err == nil { 384 c.edge.successes.Inc() 385 c.edge.streaming.streamsActive.Inc() 386 return 387 } 388 389 c.emitStreamError(err) 390 } 391 392 // EndStream should be invoked immediately after a stream closes. 393 func (c call) EndStream(err error) { 394 elapsed := _timeNow().Sub(c.started) 395 c.logStreamEvent(err, err == nil, _successStreamClose, _errorStreamClose, zap.Duration("duration", elapsed)) 396 397 c.edge.streaming.streamsActive.Dec() 398 c.edge.streaming.streamDurations.Observe(elapsed) 399 c.emitStreamError(err) 400 } 401 402 // EndStreamWithPanic ends the stream call with additional panic metrics 403 func (c call) EndStreamWithPanic(err error) { 404 c.edge.panics.Inc() 405 c.EndStream(err) 406 } 407 408 // This function resembles EndStats for unary calls. However, we do not special 409 // case application errors and it does not measure failure latencies as those 410 // measurements are irrelevant for streams. 411 func (c call) emitStreamError(err error) { 412 if err == nil { 413 return 414 } 415 416 if !yarpcerrors.IsStatus(err) { 417 if counter, err := c.edge.serverFailures.Get( 418 _error, "unknown_internal_yarpc", 419 _errorNameMetricsKey, _notSet, 420 ); err == nil { 421 counter.Inc() 422 } 423 return 424 } 425 426 // Emit finer grained metrics since the error is a yarpcerrors.Status. 427 errCode := yarpcerrors.FromError(err).Code() 428 429 switch yarpcerrors.GetFaultTypeFromCode(yarpcerrors.FromError(err).Code()) { 430 case yarpcerrors.ClientFault: 431 if counter, err2 := c.edge.callerFailures.Get( 432 _error, errCode.String(), 433 _errorNameMetricsKey, _notSet, 434 ); err2 != nil { 435 c.edge.logger.DPanic("could not retrieve caller failures counter", zap.Error(err2)) 436 } else { 437 counter.Inc() 438 } 439 440 case yarpcerrors.ServerFault: 441 if counter, err2 := c.edge.serverFailures.Get( 442 _error, errCode.String(), 443 _errorNameMetricsKey, _notSet, 444 ); err2 != nil { 445 c.edge.logger.DPanic("could not retrieve server failures counter", zap.Error(err2)) 446 } else { 447 counter.Inc() 448 } 449 450 default: 451 // If this code is executed we've hit an error code outside the usual error 452 // code range, so we'll just log the string representation of that code. 453 if counter, err2 := c.edge.serverFailures.Get( 454 _error, errCode.String(), 455 _errorNameMetricsKey, _notSet, 456 ); err2 != nil { 457 c.edge.logger.DPanic("could not retrieve server failures counter", zap.Error(err2)) 458 } else { 459 counter.Inc() 460 } 461 } 462 } 463 464 // logStreamEvent is a generic logging function useful for logging stream 465 // events. 466 func (c call) logStreamEvent(err error, success bool, succMsg, errMsg string, extraFields ...zap.Field) { 467 var ce *zapcore.CheckedEntry 468 // Success is usually only when the error is nil. 469 // The only exception is when emitting an error from ReceiveMessage, which 470 // returns EOF when the stream closes normally. 471 if success { 472 ce = c.edge.logger.Check(c.levels.success, succMsg) 473 } else { 474 var lvl zapcore.Level 475 if c.levels.useApplicationErrorFailureLevels { 476 lvl = c.levels.failure 477 } else { 478 lvl = c.levels.serverError 479 code := yarpcerrors.FromError(err).Code() 480 if fault := yarpcerrors.GetFaultTypeFromCode(code); fault == yarpcerrors.ClientFault { 481 lvl = c.levels.clientError 482 } 483 } 484 485 ce = c.edge.logger.Check(lvl, errMsg) 486 } 487 488 fields := []zap.Field{ 489 zap.String("rpcType", c.rpcType.String()), 490 zap.Bool("successful", success), 491 c.extract(c.ctx), 492 zap.Error(err), // no-op if err == nil 493 } 494 fields = append(fields, extraFields...) 495 496 ce.Write(fields...) 497 } 498 499 // inteded for metric tags, this returns the yarpcerrors.Status error code name 500 // or "unknown_internal_yarpc" 501 func errToMetricString(err error) string { 502 if yarpcerrors.IsStatus(err) { 503 return yarpcerrors.FromError(err).Code().String() 504 } 505 return "unknown_internal_yarpc" 506 }