go.uber.org/yarpc@v1.72.1/internal/observability/graph.go

// Copyright (c) 2022 Uber Technologies, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

package observability

import (
	"context"
	"sync"
	"time"

	"go.uber.org/net/metrics"
	"go.uber.org/net/metrics/bucket"
	"go.uber.org/yarpc/api/transport"
	"go.uber.org/yarpc/internal/digester"
	"go.uber.org/zap"
	"go.uber.org/zap/zapcore"
)

var (
	_timeNow          = time.Now // for tests
	_defaultGraphSize = 128
	// Latency buckets for histograms. At some point, we may want to make these
	// configurable.
	_bucketsMs = bucket.NewRPCLatency()
	// Bytes buckets for payload size histograms, containing exponential buckets
	// in range of 0B, 1B, 2B, ... 256MB.
	_bucketsBytes = append([]int64{0}, bucket.NewExponential(1, 2, 29)...)
)

type directionName string

const (
	_directionOutbound directionName = "outbound"
	_directionInbound  directionName = "inbound"

	// _droppedTagValue represents the value of a metric tag when the tag
	// is being blocked.
	_droppedTagValue = "__dropped__"

	_source          = "source"
	_dest            = "dest"
	_transport       = "transport"
	_procedure       = "procedure"
	_encoding        = "encoding"
	_routingKey      = "routing_key"
	_routingDelegate = "routing_delegate"
	_direction       = "direction"
	_rpcType         = "rpc_type"
)
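
// Illustrative sketch (not part of the original file): it spells out the
// arithmetic behind the _bucketsBytes comment above. bucket.NewExponential(1, 2, 29)
// yields 29 doublings starting at 1 byte, ending at 1<<28 bytes, and the
// prepended 0 gives bucket bounds 0B, 1B, 2B, ... 256MiB.
func maxPayloadBucketBytes() int64 {
	return _bucketsBytes[len(_bucketsBytes)-1] // 1 << 28 == 268435456 bytes == 256MiB
}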

// A graph represents a collection of services: each service is a node, and we
// collect stats for each caller-callee-transport-encoding-procedure-rk-sk-rd edge.
type graph struct {
	meter            *metrics.Scope
	logger           *zap.Logger
	extract          ContextExtractor
	ignoreMetricsTag *metricsTagIgnore

	edgesMu sync.RWMutex
	edges   map[string]*edge

	inboundLevels, outboundLevels levels
}

// If a field is set to true, the corresponding metrics tag won't be emitted.
// Using struct fields is more performant than a map (field lookup versus map lookup).
type metricsTagIgnore struct {
	source          bool
	dest            bool
	transport       bool
	procedure       bool
	encoding        bool
	routingKey      bool
	routingDelegate bool
	direction       bool
	rpcType         bool
}

func newMetricsTagIgnore(metricTagsIgnore []string) *metricsTagIgnore {
	r := new(metricsTagIgnore)
	for _, m := range metricTagsIgnore {
		switch m {
		case _source:
			r.source = true
		case _dest:
			r.dest = true
		case _transport:
			r.transport = true
		case _procedure:
			r.procedure = true
		case _encoding:
			r.encoding = true
		case _routingKey:
			r.routingKey = true
		case _routingDelegate:
			r.routingDelegate = true
		case _direction:
			r.direction = true
		case _rpcType:
			r.rpcType = true
		}
	}
	return r
}

func (m *metricsTagIgnore) tags(req *transport.Request, direction string, rpcType transport.Type) metrics.Tags {
	tags := metrics.Tags{
		_source:          req.Caller,
		_dest:            req.Service,
		_transport:       unknownIfEmpty(req.Transport),
		_procedure:       req.Procedure,
		_encoding:        string(req.Encoding),
		_routingKey:      req.RoutingKey,
		_routingDelegate: req.RoutingDelegate,
		_direction:       direction,
		_rpcType:         rpcType.String(),
	}

	if m.source {
		tags[_source] = _droppedTagValue
	}
	if m.dest {
		tags[_dest] = _droppedTagValue
	}
	if m.transport {
		tags[_transport] = _droppedTagValue
	}
	if m.encoding {
		tags[_encoding] = _droppedTagValue
	}
	if m.procedure {
		tags[_procedure] = _droppedTagValue
	}
	if m.routingKey {
		tags[_routingKey] = _droppedTagValue
	}
	if m.routingDelegate {
		tags[_routingDelegate] = _droppedTagValue
	}
	if m.direction {
		tags[_direction] = _droppedTagValue
	}
	if m.rpcType {
		tags[_rpcType] = _droppedTagValue
	}

	return tags
}

func newGraph(meter *metrics.Scope, logger *zap.Logger, extract ContextExtractor, metricTagsIgnore []string) graph {
	return graph{
		edges:            make(map[string]*edge, _defaultGraphSize),
		meter:            meter,
		logger:           logger,
		extract:          extract,
		ignoreMetricsTag: newMetricsTagIgnore(metricTagsIgnore),
		inboundLevels: levels{
			success:          zapcore.DebugLevel,
			failure:          zapcore.ErrorLevel,
			applicationError: zapcore.ErrorLevel,
			serverError:      zapcore.ErrorLevel,
			clientError:      zapcore.ErrorLevel,
		},
		outboundLevels: levels{
			success:          zapcore.DebugLevel,
			failure:          zapcore.ErrorLevel,
			applicationError: zapcore.ErrorLevel,
			serverError:      zapcore.ErrorLevel,
			clientError:      zapcore.ErrorLevel,
		},
	}
}
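
// Illustrative sketch (not part of the original file) of how blocked tags
// surface in emitted metrics: the tag keys remain, but their values collapse
// to _droppedTagValue, which caps the cardinality of those dimensions. The
// service and procedure names below are hypothetical.
func exampleDroppedTags() metrics.Tags {
	ignore := newMetricsTagIgnore([]string{_routingKey, _routingDelegate})
	req := &transport.Request{
		Caller:    "caller-svc",
		Service:   "callee-svc",
		Procedure: "MyService::get",
		Encoding:  "proto",
	}
	// routing_key and routing_delegate come back as "__dropped__"; the empty
	// transport is rewritten to "unknown" by unknownIfEmpty.
	return ignore.tags(req, string(_directionInbound), transport.Unary)
}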

// begin starts a call along an edge.
func (g *graph) begin(ctx context.Context, rpcType transport.Type, direction directionName, req *transport.Request) call {
	now := _timeNow()

	d := digester.New()
	if !g.ignoreMetricsTag.source {
		d.Add(req.Caller)
	}
	if !g.ignoreMetricsTag.dest {
		d.Add(req.Service)
	}
	if !g.ignoreMetricsTag.transport {
		d.Add(req.Transport)
	}
	if !g.ignoreMetricsTag.encoding {
		d.Add(string(req.Encoding))
	}
	if !g.ignoreMetricsTag.procedure {
		d.Add(req.Procedure)
	}
	if !g.ignoreMetricsTag.routingKey {
		d.Add(req.RoutingKey)
	}
	if !g.ignoreMetricsTag.routingDelegate {
		d.Add(req.RoutingDelegate)
	}
	if !g.ignoreMetricsTag.direction {
		d.Add(string(direction))
	}
	if !g.ignoreMetricsTag.rpcType {
		d.Add(rpcType.String())
	}
	e := g.getOrCreateEdge(d.Digest(), req, string(direction), rpcType)
	d.Free()

	levels := &g.inboundLevels
	if direction != _directionInbound {
		levels = &g.outboundLevels
	}

	return call{
		edge:      e,
		extract:   g.extract,
		started:   now,
		ctx:       ctx,
		req:       req,
		rpcType:   rpcType,
		direction: direction,
		levels:    levels,
	}
}

func (g *graph) getOrCreateEdge(key []byte, req *transport.Request, direction string, rpcType transport.Type) *edge {
	if e := g.getEdge(key); e != nil {
		return e
	}
	return g.createEdge(key, req, direction, rpcType)
}

func (g *graph) getEdge(key []byte) *edge {
	g.edgesMu.RLock()
	e := g.edges[string(key)]
	g.edgesMu.RUnlock()
	return e
}

func (g *graph) createEdge(key []byte, req *transport.Request, direction string, rpcType transport.Type) *edge {
	g.edgesMu.Lock()
	// Since we'll rarely hit this code path, the overhead of defer is acceptable.
	defer g.edgesMu.Unlock()

	if e, ok := g.edges[string(key)]; ok {
		// Someone beat us to the punch.
		return e
	}

	e := newEdge(g.logger, g.meter, g.ignoreMetricsTag, req, direction, rpcType)
	g.edges[string(key)] = e
	return e
}

// An edge is a collection of RPC stats for a particular
// caller-callee-encoding-procedure-sk-rd-rk edge in the service graph.
type edge struct {
	logger *zap.Logger

	calls          *metrics.Counter
	successes      *metrics.Counter
	panics         *metrics.Counter
	callerFailures *metrics.CounterVector
	serverFailures *metrics.CounterVector

	latencies            *metrics.Histogram
	callerErrLatencies   *metrics.Histogram
	serverErrLatencies   *metrics.Histogram
	ttls                 *metrics.Histogram
	timeoutTtls          *metrics.Histogram
	requestPayloadSizes  *metrics.Histogram
	responsePayloadSizes *metrics.Histogram
	streaming            *streamEdge
}

// streamEdge metrics should only be used for streaming requests.
type streamEdge struct {
	sends         *metrics.Counter
	sendSuccesses *metrics.Counter
	sendFailures  *metrics.CounterVector

	receives         *metrics.Counter
	receiveSuccesses *metrics.Counter
	receiveFailures  *metrics.CounterVector

	streamDurations            *metrics.Histogram
	streamRequestPayloadSizes  *metrics.Histogram
	streamResponsePayloadSizes *metrics.Histogram

	streamsActive *metrics.Gauge
}
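
// Illustrative helper (not part of the original file) using the same
// read-locked access pattern as getEdge. Note that begin adds only the
// non-ignored dimensions to the digest, so two requests that differ solely
// in a dropped tag share one key, one edge, and one set of metrics.
func (g *graph) edgeCount() int {
	g.edgesMu.RLock()
	defer g.edgesMu.RUnlock()
	return len(g.edges)
}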

// newEdge constructs a new edge. Since Registries enforce metric uniqueness,
// edges should be cached and re-used for each RPC.
func newEdge(logger *zap.Logger, meter *metrics.Scope, tagToIgnore *metricsTagIgnore, req *transport.Request, direction string, rpcType transport.Type) *edge {
	tags := tagToIgnore.tags(req, direction, rpcType)

	// metrics for all RPCs
	calls, err := meter.Counter(metrics.Spec{
		Name:      "calls",
		Help:      "Total number of RPCs.",
		ConstTags: tags,
	})
	if err != nil {
		logger.Error("Failed to create calls counter.", zap.Error(err))
	}
	successes, err := meter.Counter(metrics.Spec{
		Name:      "successes",
		Help:      "Number of successful RPCs.",
		ConstTags: tags,
	})
	if err != nil {
		logger.Error("Failed to create successes counter.", zap.Error(err))
	}
	panics, err := meter.Counter(metrics.Spec{
		Name:      "panics",
		Help:      "Number of RPCs failed because of panic.",
		ConstTags: tags,
	})
	if err != nil {
		logger.Error("Failed to create panics counter.", zap.Error(err))
	}
	callerFailures, err := meter.CounterVector(metrics.Spec{
		Name:      "caller_failures",
		Help:      "Number of RPCs failed because of caller error.",
		ConstTags: tags,
		VarTags:   []string{_error, _errorNameMetricsKey},
	})
	if err != nil {
		logger.Error("Failed to create caller failures vector.", zap.Error(err))
	}
	serverFailures, err := meter.CounterVector(metrics.Spec{
		Name:      "server_failures",
		Help:      "Number of RPCs failed because of server error.",
		ConstTags: tags,
		VarTags:   []string{_error, _errorNameMetricsKey},
	})
	if err != nil {
		logger.Error("Failed to create server failures vector.", zap.Error(err))
	}

	// metrics for only unary and oneway
	var latencies, callerErrLatencies, serverErrLatencies, ttls, timeoutTtls,
		requestPayloadSizes, responsePayloadSizes *metrics.Histogram
	if rpcType == transport.Unary || rpcType == transport.Oneway {
		latencies, err = meter.Histogram(metrics.HistogramSpec{
			Spec: metrics.Spec{
				Name:      "success_latency_ms",
				Help:      "Latency distribution of successful RPCs.",
				ConstTags: tags,
			},
			Unit:    time.Millisecond,
			Buckets: _bucketsMs,
		})
		if err != nil {
			logger.Error("Failed to create success latency distribution.", zap.Error(err))
		}
		callerErrLatencies, err = meter.Histogram(metrics.HistogramSpec{
			Spec: metrics.Spec{
				Name:      "caller_failure_latency_ms",
				Help:      "Latency distribution of RPCs failed because of caller error.",
				ConstTags: tags,
			},
			Unit:    time.Millisecond,
			Buckets: _bucketsMs,
		})
		if err != nil {
			logger.Error("Failed to create caller failure latency distribution.", zap.Error(err))
		}
		serverErrLatencies, err = meter.Histogram(metrics.HistogramSpec{
			Spec: metrics.Spec{
				Name:      "server_failure_latency_ms",
				Help:      "Latency distribution of RPCs failed because of server error.",
				ConstTags: tags,
			},
			Unit:    time.Millisecond,
			Buckets: _bucketsMs,
		})
		if err != nil {
			logger.Error("Failed to create server failure latency distribution.", zap.Error(err))
		}
		ttls, err = meter.Histogram(metrics.HistogramSpec{
			Spec: metrics.Spec{
				Name:      "ttl_ms",
				Help:      "TTL distribution of the RPCs passed by the caller.",
				ConstTags: tags,
			},
			Unit:    time.Millisecond,
			Buckets: _bucketsMs,
		})
		if err != nil {
			logger.Error("Failed to create ttl distribution.", zap.Error(err))
		}
		timeoutTtls, err = meter.Histogram(metrics.HistogramSpec{
			Spec: metrics.Spec{
				Name:      "timeout_ttl_ms",
				Help:      "TTL distribution of the RPCs passed by the caller which failed due to timeout.",
				ConstTags: tags,
			},
			Unit:    time.Millisecond,
			Buckets: _bucketsMs,
		})
		if err != nil {
			logger.Error("Failed to create timeout ttl distribution.", zap.Error(err))
		}
		requestPayloadSizes, err = meter.Histogram(metrics.HistogramSpec{
			Spec: metrics.Spec{
				Name:      "request_payload_size_bytes",
				Help:      "Request payload size distribution of the RPCs in bytes.",
				ConstTags: tags,
			},
			Unit:    time.Millisecond, // Unit is not relevant for this histogram
			Buckets: _bucketsBytes,
		})
		if err != nil {
			logger.Error("Failed to create request payload size histogram.", zap.Error(err))
		}
		responsePayloadSizes, err = meter.Histogram(metrics.HistogramSpec{
			Spec: metrics.Spec{
				Name:      "response_payload_size_bytes",
				Help:      "Response payload size distribution of the RPCs in bytes.",
				ConstTags: tags,
			},
			Unit:    time.Millisecond, // Unit is not relevant for this histogram
			Buckets: _bucketsBytes,
		})
		if err != nil {
			logger.Error("Failed to create response payload size histogram.", zap.Error(err))
		}
	}

	// metrics for only streams
	var streaming *streamEdge
	if rpcType == transport.Streaming {
		// sends
		sends, err := meter.Counter(metrics.Spec{
			Name:      "stream_sends",
			Help:      "Total number of streaming messages sent.",
			ConstTags: tags,
		})
		if err != nil {
			logger.DPanic("Failed to create streaming sends counter.", zap.Error(err))
		}
		sendSuccesses, err := meter.Counter(metrics.Spec{
			Name:      "stream_send_successes",
			Help:      "Number of successful streaming messages sent.",
			ConstTags: tags,
		})
		if err != nil {
			logger.DPanic("Failed to create streaming send successes counter.", zap.Error(err))
		}
		sendFailures, err := meter.CounterVector(metrics.Spec{
			Name:      "stream_send_failures",
			Help:      "Number of streaming messages that failed to send.",
			ConstTags: tags,
			VarTags:   []string{_error},
		})
		if err != nil {
			logger.DPanic("Failed to create streaming send failures counter.", zap.Error(err))
		}

		// receives
		receives, err := meter.Counter(metrics.Spec{
			Name:      "stream_receives",
			Help:      "Total number of streaming messages received.",
			ConstTags: tags,
		})
		if err != nil {
			logger.DPanic("Failed to create streaming receives counter.", zap.Error(err))
		}
		receiveSuccesses, err := meter.Counter(metrics.Spec{
			Name:      "stream_receive_successes",
			Help:      "Number of successful streaming messages received.",
			ConstTags: tags,
		})
		if err != nil {
			logger.DPanic("Failed to create streaming receive successes counter.", zap.Error(err))
		}
		receiveFailures, err := meter.CounterVector(metrics.Spec{
			Name:      "stream_receive_failures",
			Help:      "Number of streaming messages that failed to be received.",
			ConstTags: tags,
			VarTags:   []string{_error},
		})
		if err != nil {
			logger.DPanic("Failed to create streaming receive failures counter.", zap.Error(err))
		}

		// entire stream
		streamDurations, err := meter.Histogram(metrics.HistogramSpec{
			Spec: metrics.Spec{
				Name:      "stream_duration_ms",
				Help:      "Latency distribution of total stream duration.",
				ConstTags: tags,
			},
			Unit:    time.Millisecond,
			Buckets: _bucketsMs,
		})
		if err != nil {
			logger.DPanic("Failed to create stream duration histogram.", zap.Error(err))
		}

		streamRequestPayloadSizes, err := meter.Histogram(metrics.HistogramSpec{
			Spec: metrics.Spec{
				Name:      "stream_request_payload_size_bytes",
				Help:      "Stream request payload size distribution.",
				ConstTags: tags,
			},
			Unit:    time.Millisecond,
			Buckets: _bucketsBytes,
		})
		if err != nil {
			logger.DPanic("Failed to create stream request payload size histogram.", zap.Error(err))
		}

		streamResponsePayloadSizes, err := meter.Histogram(metrics.HistogramSpec{
			Spec: metrics.Spec{
				Name:      "stream_response_payload_size_bytes",
				Help:      "Stream response payload size distribution.",
				ConstTags: tags,
			},
			Unit:    time.Millisecond,
			Buckets: _bucketsBytes,
		})
		if err != nil {
			logger.DPanic("Failed to create stream response payload size histogram.", zap.Error(err))
		}

		streamsActive, err := meter.Gauge(metrics.Spec{
			Name:      "streams_active",
			Help:      "Number of active streams.",
			ConstTags: tags,
		})
		if err != nil {
			logger.DPanic("Failed to create active stream gauge.", zap.Error(err))
		}

		streaming = &streamEdge{
			sends:            sends,
			sendSuccesses:    sendSuccesses,
			sendFailures:     sendFailures,
			receives:         receives,
			receiveSuccesses: receiveSuccesses,
			receiveFailures:  receiveFailures,

			streamDurations:            streamDurations,
			streamRequestPayloadSizes:  streamRequestPayloadSizes,
			streamResponsePayloadSizes: streamResponsePayloadSizes,

			streamsActive: streamsActive,
		}
	}

	logger = logger.With(
		zap.String("source", req.Caller),
		zap.String("dest", req.Service),
		zap.String("transport", unknownIfEmpty(req.Transport)),
		zap.String("procedure", req.Procedure),
		zap.String("encoding", string(req.Encoding)),
		zap.String("routingKey", req.RoutingKey),
		zap.String("routingDelegate", req.RoutingDelegate),
		zap.String("direction", direction),
	)
	return &edge{
		logger:               logger,
		calls:                calls,
		successes:            successes,
		panics:               panics,
		callerFailures:       callerFailures,
		serverFailures:       serverFailures,
		requestPayloadSizes:  requestPayloadSizes,
		responsePayloadSizes: responsePayloadSizes,
		latencies:            latencies,
		callerErrLatencies:   callerErrLatencies,
		serverErrLatencies:   serverErrLatencies,
		ttls:                 ttls,
		timeoutTtls:          timeoutTtls,
		streaming:            streaming,
	}
}

// unknownIfEmpty works around the hard-coded default value of "default" in
// go.uber.org/net/metrics.
func unknownIfEmpty(t string) string {
	if t == "" {
		t = "unknown"
	}
	return t
}
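
// Illustrative sketch (not part of the original file) of the intended flow:
// build one graph up front, then call begin for every request so edges (and
// their metrics) are created once and re-used. metrics.New().Scope() is
// assumed as the meter here; the extractor and request come from the caller.
func exampleObserveInbound(logger *zap.Logger, extract ContextExtractor, req *transport.Request) {
	g := newGraph(metrics.New().Scope(), logger, extract, nil /* no metrics tags blocked */)
	c := g.begin(context.Background(), transport.Unary, _directionInbound, req)
	_ = c // the observability middleware ends this call to record latency and outcome
}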