github.com/letsencrypt/boulder@v0.20251208.0/grpc/interceptors.go (about) 1 package grpc 2 3 import ( 4 "context" 5 "fmt" 6 "strconv" 7 "strings" 8 "time" 9 10 "github.com/jmhodges/clock" 11 "github.com/prometheus/client_golang/prometheus" 12 "google.golang.org/grpc" 13 "google.golang.org/grpc/codes" 14 "google.golang.org/grpc/credentials" 15 "google.golang.org/grpc/metadata" 16 "google.golang.org/grpc/peer" 17 "google.golang.org/grpc/status" 18 19 "github.com/letsencrypt/boulder/cmd" 20 berrors "github.com/letsencrypt/boulder/errors" 21 "github.com/letsencrypt/boulder/web" 22 ) 23 24 const ( 25 returnOverhead = 20 * time.Millisecond 26 meaningfulWorkOverhead = 100 * time.Millisecond 27 clientRequestTimeKey = "client-request-time" 28 userAgentKey = "acme-client-user-agent" 29 ) 30 31 type serverInterceptor interface { 32 Unary(ctx context.Context, req any, info *grpc.UnaryServerInfo, handler grpc.UnaryHandler) (any, error) 33 Stream(srv any, ss grpc.ServerStream, info *grpc.StreamServerInfo, handler grpc.StreamHandler) error 34 } 35 36 // noopServerInterceptor provides no-op interceptors. It can be substituted for 37 // an interceptor that has been disabled. 38 type noopServerInterceptor struct{} 39 40 // Unary is a gRPC unary interceptor. 41 func (n *noopServerInterceptor) Unary(ctx context.Context, req any, info *grpc.UnaryServerInfo, handler grpc.UnaryHandler) (any, error) { 42 return handler(ctx, req) 43 } 44 45 // Stream is a gRPC stream interceptor. 46 func (n *noopServerInterceptor) Stream(srv any, ss grpc.ServerStream, info *grpc.StreamServerInfo, handler grpc.StreamHandler) error { 47 return handler(srv, ss) 48 } 49 50 // Ensure noopServerInterceptor matches the serverInterceptor interface. 51 var _ serverInterceptor = &noopServerInterceptor{} 52 53 type clientInterceptor interface { 54 Unary(ctx context.Context, method string, req any, reply any, cc *grpc.ClientConn, invoker grpc.UnaryInvoker, opts ...grpc.CallOption) error 55 Stream(ctx context.Context, desc *grpc.StreamDesc, cc *grpc.ClientConn, method string, streamer grpc.Streamer, opts ...grpc.CallOption) (grpc.ClientStream, error) 56 } 57 58 // serverMetadataInterceptor is a gRPC interceptor that adds Prometheus 59 // metrics to requests handled by a gRPC server, and wraps Boulder-specific 60 // errors for transmission in a grpc/metadata trailer (see bcodes.go). 61 type serverMetadataInterceptor struct { 62 metrics serverMetrics 63 clk clock.Clock 64 } 65 66 func newServerMetadataInterceptor(metrics serverMetrics, clk clock.Clock) serverMetadataInterceptor { 67 return serverMetadataInterceptor{ 68 metrics: metrics, 69 clk: clk, 70 } 71 } 72 73 // Unary implements the grpc.UnaryServerInterceptor interface. 74 func (smi *serverMetadataInterceptor) Unary( 75 ctx context.Context, 76 req any, 77 info *grpc.UnaryServerInfo, 78 handler grpc.UnaryHandler) (any, error) { 79 if info == nil { 80 return nil, berrors.InternalServerError("passed nil *grpc.UnaryServerInfo") 81 } 82 83 // Extract the grpc metadata from the context, and handle the client request 84 // timestamp embedded in it. It's okay if the timestamp is missing, since some 85 // clients (like nomad's health-checker) don't set it. 86 md, ok := metadata.FromIncomingContext(ctx) 87 if ok { 88 if len(md[clientRequestTimeKey]) > 0 { 89 err := smi.checkLatency(md[clientRequestTimeKey][0]) 90 if err != nil { 91 return nil, err 92 } 93 } 94 if len(md[userAgentKey]) > 0 { 95 ctx = web.WithUserAgent(ctx, md[userAgentKey][0]) 96 } 97 } 98 99 // Shave 20 milliseconds off the deadline to ensure that if the RPC server times 100 // out any sub-calls it makes (like DNS lookups, or onwards RPCs), it has a 101 // chance to report that timeout to the client. This allows for more specific 102 // errors, e.g "the VA timed out looking up CAA for example.com" (when called 103 // from RA.NewCertificate, which was called from WFE.NewCertificate), as 104 // opposed to "RA.NewCertificate timed out" (causing a 500). 105 // Once we've shaved the deadline, we ensure we have we have at least another 106 // 100ms left to do work; otherwise we abort early. 107 // Note that these computations use the global clock (time.Now) instead of 108 // the local clock (smi.clk.Now) because context.WithTimeout also uses the 109 // global clock. 110 deadline, ok := ctx.Deadline() 111 // Should never happen: there was no deadline. 112 if !ok { 113 deadline = time.Now().Add(100 * time.Second) 114 } 115 deadline = deadline.Add(-returnOverhead) 116 remaining := time.Until(deadline) 117 if remaining < meaningfulWorkOverhead { 118 return nil, status.Errorf(codes.DeadlineExceeded, "not enough time left on clock: %s", remaining) 119 } 120 121 localCtx, cancel := context.WithDeadline(ctx, deadline) 122 defer cancel() 123 124 resp, err := handler(localCtx, req) 125 if err != nil { 126 err = wrapError(localCtx, err) 127 } 128 return resp, err 129 } 130 131 // interceptedServerStream wraps an existing server stream, but replaces its 132 // context with its own. 133 type interceptedServerStream struct { 134 grpc.ServerStream 135 ctx context.Context 136 } 137 138 // Context implements part of the grpc.ServerStream interface. 139 func (iss interceptedServerStream) Context() context.Context { 140 return iss.ctx 141 } 142 143 // Stream implements the grpc.StreamServerInterceptor interface. 144 func (smi *serverMetadataInterceptor) Stream( 145 srv any, 146 ss grpc.ServerStream, 147 info *grpc.StreamServerInfo, 148 handler grpc.StreamHandler) error { 149 ctx := ss.Context() 150 151 // Extract the grpc metadata from the context, and handle the client request 152 // timestamp embedded in it. It's okay if the timestamp is missing, since some 153 // clients (like nomad's health-checker) don't set it. 154 md, ok := metadata.FromIncomingContext(ctx) 155 if ok && len(md[clientRequestTimeKey]) > 0 { 156 err := smi.checkLatency(md[clientRequestTimeKey][0]) 157 if err != nil { 158 return err 159 } 160 } 161 162 // Shave 20 milliseconds off the deadline to ensure that if the RPC server times 163 // out any sub-calls it makes (like DNS lookups, or onwards RPCs), it has a 164 // chance to report that timeout to the client. This allows for more specific 165 // errors, e.g "the VA timed out looking up CAA for example.com" (when called 166 // from RA.NewCertificate, which was called from WFE.NewCertificate), as 167 // opposed to "RA.NewCertificate timed out" (causing a 500). 168 // Once we've shaved the deadline, we ensure we have we have at least another 169 // 100ms left to do work; otherwise we abort early. 170 // Note that these computations use the global clock (time.Now) instead of 171 // the local clock (smi.clk.Now) because context.WithTimeout also uses the 172 // global clock. 173 deadline, ok := ctx.Deadline() 174 // Should never happen: there was no deadline. 175 if !ok { 176 deadline = time.Now().Add(100 * time.Second) 177 } 178 deadline = deadline.Add(-returnOverhead) 179 remaining := time.Until(deadline) 180 if remaining < meaningfulWorkOverhead { 181 return status.Errorf(codes.DeadlineExceeded, "not enough time left on clock: %s", remaining) 182 } 183 184 // Server stream interceptors are synchronous (they return their error, if 185 // any, when the stream is done) so defer cancel() is safe here. 186 localCtx, cancel := context.WithDeadline(ctx, deadline) 187 defer cancel() 188 189 err := handler(srv, interceptedServerStream{ss, localCtx}) 190 if err != nil { 191 err = wrapError(localCtx, err) 192 } 193 return err 194 } 195 196 // splitMethodName is borrowed directly from 197 // `grpc-ecosystem/go-grpc-prometheus/util.go` and is used to extract the 198 // service and method name from the `method` argument to 199 // a `UnaryClientInterceptor`. 200 func splitMethodName(fullMethodName string) (string, string) { 201 fullMethodName = strings.TrimPrefix(fullMethodName, "/") // remove leading slash 202 if i := strings.Index(fullMethodName, "/"); i >= 0 { 203 return fullMethodName[:i], fullMethodName[i+1:] 204 } 205 return "unknown", "unknown" 206 } 207 208 // checkLatency is called with the `clientRequestTimeKey` value from 209 // a request's gRPC metadata. This string value is converted to a timestamp and 210 // used to calculate the latency between send and receive time. The latency is 211 // published to the server interceptor's rpcLag prometheus histogram. An error 212 // is returned if the `clientReqTime` string is not a valid timestamp, or if 213 // the latency is so large that it indicates dangerous levels of clock skew. 214 func (smi *serverMetadataInterceptor) checkLatency(clientReqTime string) error { 215 // Convert the metadata request time into an int64 216 reqTimeUnixNanos, err := strconv.ParseInt(clientReqTime, 10, 64) 217 if err != nil { 218 return berrors.InternalServerError("grpc metadata had illegal %s value: %q - %s", 219 clientRequestTimeKey, clientReqTime, err) 220 } 221 // Calculate the elapsed time since the client sent the RPC 222 reqTime := time.Unix(0, reqTimeUnixNanos) 223 elapsed := smi.clk.Since(reqTime) 224 225 // If the elapsed time is very large, that indicates it is probably due to 226 // clock skew rather than simple latency. Refuse to handle the request, since 227 // accurate timekeeping is critical to CA operations and large skew indicates 228 // something has gone very wrong. 229 if tooSkewed(elapsed) { 230 return fmt.Errorf( 231 "gRPC client reported a very different time: %s (client) vs %s (this server)", 232 reqTime, smi.clk.Now()) 233 } 234 235 // Publish an RPC latency observation to the histogram 236 smi.metrics.rpcLag.Observe(elapsed.Seconds()) 237 return nil 238 } 239 240 // Ensure serverMetadataInterceptor matches the serverInterceptor interface. 241 var _ serverInterceptor = (*serverMetadataInterceptor)(nil) 242 243 // clientMetadataInterceptor is a gRPC interceptor that adds Prometheus 244 // metrics to sent requests, and disables FailFast. We disable FailFast because 245 // non-FailFast mode is most similar to the old AMQP RPC layer: If a client 246 // makes a request while all backends are briefly down (e.g. for a restart), the 247 // request doesn't necessarily fail. A backend can service the request if it 248 // comes back up within the timeout. Under gRPC the same effect is achieved by 249 // retries up to the Context deadline. 250 type clientMetadataInterceptor struct { 251 timeout time.Duration 252 metrics clientMetrics 253 clk clock.Clock 254 255 waitForReady bool 256 } 257 258 // Unary implements the grpc.UnaryClientInterceptor interface. 259 func (cmi *clientMetadataInterceptor) Unary( 260 ctx context.Context, 261 fullMethod string, 262 req, 263 reply any, 264 cc *grpc.ClientConn, 265 invoker grpc.UnaryInvoker, 266 opts ...grpc.CallOption) error { 267 // This should not occur but fail fast with a clear error if it does (e.g. 268 // because of buggy unit test code) instead of a generic nil panic later! 269 if cmi.metrics.inFlightRPCs == nil { 270 return berrors.InternalServerError("clientInterceptor has nil inFlightRPCs gauge") 271 } 272 273 // Ensure that the context has a deadline set. 274 localCtx, cancel := context.WithTimeout(ctx, cmi.timeout) 275 defer cancel() 276 277 // Convert the current unix nano timestamp to a string for embedding in the grpc metadata 278 nowTS := strconv.FormatInt(cmi.clk.Now().UnixNano(), 10) 279 // Create a grpc/metadata.Metadata instance for the request metadata. 280 reqMD := metadata.New(map[string]string{ 281 clientRequestTimeKey: nowTS, 282 userAgentKey: web.UserAgent(ctx), 283 }) 284 // Configure the localCtx with the metadata so it gets sent along in the request 285 localCtx = metadata.NewOutgoingContext(localCtx, reqMD) 286 287 // Disable fail-fast so RPCs will retry until deadline, even if all backends 288 // are down. 289 opts = append(opts, grpc.WaitForReady(cmi.waitForReady)) 290 291 // Create a grpc/metadata.Metadata instance for a grpc.Trailer. 292 respMD := metadata.New(nil) 293 // Configure a grpc Trailer with respMD. This allows us to wrap error 294 // types in the server interceptor later on. 295 opts = append(opts, grpc.Trailer(&respMD)) 296 297 // Split the method and service name from the fullMethod. 298 // UnaryClientInterceptor's receive a `method` arg of the form 299 // "/ServiceName/MethodName" 300 service, method := splitMethodName(fullMethod) 301 // Slice the inFlightRPC inc/dec calls by method and service 302 labels := prometheus.Labels{ 303 "method": method, 304 "service": service, 305 } 306 // Increment the inFlightRPCs gauge for this method/service 307 cmi.metrics.inFlightRPCs.With(labels).Inc() 308 // And defer decrementing it when we're done 309 defer cmi.metrics.inFlightRPCs.With(labels).Dec() 310 311 // Handle the RPC 312 begin := cmi.clk.Now() 313 err := invoker(localCtx, fullMethod, req, reply, cc, opts...) 314 if err != nil { 315 err = unwrapError(err, respMD) 316 if status.Code(err) == codes.DeadlineExceeded { 317 return deadlineDetails{ 318 service: service, 319 method: method, 320 latency: cmi.clk.Since(begin), 321 } 322 } 323 } 324 return err 325 } 326 327 // interceptedClientStream wraps an existing client stream, and calls finish 328 // when the stream ends or any operation on it fails. 329 type interceptedClientStream struct { 330 grpc.ClientStream 331 finish func(error) error 332 } 333 334 // Header implements part of the grpc.ClientStream interface. 335 func (ics interceptedClientStream) Header() (metadata.MD, error) { 336 md, err := ics.ClientStream.Header() 337 if err != nil { 338 err = ics.finish(err) 339 } 340 return md, err 341 } 342 343 // SendMsg implements part of the grpc.ClientStream interface. 344 func (ics interceptedClientStream) SendMsg(m any) error { 345 err := ics.ClientStream.SendMsg(m) 346 if err != nil { 347 err = ics.finish(err) 348 } 349 return err 350 } 351 352 // RecvMsg implements part of the grpc.ClientStream interface. 353 func (ics interceptedClientStream) RecvMsg(m any) error { 354 err := ics.ClientStream.RecvMsg(m) 355 if err != nil { 356 err = ics.finish(err) 357 } 358 return err 359 } 360 361 // CloseSend implements part of the grpc.ClientStream interface. 362 func (ics interceptedClientStream) CloseSend() error { 363 err := ics.ClientStream.CloseSend() 364 if err != nil { 365 err = ics.finish(err) 366 } 367 return err 368 } 369 370 // Stream implements the grpc.StreamClientInterceptor interface. 371 func (cmi *clientMetadataInterceptor) Stream( 372 ctx context.Context, 373 desc *grpc.StreamDesc, 374 cc *grpc.ClientConn, 375 fullMethod string, 376 streamer grpc.Streamer, 377 opts ...grpc.CallOption) (grpc.ClientStream, error) { 378 // This should not occur but fail fast with a clear error if it does (e.g. 379 // because of buggy unit test code) instead of a generic nil panic later! 380 if cmi.metrics.inFlightRPCs == nil { 381 return nil, berrors.InternalServerError("clientInterceptor has nil inFlightRPCs gauge") 382 } 383 384 // We don't defer cancel() here, because this function is going to return 385 // immediately. Instead we store it in the interceptedClientStream. 386 localCtx, cancel := context.WithTimeout(ctx, cmi.timeout) 387 388 // Convert the current unix nano timestamp to a string for embedding in the grpc metadata 389 nowTS := strconv.FormatInt(cmi.clk.Now().UnixNano(), 10) 390 // Create a grpc/metadata.Metadata instance for the request metadata. 391 // Initialize it with the request time. 392 reqMD := metadata.New(map[string]string{ 393 clientRequestTimeKey: nowTS, 394 userAgentKey: web.UserAgent(ctx), 395 }) 396 // Configure the localCtx with the metadata so it gets sent along in the request 397 localCtx = metadata.NewOutgoingContext(localCtx, reqMD) 398 399 // Disable fail-fast so RPCs will retry until deadline, even if all backends 400 // are down. 401 opts = append(opts, grpc.WaitForReady(cmi.waitForReady)) 402 403 // Create a grpc/metadata.Metadata instance for a grpc.Trailer. 404 respMD := metadata.New(nil) 405 // Configure a grpc Trailer with respMD. This allows us to wrap error 406 // types in the server interceptor later on. 407 opts = append(opts, grpc.Trailer(&respMD)) 408 409 // Split the method and service name from the fullMethod. 410 // UnaryClientInterceptor's receive a `method` arg of the form 411 // "/ServiceName/MethodName" 412 service, method := splitMethodName(fullMethod) 413 // Slice the inFlightRPC inc/dec calls by method and service 414 labels := prometheus.Labels{ 415 "method": method, 416 "service": service, 417 } 418 // Increment the inFlightRPCs gauge for this method/service 419 cmi.metrics.inFlightRPCs.With(labels).Inc() 420 begin := cmi.clk.Now() 421 422 // Cancel the local context and decrement the metric when we're done. Also 423 // transform the error into a more usable form, if necessary. 424 finish := func(err error) error { 425 cancel() 426 cmi.metrics.inFlightRPCs.With(labels).Dec() 427 if err != nil { 428 err = unwrapError(err, respMD) 429 if status.Code(err) == codes.DeadlineExceeded { 430 return deadlineDetails{ 431 service: service, 432 method: method, 433 latency: cmi.clk.Since(begin), 434 } 435 } 436 } 437 return err 438 } 439 440 // Handle the RPC 441 cs, err := streamer(localCtx, desc, cc, fullMethod, opts...) 442 ics := interceptedClientStream{cs, finish} 443 return ics, err 444 } 445 446 var _ clientInterceptor = (*clientMetadataInterceptor)(nil) 447 448 // deadlineDetails is an error type that we use in place of gRPC's 449 // DeadlineExceeded errors in order to add more detail for debugging. 450 type deadlineDetails struct { 451 service string 452 method string 453 latency time.Duration 454 } 455 456 func (dd deadlineDetails) Error() string { 457 return fmt.Sprintf("%s.%s timed out after %d ms", 458 dd.service, dd.method, int64(dd.latency/time.Millisecond)) 459 } 460 461 // authInterceptor provides two server interceptors (Unary and Stream) which can 462 // check that every request for a given gRPC service is being made over an mTLS 463 // connection from a client which is allow-listed for that particular service. 464 type authInterceptor struct { 465 // serviceClientNames is a map of gRPC service names (e.g. "ca.CertificateAuthority") 466 // to allowed client certificate SANs (e.g. "ra.boulder") which are allowed to 467 // make RPCs to that service. The set of client names is implemented as a map 468 // of names to empty structs for easy lookup. 469 serviceClientNames map[string]map[string]struct{} 470 } 471 472 // newServiceAuthChecker takes a GRPCServerConfig and uses its Service stanzas 473 // to construct a serviceAuthChecker which enforces the service/client mappings 474 // contained in the config. 475 func newServiceAuthChecker(c *cmd.GRPCServerConfig) *authInterceptor { 476 names := make(map[string]map[string]struct{}) 477 for serviceName, service := range c.Services { 478 names[serviceName] = make(map[string]struct{}) 479 for _, clientName := range service.ClientNames { 480 names[serviceName][clientName] = struct{}{} 481 } 482 } 483 return &authInterceptor{names} 484 } 485 486 // Unary is a gRPC unary interceptor. 487 func (ac *authInterceptor) Unary(ctx context.Context, req any, info *grpc.UnaryServerInfo, handler grpc.UnaryHandler) (any, error) { 488 err := ac.checkContextAuth(ctx, info.FullMethod) 489 if err != nil { 490 return nil, err 491 } 492 return handler(ctx, req) 493 } 494 495 // Stream is a gRPC stream interceptor. 496 func (ac *authInterceptor) Stream(srv any, ss grpc.ServerStream, info *grpc.StreamServerInfo, handler grpc.StreamHandler) error { 497 err := ac.checkContextAuth(ss.Context(), info.FullMethod) 498 if err != nil { 499 return err 500 } 501 return handler(srv, ss) 502 } 503 504 // checkContextAuth does most of the heavy lifting. It extracts TLS information 505 // from the incoming context, gets the set of DNS names contained in the client 506 // mTLS cert, and returns nil if at least one of those names appears in the set 507 // of allowed client names for given service (or if the set of allowed client 508 // names is empty). 509 func (ac *authInterceptor) checkContextAuth(ctx context.Context, fullMethod string) error { 510 serviceName, _ := splitMethodName(fullMethod) 511 512 allowedClientNames, ok := ac.serviceClientNames[serviceName] 513 if !ok || len(allowedClientNames) == 0 { 514 return fmt.Errorf("service %q has no allowed client names", serviceName) 515 } 516 517 p, ok := peer.FromContext(ctx) 518 if !ok { 519 return fmt.Errorf("unable to fetch peer info from grpc context") 520 } 521 522 if p.AuthInfo == nil { 523 return fmt.Errorf("grpc connection appears to be plaintext") 524 } 525 526 tlsAuth, ok := p.AuthInfo.(credentials.TLSInfo) 527 if !ok { 528 return fmt.Errorf("connection is not TLS authed") 529 } 530 531 if len(tlsAuth.State.VerifiedChains) == 0 || len(tlsAuth.State.VerifiedChains[0]) == 0 { 532 return fmt.Errorf("connection auth not verified") 533 } 534 535 cert := tlsAuth.State.VerifiedChains[0][0] 536 537 for _, clientName := range cert.DNSNames { 538 _, ok := allowedClientNames[clientName] 539 if ok { 540 return nil 541 } 542 } 543 544 return fmt.Errorf( 545 "client names %v are not authorized for service %q (%v)", 546 cert.DNSNames, serviceName, allowedClientNames) 547 } 548 549 // Ensure authInterceptor matches the serverInterceptor interface. 550 var _ serverInterceptor = (*authInterceptor)(nil)